## Load Libraries
Load the required libraries.

In [1]:
# Load Libraries
import numpy as np
import pandas as pd
import collections
import nltk
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
import pickle
import os.path
from keras.models import Sequential,load_model
from keras.layers.core import Activation,Dense,Dropout,SpatialDropout1D
from keras.layers.wrappers import Bidirectional
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM,GRU
from keras import regularizers
from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.


In [2]:
# Load the labelled training set and the unlabelled test set; both CSV files
# are read from the current working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [3]:
# Preview the first training rows (includes the Is_Response label column).
train.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,not happy
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,not happy
2,id10328,I booked this hotel through Hotwire at the low...,Mozilla,Tablet,not happy
3,id10329,Stayed here with husband and sons on the way t...,InternetExplorer,Desktop,happy
4,id10330,My girlfriends and I stayed here to celebrate ...,Edge,Tablet,not happy


In [4]:
# Preview the first test rows (same columns as train, minus Is_Response).
test.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used
0,id80132,Looking for a motel in close proximity to TV t...,Firefox,Mobile
1,id80133,Walking distance to Madison Square Garden and ...,InternetExplorer,Desktop
2,id80134,Visited Seattle on business. Spent - nights in...,IE,Tablet
3,id80135,This hotel location is excellent and the rooms...,Edge,Mobile
4,id80136,This hotel is awesome I love the service Antho...,Mozilla,Mobile


## Prepare Data
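Prepare the train and test data: build a vocabulary from the training reviews and encode both sets as padded sequences of word indices.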

In [5]:
# Corpus statistics filled in by the tokenisation loop further down:
# maxlen tracks the longest review in tokens, word_freqs counts each token.
maxlen=0
word_freqs=collections.Counter()

In [6]:
# Keep only the free-text review column of the training set.
train_description=train["Description"]
train_description.head()

0    The room was kind of clean but had a VERY stro...
1    I stayed at the Crown Plaza April -- - April -...
2    I booked this hotel through Hotwire at the low...
3    Stayed here with husband and sons on the way t...
4    My girlfriends and I stayed here to celebrate ...
Name: Description, dtype: object

In [7]:
# Keep only the free-text review column of the test set.
test_description=test["Description"]
test_description.head()

0    Looking for a motel in close proximity to TV t...
1    Walking distance to Madison Square Garden and ...
2    Visited Seattle on business. Spent - nights in...
3    This hotel location is excellent and the rooms...
4    This hotel is awesome I love the service Antho...
Name: Description, dtype: object

In [8]:
# Tokenise every training review once to collect corpus statistics.
# Both accumulators are (re)initialised here so that re-running this cell
# starts from a clean state instead of double-counting word frequencies
# left over from a previous execution.
maxlen=0
word_freqs=collections.Counter()
for sentence in train_description:
    words=nltk.word_tokenize(sentence.lower())
    # Track the longest review, measured in tokens.
    maxlen=max(maxlen,len(words))
    # Counter.update adds one occurrence for each token in the list.
    word_freqs.update(words)


In [9]:
# Longest training review (in tokens) and number of distinct tokens seen.
maxlen,len(word_freqs)

(3767, 64274)

In [10]:
# Vocabulary configuration: sequences are padded/truncated to
# MAX_SENTENCE_LENGTH tokens and only the MAX_FEATURES most frequent words
# receive their own index.
MAX_SENTENCE_LENGTH=500
MAX_FEATURES=30000
# Two extra slots are reserved for the PAD (0) and UNK (1) pseudo-tokens.
vocab_size=2+min(MAX_FEATURES,len(word_freqs))
# Real words are numbered from 2 upwards, in descending frequency order.
word2index={
    token:rank
    for rank,(token,_) in enumerate(word_freqs.most_common(MAX_FEATURES),start=2)
}
word2index.update(PAD=0,UNK=1)
# Reverse lookup from index back to token.
index2word=dict(zip(word2index.values(),word2index.keys()))

In [11]:
# Vocabulary size, including the two reserved PAD/UNK slots.
vocab_size

30002

In [12]:
# Sanity check: "PAD" maps to index 0 and index 0 maps back to "PAD".
word2index["PAD"],index2word[0]

(0, 'PAD')

In [13]:
# Paths (relative to the working directory) of the pickled feature/label
# caches used by load_data and of the saved Keras model.
X_train_filename="X_train.p"
X_test_filename="X_test.p"
y_train_filename="y_train.p"
model_filename="model.h5"

def normalize(train_description):
    """Convert raw review texts into fixed-length sequences of word indices.

    Each text is lower-cased and tokenised with ``nltk.word_tokenize``; every
    token is replaced by its ``word2index`` entry, or by the ``UNK`` index
    when the token is not in the vocabulary. The per-review index lists are
    then passed to ``sequence.pad_sequences`` with
    ``maxlen=MAX_SENTENCE_LENGTH``.

    Despite the parameter name, this function is applied to both the train
    and the test descriptions.

    Args:
        train_description: iterable of review strings that also exposes a
            ``.size`` attribute (a pandas Series in this notebook).

    Returns:
        The array returned by ``sequence.pad_sequences``, one row per review.
    """
    # Object array holding one Python list of indices per review.
    encoded=np.empty((train_description.size,),dtype=list)
    for row,text in enumerate(train_description):
        tokens=nltk.word_tokenize(text.lower())
        encoded[row]=[
            word2index[token] if token in word2index else word2index["UNK"]
            for token in tokens
        ]
    return sequence.pad_sequences(encoded,maxlen=MAX_SENTENCE_LENGTH)

def denormalize_response(predictions):
    """Map numeric scores back to text labels.

    A score strictly greater than 0.5 becomes ``'happy'``; anything else
    (including exactly 0.5) becomes ``'not_happy'``.

    NOTE(review): the negative label is spelled with an underscore here while
    the training data uses ``'not happy'`` -- presumably this is the expected
    submission format; confirm before changing.

    Args:
        predictions: iterable of scores comparable with ``> 0.5``.

    Returns:
        A list of label strings, one per input score.
    """
    labels=[]
    for score in predictions:
        if score > 0.5:
            labels.append('happy')
        else:
            labels.append('not_happy')
    return labels

def normalize_response(predictions):
    """Encode text labels as binary targets.

    Only the exact string ``'happy'`` is encoded as 1; every other value is
    encoded as 0.

    Args:
        predictions: iterable of label values.

    Returns:
        A list of ints (0 or 1), one per input label.
    """
    encoded=[]
    for label in predictions:
        encoded.append(1 if label == 'happy' else 0)
    return encoded

def load_data(force=False):
    """Return encoded train/test features and train labels, using a pickle cache.

    When all three cache files exist and ``force`` is false, the arrays are
    read back from disk. Otherwise they are recomputed from
    ``train_description``, ``test_description`` and ``train["Is_Response"]``
    and the cache files are (re)written.

    The cache is keyed only on file existence: after changing the vocabulary,
    ``MAX_SENTENCE_LENGTH`` or the input data, call ``load_data(force=True)``
    to avoid reusing stale features.

    Args:
        force: if true, ignore any existing cache files and recompute.

    Returns:
        Tuple ``(X_train, X_test, y_train)``.
    """
    cache_files=(X_train_filename,X_test_filename,y_train_filename)
    if not force and all(os.path.exists(name) for name in cache_files):
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file; only load caches that were produced locally by this notebook.
        loaded=[]
        for name in cache_files:
            # Context manager guarantees the handle is closed after reading.
            with open(name,"rb") as handle:
                loaded.append(pickle.load(handle))
        X_train,X_test,y_train=loaded
    else:
        X_train=normalize(train_description)
        X_test=normalize(test_description)
        y_train=normalize_response(train["Is_Response"])
        for name,payload in zip(cache_files,(X_train,X_test,y_train)):
            # Closing the file flushes the pickle to disk before returning.
            with open(name,"wb") as handle:
                pickle.dump(payload,handle)
    return X_train,X_test,y_train

# Encode (or load from cache) the full train/test feature matrices and labels.
X_train,X_test,y_train=load_data()
# Round-trip the first labels through the encoder/decoder as a quick check.
denormalize_response(normalize_response(train["Is_Response"]))[:10]

['not_happy',
 'not_happy',
 'not_happy',
 'happy',
 'not_happy',
 'happy',
 'not_happy',
 'happy',
 'happy',
 'not_happy']

In [14]:
# Hold out 30% of the labelled data for validation (fixed random_state).
# Note the naming: Xtest/ytest are this validation split of the training
# data, not the unlabelled test set, which is X_test.
Xtrain,Xtest,ytrain,ytest=train_test_split(X_train,y_train,test_size=0.3,random_state=40)

## Train Model
Train the model on the prepared data.

In [None]:
# Hyper-parameters used by load_train_model and model.fit below.
EMBEDDING_SIZE=128      # output dimension of the Embedding layer
HIDDEN_LAYER_SIZE=64    # units of the LSTM wrapped by Bidirectional
BATCH_SIZE=32           # batch size passed to model.fit
NUM_EPOCHS=2            # epochs passed to model.fit
DROPOUT=0.1             # rate shared by SpatialDropout1D and the LSTM dropouts

def load_train_model(force=False):
    """Return a model to train: the saved one if available, else a new one.

    If ``model_filename`` exists and ``force`` is false, the model is loaded
    from that file. Otherwise a message is printed and a new Sequential model
    is built and compiled: Embedding -> SpatialDropout1D ->
    Bidirectional(LSTM) -> Dense(1) -> sigmoid activation, with
    binary cross-entropy loss, the rmsprop optimizer and an accuracy metric.

    Args:
        force: if true, build a new model even when a saved one exists.

    Returns:
        The loaded or newly compiled Keras model.
    """
    # Fast path: reuse the model already saved on disk.
    if not force and os.path.exists(model_filename):
        return load_model(model_filename)

    print("Force load model.")
    layers=[
        Embedding(vocab_size,EMBEDDING_SIZE,input_length=MAX_SENTENCE_LENGTH),
        SpatialDropout1D(DROPOUT),
        Bidirectional(LSTM(HIDDEN_LAYER_SIZE,dropout=DROPOUT,recurrent_dropout=DROPOUT)),
        Dense(1),
        Activation("sigmoid"),
    ]
    model=Sequential()
    for layer in layers:
        model.add(layer)
    model.compile(loss="binary_crossentropy",optimizer="rmsprop",metrics=["accuracy"])
    return model

# Checkpoint callback writing to model_filename; with save_best_only=False the
# monitored metric is not used to filter which epochs are saved.
checkpoint=ModelCheckpoint(model_filename, monitor='val_acc', verbose=0, save_best_only=False, mode='auto', period=1)
# load_train_model() reuses model_filename when it already exists, so
# re-running this cell continues training the previously saved model.
model=load_train_model()
# Train on the 70% split and validate on the held-out 30% (Xtest/ytest).
history=model.fit(Xtrain,ytrain,batch_size=BATCH_SIZE,epochs=NUM_EPOCHS,validation_data=(Xtest,ytest),callbacks=[checkpoint])

Train on 27252 samples, validate on 11680 samples
Epoch 1/2
Epoch 2/2


In [None]:
# Reload the model from the checkpoint file written during training.
model=load_model(model_filename)
# NOTE(review): load_model normally restores the compile configuration, so
# this recompile is likely redundant for inference -- confirm before removing.
model.compile(loss="binary_crossentropy",optimizer="rmsprop",metrics=["accuracy"])
# Score the unlabelled test set, then threshold the scores into text labels.
predictions=model.predict(X_test)
predictions=denormalize_response(predictions)
predictions[:10]

In [None]:
# Build the submission table (User_ID, Is_Response) and write it to CSV.
test_result_filename="test_result.csv"
# Attach the predictions positionally to the User_ID column. Unlike
# pd.concat(axis=1), which aligns on index labels and silently yields NaN or
# misaligned rows when `test` does not carry a default 0..n-1 index, assign()
# raises if the number of predictions does not match the number of rows.
test_result=test[['User_ID']].assign(Is_Response=predictions)
test_result.to_csv(test_result_filename,index=False)
print("File Saved!")
test_result.head()