In [36]:
import pandas as pd
import numpy as np

# Data Preparation

In [None]:
# Load Data
def load_data(csv_file):
    names = ["question", "type"]
    dataset_all = pd.read_csv(csv_file,names=names)
    return dataset_all

dataset_all = load_data('nikiai_train.csv')

In [37]:
# Peek into the Data
dataset_all.head(5)

Unnamed: 0,question,type
0,how did serfdom develop in and then leave russ...,unknown
1,what films featured the character popeye doyle ?,what
2,how can i find a list of celebrities ' real na...,unknown
3,what fowl grabs the spotlight after the chines...,what
4,what is the full form of .com ?,what


In [38]:
print("Shape - {}").format(dataset_all.shape)

Shape - (1483, 2)


In [39]:
# Preapre data - get features and labels
def preapre_data(dataset_all):
    dataset=dataset_all.values
    X_Train = dataset[:,0]
    Y_Train = dataset[:,1]
    return  X_Train, Y_Train

X_Train, Y_Train = preapre_data(dataset_all)

In [40]:
print X_Train[:10]
print Y_Train[:10]

['how did serfdom develop in and then leave russia ? '
 'what films featured the character popeye doyle ? '
 "how can i find a list of celebrities ' real names ? "
 'what fowl grabs the spotlight after the chinese year of the monkey ? '
 'what is the full form of .com ? '
 'what contemptible scoundrel stole the cork from my lunch ? '
 "what team did baseball 's st. louis browns become ? "
 'what is the oldest profession ? ' 'what are liver enzymes ? '
 'name the scar-faced bounty hunter of the old west . ']
['unknown' 'what' 'unknown' 'what' 'what' 'what' 'what' 'what' 'what'
 'unknown']


In [45]:
# Clean Data
def remove_questionmark(slist):
    new_x = []
    for x in slist:
         new_x.append(x.replace("?",""))
    return new_x
    
X_Train = remove_questionmark(X_Train)
X_Train = map(str.rstrip,X_Train)

from string import digits

def remove_numbers(slist):
    res = map(lambda x: x.translate(None, digits), slist)
    return res

X_Train = remove_numbers(X_Train)

In [46]:
# Encode labels
from sklearn.preprocessing import LabelEncoder

def encode_lables(Y_Train):
    encoder = LabelEncoder()
    encoder.fit(Y_Train)
    encoded_y = encoder.transform(Y_Train)
    return  encoder, encoded_y

encoder, encoded_y = encode_lables(Y_Train)
encoded_y = encoded_y.reshape((-1, 1))

In [54]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

MAX_NB_WORDS = 5000
MAX_SEQUENCE_LENGTH = 50
def tokenise_padding(X_Train):
    #Construct a tokenizer object, initialized with the number of total terms we want.
    tok = Tokenizer(MAX_NB_WORDS)
    tok.fit_on_texts(X_Train)
    X_Train = tok.texts_to_sequences(X_Train)
    X_Train_pad = pad_sequences(X_Train, maxlen=MAX_SEQUENCE_LENGTH)
    return  tok, X_Train_pad

tok, X_Train_pad = tokenise_padding(X_Train)

In [55]:
# Test data preapre
X_Test=["Name 11 famous martyrs",
"Who was the inventor of silly putty ?",
"What 1920s cowboy star rode Tony the Wonder Horse ?",
"How many villi are found in the small intestine ?",
"does this hose have one ?",
"What is your name?",
"When is the show happening?",
"Is there a cab available for airport?",
"What time does the train leave",
"when was the last time you did something for the first time" ]

def preapre_testData(X_Test,tok):
    #X_Test= map(lambda x:x.lower,X_Test)
    X_Test = map(str.lower,X_Test)
    X_Test = remove_questionmark(X_Test)
    X_Test = map(str.rstrip,X_Test)
    X_Test = remove_numbers(X_Test)
    #MAX_SEQUENCE_LENGTH = 50
    X_Test = tok.texts_to_sequences(X_Test)
    X_Test_pad = pad_sequences(X_Test, maxlen=MAX_SEQUENCE_LENGTH)
    return X_Test_pad

X_Test_pad = preapre_testData(X_Test, tok)

# Defining Model architectures

In [58]:
from keras.models import Sequential
from keras.layers import Dense, Input, Flatten, Dropout, Activation
from keras.layers import  Embedding
from keras.layers.recurrent import SimpleRNN
# from keras.layers.recurrent import  LSTM, GRU
# from keras.layers.convolutional import Convolution1D
# from keras.layers import Conv1D, MaxPooling1D

#Model Architecture
#Define the model
def model_vanillaRNN():
    model = Sequential()
    model.add(Embedding(MAX_NB_WORDS, 32, input_length=MAX_SEQUENCE_LENGTH))
    model.add(Dropout(0.25))
    model.add(SimpleRNN(16, return_sequences=False))
    model.add(Dense(256))
    model.add(Dropout(0.25))
    model.add(Activation('relu'))
    model.add(Dense(5))
    model.add(Activation('softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    return model

rnn1 = model_vanillaRNN()
print(rnn1.summary())

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_2 (Embedding)          (None, 50, 32)        160000      embedding_input_2[0][0]          
____________________________________________________________________________________________________
dropout_2 (Dropout)              (None, 50, 32)        0           embedding_2[0][0]                
____________________________________________________________________________________________________
simplernn_1 (SimpleRNN)          (None, 16)            784         dropout_2[0][0]                  
____________________________________________________________________________________________________
dense_1 (Dense)                  (None, 256)           4352        simplernn_1[0][0]                
___________________________________________________________________________________________

In [60]:
#Fitting the Model
rnn1.fit(X_Train_pad, encoded_y, batch_size=32, nb_epoch=20, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f9e96a9e290>

In [68]:
from keras.layers.recurrent import  LSTM, GRU
from keras.layers.convolutional import Convolution1D
from keras.layers import Conv1D, MaxPooling1D

def model_lstm_cnn():
    model = Sequential()
    model.add(Embedding(MAX_NB_WORDS, 32, input_length=MAX_SEQUENCE_LENGTH))
    model.add(Dropout(0.25))
    model.add(Convolution1D(nb_filter=32, filter_length=3, border_mode='same', activation='relu'))
    model.add(MaxPooling1D(pool_length=2))
    model.add(LSTM(100))
    model.add(Dense(5))
    model.add(Activation('softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    return model

cnn_lstm1 = model_lstm_cnn()
print(cnn_lstm1.summary())

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_4 (Embedding)          (None, 50, 32)        160000      embedding_input_4[0][0]          
____________________________________________________________________________________________________
dropout_5 (Dropout)              (None, 50, 32)        0           embedding_4[0][0]                
____________________________________________________________________________________________________
convolution1d_2 (Convolution1D)  (None, 50, 32)        3104        dropout_5[0][0]                  
____________________________________________________________________________________________________
maxpooling1d_2 (MaxPooling1D)    (None, 25, 32)        0           convolution1d_2[0][0]            
___________________________________________________________________________________________

In [69]:
cnn_lstm1.fit(X_Train_pad, encoded_y, batch_size=32, nb_epoch=20, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f9e77235310>

# Evaluation

In [62]:
#Splitting dataset for Evaluation
from sklearn.cross_validation import train_test_split
split_X_train, split_X_test, split_y_train, split_y_test = train_test_split(X_Train_pad, encoded_y, random_state=33) 

In [80]:
#Model 1 - Evaluation
def evaluate_model(model, train, test):
    scores = model.evaluate(train, test )
    print scores
    print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
    
evaluate_model(rnn1, split_X_test, split_y_test)

acc: 100.00%


In [79]:
evaluate_model(cnn_lstm1, split_X_test, split_y_test)

# #Model 2 - Evaluation
# scores = cnn_lstm1.evaluate(split_X_train, split_y_train )
# print scores
# print("%s: %.2f%%" % (cnn_lstm1.metrics_names[1], scores[1]*100))

acc: 100.00%


# Make Predictions

In [81]:
def predict_classes(model, X_Test):
    res = rnn1.predict_classes(X_Test)
    print encoder.inverse_transform(res)

#Model 1 Predictions
predict_classes(rnn1, X_Test_pad)

['unknown' 'who' 'what' 'unknown' 'affirmation' 'who' 'when' 'affirmation'
 'when' 'unknown']


In [82]:
#Model 2 Predictions
predict_classes(cnn_lstm1, X_Test_pad)

['unknown' 'who' 'what' 'unknown' 'affirmation' 'who' 'when' 'affirmation'
 'when' 'unknown']
