In [295]:
import pandas as pd
import numpy as np
from nltk import tag, word_tokenize
from time import time
from os import path, makedirs
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.externals import joblib
from utility import normalize_data

In [140]:
## load data
df1 = pd.read_csv('data/S08/question_answer_pairs.txt', sep='\\t', engine='python').dropna()
df2 = pd.read_csv('data/S09/question_answer_pairs.txt', sep='\\t', engine='python').dropna()
df3 = pd.read_csv('data/S10/question_answer_pairs.txt', sep='\\t', engine='python').dropna()
frames = [df1, df2, df3]
df = pd.concat(frames)

questions = df['Question']
answers = df['Answer']

In [141]:
##
## categorization of penn-tree
##
## - https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
##
penn_scale = {
    'CC': 1,
    'CD': 2,
    'DT': 3, 
    'EX': 4,
    'FW': 5,
    'IN': 6,
    'JJ': 7,
    'JJR': 8,
    'JJS': 9,
    'LS': 10,
    'MD': 11,
    'NN': 12,
    'NNS': 13,
    'NNP': 14,
    'NNPS': 15,
    'PDT': 16,
    'POS': 17,
    'PRP': 18,
    'PRP$': 19,
    'RB': 20,
    'RBR': 21,
    'RBS': 22,
    'RP': 23,
    'SYM': 24,
    'TO': 25,
    'UH': 26,
    'VB': 27,
    'VBD': 28,
    'VBG': 29,
    'VBN': 30,
    'VBP': 31,
    'VBZ': 32,
    'WDT': 33,
    'WP': 34,
    'WP$': 35,
    'WRB': 36
}

def tokenizer(sentence):
    '''

    penn-tree: tokenize + parts of speech

    '''

    sent = word_tokenize(sentence)
    pos = tag.pos_tag(sent)
    return([x[1] for x in pos if x[1] and x[1] in penn_scale])

def replace(list, dictionary):
    '''

    replace list item with corresponding dict value

    '''

    return [dictionary.get(item, item) for item in list]

In [142]:
##
## order of appending 'pos' (1) must match the order of appending
##     the 'sent_type' (2) sentence type.
##

## (1) pos: combine questions + answers
questions_pos = questions.apply(tokenizer)
questions_unique = [replace(list(x), penn_scale)  for x in set(tuple(x) for x in questions_pos)]

answers_pos = answers.apply(tokenizer)
answers_unique = [replace(list(x), penn_scale) for x in set(tuple(x) for x in answers_pos)]

## append data
pos = answers_unique + questions_unique

## (2) sentence type
sent_type = []
for i in range(len(questions_unique)):
    sent_type.append('0')

for j in range(len(answers_unique)):
    sent_type.append('1')

In [143]:
## adjusted dataset
df_adjusted = pd.DataFrame({
    'pos': pos,
    'type': sent_type
})

## train + test
X_train, X_test, y_train, y_test = train_test_split(
    df_adjusted['pos'],
    df_adjusted['type'],
    test_size=0.2
)

## print shape
print('X_train: {X}, y_train: {y}'.format(
    X=X_train.shape,
    y=y_train.shape
))

print('X_test: {X}, y_test: {y}'.format(
    X=X_test.shape,
    y=y_test.shape
))

X_train: (1861,), y_train: (1861,)
X_test: (466,), y_test: (466,)


In [144]:
X_train.head(10)

483      [3, 12, 12, 30, 6, 7, 13, 33, 28, 6, 14, 17, 12]
1404                      [7, 12, 14, 14, 12, 28, 14, 12]
908                        [3, 14, 14, 14, 1, 14, 14, 14]
172     [13, 6, 14, 14, 14, 7, 14, 14, 14, 14, 6, 14, ...
432                                     [7, 12, 1, 7, 12]
548                      [14, 14, 17, 12, 20, 28, 25, 27]
404                                            [3, 7, 12]
1647                          [34, 32, 3, 12, 14, 14, 12]
1772                                      [14, 7, 13, 30]
912     [6, 3, 12, 32, 30, 3, 7, 2, 11, 27, 25, 3, 8, 12]
Name: pos, dtype: object

In [145]:
y_train.head(10)

483     0
1404    1
908     0
172     0
432     0
548     0
404     0
1647    1
1772    1
912     0
Name: type, dtype: object

In [296]:
## train: ensure nested lists same length
length = len(sorted(X_train, key=len, reverse=True)[0])
X_train_final=np.array([xi+[stop_gap]*(length-len(xi)) for xi in X_train])
X_train_final=pd.DataFrame(X_train_final)

## test: ensure nested lists same length
length = len(sorted(X_test, key=len, reverse=True)[0])
X_test_final=np.array([xi+[stop_gap]*(length-len(xi)) for xi in X_test])
X_test_final=pd.DataFrame(X_test_final)

In [297]:
## normalize datasets
X_train_final, X_tests_final = normalize_data(X_train_final, X_test_final)

In [298]:
print(X_train_final.head(10))
print(X_test_final.head(10))

   0   1   2   3   4   5   6   7   8   9  ...  30  31  32  33  34  35  36  37  \
0   3  12  12  30   6   7  13  33  28   6 ...  40  40  40  40  40  40  40  40   
1   7  12  14  14  12  28  14  12  40  40 ...  40  40  40  40  40  40  40  40   
2   3  14  14  14   1  14  14  14  40  40 ...  40  40  40  40  40  40  40  40   
3  13   6  14  14  14   7  14  14  14  14 ...  17   7  14   6   3  14  32  14   
4   7  12   1   7  12  40  40  40  40  40 ...  40  40  40  40  40  40  40  40   
5  14  14  17  12  20  28  25  27  40  40 ...  40  40  40  40  40  40  40  40   
6   3   7  12  40  40  40  40  40  40  40 ...  40  40  40  40  40  40  40  40   
7  34  32   3  12  14  14  12  40  40  40 ...  40  40  40  40  40  40  40  40   
8  14   7  13  30  40  40  40  40  40  40 ...  40  40  40  40  40  40  40  40   
9   6   3  12  32  30   3   7   2  11  27 ...  40  40  40  40  40  40  40  40   

   38  39  
0  40  40  
1  40  40  
2  40  40  
3  17   3  
4  40  40  
5  40  40  
6  40  40  
7  40  40  


In [299]:
##
## random forrest: selected due to high accuracy, and hasn't
##     been implemented in the project.
##
clf=RandomForestClassifier(n_estimators=1000)

tr0 = time()
clf.fit(X_train_final, np.asarray(y_train))
tr1 = time()
y_pred=clf.predict(X_test_final)
tr2 = time()

In [300]:
confusion_matrix(y_pred=y_pred, y_true=y_test)

array([[220,  71],
       [ 50, 125]], dtype=int64)

In [301]:
print('Accuracy: {accuracy}'.format(accuracy=accuracy_score(y_test, y_pred)))

Accuracy: 0.740343347639485


In [302]:
print('random forrest\ntrain: {rf_train},\npredict: {rf_predict}'.format(
    rf_train=tr1-tr0,
    rf_predict=tr2-tr1
))

random forrest
train: 3.754288673400879,
predict: 0.2472233772277832


In [303]:
## export model
if not path.exists('model'):
    makedirs('model')
joblib.dump(clf, 'model/random_forrest.pkl', compress=9)

['model/random_forrest.pkl']