In [1]:
import pandas as pd
import numpy as np
from time import time
from os import path, makedirs
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.externals import joblib
from utility import normalize_data, tokenizer, replace, penn_scale

In [2]:
## load data
df1 = pd.read_csv('data/S08/question_answer_pairs.txt', sep='\\t', engine='python').dropna()
df2 = pd.read_csv('data/S09/question_answer_pairs.txt', sep='\\t', engine='python').dropna()
df3 = pd.read_csv('data/S10/question_answer_pairs.txt', sep='\\t', engine='python').dropna()
frames = [df1, df2, df3]
df = pd.concat(frames)

questions = df['Question']
answers = df['Answer']

In [3]:
##
## order of appending 'pos' (1) must match the order of appending
##     the 'sent_type' (2) sentence type.
##

## (1) pos: combine questions + answers
questions_pos = questions.apply(tokenizer)
questions_unique = [replace(list(x), penn_scale())  for x in set(tuple(x) for x in questions_pos)]

answers_pos = answers.apply(tokenizer)
answers_unique = [replace(list(x), penn_scale()) for x in set(tuple(x) for x in answers_pos)]

## append data
pos = answers_unique + questions_unique

## (2) sentence type
sent_type = []
for i in range(len(questions_unique)):
    sent_type.append('0')

for j in range(len(answers_unique)):
    sent_type.append('1')

In [4]:
## adjusted dataset
df_adjusted = pd.DataFrame({
    'pos': pos,
    'type': sent_type
})

## train + test
X_train, X_test, y_train, y_test = train_test_split(
    df_adjusted['pos'],
    df_adjusted['type'],
    test_size=0.2
)

## print shape
print('X_train: {X}, y_train: {y}'.format(
    X=X_train.shape,
    y=y_train.shape
))

print('X_test: {X}, y_test: {y}'.format(
    X=X_test.shape,
    y=y_test.shape
))

X_train: (1861,), y_train: (1861,)
X_test: (466,), y_test: (466,)


In [5]:
X_train.head(10)

1389                 [34, 31, 3, 2, 13, 6, 3, 14, 14, 32]
570                            [7, 13, 1, 12, 30, 23, 13]
643     [3, 12, 6, 3, 14, 14, 12, 28, 30, 6, 3, 12, 6,...
682                                   [18, 32, 30, 6, 13]
1377                             [14, 4, 3, 12, 12, 6, 2]
728     [3, 14, 12, 28, 30, 25, 20, 6, 3, 12, 17, 12, ...
1575                      [34, 31, 30, 13, 20, 30, 30, 6]
1457                  [36, 28, 3, 7, 7, 13, 31, 6, 3, 12]
662                           [3, 12, 13, 31, 20, 27, 13]
2017                   [36, 7, 13, 32, 14, 28, 3, 14, 14]
Name: pos, dtype: object

In [6]:
y_train.head(10)

1389    0
570     0
643     0
682     0
1377    0
728     0
1575    1
1457    1
662     0
2017    1
Name: type, dtype: object

In [7]:
## stop gap: used for column padding
stop_gap = 40

In [8]:
## normalize datasets
X_train_final, X_test_final = normalize_data(X_train, X_test, stop_gap=40)

In [9]:
print(X_train_final.head(10))
print(X_test_final.head(10))

   0   1   2   3   4   5   6   7   8   9  ...  30  31  32  33  34  35  36  37  \
0  34  31   3   2  13   6   3  14  14  32 ...  40  40  40  40  40  40  40  40   
1   7  13   1  12  30  23  13  40  40  40 ...  40  40  40  40  40  40  40  40   
2   3  12   6   3  14  14  12  28  30   6 ...  40  40  40  40  40  40  40  40   
3  18  32  30   6  13  40  40  40  40  40 ...  40  40  40  40  40  40  40  40   
4  14   4   3  12  12   6   2  40  40  40 ...  40  40  40  40  40  40  40  40   
5   3  14  12  28  30  25  20   6   3  12 ...  40  40  40  40  40  40  40  40   
6  34  31  30  13  20  30  30   6  40  40 ...  40  40  40  40  40  40  40  40   
7  36  28   3   7   7  13  31   6   3  12 ...  40  40  40  40  40  40  40  40   
8   3  12  13  31  20  27  13  40  40  40 ...  40  40  40  40  40  40  40  40   
9  36   7  13  32  14  28   3  14  14  40 ...  40  40  40  40  40  40  40  40   

   38  39  
0  40  40  
1  40  40  
2  40  40  
3  40  40  
4  40  40  
5  40  40  
6  40  40  
7  40  40  


In [10]:
##
## random forrest: selected due to high accuracy, and hasn't
##     been implemented in the project.
##
clf=RandomForestClassifier(n_estimators=1000)

tr0 = time()
clf.fit(X_train_final, np.asarray(y_train))
tr1 = time()
y_pred=clf.predict(X_test_final)
tr2 = time()

In [11]:
confusion_matrix(y_pred=y_pred, y_true=y_test)

array([[235,  49],
       [ 57, 125]], dtype=int64)

In [12]:
print('Accuracy: {accuracy}'.format(accuracy=accuracy_score(y_test, y_pred)))

Accuracy: 0.7725321888412017


In [13]:
print('random forrest\ntrain: {rf_train},\npredict: {rf_predict}'.format(
    rf_train=tr1-tr0,
    rf_predict=tr2-tr1
))

random forrest
train: 3.900865316390991,
predict: 0.20032000541687012


In [14]:
## export model
if not path.exists('model'):
    makedirs('model')
joblib.dump(clf, 'model/random_forest.pkl')

['model/random_forest.pkl']