In [1]:
import pandas as pd
import numpy as np
from time import time
from os import path, makedirs
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import joblib
from utility import normalize_data, tokenizer, replace, penn_scale

In [2]:
## load data
df1 = pd.read_csv('data/S08/question_answer_pairs.txt', sep='\\t', engine='python').dropna()
df2 = pd.read_csv('data/S09/question_answer_pairs.txt', sep='\\t', engine='python').dropna()
df3 = pd.read_csv('data/S10/question_answer_pairs.txt', sep='\\t', engine='python').dropna()
frames = [df1, df2, df3]
df = pd.concat(frames)

questions = df['Question']
answers = df['Answer']

In [3]:
##
## order of appending 'pos' (1) must match the order of appending
##     the 'sent_type' (2) sentence type.
##

## (1) pos: combine questions + answers
questions_pos = questions.apply(tokenizer)
questions_unique = [replace(list(x), penn_scale())  for x in set(tuple(x) for x in questions_pos)]

answers_pos = answers.apply(tokenizer)
answers_unique = [replace(list(x), penn_scale()) for x in set(tuple(x) for x in answers_pos)]

## append data
pos = answers_unique + questions_unique

## (2) sentence type
sent_type = []
for i in range(len(questions_unique)):
    sent_type.append('0')

for j in range(len(answers_unique)):
    sent_type.append('1')

In [4]:
## adjusted dataset
df_adjusted = pd.DataFrame({
    'pos': pos,
    'type': sent_type
})

## train + test
X_train, X_test, y_train, y_test = train_test_split(
    df_adjusted['pos'],
    df_adjusted['type'],
    test_size=0.2
)

## print shape
print('X_train: {X}, y_train: {y}'.format(
    X=X_train.shape,
    y=y_train.shape
))

print('X_test: {X}, y_test: {y}'.format(
    X=X_test.shape,
    y=y_test.shape
))

X_train: (1861,), y_train: (1861,)
X_test: (466,), y_test: (466,)


In [5]:
X_train.head(10)

1409                           [37, 29, 4, 8, 15, 15, 29]
2284                    [35, 29, 22, 8, 7, 30, 14, 7, 14]
245                   [5, 32, 8, 8, 14, 7, 15, 2, 15, 15]
2245        [15, 15, 29, 7, 4, 8, 13, 7, 15, 7, 15, 7, 3]
2227                          [35, 33, 4, 13, 26, 28, 14]
824                                                  [10]
1649                          [37, 29, 15, 13, 13, 7, 15]
646                          [4, 10, 13, 7, 4, 13, 7, 13]
559                        [4, 15, 13, 33, 31, 7, 15, 15]
517     [4, 32, 14, 30, 4, 14, 2, 14, 8, 14, 32, 21, 3...
Name: pos, dtype: object

In [6]:
y_train.head(10)

1409    1
2284    1
245     0
2245    1
2227    1
824     0
1649    1
646     0
559     0
517     0
Name: type, dtype: object

In [7]:
## normalize datasets
X_train_final = normalize_data(X_train, stop_gap=40, train=True)
X_test_final = normalize_data(X_test, stop_gap=40, train=True)

In [8]:
print(X_train_final.head(10))
print(X_test_final.head(10))

   0   1   2   3   4   5   6   7   8   9  ...  30  31  32  33  34  35  36  37  \
0  37  29   4   8  15  15  29  40  40  40 ...  40  40  40  40  40  40  40  40   
1  35  29  22   8   7  30  14   7  14  40 ...  40  40  40  40  40  40  40  40   
2   5  32   8   8  14   7  15   2  15  15 ...  40  40  40  40  40  40  40  40   
3  15  15  29   7   4   8  13   7  15   7 ...  40  40  40  40  40  40  40  40   
4  35  33   4  13  26  28  14  40  40  40 ...  40  40  40  40  40  40  40  40   
5  10  40  40  40  40  40  40  40  40  40 ...  40  40  40  40  40  40  40  40   
6  37  29  15  13  13   7  15  40  40  40 ...  40  40  40  40  40  40  40  40   
7   4  10  13   7   4  13   7  13  40  40 ...  40  40  40  40  40  40  40  40   
8   4  15  13  33  31   7  15  15  40  40 ...  40  40  40  40  40  40  40  40   
9   4  32  14  30   4  14   2  14   8  14 ...  40  40  40  40  40  40  40  40   

   38  39  
0  40  40  
1  40  40  
2  40  40  
3  40  40  
4  40  40  
5  40  40  
6  40  40  
7  40  40  


In [62]:
##
## random forrest: selected due to high accuracy, and hasn't
##     been implemented in the project.
##
clf=RandomForestClassifier(n_estimators=1500)

tr0 = time()
clf.fit(X_train_final, np.asarray(y_train))
tr1 = time()
y_pred=clf.predict(X_test_final)
tr2 = time()

In [63]:
confusion_matrix(y_pred=y_pred, y_true=y_test)

array([[221,  64],
       [ 56, 125]], dtype=int64)

In [64]:
print('Accuracy: {accuracy}'.format(accuracy=accuracy_score(y_test, y_pred)))

Accuracy: 0.7424892703862661


In [65]:
print('random forrest\ntrain: {rf_train},\npredict: {rf_predict}'.format(
    rf_train=tr1-tr0,
    rf_predict=tr2-tr1
))

random forrest
train: 5.493486642837524,
predict: 0.298356294631958


In [66]:
## export model
if not path.exists('model'):
    makedirs('model')
joblib.dump(clf, 'model/random_forest.pkl', compress=True)

['model/random_forest.pkl']