In [1]:
import pandas as pd
import numpy as np
from nltk import tag, word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [2]:
## load data
df = pd.read_csv('data/S10/question_answer_pairs.txt', sep='\\t', engine='python').dropna()

questions = df['Question']
answers = df['Answer']

In [79]:
##
## categorization of penn-tree
##
## - https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
##
penn_scale = {
    'CC': 1,
    'CD': 2,
    'DT': 3, 
    'EX': 4,
    'FW': 5,
    'IN': 6,
    'JJ': 7,
    'JJR': 8,
    'JJS': 9,
    'LS': 10,
    'MD': 11,
    'NN': 12,
    'NNS': 13,
    'NNP': 14,
    'NNPS': 15,
    'PDT': 16,
    'POS': 17,
    'PRP': 18,
    'PRP$': 19,
    'RB': 20,
    'RBR': 21,
    'RBS': 22,
    'RP': 23,
    'SYM': 24,
    'TO': 25,
    'UH': 26,
    'VB': 27,
    'VBD': 28,
    'VBG': 29,
    'VBN': 30,
    'VBP': 31,
    'VBZ': 32,
    'WDT': 33,
    'WP': 34,
    'WP$': 35,
    'WRB': 36
}

def tokenizer(sentence):
    '''

    penn-tree: tokenize + parts of speech

    '''

    sent = word_tokenize(sentence)
    pos = tag.pos_tag(sent)
    return([x[1] for x in pos if x[1] and x[1] in penn_scale])

def replace(list, dictionary):
    '''

    replace list item with corresponding dict value

    '''

    return [dictionary.get(item, item) for item in list]

In [80]:
##
## order of appending 'pos' (1) must match the order of appending
##     the 'sent_type' (2) sentence type.
##

## (1) pos: combine questions + answers
questions_pos = questions.apply(tokenizer)
questions_unique = [replace(list(x), penn_scale)  for x in set(tuple(x) for x in questions_pos)]

answers_pos = answers.apply(tokenizer)
answers_unique = [replace(list(x), penn_scale) for x in set(tuple(x) for x in answers_pos)]

## append data
pos = answers_unique + questions_unique

## (2) sentence type
sent_type = []
for i in range(len(questions_unique)):
    sent_type.append('0')

for j in range(len(answers_unique)):
    sent_type.append('1')

In [81]:
## adjusted dataset
df_adjusted = pd.DataFrame({
    'pos': pos,
    'type': sent_type
})

## train + test
X_train, X_test, y_train, y_test = train_test_split(
    df_adjusted['pos'],
    df_adjusted['type'],
    test_size=0.2
)

## print shape
print('X_train: {X}, y_train: {y}'.format(
    X=X_train.shape,
    y=y_train.shape
))

print('X_test: {X}, y_test: {y}'.format(
    X=X_test.shape,
    y=y_test.shape
))

X_train: (836,), y_train: (836,)
X_test: (209,), y_test: (209,)


In [82]:
X_train.head(10)

323                                [6]
835     [32, 14, 14, 14, 3, 9, 12, 12]
610         [36, 32, 3, 12, 6, 14, 14]
538       [6, 36, 7, 32, 3, 7, 12, 12]
282                    [6, 12, 25, 27]
22                      [30, 6, 7, 12]
424    [4, 31, 20, 3, 2, 2, 7, 13, 31]
114       [3, 20, 3, 13, 31, 6, 7, 12]
933                 [14, 14, 28, 6, 2]
50       [3, 2, 13, 31, 14, 14, 1, 14]
Name: pos, dtype: object

In [83]:
y_train.head(10)

323    0
835    1
610    1
538    0
282    0
22     0
424    0
114    0
933    1
50     0
Name: type, dtype: object

In [88]:
## ensure nested lists same length
length = len(sorted(X_train, key=len, reverse=True)[0])
X_train_final=np.array([xi+[100]*(length-len(xi)) for xi in X_train])
X_train_final=pd.DataFrame(X_train_final)

In [89]:
print(X_train_final)

     0    1    2    3    4    5    6    7    8    9  ...    24   25   26   27  \
0     6  100  100  100  100  100  100  100  100  100 ...   100  100  100  100   
1    32   14   14   14    3    9   12   12  100  100 ...   100  100  100  100   
2    36   32    3   12    6   14   14  100  100  100 ...   100  100  100  100   
3     6   36    7   32    3    7   12   12  100  100 ...   100  100  100  100   
4     6   12   25   27  100  100  100  100  100  100 ...   100  100  100  100   
5    30    6    7   12  100  100  100  100  100  100 ...   100  100  100  100   
6     4   31   20    3    2    2    7   13   31  100 ...   100  100  100  100   
7     3   20    3   13   31    6    7   12  100  100 ...   100  100  100  100   
8    14   14   28    6    2  100  100  100  100  100 ...   100  100  100  100   
9     3    2   13   31   14   14    1   14  100  100 ...   100  100  100  100   
10   34   32   32   14   14   27  100  100  100  100 ...   100  100  100  100   
11   32   14    3    9   12 

In [91]:
##
## random forrest: selected due to high accuracy, and hasn't
##     been implemented in the project.
##
clf=RandomForestClassifier(n_estimators=5)
clf.fit(X_train_final, np.asarray(y_train))
#y_pred=clf.predict(X_test)

ValueError: setting an array element with a sequence.