In [42]:
import csv
import json
import sys

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from stop_words import get_stop_words

In [30]:
# Notebook-wide defaults.
# prefix selects the data files: ./data/<prefix>-padded and ./data/<prefix>-responses.
prefix = "small"
# cut is not referenced by any cell visible in this notebook.
cut = 1000

In [46]:
# Stop words for 'en', fetched with get_stop_words from the stop_words package.
# The result is handed to CountVectorizer(stop_words=...) when the classifier
# pipeline is built.
en_stop = get_stop_words('en')

###### Reading data

In [31]:
# Load the padded token rows (CSV) and the response lists (JSON) for `prefix`.
padded_path = './data/{}-padded'.format(prefix)
responses_path = './data/{}-responses'.format(prefix)

# The csv module needs a binary file on Python 2 but a text file opened with
# newline='' on Python 3. Opening with 'rb' on Python 3 makes csv.reader raise
# "iterator should return strings, not bytes", so pick the mode by version.
if sys.version_info[0] >= 3:
    padded_file = open(padded_path, 'r', newline='')
else:
    padded_file = open(padded_path, 'rb')

with padded_file as f:
    paddedreader = csv.reader(f)
    # Materialise every row while the file is still open: one list of string
    # fields per CSV line.
    padded = list(paddedreader)

with open(responses_path, 'r') as f2:
    responses = json.load(f2)

In [32]:
# Rebuild each document as a single string and reduce each response list to a
# class index, then pair both columns up in one DataFrame.
#
# List comprehensions are used instead of map(): on Python 3 map() returns a
# lazy, one-shot iterator rather than a list, which is not a usable DataFrame
# column. On Python 2 the result is the same list as before.
#
# NOTE: only "<PAD/>" tokens that follow another token are stripped; a row
# whose first token is "<PAD/>" keeps that leading token.
raw_docs = [" ".join(tokens).replace(" <PAD/>", "") for tokens in padded]
# index(1) is the position of the first 1 in the response list (presumably a
# one-hot label -- confirm against the data). It raises ValueError for a
# response that contains no 1.
y_raw = [response.index(1) for response in responses]
df = pd.DataFrame({"Doc": raw_docs, "Category": y_raw})

###### Split training and test data

In [33]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split repeatable between runs.
split_frames = train_test_split(
    df,
    test_size=0.3,
    random_state=0,
)
doc_train, doc_test = split_frames

###### Build classifier

In [47]:
# Linear classifier trained with SGD and squared-hinge loss on TF-IDF weighted
# bag-of-words counts. Switching loss to 'hinge' turns it into a linear SVM.
vectorizer = CountVectorizer(stop_words=en_stop)
sgd = SGDClassifier(
    loss='squared_hinge',
    penalty='l2',
    alpha=1e-3,
    n_iter=5,
    random_state=42,
)
text_clf = Pipeline([
    ('vect', vectorizer),
    ('tfidf', TfidfTransformer()),
    ('clf', sgd),
])

# Fit on the training split, then predict the held-out split.
text_clf.fit(doc_train["Doc"], doc_train["Category"])
test_labels = doc_test["Category"]
predicted_category = text_clf.predict(doc_test["Doc"])

# Overall accuracy followed by the confusion matrix for the test split.
print('Accuracy: %.2f' % accuracy_score(test_labels, predicted_category))
print(confusion_matrix(y_true=test_labels, y_pred=predicted_category))

Accuracy: 0.75
[[132   2   0   4   1   0   0   2   3   1   1]
 [  5 102   1   0  11   1   1   8   3   7  12]
 [  0   2 119  10   2   0   1   2   0   1   0]
 [  1   0   7 143   2   0   1   0   0   3   1]
 [  1  22   3   2  68   1   7  17   9   1   8]
 [  0   1   0   0   0 148   6   0   1   3   3]
 [  6   2   5   2   1  14 107   0   2   5  14]
 [  1   9   2   2   5   1   1 116   1   1   2]
 [  9   9   0   1   6   4   1  10  91  18   2]
 [  2   9   2   2   4   3   1   3  14  76  11]
 [  0   9   3   2   7   3   8   0   1   9 115]]


In [48]:
# Per-class precision / recall / F1 on the test split.
# The row names are derived from the labels that actually occur in the test
# labels or the predictions (sorted), instead of a hard-coded list of 11 names
# that only fits an 11-class dataset. For labels 0..10 this yields the same
# 'class 0' .. 'class 10' names as before.
observed_labels = np.union1d(doc_test["Category"], predicted_category)
target_names = ['class {}'.format(label) for label in observed_labels]
print(classification_report(y_true=doc_test["Category"], y_pred=predicted_category, target_names=target_names))

             precision    recall  f1-score   support

    class 0       0.84      0.90      0.87       146
    class 1       0.61      0.68      0.64       151
    class 2       0.84      0.87      0.85       137
    class 3       0.85      0.91      0.88       158
    class 4       0.64      0.49      0.55       139
    class 5       0.85      0.91      0.88       162
    class 6       0.80      0.68      0.73       158
    class 7       0.73      0.82      0.78       141
    class 8       0.73      0.60      0.66       151
    class 9       0.61      0.60      0.60       127
   class 10       0.68      0.73      0.71       157

avg / total       0.75      0.75      0.74      1627



In [50]:
# 5-fold cross-validation of the whole pipeline on the full data frame.
scores = cross_val_score(text_clf, df["Doc"], df["Category"], cv=5)
print(scores)

# Summarise as mean score with a +/- two-standard-deviation spread over folds.
mean_accuracy = scores.mean()
spread = 2 * scores.std()
print("Accuracy: %0.2f (+/- %0.2f)" % (mean_accuracy, spread))

[ 0.80829493  0.73548387  0.76476015  0.70940959  0.59409594]
Accuracy: 0.72 (+/- 0.14)


In [36]:
# Accuracy on the training set; a large gap to the test-set accuracy points to
# overfitting.
# The predictions get their own name so the test-set predictions held in
# `predicted_category` (read by the classification report cell) are not
# overwritten when this cell runs.
predicted_train = text_clf.predict(doc_train["Doc"])
print('Accuracy: %.2f' % accuracy_score(doc_train["Category"], predicted_train))

Accuracy: 0.96


###### Further work

- Add more features, especially word vectors
- Include topic-modeling outputs as features?
- Train on the large dataset