In [11]:
import csv
import json
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier

In [2]:
# Notebook-wide settings.
#   prefix: selects which files under ./data/ are loaded
#           ('<prefix>-padded' and '<prefix>-responses').
#   cut:    number of shuffled samples held out as the test set.
prefix = "small"
cut = 1000

###### Reading data

In [8]:
# Load the two input files selected by `prefix`:
#   ./data/<prefix>-padded    -> CSV; every row is kept as a list of fields
#   ./data/<prefix>-responses -> JSON document
# NOTE(review): 'rb' is the mode Python 2's csv module expects. Python 3's csv
# module needs a text-mode handle instead (open(path, newline='')).
with open('./data/{}-padded'.format(prefix), 'rb') as padded_file:
    padded = list(csv.reader(padded_file))

with open('./data/{}-responses'.format(prefix), 'r') as responses_file:
    responses = json.load(responses_file)

In [9]:
# Rebuild each document as a single space-separated string and strip the
# " <PAD/>" padding tokens.
# List comprehensions are used instead of map(): on Python 3 map() returns a
# lazy iterator, so the later len(y_raw) / np.asarray(raw_docs) calls would
# fail or wrap the iterator itself. On Python 2 the result is unchanged.
raw_docs = [" ".join(tokens).replace(" <PAD/>", "") for tokens in padded]

# Label of each sample = position of the first 1 in its response row
# (presumably a one-hot vector -- TODO confirm). A row without any 1 makes
# .index() raise ValueError.
y_raw = [response.index(1) for response in responses]

###### Split training and test data

In [14]:
# Shuffle documents and labels with one shared, seeded permutation so the
# split is reproducible and every document stays paired with its label.
np.random.seed(1000)
shuffle_indices = np.random.permutation(np.arange(len(y_raw)))
x_raw_shuffled = np.asarray(raw_docs)[shuffle_indices]
y_raw_shuffled = np.asarray(y_raw)[shuffle_indices]

# Hold out the last `cut` shuffled samples for testing.
# An explicit split index is used instead of `[:-cut]` / `[-cut:]` because
# `[:-0]` is `[:0]`: with cut == 0 the old slicing produced an EMPTY training
# set and put every sample in the test set. The index is clamped at 0 so that
# cut >= len(y_raw) still yields an empty train set / full test set as before.
split_at = max(len(y_raw_shuffled) - cut, 0)
x_raw_train, x_raw_test = x_raw_shuffled[:split_at], x_raw_shuffled[split_at:]
y_raw_train, y_raw_test = y_raw_shuffled[:split_at], y_raw_shuffled[split_at:]

###### Build classifier

In [17]:
# Text classification pipeline: token counts -> TF-IDF weighting -> linear
# model trained with SGD. The loss is the squared hinge; switching to
# loss='hinge' would turn the model into a linear SVM.
# NOTE(review): `n_iter` is the spelling used by older scikit-learn releases;
# newer releases replaced it with `max_iter` (plus `tol`) -- update when the
# environment is upgraded.
sgd_model = SGDClassifier(
    loss='squared_hinge',
    penalty='l2',
    alpha=1e-3,
    n_iter=5,
    random_state=42,
)
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', sgd_model),
])

# Fit on the training documents; assigning to `_` suppresses the estimator repr.
_ = text_clf.fit(list(x_raw_train), y_raw_train)

# Fraction of held-out documents whose predicted label matches the true one.
predicted_svm = text_clf.predict(x_raw_test)
np.mean(predicted_svm == y_raw_test)

0.68000000000000005

In [None]:
# Score the fitted pipeline on the data it was trained on. A value far above
# the held-out accuracy points to overfitting.
train_predictions = text_clf.predict(x_raw_train)
np.mean(train_predictions == y_raw_train)

###### Further work

- Are responses factorized?
- Accuracy on individual classes
- Cross validation
- More features, especially adding word vectors
- Training on large dataset