In [1]:
from utilities import *
from model import *
from metrics import Metrics
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import gensim
from tqdm import tqdm
import time
import multiprocessing
from sklearn import svm
from gensim.test.utils import get_tmpfile
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from scipy.sparse import csr_matrix as sparse
from os import listdir
from os.path import isfile, join

# Prepare data


In [2]:
# Load the review corpus and carve out a held-out ("blind") test split.
# NOTE(review): absolute path tied to one machine — consider a configurable
# data directory so the notebook runs elsewhere.
file_dir_our      = "/Users/igoradamski/Documents/cambridge/MLMI/nlp/coursework/nlp/data"
our_data      = DataHandler(file_dir_our)
our_data.readOurData()

# blind_test() returns two objects; their x_train / y_train attributes are
# re-wrapped below into fresh DataHandler instances.
train, test   = our_data.blind_test()

# Data used for cross-validation and for fitting the final models.
cv_train      = DataHandler()
cv_train(train.x_train, train.y_train)

# Held-out data, only used for the final evaluation and error analysis.
# Note that this rebinds the name `blind_test` to a DataHandler instance
# (distinct from the `our_data.blind_test` method called above).
blind_test = DataHandler()
blind_test(test.x_train, test.y_train)

100%|██████████| 2000/2000 [00:01<00:00, 1828.01it/s]


# Cross-validate several NB model configurations

In [12]:
# Mean round-robin cross-validation accuracy for six NB configurations.
accs = {}
n_folds = 10
cv_train.roundRobinSplit(n_folds)

# label -> keyword arguments handed to Metrics.roundRobinCV alongside NBModel
# (presumably forwarded to the NBModel constructor — confirm in Metrics).
# Every configuration additionally uses smoothing = 1.5.
nb_configs = {
    'N1': {'threshold': [0, 0], 'grams': ['uni', 'bi']},
    'N2': {'threshold': [0],    'grams': 'uni'},
    'N3': {'threshold': [4],    'grams': 'uni'},
    'N4': {'threshold': [0],    'grams': 'bi'},
    'N5': {'threshold': [8],    'grams': 'bi'},
    'N6': {'threshold': [4, 8], 'grams': ['uni', 'bi']},
}

for label, params in nb_configs.items():
    fold_scores = Metrics.roundRobinCV(cv_train, n_folds, NBModel, **params, smoothing = 1.5)
    accs[label] = np.mean(np.array(fold_scores))


100%|██████████| 10/10 [01:11<00:00,  7.16s/it]
100%|██████████| 10/10 [00:16<00:00,  1.63s/it]
100%|██████████| 10/10 [00:20<00:00,  2.04s/it]
100%|██████████| 10/10 [01:05<00:00,  6.53s/it]
100%|██████████| 10/10 [00:48<00:00,  4.83s/it]
100%|██████████| 10/10 [01:09<00:00,  7.00s/it]


In [13]:
# Show the mean cross-validation result stored for each NB configuration (N1-N6).
accs

{'N1': 0.8400000000000001,
 'N2': 0.8222222222222222,
 'N3': 0.8072222222222223,
 'N4': 0.8461111111111113,
 'N5': 0.635,
 'N6': 0.7472222222222222}

# Train the final models (M1–M5) on the full training split


In [3]:
# Fit the five models compared in the evaluation below, all on cv_train.

# M1: NB model on unigrams + bigrams (same settings as 'N1' in the
# cross-validation cell).
print('training M1...')
M1 = NBModel(threshold = [0,0], grams = ['uni', 'bi'], smoothing = 1.5)
M1.train(cv_train.x_data, cv_train.y_data)


# M2: linear-kernel SVM on BoW2Vec document vectors.
print('training M2...')
bow2vec_model = BoW2Vec([4,4], ['uni', 'bi'])
# The vocabulary is built from the training split only; the same fitted
# bow2vec_model is reused later to vectorise the blind test set.
bow2vec_model.getFullVocab(cv_train)

bow_vector_data = DataHandler()
bow_vector_data(bow2vec_model.text2vec(cv_train), cv_train.y_data)
# Store the training vectors as a SciPy CSR matrix (`sparse` is csr_matrix).
bow_vector_data.x_data = sparse(bow_vector_data.x_data)

M2 = MySVM(kernel = 'linear', decision_function_shape='ovr', break_ties = True, random_state = 1234)
M2.train(bow_vector_data.x_data, bow_vector_data.y_data)

# M3: RBF-kernel SVM on the same BoW2Vec vectors as M2.
print('training M3...')
M3 = MySVM(kernel = 'rbf', gamma = 'scale', decision_function_shape='ovr', break_ties = True, random_state = 1234)
M3.train(bow_vector_data.x_data, bow_vector_data.y_data)

# M4: linear-kernel SVM on vectors from a previously saved doc2vec model.
print('training M4...')
# NOTE(review): absolute path tied to one machine — consider a configurable
# model directory so the notebook runs elsewhere.
model_path = '/Users/igoradamski/Documents/cambridge/MLMI/nlp/coursework/nlp/doc2vec_models/'
model_name = 'model_dm=0,hs=1,min_count=10,vector_size=50,window=5.bin'

doc2vec_model = MyDoc2Vec()
doc2vec_model.load(model_path+model_name)

d2v_vector_data = DataHandler()
d2v_vector_data(DataHandler.applyDoc2Vec(cv_train.x_data, doc2vec_model.model), cv_train.y_data)

M4 = MySVM(kernel = 'linear', decision_function_shape='ovr', break_ties = True, random_state = 1234)
M4.train(d2v_vector_data.x_data, d2v_vector_data.y_data)

# M5: RBF-kernel SVM on the same doc2vec vectors as M4.
print('training M5...')
M5 = MySVM(kernel = 'rbf', gamma = 'scale', decision_function_shape='ovr', break_ties = True, random_state = 1234)
M5.train(d2v_vector_data.x_data, d2v_vector_data.y_data)

training M1...
training M2...


100%|██████████| 1800/1800 [00:04<00:00, 413.03it/s]


training M3...
training M4...
training M5...


# Evaluate accuracy on test set


In [5]:
# Evaluate M1-M5 on the blind test set and compare every pair of models with
# a sign test and a permutation test.

# Seed NumPy's global RNG before anything else runs in this cell
# (presumably consumed by the permutation test — confirm in Metrics).
np.random.seed(2313)

# Vectorise the blind test documents with the already-fitted doc2vec and
# BoW2Vec models.
d2v_vector_test = DataHandler()
d2v_vector_test(DataHandler.applyDoc2Vec(blind_test.x_data, doc2vec_model.model), blind_test.y_data)

bow_vector_test = DataHandler()
bow_vector_test(bow2vec_model.text2vec(blind_test), blind_test.y_data)

# Predictions: M1 takes the raw documents, M2/M3 the BoW2Vec vectors and
# M4/M5 the doc2vec vectors.
predictions_m1 = M1.predict(blind_test.x_data)
predictions_m2 = M2.predict(bow_vector_test.x_data)
predictions_m3 = M3.predict(bow_vector_test.x_data)
predictions_m4 = M4.predict(d2v_vector_test.x_data)
predictions_m5 = M5.predict(d2v_vector_test.x_data)

all_predictions = [predictions_m1, predictions_m2, predictions_m3, predictions_m4, predictions_m5]
n_models = len(all_predictions)

# Accuracy per model, keyed 'M1' ... 'M5'.
accuracy = {
    'M' + str(number): Metrics.getAccuracy(model_preds, blind_test.y_data)
    for number, model_preds in enumerate(all_predictions, start=1)
}

# Pairwise sign test. Only the upper triangle (diagonal included) is filled;
# entries below the diagonal stay 0.
sign_test_sgn = np.zeros((n_models, n_models))
for row in range(n_models):
    for col in range(row, n_models):
        sign_test_sgn[row, col] = Metrics.signTest(
            all_predictions[row], all_predictions[col], blind_test.y_data
        )

# Pairwise permutation test with the same upper-triangle layout; 5000 is the
# fourth argument of Metrics.permutationTest.
perm_test_sgn = np.zeros((n_models, n_models))
for row in range(n_models):
    for col in range(row, n_models):
        perm_test_sgn[row, col] = Metrics.permutationTest(
            all_predictions[row], all_predictions[col], blind_test.y_data, 5000
        )



100%|██████████| 200/200 [00:00<00:00, 463.93it/s]


In [9]:
# Sanity check: number of labels in the blind test split.
len(blind_test.y_data)

200

In [6]:
# Blind-test accuracy of each model, keyed 'M1' ... 'M5'.
accuracy

{'M1': 0.785, 'M2': 0.81, 'M3': 0.755, 'M4': 0.82, 'M5': 0.83}

In [7]:
# Sign-test results for each model pair: row i vs. column j, upper triangle
# only (entries below the diagonal were never computed and stay 0).
# NOTE(review): the diagonal compares a model with itself, yet the recorded
# values are ~0.113 rather than 1 — check how Metrics.signTest treats the
# all-ties case before quoting these numbers.
sign_test_sgn

array([[0.11269696, 0.10567072, 0.1030421 , 0.09957434, 0.09198772],
       [0.        , 0.11269696, 0.08330963, 0.11158115, 0.10829935],
       [0.        , 0.        , 0.11269696, 0.0739665 , 0.06437825],
       [0.        , 0.        , 0.        , 0.11269696, 0.11158115],
       [0.        , 0.        , 0.        , 0.        , 0.11269696]])

In [8]:
# Print arrays with 4 decimals and without scientific notation, then show the
# permutation-test results (upper triangle only; row i vs. column j).
np.set_printoptions(precision=4, suppress=True)
perm_test_sgn

array([[1.    , 0.5417, 0.4325, 0.3547, 0.203 ],
       [0.    , 1.    , 0.059 , 0.8838, 0.6573],
       [0.    , 0.    , 1.    , 0.0918, 0.0392],
       [0.    , 0.    , 0.    , 1.    , 0.7267],
       [0.    , 0.    , 0.    , 0.    , 1.    ]])

# Error analysis

In [59]:
# One 0/1 list per model: 1 where the model's prediction differs from the
# blind-test label at the same position, 0 where it matches.
errors_m1, errors_m2, errors_m3, errors_m4, errors_m5 = (
    [int(guess != blind_test.y_data[pos]) for pos, guess in enumerate(model_preds)]
    for model_preds in (
        predictions_m1,
        predictions_m2,
        predictions_m3,
        predictions_m4,
        predictions_m5,
    )
)

In [80]:
# Per-document error count over M1 and M2 only: 2 = both wrong, 1 = exactly
# one of them wrong, 0 = both correct.
np.array(errors_m1)+np.array(errors_m2)

array([0, 2, 0, 1, 1, 0, 2, 0, 0, 2, 1, 1, 2, 0, 0, 0, 1, 1, 0, 1, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 1,
       0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 1, 2, 0, 2, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 2, 2, 0, 0, 2, 0,
       0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 2, 0, 0, 2, 1, 0, 2, 0, 2, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 2, 0,
       1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0])

In [81]:
# Number of models (0-5) that misclassified each blind-test document:
# stack the five 0/1 error lists and add them up position by position.
comm_err = np.sum([errors_m1, errors_m2, errors_m3, errors_m4, errors_m5], axis=0)

In [84]:
# Text of every blind-test document that at least one model misclassified.
# comm_err[idx] is the number of models that got document idx wrong, so the
# document index has to come from enumerate(); iterating comm_err directly
# would use the error counts (0-5) as document indices and pick the wrong
# documents.
all_errors = [' '.join(str(elem) for elem in test.x_train[idx]) for idx, val in enumerate(comm_err) if val > 0]

In [102]:
# Documents misclassified by all five models (count == 5) and by exactly four
# models (count == 4), each rendered as "<label>:::<space-joined document>".
all_5_errors = [
    ':::'.join((str(test.y_train[doc_id]), ' '.join(str(item) for item in test.x_train[doc_id])))
    for doc_id, n_wrong in enumerate(comm_err)
    if n_wrong == 5
]
all_4_errors = [
    ':::'.join((str(test.y_train[doc_id]), ' '.join(str(item) for item in test.x_train[doc_id])))
    for doc_id, n_wrong in enumerate(comm_err)
    if n_wrong == 4
]

In [103]:
# Per-document count of models (out of five) whose prediction was wrong.
comm_err

array([0, 4, 0, 1, 1, 0, 4, 0, 0, 4, 2, 1, 5, 0, 1, 0, 1, 2, 0, 2, 4, 0,
       1, 4, 3, 0, 1, 0, 0, 0, 0, 0, 2, 2, 1, 0, 0, 2, 0, 0, 5, 0, 0, 1,
       0, 0, 0, 0, 5, 0, 0, 0, 0, 2, 1, 0, 0, 4, 3, 0, 2, 2, 1, 0, 1, 2,
       0, 0, 0, 0, 3, 0, 2, 0, 1, 2, 1, 0, 2, 0, 0, 2, 0, 0, 3, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 5, 2, 0, 0, 3, 5, 0, 0, 3, 0,
       0, 2, 0, 0, 0, 0, 0, 3, 0, 0, 2, 3, 0, 2, 0, 0, 4, 1, 2, 0, 0, 0,
       0, 0, 3, 0, 1, 5, 4, 1, 3, 0, 4, 2, 0, 0, 4, 0, 2, 0, 1, 0, 0, 0,
       0, 4, 0, 1, 2, 2, 0, 0, 0, 0, 1, 0, 2, 1, 0, 0, 1, 0, 0, 0, 3, 0,
       2, 0, 0, 5, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 1,
       0, 0])

In [107]:
# Inspect the documents that exactly four of the five models got wrong
# (each entry is "<label>:::<text>").
all_4_errors

['0:::best remembered for his understated performance as dr hannibal lecter in michael mann s forensics thriller manhunter scottish character actor brian cox brings something special to every movie he works on usually playing a bit role in some studio schlock he dies halfway through the long kiss goodnight he s only occasionally given something meaty and substantial to do if you want to see some brilliant acting check out his work as a dogged police inspector opposite frances mcdormand in ken loach s hidden agenda cox plays the role of big john harrigan in the disturbing new indie flick l i e which lotpicked up at sundance when other distributors were scared to budge big john feels the love that dares not speak its name but he expresses it through seeking out adolescents and bringing them back to his pad what bothered some audience members was the presentation of big john in an oddly empathetic light he s an even tempered funny robust old man who actually listens to the kids problems a

In [105]:
# Show the space-joined content of blind-test document 197 (hard-coded index).
' '.join(str(elem) for elem in test.x_train[197])

'in phantom menace the galaxy is divided into power groups whose interests will inevitably collide in later sequels there is an overarching galactic united nations type organization called the senate presided by a weak chancellor within the senate two camps are at odds a bickering isolationist alliance called the republic and their aggressive rival the trade federation preserving law and order are a council of jedi knights who are meanwhile searching for a prophesied chosen one of virgin birth manipulating events behind the scenes is a dangerous reemerging clan called the dark lords of sith so shadowy and secretive that they comprise a phantom menace jedi knight qui gon jinn liam neeson and his apprentice obi wan kenobi ewan mcgregor witness an invasion of teenage queen amidala s home planet naboo and befriend a gungan named jar jar ahmed best on the desert planet of tatooine the two jedi jar jar and amidala natalie portman attend a lengthy drag race involving the young boy anakin skyw

In [106]:
# Label stored for the same document (index 197).
test.y_train[197]

1