In [1]:
import sentiment as sentimentinterface
import classify
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
plt.switch_backend('agg')
import matplotlib.ticker as ticker
%matplotlib inline

import importlib

In [2]:
# Reload the local module so that edits made to it on disk take effect
# without restarting the kernel.
importlib.reload(sentimentinterface)
print("Reading data")
tarfname = "data/sentiment.tar.gz"
# `sentiment` is the data object returned by read_data (not the module, which
# is imported as `sentimentinterface`). Later cells read trainX/trainy,
# devX/devy and count_vect from it.
sentiment = sentimentinterface.read_data(tarfname)

Reading data
-- train data
sentiment/train.tsv
4582
-- dev data
sentiment/dev.tsv
458
-- transforming data and labels


In [3]:
# Hyperparameters in this cell (diff, max_df, min_df, C) are hard-coded,
# hand-picked values. TODO(review): record how they were selected.
stop_words = sentimentinterface.generate_stop_words(sentiment, diff = 0.4)
# Presumably (re)fits sentiment.count_vect and rebuilds trainX/devX in place,
# since those attributes are read right below; confirm in the sentiment module.
sentimentinterface.vectorize_data(sentiment, stop_words = stop_words, max_df = 0.2, min_df = 3)
# `cls` is the trained classifier used by every helper further down.
cls = classify.train_classifier(sentiment.trainX, sentiment.trainy, C = 3.7)

# Report accuracy on the held-out dev split.
classify.evaluate(sentiment.devX, sentiment.devy, cls, 'dev')
print("\nReading unlabeled data")
unlabeled = sentimentinterface.read_unlabeled(tarfname, sentiment)
print("Writing predictions to a file")
# Side effect: writes the predictions for the unlabeled set to data/sentiment-pred.csv.
sentimentinterface.write_pred_kaggle_file(unlabeled, cls, "data/sentiment-pred.csv", sentiment)

  Accuracy on dev is: 0.8187772925764192

Reading unlabeled data
sentiment/unlabeled.tsv
(91524, 2123)
Writing predictions to a file


### Now `cls` is the optimized model

In [8]:
def vectorize_sentence(sentence, count_vect = None):
    """Transform a single raw sentence with the fitted vectorizer.

    Parameters
    ----------
    sentence : str
        Raw text to vectorize.
    count_vect : object with a ``transform`` method, optional
        Vectorizer to use. When omitted, the current ``sentiment.count_vect``
        is looked up at call time.

    Returns
    -------
    The result of ``count_vect.transform([sentence])`` (a one-document batch).
    """
    # Resolve the default lazily. A default written as `sentiment.count_vect`
    # in the signature is evaluated once, when this cell runs, so re-running
    # the vectorization cell afterwards would leave this helper silently
    # using the previous vectorizer.
    if count_vect is None:
        count_vect = sentiment.count_vect
    return count_vect.transform([sentence])

In [9]:
def predict(sentence, cls = cls):
    """Predict the label of one raw sentence.

    The sentence is vectorized with ``vectorize_sentence`` (using that
    helper's default vectorizer) and passed to ``cls.predict``.

    Note: the default for ``cls`` is captured when this cell is executed, so
    re-run this cell after retraining the classifier.

    Returns whatever ``cls.predict`` returns for the one-row input.
    """
    return cls.predict(vectorize_sentence(sentence))

In [41]:
# Example review that the rest of the notebook explains; note that the text
# stops mid-sentence (", it").
sentence = "Went last night for the first time with my boyfriend. Let me start off by saying I'm vegetarian, but my boyfriend is not. I ordered the chicken v mushroom, it"
print(predict(sentence))

[0]


### Now explain why it predicted 0

#### clean function

In [70]:
def clean(s, vocabulary=None):
    """Tokenize a sentence and keep only the tokens known to the vectorizer.

    Every punctuation character is replaced by a space, every other character
    is lower-cased, and the result is split on whitespace. Tokens are then
    partitioned into those present in ``vocabulary`` (kept, in order, with
    duplicates) and those absent (reported as ignored).

    Parameters
    ----------
    s : str
        Raw sentence.
    vocabulary : container supporting ``in``, optional
        Known tokens. Defaults to ``sentiment.count_vect.vocabulary_``,
        looked up at call time.

    Returns
    -------
    list of str
        The kept tokens.

    Side effects
    ------------
    Prints the ignored tokens and the kept tokens.

    NOTE(review): a token can be missing from the vocabulary for reasons other
    than never occurring in the training set (the vectorizer was configured
    with stop_words / max_df / min_df), so the first printed message may be
    too narrow. Also, this tokenization is hand-rolled and may not match the
    vectorizer's own tokenizer exactly; confirm against the sentiment module.
    """
    # `punctuation` was never imported anywhere in the notebook, so this cell
    # only worked with leftover kernel state. Import it here so the function
    # is self-contained. Assumes the intended set is string.punctuation, which
    # is consistent with the recorded output ("I'm" -> 'i', 'm').
    from string import punctuation

    if vocabulary is None:
        vocabulary = sentiment.count_vect.vocabulary_

    # Punctuation becomes a separator rather than being deleted, so that
    # "word,word" still splits into two tokens.
    normalized = ''.join(' ' if c in punctuation else c.lower() for c in s)

    res = []
    s_ignored = []
    for w in normalized.split():
        if w in vocabulary:
            res.append(w)
        else:
            s_ignored.append(w)

    print("Words being ignored due to not appearing in training set are:")
    if len(s_ignored) == 0:
        print("None\n")
    else:
        print(s_ignored)
        print('')
    print("Remaining words are: ")
    if len(res) == 0:
        print("None\n")
    else:
        print(res)
        print('')
    return res

In [71]:
# Despite its name, `sentence_vect` holds the list of kept tokens returned by
# clean(), not a feature vector.
sentence_vect = clean(sentence)

Words being ignored due to not appearing in training set are:
['last', 'night', 'for', 'the', 'first', 'time', 'with', 'my', 'boyfriend', 'me', 'start', 'off', 'by', 'i', 'm', 'but', 'my', 'boyfriend', 'is', 'i', 'the', 'chicken', 'v', 'it']

Remaining words are: 
['went', 'let', 'saying', 'vegetarian', 'not', 'ordered', 'mushroom']



#### find stop_words

In [72]:
def find_stop_words(sentence, stop_words = stop_words):
    """Print which of the sentence's kept tokens are stop words.

    The sentence is first passed through ``clean`` (which prints its own
    report and returns only tokens found in the vectorizer vocabulary); the
    surviving tokens that also appear in ``stop_words`` are then printed.
    Nothing is returned.

    NOTE(review): only tokens that are already in the vocabulary are checked.
    If the vectorizer excludes stop words from its vocabulary (not visible in
    this notebook), this will always print "None"; verify whether the raw
    tokens should be checked instead.
    """
    kept_tokens = clean(sentence)
    hits = [token for token in kept_tokens if token in stop_words]

    print("Words being ignored due to stop words are: ")
    if hits:
        print(hits)
        print('')
    else:
        print("None\n")


In [73]:
# Prints the clean() report again, followed by the stop-word report; the call
# itself returns None.
find_stop_words(sentence)

Words being ignored due to not appearing in training set are:
['last', 'night', 'for', 'the', 'first', 'time', 'with', 'my', 'boyfriend', 'me', 'start', 'off', 'by', 'i', 'm', 'but', 'my', 'boyfriend', 'is', 'i', 'the', 'chicken', 'v', 'it']

Remaining words are: 
['went', 'let', 'saying', 'vegetarian', 'not', 'ordered', 'mushroom']

Words being ignored due to stop words are: 
None



#### find coef

In [74]:
def find_coef(sentence, stop_words = stop_words, cls = cls):
    """Print the classifier coefficient of every kept token in the sentence.

    The sentence is tokenized with ``clean`` (which prints its own report).
    For each returned token that is in ``sentiment.count_vect.vocabulary_``,
    the entry of ``cls.coef_[0]`` at that token's vocabulary index is printed.

    ``stop_words`` is accepted but not used by this function. Nothing is
    returned.
    """
    for word in clean(sentence):
        vocab = sentiment.count_vect.vocabulary_
        if word not in vocab:
            continue
        column = vocab[word]
        weight = cls.coef_[0][column]
        print(word,"\'s coef:\t\t", weight)

In [75]:
# Prints the clean() report, then one coefficient line per kept token.
find_coef(sentence)

Words being ignored due to not appearing in training set are:
['last', 'night', 'for', 'the', 'first', 'time', 'with', 'my', 'boyfriend', 'me', 'start', 'off', 'by', 'i', 'm', 'but', 'my', 'boyfriend', 'is', 'i', 'the', 'chicken', 'v', 'it']

Remaining words are: 
['went', 'let', 'saying', 'vegetarian', 'not', 'ordered', 'mushroom']

went 's coef:		 -1.251740858407901
let 's coef:		 -0.7935660220282503
saying 's coef:		 -1.3105183912332536
vegetarian 's coef:		 1.4565436858852365
not 's coef:		 -3.322794263373309
ordered 's coef:		 -1.3488052235512404
mushroom 's coef:		 1.7986278279867243
