In [2]:
import sentiment as sentimentinterface
import classify
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
plt.switch_backend('agg')
import matplotlib.ticker as ticker
%matplotlib inline

import importlib

In [3]:
importlib.reload(sentimentinterface)
print("Reading data")
tarfname = "data/sentiment.tar.gz"
sentiment = sentimentinterface.read_data(tarfname)

Reading data
-- train data
sentiment/train.tsv
4582
-- dev data
sentiment/dev.tsv
458
-- transforming data and labels


In [117]:
sentiment.stop_words = sentimentinterface.generate_stop_words(sentiment, diff = 0.4)

from sklearn.feature_extraction.text import CountVectorizer

sentiment.cv = CountVectorizer(min_df = 3)
sentiment.cv.fit_transform(sentiment.train_data)
sentiment.mindf_stop_words = sentiment.cv.stop_words_
sentiment.cv = CountVectorizer(max_df = 0.2)
sentiment.cv.fit_transform(sentiment.train_data)
sentiment.maxdf_stop_words = sentiment.cv.stop_words_
sentiment.cv = CountVectorizer()
sentiment.cv.fit_transform(sentiment.train_data)
sentiment.training_set_vocabulary = sentiment.cv.vocabulary_

sentimentinterface.vectorize_data(sentiment, stop_words = sentiment.stop_words, max_df = 0.2, min_df = 3)
cls = classify.train_classifier(sentiment.trainX, sentiment.trainy, C = 3.7)

classify.evaluate(sentiment.devX, sentiment.devy, cls, 'dev')
# print("\nReading unlabeled data")
# unlabeled = sentimentinterface.read_unlabeled(tarfname, sentiment)
# print("Writing predictions to a file")
# sentimentinterface.write_pred_kaggle_file(unlabeled, cls, "data/sentiment-pred.csv", sentiment)

  Accuracy on dev is: 0.8187772925764192


0.8187772925764192

### Now cls is the optimized model

In [13]:
sentence = 'i hate this place, it tastes like minwoo'

In [14]:
def vectorize_sentence(sentence, count_vect = sentiment.count_vect):
    return count_vect.transform([sentence])

In [15]:
veccc = vectorize_sentence(sentence)

In [16]:
print(veccc)

  (0, 850)	1.0


In [17]:
sentiment.count_vect.get_feature_names()[850]

'hate'

In [19]:
def predict(sentence, cls = cls):
    sentence_vect = vectorize_sentence(sentence)
    result = cls.predict(sentence_vect)
    if result[0] == 0:
        print("Prediction: NEGATIVE")
    else:
        print("Prediction: POSITIVE")
#     return result

In [41]:
sentence = "Went last night for the first time with my boyfriend. Let me start off by saying I'm vegetarian, but my boyfriend is not. I ordered the chicken v mushroom, it"
print(predict(sentence))

[0]


### Now explain why it predicted 0

#### clean function

In [134]:
def clean(s):
    from string import punctuation
    s_new = []
    s_ignored = []
    res = []
    for c in s:
        if c not in punctuation:
            s_new.append(c.lower())
        else:
            s_new.append(' ')
            
    s = ''.join(c for c in s_new)
    #s = [''.join(c for c in s if c not in punctuation)][0]

    l = s.split()

    for w in l:
        if w in sentiment.count_vect.vocabulary_:
            res.append(w)
        else:
            s_ignored.append(w)
#     l = [w for w in l if w in sentiment.count_vect.vocabulary_]

    return res, s_ignored

#### find stop_words

In [120]:
def find_stop_words(s_ignored):
#     sentence_vect = clean(sentence)
    unseen = []
    maxdf = []
    mindf = []
    oliver_algorithm = []
    for w in s_ignored:
        if w not in sentiment.training_set_vocabulary:
            unseen.append(w)
        if w in sentiment.maxdf_stop_words:
            maxdf.append(w)
        if w in sentiment.mindf_stop_words:
            mindf.append(w)
        if w in sentiment.stop_words:
            oliver_algorithm.append(w)
            
    print("Words being ignored due to not appearing in training set are: ")
    if len(unseen) == 0:
        print("None\n")
    else:
        print(unseen)
        print('')
        
    print("Words being ignored due to mindf (unfrequent in corpus) are: ")
    if len(mindf) == 0:
        print("None\n")
    else:
        print(mindf)
        print('')
        
    print("Words being ignored due to maxdf (too frequent in corpus) are: ")
    if len(maxdf) == 0:
        print("None\n")
    else:
        print(maxdf)
        print('')
        
    print("Words being ignored due to our algorithm are: ")
    if len(oliver_algorithm) == 0:
        print("None\n")
    else:
        print(oliver_algorithm)
        print('')
#     return res


#### find coef

#### Explain the coef

In [24]:
p_dict = {}
n_dict = {}
sentences = sentiment.count_vect.inverse_transform(sentiment.trainX)
for counter in range(0, len(sentiment.train_labels)):
    if sentiment.train_labels[counter] == "POSITIVE":
        for w in sentences[counter]:
            if w in p_dict:
                p_dict[w] += 1
            else:
                p_dict[w] = 1
    else:
        for w in sentences[counter]:
            if w in n_dict:
                n_dict[w] += 1
            else:
                n_dict[w] = 1

In [138]:
def find_coef(sentence_vect, tfidf_vect, stop_words = stop_words, cls = cls, sentiment = sentiment):
    import pandas as pd
#     sentence_vect = clean(sentence)
    word_list = []
    coef_list = []
    num_p_list = []
    num_n_list = []
    tfidf_list = []
    count_list = []
    contribution_list = []
    for word in sentence_vect:
        if word in sentiment.count_vect.vocabulary_:
#             print(word,"\'s coef:\n", cls.coef_[0][sentiment.count_vect.vocabulary_[word]])
            word_list.append(word)
            coef = cls.coef_[0][sentiment.count_vect.vocabulary_[word]]
            coef_list.append(coef)
            tfidf = tfidf_vect.toarray()[0][sentiment.count_vect.vocabulary_[word]]
            tfidf_list.append(tfidf)
            contribution_list.append(coef*tfidf)
            vec = sentiment.cv.transform([sentence])
            count_list.append(vec.toarray()[0][sentiment.training_set_vocabulary[word]])
            if word in p_dict:
                num_p = p_dict[word]
            else:
                num_p = 0
            if word in n_dict:
                num_n = n_dict[word]
            else:
                num_n = 0
#             print("Number of ",word,"in POSITIVE reviews: ",num_p,"\tNumber of ",word,"in NEGATIVE reviews: ",num_n,"\n")
            num_p_list.append(num_p)
            num_n_list.append(num_n)
    dic = {'Feature':word_list, 'Coef':coef_list, 'in POSITIVE':num_p_list,
          'in NEGATIVE':num_n_list, 'tfidf val':tfidf_list, 'Original Count':count_list,
          'Contribution':contribution_list}
    df = pd.DataFrame(dic)
    df = df[['Feature','Coef','Original Count','tfidf val','Contribution','in POSITIVE','in NEGATIVE']]
#     print(df)
    return df

In [161]:
def color_negative_red(val):
    """
    Takes a scalar and returns a string with
    the css property `'color: red'` for negative
    strings, black otherwise.
    """
    try:
        color = 'red' if float(val) < 0 else 'black'
    except ValueError:
        color = 'black'

    return 'color: %s' % color

In [162]:
def analysis(sentence):
    sentence_vect, s_ignored = clean(sentence)
    tfidf_vect = sentiment.count_vect.transform([sentence])
    find_stop_words(s_ignored)
    print("Remaining words in the vec:")
    print(sentence_vect)
    df = find_coef(sentence_vect, tfidf_vect)
    df = df.style.applymap(color_negative_red)
    return df

### Usage

In [127]:
sentence = sentiment.train_data[199]

In [128]:
sentence

"Add me to the disappointed by Optical Experts list. And I'm also not happy that many of their reviews aren't showing up. People should be warned about this business. Like"

In [129]:
sentiment.trainy[199]

0

In [130]:
predict(sentence)

Prediction: NEGATIVE


In [163]:
df = analysis(sentence)
df

Words being ignored due to not appearing in training set are: 
['i', 'm', 't']

Words being ignored due to mindf (unfrequent in corpus) are: 
['optical', 'experts']

Words being ignored due to maxdf (too frequent in corpus) are: 
['to', 'the', 'and', 'of', 'this']

Words being ignored due to our algorithm are: 
['add', 'me', 'to', 'the', 'by', 'optical', 'list', 'and', 'that', 'many', 'of', 'their', 'aren', 'showing', 'up', 'people', 'about', 'this', 'business', 'like']

Remaining words in the vec:
['disappointed', 'also', 'not', 'happy', 'reviews', 'should', 'be', 'warned']


Unnamed: 0,Feature,Coef,Original Count,tfidf val,Contribution,in POSITIVE,in NEGATIVE
0,disappointed,-3.11596,1,0.353444,-1.10132,15,60
1,also,0.748521,1,0.325341,0.243524,72,41
2,not,-3.32279,1,0.199847,-0.664049,201,495
3,happy,1.92795,1,0.353444,0.681423,53,22
4,reviews,-0.801296,1,0.32295,-0.258779,37,80
5,should,-1.59571,1,0.365355,-0.583,13,50
6,be,-1.4173,1,0.23612,-0.334653,153,259
7,warned,0.000443161,1,0.557527,0.000247074,1,2


### Testing Space

In [108]:
sentiment.cv = CountVectorizer(min_df = 3)

In [106]:
len(sentiment.cv.vocabulary_)

9882

In [107]:
sentiment.cv.stop_words_

set()

In [152]:
import pandas as pd
import numpy as np

np.random.seed(24)
df = pd.DataFrame({'A': np.linspace(1, 10, 10)})
df = pd.concat([df, pd.DataFrame(np.random.randn(10, 4), columns=list('BCDE'))],
               axis=1)
df.iloc[0, 2] = np.nan
df.style

Unnamed: 0,A,B,C,D,E
0,1,1.32921,,-0.31628,-0.99081
1,2,-1.07082,-1.43871,0.564417,0.295722
2,3,-1.6264,0.219565,0.678805,1.88927
3,4,0.961538,0.104011,-0.481165,0.850229
4,5,1.45342,1.05774,0.165562,0.515018
5,6,-1.33694,0.562861,1.39285,-0.063328
6,7,0.121668,1.2076,-0.00204021,1.6278
7,8,0.354493,1.03753,-0.385684,0.519818
8,9,1.68658,-1.32596,1.42898,-2.08935
9,10,-0.12982,0.631523,-0.586538,0.29072


In [154]:
s = df.style.applymap(color_negative_red)
s

ValueError: ('cannot convert float NaN to integer', 'occurred at index C')

<pandas.io.formats.style.Styler at 0x7f9cdae58a90>