In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import requests
import os

# Building the Soft-SVM

In [2]:
def gradient_comp(X,y,C,w0):
    summands_of_X = X[X.dot(w0) * y <= 1]
    summands_of_y = y[summands_of_X.index]
    Y = []
    for i in range(summands_of_X.shape[1]):
        Y.append(list(summands_of_y))
    summands = -1*np.multiply(pd.DataFrame(Y).values.T,summands_of_X)
    gradient = w0 + C*summands.sum()
    return gradient

In [3]:
def stochastic_gradient_comp(X,y,C,w0,batch):
    Xsamp = X.sample(batch)
    ysamp = y[Xsamp.index]
    summands_of_X = Xsamp[Xsamp.dot(w0) * ysamp <= 1]
    summands_of_y = ysamp[summands_of_X.index]
    Y = []
    for i in range(summands_of_X.shape[1]):
        Y.append(list(summands_of_y))
    summands = -1*np.multiply(pd.DataFrame(Y).values.T,summands_of_X)
    gradient = w0 + C*(1/batch)*summands.sum()
    return gradient

In [4]:
def soft_SVM_training(X,y,C,w0,eps,lr,n,batch):
    # X is training data
    # y is labels
    # C is penalty of how hard to be
    # w is initial weights vector (np.array)
    # eps is convergence criterion
    # lr is learning rate
    
    # Computing gradient of L
    i = 0
    gradw = stochastic_gradient_comp(X,y,C,w0,batch)
    w1 = w0 - lr*gradw
    while(np.linalg.norm(w0-w1) > eps):
        w0 = w1
        w1 = w0 - lr*gradient_comp(X,y,C,w0)
        i = i + 1
        if(i%10==0):
            print(i)
        if i == n:
            break
    print(np.linalg.norm(w0-w1))
    return w1

In [30]:
def Testing_soft_SVM(X,y,w):
    # X is the X_test set (in the form of the training set). There should
    # be a column of ones at the same spot as there is in the training.
    # y is the y_test actual values
    # w is the out put weights from the soft_SVM_training
    vals = X.values.dot(w)
    predictions = pd.Series((vals/abs(vals)).astype(int))
    TP = sum(y[(predictions[(predictions > 0)].index)] > 0)
    TN = sum(y[predictions[(predictions < 0)].index] < 0)
    FP = sum(y[predictions[(predictions > 0)].index] < 0)
    FN = sum(y[predictions[(predictions < 0)].index] > 0)
    print("TP",TP)
    print("TN",TN)
    print("FP",FP)
    print("FN",FN)
    accuracy = (TP+TN)/(TP+TN+FP+FN)
    precision = TP/(TP+FP)
    recall = TP/(TP+FN)
    f1 = 2*(precision*recall)/(precision+recall)
    return {"accuracy":accuracy,"precision":precision,"recall":recall, "f1":f1}
    #return {"accuracy":accuracy,"recall":recall}
    

# Modeling

Raw Data

In [6]:
training_df = pd.read_csv("Training_data.csv").drop("Unnamed: 0",axis=1)
training_df.head()

Unnamed: 0,File,Review,Label
0,2893_10.txt,Walt Disney's CINDERELLA takes a story everybo...,1
1,7944_9.txt,"Have you ever, or do you have, a pet who's bee...",1
2,11725_10.txt,"I suck at gratuitous Boob references, so i'm j...",1
3,1587_10.txt,"Does anyone know, where I can see or download ...",1
4,10297_8.txt,Well not actually. This movie is very entertai...,1


In [7]:
testing_df = pd.read_csv("Test_data.csv").drop("Unnamed: 0",axis=1)
testing_df.head()

Unnamed: 0,File,Review,Label
0,2893_10.txt,"""Rush in Rio"" is, no doubt, one of the most ex...",1
1,8705_10.txt,I have seen a number of horror movies to know ...,1
2,11725_10.txt,I'm a fan of B grade 80s films in which the he...,1
3,9859_8.txt,"I think that Pierre Léaud, or his character, t...",1
4,12409_10.txt,This picture doesn't have any big explosions o...,1


In [8]:
# Stop words
from nltk.corpus import stopwords
en_stops = set(stopwords.words('english'))

In [9]:
# Positive semantic words
url = "https://raw.githubusercontent.com/jeffreybreen/twitter-sentiment-analysis-tutorial-201107/master/data/opinion-lexicon-English/positive-words.txt"
r = requests.get(url)
s=';;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;\n; \n; Opinion Lexicon: Positive\n;\n; This file contains a list of POSITIVE opinion words (or sentiment words).\n;\n; This file and the papers can all be downloaded from \n;    http://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html\n;\n; If you use this list, please cite one of the following two papers:\n;\n;   Minqing Hu and Bing Liu. "Mining and Summarizing Customer Reviews." \n;       Proceedings of the ACM SIGKDD International Conference on Knowledge \n;       Discovery and Data Mining (KDD-2004), Aug 22-25, 2004, Seattle, \n;       Washington, USA, \n;   Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing \n;       and Comparing Opinions on the Web." Proceedings of the 14th \n;       International World Wide Web conference (WWW-2005), May 10-14, \n;       2005, Chiba, Japan.\n;\n; Notes: \n;    1. The appearance of an opinion word in a sentence does not necessarily  \n;       mean that the sentence expresses a positive or negative opinion. \n;       See the paper below:\n;\n;       Bing Liu. "Sentiment Analysis and Subjectivity." An chapter in \n;          Handbook of Natural Language Processing, Second Edition, \n;          (editors: N. Indurkhya and F. J. Damerau), 2010.\n;\n;    2. You will notice many misspelled words in the list. They are not \n;       mistakes. They are included as these misspelled words appear \n;       frequently in social media content. \n;\n;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;'
pos_words = r.text[len(s)+2:]
pos_words = pos_words.split("\n")

In [10]:
# Negative semantic words
url = "https://raw.githubusercontent.com/jeffreybreen/twitter-sentiment-analysis-tutorial-201107/master/data/opinion-lexicon-English/negative-words.txt"
r = requests.get(url)
s=';;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;\n; \n; Opinion Lexicon: Negative\n;\n; This file contains a list of NEGATIVE opinion words (or sentiment words).\n;\n; This file and the papers can all be downloaded from \n;    http://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html\n;\n; If you use this list, please cite one of the following two papers:\n;\n;   Minqing Hu and Bing Liu. "Mining and Summarizing Customer Reviews." \n;       Proceedings of the ACM SIGKDD International Conference on Knowledge \n;       Discovery and Data Mining (KDD-2004), Aug 22-25, 2004, Seattle, \n;       Washington, USA, \n;   Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing \n;       and Comparing Opinions on the Web." Proceedings of the 14th \n;       International World Wide Web conference (WWW-2005), May 10-14, \n;       2005, Chiba, Japan.\n;\n; Notes: \n;    1. The appearance of an opinion word in a sentence does not necessarily  \n;       mean that the sentence expresses a positive or negative opinion. \n;       See the paper below:\n;\n;       Bing Liu. "Sentiment Analysis and Subjectivity." An chapter in \n;          Handbook of Natural Language Processing, Second Edition, \n;          (editors: N. Indurkhya and F. J. Damerau), 2010.\n;\n;    2. You will notice many misspelled words in the list. They are not \n;       mistakes. They are included as these misspelled words appear \n;       frequently in social media content. \n;\n;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;\n\n'
neg_words = r.text[len(s):]
neg_words = neg_words.split("\n")

Engineering the raw data

In [11]:
# Vectorizing reviews in training set
words_train = (
    training_df.Review.
    str.lower().
    str.replace("[^\w\s]","").
    str.split()
)
bag_of_words_train = words_train.apply(Counter)

In [12]:
# Vectorizing reviews in test set
words_test = (
    testing_df.Review.
    str.lower().
    str.replace("[^\w\s]","").
    str.split()
)
bag_of_words_test = words_test.apply(Counter)

In [13]:
# Removing stop words in training
reviews_train = []
for r in words_train:
    good = []
    for w in r:
        if w not in en_stops:
            good.append(w)
    reviews_train.append(good)
reviews_train = pd.Series(reviews_train)

In [14]:
# Removing stop words in test
reviews_test = []
for r in words_test:
    good = []
    for w in r:
        if w not in en_stops:
            good.append(w)
    reviews_test.append(good)
reviews_test = pd.Series(reviews_test)

In [15]:
# Counting positive and negative words in training set
pos_set = set(pos_words)
neg_set = set(neg_words)
posc_train = []
negc_train = []
for r in reviews_train:
    count_pos = len(pos_set.intersection(set(r)))
    count_neg = len(neg_set.intersection(set(r)))
    posc_train.append(count_pos)
    negc_train.append(count_neg)
    

In [16]:
# Counting positive and negative words in test set
pos_set = set(pos_words)
neg_set = set(neg_words)
posc_test = []
negc_test = []
for r in reviews_test:
    count_pos = len(pos_set.intersection(set(r)))
    count_neg = len(neg_set.intersection(set(r)))
    posc_test.append(count_pos)
    negc_test.append(count_neg)

In [17]:
y_train = training_df.Label
X_train = pd.DataFrame({"Positive_counts":posc_train, "Negative_counts":negc_train, "ones":[1]*len(y_train)})
X_train

Unnamed: 0,Positive_counts,Negative_counts,ones
0,23,6,1
1,13,3,1
2,8,3,1
3,2,2,1
4,7,1,1
...,...,...,...
24995,3,3,1
24996,2,13,1
24997,12,9,1
24998,3,9,1


In [18]:
y_test = testing_df.Label
X_test = pd.DataFrame({"Positive_counts":posc_test, "Negative_counts":negc_test, "ones":[1]*len(y_test)})
X_test

Unnamed: 0,Positive_counts,Negative_counts,ones
0,13,6,1
1,2,0,1
2,7,8,1
3,6,3,1
4,6,2,1
...,...,...,...
24995,7,4,1
24996,11,17,1
24997,4,9,1
24998,9,34,1


Scaling Data

In [63]:
X_train_normalized = X_train.divide(np.sqrt((X_train**2).sum()), axis=1)
X_test_normalized = X_test.divide(np.sqrt((X_test**2).sum()), axis=1)


In [64]:
weight_norm1 = soft_SVM_training(X_train_normalized,
                  y_train,
                  25,
                  np.array(X_train_normalized.mean()),
                  10^-3,
                  0.05,
                  100)

10
20
30
40
50
60
70
80
90
100


In [65]:
Testing_soft_SVM(X_test_normalized,y_test,weight_norm1)

{'accuracy': 0.73236, 'precision': 0.7295865939451427, 'recall': 0.7384}

Fitting on training data

In [24]:
weight = soft_SVM_training(X_train,
                  y_train,
                  25,
                  np.array(X_train.mean().T),
                  10^-3,
                  0.05,
                  1000,
                  128)

10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490
500
510
520
530
540
550
560
570
580
590
600
610
620
630
640
650
660
670
680
690
700
710
720
730
740
750
760
770
780
790
800
810
820
830
840
850
860
870
880
890
900
910
920
930
940
950
960
970
980
990
1000
146266.4211133175


In [25]:
Testing_soft_SVM(X_test,y_test,weight)

{'accuracy': 0.59468,
 'precision': 0.5542914812606082,
 'recall': 0.96664,
 'f1': 0.7045686463162192}

## Adding Interactions

In [23]:
X_train2 = X_train.copy()
X_test2 = X_test.copy()
y_train2 = y_train.copy()
y_test2 = y_test.copy()

In [24]:
X_train2["Interaction_posc_negc"] = X_train2.Positive_counts*X_train2.Negative_counts
X_test2["Interaction_posc_negc"] = X_test2.Positive_counts*X_test2.Negative_counts
X_train2 = X_train2[["Positive_counts","Negative_counts","Interaction_posc_negc","ones"]]
X_test2 = X_test2[["Positive_counts","Negative_counts","Interaction_posc_negc","ones"]]

In [22]:
weight2 = soft_SVM_training(X_train2,
                  y_train2,
                  25,
                  np.array([X_train2.Positive_counts.mean(),X_train2.Negative_counts.mean(),X_train2.Interaction_posc_negc.mean(),1]),
                  10^-3,
                  0.05,
                  100)

10
20
30
40
50
60
70
80
90
100


In [23]:
Testing_soft_SVM(X_test2,y_test2,weight2)

{'accuracy': 0.57704, 'precision': 0.9070160608622148, 'recall': 0.17168}

Normalized now

In [35]:
X_train2_normalized = X_train2.divide(np.sqrt((X_train2**2).sum()), axis=1)
X_test2_normalized = X_test2.divide(np.sqrt((X_test2**2).sum()), axis=1)

In [36]:
weight_norm2 = soft_SVM_training(X_train2_normalized,
                  y_train,
                  25,
                  np.array(X_train2_normalized.mean()),
                  10^-3,
                  0.05,
                  100)

10
20
30
40
50
60
70
80
90
100


In [37]:
Testing_soft_SVM(X_test2_normalized,y_test,weight_norm2)

{'accuracy': 0.73424, 'precision': 0.7319391634980988, 'recall': 0.7392}

# Squaring Terms

In [25]:
X_train3 = X_train2.copy()
X_test3 = X_test2.copy()
y_train3 = y_train2.copy()
y_test3 = y_test2.copy()

In [26]:
X_train3["Positive_counts2"] = X_train3.Positive_counts**2
X_train3["Negative_counts2"] = X_train3.Negative_counts**2
X_test3["Positive_counts2"] = X_test3.Positive_counts**2
X_test3["Negative_counts2"] = X_test3.Negative_counts**2
X_train3.drop("ones",axis=1,inplace=True)
X_train3["ones"] = [1]*25000
X_test3.drop("ones",axis=1,inplace=True)
X_test3["ones"] = [1]*25000

In [27]:
X_test3

Unnamed: 0,Positive_counts,Negative_counts,Interaction_posc_negc,Positive_counts2,Negative_counts2,ones
0,13,6,78,169,36,1
1,2,0,0,4,0,1
2,7,8,56,49,64,1
3,6,3,18,36,9,1
4,6,2,12,36,4,1
...,...,...,...,...,...,...
24995,7,4,28,49,16,1
24996,11,17,187,121,289,1
24997,4,9,36,16,81,1
24998,9,34,306,81,1156,1


In [28]:
weight3 = soft_SVM_training(X_train3,
                  y_train3,
                  25,
                  np.array([X_train3.Positive_counts.mean(),
                            X_train3.Negative_counts.mean(),
                            X_train3.Interaction_posc_negc.mean(),
                            X_train3.Positive_counts2.mean(),
                            X_train3.Negative_counts2.mean(),
                            1]),
                  10^-3,
                  0.05,
                  1000,
                128)

10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490
500
510
520
530
540
550
560
570
580
590
600
610
620
630
640
650
660
670
680
690
700
710
720
730
740
750
760
770
780
790
800
810
820
830
840
850
860
870
880
890
900
910
920
930
940
950
960
970
980
990
1000
2085603.508153232


In [31]:
Testing_soft_SVM(X_test3,y_test3,weight3)

TP 11980
TN 3348
FP 9152
FN 520


{'accuracy': 0.61312,
 'precision': 0.5669127389740678,
 'recall': 0.9584,
 'f1': 0.7124167459562323}

In [None]:
# BEFORE THE STOCHASTIC
# acc=0.68, pre = 0.851, recall=0.44

Normalized now

In [41]:
X_train3_normalized = X_train3.divide(np.sqrt((X_train3**2).sum()), axis=1)
X_test3_normalized = X_test3.divide(np.sqrt((X_test3**2).sum()), axis=1)

In [42]:
weight_norm3 = soft_SVM_training(X_train3_normalized,
                  y_train,
                  25,
                  np.array(X_train3_normalized.mean()),
                  10^-3,
                  0.05,
                  100)

10
20
30
40
50
60
70
80
90
100


In [43]:
Testing_soft_SVM(X_test3_normalized,y_test,weight_norm3)

{'accuracy': 0.7342, 'precision': 0.732877257179222, 'recall': 0.73704}

# Cubing Terms

In [31]:
X_train4 = X_train3.copy()
X_test4 = X_test3.copy()
y_train4 = y_train3.copy()
y_test4 = y_test3.copy()

In [32]:
X_train4["Positive_counts3"] = X_train4.Positive_counts**3
X_train4["Negative_counts3"] = X_train4.Negative_counts**3
X_test4["Positive_counts3"] = X_test4.Positive_counts**3
X_test4["Negative_counts3"] = X_test4.Negative_counts**3
X_train4.drop("ones",axis=1,inplace=True)
X_train4["ones"] = [1]*25000
X_test4.drop("ones",axis=1,inplace=True)
X_test4["ones"] = [1]*25000

In [33]:
X_test4

Unnamed: 0,Positive_counts,Negative_counts,Interaction_posc_negc,Positive_counts2,Negative_counts2,Positive_counts3,Negative_counts3,ones
0,13,6,78,169,36,2197,216,1
1,2,0,0,4,0,8,0,1
2,7,8,56,49,64,343,512,1
3,6,3,18,36,9,216,27,1
4,6,2,12,36,4,216,8,1
...,...,...,...,...,...,...,...,...
24995,7,4,28,49,16,343,64,1
24996,11,17,187,121,289,1331,4913,1
24997,4,9,36,16,81,64,729,1
24998,9,34,306,81,1156,729,39304,1


In [34]:
weight4 = soft_SVM_training(X_train4,
                  y_train4,
                  25,
                  np.array(X_train4.mean().T),
                  10^-3,
                  0.05,
                  1000,
                    128)

10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490
500
510
520
530
540
550
560
570
580
590
600
610
620
630
640
650
660
670
680
690
700
710
720
730
740
750
760
770
780
790
800
810
820
830
840
850
860
870
880
890
900
910
920
930
940
950
960
970
980
990
1000
48961289.68442898


In [39]:
Testing_soft_SVM(X_test4,y_test4,weight4)

TP 0
TN 12500
FP 0
FN 12500


{'accuracy': 0.5, 'recall': 0.0}

Normalized now

In [47]:
X_train4_normalized = X_train4.divide(np.sqrt((X_train4**2).sum()), axis=1)
X_test4_normalized = X_test4.divide(np.sqrt((X_test4**2).sum()), axis=1)

In [66]:
weight_norm4 = soft_SVM_training(X_train4_normalized,
                  y_train,
                  1,
                  np.array(X_train4_normalized.mean()),
                  10^-3,
                  0.05,
                  100)

10
20
30
40
50
60
70
80
90
100


In [67]:
Testing_soft_SVM(X_test4_normalized,y_test,weight_norm4)

{'accuracy': 0.72424, 'precision': 0.6925931015528377, 'recall': 0.8064}

# Trying subsets

In [24]:
X_train_sq = X_train4[["Positive_counts2","Negative_counts2","ones"]]
X_test_sq = X_test4[["Positive_counts2","Negative_counts2","ones"]]

In [25]:
weight_sq = soft_SVM_training(X_train_sq,
                  y_train,
                  5,
                  np.array(X_train_sq.mean()),
                  10^-3,
                  0.05,
                  1000)

10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490
500
510
520
530
540
550
560
570
580
590
600
610
620
630
640
650
660
670
680
690
700
710
720
730
740
750
760
770
780
790
800
810
820
830
840
850
860
870
880
890
900
910
920
930
940
950
960
970
980
990
1000
164661.74809410312


In [26]:
Testing_soft_SVM(X_test_sq,y_test,weight_sq)

{'accuracy': 0.72648, 'precision': 0.7787514769594328, 'recall': 0.63272}

In [27]:
np.corrcoef(X_train4.Positive_counts, X_train4.Positive_counts2)

array([[1.        , 0.91935693],
       [0.91935693, 1.        ]])

In [None]:
#pd.DataFrame(list(reviews.apply(Counter)))

In [None]:
import matplotlib.pyplot as plt

In [None]:
plt.scatter(X_train.Positive_counts, X_train.Negative_counts, c=y_train, cmap=plt.cm.Set1,
            edgecolor='k',alpha=0.05)
plt.xlabel('x')
plt.ylabel('y')
plt.show()

# Building the Soft-SVM

In [None]:
def gradient_comp(X,y,C,w0):
    summands_of_X = X[X.dot(w0) * y <= 1]
    summands_of_y = y[summands_of_X.index]
    Y = []
    for i in range(summands_of_X.shape[1]):
        Y.append(list(summands_of_y))
    summands = -1*np.multiply(pd.DataFrame(Y).values.T,summands_of_X)
    gradient = w0 + C*summands.sum()
    return gradient

In [None]:
def soft_SVM_training(X,y,C,w0,eps,lr,n):
    # X is training data
    # y is labels
    # C is penalty of how hard to be
    # w is initial weights vector (np.array)
    # eps is convergence criterion
    # lr is learning rate
    
    # Computing gradient of L
    i = 0
    gradw = gradient_comp(X,y,C,w0)
    w1 = w0 - lr*gradw
    while(np.linalg.norm(w0-w1) > eps):
        w0 = w1
        w1 = w0 - lr*gradient_comp(X,y,C,w0)
        i = i + 1
        print(i)
        if i == n:
            break
    return w1
        
    

In [None]:
weight = soft_SVM_training(X_train,
                  y_train,
                  1,
                  np.array([X_train.Positive_counts.mean(),X_train.Negative_counts.mean(),1]),
                  10^-3,
                  0.05,
                  100)

In [None]:
def plot_data_with_classifier(X,y,w):
    # plot points
    plt.clf()
    plt.scatter(X.loc[:, "Positive_counts"], X.loc[:, "Negative_counts"], c=y, cmap=plt.cm.Set1, edgecolor='k')

    # draw hyperplane
    xrange = np.linspace(np.min(X.loc[:, "Positive_counts"]),X.loc[:, "Negative_counts"])
    yrange = -(w[0]*xrange+w[2])/w[1]
    plt.plot(xrange,yrange,'red')

    plt.show()

In [None]:
plot_data_with_classifier(X_train,y_train,weight)

In [None]:
weight

# -------------------------------------------------------------------------------------------

In [None]:
pos_set = set(pos_words)
neg_set = set(neg_words)
sentiment_words = []
for r in reviews:
    positives = list(pos_set.intersection(set(r)))
    negatives = list(neg_set.intersection(set(r)))
    sentiment_words.append(positives+negatives)
sentiment_words = pd.Series(sentiment_words)

In [None]:
tf = pd.DataFrame(list(sentiment_words.apply(Counter)))
tf = tf.fillna(0)
tf

In [None]:
docFreq = (tf > 0).sum(axis=0)
idf = np.log(len(tf) / docFreq)
tf_idf = tf*idf

In [None]:
tf_idf

In [None]:
X_train = tf_idf
X_train["ones"] = [1]*len(tf_idf)
y_train = training_df.Label

In [None]:
weight = soft_SVM_training(X_train,
                  y_train,
                  1,
                  np.array([0]*X_train.shape[1]),
                  10^-3,
                  0.01,
                  100)

In [None]:
np.save("tf_idf_model.npy",weight)

In [None]:
tf_idf_model = np.load("tf_idf_model.npy")

TESTING

In [None]:
all_words = X_train.columns

In [None]:
test_df = pd.read_csv("Test_data.csv").drop("Unnamed: 0",axis=1)
test_df.head()

In [None]:
words = (
    test_df.Review.
    str.lower().
    str.replace("[^\w\s]","").
    str.split()
)
bag_of_words = words.apply(Counter)

In [None]:
from nltk.corpus import stopwords
en_stops = set(stopwords.words('english'))

In [None]:
url = "https://raw.githubusercontent.com/jeffreybreen/twitter-sentiment-analysis-tutorial-201107/master/data/opinion-lexicon-English/positive-words.txt"
r = requests.get(url)
s=';;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;\n; \n; Opinion Lexicon: Positive\n;\n; This file contains a list of POSITIVE opinion words (or sentiment words).\n;\n; This file and the papers can all be downloaded from \n;    http://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html\n;\n; If you use this list, please cite one of the following two papers:\n;\n;   Minqing Hu and Bing Liu. "Mining and Summarizing Customer Reviews." \n;       Proceedings of the ACM SIGKDD International Conference on Knowledge \n;       Discovery and Data Mining (KDD-2004), Aug 22-25, 2004, Seattle, \n;       Washington, USA, \n;   Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing \n;       and Comparing Opinions on the Web." Proceedings of the 14th \n;       International World Wide Web conference (WWW-2005), May 10-14, \n;       2005, Chiba, Japan.\n;\n; Notes: \n;    1. The appearance of an opinion word in a sentence does not necessarily  \n;       mean that the sentence expresses a positive or negative opinion. \n;       See the paper below:\n;\n;       Bing Liu. "Sentiment Analysis and Subjectivity." An chapter in \n;          Handbook of Natural Language Processing, Second Edition, \n;          (editors: N. Indurkhya and F. J. Damerau), 2010.\n;\n;    2. You will notice many misspelled words in the list. They are not \n;       mistakes. They are included as these misspelled words appear \n;       frequently in social media content. \n;\n;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;'
pos_words = r.text[len(s)+2:]
pos_words = pos_words.split("\n")

In [None]:
url = "https://raw.githubusercontent.com/jeffreybreen/twitter-sentiment-analysis-tutorial-201107/master/data/opinion-lexicon-English/negative-words.txt"
r = requests.get(url)
s=';;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;\n; \n; Opinion Lexicon: Negative\n;\n; This file contains a list of NEGATIVE opinion words (or sentiment words).\n;\n; This file and the papers can all be downloaded from \n;    http://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html\n;\n; If you use this list, please cite one of the following two papers:\n;\n;   Minqing Hu and Bing Liu. "Mining and Summarizing Customer Reviews." \n;       Proceedings of the ACM SIGKDD International Conference on Knowledge \n;       Discovery and Data Mining (KDD-2004), Aug 22-25, 2004, Seattle, \n;       Washington, USA, \n;   Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing \n;       and Comparing Opinions on the Web." Proceedings of the 14th \n;       International World Wide Web conference (WWW-2005), May 10-14, \n;       2005, Chiba, Japan.\n;\n; Notes: \n;    1. The appearance of an opinion word in a sentence does not necessarily  \n;       mean that the sentence expresses a positive or negative opinion. \n;       See the paper below:\n;\n;       Bing Liu. "Sentiment Analysis and Subjectivity." An chapter in \n;          Handbook of Natural Language Processing, Second Edition, \n;          (editors: N. Indurkhya and F. J. Damerau), 2010.\n;\n;    2. You will notice many misspelled words in the list. They are not \n;       mistakes. They are included as these misspelled words appear \n;       frequently in social media content. \n;\n;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;\n\n'
neg_words = r.text[len(s):]
neg_words = neg_words.split("\n")

In [None]:
reviews = []
for r in words:
    good = []
    for w in r:
        if w not in en_stops:
            good.append(w)
    reviews.append(good)
reviews = pd.Series(reviews)

In [None]:
sents = set(all_words)
sentiment_words = []
for r in reviews:
    sentiment_words.append(list(sents.intersection(set(r))))
sentiment_words = pd.Series(sentiment_words)

In [None]:
tf = pd.DataFrame(list(sentiment_words.apply(Counter)))
tf = tf.fillna(0)
tf

In [None]:
docFreq = (tf > 0).sum(axis=0)
idf = np.log(len(tf) / docFreq)
tf_idf = tf*idf

In [None]:
tf_idf

In [None]:
X_test = tf_idf
X_test["ones"] = [1]*len(tf_idf)
y_test = test_df.Label