In [2]:
import pandas as pd
import numpy as np
from collections import Counter
import requests
import os

# LDA Classifier

In [3]:
def compute_priors(df, response):
    """Return the empirical class priors as ``{category: relative frequency}``.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data containing the ``response`` column.
    response : str
        Name of the label column.

    Returns
    -------
    dict
        One entry per distinct label; the values are the normalised value
        counts of the label column.
    """
    class_shares = df[response].value_counts(normalize=True)
    return dict(class_shares)

def compute_mu_vectors(df, response):
    """Return the per-class feature means as ``{category: {feature: mean}}``.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data: feature columns plus the ``response`` column.
    response : str
        Name of the label column; it is excluded from the means.

    Returns
    -------
    dict
        One entry per distinct label, each mapping a feature name to the mean
        of that feature over the rows carrying the label.
    """
    labels = df[response]
    features = df.drop(response, axis=1)
    return {
        category: dict(features[labels == category].mean())
        for category in labels.unique()
    }

def compute_inv_sigma(df, response):
    """Return the inverse of the feature covariance matrix as a NumPy array.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data: feature columns plus the ``response`` column.
    response : str
        Name of the label column; it is dropped before computing the covariance.

    Returns
    -------
    numpy.ndarray
        Square matrix with one row/column per feature.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the covariance matrix is singular (for example when a feature is
        constant).
    """
    X = df.drop(response, axis=1)
    # NOTE(review): the covariance is taken over all rows, not pooled within
    # each class as in textbook LDA — confirm this is intended.
    # `inv` was never imported in this notebook, so call it through numpy.
    return np.linalg.inv(X.cov().to_numpy())

def classify_obs(x_i, y, mu_vectors, priors, inv_sigma):
    """Return the label whose linear discriminant score is highest for ``x_i``.

    For every distinct label ``k`` in ``y`` the score is
    ``x_i' S mu_k - 0.5 * mu_k' S mu_k + log(prior_k)`` with ``S = inv_sigma``.
    When several labels tie, the one met first in ``y.unique()`` is returned.

    Parameters
    ----------
    x_i : numpy.ndarray
        Feature vector of one observation.
    y : pandas.Series
        Training labels; only the distinct values are used.
    mu_vectors : dict
        ``{label: {feature: mean}}``.
    priors : dict
        ``{label: prior probability}``.
    inv_sigma : array-like
        Inverse covariance matrix.
    """
    scores = {}
    for label in y.unique():
        centroid = pd.Series(mu_vectors[label]).to_numpy()
        linear_part = x_i.transpose().dot(inv_sigma).dot(centroid)
        quadratic_part = .5 * centroid.transpose().dot(inv_sigma).dot(centroid)
        scores[label] = linear_part - quadratic_part + np.log(priors[label])

    # max() keeps the first maximal key, matching a strict "greater than" scan.
    return max(scores, key=scores.get)

class LDA:
    """Minimal linear discriminant classifier built on the module-level helpers.

    ``fit`` stores the class priors, per-class mean vectors and the inverse
    covariance matrix; ``predict`` scores each row with ``classify_obs``.
    """

    def fit(self, X_train, y_train):
        """Estimate the model parameters.

        Parameters
        ----------
        X_train : pandas.DataFrame
            Feature matrix. It is copied, so the caller's frame is not modified.
        y_train : pandas.Series
            Labels; ``y_train.name`` is used as the temporary label column name.
        """
        df_train = X_train.copy()
        response = y_train.name
        df_train[response] = y_train
        self.y = df_train[response]
        self.priors = compute_priors(df_train, response)
        self.mu_vectors = compute_mu_vectors(df_train, response)
        self.inv_sigma = compute_inv_sigma(df_train, response)

    def predict(self, df_test):
        """Classify every row of ``df_test``.

        Rows are visited by position, so the frame may carry any index
        (filtered, shuffled, non-integer). The returned Series reuses
        ``df_test.index`` so predictions stay aligned with their rows.
        """
        predictions = [
            classify_obs(row, self.y, self.mu_vectors, self.priors, self.inv_sigma)
            for row in df_test.to_numpy()
        ]
        return pd.Series(predictions, index=df_test.index)
    

# Modeling

In [4]:
# Load the labelled training reviews; the "Unnamed: 0" column (presumably a
# row index saved with the CSV — confirm) is discarded.
training_df = pd.read_csv("Training_data.csv").drop(columns="Unnamed: 0")
training_df.head()

Unnamed: 0,File,Review,Label
0,2893_10.txt,Walt Disney's CINDERELLA takes a story everybo...,1
1,7944_9.txt,"Have you ever, or do you have, a pet who's bee...",1
2,11725_10.txt,"I suck at gratuitous Boob references, so i'm j...",1
3,1587_10.txt,"Does anyone know, where I can see or download ...",1
4,10297_8.txt,Well not actually. This movie is very entertai...,1


In [5]:
# Load the labelled test reviews; the "Unnamed: 0" column (presumably a row
# index saved with the CSV — confirm) is discarded.
testing_df = pd.read_csv("Test_data.csv").drop(columns="Unnamed: 0")
testing_df.head()

Unnamed: 0,File,Review,Label
0,2893_10.txt,"""Rush in Rio"" is, no doubt, one of the most ex...",1
1,8705_10.txt,I have seen a number of horror movies to know ...,1
2,11725_10.txt,I'm a fan of B grade 80s films in which the he...,1
3,9859_8.txt,"I think that Pierre Léaud, or his character, t...",1
4,12409_10.txt,This picture doesn't have any big explosions o...,1


In [6]:
# Stop words
# Build a set of English stop words for O(1) membership tests when filtering
# review tokens. Requires the NLTK "stopwords" corpus to be available locally.
# NOTE(review): the reviews have punctuation stripped before filtering, so any
# stop words that contain apostrophes would presumably never match — confirm.
from nltk.corpus import stopwords
en_stops = set(stopwords.words('english'))

In [7]:
# Positive semantic words
# Download the opinion lexicon and keep only the word entries. The file opens
# with a banner of ';'-prefixed comment lines; those are skipped by prefix
# instead of slicing off the length of a hard-coded copy of the banner, which
# silently corrupts the list if the banner ever changes by one character.
url = "https://raw.githubusercontent.com/jeffreybreen/twitter-sentiment-analysis-tutorial-201107/master/data/opinion-lexicon-English/positive-words.txt"
r = requests.get(url, timeout=30)
r.raise_for_status()  # fail loudly rather than parse an error page as words
pos_words = [
    line.strip()
    for line in r.text.splitlines()
    if line.strip() and not line.startswith(";")
]

In [8]:
# Negative semantic words
# Download the opinion lexicon and keep only the word entries. The file opens
# with a banner of ';'-prefixed comment lines; those are skipped by prefix
# instead of slicing off the length of a hard-coded copy of the banner, which
# silently corrupts the list if the banner ever changes by one character.
url = "https://raw.githubusercontent.com/jeffreybreen/twitter-sentiment-analysis-tutorial-201107/master/data/opinion-lexicon-English/negative-words.txt"
r = requests.get(url, timeout=30)
r.raise_for_status()  # fail loudly rather than parse an error page as words
neg_words = [
    line.strip()
    for line in r.text.splitlines()
    if line.strip() and not line.startswith(";")
]

Engineering the raw data

In [9]:
# Vectorizing reviews in training set
# Lower-case each review, delete every character that is neither a word
# character nor whitespace, then split on whitespace into a token list.
# regex=True is explicit because pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave all punctuation in place; the raw
# string avoids invalid-escape warnings for \w and \s.
words_train = (
    training_df.Review
    .str.lower()
    .str.replace(r"[^\w\s]", "", regex=True)
    .str.split()
)
# Per-review token counts.
bag_of_words_train = words_train.apply(Counter)

In [10]:
# Vectorizing reviews in test set
# Lower-case each review, delete every character that is neither a word
# character nor whitespace, then split on whitespace into a token list.
# regex=True is explicit because pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave all punctuation in place; the raw
# string avoids invalid-escape warnings for \w and \s.
words_test = (
    testing_df.Review
    .str.lower()
    .str.replace(r"[^\w\s]", "", regex=True)
    .str.split()
)
# Per-review token counts.
bag_of_words_test = words_test.apply(Counter)

In [11]:
# Removing stop words in training
# Each review keeps its tokens in their original order, minus anything in en_stops.
reviews_train = pd.Series(
    [[token for token in review if token not in en_stops] for review in words_train]
)

In [12]:
# Removing stop words in test
# Each review keeps its tokens in their original order, minus anything in en_stops.
reviews_test = pd.Series(
    [[token for token in review if token not in en_stops] for review in words_test]
)

In [13]:
# Counting positive and negative words in training set
# Each count is the number of DISTINCT lexicon words present in the review
# (a set intersection), not the number of occurrences.
pos_set = set(pos_words)
neg_set = set(neg_words)
posc_train = [len(pos_set & set(review)) for review in reviews_train]
negc_train = [len(neg_set & set(review)) for review in reviews_train]
    

In [14]:
# Counting positive and negative words in test set
# Each count is the number of DISTINCT lexicon words present in the review
# (a set intersection), not the number of occurrences.
pos_set = set(pos_words)
neg_set = set(neg_words)
posc_test = [len(pos_set & set(review)) for review in reviews_test]
negc_test = [len(neg_set & set(review)) for review in reviews_test]

In [23]:
# Training labels and design matrix: the two lexicon counts plus a constant
# "ones" column that acts as the bias term of the linear model.
y_train = training_df.Label
X_train = pd.DataFrame(
    {"Positive_counts": posc_train, "Negative_counts": negc_train}
).assign(ones=1)
X_train

Unnamed: 0,Positive_counts,Negative_counts,ones
0,23,6,1
1,13,3,1
2,8,3,1
3,2,2,1
4,7,1,1
...,...,...,...
24995,3,3,1
24996,2,13,1
24997,12,9,1
24998,3,9,1


In [24]:
# Test labels and design matrix: the two lexicon counts plus a constant
# "ones" column that acts as the bias term of the linear model.
y_test = testing_df.Label
X_test = pd.DataFrame(
    {"Positive_counts": posc_test, "Negative_counts": negc_test}
).assign(ones=1)
X_test

Unnamed: 0,Positive_counts,Negative_counts,ones
0,13,6,1
1,2,0,1
2,7,8,1
3,6,3,1
4,6,2,1
...,...,...,...
24995,7,4,1
24996,11,17,1
24997,4,9,1
24998,9,34,1


Fitting on training data

In [25]:
# Train the soft-margin SVM on the count features, starting from the feature
# means (bias weight 1).
# eps was written as `10^-3`; in Python `^` is bitwise XOR, so that expression
# is -9 and the `norm > eps` convergence test could never stop the loop.
# NOTE: soft_SVM_training is defined in a cell further down this notebook and
# must have been run before this cell.
weight = soft_SVM_training(
    X_train,
    y_train,
    C=25,
    w0=np.array([X_train.Positive_counts.mean(), X_train.Negative_counts.mean(), 1]),
    eps=1e-3,
    lr=0.05,
    n=100,
)

10
20
30
40
50
60
70
80
90
100


In [26]:
# Score the learned weights on the test features and labels.
# NOTE(review): Testing_soft_SVM is not defined in any visible cell of this
# notebook — confirm where it comes from before relying on Restart & Run All.
Testing_soft_SVM(X_test,y_test,weight)

{'accuracy': 0.63436, 'precision': 0.8860032176511147, 'recall': 0.3084}

## Adding Interactions

In [20]:
# Independent copies, so adding features for this experiment leaves the
# baseline frames and labels untouched.
X_train2, X_test2 = X_train.copy(), X_test.copy()
y_train2, y_test2 = y_train.copy(), y_test.copy()

In [21]:
# Add the positive x negative interaction feature and pin the column order so
# the bias column "ones" stays last.
X_train2 = X_train2.assign(
    Interaction_posc_negc=X_train2.Positive_counts*X_train2.Negative_counts
)[["Positive_counts","Negative_counts","Interaction_posc_negc","ones"]]
X_test2 = X_test2.assign(
    Interaction_posc_negc=X_test2.Positive_counts*X_test2.Negative_counts
)[["Positive_counts","Negative_counts","Interaction_posc_negc","ones"]]

In [22]:
# Train the soft-margin SVM on counts + interaction, starting from the feature
# means (bias weight 1).
# eps was written as `10^-3`; in Python `^` is bitwise XOR, so that expression
# is -9 and the `norm > eps` convergence test could never stop the loop.
weight2 = soft_SVM_training(
    X_train2,
    y_train2,
    C=25,
    w0=np.array([
        X_train2.Positive_counts.mean(),
        X_train2.Negative_counts.mean(),
        X_train2.Interaction_posc_negc.mean(),
        1,
    ]),
    eps=1e-3,
    lr=0.05,
    n=100,
)

10
20
30
40
50
60
70
80
90
100


In [23]:
# Score the interaction-feature weights on the matching test features.
# NOTE(review): Testing_soft_SVM is not defined in any visible cell of this
# notebook — confirm where it comes from.
Testing_soft_SVM(X_test2,y_test2,weight2)

{'accuracy': 0.57704, 'precision': 0.9070160608622148, 'recall': 0.17168}

# Squaring Terms

In [35]:
# Independent copies of the interaction-feature data for the squared-terms
# experiment.
X_train3, X_test3 = X_train2.copy(), X_test2.copy()
y_train3, y_test3 = y_train2.copy(), y_test2.copy()

In [36]:
# Add squared count features to both splits.
X_train3 = X_train3.assign(
    Positive_counts2=X_train3.Positive_counts**2,
    Negative_counts2=X_train3.Negative_counts**2,
)
X_test3 = X_test3.assign(
    Positive_counts2=X_test3.Positive_counts**2,
    Negative_counts2=X_test3.Negative_counts**2,
)
# Move the bias column back to the last position. The constant is broadcast to
# the frame's own length instead of a hard-coded 25000 rows, and no frame is
# modified in place.
X_train3 = X_train3.drop(columns="ones").assign(ones=1)
X_test3 = X_test3.drop(columns="ones").assign(ones=1)

In [38]:
# Display the test matrix to check the squared columns and that "ones" is last.
X_test3

Unnamed: 0,Positive_counts,Negative_counts,Interaction_posc_negc,Positive_counts2,Negative_counts2,ones
0,13,6,78,169,36,1
1,2,0,0,4,0,1
2,7,8,56,49,64,1
3,6,3,18,36,9,1
4,6,2,12,36,4,1
...,...,...,...,...,...,...
24995,7,4,28,49,16,1
24996,11,17,187,121,289,1
24997,4,9,36,16,81,1
24998,9,34,306,81,1156,1


In [39]:
# Train the soft-margin SVM on counts + interaction + squares, starting from
# the feature means (bias weight 1).
# eps was written as `10^-3`; in Python `^` is bitwise XOR, so that expression
# is -9 and the `norm > eps` convergence test could never stop the loop.
weight3 = soft_SVM_training(
    X_train3,
    y_train3,
    C=25,
    w0=np.array([
        X_train3.Positive_counts.mean(),
        X_train3.Negative_counts.mean(),
        X_train3.Interaction_posc_negc.mean(),
        X_train3.Positive_counts2.mean(),
        X_train3.Negative_counts2.mean(),
        1,
    ]),
    eps=1e-3,
    lr=0.05,
    n=100,
)

10
20
30
40
50
60
70
80
90
100


In [40]:
# Score the squared-feature weights on the matching test features.
# NOTE(review): Testing_soft_SVM is not defined in any visible cell of this
# notebook — confirm where it comes from.
Testing_soft_SVM(X_test3,y_test3,weight3)

{'accuracy': 0.68476, 'precision': 0.851254752851711, 'recall': 0.44776}

# Cubing Terms

In [45]:
# Independent copies of the squared-feature data for the cubed-terms
# experiment.
X_train4, X_test4 = X_train3.copy(), X_test3.copy()
y_train4, y_test4 = y_train3.copy(), y_test3.copy()

In [46]:
# Add cubed count features to both splits.
X_train4 = X_train4.assign(
    Positive_counts3=X_train4.Positive_counts**3,
    Negative_counts3=X_train4.Negative_counts**3,
)
X_test4 = X_test4.assign(
    Positive_counts3=X_test4.Positive_counts**3,
    Negative_counts3=X_test4.Negative_counts**3,
)
# Move the bias column back to the last position. The constant is broadcast to
# the frame's own length instead of a hard-coded 25000 rows, and no frame is
# modified in place.
X_train4 = X_train4.drop(columns="ones").assign(ones=1)
X_test4 = X_test4.drop(columns="ones").assign(ones=1)

In [47]:
# Display the test matrix to check the cubed columns and that "ones" is last.
X_test4

Unnamed: 0,Positive_counts,Negative_counts,Interaction_posc_negc,Positive_counts2,Negative_counts2,Positive_counts3,Negative_counts3,ones
0,13,6,78,169,36,2197,216,1
1,2,0,0,4,0,8,0,1
2,7,8,56,49,64,343,512,1
3,6,3,18,36,9,216,27,1
4,6,2,12,36,4,216,8,1
...,...,...,...,...,...,...,...,...
24995,7,4,28,49,16,343,64,1
24996,11,17,187,121,289,1331,4913,1
24997,4,9,36,16,81,64,729,1
24998,9,34,306,81,1156,729,39304,1


In [48]:
# Train the soft-margin SVM on counts + interaction + squares + cubes, starting
# from the feature means (bias weight 1).
# eps was written as `10^-3`; in Python `^` is bitwise XOR, so that expression
# is -9 and the `norm > eps` convergence test could never stop the loop.
weight4 = soft_SVM_training(
    X_train4,
    y_train4,
    C=25,
    w0=np.array([
        X_train4.Positive_counts.mean(),
        X_train4.Negative_counts.mean(),
        X_train4.Interaction_posc_negc.mean(),
        X_train4.Positive_counts2.mean(),
        X_train4.Negative_counts2.mean(),
        X_train4.Positive_counts3.mean(),
        X_train4.Negative_counts3.mean(),
        1,
    ]),
    eps=1e-3,
    lr=0.05,
    n=100,
)

10
20
30
40
50
60
70
80
90
100


In [49]:
# Score the cubed-feature weights on the matching test features.
# NOTE(review): Testing_soft_SVM is not defined in any visible cell of this
# notebook — confirm where it comes from.
Testing_soft_SVM(X_test4,y_test4,weight4)

{'accuracy': 0.71404, 'precision': 0.6728917609046849, 'recall': 0.83304}

In [None]:
#pd.DataFrame(list(reviews.apply(Counter)))

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Scatter of the two lexicon-count features, coloured by label. The very low
# alpha lets dense regions of the overlapping points show through.
plt.scatter(X_train.Positive_counts, X_train.Negative_counts, c=y_train, cmap=plt.cm.Set1,
            edgecolor='k',alpha=0.05)
# Label the axes with the plotted feature names so the figure stands alone.
plt.xlabel('Positive_counts')
plt.ylabel('Negative_counts')
plt.show()

# Building the Soft-SVM

In [None]:
def gradient_comp(X,y,C,w0):
    """Return the (sub)gradient of the soft-margin SVM objective at ``w0``.

    The value is ``w0 + C * sum(-y_i * x_i)`` taken over the rows whose margin
    ``y_i * (x_i . w0)`` is at most 1.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix, one row per observation.
    y : pandas.Series
        Labels sharing X's index (assumes values in {-1, +1} — TODO confirm).
    C : float
        Weight of the hinge-loss term.
    w0 : array-like
        Current weight vector, one entry per column of X.

    Returns
    -------
    pandas.Series
        Gradient indexed by X's column names.
    """
    margins = X.dot(w0) * y
    violators = X[margins <= 1]
    violator_labels = y[violators.index]
    # Scale every violating row by its own label (row-wise broadcast), negate.
    summands = -1 * violators.mul(violator_labels.to_numpy(), axis=0)
    return w0 + C*summands.sum()

In [None]:
def soft_SVM_training(X,y,C,w0,eps,lr,n):
    """Fit soft-margin SVM weights by fixed-step gradient descent.

    Parameters
    ----------
    X : training data
    y : labels
    C : penalty on margin violations (how "hard" the margin is)
    w0 : initial weights vector (np.array)
    eps : convergence criterion; iteration stops once the Euclidean norm of the
        change between successive weight vectors is no longer greater than eps
    lr : learning rate
    n : iteration cap; the loop stops after n updates (the first step taken
        before the loop is not counted, and a cap <= 0 never triggers)

    Returns
    -------
    The last weight vector computed. The iteration number is printed after
    every update inside the loop.
    """
    def descend(weights):
        # One gradient step from `weights`.
        return weights - lr*gradient_comp(X,y,C,weights)

    previous, current = w0, descend(w0)
    completed = 0
    while np.linalg.norm(previous-current) > eps:
        previous, current = current, descend(current)
        completed += 1
        print(completed)
        if completed == n:
            break
    return current
        
    

In [None]:
# Retrain on the count features with a softer penalty (C=1) for plotting.
# eps was written as `10^-3`; in Python `^` is bitwise XOR, so that expression
# is -9 and the `norm > eps` convergence test could never stop the loop.
weight = soft_SVM_training(
    X_train,
    y_train,
    C=1,
    w0=np.array([X_train.Positive_counts.mean(), X_train.Negative_counts.mean(), 1]),
    eps=1e-3,
    lr=0.05,
    n=100,
)

In [None]:
def plot_data_with_classifier(X,y,w):
    """Scatter the two count features and overlay the linear decision boundary.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain "Positive_counts" and "Negative_counts".
    y : array-like
        Labels, used to colour the points.
    w : array-like
        Weights read by position: w[0] multiplies Positive_counts, w[1]
        multiplies Negative_counts, w[2] is the bias.
    """
    # Positional access on a plain array; a label-indexed Series does not
    # reliably support w[0]-style integer lookups.
    coef = np.asarray(w, dtype=float)
    pos_counts = X.loc[:, "Positive_counts"]
    neg_counts = X.loc[:, "Negative_counts"]

    # plot points
    plt.clf()
    plt.scatter(pos_counts, neg_counts, c=y, cmap=plt.cm.Set1, edgecolor='k')

    # draw hyperplane: span the observed x range (previously the whole
    # Negative_counts column was passed as the linspace stop value)
    xrange = np.linspace(np.min(pos_counts), np.max(pos_counts))
    yrange = -(coef[0]*xrange+coef[2])/coef[1]
    plt.plot(xrange,yrange,'red')

    plt.show()

In [None]:
# Plot the training points with the decision boundary given by `weight`.
plot_data_with_classifier(X_train,y_train,weight)

In [None]:
# Display the learned weight vector.
weight

# -------------------------------------------------------------------------------------------

In [None]:
# For every TRAINING review, collect the lexicon words it contains.
# The loop previously iterated `reviews`, which is only created further down
# from the test set; the training section must read `reviews_train`.
pos_set = set(pos_words)
neg_set = set(neg_words)
# sorted() fixes the word order (set iteration order can change between
# sessions), so the resulting feature columns are reproducible.
sentiment_words = pd.Series([
    sorted(pos_set.intersection(review)) + sorted(neg_set.intersection(review))
    for review in reviews_train
])

In [None]:
# One row per review, one column per sentiment word; words absent from a
# review become 0 instead of NaN.
tf = pd.DataFrame([Counter(found) for found in sentiment_words]).fillna(0)
tf

In [None]:
# Document frequency: number of reviews in which each word appears.
docFreq = tf.gt(0).sum(axis=0)
# Inverse document frequency: log(number of reviews / document frequency).
idf = np.log(len(tf) / docFreq)
# Weight every term-frequency column by its word's idf.
tf_idf = tf.mul(idf, axis="columns")

In [None]:
# Display the TF-IDF matrix.
tf_idf

In [None]:
# TF-IDF design matrix plus a constant bias column. assign() returns a new
# frame, so tf_idf itself is not silently given a "ones" column through
# aliasing. Note that this rebinds X_train/y_train used by the count-feature
# experiments.
X_train = tf_idf.assign(ones=1)
y_train = training_df.Label

In [None]:
# Train the soft-margin SVM on the TF-IDF features from an all-zero start.
# eps was written as `10^-3`; in Python `^` is bitwise XOR, so that expression
# is -9 and the `norm > eps` convergence test could never stop the loop.
weight = soft_SVM_training(
    X_train,
    y_train,
    C=1,
    w0=np.zeros(X_train.shape[1]),
    eps=1e-3,
    lr=0.01,
    n=100,
)

In [None]:
# Persist the trained weights to disk.
# NOTE(review): only the values are written; the weights are only meaningful
# together with the column order of X_train — confirm that is stored too.
np.save("tf_idf_model.npy",weight)

In [None]:
# Reload the saved weights from disk.
tf_idf_model = np.load("tf_idf_model.npy")

TESTING

In [None]:
# Column labels of the training design matrix (the vocabulary plus the "ones"
# bias column); used below to restrict the test reviews to the same words.
all_words = X_train.columns

In [None]:
# Load the labelled test reviews; the "Unnamed: 0" column (presumably a row
# index saved with the CSV — confirm) is discarded.
test_df = pd.read_csv("Test_data.csv").drop(columns="Unnamed: 0")
test_df.head()

In [None]:
# Lower-case each test review, delete every character that is neither a word
# character nor whitespace, then split on whitespace into a token list.
# regex=True is explicit because pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave all punctuation in place; the raw
# string avoids invalid-escape warnings for \w and \s.
words = (
    test_df.Review
    .str.lower()
    .str.replace(r"[^\w\s]", "", regex=True)
    .str.split()
)
# Per-review token counts.
bag_of_words = words.apply(Counter)

In [None]:
# Build the set of English stop words used to filter the test-review tokens.
# Requires the NLTK "stopwords" corpus to be available locally.
from nltk.corpus import stopwords
en_stops = set(stopwords.words('english'))

In [None]:
# Download the positive opinion lexicon and keep only the word entries. The
# file opens with a banner of ';'-prefixed comment lines; those are skipped by
# prefix instead of slicing off the length of a hard-coded copy of the banner,
# which silently corrupts the list if the banner ever changes by one character.
url = "https://raw.githubusercontent.com/jeffreybreen/twitter-sentiment-analysis-tutorial-201107/master/data/opinion-lexicon-English/positive-words.txt"
r = requests.get(url, timeout=30)
r.raise_for_status()  # fail loudly rather than parse an error page as words
pos_words = [
    line.strip()
    for line in r.text.splitlines()
    if line.strip() and not line.startswith(";")
]

In [None]:
# Download the negative opinion lexicon and keep only the word entries. The
# file opens with a banner of ';'-prefixed comment lines; those are skipped by
# prefix instead of slicing off the length of a hard-coded copy of the banner,
# which silently corrupts the list if the banner ever changes by one character.
url = "https://raw.githubusercontent.com/jeffreybreen/twitter-sentiment-analysis-tutorial-201107/master/data/opinion-lexicon-English/negative-words.txt"
r = requests.get(url, timeout=30)
r.raise_for_status()  # fail loudly rather than parse an error page as words
neg_words = [
    line.strip()
    for line in r.text.splitlines()
    if line.strip() and not line.startswith(";")
]

In [None]:
# Drop stop words from every test review, keeping the remaining tokens in order.
reviews = pd.Series(
    [[token for token in review if token not in en_stops] for review in words]
)

In [None]:
# Restrict each test review to the vocabulary of the training design matrix.
# "ones" is the name of the bias column, not a vocabulary word; leaving it in
# would turn the ordinary word "ones" in a review into a spurious feature.
sents = set(all_words) - {"ones"}
# sorted() fixes the word order (set iteration order can change between
# sessions), so the resulting feature columns are reproducible.
sentiment_words = pd.Series([sorted(sents.intersection(review)) for review in reviews])

In [None]:
# One row per test review, one column per vocabulary word found; words absent
# from a review become 0 instead of NaN.
tf = pd.DataFrame([Counter(found) for found in sentiment_words]).fillna(0)
tf

In [None]:
# Weight the test term frequencies with the idf estimated on the TRAINING
# reviews (the `idf` Series computed in the training section above).
# Re-estimating document frequencies on the test set would put the test
# features on a different scale from the one the weights were fitted on.
# `idf` is deliberately not reassigned here so the cell can be re-run safely;
# any column without a training idf becomes NaN.
tf_idf = tf * idf.reindex(tf.columns)

In [None]:
# Display the test-set TF-IDF matrix.
tf_idf

In [None]:
# Build the test design matrix with exactly the training columns, in the
# training order: the weight vector is positional, so a missing or reordered
# column would pair weights with the wrong words. Vocabulary words that never
# occur in the test reviews are filled with 0, and the bias column is set to 1.
X_test = tf_idf.reindex(columns=all_words, fill_value=0).assign(ones=1)
y_test = test_df.Label