In [1]:
import numpy as np
import pandas as pd
import spacy
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold,cross_val_score
from gensim.models import Word2Vec

In [2]:
# Read the training and evaluation splits from the working directory; the cells
# below use their `text` and `sentiment` columns.
train, test = pd.read_csv('sentiment-train.csv'), pd.read_csv('sentiment-test.csv')

In [3]:
# part 1
# Bag-of-words counts (English stop words removed, vocabulary capped at 1000
# terms) classified with multinomial naive Bayes.
cv = CountVectorizer(max_features=1000, stop_words='english')
train_cvtf = cv.fit_transform(train.text)  # vocabulary is fitted on the training split
test_cvtf = cv.transform(test.text)        # the test split reuses that vocabulary
mnb = MultinomialNB().fit(train_cvtf, train.sentiment)
pred = mnb.predict(test_cvtf)
cm = confusion_matrix(y_pred=pred, y_true=test.sentiment)
# Accuracy = the two diagonal confusion-matrix cells over the number of test rows
# (this indexing assumes a two-class problem).
print('Part 1 Accuracy:', (cm[0, 0] + cm[1, 1]) / len(test))

Part 1 Accuracy: 0.7827298050139275


In [4]:
# part 2
# Same naive Bayes classifier as part 1, but on TF-IDF weighted features
# (English stop words removed, vocabulary capped at 1000 terms).
v = TfidfVectorizer(max_features=1000, stop_words='english')
train_vtf = v.fit_transform(train.text)  # vocabulary and IDF weights come from the training split
test_vtf = v.transform(test.text)
mnb = MultinomialNB().fit(train_vtf, train.sentiment)
pred = mnb.predict(test_vtf)
cm = confusion_matrix(y_pred=pred, y_true=test.sentiment)
# Accuracy = the two diagonal confusion-matrix cells over the number of test rows.
print('Part 2 Accuracy:', (cm[0, 0] + cm[1, 1]) / len(test))

Part 2 Accuracy: 0.7688022284122563


In [5]:
# part 3
# Logistic regression on the count features (train_cvtf / test_cvtf) built in part 1.
lg = LogisticRegression().fit(train_cvtf, train.sentiment)
pred = lg.predict(test_cvtf)
cm = confusion_matrix(y_pred=pred, y_true=test.sentiment)
# Accuracy = the two diagonal confusion-matrix cells over the number of test rows.
print('Part 3 Accuracy:', (cm[0, 0] + cm[1, 1]) / len(test))

Part 3 Accuracy: 0.766016713091922


In [6]:
# part 4
# Logistic regression on the TF-IDF features (train_vtf / test_vtf) built in part 2.
lg = LogisticRegression().fit(train_vtf, train.sentiment)
pred = lg.predict(test_vtf)
cm = confusion_matrix(y_pred=pred, y_true=test.sentiment)
# Accuracy = the two diagonal confusion-matrix cells over the number of test rows.
print('Part 4 Accuracy:', (cm[0, 0] + cm[1, 1]) / len(test))

Part 4 Accuracy: 0.7688022284122563


In [7]:
# part 5a
# 5-fold stratified cross-validation on the training split, comparing TF-IDF
# vocabulary caps of 1000, 2000, 3000 and 4000 terms for multinomial naive Bayes.
# NOTE(review): `cv` held the CountVectorizer from part 1 and is rebound to the
# fold splitter here.
# NOTE(review): the vectorizer is fitted on the whole training split before the
# folds are cut, so every validation fold also contributes to the vocabulary and
# IDF weights - confirm that is acceptable for this comparison.
mnb = MultinomialNB()
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
print('Part 5a')
for i in (1, 2, 3, 4):
    v = TfidfVectorizer(max_features=i * 1000, stop_words='english')
    train_vtf = v.fit_transform(train.text)
    scores = cross_val_score(mnb, train_vtf, train.sentiment, cv=cv, scoring='accuracy')
    # `%d000` renders i as thousands, e.g. i=2 -> "2000".
    print('Avg Accuracy for max %d000 features is' % i, sum(scores) / len(scores))

Part 5a
Avg Accuracy for max 1000 features is 0.7218833333333332
Avg Accuracy for max 2000 features is 0.7355166666666666
Avg Accuracy for max 3000 features is 0.7385166666666667
Avg Accuracy for max 4000 features is 0.7403666666666666


In [8]:
# part 5b
# Refit with the 4000-term TF-IDF vocabulary on the full training split and
# score multinomial naive Bayes on the test split.
v = TfidfVectorizer(max_features=4000, stop_words='english')
train_vtf = v.fit_transform(train.text)
test_vtf = v.transform(test.text)
mnb = MultinomialNB().fit(train_vtf, train.sentiment)
pred = mnb.predict(test_vtf)
cm = confusion_matrix(y_pred=pred, y_true=test.sentiment)
# Accuracy = the two diagonal confusion-matrix cells over the number of test rows.
print('Part 5b Accuracy:', (cm[0, 0] + cm[1, 1]) / len(test))

Part 5b Accuracy: 0.7715877437325905


In [9]:
# part 6a
# Tokenise every training text with spaCy and lower-case each token. `processed`
# is a list of token lists, one per training row, used as the Word2Vec corpus.
nlp = spacy.load("en_core_web_sm")
# Only the token text is needed, so run the tokenizer alone via nlp.make_doc()
# instead of nlp(text), which would also execute every other pipeline component
# on each document.
processed = [
    [token.text.lower() for token in nlp.make_doc(text)]
    for text in train.text
]

In [None]:
# Train 300-dimensional Word2Vec embeddings on the tokenised training texts.
# gensim 4.0 renamed the `size` keyword to `vector_size`; passing the wrong name
# raises TypeError before any training starts, so try the current name first and
# fall back to the old one to run on either major version.
try:
    model = Word2Vec(processed, vector_size=300)
except TypeError:
    model = Word2Vec(processed, size=300)
wv = model.wv  # keyed vectors: token -> embedding lookup

In [None]:
# part 6b
# Represent each training text as the mean embedding of its in-vocabulary
# tokens. A text with no in-vocabulary token gets an all-zero vector of the
# embedding width.
repres = []
for doc_tokens in processed:
    # Filter once; tokens missing from the Word2Vec vocabulary are skipped.
    vectors = [wv[tok] for tok in doc_tokens if tok in wv]
    repres.append(sum(vectors) / len(vectors) if vectors else np.zeros(wv.vector_size))
train_repres = np.stack(repres)  # one row per training text

In [None]:
# part 6c
# Tokenise the test texts the same way as the training texts (lower-cased spaCy
# tokens). nlp.make_doc() runs the tokenizer only, which is all that is needed.
test_processed = [
    [token.text.lower() for token in nlp.make_doc(text)]
    for text in test.text
]
# Mean embedding per test text using the vectors trained on the training split;
# a text with no in-vocabulary token gets an all-zero vector.
repres = []
for doc_tokens in test_processed:
    vectors = [wv[tok] for tok in doc_tokens if tok in wv]
    repres.append(sum(vectors) / len(vectors) if vectors else np.zeros(wv.vector_size))
test_repres = np.stack(repres)  # one row per test text

In [None]:
# Logistic regression on the averaged Word2Vec representations; the iteration
# cap is raised to 500 for this dense feature matrix.
lg = LogisticRegression(max_iter=500).fit(train_repres, train.sentiment)
pred = lg.predict(test_repres)
cm = confusion_matrix(y_pred=pred, y_true=test.sentiment)
# Accuracy = the two diagonal confusion-matrix cells over the number of test rows.
print('Part 6c Accuracy:', (cm[0, 0] + cm[1, 1]) / len(test))

In [None]:
# part 6d
# Repeat the tokenisation of the training texts, this time dropping tokens whose
# lower-cased text is in spaCy's English stop-word set. This rebinds `processed`.
all_stopwords = nlp.Defaults.stop_words
processed = []
for text in train.text:
    # Tokenizer only (nlp.make_doc); lower-case each token once and reuse it for
    # both the stop-word test and the output.
    lowered = (token.text.lower() for token in nlp.make_doc(text))
    processed.append([word for word in lowered if word not in all_stopwords])

In [None]:
# Retrain 300-dimensional Word2Vec embeddings on the stop-word-filtered corpus.
# gensim 4.0 renamed the `size` keyword to `vector_size`; passing the wrong name
# raises TypeError before any training starts, so try the current name first and
# fall back to the old one to run on either major version.
try:
    model = Word2Vec(processed, vector_size=300)
except TypeError:
    model = Word2Vec(processed, size=300)
wv = model.wv
# Mean embedding per training text; a text with no in-vocabulary token gets an
# all-zero vector of the embedding width.
repres = []
for doc_tokens in processed:
    vectors = [wv[tok] for tok in doc_tokens if tok in wv]
    repres.append(sum(vectors) / len(vectors) if vectors else np.zeros(wv.vector_size))
train_repres = np.stack(repres)  # one row per training text

In [None]:
# Tokenise the test texts with the same stop-word filtering as the training
# texts. nlp.make_doc() runs the tokenizer only, which is all that is needed.
test_processed = []
for text in test.text:
    lowered = (token.text.lower() for token in nlp.make_doc(text))
    test_processed.append([word for word in lowered if word not in all_stopwords])
# Mean embedding per test text using the vectors trained on the training split;
# a text with no in-vocabulary token gets an all-zero vector.
repres = []
for doc_tokens in test_processed:
    vectors = [wv[tok] for tok in doc_tokens if tok in wv]
    repres.append(sum(vectors) / len(vectors) if vectors else np.zeros(wv.vector_size))
test_repres = np.stack(repres)  # one row per test text

In [None]:
# Logistic regression on the stop-word-filtered Word2Vec representations, with
# the iteration cap raised to 500.
lg = LogisticRegression(max_iter=500).fit(train_repres, train.sentiment)
pred = lg.predict(test_repres)
cm = confusion_matrix(y_pred=pred, y_true=test.sentiment)
# Accuracy = the two diagonal confusion-matrix cells over the number of test rows.
print('Part 6d Accuracy:', (cm[0, 0] + cm[1, 1]) / len(test))

In [None]:
# part 7
# Load the larger training file. It has no header row, so columns are addressed
# by position: column 0 holds the label and column 5 the text used for training.
# NOTE(review): the public Sentiment140 training dump is not valid UTF-8 and the
# default decoder raises UnicodeDecodeError on it; latin-1 decodes every byte.
# Confirm this matches the copy of the file in use.
train_all = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    header=None,
    encoding='latin-1',
)
# Recode label 4 as 1 - presumably so the labels line up with test.sentiment.
train_all.loc[train_all[0] == 4, 0] = 1

In [None]:
# Count features (English stop words removed, vocabulary capped at 4000 terms)
# fitted on the larger training file, then multinomial naive Bayes scored on the
# original test split.
# NOTE(review): `cv` is rebound to a CountVectorizer here (part 5a used it for
# the fold splitter), and train_cvtf / test_cvtf from part 1 are overwritten.
cv = CountVectorizer(stop_words='english', max_features=4000)
train_cvtf = cv.fit_transform(train_all[5])
test_cvtf = cv.transform(test.text)
# Create the classifier in this cell rather than reusing whichever `mnb` an
# earlier cell left in the kernel, so the cell does not depend on hidden state.
mnb = MultinomialNB()
mnb.fit(train_cvtf, train_all[0])
pred = mnb.predict(test_cvtf)
cm = confusion_matrix(y_true=test.sentiment, y_pred=pred)
# Accuracy = the two diagonal confusion-matrix cells over the number of test rows.
print('Part 7 Accuracy:', (cm[0][0] + cm[1][1]) / len(test))