In [1]:
import pandas as pd

df_train_sentiment = pd.read_csv("all_train_sentiment.csv")
df_train_sentiment

Unnamed: 0,description,sentiment
0,Bromwell High is a cartoon comedy. It ran at t...,1.0
1,Homelessness (or Houselessness as George Carli...,1.0
2,Brilliant over-acting by Lesley Ann Warren. Be...,1.0
3,This is easily the most underrated film inn th...,1.0
4,This is not the typical Mel Brooks film. It wa...,1.0
...,...,...
24995,"Towards the end of the movie, I felt it was to...",0.0
24996,This is the kind of movie that my enemies cont...,0.0
24997,I saw 'Descent' last night at the Stockholm Fi...,0.0
24998,Some films that you pick up for a pound turn o...,0.0


In [2]:
df_test_sentiment = pd.read_csv("all_test_sentiment.csv")
df_test_sentiment

Unnamed: 0,description,sentiment
0,I went and saw this movie last night after bei...,1.0
1,Actor turned director Bill Paxton follows up h...,1.0
2,As a recreational golfer with some knowledge o...,1.0
3,"I saw this film in a sneak preview, and it is ...",1.0
4,Bill Paxton has taken the true story of the 19...,1.0
...,...,...
24995,I occasionally let my kids watch this garbage ...,0.0
24996,When all we have anymore is pretty much realit...,0.0
24997,The basic genre is a thriller intercut with an...,0.0
24998,Four things intrigued me as to this film - fir...,0.0


In [3]:
from nltk.corpus import stopwords
from nltk.stem.snowball import EnglishStemmer
import re
import numpy as np

In [4]:
def stem_tokenizer(text):
    stemmer = EnglishStemmer(ignore_stopwords=True)
    words = re.sub(r"[^A-Za-z0-9\-]", " ", text).lower().split()
    words = [stemmer.stem(word) for word in words]
    return words 

In [5]:
tokenized_train = []

for i in range(len(df_train_sentiment)):
    tokenized_train.append(stem_tokenizer(df_train_sentiment['description'][i]))

In [6]:
tokenized_test = []

for i in range(len(df_test_sentiment)):
    tokenized_test.append(stem_tokenizer(df_test_sentiment['description'][i]))

In [7]:
from gensim.models import Word2Vec

word2vec_model = Word2Vec(sentences=tokenized_train, vector_size=100, window=5, min_count=1, workers=6)

In [8]:
def calculate_vector(model, tokens):
    vectors = [model.wv[word] if word in model.wv else np.zeros(model.vector_size) for word in tokens]
    if len(vectors) == 0:
        # Return a vector of zeros if no words are in the Word2Vec vocabulary
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

In [9]:
X_train = pd.DataFrame([calculate_vector(word2vec_model, tokens) for tokens in tokenized_train])
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,1.020417,-0.179532,-0.493935,-0.320694,-0.245235,-0.341736,-0.026162,-0.546855,0.457011,0.428676,...,-1.202730,0.026205,0.341869,-0.925982,0.746705,-0.483548,-0.292556,-0.497460,0.707420,-0.451561
1,0.898162,-0.193663,-0.575916,-0.168891,-0.604566,-0.679401,-0.050501,-0.272664,0.431180,0.420579,...,-0.788178,-0.050126,0.028983,-0.940722,0.700914,-0.485776,-0.301917,-0.326786,0.599600,-0.516706
2,0.571427,-0.209354,-0.354114,-0.489477,-0.653494,-0.603144,0.016395,-0.486552,0.681395,0.160719,...,-0.835470,-0.223907,-0.165682,-0.986035,0.756866,-0.083158,0.111548,-0.308654,0.157652,-0.537856
3,0.879383,-0.277023,-0.419722,-0.344581,-0.458726,-0.744127,0.212061,-0.571960,0.867753,0.162325,...,-1.271584,-0.063450,-0.214027,-1.132068,1.018906,-0.036697,-0.238538,-0.266126,0.436261,-0.640036
4,0.760448,-0.179804,-0.438169,-0.391562,-0.589743,-0.609952,0.017232,-0.927526,0.903294,0.260396,...,-1.154655,-0.286207,-0.377260,-1.169107,1.172096,-0.115020,-0.208322,-0.467857,0.400277,-0.527937
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,0.899217,-0.046688,-0.367160,-0.142246,-0.311537,-0.473143,0.061787,-0.772750,0.694112,0.415340,...,-1.187912,0.034210,-0.044149,-0.965867,0.791439,-0.371905,-0.072948,-0.441381,0.565005,-0.620042
24996,0.932727,-0.282891,-0.531983,-0.189682,-0.474468,-0.657230,0.111704,-0.684006,0.643464,0.304981,...,-1.213185,-0.183435,0.075048,-0.964015,0.816113,-0.258610,-0.278762,-0.478876,0.566389,-0.666755
24997,0.988677,-0.127423,-0.416472,-0.184071,-0.516677,-0.475940,0.114810,-0.960945,0.702226,0.270885,...,-1.151418,-0.156242,-0.025289,-0.914679,0.965759,-0.312187,-0.080689,-0.471471,0.472293,-0.468197
24998,0.859755,-0.058566,-0.431041,-0.241094,-0.311229,-0.465631,0.153736,-0.723630,0.737552,0.166763,...,-1.234697,-0.200026,-0.044919,-1.116871,1.014080,-0.531489,-0.155634,-0.469641,0.491803,-0.608139


In [10]:
X_test = pd.DataFrame([calculate_vector(word2vec_model, tokens) for tokens in tokenized_test])
X_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.746839,-0.283176,-0.370350,-0.176712,-0.424762,-0.433979,-0.010329,-0.840295,0.776711,0.363830,...,-1.327923,0.160352,0.243221,-0.950630,0.774809,-0.684103,0.065148,-0.496064,0.573019,-0.369935
1,0.784237,-0.124468,-0.485198,-0.152238,-0.566610,-0.539365,-0.097805,-0.300341,0.521960,0.215340,...,-1.109468,-0.344574,-0.197956,-1.110106,1.014672,-0.261382,-0.276110,-0.375719,0.337810,-0.535335
2,1.042778,-0.201807,-0.618132,-0.322577,-0.776528,-0.656868,-0.157830,-0.317048,0.466037,0.257160,...,-1.144324,-0.204736,-0.112123,-1.109199,1.006035,-0.304321,-0.184553,-0.301059,0.409280,-0.480507
3,0.866693,-0.064561,-0.328112,-0.181013,-0.485194,-0.556440,-0.039717,-0.873873,1.084503,0.466392,...,-1.386549,-0.008440,-0.282563,-1.114492,0.867677,-0.049938,-0.139828,-0.275712,0.623429,-0.616578
4,0.755816,-0.207973,-0.602963,-0.425648,-0.604260,-0.568757,-0.090420,-0.340542,0.682019,-0.005676,...,-1.184220,-0.392919,-0.189350,-1.039851,1.050828,-0.165331,-0.236090,-0.394323,0.168774,-0.563277
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,0.917478,-0.332296,-0.593570,-0.289239,-0.234645,-0.627528,0.039857,-0.423316,0.794119,0.212375,...,-0.823352,0.153378,-0.216502,-0.924294,0.987318,-0.483569,-0.445720,-0.253147,0.618825,-0.723506
24996,1.073482,-0.065538,-0.485422,0.030888,-0.219873,-0.390423,0.139909,-0.795439,0.836967,0.599154,...,-0.920584,0.037181,0.186059,-0.854897,0.887142,-0.726005,-0.260712,-0.422300,0.642373,-0.768416
24997,0.925119,-0.206799,-0.659636,-0.170900,-0.605264,-0.459967,0.048128,-0.493970,0.679648,0.132576,...,-1.089072,-0.389771,-0.339025,-1.136353,1.094504,-0.285265,-0.290733,-0.455076,0.472941,-0.662331
24998,0.725792,-0.094847,-0.222892,-0.458678,-0.460939,-0.328899,-0.087401,-0.691188,0.817039,0.149288,...,-1.209006,-0.051056,-0.241451,-1.028480,0.945663,-0.325899,-0.048278,-0.255028,0.519232,-0.535125


In [11]:
y_train = df_train_sentiment['sentiment']
y_test = df_test_sentiment['sentiment']

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [13]:
lgr = LogisticRegression(max_iter=50000)
lgr.fit(X_train, y_train)
y_train_pred = lgr.predict(X_train)
y_test_pred = lgr.predict(X_test)

In [14]:
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

         0.0       0.82      0.82      0.82     12500
         1.0       0.82      0.82      0.82     12500

    accuracy                           0.82     25000
   macro avg       0.82      0.82      0.82     25000
weighted avg       0.82      0.82      0.82     25000



In [15]:
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

         0.0       0.81      0.83      0.82     12500
         1.0       0.82      0.81      0.82     12500

    accuracy                           0.82     25000
   macro avg       0.82      0.82      0.82     25000
weighted avg       0.82      0.82      0.82     25000

