In [1]:
import pandas as pd

df_train_sentiment = pd.read_csv("all_train_sentiment.csv")
df_train_sentiment

Unnamed: 0,description,sentiment
0,Bromwell High is a cartoon comedy. It ran at t...,1.0
1,Homelessness (or Houselessness as George Carli...,1.0
2,Brilliant over-acting by Lesley Ann Warren. Be...,1.0
3,This is easily the most underrated film inn th...,1.0
4,This is not the typical Mel Brooks film. It wa...,1.0
...,...,...
24995,"Towards the end of the movie, I felt it was to...",0.0
24996,This is the kind of movie that my enemies cont...,0.0
24997,I saw 'Descent' last night at the Stockholm Fi...,0.0
24998,Some films that you pick up for a pound turn o...,0.0


In [2]:
df_test_sentiment = pd.read_csv("all_test_sentiment.csv")
df_test_sentiment

Unnamed: 0,description,sentiment
0,I went and saw this movie last night after bei...,1.0
1,Actor turned director Bill Paxton follows up h...,1.0
2,As a recreational golfer with some knowledge o...,1.0
3,"I saw this film in a sneak preview, and it is ...",1.0
4,Bill Paxton has taken the true story of the 19...,1.0
...,...,...
24995,I occasionally let my kids watch this garbage ...,0.0
24996,When all we have anymore is pretty much realit...,0.0
24997,The basic genre is a thriller intercut with an...,0.0
24998,Four things intrigued me as to this film - fir...,0.0


In [3]:
from nltk.corpus import stopwords
from nltk.stem.snowball import EnglishStemmer
import re
import numpy as np

In [4]:
def stem_tokenizer(text):
    stemmer = EnglishStemmer(ignore_stopwords=True)
    words = re.sub(r"[^A-Za-z0-9\-]", " ", text).lower().split()
    words = [stemmer.stem(word) for word in words]
    return words 

In [5]:
tokenized_train = []

for i in range(len(df_train_sentiment)):
    tokenized_train.append(stem_tokenizer(df_train_sentiment['description'][i]))

In [6]:
tokenized_test = []

for i in range(len(df_test_sentiment)):
    tokenized_test.append(stem_tokenizer(df_test_sentiment['description'][i]))

In [7]:
from gensim.models import Word2Vec

word2vec_model = Word2Vec(sentences=tokenized_train, vector_size=100, window=5, min_count=1, workers=6)

In [8]:
def calculate_vector(model, tokens):
    vectors = [model.wv[word] if word in model.wv else np.zeros(model.vector_size) for word in tokens]
    if len(vectors) == 0:
        # Return a vector of zeros if no words are in the Word2Vec vocabulary
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

In [9]:
X_train = pd.DataFrame([calculate_vector(word2vec_model, tokens) for tokens in tokenized_train])
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.698461,-0.120083,-0.397266,-0.161504,-0.386168,-0.761624,0.068571,-0.660611,0.336467,-0.151267,...,-0.756262,-0.151465,0.137032,-1.026044,0.147464,-0.830438,-0.724810,0.044324,0.063612,0.100988
1,0.549153,-0.282399,-0.591924,-0.083861,-0.479049,-0.961237,0.059135,-0.441911,0.392757,-0.036857,...,-0.370926,-0.384177,-0.068331,-1.119830,0.129233,-0.838772,-0.852629,0.148970,-0.005951,0.061146
2,0.178162,0.021380,-0.192205,-0.318229,-0.595528,-0.918042,0.111595,-0.285907,0.477264,-0.473275,...,-0.621810,-0.542088,-0.348664,-0.918265,0.246952,-0.507867,-0.335022,0.298219,-0.297929,-0.019257
3,0.417766,-0.002975,-0.278847,-0.048914,-0.444749,-0.908600,0.115640,-0.896471,0.596252,-0.351559,...,-1.062976,-0.314899,-0.390110,-1.287473,0.494465,-0.420239,-0.570049,0.293347,-0.100766,-0.173047
4,0.263892,0.095758,-0.489389,-0.299617,-0.448679,-0.965362,0.063397,-0.968478,0.449365,-0.434905,...,-0.861225,-0.488023,-0.661511,-1.256144,0.340910,-0.518013,-0.774052,0.278151,-0.264558,0.037004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,0.482802,0.142565,-0.361356,0.072467,-0.152410,-0.828432,0.197544,-0.750372,0.471780,-0.182925,...,-0.825043,-0.325628,-0.274603,-1.074296,0.138192,-0.775955,-0.692981,0.164803,-0.044800,-0.095961
24996,0.434502,-0.042414,-0.401966,0.091697,-0.407728,-0.974361,0.094681,-0.733591,0.395567,-0.266438,...,-0.894705,-0.466987,-0.085746,-0.911725,0.258761,-0.654831,-0.642609,0.067387,-0.128552,-0.162623
24997,0.494678,0.205802,-0.279475,0.009258,-0.397873,-0.858495,0.154078,-0.881179,0.355645,-0.441954,...,-0.872785,-0.489399,-0.201058,-0.957039,0.335805,-0.668500,-0.598565,0.281179,-0.109087,-0.017110
24998,0.402426,0.137276,-0.360204,-0.036631,-0.106770,-0.885181,0.172697,-0.811726,0.440963,-0.474093,...,-0.948738,-0.445584,-0.164088,-1.139958,0.261411,-0.914221,-0.661695,0.185703,-0.126731,-0.124847


In [10]:
X_test = pd.DataFrame([calculate_vector(word2vec_model, tokens) for tokens in tokenized_test])
X_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.329568,0.029657,-0.382187,-0.059780,-0.412267,-0.804948,-0.073133,-0.789009,0.633556,-0.264907,...,-0.902942,-0.191216,0.140602,-1.118343,0.116845,-1.081110,-0.460185,0.284228,-0.025482,0.091601
1,0.381978,-0.173879,-0.430369,0.000439,-0.500023,-0.867551,-0.015815,-0.392515,0.241073,-0.313140,...,-0.814423,-0.554155,-0.355968,-1.097896,0.329727,-0.693357,-0.742147,0.211737,-0.209203,0.005277
2,0.748433,-0.230553,-0.575325,-0.113222,-0.675384,-1.123892,-0.078380,-0.403804,0.122331,-0.325682,...,-0.741481,-0.369358,-0.267720,-1.221765,0.388096,-0.717852,-0.697106,0.346239,-0.190824,0.068146
3,0.402289,0.307659,-0.477960,-0.030950,-0.299674,-0.728280,0.005660,-1.246008,0.757374,-0.232379,...,-1.051772,-0.289333,-0.555223,-1.277660,0.334056,-0.615827,-0.758801,0.212443,-0.029157,0.002089
4,0.480187,-0.216899,-0.432205,-0.229887,-0.690112,-0.990873,-0.017719,-0.482112,0.304429,-0.535760,...,-0.968377,-0.549101,-0.292470,-1.022388,0.400328,-0.552247,-0.705925,0.324495,-0.413963,0.004786
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,0.566223,-0.336750,-0.592017,-0.097559,-0.159393,-0.877975,0.092510,-0.587884,0.616796,-0.298495,...,-0.504322,-0.062740,-0.386060,-0.997630,0.458970,-0.731621,-0.902051,0.316586,-0.012771,-0.191784
24996,0.606752,0.091202,-0.528487,0.092240,-0.061597,-0.822919,0.243890,-0.777940,0.629578,-0.023200,...,-0.674350,-0.361669,-0.045873,-1.030874,0.267869,-1.123653,-0.898294,0.233573,-0.017642,-0.139126
24997,0.460286,-0.144423,-0.441285,-0.035993,-0.577319,-0.818908,0.120222,-0.614796,0.260825,-0.428225,...,-0.937995,-0.646465,-0.411989,-1.173807,0.422118,-0.705047,-0.750434,0.313309,-0.103742,-0.186917
24998,0.364063,0.237006,-0.299289,-0.298918,-0.308317,-0.715676,-0.069142,-0.934881,0.556151,-0.475540,...,-0.984245,-0.266937,-0.416249,-0.996985,0.314376,-0.689777,-0.382733,0.379540,0.029844,0.028465


In [11]:
y_train = df_train_sentiment['sentiment']
y_test = df_test_sentiment['sentiment']

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [13]:
lgr = LogisticRegression(max_iter=50000)
lgr.fit(X_train, y_train)
y_train_pred = lgr.predict(X_train)
y_test_pred = lgr.predict(X_test)

In [14]:
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

         0.0       0.83      0.82      0.82     12500
         1.0       0.82      0.83      0.83     12500

    accuracy                           0.83     25000
   macro avg       0.83      0.83      0.83     25000
weighted avg       0.83      0.83      0.83     25000



In [15]:
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

         0.0       0.82      0.83      0.82     12500
         1.0       0.82      0.81      0.82     12500

    accuracy                           0.82     25000
   macro avg       0.82      0.82      0.82     25000
weighted avg       0.82      0.82      0.82     25000



In [16]:
from gensim import downloader

pretrained_model = downloader.load("word2vec-google-news-300")

In [17]:
def calculate_vector(model, tokens):
    vectors = [model[word] if word in model else np.zeros(model.vector_size) for word in tokens]
    if len(vectors) == 0:
        # Return a vector of zeros if no words are in the Word2Vec vocabulary
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

In [18]:
X_train = pd.DataFrame([calculate_vector(pretrained_model, tokens) for tokens in tokenized_train])
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,0.025683,0.018135,0.037648,0.089552,-0.028056,-0.004031,0.047836,-0.069374,0.039422,0.035935,...,-0.035991,0.020192,-0.085614,0.006180,-0.032711,-0.014034,-0.008082,-0.012942,0.019834,-0.008284
1,0.023002,0.026768,0.026368,0.071530,-0.038484,-0.003416,0.029154,-0.060818,0.052146,0.042936,...,-0.031272,0.036899,-0.068199,0.019088,-0.035176,-0.011713,0.001848,-0.040796,0.022878,-0.026255
2,0.032501,0.023293,0.019086,0.050018,-0.022138,0.011547,0.000839,-0.054878,0.044257,0.047512,...,-0.043525,-0.002587,-0.067765,0.017993,-0.035141,-0.021946,-0.006979,-0.039497,0.018079,-0.011436
3,0.044249,0.012847,0.033239,0.065025,-0.051775,0.025759,0.044752,-0.070463,0.057056,0.034750,...,-0.046191,0.016986,-0.060158,0.002692,-0.054793,-0.028672,0.001066,-0.033229,0.033635,-0.013718
4,0.041099,0.017667,0.022943,0.054756,-0.034708,-0.007732,0.019602,-0.047182,0.056062,0.052298,...,-0.070224,0.012872,-0.057156,0.029842,-0.056668,0.004266,0.008986,-0.026687,0.036977,-0.024063
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,0.013238,0.012926,0.019082,0.084249,-0.048540,0.001806,0.016730,-0.054080,0.051650,0.048612,...,-0.041991,0.049716,-0.071074,0.033823,-0.028840,-0.005291,-0.002440,-0.029132,0.014458,-0.027261
24996,0.021434,0.017614,0.041629,0.095526,-0.053890,-0.009582,0.019572,-0.061873,0.066913,0.045699,...,-0.027995,0.037188,-0.058759,0.020396,-0.025712,-0.015507,-0.010795,-0.025391,0.026062,-0.024918
24997,0.005610,0.032786,0.017601,0.073585,-0.044777,-0.006359,0.016703,-0.073435,0.049124,0.047798,...,-0.036271,0.032114,-0.065529,0.034508,-0.025056,-0.029233,-0.019028,-0.032907,0.020206,-0.008515
24998,0.013624,0.048362,0.020202,0.079880,-0.042252,-0.000507,0.025705,-0.048720,0.038890,0.059962,...,-0.044948,0.042072,-0.066451,0.013042,-0.031247,-0.018112,-0.007126,-0.023715,0.010069,-0.009586


In [19]:
X_test = pd.DataFrame([calculate_vector(pretrained_model, tokens) for tokens in tokenized_test])
X_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,0.012936,0.024248,0.027201,0.075597,-0.035484,-0.019670,0.007230,-0.070810,0.044662,0.053385,...,-0.046129,0.038725,-0.084053,0.023959,-0.034587,-0.003946,-0.013585,-0.033706,0.027527,-0.025323
1,0.019315,0.031285,0.025377,0.059136,-0.029211,-0.009444,0.017030,-0.062592,0.053124,0.048701,...,-0.052510,0.032373,-0.061750,0.003691,-0.024516,-0.012486,0.001930,-0.022668,0.009853,-0.018639
2,0.012941,0.038451,0.029210,0.061272,-0.034558,0.007606,0.027128,-0.068821,0.046923,0.035225,...,-0.054659,0.013025,-0.063801,0.001993,-0.042357,-0.008591,-0.003918,-0.022096,0.015514,-0.010436
3,0.024234,0.020949,0.017137,0.071556,-0.053530,-0.000675,0.034833,-0.050730,0.049228,0.037063,...,-0.046943,0.020775,-0.063238,0.026748,-0.026619,-0.003344,-0.005778,-0.026979,0.021405,-0.019382
4,0.040968,0.038416,0.016552,0.052041,-0.034349,0.000055,0.012937,-0.069784,0.038735,0.045879,...,-0.044385,0.011274,-0.048148,0.011277,-0.031048,0.001580,0.000426,-0.015940,0.013019,-0.016805
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,0.008755,0.023947,0.032491,0.091090,-0.056390,0.018331,0.038365,-0.070103,0.045431,0.059423,...,-0.026203,0.023188,-0.073564,0.010708,-0.039379,-0.016058,0.000394,-0.021007,0.016886,-0.016388
24996,0.012044,0.011872,0.042687,0.122101,-0.064933,0.011985,0.050332,-0.060950,0.043031,0.061797,...,-0.025456,0.074445,-0.073106,0.025929,-0.038745,-0.024266,-0.009012,-0.038225,0.007439,-0.037148
24997,0.025937,0.024872,0.016680,0.072689,-0.054633,0.001271,0.041721,-0.059430,0.038371,0.048702,...,-0.057512,0.020293,-0.046940,0.021502,-0.036295,-0.017004,0.008184,-0.027992,0.020440,-0.004699
24998,0.029205,0.009843,0.016493,0.090750,-0.047610,0.009809,0.019135,-0.058812,0.067265,0.054791,...,-0.038852,0.036235,-0.068761,-0.002620,-0.033350,0.008001,-0.011239,-0.022346,0.030469,0.006786


In [20]:
y_train = df_train_sentiment['sentiment']
y_test = df_test_sentiment['sentiment']

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [22]:
lgr = LogisticRegression(max_iter=50000)
lgr.fit(X_train, y_train)
y_train_pred = lgr.predict(X_train)
y_test_pred = lgr.predict(X_test)

In [23]:
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

         0.0       0.81      0.81      0.81     12500
         1.0       0.81      0.81      0.81     12500

    accuracy                           0.81     25000
   macro avg       0.81      0.81      0.81     25000
weighted avg       0.81      0.81      0.81     25000



In [24]:
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

         0.0       0.80      0.81      0.80     12500
         1.0       0.81      0.80      0.80     12500

    accuracy                           0.80     25000
   macro avg       0.80      0.80      0.80     25000
weighted avg       0.80      0.80      0.80     25000

