<a href="https://colab.research.google.com/github/jacmal/imdb_sentiment/blob/main/Sentiment_Word2Vec_Text_Embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# install text preprocessing toolkit
!git clone https://github.com/jacmal/preprocess_text_jm.git

Cloning into 'preprocess_text_jm'...
remote: Enumerating objects: 17, done.[K
remote: Counting objects: 100% (17/17), done.[K
remote: Compressing objects: 100% (16/16), done.[K
remote: Total 17 (delta 6), reused 6 (delta 0), pack-reused 0[K
Unpacking objects: 100% (17/17), done.


In [2]:
!pip install sklearn



In [3]:
# import of libraries 
import pandas as pd
import numpy as np

import requests
import io

import sys
sys.path.append('/content/preprocess_text_jm')
import preprocess_text_jm as txt_prep_pkg

import spacy
import spacy.cli
spacy.cli.download("en_core_web_lg")

from sklearn.model_selection import train_test_split as tts
from sklearn.linear_model import LogisticRegression as LR
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import GridSearchCV

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [4]:
# downloading the dataset 
url="https://raw.githubusercontent.com/jacmal/imdb_sentiment/main/imdb_reviews.txt"
site = requests.get(url).content

# creating a dataframe 
df = pd.read_csv(io.StringIO(site.decode('utf-8')), sep='\t', header=None)
df.head(2)

Unnamed: 0,0,1
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0


In [5]:
# adding new names to columns
df.columns = ['reviews', 'sentiment']
df.head(2)

Unnamed: 0,reviews,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0


In [6]:
df_clean = df.copy()

In [7]:
# cleaning of the reviews
df_clean['reviews'] = df_clean['reviews'].apply(lambda text: txt_prep_pkg.cont_exp(text))
df_clean['reviews'] = df_clean['reviews'].apply(lambda text: txt_prep_pkg.remove_special_chars(text))
df_clean['reviews'] = df_clean['reviews'].apply(lambda text: txt_prep_pkg.remove_accented_chars(text))
df_clean['reviews'] = df_clean['reviews'].apply(lambda text: txt_prep_pkg.remove_emails(text))
df_clean['reviews'] = df_clean['reviews'].apply(lambda text: txt_prep_pkg.remove_html_tags(text))
df_clean['reviews'] = df_clean['reviews'].apply(lambda text: txt_prep_pkg.remove_urls(text))
df_clean['reviews'] = df_clean['reviews'].apply(lambda text: txt_prep_pkg.make_base(text))
#df_clean['reviews'] = df_clean['reviews'].apply(lambda text: txt_prep_pkg.spelling_correction(text))

In [8]:
df_clean.head()

Unnamed: 0,reviews,sentiment
0,a very very very slowmove aimless movie about ...,0
1,not sure who was more lose the flat characte...,0
2,attempt artiness with black white and clever...,0
3,very little music or anything to speak of,0
4,the good scene in the movie was when Gerardo i...,1


In [9]:
# English language pipeline
nlp = spacy.load("en_core_web_lg")

In [10]:
# 
def vectorize_text(text):
  '''
  Takes a string variable and returns it as a vector.
  '''
  doc_nlp = nlp(text)
  vector = doc_nlp.vector
  
  return vector

In [11]:
# creating a new column with vectorized text
df_clean['reviews2vec'] = df_clean['reviews'].apply(lambda text: vectorize_text(text))

In [12]:
df_clean.head()

Unnamed: 0,reviews,sentiment,reviews2vec
0,a very very very slowmove aimless movie about ...,0,"[-0.029546848, 0.09122606, -0.1881294, 0.15151..."
1,not sure who was more lose the flat characte...,0,"[0.058918916, 0.18493456, -0.13811785, -0.0045..."
2,attempt artiness with black white and clever...,0,"[-0.16601251, 0.009866389, -0.09277145, -0.063..."
3,very little music or anything to speak of,0,"[-0.09093174, 0.25162372, -0.25681874, 0.15846..."
4,the good scene in the movie was when Gerardo i...,1,"[0.086556055, 0.1315579, -0.12832318, -0.02360..."


In [13]:
# create a new variable as type: np.array
X = df_clean['reviews2vec'].to_numpy()

In [14]:
X.shape

(748,)

In [15]:
# reshape X array
X = X.reshape(-1, 1)

In [16]:
X.shape

(748, 1)

In [17]:
# create a new variable as type: np.array
y = df_clean['sentiment'].to_numpy()

In [18]:
y.shape

(748,)

In [19]:
# reshape X array
Xc = np.concatenate(X, axis=0)
Xc.shape

(748,)

In [20]:
# reshape X array
Xc2 = np.concatenate(Xc, axis=0)
Xc2.shape

(224400,)

In [21]:
# X variable is in right shape now
X = Xc2.reshape(-1, 300)
X.shape

(748, 300)

In [22]:
# create training and testing data sets
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.25, random_state=0, stratify=y)

In [23]:
# check the shape of data sets
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((561, 300), (187, 300), (561,), (187,))

In [24]:
# create Logistic Regression Classifier
clf_lr = LR(solver='liblinear')
clf_lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [25]:
# prediction of the dependent variable
y_pred_lr = clf_lr.predict(X_test)

In [26]:
# display classyfication report for Logistic Regression
print(classification_report(y_test, y_pred_lr))

              precision    recall  f1-score   support

           0       0.78      0.81      0.80        91
           1       0.82      0.78      0.80        96

    accuracy                           0.80       187
   macro avg       0.80      0.80      0.80       187
weighted avg       0.80      0.80      0.80       187



In [27]:
# create SVC
clf_svc = LinearSVC()
clf_svc.fit(X_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [28]:
# SVC model prediction
y_pred_svc = clf_svc.predict(X_test)

In [29]:
# display SVC classyfication report
print(classification_report(y_test, y_pred_svc))

              precision    recall  f1-score   support

           0       0.75      0.80      0.78        91
           1       0.80      0.75      0.77        96

    accuracy                           0.78       187
   macro avg       0.78      0.78      0.78       187
weighted avg       0.78      0.78      0.78       187



In [30]:
# create Random Forest Classifier
clf_rfc = RFC(criterion='entropy')
clf_rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [31]:
# RFC model prediction
y_pred_rfc = clf_rfc.predict(X_test)

In [32]:
# RFC classyfication report
print(classification_report(y_test, y_pred_rfc))

              precision    recall  f1-score   support

           0       0.74      0.74      0.74        91
           1       0.75      0.76      0.76        96

    accuracy                           0.75       187
   macro avg       0.75      0.75      0.75       187
weighted avg       0.75      0.75      0.75       187



In [33]:
# create Gaussian Naive Bayes Classifier
clf_gnb = GaussianNB()
clf_gnb.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [34]:
# GNBC model prediction
y_pred_gnb = clf_gnb.predict(X_test)

In [35]:
# GNBC classyfication report
print(classification_report(y_test, y_pred_gnb))

              precision    recall  f1-score   support

           0       0.70      0.70      0.70        91
           1       0.72      0.72      0.72        96

    accuracy                           0.71       187
   macro avg       0.71      0.71      0.71       187
weighted avg       0.71      0.71      0.71       187



In [36]:
# hyperparameter tuning for Logistic Regression Classifier
clf_lr = LR(solver='liblinear')

In [37]:
# heperparameter to check
hyper_lr = {'penalty': ['l1', 'l2'],
            'C': (1,2,3,4)}

In [38]:
# grid search parameters for LR
clf = GridSearchCV(clf_lr, hyper_lr, n_jobs=-1, cv=5)

In [39]:
# fit training and testing data sets into griid search
clf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='liblinear',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': (1, 2, 3, 4), 'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [40]:
# best params
clf.best_params_

{'C': 3, 'penalty': 'l2'}