In [27]:
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.decomposition import PCA

import gensim
from gensim.utils import simple_preprocess
from nltk import sent_tokenize
import plotly.express as px

import pandas as pd
import numpy as np
import re
import string

In [2]:
df = pd.read_csv("data/csv/imdb_dataset.csv")
df = df.iloc[:1000]

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
df["sentiment"].value_counts()

sentiment
positive    501
negative    499
Name: count, dtype: int64

In [5]:
df.duplicated().sum()

0

In [6]:
df = df.drop_duplicates()

In [7]:
df.shape

(1000, 2)

In [8]:
def remove_html(data):
    pattern = re.compile("<.*?>")
    return pattern.sub(r"", data)

def remove_url(data):
    pattern = re.compile(r"https?://\S+|www\.\S+")
    return pattern.sub(r"", data)

punctuations = string.punctuation
def remove_punctuation(data):
    return data.translate(str.maketrans("", "", punctuations))

en_stopwords = stopwords.words("english")
def remove_stopwords(data):
    end_string = []
    for word in data.split():
        if word not in en_stopwords:
            end_string.append(word)
    return " ".join(end_string)

In [9]:
df["review"] = df["review"].str.lower()
df["review"] =df["review"].apply(remove_html)
df["review"] =df["review"].apply(remove_url)
df["review"] =df["review"].apply(remove_punctuation)
df["review"] =df["review"].apply(remove_stopwords)
df.head()

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,positive
1,wonderful little production filming technique ...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically theres family little boy jake thinks...,negative
4,petter matteis love time money visually stunni...,positive


In [10]:
X = df.iloc[:, 0:1]
y = df["sentiment"]

In [11]:
encoder = LabelEncoder()
y = encoder.fit_transform(y)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

In [13]:
cv = CountVectorizer()

In [14]:
X_train_bow = cv.fit_transform(X_train["review"]).toarray()
X_test_bow = cv.transform(X_test["review"]).toarray()

In [15]:
gnb = GaussianNB()
gnb.fit(X_train_bow, y_train)

In [16]:
y_pred = gnb.predict(X_test_bow)

In [17]:
print(accuracy_score(y_test, y_pred))
confusion_matrix(y_test, y_pred)

0.575


array([[57, 47],
       [38, 58]])

In [18]:
rf = RandomForestClassifier()
rf.fit(X_train_bow, y_train)

In [19]:
y_pred = rf.predict(X_test_bow)
print(accuracy_score(y_test, y_pred))
confusion_matrix(y_test, y_pred)

0.795


array([[83, 21],
       [20, 76]])

In [23]:
tfidf = TfidfVectorizer()

In [24]:
X_train_tfidf = tfidf.fit_transform(X_train["review"]).toarray()
X_test_tfidf = tfidf.transform(X_test["review"]).toarray()

In [25]:
rf_idf = RandomForestClassifier()
rf_idf.fit(X_train_tfidf, y_train)
y_pred = rf_idf.predict(X_test_tfidf)
print(accuracy_score(y_test, y_pred))

0.81


Better accuracy using Tfidf. Now lets see using Word2Vec

In [28]:
story = []
for doc in df["review"]:
    raw_sent = sent_tokenize(doc)
    for sent in raw_sent:
        story.append(simple_preprocess(sent))

In [29]:
model = gensim.models.Word2Vec(
    window=10,
    min_count=2,
)

In [30]:
model.build_vocab(story)

In [31]:
model.train(story, total_examples=model.corpus_count, epochs=model.epochs)

(512214, 592120)

In [33]:
def document_vector(data):
    doc = [word for word in data.split() if word in model.wv.index_to_key]
    return np.mean(model.wv[doc], axis=0)

In [34]:
X = []
for doc in df["review"].values:
    X.append(document_vector(doc))

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

In [36]:
rf_idf = RandomForestClassifier()
rf_idf.fit(X_train, y_train)
y_pred = rf_idf.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.545


The accuracy is worse because I could not use the whole dataset to train the model.