In [1]:
import os
import pandas as pd
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
import gensim
import re

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
df = pd.read_csv('IMDB-Dataset.csv')
df = df.iloc[0:10000]
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
df['sentiment'].value_counts()

sentiment
positive    5028
negative    4972
Name: count, dtype: int64

In [7]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [9]:
df.duplicated().sum()

17

In [10]:
df.drop_duplicates(inplace=True)

In [12]:
# Remove HTML tags with the help of RegEX Library
def remove_html_tags(text):
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

In [13]:
df['review'] = df['review'].apply(lambda x: remove_html_tags(x))

In [15]:
df['review'] = df['review'].apply(lambda x: x.lower())

In [16]:
# Remove Punctuation with the help of RegEX Library
def remove_punctuation(text):
    return re.sub(r'[^\w\s]', '', text)

In [17]:
df['review'] = df['review'].apply(lambda x: remove_punctuation(x))

In [18]:
# Import nltk Library for removing stopwords
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

In [19]:
# Function of removing stopwords
def remove_stopwords(text):
    str = []
    for i in text.split():
        if i not in stopwords:
            str.append(i)
    return " ".join(str)

In [20]:
df['review'] = df['review'].apply(lambda x: remove_stopwords(x))

In [26]:
df['review'].apply(lambda x: x.replace('"',""))

0       one reviewers mentioned watching 1 oz episode ...
1       wonderful little production filming technique ...
2       thought wonderful way spend time hot summer we...
3       basically theres family little boy jake thinks...
4       petter matteis love time money visually stunni...
                              ...                        
9995    fun entertaining movie wwii german spy julie a...
9996    give break anyone say good hockey movie know m...
9997    movie bad movie watching endless series bad ho...
9998    movie probably made entertain middle school ea...
9999    smashing film filmmaking shows intense strange...
Name: review, Length: 9983, dtype: object

In [37]:
X = df.iloc[:,0:1]
y = df.iloc[:,1]

In [38]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [41]:
y

array([1, 1, 1, ..., 0, 0, 1])

In [49]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=.2, random_state=1)

In [55]:
X_train.shape

(7986, 1)

## Apply Bag of Word (BOW)

In [56]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

In [57]:
X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

In [60]:
X_train_bow.shape

(7986, 72762)

In [61]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

In [63]:
gnb.fit(X_train_bow,y_train)

In [65]:
y_pred = gnb.predict(X_test_bow)

In [66]:
from sklearn.metrics import accuracy_score, confusion_matrix
accuracy_score(y_test,y_pred)

0.656985478217326

In [67]:
confusion_matrix(y_test,y_pred)

array([[698, 254],
       [431, 614]], dtype=int64)

In [72]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()

rfc.fit(X_train_bow,y_train)
y_pred = rfc.predict(X_test_bow)
accuracy_score(y_test,y_pred)

0.8487731597396094

In [73]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=10000)
X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

rfc.fit(X_train_bow,y_train)
y_pred = rfc.predict(X_test_bow)
accuracy_score(y_test,y_pred)

0.8417626439659489

## Apply N-Grams

In [78]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(1,3),max_features=500)
X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

rfc.fit(X_train_bow,y_train)
y_pred = rfc.predict(X_test_bow)
accuracy_score(y_test,y_pred)

0.801201802704056

## Using TF-IDF

In [80]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()

X_train_tfidf = tfidf.fit_transform(X_train['review']).toarray()
X_test_tfidf = tfidf.transform(X_test['review']).toarray()

In [81]:
rfc.fit(X_train_tfidf,y_train)
y_pred = rfc.predict(X_test_tfidf)
accuracy_score(y_test,y_pred)

0.8417626439659489

## Using Word2vec

In [130]:
import gensim
import numpy as np

In [87]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [109]:
story = []
j = 0
for doc in df['review']:
    raw_sent = sent_tokenize(doc)
    for sent in raw_sent:
        story.append(simple_preprocess(sent))

In [105]:
len(story)

9983

In [111]:
model = gensim.models.Word2Vec(
    window=10,
    min_count=2
)

In [112]:
model.build_vocab(story)

In [115]:
model.train(story, total_examples=model.corpus_count, epochs=model.epochs)

(5472004, 5900590)

In [117]:
len(model.wv.index_to_key)

35162

### We have a individual word vector we transform into document vector

In [162]:
def document_vector(doc):
    # remove out of vocabulary words
    doc = [word for word in doc.split() if word in model.wv.index_to_key]
    return np.mean(model.wv[doc],axis=0)

In [163]:
document_vector(df['review'].values[0])

array([-1.34063289e-01,  2.52067655e-01,  2.35134438e-01,  2.22789481e-01,
        7.86812790e-03, -5.45608222e-01,  1.08470671e-01,  9.93431568e-01,
       -4.35586303e-01, -1.40116438e-01, -7.52530545e-02, -5.22707701e-01,
        3.18119698e-03,  2.96117961e-01,  1.34801134e-01, -1.89815417e-01,
        1.56415910e-01, -2.98786998e-01, -1.14200711e-01, -7.52813578e-01,
        3.00474346e-01,  2.67839044e-01,  1.39190793e-01, -2.03524947e-01,
       -3.52008007e-02,  1.58504676e-02, -3.54860038e-01, -1.28364772e-01,
       -3.69031906e-01,  8.80973563e-02,  4.45673943e-01, -2.30607539e-02,
        1.43223092e-01, -2.97657430e-01, -6.78314492e-02,  4.13052589e-01,
        8.96111876e-02, -3.21283966e-01, -2.07524315e-01, -7.27236688e-01,
        3.92737798e-02, -3.38204533e-01, -2.21792459e-01, -1.82137080e-02,
        4.58425075e-01,  2.35552192e-02, -4.37312841e-01, -1.64974462e-02,
        2.66464025e-01,  9.44750085e-02,  8.78566429e-02, -3.84085268e-01,
       -1.23212494e-01,  

In [164]:
from tqdm import tqdm

In [167]:
X = []
for doc in tqdm(df['review'].values):
    X.append(document_vector(doc))

100%|████████████████████████████████████████████████████████████| 9983/9983 [07:18<00:00, 22.76it/s]


In [175]:
X = np.array(X)

In [176]:
X.shape

(9983, 100)

In [180]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=.2, random_state=20)

In [187]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
rfc.fit(X_train,y_train)
y_pred = rfc.predict(X_test)
accuracy_score(y_test,y_pred)

0.7916875312969455