In [149]:
import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer
import re
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [128]:
temp_df = pd.read_csv('IMDb dataset/IMDB Dataset.csv')

In [129]:
df = temp_df.iloc[:10000]

In [130]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [131]:
df.shape

(10000, 2)

In [132]:
df['sentiment'].value_counts()

sentiment
positive    5028
negative    4972
Name: count, dtype: int64

shows almost balanced class

In [133]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [134]:
df.duplicated().sum()

17

In [135]:
df.drop_duplicates(inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop_duplicates(inplace = True)


In [136]:
df.duplicated().sum()

0

In [137]:
# Remove tags
# Lowercase
# remove stopwords
# lemmatization

In [138]:
def remove_tags(raw_text):
    cleaned_text = re.sub(re.compile('<.*?>'), '', raw_text)
    return cleaned_text

In [None]:
# Word2Vec

import gensim
from gensim.utils import simple_preprocess
from nltk import sent_tokenize

In [193]:
story = []

for doc in df['review']:
    raw_sent = sent_tokenize(doc)
    for sent in raw_sent:
        story.append(simple_preprocess(sent))

In [195]:
model = gensim.models.Word2Vec(
    window = 10,
    min_count = 2
)

In [None]:
model.build_vocab(story)
model.train(story, total_examples=model.corpus_count, epochs=model.epochs)

# Averaging the vectors
def document_vector(doc):
    tokens = simple_preprocess(doc)
    tokens = [w for w in tokens if w in model.wv.index_to_key]
    if len(tokens) == 0:
        return np.zeros(model.vector_size)
    return np.mean(model.wv[tokens], axis=0)

# create a column of averaged Word2Vec vectors for each review
df['w2v_vec'] = df['review'].apply(document_vector)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['w2v_vec'] = df['review'].apply(document_vector)


In [203]:
from tqdm import tqdm

X = []
for doc in tqdm(df['review'].values):
    X.append(document_vector(doc))

100%|██████████| 9983/9983 [00:37<00:00, 265.52it/s]


In [139]:
df['review'] = df['review'].apply(remove_tags)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['review'] = df['review'].apply(remove_tags)


In [140]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
9995,"Fun, entertaining movie about WWII German spy ...",positive
9996,Give me a break. How can anyone say that this ...,negative
9997,This movie is a bad movie. But after watching ...,negative
9998,This is a movie that was probably made to ente...,negative


In [141]:
df['review'] = df['review'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['review'] = df['review'].apply(lambda x:x.lower())


In [142]:
sw_list = stopwords.words('english')

In [143]:
df['review'] = df['review'].apply(lambda x: [item for item in x.split() if item not in sw_list]).apply(lambda x:" ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['review'] = df['review'].apply(lambda x: [item for item in x.split() if item not in sw_list]).apply(lambda x:" ".join(x))


In [144]:
lemmatizer = WordNetLemmatizer()
df['review'] = df['review'].apply(lambda x: " ".join([lemmatizer.lemmatize(word, pos='v') for word in x.split()]))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['review'] = df['review'].apply(lambda x: " ".join([lemmatizer.lemmatize(word, pos='v') for word in x.split()]))


In [145]:
df['review']

0       one reviewers mention watch 1 oz episode hooke...
1       wonderful little production. film technique un...
2       think wonderful way spend time hot summer week...
3       basically there's family little boy (jake) thi...
4       petter mattei's "love time money" visually stu...
                              ...                        
9995    fun, entertain movie wwii german spy (julie an...
9996    give break. anyone say "good hockey movie"? kn...
9997    movie bad movie. watch endless series bad horr...
9998    movie probably make entertain middle school, e...
9999    smash film film-making. show intense strange r...
Name: review, Length: 9983, dtype: object

In [146]:
df

Unnamed: 0,review,sentiment
0,one reviewers mention watch 1 oz episode hooke...,positive
1,wonderful little production. film technique un...,positive
2,think wonderful way spend time hot summer week...,positive
3,basically there's family little boy (jake) thi...,negative
4,"petter mattei's ""love time money"" visually stu...",positive
...,...,...
9995,"fun, entertain movie wwii german spy (julie an...",positive
9996,"give break. anyone say ""good hockey movie""? kn...",negative
9997,movie bad movie. watch endless series bad horr...,negative
9998,"movie probably make entertain middle school, e...",negative


In [147]:
X = df['review']
y = df['sentiment']

In [None]:
encoder = LabelEncoder()        
y = encoder.fit_transform(y)

In [150]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [152]:
# BoW

from sklearn.feature_extraction.text import CountVectorizer

In [167]:
cv = CountVectorizer(max_features = 3000)

In [168]:
X_train_bow = cv.fit_transform(X_train).toarray()
X_test_bow = cv.transform(X_test).toarray()

In [169]:
X_train_bow.shape

(7986, 3000)

In [170]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

In [171]:
gnb.fit(X_train_bow, y_train)

0,1,2
,priors,
,var_smoothing,1e-09


In [172]:
y_pred = gnb.predict(X_test_bow)

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
accuracy_score(y_test, y_pred)

0.7826740110165248

In [173]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.84      0.79       985
           1       0.83      0.72      0.77      1012

    accuracy                           0.78      1997
   macro avg       0.79      0.78      0.78      1997
weighted avg       0.79      0.78      0.78      1997



In [174]:
# Tf- Idf

from sklearn.feature_extraction.text import TfidfVectorizer

In [182]:
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train).toarray()
X_test_tfidf = tfidf.transform(X_test).toarray()

In [184]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()

In [186]:
rf.fit(X_train_tfidf, y_train)
y_pred = rf.predict(X_test_tfidf)

In [189]:
accuracy_score(y_test, y_pred)

0.8547821732598898

In [190]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.86      0.85       985
           1       0.87      0.84      0.85      1012

    accuracy                           0.85      1997
   macro avg       0.85      0.85      0.85      1997
weighted avg       0.86      0.85      0.85      1997

