# Import Libraries

In [1]:
## importing necessery libraries and data
import numpy as np
import pandas as pd
import re

In [2]:
train =pd.read_csv('TrainData.tsv', sep='\t')

In [3]:
train.head(3)

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...


In [4]:
train.sentiment.value_counts()

1    12500
0    12500
Name: sentiment, dtype: int64

# Preprocessing

In [5]:
from nltk.corpus import stopwords

def preprocess(df):
    df['review'].replace( { r"#(\w+)" : '' }, inplace= True, regex = True) #Remove Hashtags
    df['review'].replace( { r"@(\w+)" : '' }, inplace= True, regex = True) #Remove Mention
    df['review'].astype(str).replace( { r"http\S+" : '' }, inplace= True, regex = True) #Remove URL
    df['review'].replace( { r'[^\w\s]' : '' }, inplace= True, regex = True) # remove punctuation
    df['review'] = df['review'].str.lower() # To lower case

    stop = stopwords.words('english')
    df['review'] = df['review'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [6]:
## preprocessing train and test data
preprocess(train)
train.review[0]

'stuff going moment mj ive started listening music watching odd documentary watched wiz watched moonwalker maybe want get certain insight guy thought really cool eighties maybe make mind whether guilty innocent moonwalker part biography part feature film remember going see cinema originally released subtle messages mjs feeling towards press also obvious message drugs bad mkaybr br visually impressive course michael jackson unless remotely like mj anyway going hate find boring may call mj egotist consenting making movie mj fans would say made fans true really nice himbr br actual feature film bit finally starts 20 minutes excluding smooth criminal sequence joe pesci convincing psychopathic powerful drug lord wants mj dead bad beyond mj overheard plans nah joe pescis character ranted wanted people know supplying drugs etc dunno maybe hates mjs musicbr br lots cool things like mj turning car robot whole speed demon sequence also director must patience saint came filming kiddy bad sequence

## TrainTestSplit

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train.review, train.sentiment, test_size = 0.2, stratify = train.sentiment)

print(X_train.shape[0])
print(X_test.shape[0])

20000
5000


In [8]:
print('review : \n',X_train.values[0])
print('\nsentiment: \n',y_train.values[0])

review : 
 movie madeleine carroll cast could possibly unwatchable said add british film comes close story takes place board ss atlantic loosely based titanics unfinished voyage word unsinkable spoken liner strikes iceberg hear heavenly choir sing nearer god thee doomed passengers eventually take anthem clever bit sound work year films release 1929 means modern viewer accept otherwise primitive sound many acting conventions silent films stage arent problems films major flaw pacing pacing well developed silents however dialog delivered realistic speed movies running time would cut half intended effect drama clarity new medium result unhappily tiresome films structure preposterously illogical inept paradoxically found certain details editing quite modern technique fine abrupt cuts one area ship another sometimes even sound effects although board atlantic first shot well 4 minutes movie discovered fact long intrusive musical passages ships dance orchestra entertaining easy sound personal 

# CountVenctorizer

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

X_train_cv = cv.fit_transform(X_train.values)
X_test_cv = cv.transform(X_test.values)

In [10]:
print(type(X_train_cv))
X_train_cv.toarray()[:2]

<class 'scipy.sparse._csr.csr_matrix'>


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [11]:
X_train_cv.shape

(20000, 106941)

In [12]:
print(cv.get_feature_names_out()[10000:10050],"\n")
cv.vocabulary_

['bedlam' 'bednob' 'bedonly' 'bedouin' 'bedpersonbehindyouinthemirror'
 'bedpost' 'bedraggled' 'bedridden' 'bedroom' 'bedroomone' 'bedrooms'
 'bedrunkorbedamned' 'beds' 'bedsheet' 'bedsheets' 'bedside' 'bedsit'
 'bedsty' 'bedtime' 'bedtimebr' 'bedwetting' 'bee' 'beeb' 'beebs' 'beech'
 'beecham' 'beechnick' 'beef' 'beefcake' 'beefed' 'beefheart' 'beefing'
 'beefs' 'beefy' 'beefyhave' 'beegees' 'beehive' 'beehives' 'beejesus'
 'beek' 'beekeepers' 'beeks' 'beeline' 'beemans' 'beems' 'beenand'
 'beenbetter' 'beenbr' 'beendirector' 'beendonebeforebutstillfunny'] 



{'movie': 62537,
 'madeleine': 57031,
 'carroll': 15956,
 'cast': 16162,
 'could': 21790,
 'possibly': 73134,
 'unwatchable': 100272,
 'said': 81447,
 'add': 3176,
 'british': 13671,
 'film': 35001,
 'comes': 19791,
 'close': 18911,
 'story': 90591,
 'takes': 93240,
 'place': 71908,
 'board': 12073,
 'ss': 89474,
 'atlantic': 7606,
 'loosely': 55937,
 'based': 9432,
 'titanics': 96024,
 'unfinished': 99475,
 'voyage': 102184,
 'word': 105123,
 'unsinkable': 100107,
 'spoken': 89116,
 'liner': 55125,
 'strikes': 90956,
 'iceberg': 46456,
 'hear': 43221,
 'heavenly': 43346,
 'choir': 17875,
 'sing': 86263,
 'nearer': 64169,
 'god': 39782,
 'thee': 94676,
 'doomed': 27951,
 'passengers': 69879,
 'eventually': 32173,
 'take': 93211,
 'anthem': 5903,
 'clever': 18730,
 'bit': 11332,
 'sound': 88381,
 'work': 105154,
 'year': 106005,
 'films': 35162,
 'release': 78165,
 '1929': 565,
 'means': 59272,
 'modern': 61448,
 'viewer': 101594,
 'accept': 2595,
 'otherwise': 68158,
 'primitive': 7416

In [13]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report,accuracy_score

rf_model = RandomForestClassifier()
gb_model = GradientBoostingClassifier()
lr_model = LogisticRegression(max_iter=1000)
model = MultinomialNB()

In [14]:
model.fit(X_train_cv,y_train)
y_pred = model.predict(X_test_cv)
print("Naive Bayes Model\n")
print(classification_report(y_test,y_pred))

Naive Bayes Model

              precision    recall  f1-score   support

           0       0.84      0.88      0.86      2500
           1       0.87      0.84      0.86      2500

    accuracy                           0.86      5000
   macro avg       0.86      0.86      0.86      5000
weighted avg       0.86      0.86      0.86      5000



In [15]:
rf_model.fit(X_train_cv,y_train)
y_pred = rf_model.predict(X_test_cv)
print("Random Forest Model\n")
print(classification_report(y_test,y_pred))

Random Forest Model

              precision    recall  f1-score   support

           0       0.86      0.85      0.86      2500
           1       0.85      0.86      0.86      2500

    accuracy                           0.86      5000
   macro avg       0.86      0.86      0.86      5000
weighted avg       0.86      0.86      0.86      5000



In [16]:
gb_model.fit(X_train_cv,y_train)
y_pred = gb_model.predict(X_test_cv)
print("Gradient Boosting Model\n")
print(classification_report(y_test,y_pred))

Gradient Boosting Model

              precision    recall  f1-score   support

           0       0.85      0.74      0.79      2500
           1       0.77      0.87      0.82      2500

    accuracy                           0.81      5000
   macro avg       0.81      0.80      0.80      5000
weighted avg       0.81      0.81      0.80      5000



In [17]:
lr_model.fit(X_train_cv,y_train)
y_pred = lr_model.predict(X_test_cv)
print("Logistic Regression Model\n")
print(classification_report(y_test,y_pred))

Logistic Regression Model

              precision    recall  f1-score   support

           0       0.88      0.88      0.88      2500
           1       0.88      0.88      0.88      2500

    accuracy                           0.88      5000
   macro avg       0.88      0.88      0.88      5000
weighted avg       0.88      0.88      0.88      5000



In [18]:
print(accuracy_score(y_test,y_pred))

0.8796


In [19]:
## some random reviews taken from rottentomatos ;)

rev = ["McQuarrie puts enough bloody crunch into the action to dispel any suggestions of creeping comic decadence. Top-flight supporting performances help.", 
       "Open the door, don’t open the door…we fail to care and wish the other dimensional demon would put us all out of our misery.",
       "The long-awaited direct sequel to Insidious: Chapter 2 is a tepid supernatural-horror disappointment.", 
       "“Mission: Impossible – Dead Reckoning Part One” is just incredibly fun. It feels half its length and contains enough memorable action sequences for some entire franchises.",
       "The recut American version is truly awful, but a good 75 percent of the awfulness is attributable to Miramax, the film's distributor.",
       "Despite being the equivalent of a walk through a county fair horror house, this is the Insidious installment with the best executed and sustained suspense."]
rev = cv.transform(rev)
lr_model.predict(rev)

array([1, 0, 0, 1, 0, 1], dtype=int64)