### Sentiment Analysis on IMDB reviw dataset

In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer # We also have snowbaallstemmer
nltk.download('stopwords')
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,confusion_matrix
from nltk.stem import SnowballStemmer
from sklearn.tree import DecisionTreeClassifier

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kpoli\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df= pd.read_csv('IMDBDataset.csv')

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


### Preprocessing

In [5]:
ps = SnowballStemmer('english')
stop_words = set(stopwords.words('english'))
html_tags_re = re.compile('<.*?>')

corpus = [
    " ".join([ps.stem(word) for word in re.sub('[^a-zA-z]'," ", re.sub(html_tags_re, '', document)).lower().split() 
              if word not in stop_words])
    for document in df['review']
]

In [59]:
corpus[0]

'one review mention watch oz episod hook right exact happen first thing struck oz brutal unflinch scene violenc set right word go trust show faint heart timid show pull punch regard drug sex violenc hardcor classic use word call oz nicknam given oswald maximum secur state penitentari focus main emerald citi experiment section prison cell glass front face inward privaci high agenda em citi home mani aryan muslim gangsta latino christian italian irish scuffl death stare dodgi deal shadi agreement never far away would say main appeal show due fact goe show dare forget pretti pictur paint mainstream audienc forget charm forget romanc oz mess around first episod ever saw struck nasti surreal say readi watch develop tast oz got accustom high level graphic violenc violenc injustic crook guard sold nickel inmat kill order get away well manner middl class inmat turn prison bitch due lack street skill prison experi watch oz may becom comfort uncomfort view that get touch darker side'

In [7]:
df['review'] = corpus

In [16]:
X = df['review']

y = pd.get_dummies(df['sentiment'])
y = y.iloc[:,1].values

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33,random_state=42)

### Bag of Words

In [19]:
cf = CountVectorizer(max_features= 2500,binary = True,ngram_range=(1,2))
X_train_cf = cf.fit_transform(X_train).toarray()
X_test_cf = cf.transform(X_test).toarray()

In [20]:
mnb = MultinomialNB()
mnb.fit(X_train_cf,y_train)
y_pred = mnb.predict(X_test_cf)
accuracy_score(y_test,y_pred)

0.8478787878787879

In [21]:
tree = DecisionTreeClassifier()
tree.fit(X_train_cf,y_train)
y_pred_dt = tree.predict(X_test_cf)
accuracy_score(y_test,y_pred_dt)

0.7136363636363636

In [22]:
confusion_matrix(y_test,y_pred_dt)

array([[5837, 2371],
       [2354, 5938]], dtype=int64)

### Tf-IDF

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(max_features=2500,ngram_range=(1,2))
X_train_tf = tf.fit_transform(X_train).toarray()
X_test_tf = tf.transform(X_test).toarray()

In [24]:
mnb_tf = MultinomialNB()
mnb_tf.fit(X_train_tf,y_train)
y_pred_tf = mnb_tf.predict(X_test_tf)
accuracy_score(y_test,y_pred_tf)

0.8483030303030303

### Word2Vec

In [25]:
# Converting each of the sentence into words to feed them to the word2vec model
words = [sent.split() for sent in corpus]

In [31]:
words[0]

['one',
 'review',
 'mention',
 'watch',
 'oz',
 'episod',
 'hook',
 'right',
 'exact',
 'happen',
 'first',
 'thing',
 'struck',
 'oz',
 'brutal',
 'unflinch',
 'scene',
 'violenc',
 'set',
 'right',
 'word',
 'go',
 'trust',
 'show',
 'faint',
 'heart',
 'timid',
 'show',
 'pull',
 'punch',
 'regard',
 'drug',
 'sex',
 'violenc',
 'hardcor',
 'classic',
 'use',
 'word',
 'call',
 'oz',
 'nicknam',
 'given',
 'oswald',
 'maximum',
 'secur',
 'state',
 'penitentari',
 'focus',
 'main',
 'emerald',
 'citi',
 'experiment',
 'section',
 'prison',
 'cell',
 'glass',
 'front',
 'face',
 'inward',
 'privaci',
 'high',
 'agenda',
 'em',
 'citi',
 'home',
 'mani',
 'aryan',
 'muslim',
 'gangsta',
 'latino',
 'christian',
 'italian',
 'irish',
 'scuffl',
 'death',
 'stare',
 'dodgi',
 'deal',
 'shadi',
 'agreement',
 'never',
 'far',
 'away',
 'would',
 'say',
 'main',
 'appeal',
 'show',
 'due',
 'fact',
 'goe',
 'show',
 'dare',
 'forget',
 'pretti',
 'pictur',
 'paint',
 'mainstream',
 'audi

In [32]:
# Loading the word2vec pretrained model
import gensim.downloader as api
from gensim.models import KeyedVectors

model_path = api.load("word2vec-google-news-300", return_path=True)

model = KeyedVectors.load_word2vec_format(model_path, binary=True)

In [36]:
model.most_similar('book')

[('tome', 0.7485830783843994),
 ('books', 0.7379177808761597),
 ('memoir', 0.7302926778793335),
 ('paperback_edition', 0.6868364214897156),
 ('autobiography', 0.6741527318954468),
 ('memoirs', 0.6505153179168701),
 ('Book', 0.6479282975196838),
 ('paperback', 0.6471226811408997),
 ('novels', 0.6341459155082703),
 ('hardback', 0.6283079981803894)]

### Avg Word2Vec

In [44]:
import numpy as np
def avg_word2vec(doc):
    # Remove out-of-vocabulary words
    doc = [word for word in doc if word in model]
    return np.mean(model[doc], axis=0)


In [45]:
from tqdm import tqdm

X = []
for i in tqdm(range(len(words))):
    X.append(avg_word2vec(words[i]))

100%|██████████████████████████████████████████████████████████████████████████| 50000/50000 [00:10<00:00, 4774.36it/s]


In [48]:
#### Model Building for this
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33,random_state=42)


In [53]:
X_train_wv = np.array(X_train)
X_test_wv = np.array(X_test)

In [54]:
X_train_wv.shape

(33500, 300)

In [55]:
X_test_wv.shape

(16500, 300)

In [57]:
dt = DecisionTreeClassifier()
dt.fit(X_train_wv,y_train)


In [58]:
y_pred_wv = dt.predict(X_test_wv)

accuracy_score(y_test,y_pred_wv)

0.6417575757575757