In [1]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.layers import BatchNormalization

es = EarlyStopping(monitor = "val_loss", mode = "min", patience = 5)

EPOCHS = 1000



In [2]:
df = pd.read_csv('./datasets/IMDB Dataset.csv', sep=",")

In [3]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [4]:
df['review'][0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [5]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
from tqdm import tqdm
from nltk import sent_tokenize,word_tokenize

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...


True

In [6]:
main = []

# Storing all punctuations using RE library like !;,"% etc
re_puncs = re.compile('[%s]' % re.escape(string.punctuation))
# Storing all stop words like a, an, the, when, there, this etc
stop_word  = set(stopwords.words('english'))
stop_word.add("im")
# print(stop_word)


In [7]:
sentences_listed = [line.split(" ") for line in df['review']]

import gensim

word_model = gensim.models.Word2Vec(sentences = sentences_listed, vector_size = 10)
words = list(word_model.wv.key_to_index)
print(len(words))

76833


In [8]:
word_model.wv.most_similar("actress")

[('actor', 0.9822564721107483),
 ('role', 0.9570860266685486),
 ('actor,', 0.9441996812820435),
 ('role,', 0.9428852796554565),
 ('accent', 0.9201539754867554),
 ('voice', 0.9137205481529236),
 ('comedian', 0.8975550532341003),
 ('villain', 0.8922207355499268),
 ('director,', 0.8850878477096558),
 ('performance', 0.8849977254867554)]

In [9]:
# Making Lemmatizing object
lem = WordNetLemmatizer()
# Using Porter Stemmer
p_stem = PorterStemmer()

# Traversing whole dataset
for i in tqdm(range(len(df['review']))):
    # Tokenization
    tokens = word_tokenize(str(df['review'][i]))
    # Converting all characters to lower case
    tokens = [w.lower() for w in tokens]
    # Remove all punctuations from sentenses
    tokens = [re_puncs.sub('', w) for w in tokens]
    # Checking all words is alphabets or not
    tokens = [i for i in tokens if i.isalpha()]
    # Removing all stop words from the sentenses
    tokens = [w for w in tokens if w not in stop_word]
    # Doing Lemmatizing of words
    tokens = [lem.lemmatize(w) for w in tokens]
    # Stemming process
    tokens = [p_stem.stem(w) for w in tokens]
    # Finally convert to string
    r = ' '.join(tokens)
    # Storing the final string into main list
    main.append(r)
    
#https://www.kaggle.com/code/tanujdhiman/twitter-sentiment-analysis

100%|██████████| 50000/50000 [11:28<00:00, 72.60it/s] 


In [10]:
df['translated'] = main

In [11]:
df

Unnamed: 0,review,sentiment,translated
0,One of the other reviewers has mentioned that ...,positive,one review mention watch oz episod hook right ...
1,A wonderful little production. <br /><br />The...,positive,wonder littl product br br film techniqu unass...
2,I thought this was a wonderful way to spend ti...,positive,thought wonder way spend time hot summer weeke...
3,Basically there's a family where a little boy ...,negative,basic famili littl boy jake think zombi closet...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei love time money visual stun film...
...,...,...,...
49995,I thought this movie did a down right good job...,positive,thought movi right good job nt creativ origin ...
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,bad plot bad dialogu bad act idiot direct anno...
49997,I am a Catholic taught in parochial elementary...,negative,cathol taught parochi elementari school nun ta...
49998,I'm going to have to disagree with the previou...,negative,go disagre previou comment side maltin one sec...


In [12]:
df['translated'][0]

'one review mention watch oz episod hook right exactli happen br br first thing struck oz brutal unflinch scene violenc set right word go trust show faint heart timid show pull punch regard drug sex violenc hardcor classic use word br br call oz nicknam given oswald maximum secur state penitentari focu mainli emerald citi experiment section prison cell glass front face inward privaci high agenda em citi home mani aryan muslim gangsta latino christian italian irish scuffl death stare dodgi deal shadi agreement never far away br br would say main appeal show due fact go show would nt dare forget pretti pictur paint mainstream audienc forget charm forget romanc oz nt mess around first episod ever saw struck nasti surreal could nt say readi watch develop tast oz got accustom high level graphic violenc violenc injustic crook guard sold nickel inmat kill order get away well manner middl class inmat turn prison bitch due lack street skill prison experi watch oz may becom comfort uncomfort vie

In [13]:
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['translated'])

In [14]:
print(len(tokenizer.word_counts))

101287


In [15]:
counter = 0
for key, value in tokenizer.word_counts.items():
    if value > 40:
        print(key, value)
        counter+=1
        
print(counter)

one 54100
review 4292
mention 3013
watch 27683
oz 268
episod 4980
hook 587
right 6890
exactli 1964
happen 7166
br 201951
first 17257
thing 16459
struck 267
brutal 937
scene 21204
violenc 2063
set 7645
word 3629
go 22621
trust 731
show 19740
faint 117
heart 2709
timid 51
pull 1849
punch 536
regard 957
drug 1673
sex 3297
hardcor 255
classic 4255
use 9079
call 5461
nicknam 83
given 3568
maximum 103
secur 494
state 2070
focu 1324
mainli 774
citi 2466
experiment 209
section 459
prison 1407
cell 461
glass 503
front 1232
face 4264
high 3875
agenda 186
em 243
home 3767
mani 13457
muslim 202
gangsta 41
latino 91
christian 945
italian 1144
irish 384
death 4229
stare 421
dodgi 91
deal 2595
shadi 72
agreement 79
never 12907
far 5802
away 5458
would 26492
say 14969
main 4637
appeal 1367
due 1792
fact 7309
nt 66249
dare 648
forget 1653
pretti 7226
pictur 3790
paint 988
mainstream 378
audienc 5207
charm 1766
romanc 1452
mess 1520
around 7090
ever 11818
saw 6344
nasti 705
surreal 525
could 18543
readi

In [16]:
tokenizer.texts_to_sequences(df['translated'])

[[5,
  226,
  347,
  13,
  2774,
  189,
  1574,
  114,
  544,
  104,
  1,
  1,
  32,
  36,
  2785,
  2774,
  1060,
  10985,
  19,
  506,
  91,
  114,
  290,
  18,
  1311,
  21,
  4909,
  390,
  8225,
  21,
  566,
  1675,
  1040,
  631,
  317,
  506,
  2896,
  229,
  73,
  290,
  1,
  1,
  167,
  2774,
  6088,
  294,
  10783,
  5315,
  1785,
  503,
  37036,
  799,
  1253,
  16416,
  428,
  3324,
  1887,
  746,
  1884,
  1759,
  853,
  228,
  17007,
  12040,
  260,
  3588,
  2993,
  428,
  275,
  43,
  10078,
  3396,
  9310,
  5757,
  1052,
  898,
  2151,
  14533,
  231,
  2005,
  5758,
  404,
  6639,
  6279,
  51,
  153,
  168,
  1,
  1,
  14,
  39,
  202,
  773,
  21,
  589,
  100,
  18,
  21,
  14,
  4,
  1452,
  645,
  102,
  272,
  1019,
  2180,
  180,
  645,
  598,
  645,
  731,
  2774,
  4,
  697,
  108,
  32,
  189,
  56,
  129,
  2785,
  1358,
  1706,
  25,
  4,
  39,
  1409,
  13,
  359,
  899,
  2774,
  101,
  7545,
  260,
  459,
  1162,
  506,
  506,
  5027,
  2856,
  1696,
 

In [17]:
tokenized_reviews = tokenizer.texts_to_sequences(df['translated'])
print(tokenized_reviews[0])

[5, 226, 347, 13, 2774, 189, 1574, 114, 544, 104, 1, 1, 32, 36, 2785, 2774, 1060, 10985, 19, 506, 91, 114, 290, 18, 1311, 21, 4909, 390, 8225, 21, 566, 1675, 1040, 631, 317, 506, 2896, 229, 73, 290, 1, 1, 167, 2774, 6088, 294, 10783, 5315, 1785, 503, 37036, 799, 1253, 16416, 428, 3324, 1887, 746, 1884, 1759, 853, 228, 17007, 12040, 260, 3588, 2993, 428, 275, 43, 10078, 3396, 9310, 5757, 1052, 898, 2151, 14533, 231, 2005, 5758, 404, 6639, 6279, 51, 153, 168, 1, 1, 14, 39, 202, 773, 21, 589, 100, 18, 21, 14, 4, 1452, 645, 102, 272, 1019, 2180, 180, 645, 598, 645, 731, 2774, 4, 697, 108, 32, 189, 56, 129, 2785, 1358, 1706, 25, 4, 39, 1409, 13, 359, 899, 2774, 101, 7545, 260, 459, 1162, 506, 506, 5027, 2856, 1696, 2558, 18364, 4260, 99, 458, 12, 168, 20, 970, 674, 629, 4260, 95, 746, 3986, 589, 289, 594, 962, 746, 337, 13, 2774, 120, 135, 1765, 2562, 224, 1406, 12, 433, 3335, 379]
