In [19]:
import numpy as np
import regex as re
import pandas as pd
import gc
import logging
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
import string
import nltk as nlp
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec, KeyedVectors
#from gensim.test.utils import common_texts
#from collections import Counter #like map but worse cuz it senses only the tally --> not for computation :(
from tensorflow.python.client import device_lib
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers

AttributeError: module 'numpy.random' has no attribute 'default_rng'

In [158]:
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 17051795235338798196
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 2907098318
locality {
  bus_id: 1
  links {
  }
}
incarnation: 17411982008057118797
physical_device_desc: "device: 0, name: GeForce GTX 1050 Ti, pci bus id: 0000:01:00.0, compute capability: 6.1"
]


In [159]:
print(tf.test.is_built_with_cuda())

True


#### LOADING IMDB DATASET

In [160]:
dataframe = pd.read_csv(r'..\\IMDB Dataset.csv')

In [161]:
dataframe.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [162]:
dataframe.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [163]:
np.shape(dataframe)

(50000, 2)

In [164]:
dataframe['sentiment'] = dataframe['sentiment'].replace('positive', 1)
dataframe['sentiment'] = dataframe['sentiment'].replace('negative', 0)
dataframe.head()
#in case of non-binary classes it makes more sense to use label encoder rather than replace

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


#### CHECK FOR NULLS AND DUPLICATES

In [165]:
dataframe.isnull().sum()

review       0
sentiment    0
dtype: int64

In [166]:
dataframe.duplicated().sum()

418

In [167]:
dataframe.drop_duplicates(subset='review', keep='first', inplace=True)

In [168]:
np.shape(dataframe)

(49582, 2)

In [169]:
dataframe.duplicated().sum()

0

In [170]:
dataframe.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


#### split into test and train samples

In [171]:
train, test = train_test_split(dataframe, test_size = 0.3, random_state = 156, shuffle=True)

In [172]:
train.head()

Unnamed: 0,review,sentiment
3298,This movie is about a group of people who are ...,0
20811,This was a less than exciting short film I saw...,0
49519,"<br /><br />Crackerjack, starring Mick Malloy ...",1
32422,Why do I watch movies like this ? - other than...,0
49066,"""Why did they make them so big? Why didn't the...",1


In [173]:
test.head()

Unnamed: 0,review,sentiment
10310,"Chaplin was great a silent comedian, but many ...",0
20472,This has got to be one of Australia's best pro...,1
31604,A surprising misfire from the usually reliable...,0
26404,Why do people bitch about this movie and not a...,1
30664,"Criticism of the film EVENING, based on the no...",1


In [174]:
del dataframe

In [175]:
X_train = train['review'].values
y_train = train['sentiment'].values
X_test = test['review'].values
y_test = test['sentiment'].values

In [176]:
del train, test

In [177]:
gc.collect()

51

#### REMOVING NON-WORD CHARACTERS FROM THE DATASET

In [178]:
def preprocess(text, pattern):
    if pattern=='[.]+':
        text = re.sub(pattern, '. ', text)
    elif pattern =="[']":
        text =  re.sub(pattern, ' ', text)
    else:
        text = re.sub(pattern, '', text)
    #print(text, '\n')
    return text

In [179]:
X_train = np.vectorize(preprocess)(X_train, '<[^>]*>') #remove markup
#print(dataframe.loc[0, 'review'])
#dataframe.head()
X_test = np.vectorize(preprocess)(X_test, '<[^>]*>') #remove markup

In [180]:
X_train = np.vectorize(preprocess)(X_train, '[.]+') #remove ... and replace with .
#print(dataframe.loc[0, 'review'])
#dataframe.head()
X_test = np.vectorize(preprocess)(X_test, '[.]+') #remove ... and replace with .

In [181]:
X_train = np.vectorize(preprocess)(X_train, '[0-9]+') #remove numbers and replace with none
#print(dataframe.loc[0, 'review'])
#dataframe.head()
X_test = np.vectorize(preprocess)(X_test, '[0-9]+') #remove numbers and replace with none
#print(X_train)

In [182]:
X_train = np.vectorize(preprocess)(X_train, "[']") #remove ' and replace with 
#print(dataframe.loc[0, 'review'])
#train.head()
X_test = np.vectorize(preprocess)(X_test, "[']") #remove ' and replace with 

In [183]:
gc.collect()

0

In [184]:
X_train = np.vectorize(preprocess)(X_train, '[^\w\s]*') #remove everything that's not word space
# ' is left to handle contractions
#print(dataframe.loc[0, 'review'])
X_test = np.vectorize(preprocess)(X_test, '[^\w\s]*') #remove everything that's not word space or '

In [185]:
X_train[0]

'This movie is about a group of people who are infected by a powerful manmade virus  They are pursued by government men into the desert The premise of the film is quite interesting but is hampered by the fact that the delivery is extremely boring  At no point does the film engage with the viewer on any level  Granted the miniscule budget is a problem but is not the reason for the film s failure  Much more at fault is the very pofaced delivery  There is a great deal of narration but unfortunately the narrator has an annoyingly overdramatic voice  Very little seems to happen to these people and well before the end you will be rooting for the government men  the sooner they kill the protagonists the sooner the movie will end  A much better title for this film would have been Four People Run About In The Desert With Some Stock Footage Of A Helicopter  Overall very tedious '

In [189]:
X_train = [sentence.lower() for sentence in X_train] #make it lower
print(X_train[0])
X_test = [sentence.lower() for sentence in X_test] #make it lower
#print(dataframe.loc[0, 'review'])
#X_test = X_test.str.lower()#make it lower
#test.head()

this movie is about a group of people who are infected by a powerful manmade virus  they are pursued by government men into the desert the premise of the film is quite interesting but is hampered by the fact that the delivery is extremely boring  at no point does the film engage with the viewer on any level  granted the miniscule budget is a problem but is not the reason for the film s failure  much more at fault is the very pofaced delivery  there is a great deal of narration but unfortunately the narrator has an annoyingly overdramatic voice  very little seems to happen to these people and well before the end you will be rooting for the government men  the sooner they kill the protagonists the sooner the movie will end  a much better title for this film would have been four people run about in the desert with some stock footage of a helicopter  overall very tedious 


#### DOES IT MAKE SENSE TO REMOVE SOME WORDS TO REDUCE COMPUTATION?

In [191]:
count = CountVectorizer()
bag = count.fit_transform(X_train)
len(count.vocabulary_)

121963

#### Got over 2 Lakh words --> it makes sense to remove some words like articles and prepositions out
#### Better to remove stop words first (Why? --> documentation wip)

In [192]:
#something with tfidf
#question: does it make sense to do tfidf first and then remove stop words using the nltk corpus or 
#remove stop words using the corpus first then perform tfidf next

In [193]:
nlp.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rps24\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [194]:
stop = set(stopwords.words('english')) #set makes serach O(1)
#originally stopwords.words('english') yields a list
#print(stop)

In [195]:
ps  = PorterStemmer()

In [196]:
def remove_stopwords_and_stem(text):
    text_ = word_tokenize(text)
    tokens = []
    #print(text_)
    for word in text_:
        if word not in stop:
            tokens.append(ps.stem(word))
    #return lemmatization(tokens)
    #print(tokens)
    text = ' '.join(tokens) #send only tokens sent as a joined sentence
    return text 

In [197]:
X_train = np.vectorize(remove_stopwords_and_stem)(X_train)
X_test = np.vectorize(remove_stopwords_and_stem)(X_test)
print(X_train[0])

movi group peopl infect power manmad viru pursu govern men desert premis film quit interest hamper fact deliveri extrem bore point film engag viewer level grant miniscul budget problem reason film failur much fault pofac deliveri great deal narrat unfortun narrat annoyingli overdramat voic littl seem happen peopl well end root govern men sooner kill protagonist sooner movi end much better titl film would four peopl run desert stock footag helicopt overal tediou


In [198]:
lemmatizer = WordNetLemmatizer()
nlp.download('wordnet')
reviews = []

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rps24\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [199]:
def lemmatization(text):
    text_ = word_tokenize(text)
    tokens = []
    for word in text_:
        tokens.append(lemmatizer.lemmatize(word))
    text = ' '.join(tokens)
    reviews.append(tokens)
    return text

In [200]:
X_train = np.vectorize(lemmatization)(X_train)
X_test = np.vectorize(lemmatization)(X_test)
print(X_train[0])

movi group peopl infect power manmad viru pursu govern men desert premis film quit interest hamper fact deliveri extrem bore point film engag viewer level grant miniscul budget problem reason film failur much fault pofac deliveri great deal narrat unfortun narrat annoyingli overdramat voic littl seem happen peopl well end root govern men sooner kill protagonist sooner movi end much better titl film would four peopl run desert stock footag helicopt overal tediou


#### word2vec

In [201]:
all_reviews = np.array(reviews)
del reviews

  all_reviews = np.array(reviews)


In [202]:
gc.collect()

0

In [203]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [204]:
word2vec_model = Word2Vec(all_reviews, window = 3, min_count = 1, sg = 2, size = 256, workers = 5) #sg --> skipgram
#workers --> number of threads in useb

2021-10-04 20:48:33,397 : INFO : collecting all words and their counts
2021-10-04 20:48:33,397 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-10-04 20:48:33,576 : INFO : PROGRESS: at sentence #10000, processed 1160527 words, keeping 46426 word types
2021-10-04 20:48:33,762 : INFO : PROGRESS: at sentence #20000, processed 2337204 words, keeping 68209 word types
2021-10-04 20:48:33,950 : INFO : PROGRESS: at sentence #30000, processed 3510222 words, keeping 86075 word types
2021-10-04 20:48:34,142 : INFO : PROGRESS: at sentence #40000, processed 4660315 words, keeping 102157 word types
2021-10-04 20:48:34,318 : INFO : collected 116586 word types from a corpus of 5783944 raw words and 49584 sentences
2021-10-04 20:48:34,318 : INFO : Loading a fresh vocabulary
2021-10-04 20:48:34,477 : INFO : effective_min_count=1 retains 116586 unique words (100% of original 116586, drops 0)
2021-10-04 20:48:34,478 : INFO : effective_min_count=1 leaves 5783944 word corpus (

2021-10-04 20:49:35,761 : INFO : worker thread finished; awaiting finish of 3 more threads
2021-10-04 20:49:35,786 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-10-04 20:49:35,791 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-10-04 20:49:35,792 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-10-04 20:49:35,793 : INFO : EPOCH - 4 : training on 5783944 raw words (5463404 effective words) took 11.3s, 483087 effective words/s
2021-10-04 20:49:36,801 : INFO : EPOCH 5 - PROGRESS: at 8.77% examples, 475410 words/s, in_qsize 9, out_qsize 0
2021-10-04 20:49:37,828 : INFO : EPOCH 5 - PROGRESS: at 19.42% examples, 520102 words/s, in_qsize 9, out_qsize 0
2021-10-04 20:49:38,834 : INFO : EPOCH 5 - PROGRESS: at 29.81% examples, 535514 words/s, in_qsize 9, out_qsize 0
2021-10-04 20:49:39,835 : INFO : EPOCH 5 - PROGRESS: at 39.82% examples, 539607 words/s, in_qsize 9, out_qsize 0
2021-10-04 20:49:40,854 : INFO : EPOCH 5 - PROG

In [205]:
#need to save model here
word2vec_model.wv.save_word2vec_format('../word_embeddings.txt')

2021-10-04 20:49:45,686 : INFO : storing 116586x256 projection weights into ../word_embeddings.txt


In [206]:
word2vec_model = KeyedVectors.load_word2vec_format('../word_embeddings.txt', binary = False, unicode_errors = 'ignore')

2021-10-04 20:50:03,374 : INFO : loading projection weights from ../word_embeddings.txt
2021-10-04 20:50:18,493 : INFO : loaded (116586, 256) matrix from ../word_embeddings.txt


In [207]:
word2vec_model.wv.similarity('saw', 'may')

  word2vec_model.wv.similarity('saw', 'may')


0.4202183

In [208]:
word2vec_model.wv.similarity('saw', 'say')

  word2vec_model.wv.similarity('saw', 'say')


0.4640008

In [209]:
word2vec_model.wv.similarity('say', 'may')

  word2vec_model.wv.similarity('say', 'may')


0.43606555

In [210]:
word2vec_model.wv.similarity('gangsta', 'latino')

  word2vec_model.wv.similarity('gangsta', 'latino')


0.7215072

In [211]:
len(word2vec_model.wv.vocab)

  len(word2vec_model.wv.vocab)


116586

In [212]:
gc.collect()

0

#### load dictionary of word to vectors --> from gensim instance to dictionary

In [213]:
embedding = {}
with open('../word_embeddings.txt', encoding = 'utf-8') as f:
    #page = f.read()
    for line in f:
        record = line.split()
        #print(record[0])
        #word = record[0]
        embedding[record[0]] = np.asarray(record[1:])        

In [214]:
print(len(embedding))

116587


In [215]:
gc.collect()

0

#### CREATING TENSORS

In [216]:
token_tensor = Tokenizer()
token_tensor.fit_on_texts(all_reviews)
maxim = 0
for  review in all_reviews:
    maxim = max(maxim, len(review))
X_train_token = token_tensor.texts_to_sequences(all_reviews)
X_train_pad = pad_sequences(X_train_token, maxlen = maxim, padding='post')
X_test_token = token_tensor.texts_to_sequences(all_reviews)
X_test_pad = pad_sequences(X_train_token, maxlen = maxim, padding='post')

#### simple rnn

#### metrics