## Imports

In [1]:
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import os
import glob
from scipy.spatial.distance import cdist
from keras.models import Sequential
from keras.layers import Dense, GRU, Embedding,LSTM
from keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import pickle
import csv
import pandas as pd

KeyboardInterrupt: 

In [5]:
def load_data_from_csv(filename):    
    with open(filename, newline='', encoding="utf-8") as f:
        reader = csv.reader(f)
        data = list(reader)
    df = pd.read_csv(filename)
    rates = df['rate'].tolist()
    reviews = df['review'].tolist()
    return reviews,rates

In [6]:
%%time
xall,yall = load_data_from_csv('data.csv')
print("Set size: ", len(xall))

Set size:  50000
Wall time: 2.99 s


In [7]:
def count_number_of_samples_for_each_class(yall):
    numberOfSamples = [yall.count(1), yall.count(2), yall.count(3), yall.count(4), yall.count(7), yall.count(8),yall.count(9),yall.count(10)]
    return numberOfSamples

In [8]:
number_of_samples_for_each_class = count_number_of_samples_for_each_class(yall)

In [9]:
number_of_samples_for_each_class

[10122, 4586, 4961, 5331, 4803, 5859, 4607, 9731]

In [10]:
def assign_weight(number_of_all_samples, number_of_clesses, number_of_samples_for_class):
    return number_of_all_samples / (number_of_clesses * number_of_samples_for_class)

In [11]:
def assign_weight_for_each_class(number_of_all_samples, number_of_samples_for_each_class):
    weights = []
    for i in range (0, len(number_of_samples_for_each_class)):    
        x = assign_weight(number_of_all_samples, len(number_of_samples_for_each_class), number_of_samples_for_each_class[i])
        weights.append(x)
    return weights

In [12]:
weights = assign_weight_for_each_class(len(yall), number_of_samples_for_each_class)

In [13]:
weights

[0.6174669037739577,
 1.3628434365460096,
 1.2598266478532554,
 1.1723879197148752,
 1.3012700395586092,
 1.0667349377026796,
 1.3566312133709573,
 0.64227725824684]

In [14]:
########################################################
from keras.utils import to_categorical

for i in range(0,50000):
    yall[i]=yall[i]-1
    
yall=to_categorical(yall)
yall
#############################

array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [15]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

In [16]:
stop_words = set(stopwords.words('english'))
processed_reviews = []
single_review = "string to iniialize <br /> my email id is charilie@waoow.com. You can also reach to me at charlie's"

for review in range(0,50000):
    single_review = xall[review]
    
    #removing html tags:
    single_review = re.sub('<.*?>',' ',single_review)
    #removing special characters (punctuation) '@,!' e.t.c.
    single_review = re.sub('\W',' ',single_review)
    #removing single characters
    single_review = re.sub('\s+[a-zA-Z]\s+',' ', single_review)
    #substituting multiple spaces with single space
    single_review = re.sub('\s+',' ', single_review)
    
    #removing stop words
    word_tokens = word_tokenize(single_review)
    
    filtered_sentence = " ".join([w for w in word_tokens if w not in stop_words])
    #compile all the sentences to make a complete dictionary of processed reviews
    processed_reviews.append(filtered_sentence)
    
print(processed_reviews[0])

I went saw movie last night coaxed friends mine admit reluctant see knew Ashton Kutcher able comedy wrong Kutcher played character Jake Fischer well Kevin Costner played Ben Randall professionalism The sign good movie toy emotions This one exactly The entire theater sold overcome laughter first half movie moved tears second half While exiting theater saw many women tears many full grown men well trying desperately let anyone see crying This movie great suggest go see judge


In [17]:
from sklearn.model_selection import train_test_split
x_train_text, x_test_text, y_train, y_test = train_test_split(processed_reviews,yall,test_size=0.2,random_state=42)
print("Train-set size: ", len(x_train_text))
print("Test-set size:  ", len(x_test_text))

Train-set size:  40000
Test-set size:   10000


In [18]:
data_text = x_train_text + x_test_text
num_words = 10000
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(data_text)
#tokenizer.word_index
x_train_tokens = tokenizer.texts_to_sequences(x_train_text)
x_test_tokens = tokenizer.texts_to_sequences(x_test_text)
num_tokens = [len(tokens) for tokens in x_train_tokens + x_test_tokens]
num_tokens = np.array(num_tokens)

In [19]:
np.mean(num_tokens)

115.01494

In [20]:
np.max(num_tokens)

1136

In [21]:
max_tokens = np.mean(num_tokens) + 2 * np.std(num_tokens)
max_tokens = int(max_tokens)
max_tokens

282

In [22]:
pad = 'pre'
x_train_pad = pad_sequences(x_train_tokens, maxlen=max_tokens,
                            padding=pad, truncating=pad)

x_test_pad = pad_sequences(x_test_tokens, maxlen=max_tokens,
                           padding=pad, truncating=pad)

In [13]:
#from sklearn import preprocessing
#x_train_pad = preprocessing.scale(x_train_pad)
#x_test_pad = preprocessing.scale(x_test_pad)

In [23]:
x_train_pad.shape

(40000, 282)

In [24]:
x_test_pad.shape

(10000, 282)

In [25]:
idx = tokenizer.word_index
inverse_map = dict(zip(idx.values(), idx.keys()))

In [26]:
embedding_size = 8

In [27]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=num_words,
                    output_dim=embedding_size,
                    input_length=max_tokens,
                    name='layer_embedding'),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_size)),
    tf.keras.layers.Dense(embedding_size, activation='relu'),
    tf.keras.layers.Dense(6, activation='softmax')
])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
layer_embedding (Embedding)  (None, 282, 8)            80000     
_________________________________________________________________
bidirectional (Bidirectional (None, 16)                1088      
_________________________________________________________________
dense (Dense)                (None, 8)                 136       
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 54        
Total params: 81,278
Trainable params: 81,278
Non-trainable params: 0
_________________________________________________________________


In [38]:
model.add(Embedding(input_dim=num_words,
                    output_dim=embedding_size,
                    input_length=max_tokens,
                    name='layer_embedding'))
#model.add(LSTM(units=64, return_sequences=True))
#model.add(LSTM(units=32, return_sequences=True))
#model.add(LSTM(units=16, return_sequences=True))
#model.add(LSTM(units=8, return_sequences=True))
model.add(LSTM(units=10))
model.add(Dense(10, activation='softmax'))


In [28]:
optimizer = Adam(lr=0.01)

In [30]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])

In [31]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
layer_embedding (Embedding)  (None, 282, 8)            80000     
_________________________________________________________________
bidirectional (Bidirectional (None, 16)                1088      
_________________________________________________________________
dense (Dense)                (None, 8)                 136       
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 54        
Total params: 81,278
Trainable params: 81,278
Non-trainable params: 0
_________________________________________________________________


In [32]:
x_train_pad

array([[   0,    0,    0, ...,    1,    3, 4947],
       [   0,    0,    0, ..., 1259,  672,    3],
       [   0,    0,    0, ...,   98,    3,  521],
       ...,
       [   0,    0,    0, ...,   37,  511, 2829],
       [   0,    0,    0, ...,  471,  363,  177],
       [   0,    0,    0, ...,  600,   15, 1649]])

In [33]:
class_weight = {0: weights[0],
                1: weights[1],
                2: weights[2],
                3: weights[3],
                6: weights[4],
                7: weights[5],
                8: weights[6],
                9: weights[7]}

In [34]:
%%time
y_train=np.array(y_train)
model.fit(x_train_pad, y_train,
          class_weight=class_weight, validation_split=0.05, epochs=10, batch_size=8)

ValueError: A target array with shape (40000, 10) was passed for an output of shape (None, 6) while using as loss `categorical_crossentropy`. This loss expects targets to have the same shape as the output.

In [None]:
%%time
y_test=np.array(y_test)
result = model.evaluate(x_test_pad, y_test)

In [None]:
print("Accuracy: {0:.2%}".format(result[1]))

In [None]:
%%time
y_pred = model.predict(x=x_test_pad[0:2000])
#y_pred = y_pred.T[0]

In [None]:
#cls_pred = np.array([1 if p<0.1 else 2 if p<0.15 else 3 if p<0.25 else 4 if p<0.35 else 5 if p<0.45 else 6 if p<0.55 else 7 if p<0.65 else 8 if p<0.75 else 9 if p<0.85 else 10 for p in y_pred])

In [None]:
len(cls_pred)

In [None]:
cls_pred

In [None]:
cls_true = np.array(y_test[0:1000])

In [None]:
incorrect = np.where(y_pred != cls_true)
incorrect = incorrect[0]

In [None]:
len(incorrect)

In [None]:
idx = incorrect[0]
idx

The mis-classified text is:

In [None]:
text = x_test_text[idx]
text

In [None]:
##Saving and loading model
model.save("ModelBest")
#model = keras.models.load_model("path_to_my_model")

In [None]:
#exampleReview="I had high expectations for this movie. When I saw Shrek 2, I was pleasantly surprised. Few sequels are as good as the original, but Shrek 2 was. Shrek the third did not continue this trend. While there were sill plenty of funny moments, it was nowhere near as amusing or original as the first two. They could have done so much more with this movie if they had tried. At the end, I felt sort of cheated. However, Shek the third isnt a bad movie in itself, only when compared to the others. I would still recommend going to see it. Unlike most, I liked Arties character, even if he did talk too much (Im not even a Justin Timberlake fan)."
#10
#exampleReview="The Godfather: Part II is a very suspenseful drama with a very exciting story, with great acting and great special effects. I would definitely recommend you watch this movie...but first watch the original classic from 1972 The Godfather . The movie may not be as good as the first movie but is still an amazing sequel."
#7
exampleReview="If you like Karate, you will love this. I like the fight scenes, and the bad guy in the film is always a bad guy in Karate movies. ( ie Enter The Dragon) It is a bit sloppy in spots, but considering the budget you can not hit this film too hard. It is a good film to see if your mad or angry at someone."
#1
#exampleReview="If I thought the overlong, frightfully dull Avatar contained and rehashed every Hollywood cliché ever used, albeit in blue, Prince of Persia: Sands of Time runs the gamut of every mindless Hollywood action-feature effect, computerized or not, and even as an admittedly escapism event, it falls short of any intended mark. It is obvious that there is little new under the Hollywood sun. Frankly, 1942's Arabian Nights did this plot line much better and Shemp Howard, of Three Stooges fame, surpassed Alfred Molina's deadly dull and ineffective attempt at humor. Jake Gyllenhaal, in spite of much daring-do stunts and macho posturing, is still not the virile stud hero that his handlers are trying to turn him into, and Arterton's clichéd bitchy dialog becomes tedious after 10 minutes. It's a shallow waste of time from beginning to end and how Ben Kingsley could keep a straight face throughout this loud, noisy romp is a testament to his enormous experience in front of a camera."
#6
#exampleReview="Reading some of the harsh reviews I thought I would comment. The big disappointment for me in the movie was the lack of gore , fox did get it wrong to make it a pg13 even though they tried to substitute it with monster gore. All that being said this movie is a visual master-peace, the effects are top notch with very good fight scenes and creature effects with the best looking Alien Queen to hit the screens, The story is pretty cool and it gives more insight of the Predators, It takes a while for the introduction of the creatures but once the ball is rolling its none stop action. Not the horror feast we all wanted but fun."
#3
#exampleReview="In this final chapter of the ROCKY series, the Italian Stallion, now broke and brain damaged from his last fight, faces a promoter who wants him to do another bout, an amateur boxer whom he agrees to train, and his own son who starts to feel neglected. This time around, even Stallone gives a weak performance, and Tommy Morrison fails to make an impression as Rocky's Robot Tommy Gunn. Sly's real-life son Sage, who plays Rocky Jr,, and Burgess Meredith's brief appearance as Mickey are the only characters with any interest. The bottom spot of the movie is the climaxing streetfight, which barely makes the film worth a watch."

In [None]:
reviews = []
reviews.append(exampleReview)

In [None]:
test_tokens = tokenizer.texts_to_sequences(reviews)

In [None]:
test_pad = pad_sequences(test_tokens, maxlen=max_tokens,
                           padding=pad, truncating=pad)

In [None]:
test_pred = model.predict(test_pad)
#pred_rate = map_rate(test_pred[0][0])
#pred_rate
test_pred

In [None]:
y_train.shape

In [None]:
x_train_pad.shape

In [None]:
from tensorflow import keras
keras.utils.plot_model(model, "1lay, validation_split=0.05, epochs=1, batch_size=8, 40ktrain.png", show_shapes=True)

In [None]:
#keras.utils.plot_model(model, "my_first_model.png")

In [None]:
len(xall)

In [None]:
type(xall)

In [None]:
xall[1]

In [None]:
print(processed_reviews[1])

In [None]:
len(processed_reviews)

In [None]:
type(processed_reviews)

In [None]:
len(x_train_pad)

In [None]:
y_pred[0:50]

In [None]:
type(yall)

In [None]:
for i in range(0,50000):
    yall[i]=yall[i]-1

In [None]:
yall[49999]

In [None]:
y_pred[0]

In [None]:
x_train_pad

In [None]:
x_train_tokens