In [1]:
# trafile 2020.10.1
import tarfile

#pandas 1.1.3
import pandas as pd

# requests 2.24.0
import re

# tensorflow 2.4
import tensorflow as tf
from tensorflow import keras

#nltk 3.5
import nltk
from nltk.corpus import stopwords

#matplotlip 3.3.2
import matplotlib.pyplot as plt
# numpy 1.19.5
import numpy as np
from numpy import array
from numpy import asarray
from numpy import zeros

# keras 2.4.3
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten, LSTM
from keras.layers import GlobalMaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import Tokenizer

#sklearn 0.23.2
from sklearn.model_selection import train_test_split


## read data

In [2]:
# READ THE CSV FILE
movie_reviews = pd.read_csv("./IMDB Dataset.csv")

movie_reviews.isnull().values.any()

movie_reviews.shape

(50000, 2)

In [3]:
#sample of the data
movie_reviews.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


## preproccessing data

In [4]:
#cleaning data 

TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    return TAG_RE.sub('', text)

In [5]:
def preprocess_text(sen):
    # Removing html tags
    sentence = remove_tags(sen)

    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    return sentence

In [6]:
X = []
sentences = list(movie_reviews['review'])
for sen in sentences:
    X.append(preprocess_text(sen))

In [7]:

y = movie_reviews['sentiment']

# when review is positive make it =1 , when it is negative make it =0
y = np.array(list(map(lambda x: 1 if x=="positive" else 0, y)))

# splitting data

In [8]:
#Splitting data train:70% validation:10% test: 20%
X_train, y_train = X[:35000],  y[:35000]
X_val,y_val= X[35000:40000], y[35000:40000] 
X_test , y_test =X[40000:50000], y[40000:50000]

### tokenizing  data

In [9]:
#extract tokens from the text the number of tokens is 5000
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

In [10]:
# convert the text to a numeric sequence 
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
X_val = tokenizer.texts_to_sequences(X_val)

In [11]:
# Adding 1 because of reserved 0 index
vocab_size = len(tokenizer.word_index) + 1

print('Found %s unique tokens.' % vocab_size)

Found 87377 unique tokens.


In [12]:
#max length of sequence
maxlen = 256

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_val = pad_sequences(X_val, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [13]:
lstm_model= tf.keras.models.load_model('model-lstm-sentiment-movie.h5')

In [14]:
print(lstm_model.summary())


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 256, 200)          17475400  
_________________________________________________________________
lstm (LSTM)                  (None, 256, 128)          168448    
_________________________________________________________________
dropout (Dropout)            (None, 256, 128)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 32)                20608     
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 17,664,489
Trainable params: 189,089
Non-trainable params: 17,475,400
______________________________________

### score  of model

In [15]:
score = lstm_model.evaluate(X_test, y_test, verbose=1)



#  testing starts here!

#### as shown in the out put of this code, the sequences of sentences are printed witht the propability of prediction
- if the propability is smaller than .5 that means it is near to negative
- if the propability is larger than .5 that means it is near to positive

In [16]:
### make class predictions with the model
y_pred = lstm_model.predict(X_test)
# summarize the first 10 cases
for i in range(10):
    print("X=%s, Predicted=%s" % (X_test[i], y_pred[i]))
    if y_pred[i] < 0.5:
        print('negative review')
    else:
        print('positive review')

X=[  83  118  182    4  131    9 3744   18    1  967 2229    2  250    1
   12 2347 1314    4   99    1  218    3   16    8   12  935  347    4
  209  979  414    6   10  209 1332   55  400  298    3   20  253   16
  948    9    9   28   46   58  210   51   13    8  109   10  484    2
    1   97   67   14    1   84  172    1  465  769    7    1   59   63
   52   45 2380   27   26   28   86   38   79   44  426    3   70  207
    2  112   19   60    9   27   26  266 1040  136   75   19  116   21
   10    1   84  174    4  685 3411  100    7    1   12  112   19   40
   96  284   14    8 1157  206  634   56  210    4    1 2164    3    8
  358 2347   12    2  785 2052 1646   26   23    4  131 1535    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    

#### matrix of values is the sentence when predicted is the  propability of positive or negative

#### you can test it yourself:

In [17]:
#you must run all the previuos cells
TEST_REVIEW = input("Enter your value: ")
#press enter to save the text into the variable


Enter your value: i liked the movie so much. i will watch it again. i will invite my friends to watch it.


In [18]:
preprocess_text(TEST_REVIEW)
remove_tags(TEST_REVIEW)
TEST_REVIEW = tokenizer.texts_to_sequences(TEST_REVIEW)
TEST_REVIEW = pad_sequences(TEST_REVIEW, padding='post', maxlen=maxlen)



In [19]:
y_pred = lstm_model.predict(TEST_REVIEW)
if y_pred[0] < 0.5:
    value ='negative review'
else:
    value = 'positive review'
print("Predicted=%s,  %s " % ( y_pred[0], value ))


Predicted=[0.69894475],  positive review 


#### or only by a given example

In [20]:
#positive review
TEST_REVIEW_positive = "I was \ on the edge of my seat the entire time. The acting was excellent, and the \scenery - my goodness. Watch this movie now!"

In [21]:
preprocess_text(TEST_REVIEW_positive)
remove_tags(TEST_REVIEW_positive)
TEST_REVIEW = tokenizer.texts_to_sequences(TEST_REVIEW_positive)
TEST_REVIEW = pad_sequences(TEST_REVIEW, padding='post', maxlen=maxlen)

y_pred = lstm_model.predict(TEST_REVIEW)
if y_pred[0] < 0.5:
    value ='negative review'
else:
    value = 'positive review'
print("Predicted=%s,  %s " % ( y_pred[0], value ))


Predicted=[0.69894475],  positive review 


In [22]:
#negative review
TEST_REVIEW_negative = "the acting was bad and not profisional I could'nt complete the movie and I slept in the middle of it I wont let my friends watch it because they will hate me"

In [23]:
preprocess_text(TEST_REVIEW_negative)
remove_tags(TEST_REVIEW_negative)
TEST_REVIEW = tokenizer.texts_to_sequences(TEST_REVIEW_negative)
TEST_REVIEW = pad_sequences(TEST_REVIEW, padding='post', maxlen=maxlen)

y_pred = lstm_model.predict(TEST_REVIEW)
if y_pred[0] < 0.5:
    value ='negative review'
else:
    value = 'positive review'
print("Predicted=%s,  %s " % ( y_pred[0], value ))


Predicted=[0.19478163],  negative review 


-------------------------------------------------------------------------------------

In [24]:
print("Nour Ammar y2013 140008")
print("Deep learning 2021 prof.Haluk Gumuskaya")

Nour Ammar y2013 140008
Deep learning 2021 prof.Haluk Gumuskaya
