In [1]:
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re
import nltk
from nltk.corpus import stopwords

## read data

In [2]:
# READ THE CSV FILE
movie_reviews = pd.read_csv("./IMDB Dataset.csv")

# Fail fast on missing values. Only the last expression of a cell is displayed,
# so a bare `movie_reviews.isnull().values.any()` in the middle of the cell is
# evaluated and then silently discarded without anyone seeing the answer.
assert not movie_reviews.isnull().values.any(), "IMDB dataset contains missing values"

# Last expression of the cell: shown as (rows, columns).
movie_reviews.shape

(50000, 2)

## preprocessing data

In [3]:
#cleaning data 

# Matches one tag-like span: '<', then one or more characters that are not '>', then '>'.
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return ``text`` with every span matched by ``TAG_RE`` deleted.

    Nothing is inserted in place of a removed span, so the text on either side
    of a tag ends up adjacent.
    """
    without_tags = re.sub(TAG_RE, '', text)
    return without_tags

In [4]:
def preprocess_text(sen):
    """Clean one raw review string and return the cleaned text.

    Steps, applied in order:
      1. strip tag-like spans via ``remove_tags``;
      2. replace every character outside ``a-z`` / ``A-Z`` with a space
         (this removes punctuation and digits);
      3. replace a single letter surrounded by whitespace with one space;
      4. collapse every run of whitespace into one space.

    The result is not stripped, so it may start or end with a single space.
    """
    cleaned = remove_tags(sen)

    # (pattern, replacement) pairs; the order matters because each pass works
    # on the output of the previous one.
    substitutions = (
        ('[^a-zA-Z]', ' '),
        # This pattern consumes the whitespace on both sides of the letter, so
        # in a run of back-to-back single letters the letter right after a
        # match has no leading whitespace left to match on and is kept.
        (r"\s+[a-zA-Z]\s+", ' '),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned

In [5]:
# Clean every review; X holds the cleaned strings in the same order as the rows.
sentences = movie_reviews['review'].tolist()
X = [preprocess_text(review) for review in sentences]

In [6]:

# Encode the sentiment column as integers: "positive" -> 1, any other value -> 0.
y = np.array([1 if label == "positive" else 0 for label in movie_reviews['sentiment']])

# splitting data

In [7]:
# Sequential split - train: 70%, validation: 10%, test: 20%.
# The boundaries are derived from the dataset size with integer arithmetic, so
# the stated percentages hold for any number of rows. For 50,000 rows this
# gives 35,000 / 5,000 / 10,000.
# NOTE(review): the rows are taken in file order without shuffling - confirm
# the CSV is not sorted by sentiment.
n_samples = len(X)
assert n_samples == len(y), "reviews and labels must have the same length"

train_end = n_samples * 70 // 100
val_end = n_samples * 80 // 100

X_train, y_train = X[:train_end], y[:train_end]
X_val, y_val = X[train_end:val_end], y[train_end:val_end]
X_test, y_test = X[val_end:], y[val_end:]

### tokenizing  data

In [8]:
# Build the word index from the training reviews only.
# num_words is set to 5000; see the Keras Tokenizer documentation for how this
# limits which words are used when texts are converted to sequences.
vocabulary_cap = 5000
tokenizer = Tokenizer(num_words=vocabulary_cap)
tokenizer.fit_on_texts(X_train)

In [9]:
# Convert each split from text to sequences of integer token ids.
# NOTE: the split variables are overwritten with their encoded form, so this
# cell is meant to be run once per kernel session, after the split cell.
X_train, X_test, X_val = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test, X_val)
)

In [10]:
# Index 0 is reserved, so the vocabulary size is the number of indexed words plus one.
# Note that the printed figure is vocab_size, i.e. it includes that extra slot.
indexed_word_count = len(tokenizer.word_index)
vocab_size = indexed_word_count + 1

print('Found %s unique tokens.' % vocab_size)

Found 87377 unique tokens.


In [11]:
# Every sequence is brought to exactly `maxlen` token ids; padding='post' puts
# the padding values at the end of shorter sequences.
maxlen = 256

X_train, X_val, X_test = (
    pad_sequences(sequences, maxlen=maxlen, padding='post')
    for sequences in (X_train, X_val, X_test)
)

In [12]:
# Load the saved model from the HDF5 file 'model-lstm-sentiment-movie.h5'
# (path is relative to the notebook's working directory).
lstm_model= tf.keras.models.load_model('model-lstm-sentiment-movie.h5')

In [13]:
# Model.summary() prints the layer table itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
lstm_model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 256, 200)          17475400  
_________________________________________________________________
lstm (LSTM)                  (None, 256, 128)          168448    
_________________________________________________________________
dropout (Dropout)            (None, 256, 128)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 32)                20608     
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 17,664,489
Trainable params: 189,089
Non-trainable params: 17,475,400
______________________________________

The sentence is passed to the model as a matrix of values; the prediction is the probability of the sentiment being positive or negative.

In [15]:
# Load the TensorBoard notebook extension (provides the %tensorboard magic used further down).
%load_ext tensorboard

In [22]:
import datetime, os

# Root directory for TensorBoard logs. Each execution of this cell selects a
# fresh timestamped sub-directory, so event files from repeated training runs
# are kept apart instead of accumulating in one folder.
LOG_ROOT = 'log3'

# A forward slash is used rather than os.path.join so that the value contains
# no backslashes when it is interpolated into the `%tensorboard --logdir ...`
# magic line.
log_folder = LOG_ROOT + '/' + datetime.datetime.now().strftime('%Y%m%d-%H%M%S')

In [23]:
from tensorflow.keras.callbacks import TensorBoard


In [24]:
# TensorBoard logging callback writing to `log_folder`; all options below are
# passed through unchanged to the Keras TensorBoard callback.
tensorboard_callback = TensorBoard(
    log_dir=log_folder,
    histogram_freq=1,
    write_graph=True,
    write_images=True,
    update_freq='epoch',
    profile_batch=2,
    embeddings_freq=1,
)

# Keras expects a list of callbacks, even when there is only one.
callbacks = [tensorboard_callback]

In [26]:
# Train the loaded model for one epoch on the training split, evaluating on the
# validation split and logging through the callbacks defined above.
lstm_model.fit(
    x=X_train,
    y=y_train,
    epochs=1,
    validation_data=(X_val, y_val),
    callbacks=callbacks,
)



<tensorflow.python.keras.callbacks.History at 0x26b2f88cdf0>

In [27]:
# Launch TensorBoard inside the notebook, pointing it at the directory stored in log_folder.
%tensorboard --logdir {log_folder}
# Alternatively, run the following command from an Anaconda prompt and open localhost port 6006:
# tensorboard --logdir "c:/your path to logs file of the project"


-----------------------------------------------------------------------------------------------------------------------------

In [28]:
# Author and course signature, printed as two lines.
print(
    "Nour Ammar y2013 140008 ",
    "Deep Learning Prof.Haluk Gumuskaya 2021",
    sep="\n",
)

Nour Ammar y2013 140008 
Deep Learning Prof.Haluk Gumuskaya 2021
