## Sentiment Analysis on IMDB DATASET
Steps
1. Load the dataset
2. Clean Dataset
3. Encode sentiments
4. Split dataset
5. Tokenize and Pad/Trancate Reviews
6. Build the architecture/Model
7. Train and test

In [2]:
## install anvil
## anvil is a platform that allows users to build web apps using python
!pip install anvil-uplink

Collecting anvil-uplink
  Downloading anvil_uplink-0.4.2-py2.py3-none-any.whl (90 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/90.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.1/90.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting argparse (from anvil-uplink)
  Downloading argparse-1.4.0-py2.py3-none-any.whl (23 kB)
Collecting ws4py (from anvil-uplink)
  Downloading ws4py-0.5.1.tar.gz (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.4/51.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: ws4py
  Building wheel for ws4py (setup.py) ... [?25l[?25hdone
  Created wheel for ws4py: filename=ws4py-0.5.1-py3-none-any.whl size=45228 sha256=bb9baeb58ab832b62

In [3]:
import anvil.server

In [4]:
anvil.server.connect('server_SKD63ST5DDDR7VEHAZMV3ICS-T3JFQWAZ6FKEY6WB')

Connecting to wss://anvil.works/uplink
Anvil websocket open
Connected to "Default Environment" as SERVER


In [5]:
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud

In [6]:
data = pd.read_csv('IMDB Dataset.csv')

In [7]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [8]:
english_stops = set(stopwords.words('english'))

In [9]:

def load_dataset():
    df = pd.read_csv('IMDB Dataset.csv')
    x_data = df['review']
    y_data = df['sentiment']

    # pre-process review
    x_data = x_data.replace({'<.*?>': ''}, regex=True) # remove HTML tags
    x_data = x_data.replace({'[^A-za-z]': ' '}, regex=True) # removes non alphabets
    x_data = x_data.apply(lambda review : [w for w in review.split() if w not in english_stops]) # remove stop words
    #x_data = x_data.apply(lambda review : [w.lower() for w in review])

    # encode sentiment
    y_data = y_data.replace('positive', 1)
    y_data = y_data.replace('negative', 0)

    return x_data, y_data
x_data, y_data = load_dataset()

print('Reviews')
print(x_data, '\n')
print('Sentiment')
print(y_data, '\n')


Reviews
0        [One, reviewers, mentioned, watching, Oz, epis...
1        [A, wonderful, little, production, The, filmin...
2        [I, thought, wonderful, way, spend, time, hot,...
3        [Basically, family, little, boy, Jake, thinks,...
4        [Petter, Mattei, Love, Time, Money, visually, ...
                               ...                        
49995    [I, thought, movie, right, good, job, It, crea...
49996    [Bad, plot, bad, dialogue, bad, acting, idioti...
49997    [I, Catholic, taught, parochial, elementary, s...
49998    [I, going, disagree, previous, comment, side, ...
49999    [No, one, expects, Star, Trek, movies, high, a...
Name: review, Length: 50000, dtype: object 

Sentiment
0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 50000, dtype: int64 



In [10]:
# Split datasets

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.2)

print('Train Set')
print(x_train, '\n')
print(x_test, '\n')
print('Test Set')
print(y_train, '\n')
print(y_test, '\n')

Train Set
28729    [Valley, Girl, always, hold, special, place, h...
29579    [This, Paul, F, Ryan, first, full, length, fea...
30349    [An, unusual, film, Ringo, Lam, one, strangely...
43283    [One, sensible, comedies, hit, Hindi, film, sc...
28823    [I, watched, film, based, favorable, reviews, ...
                               ...                        
22934    [Although, first, Hunter, S, Thompson, documen...
35935    [I, huge, fan, Emily, Watson, Breaking, The, W...
21694    [This, terrible, film, Angie, Dickenson, class...
45566    [One, beautiful, movies, ever, made, ex, Yu, S...
45857    [While, movie, titles, contains, word, Mother,...
Name: review, Length: 40000, dtype: object 

48223    [The, plot, something, white, hunters, capture...
5121     [I, guy, hesitant, watch, movie, know, Richard...
43392    [The, original, title, means, The, Birth, Octo...
6097     [I, say, Seventeen, Missing, much, better, I, ...
36643    [I, deeply, moved, movie, many, respects, Firs...
 

In [11]:
# getting maximum review length

def get_max_length():
    review_length = []
    for review in x_train:
        review_length.append(len(review))
    return int(np.ceil(np.mean(review_length)))

In [12]:
# Tokenize and pad/truncate reviews
# a neural network using accepts numerical data, so we need to encode the reviews
# use tensorflow tokenizer to encode reviews to integers where each
# word unique word is automically indexed based on x_train

# use pad_sequence to truncate all reviews to the same length

token = Tokenizer() # data already lowered
token.fit_on_texts(x_train)
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

max_length = get_max_length()

x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

total_words = len(token.word_index) + 1

print('Encoded X Train\n', x_train, '\n')
print('Encoded X Test\n', x_test, '\n')
print('Maximum review length: ', max_length)




Encoded X Train
 [[3499  152  122 ...  157   33    5]
 [   8  701 1042 ...  156  591 4935]
 [ 697 1713    4 ...    0    0    0]
 ...
 [   8  287    4 ...    0    0    0]
 [   5  214   28 ...    0    0    0]
 [ 376    3 2777 ... 4504 6374  207]] 

Encoded X Test
 [[    2    42    61 ...     0     0     0]
 [    1   119 11816 ...     0     0     0]
 [    2   123   324 ...   744     3   244]
 ...
 [ 1063   224    37 ...     0     0     0]
 [  170     3   514 ...   450   446   627]
 [    2     3  2032 ...     5 18127  1546]] 

Maximum review length:  130


In [13]:
# ARCHITECTURE
EMBED_DIM = 32
LSTM_OUT = 64

model = Sequential()
model.add(Embedding(total_words, EMBED_DIM, input_length = max_length))
model.add(LSTM(LSTM_OUT))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 130, 32)           2984480   
                                                                 
 lstm (LSTM)                 (None, 64)                24832     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 3009377 (11.48 MB)
Trainable params: 3009377 (11.48 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [14]:
checkpoint = ModelCheckpoint(
    'models/LSTM.h5',
    monitor='accuracy',
    save_best_only=True,
    verbose=1
)

model.fit(x_train, y_train, batch_size = 128, epochs = 5, callbacks=[checkpoint])

Epoch 1/5
Epoch 1: accuracy improved from -inf to 0.52747, saving model to models/LSTM.h5
Epoch 2/5


  saving_api.save_model(


Epoch 2: accuracy improved from 0.52747 to 0.72880, saving model to models/LSTM.h5
Epoch 3/5
Epoch 3: accuracy improved from 0.72880 to 0.79935, saving model to models/LSTM.h5
Epoch 4/5
Epoch 4: accuracy did not improve from 0.79935
Epoch 5/5
Epoch 5: accuracy improved from 0.79935 to 0.82563, saving model to models/LSTM.h5


<keras.src.callbacks.History at 0x7e6e1e8df820>

In [15]:
y_pred = model.predict_on_batch(x_test)

true = 0
for i, y in enumerate(y_test):
    if y == y_pred[i]:
        true += 1
print('Correct Prediction: {}'.format(true))
print('Wrong Prediction: {}'.format(len(y_pred) - true))
print('Accuracy: {}'.format(true/len(y_pred)*100))

test_loss, test_acc = model.evaluate(x_test, y_test)

Correct Prediction: 0
Wrong Prediction: 10000
Accuracy: 0.0


In [16]:
loaded_model = load_model('models/LSTM.h5')

In [17]:
## plot the training results

In [18]:
# Pre-process input
# adding everything in a single function makes
# it easily callable on the anvil server
# classify_review function takes a review (from the web page),
# runs inference on the review and returns a probabilistic value
# to determine whether the review was positive or negative.
@anvil.server.callable
def classify_review(review):
  regex = re.compile(r'[^a-zA-Z\s]')
  review = regex.sub('', review)
  print('Cleaned: ', review)

  words = review.split(' ')
  filtered = [w for w in words if w not in english_stops]
  filtered = ' '.join(filtered)
  filtered = [filtered.lower()]
  print('Filtered: ', filtered)

  tokenize_words = token.texts_to_sequences(filtered)
  tokenize_words = pad_sequences(tokenize_words, maxlen=max_length, padding='post', truncating='post')

  result = loaded_model.predict(tokenize_words)
  print(result)

  if result <= 0.7:
    return 'negative'
  else:
    return 'positive'

In [19]:
## print the keywords from the review
import anvil.mpl_util

@anvil.server.callable
def print_key_words(review):
  ## plot key words as a word cloud followed by the classification
  wordcloud = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(review)
  plt.figure()
  plt.imshow(wordcloud, interpolation="bilinear")
  plt.axis("off")
  ## plt.show()
  return anvil.mpl_util.plot_image()




In [None]:
anvil.server.wait_forever()

Cleaned:  This is a very good movie and I recommend 
Filtered:  ['this good movie i recommend ']
[[0.88820887]]
