In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [3]:
#'../input/sentiment-analysis-on-movie-reviews/train.tsv.zip'
train_data = pd.read_csv("/content/drive/MyDrive/sentiment analysis on movie reivew/train.tsv", sep = '\t')
train_data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [4]:
#'../input/sentiment-analysis-on-movie-reviews/test.tsv.zip'
test_data = pd.read_csv("/content/drive/MyDrive/sentiment analysis on movie reivew/test.tsv",sep = '\t')
test_data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [5]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156060 entries, 0 to 156059
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   PhraseId    156060 non-null  int64 
 1   SentenceId  156060 non-null  int64 
 2   Phrase      156060 non-null  object
 3   Sentiment   156060 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 4.8+ MB


In [6]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66292 entries, 0 to 66291
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   PhraseId    66292 non-null  int64 
 1   SentenceId  66292 non-null  int64 
 2   Phrase      66292 non-null  object
dtypes: int64(2), object(1)
memory usage: 1.5+ MB


In [7]:
import matplotlib.pyplot as plt
import tensorflow as tf

In [8]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [9]:
print("TF version: ", tf.__version__)
if tf.__version__ < "2.0.0":
    tf.enable_eager_execution()
    print("Eager execution enabled.")
else:
    print("Eager execution enabled by default.")

if tf.test.gpu_device_name(): 
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))
else:
   print("Please install GPU version of TF")

TF version:  2.5.0
Eager execution enabled by default.
Default GPU Device: /device:GPU:0


In [10]:
print(train_data.shape)
print(test_data.shape)

(156060, 4)
(66292, 3)


In [154]:
train_data.columns

Index(['PhraseId', 'SentenceId', 'Phrase', 'Sentiment'], dtype='object')

In [12]:
test_data.columns

Index(['PhraseId', 'SentenceId', 'Phrase'], dtype='object')

In [None]:
train_data['Phrase']

For each text movie review, the model has to predict a label for the sentiment. We evaluate the outputs of the model on classification accuracy. The sentiment labels are:

0 → Negative

1 → Somewhat negative

2 → Neutral

3 → Somewhat positive

4 → Positive

In [155]:
print(train_data['Sentiment'].unique())
train_data['Sentiment'].nunique()

[1 2 3 4 0]


5

In [16]:
train_data['Sentiment'].value_counts()

2    79582
3    32927
1    27273
4     9206
0     7072
Name: Sentiment, dtype: int64

In [17]:
from tqdm import tqdm

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import re

In [21]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [25]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [18]:
train_data['Phrase']

0         A series of escapades demonstrating the adage ...
1         A series of escapades demonstrating the adage ...
2                                                  A series
3                                                         A
4                                                    series
                                ...                        
156055                                            Hearst 's
156056                            forced avuncular chortles
156057                                   avuncular chortles
156058                                            avuncular
156059                                             chortles
Name: Phrase, Length: 156060, dtype: object

In [23]:
def sentence_cleaning(df):
    sentence = []
    for sent in tqdm(df['Phrase']):
        
        text = re.sub("[^a-zA-Z]"," ",sent)
        
        word = word_tokenize(text.lower())
        
        lemmatizer = WordNetLemmatizer()
        
        lemm_word = [lemmatizer.lemmatize(i) for i in word]
        
        sentence.append(lemm_word)
    return (sentence)

In [26]:
train_sent = sentence_cleaning(train_data)
test_sent = sentence_cleaning(test_data)

print(len(train_sent))
print(len(test_sent))

100%|██████████| 156060/156060 [00:21<00:00, 7349.46it/s]
100%|██████████| 66292/66292 [00:07<00:00, 8517.98it/s]

156060
66292





In [None]:
test_sent

In [30]:
from tensorflow.keras.utils import to_categorical


In [33]:
target_col = train_data.Sentiment.values
y_target = to_categorical(target_col)
y_target.shape

(156060, 5)

In [34]:
from sklearn.model_selection import train_test_split
X_train,X_val,y_train,y_val = train_test_split(train_sent,y_target,test_size = 0.2,stratify = y_target)

In [35]:
unique_words = set()
max_len = 0
for sent in tqdm(X_train):
    unique_words.update(sent)
    if(max_len < len(sent)):
        max_len = len(sent)
        sentence = sent

100%|██████████| 124848/124848 [00:00<00:00, 640598.83it/s]


In [156]:
len(list(unique_words))

13731

In [37]:
vocabulary = len(list(unique_words))
oov = '<OOV>'
embedding_dim = 300
padding = 'post'
trunc = 'post'

In [38]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [39]:

tokenizer = Tokenizer(num_words = vocabulary,oov_token = oov,char_level = False)
tokenizer.fit_on_texts(list(X_train))
X_train = tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(X_train,maxlen = max_len,padding=padding,truncating = trunc)
X_val = tokenizer.texts_to_sequences(X_val)
X_val = pad_sequences(X_val,maxlen = max_len,padding=padding,truncating = trunc)
X_test = tokenizer.texts_to_sequences(test_sent)
X_test = pad_sequences(X_test,maxlen = max_len,padding=padding,truncating = trunc)

In [40]:
print(X_train.shape,X_test.shape,X_val.shape)

(124848, 48) (66292, 48) (31212, 48)


In [41]:
from keras.models import Sequential
from keras.layers import Dense,Bidirectional,Activation,Dropout,LSTM,Embedding
from keras.layers.embeddings import Embedding

In [43]:
model = tf.keras.Sequential()
model.add(Embedding(vocabulary,embedding_dim,input_length = max_len))
model.add(Bidirectional(LSTM(128, dropout = 0.8, recurrent_dropout=0.8, return_sequences=True)))
model.add(Bidirectional(LSTM(128,dropout = 0.5,recurrent_dropout=0.5,return_sequences=False)))
model.add(Dense(64,activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(5,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])





In [44]:
model.compile(loss = 'categorical_crossentropy',
             optimizer = 'adam',
             metrics = ['accuracy'])

In [45]:
model.fit(X_train,y_train,validation_data = (X_val, y_val),epochs = 4,batch_size = 256,verbose = 1)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<tensorflow.python.keras.callbacks.History at 0x7f9efafd97d0>

In [46]:
model.evaluate(X_val, y_val)



[0.812545120716095, 0.663943350315094]

In [72]:
keras.models.save_model(model,"/content/drive/MyDrive/sentiment analysis on movie reivew/sentiment.tf")



INFO:tensorflow:Assets written to: /content/drive/MyDrive/sentiment analysis on movie reivew/sentiment.tf/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/sentiment analysis on movie reivew/sentiment.tf/assets


In [114]:
model.save('/content/drive/MyDrive/sentiment analysis on movie reivew/rnn_senti')



INFO:tensorflow:Assets written to: /content/drive/MyDrive/sentiment analysis on movie reivew/rnn_senti_weights/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/sentiment analysis on movie reivew/rnn_senti_weights/assets


In [51]:
y_pred = np.argmax(model.predict(X_test), axis = -1)
#submission_df = pd.DataFrame({'PhraseId': test_data.PhraseId, 'Sentiment': y_pred})

In [52]:
y_pred

array([2, 2, 2, ..., 1, 1, 1])

In [75]:
from tensorflow.keras.models import load_model


In [82]:
m=load_model("/content/my_model")

For each text movie review, the model has to predict a label for the sentiment. We evaluate the outputs of the model on classification accuracy. The sentiment labels are:

0 → Negative

1 → Somewhat negative

2 → Neutral

3 → Somewhat positive

4 → Positive

In [92]:
def check(sent):
  sentence=[]
  text = re.sub("[^a-zA-Z]"," ",sent)
        
  word = word_tokenize(text.lower())
  
  lemmatizer = WordNetLemmatizer()
  
  lemm_word = [lemmatizer.lemmatize(i) for i in word]
  
  sentence.append(lemm_word)
  return (sentence)

In [103]:
t=check("worst movie")

In [115]:
l=["Great movie, I love it", "Bad movie, total waste of money"]

In [139]:
o=tokenizer.texts_to_sequences(l)

In [132]:
oo = pad_sequences(o,maxlen = max_len,padding=padding,truncating = trunc)

In [133]:
p=m.predict(oo)

In [152]:
p

array([[3.3911208e-06, 7.7382610e-05, 9.5555093e-03, 3.6676446e-01,
        6.2359923e-01],
       [7.5057054e-01, 2.3065925e-01, 1.8729445e-02, 3.9151852e-05,
        1.5903447e-06]], dtype=float32)

In [157]:
np.argmax(p,axis=-1)

array([4, 0])

In [109]:
np.argmax(p,axis=-1)

array([2, 2, 2, 2, 2, 2, 2, 2, 2])