In [1]:
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [2]:
import pandas as pd
data = pd.read_csv('/gdrive/My Drive/DecisionTree/DataMining509/IMDB Dataset.csv')
data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [3]:
#Task 1: Data preprocessing
import spacy

spacy.cli.download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [4]:
def normalize(review, lowercase, remove_stopwords):
  if lowercase:
    review = review.lower()
  doc = nlp(review)
  lemmatized =list()
  for token in doc:
    if not (remove_stopwords) or (remove_stopwords and not token.is_stop):
      lemmatized.append(token.lemma_)
  return " ".join(lemmatized)
data['review'] = data['review'].apply(normalize, lowercase =True, remove_stopwords =True)

In [10]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
label_encoder.fit(data['sentiment'])
data['sentiment'] = label_encoder.transform(data['sentiment'])

In [30]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(data['review'], data['sentiment'], test_size=0.2, random_state=10, stratify=data['sentiment'])


In [46]:
#Task 2: Lexicon-based sentiment analysis
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
nltk.downloader.download('vader_lexicon')
sentiment = SentimentIntensityAnalyzer()
v_predicted = []
for text in X_test: 
  sent= sentiment.polarity_scores(text)
  if sent['compound']>0.5: 
    v_predicted.append('positive')
  else:
    v_predicted.append('negative')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [47]:
from sklearn import metrics
v_performance = metrics.classification_report(Y_test,v_predicted, target_names= ['negative', 'positive'])
print(v_performance)

              precision    recall  f1-score   support

    negative       0.77      0.56      0.65      5108
    positive       0.64      0.82      0.72      4892

    accuracy                           0.69     10000
   macro avg       0.70      0.69      0.68     10000
weighted avg       0.70      0.69      0.68     10000



In [28]:
#Task 3: Naive Bayes model for sentiment analysis (Best model)
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(stop_words='english',ngram_range=(1,1), tokenizer = token.tokenize, max_features=1000)
X_train_vect = cv.fit_transform(X_train)
X_train_vect.shape



(40000, 1000)

In [49]:
df = pd.DataFrame(X_train_vect.toarray(), columns=cv.get_feature_names_out())
df.head()

Unnamed: 0,1,10,100,15,2,20,3,30,4,5,...,write,writer,writing,wrong,yeah,year,yes,york,young,zombie
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [48]:
X_test_vect= cv.transform(X_test)
X_test_vect.shape

(10000, 1000)

In [51]:
from sklearn.naive_bayes import MultinomialNB
MNB = MultinomialNB()
MNB.fit(X_train_vect, Y_train)

In [52]:
predicted = MNB.predict(X_test_vect)
performance = metrics.classification_report(Y_test,predicted, target_names= ['0', '1'])
print(performance)

              precision    recall  f1-score   support

           0       0.85      0.81      0.83      5108
           1       0.81      0.85      0.83      4892

    accuracy                           0.83     10000
   macro avg       0.83      0.83      0.83     10000
weighted avg       0.83      0.83      0.83     10000



In [53]:
#Second attempt, max_features=500
cv = CountVectorizer(stop_words='english',ngram_range=(1,1), tokenizer = token.tokenize, max_features=500)
X_train_vect = cv.fit_transform(X_train)
X_test_vect= cv.transform(X_test)
MNB = MultinomialNB()
MNB.fit(X_train_vect, Y_train)
predicted = MNB.predict(X_test_vect)
performance = metrics.classification_report(Y_test,predicted, target_names= ['0', '1'])
print(performance)



              precision    recall  f1-score   support

           0       0.83      0.79      0.81      5108
           1       0.79      0.83      0.81      4892

    accuracy                           0.81     10000
   macro avg       0.81      0.81      0.81     10000
weighted avg       0.81      0.81      0.81     10000



In [54]:
#Third attempt, max_features=500 using bigrams instead of unigrams
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(stop_words='english',ngram_range=(2,2), tokenizer = token.tokenize, max_features=500)
X_train_vect = cv.fit_transform(X_train)
X_test_vect= cv.transform(X_test)
MNB = MultinomialNB()
MNB.fit(X_train_vect, Y_train)
predicted = MNB.predict(X_test_vect)
performance = metrics.classification_report(Y_test,predicted, target_names= ['0', '1'])
print(performance)



              precision    recall  f1-score   support

           0       0.78      0.67      0.72      5108
           1       0.70      0.80      0.75      4892

    accuracy                           0.73     10000
   macro avg       0.74      0.73      0.73     10000
weighted avg       0.74      0.73      0.73     10000



In [55]:
#By using Countvectorizer with max_feature =1000 and unigrams, it performs the best f1-score with 0.83 than the other trials in naive_bayes algorithm.

In [65]:
#Task 4: SVM model for sentiment analysis (best model)
from sklearn import svm
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(stop_words='english',ngram_range=(1,1), tokenizer = token.tokenize, max_features=1000)
X_train_vect = cv.fit_transform(X_train)
X_test_vect= cv.transform(X_test)

clf = svm.SVC()
clf.fit(X_train_vect, Y_train)

predicted = clf.predict(X_test_vect)
performance = metrics.classification_report(Y_test,predicted, target_names= ['0', '1'])
print(performance)



              precision    recall  f1-score   support

           0       0.88      0.83      0.86      5108
           1       0.84      0.89      0.86      4892

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000



In [66]:
#Second attempt, with ngram_range=(1,1) & max_features=500
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(stop_words='english',ngram_range=(1,1), tokenizer = token.tokenize, max_features=500)
X_train_vect = cv.fit_transform(X_train)
X_test_vect= cv.transform(X_test)

clf = svm.SVC()
clf.fit(X_train_vect, Y_train)

predicted = clf.predict(X_test_vect)
performance = metrics.classification_report(Y_test,predicted, target_names= ['0', '1'])
print(performance)



              precision    recall  f1-score   support

           0       0.87      0.81      0.84      5108
           1       0.82      0.87      0.84      4892

    accuracy                           0.84     10000
   macro avg       0.84      0.84      0.84     10000
weighted avg       0.84      0.84      0.84     10000



In [67]:
#Third attempt , with ngram_range=(2,2) & max_features=1000
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(stop_words='english',ngram_range=(2,2), tokenizer = token.tokenize, max_features=1000)
X_train_vect = cv.fit_transform(X_train)
X_test_vect= cv.transform(X_test)
clf = svm.SVC()
clf.fit(X_train_vect, Y_train)

predicted = clf.predict(X_test_vect)
performance = metrics.classification_report(Y_test,predicted, target_names= ['0', '1'])
print(performance)



              precision    recall  f1-score   support

           0       0.82      0.66      0.73      5108
           1       0.70      0.85      0.77      4892

    accuracy                           0.75     10000
   macro avg       0.76      0.75      0.75     10000
weighted avg       0.76      0.75      0.75     10000



In [None]:
#By using Countvectorizer with max_feature =1000 and unigrams, it performs the best f1-score with 0.86 than the other trials in SVM algorithm.

In [33]:
#Task 5: Deep Learning Models for Sentiment Analysis 
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
embedding_dim = 100
max_length = 120
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>" # out of vocabulary

# tokenize sentences
tokenizer = Tokenizer(oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

# convert train dataset to sequence and pad sequences
train_sequences = tokenizer.texts_to_sequences(X_train)
train_padded = pad_sequences(train_sequences, truncating= trunc_type, padding=padding_type, maxlen=max_length)

# convert validation dataset to sequence and pad sequences
test_sequences = tokenizer.texts_to_sequences(X_test)
test_padded = pad_sequences(test_sequences, truncating= trunc_type, padding=padding_type, maxlen=max_length)


vocab_size = len(word_index)

In [7]:
from keras import backend as K

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [39]:
#best model
blstm = tf.keras.Sequential([
    tf.keras.layers.Embedding(len(word_index) + 1, embedding_dim, input_length=max_length, trainable=False),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')])
blstm.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy', f1_m, precision_m, recall_m])

blstm.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 120, 100)          8333200   
                                                                 
 bidirectional_1 (Bidirectio  (None, 128)              84480     
 nal)                                                            
                                                                 
 dense_2 (Dense)             (None, 24)                3096      
                                                                 
 dense_3 (Dense)             (None, 1)                 25        
                                                                 
Total params: 8,420,801
Trainable params: 87,601
Non-trainable params: 8,333,200
_________________________________________________________________


In [40]:
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, Attention, concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss', patience=2, mode='min', restore_best_weights=True)

num_epochs = 50
history = blstm.fit(train_padded, Y_train, 
                    epochs=num_epochs, verbose=1, callbacks=[early_stop],
                    validation_split=0.3)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50


In [41]:
results = blstm.evaluate(test_padded, Y_test, verbose=1)



In [42]:
#second attemp, tf.keras.layers.Dense(32, activation='relu'),
blstm = tf.keras.Sequential([
    tf.keras.layers.Embedding(len(word_index) + 1, embedding_dim, input_length=max_length, trainable=False),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')])
blstm.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy', f1_m, precision_m, recall_m])

blstm.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 120, 100)          8333200   
                                                                 
 bidirectional_2 (Bidirectio  (None, 128)              84480     
 nal)                                                            
                                                                 
 dense_4 (Dense)             (None, 32)                4128      
                                                                 
 dense_5 (Dense)             (None, 1)                 33        
                                                                 
Total params: 8,421,841
Trainable params: 88,641
Non-trainable params: 8,333,200
_________________________________________________________________


In [43]:
early_stop = EarlyStopping(monitor='val_loss', patience=2, mode='min', restore_best_weights=True)

num_epochs = 50
history = blstm.fit(train_padded, Y_train, 
                    epochs=num_epochs, verbose=1, callbacks=[early_stop],
                    validation_split=0.3)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50


In [44]:
results = blstm.evaluate(test_padded, Y_test, verbose=1)



In [45]:
#Third attempt, adding tf.keras.layers.Dense(24, activation='relu')
blstm = tf.keras.Sequential([
    tf.keras.layers.Embedding(len(word_index) + 1, embedding_dim, input_length=max_length, trainable=False),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')])
blstm.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy', f1_m, precision_m, recall_m])

blstm.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 120, 100)          8333200   
                                                                 
 bidirectional_3 (Bidirectio  (None, 128)              84480     
 nal)                                                            
                                                                 
 dense_6 (Dense)             (None, 32)                4128      
                                                                 
 dense_7 (Dense)             (None, 24)                792       
                                                                 
 dense_8 (Dense)             (None, 1)                 25        
                                                                 
Total params: 8,422,625
Trainable params: 89,425
Non-trainable params: 8,333,200
_______________________________________

In [None]:
early_stop = EarlyStopping(monitor='val_loss', patience=2, mode='min', restore_best_weights=True)

num_epochs = 50
history = blstm.fit(train_padded, Y_train, 
                    epochs=num_epochs, verbose=1, callbacks=[early_stop],
                    validation_split=0.3)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50

In [47]:
results = blstm.evaluate(test_padded, Y_test, verbose=1)



In [None]:
#Conclusion: Using DL methods doesn't mean it will lead to a higher accuracy. The traditional machine learning method with word embedding can generate a better outcome in this case.
#Also, lexicon-based Sentiment Analysis performs not very well, maybe because it cannot detect the implicit meaning word by word or sarcasm.