Dataset: IMDB Dataset of 50K Movie Reviews (Kaggle) — https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

In [1]:
!wget --no-check-certificate "https://drive.google.com/uc?export=download&id=1Ux85UMh6Cs2Qy30Q7VgpV4gNAW_7vfkk" -O kaggle.json > /dev/null 2>&1

In [2]:
!rm -rf ~/.kaggle
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [3]:
# Download the IMDB reviews dataset archive with the Kaggle CLI.
# The recorded output shows the zip file being saved to the working directory (/content).
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Downloading imdb-dataset-of-50k-movie-reviews.zip to /content
 74% 19.0M/25.7M [00:00<00:00, 89.3MB/s]
100% 25.7M/25.7M [00:00<00:00, 92.6MB/s]


In [4]:
!unzip "imdb-dataset-of-50k-movie-reviews.zip"
!rm -rf "imdb-dataset-of-50k-movie-reviews.zip"

Archive:  imdb-dataset-of-50k-movie-reviews.zip
  inflating: IMDB Dataset.csv        


In [5]:
# All imports first, then the one-time NLTK setup.
import nltk
import numpy as np
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Fetch the VADER lexicon before building the analyzer that is used for scoring below.
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [6]:
# Load the extracted reviews CSV and preview the first five rows.
imdb_df = pd.read_csv(filepath_or_buffer='IMDB Dataset.csv')
imdb_df.head(n=5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
# DataFrame.info() prints its summary itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
imdb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB
None


In [9]:
# Shuffle all rows. A fixed seed keeps the row order (and therefore every preview
# shown further down) reproducible across kernel restarts.
imdb_df = imdb_df.sample(frac=1.0, random_state=42).reset_index(drop=True)

In [11]:
# Score every review with VADER and store the predicted label in a new 'Prediction'
# column: 1 when the compound polarity score is >= 0, otherwise -1.
imdb_df['Prediction'] = [
    1 if sia.polarity_scores(review_text)['compound'] >= 0 else -1
    for review_text in imdb_df['review']
]
imdb_df.head()

Unnamed: 0,review,sentiment,Prediction
0,I've watched a lot of TV through the years. So...,negative,1
1,I have nothing against a fast-paced fright-fli...,negative,1
2,The phenomenon Helge Schneider defies easy des...,positive,1
3,Although The Notorious Bettie Page is well act...,positive,1
4,*** Possable spoiler but probably not ***<br /...,negative,-1


In [12]:
# Encode the ground-truth labels in the sentiment column: 'positive' -> 1, 'negative' -> -1.
# Already-encoded values map to themselves, so re-running this cell is a no-op; the previous
# `-1 if x == 'negative' else 1` turned every label into 1 on a second run.
SENTIMENT_TO_LABEL = {'positive': 1, 'negative': -1, 1: 1, -1: -1}
encoded_sentiment = imdb_df['sentiment'].map(SENTIMENT_TO_LABEL)
# Values missing from the mapping come back as NaN; fail loudly instead of silently
# counting them as positive.
assert encoded_sentiment.notna().all(), 'unexpected value in the sentiment column'
imdb_df['sentiment'] = encoded_sentiment.astype(int)
imdb_df.head()

Unnamed: 0,review,sentiment,Prediction
0,I've watched a lot of TV through the years. So...,-1,1
1,I have nothing against a fast-paced fright-fli...,-1,1
2,The phenomenon Helge Schneider defies easy des...,1,1
3,Although The Notorious Bettie Page is well act...,1,1
4,*** Possable spoiler but probably not ***<br /...,-1,-1


In [15]:
# Create an Accuracy column: 1 when the true label (sentiment) equals the predicted
# label (Prediction), otherwise 0.
# The columns are compared by name in one vectorised step; the previous row-wise
# x[1] == x[2] depended on column order and on positional indexing of a labelled
# row, which pandas has deprecated.
imdb_df['Accuracy'] = (imdb_df['sentiment'] == imdb_df['Prediction']).astype(int)
imdb_df.head()

Unnamed: 0,review,sentiment,Prediction,Accuracy
0,I've watched a lot of TV through the years. So...,-1,1,0
1,I have nothing against a fast-paced fright-fli...,-1,1,0
2,The phenomenon Helge Schneider defies easy des...,1,1,1
3,Although The Notorious Bettie Page is well act...,1,1,1
4,*** Possable spoiler but probably not ***<br /...,-1,-1,1


In [17]:
def conf_matrix(x):
  """Classify one row as a confusion-matrix cell.

  Args:
    x: a DataFrame row (or any mapping) with a 'sentiment' entry holding the true
      label and a 'Prediction' entry holding the predicted label, each encoded as
      1 (positive) or -1 (negative).

  Returns:
    'TP', 'FN', 'FP' or 'TN'; the integer 0 when either label is neither 1 nor -1.
  """
  # Look the values up by column name: x[1] / x[2] depended on column order and on
  # positional indexing of a labelled row, which pandas has deprecated.
  actual, predicted = x['sentiment'], x['Prediction']
  if actual == 1:
    if predicted == 1:
      return 'TP'
    if predicted == -1:
      return 'FN'
  elif actual == -1:
    if predicted == 1:
      return 'FP'
    if predicted == -1:
      return 'TN'
  # Unrecognised label combination.
  return 0

# Tag every row with its confusion-matrix cell; axis=1 hands conf_matrix one row at a time.
imdb_df['Conf_Matrix'] = imdb_df.apply(conf_matrix, axis=1)
imdb_df.head()

Unnamed: 0,review,sentiment,Prediction,Accuracy,Conf_Matrix
0,I've watched a lot of TV through the years. So...,-1,1,0,FP
1,I have nothing against a fast-paced fright-fli...,-1,1,0,FP
2,The phenomenon Helge Schneider defies easy des...,1,1,1,TP
3,Although The Notorious Bettie Page is well act...,1,1,1,TP
4,*** Possable spoiler but probably not ***<br /...,-1,-1,1,TN


In [20]:
# Count how many rows fall into each confusion-matrix cell.
conf_vals = imdb_df.Conf_Matrix.value_counts().to_dict()
print(conf_vals)

# value_counts() leaves out cells that have no rows, so default those to 0 instead of
# failing with a KeyError.
true_pos, true_neg = conf_vals.get('TP', 0), conf_vals.get('TN', 0)
false_pos, false_neg = conf_vals.get('FP', 0), conf_vals.get('FN', 0)


def _ratio(numerator, denominator):
  """Return numerator / denominator, or 0.0 when the denominator is zero."""
  return numerator / denominator if denominator else 0.0


accuracy = _ratio(true_pos + true_neg, true_pos + true_neg + false_pos + false_neg)
precision = _ratio(true_pos, true_pos + false_pos)
recall = _ratio(true_pos, true_pos + false_neg)
# F1 is the harmonic mean of precision and recall.
f1_score = _ratio(2*precision*recall, precision + recall)
print('Accuracy: ', round(100 * accuracy, 2),'%',
      '\nPrecision: ', round(100 * precision, 2),'%',
      '\nRecall: ', round(100 * recall, 2),'%',
      '\nF1 Score: ', round(100 * f1_score, 2),'%')

{'TP': 21403, 'TN': 13410, 'FP': 11590, 'FN': 3597}
Accuracy:  69.63 % 
Precision:  64.87 % 
Recall:  85.61 % 
F1 Score:  73.81 %
