#  <center> **Sentiment Analysis Project** </center>

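The 2,000-record IMDb movie review dataset is accessible through NLTK directly with `from nltk.corpus import movie_reviews`. In this notebook the reviews are instead loaded from a local tab-separated file.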

### Load the Data

In [11]:
import numpy as np
import pandas as pd
import nltk 
from nltk.corpus import movie_reviews

# Read the tab-separated review file and discard rows with missing values
# in a single chained expression, then preview the first rows.
reviews_path = '../TextFiles/moviereviews.tsv'
df = pd.read_csv(reviews_path, sep='\t').dropna()
df.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


### Identifying rows with empty (whitespace-only) reviews

In [12]:
# Collect the index labels of reviews that contain no visible text.
#
# Only the 'review' column is iterated (instead of unpacking whole rows),
# so this cell still works when it is re-run after later cells have added
# extra columns to df.
# `not text.strip()` is true for whitespace-only strings and also for a
# zero-length string, which `str.isspace()` alone would not flag.
blanks = [
    idx
    for idx, text in df['review'].items()
    if isinstance(text, str) and not text.strip()
]

blanks

[57,
 71,
 147,
 151,
 283,
 307,
 313,
 323,
 343,
 351,
 427,
 501,
 633,
 675,
 815,
 851,
 977,
 1079,
 1299,
 1455,
 1493,
 1525,
 1531,
 1763,
 1851,
 1905,
 1993]

### Dropping the identified rows with empty reviews

In [13]:
# Remove the rows whose index labels were collected in `blanks`.
# errors='ignore' skips labels that are no longer present, so running this
# cell a second time does not raise KeyError.
df = df.drop(index=blanks, errors='ignore')

### Displaying the counts of each sentiment label ('pos' or 'neg')


In [14]:
# Number of reviews per sentiment label.
label_counts = df['label'].value_counts()
label_counts

label
neg    969
pos    969
Name: count, dtype: int64

### Initializing the SentimentIntensityAnalyzer


In [16]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# SentimentIntensityAnalyzer loads the 'vader_lexicon' NLTK resource and
# raises LookupError when it is not installed. Fetch it only when it is
# missing so the notebook also runs on a fresh environment.
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon')

sid = SentimentIntensityAnalyzer()

### Applying VADER sentiment analysis to each review and storing the results in new columns


In [17]:
# Run VADER on every review; each cell of 'scores' holds the dictionary
# returned by polarity_scores for that review.
df['scores'] = df['review'].apply(sid.polarity_scores)

### Extracting the compound score from the 'scores' dictionary and storing it in a new column


In [28]:
# Pull the 'compound' entry out of each score dictionary into its own column.
df['compound'] = df['scores'].map(lambda polarity: polarity['compound'])

### Classifying each review as 'pos' or 'neg' based on the compound score


In [30]:
# Label a review 'pos' when its compound score is at least 0, otherwise 'neg'.
is_non_negative = df['compound'] >= 0
df['comp_score'] = is_non_negative.map({True: 'pos', False: 'neg'})
df.head(10)

Unnamed: 0,label,review,scores,compound,comp_score
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co...",-0.9125,neg
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com...",-0.8618,neg
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.068, 'neu': 0.781, 'pos': 0.15, 'com...",0.9951,pos
3,pos,according to hollywood movies made in last few...,"{'neg': 0.071, 'neu': 0.782, 'pos': 0.147, 'co...",0.9972,pos
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.091, 'neu': 0.817, 'pos': 0.093, 'co...",-0.2484,neg
5,neg,"to put it bluntly , ed wood would have been pr...","{'neg': 0.123, 'neu': 0.821, 'pos': 0.056, 'co...",-0.9855,neg
6,neg,"synopsis : melissa , a mentally-disturbed woma...","{'neg': 0.087, 'neu': 0.742, 'pos': 0.17, 'com...",0.9871,pos
7,neg,tim robbins and martin lawernce team up in thi...,"{'neg': 0.118, 'neu': 0.709, 'pos': 0.172, 'co...",0.9829,pos
8,neg,"in "" gia "" , angelina jolie plays the titular ...","{'neg': 0.082, 'neu': 0.862, 'pos': 0.056, 'co...",-0.8278,neg
9,neg,"in 1990 , the surprise success an unheralded l...","{'neg': 0.145, 'neu': 0.728, 'pos': 0.127, 'co...",-0.9147,neg


### Evaluating the sentiment analysis results and printing the accuracy score, classification report, and confusion matrix


In [27]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Compare the provided labels with the VADER-derived labels.
y_true, y_pred = df['label'], df['comp_score']

print('accuracy_score: ', accuracy_score(y_true, y_pred))
print('\nclassification_report:\n', classification_report(y_true, y_pred))
print('\nconfusion_matrix:\n', confusion_matrix(y_true, y_pred))

accuracy_score:  0.6357069143446853

classification_report:
               precision    recall  f1-score   support

         neg       0.72      0.44      0.55       969
         pos       0.60      0.83      0.70       969

    accuracy                           0.64      1938
   macro avg       0.66      0.64      0.62      1938
weighted avg       0.66      0.64      0.62      1938


confusion_matrix:
 [[427 542]
 [164 805]]
