In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the tab-separated movie review file into a DataFrame.
df = pd.read_csv("moviereviews.tsv", delimiter="\t")

In [3]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [5]:
# Count missing values per column (sum of the boolean null mask down the rows).
df.isnull().sum(axis=0)

label      0
review    35
dtype: int64

In [7]:
# Remove, in place, every row that has a missing value in any column.
df.dropna(axis=0, how="any", inplace=True)

In [8]:
# Re-count missing values per column after the rows with nulls were dropped.
df.isnull().sum(axis=0)

label     0
review    0
dtype: int64

In [10]:
# Collect the index labels of reviews that contain no visible text.
#
# Only the "review" column is iterated (instead of unpacking every column from
# df.itertuples()), so this cell stays re-runnable after later cells add
# columns such as "score"; a fixed three-way unpacking would then raise
# ValueError.
#
# `not rev.strip()` flags whitespace-only strings and also empty strings, which
# `str.isspace()` reports as False. Non-string values are skipped.
blanks = [
    idx
    for idx, rev in df["review"].items()
    if isinstance(rev, str) and not rev.strip()
]

In [11]:
# Display the index labels that were flagged as blank reviews.
blanks

[57,
 71,
 147,
 151,
 283,
 307,
 313,
 323,
 343,
 351,
 427,
 501,
 633,
 675,
 815,
 851,
 977,
 1079,
 1299,
 1455,
 1493,
 1525,
 1531,
 1763,
 1851,
 1905,
 1993]

In [12]:
# Remove, in place, the rows whose index labels were flagged as blank reviews.
df.drop(index=blanks, inplace=True)

In [13]:
# Re-scan for reviews with no visible text to verify that none remain.
#
# Only the "review" column is iterated (instead of unpacking every column from
# df.itertuples()), so this cell stays re-runnable after later cells add
# columns such as "score"; a fixed three-way unpacking would then raise
# ValueError.
#
# `not rev.strip()` flags whitespace-only strings and also empty strings, which
# `str.isspace()` reports as False. Non-string values are skipped.
blanks = [
    idx
    for idx, rev in df["review"].items()
    if isinstance(rev, str) and not rev.strip()
]

In [14]:
# Display the result of the re-scan for blank reviews.
blanks

[]

In [16]:
# Tally how many reviews carry each label.
label_counts = df["label"].value_counts()
label_counts

pos    969
neg    969
Name: label, dtype: int64

In [17]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [18]:
# Create the VADER sentiment analyzer; its polarity_scores method is what the
# scoring cell applies to each review.
# NOTE(review): presumably needs the NLTK `vader_lexicon` resource to be
# available locally — confirm for fresh environments.
sid = SentimentIntensityAnalyzer()

In [19]:
# Store, for every review, the value returned by sid.polarity_scores.
# The bound method is passed directly; no lambda wrapper is needed.
df["score"] = df["review"].apply(sid.polarity_scores)

In [20]:
# Preview the first five rows, now including the "score" column.
df.head(n=5)

Unnamed: 0,label,review,score
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co..."
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com..."
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.068, 'neu': 0.781, 'pos': 0.15, 'com..."
3,pos,according to hollywood movies made in last few...,"{'neg': 0.071, 'neu': 0.782, 'pos': 0.147, 'co..."
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.091, 'neu': 0.817, 'pos': 0.093, 'co..."


In [21]:
# Pull the "compound" entry out of each row's score mapping into its own column.
df["compound"] = [scores["compound"] for scores in df["score"]]

In [22]:
# Preview the first five rows, now including the "compound" column.
df.head(n=5)

Unnamed: 0,label,review,score,compound
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co...",-0.9125
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com...",-0.8618
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.068, 'neu': 0.781, 'pos': 0.15, 'com...",0.9951
3,pos,according to hollywood movies made in last few...,"{'neg': 0.071, 'neu': 0.782, 'pos': 0.147, 'co...",0.9972
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.091, 'neu': 0.817, 'pos': 0.093, 'co...",-0.2484


In [23]:
def _polarity_label(compound):
    """Return "pos" when the compound value is >= 0, otherwise "neg"."""
    if compound >= 0:
        return "pos"
    return "neg"


# Turn each compound value into a binary label comparable with "label".
df["comp_score"] = df["compound"].apply(_polarity_label)

In [24]:
# Preview the first five rows, now including the "comp_score" column.
df.head(n=5)

Unnamed: 0,label,review,score,compound,comp_score
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co...",-0.9125,neg
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com...",-0.8618,neg
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.068, 'neu': 0.781, 'pos': 0.15, 'com...",0.9951,pos
3,pos,according to hollywood movies made in last few...,"{'neg': 0.071, 'neu': 0.782, 'pos': 0.147, 'co...",0.9972,pos
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.091, 'neu': 0.817, 'pos': 0.093, 'co...",-0.2484,neg


In [25]:
from sklearn import metrics

In [26]:
# Confusion matrix of the given labels against the labels derived from the
# compound score.
conf_matrix = metrics.confusion_matrix(y_true=df["label"], y_pred=df["comp_score"])
print(conf_matrix)

[[427 542]
 [164 805]]


In [27]:
# Per-class precision / recall / F1 of the derived labels against the given labels.
report = metrics.classification_report(y_true=df["label"], y_pred=df["comp_score"])
print(report)

              precision    recall  f1-score   support

         neg       0.72      0.44      0.55       969
         pos       0.60      0.83      0.70       969

    accuracy                           0.64      1938
   macro avg       0.66      0.64      0.62      1938
weighted avg       0.66      0.64      0.62      1938



In [28]:
# Overall fraction of reviews whose derived label matches the given label.
accuracy = metrics.accuracy_score(y_true=df["label"], y_pred=df["comp_score"])
print(accuracy)

0.6357069143446853
