# Data exploration

In [2]:
import pandas as pd
# Fetch the gzipped review CSV and use its first column as the row index.
movies = pd.read_csv(
    filepath_or_buffer='https://proai.org/movie-reviews.csv.gz',
    index_col=0,
)

In [15]:
# Report how many non-null review texts were loaded, then preview the first few.
print('Total number of movies: ', movies['text'].count())
movies['text'].head()

Total number of movies:  10605


id
1    The Rock is destined to be the 21st Century's ...
2    The gorgeously elaborate continuation of ''The...
3                       Effective but too tepid biopic
4    If you sometimes like to go to the movies to h...
5    Emerges as something rare, an issue movie that...
Name: text, dtype: object

In [16]:
print('Labeled data')
# First five rows, with numeric columns rounded to two decimals for display.
movies.head(n=5).round(decimals=2)

Labeled data


Unnamed: 0_level_0,sentiment,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2.27,The Rock is destined to be the 21st Century's ...
2,3.53,The gorgeously elaborate continuation of ''The...
3,-0.6,Effective but too tepid biopic
4,1.47,If you sometimes like to go to the movies to h...
5,1.73,"Emerges as something rare, an issue movie that..."


In [6]:
# Summary statistics of the numeric columns, rounded to two decimals.
movies.describe().round(decimals=2)

Unnamed: 0,sentiment
count,10605.0
mean,0.0
std,1.92
min,-3.88
25%,-1.77
50%,-0.08
75%,1.83
max,3.94


We can see that the sentiment ratings are centered on zero (the mean is 0.0) and fall within the range of -4 to +4.

Now you can tokenize all those movie review texts to create a **bag of words** for each one. If you put them all into a Pandas DataFrame, they will be easier to work with.

In [17]:
import pandas as pd
from collections import Counter  # Counter maps each token to its number of occurrences
from nltk.tokenize import casual_tokenize

pd.options.display.width = 75

# One bag of words (token -> count) per review, in the same order as `movies.text`.
bows = [Counter(casual_tokenize(review)) for review in movies.text]

print('Bag of words for the first movie:')
print(bows[0])


Bag of words for the first movie:
Counter({"'": 4, 'to': 2, 'The': 1, 'Rock': 1, 'is': 1, 'destined': 1, 'be': 1, 'the': 1, '21st': 1, "Century's": 1, 'new': 1, 'Conan': 1, 'and': 1, 'that': 1, "he's": 1, 'going': 1, 'make': 1, 'a': 1, 'splash': 1, 'even': 1, 'greater': 1, 'than': 1, 'Arnold': 1, 'Schwarzenegger': 1, ',': 1, 'Jean': 1, 'Claud': 1, 'Van': 1, 'Damme': 1, 'or': 1, 'Steven': 1, 'Segal': 1, '.': 1})


Now we create a DataFrame where each row represents a movie review and each column holds the count of token $t_i$ in that review. We will see that `df_movies.shape` equals `(10605, 20756)`, where `10605` is the number of movie reviews and `20756` is the vocabulary size.

In [19]:
# Build the count matrix: one row per review, one column per distinct token.
# A token absent from a review comes out of from_records as NaN, so fill
# those gaps with 0 and cast the counts back to integers.
df_movies = (
    pd.DataFrame.from_records(bows)
    .fillna(0)
    .astype(int)
)
df_movies.shape

(10605, 20756)

In [20]:
# Peek at the first five rows of the token-count matrix.
df_movies.head(n=5)

Unnamed: 0,The,Rock,is,destined,to,be,the,21st,Century's,new,...,Ill,slummer,Rashomon,dipsticks,Bearable,Staggeringly,’,ve,muttering,dissing
0,1,1,1,1,2,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,4,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Train
Now you have all the data that a Naive Bayes model needs to find the keywords that predict sentiment from natural language text.

In [25]:
from sklearn.naive_bayes import MultinomialNB

# Untrained classifier; it is fitted later in the notebook.
nb = MultinomialNB()

# Binary target: True when a review's sentiment score is positive, False otherwise.
df_labels = movies['sentiment'].gt(0)
df_labels

id
1         True
2         True
3        False
4         True
5         True
         ...  
10601    False
10602    False
10603    False
10604     True
10605    False
Name: sentiment, Length: 10605, dtype: bool

In [24]:
# Display the bag-of-words count matrix; pandas elides the middle rows and columns of a frame this large.
df_movies

Unnamed: 0,The,Rock,is,destined,to,be,the,21st,Century's,new,...,Ill,slummer,Rashomon,dipsticks,Bearable,Staggeringly,’,ve,muttering,dissing
0,1,1,1,1,2,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,4,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10600,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10601,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10602,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10603,0,0,0,0,0,0,2,0,0,0,...,0,0,0,0,0,0,2,1,0,0


What we have created is the bag-of-words representation of the corpus (`df_movies`), of shape `(n_movies, n_tokens)`, and the training labels for each movie (`df_labels`), of shape `(n_movies,)`.

In [26]:
# Features are (n_movies, n_tokens); labels are (n_movies,). Print one shape per line.
print(df_movies.shape, df_labels.shape, sep='\n')

(10605, 20756)
(10605,)


In [28]:
# Fit the classifier on the token counts and the boolean sentiment labels.
nb = nb.fit(df_movies, df_labels)

# Column 1 of predict_proba is the probability of the second class, which is
# True (positive) for these boolean labels. Rescale it linearly from [0, 1]
# onto the [-4, +4] scale of the sentiment ratings.
movies['pred_senti'] = 8 * nb.predict_proba(df_movies)[:, 1] - 4

# Signed error per review, then the mean absolute error rounded to one decimal.
movies['error'] = movies['pred_senti'] - movies['sentiment']
mae = movies['error'].abs().mean().round(1)
mae

1.9

The training labels are derived from movie ratings between -4 and +4, so we need to rescale the predicted probabilities (0 to 1) to that range by multiplying by 8 and subtracting 4.

# Evaluate

In [29]:
# List the columns now on `movies`, including the prediction columns added during training.
movies.columns

Index(['sentiment', 'text', 'pred_senti', 'error'], dtype='object')

In [30]:
# Reduce the true and the predicted sentiment scores to 0/1 "is positive" flags.
movies['senti_ispos'] = movies['sentiment'].gt(0).astype(int)
movies['pred_ispos'] = movies['pred_senti'].gt(0).astype(int)

# Keep only the sentiment- and prediction-related columns for a side-by-side view.
cols = [
    name
    for name in movies.columns
    if any(tag in name for tag in ('senti', 'pred'))
]
movies[cols].head()

Unnamed: 0_level_0,sentiment,pred_senti,senti_ispos,pred_ispos
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2.266667,2.511515,1,1
2,3.533333,3.999904,1,1
3,-0.6,-3.655976,0,0
4,1.466667,1.940954,1,1
5,1.733333,3.910373,1,1
