# Bag of words model
- Extract word tokens
- Compute frequency of word tokens
- Construct a word vector out of these frequencies and the vocabulary of the corpus

In [1]:
import pandas as pd
# Load the movie overviews dataset; the path is relative to the notebook's working directory
movie = pd.read_csv('datasets/movie_overviews.csv')
# Preview the first rows of the frame
movie.head()

Unnamed: 0,id,title,overview,tagline
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...",
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...


In [2]:
# Count the missing values in each column
movie.isna().sum()

id             0
title          0
overview      12
tagline     2066
dtype: int64

In [3]:
# Keep only the rows that have a tagline. `movie` is rebound to the filtered
# frame, so every later cell that reads `movie` sees the rows with taglines only.
movie = movie.dropna(subset=['tagline'])

# Re-check the per-column missing-value counts after the drop
movie.isna().sum()

id          0
title       0
overview    0
tagline     0
dtype: int64

In [4]:
# Preview the frame again now that rows without a tagline have been dropped
movie.head()

Unnamed: 0,id,title,overview,tagline
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...
5,949,Heat,"Obsessive master thief, Neil McCauley leads a ...",A Los Angeles Crime Saga


In [5]:
# Build the corpus from the `tagline` column of `movie`: each tagline is one document
corpus = movie['tagline']

In [6]:
# Import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Create a CountVectorizer object with its default settings
vectorizer = CountVectorizer()

# Learn the vocabulary of the corpus and build the matrix of word counts
# (one row per document, one column per vocabulary term)
bow_matrix = vectorizer.fit_transform(corpus)

# Print the shape of bow_matrix: (number of documents, vocabulary size)
print(bow_matrix.shape)

(7033, 6614)


## Mapping feature indices with feature names

In [7]:
# Toy corpus of three short sentences, small enough to inspect the vocabulary by hand.
# Note: this rebinds `corpus`, which held the movie taglines until now.
corpus = [
    "The lion is the king of the jungle",
    "Lions have lifespans of a decade",
    "The lion is an endangered species",
]


In [8]:
# Create a fresh CountVectorizer object for the toy corpus
vectorizer = CountVectorizer()

# Learn the toy vocabulary and generate the matrix of word counts
bow_matrix = vectorizer.fit_transform(corpus)

# Convert bow_matrix into a DataFrame; toarray() densifies the matrix, which is
# fine for three sentences. The columns are plain integer positions at this point.
bow_df = pd.DataFrame(bow_matrix.toarray())
bow_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0,0,0,0,1,1,1,0,1,0,1,0,3
1,0,1,0,1,0,0,0,1,0,1,1,0,0
2,1,0,1,0,1,0,0,0,1,0,0,1,1


In [9]:
# Label each column with the vocabulary term it counts.
# get_feature_names_out() returns the terms ordered by column index. It replaces
# get_feature_names(), which was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell requires scikit-learn >= 1.0.
bow_df.columns = vectorizer.get_feature_names_out()

# Display the labelled document-term matrix
bow_df

Unnamed: 0,an,decade,endangered,have,is,jungle,king,lifespans,lion,lions,of,species,the
0,0,0,0,0,1,1,1,0,1,0,1,0,3
1,0,1,0,1,0,0,0,1,0,1,1,0,0
2,1,0,1,0,1,0,0,0,1,0,0,1,1


# Building a BoW Naive Bayes classifier
1. Text preprocessing
2. Building a bag-of-words model
3. Machine Learning

## Text preprocessing using CountVectorizer
CountVectorizer arguments:
- lowercase : False, True
- strip_accents : 'unicode','ascii',None
- stop_words : 'english',list, None
- token_pattern : regex
- tokenizer : function

In [10]:
# Load the pre-cleaned movie reviews; the path is relative to the notebook's working directory
movie_reviews = pd.read_csv('datasets/movie_reviews_clean.csv')
# Preview the first rows (columns: review text and sentiment label)
movie_reviews.head()

Unnamed: 0,review,sentiment
0,this anime series starts out great interesting...,0
1,some may go for a film like this but i most as...,0
2,i ve seen this piece of perfection during the ...,1
3,this movie is likely the worst movie i ve ever...,0
4,it ll soon be 10 yrs since this movie was rele...,1


In [11]:
from sklearn.model_selection import train_test_split

# Features are the raw review texts; the target is the sentiment label
X, y = movie_reviews['review'], movie_reviews['sentiment']

# Hold out 30% of the reviews for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [12]:
# Peek at the first training reviews (the index shows the shuffled original row labels)
X_train.head()

541    i was fortunate to attend the london premier o...
440    this is my favorite renoir from the fifties it...
482    there is only one racist joke in this daffy du...
422    this show lasted for most of the 1980s and had...
778    this movie is so bad it s good in an unintenti...
Name: review, dtype: object

In [13]:
# Number of reviews in the training split
X_train.shape

(700,)

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

# Lowercase the text and drop scikit-learn's built-in English stop words
vectorizer = CountVectorizer(stop_words='english', lowercase=True)

# Learn the vocabulary from the training reviews only, and encode them
X_train_bow = vectorizer.fit_transform(X_train)

# Encode the test reviews with that same vocabulary (no re-fitting on test data)
X_test_bow = vectorizer.transform(X_test)

# One shape per line: (number of reviews, vocabulary size)
print(X_train_bow.shape, X_test_bow.shape, sep='\n')

(700, 14727)
(300, 14727)


In [15]:
# Look at the first 5 rows: each row represents a review and each column a term of the vocabulary learned from the training set
X_train_bow[:5].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [16]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the bag-of-words counts.
# fit() returns the fitted estimator, so creation and training can be chained.
clf = MultinomialNB().fit(X_train_bow, y_train)

# Mean accuracy on the held-out test reviews
accuracy = clf.score(X_test_bow, y_test)
print("The accuracy of the classifier on the test set is %.3f" % accuracy)

The accuracy of the classifier on the test set is 0.787


In [17]:
# Predict the sentiment of a negative review
review = "The movie was terrible. The music was underwhelming and the acting mediocre."

# transform() expects an iterable of documents, hence the one-element list.
# predict() returns an array with one label per document.
prediction = clf.predict(vectorizer.transform([review]))

# Index out the single label before formatting: "%i" % <1-D array> relies on
# implicit array-to-scalar conversion, which NumPy has deprecated.
print("The sentiment predicted by the classifier is %i" % prediction[0])

The sentiment predicted by the classifier is 0


# BoW Shortcomings
![image-3](image-3.png)

One is a positive review, stating that the movie was good and not boring. The other is negative, stating that the movie was not good and boring.

- Exactly the same BoW representation!
- **The biggest shortcoming of the bag-of-words model: context of the words is lost.**
- Sentiment depends on the position of "not"

# Overcoming using: n-grams
- Contiguous sequence of n elements (or words) in a given document.
- n = 1 --> bag-of-words
- n = 2 (bigrams)
- n = 3 (trigrams) and so on..

Eg. "for you a thousand times over", n=2 then:

['for you', 'you a', 'a thousand', 'thousand times', 'times over']

In [20]:
# Display the corpus that the n-gram examples below are fitted on
corpus

['The lion is the king of the jungle',
 'Lions have lifespans of a decade',
 'The lion is an endangered species']

In [18]:
# One vectorizer per maximum n-gram length: ngram_range=(1, n) keeps every
# n-gram from unigrams up to length n, for n = 1, 2 and 3.
vectorizer_ng1, vectorizer_ng2, vectorizer_ng3 = (
    CountVectorizer(ngram_range=(1, max_n)) for max_n in (1, 2, 3)
)

# Fit each vectorizer on the corpus and keep its count matrix
ng1, ng2, ng3 = (
    vec.fit_transform(corpus)
    for vec in (vectorizer_ng1, vectorizer_ng2, vectorizer_ng3)
)

# Print the number of features (columns) for each model
print(
    "ng1, ng2 and ng3 have %i, %i and %i features respectively"
    % tuple(matrix.shape[1] for matrix in (ng1, ng2, ng3))
)

In [28]:
# ngram_range=(1, 1): unigrams only.
# get_feature_names_out() replaces get_feature_names(), which was removed in
# scikit-learn 1.2; it returns the terms ordered by column index.
ng1_df = pd.DataFrame(ng1.toarray(), columns=vectorizer_ng1.get_feature_names_out())
ng1_df

Unnamed: 0,an,decade,endangered,have,is,jungle,king,lifespans,lion,lions,of,species,the
0,0,0,0,0,1,1,1,0,1,0,1,0,3
1,0,1,0,1,0,0,0,1,0,1,1,0,0
2,1,0,1,0,1,0,0,0,1,0,0,1,1


In [29]:
# ngram_range=(1, 2): unigrams and bigrams.
# get_feature_names_out() replaces get_feature_names(), which was removed in
# scikit-learn 1.2; it returns the terms ordered by column index.
ng2_df = pd.DataFrame(ng2.toarray(), columns=vectorizer_ng2.get_feature_names_out())
ng2_df

Unnamed: 0,an,an endangered,decade,endangered,endangered species,have,have lifespans,is,is an,is the,jungle,king,king of,lifespans,lifespans of,lion,lion is,lions,lions have,of,of decade,of the,species,the,the jungle,the king,the lion
0,0,0,0,0,0,0,0,1,0,1,1,1,1,0,0,1,1,0,0,1,0,1,0,3,1,1,1
1,0,0,1,0,0,1,1,0,0,0,0,0,0,1,1,0,0,1,1,1,1,0,0,0,0,0,0
2,1,1,0,1,1,0,0,1,1,0,0,0,0,0,0,1,1,0,0,0,0,0,1,1,0,0,1


In [30]:
# ngram_range=(1, 3): unigrams, bigrams and trigrams.
# get_feature_names_out() replaces get_feature_names(), which was removed in
# scikit-learn 1.2; it returns the terms ordered by column index.
ng3_df = pd.DataFrame(ng3.toarray(), columns=vectorizer_ng3.get_feature_names_out())
ng3_df

Unnamed: 0,an,an endangered,an endangered species,decade,endangered,endangered species,have,have lifespans,have lifespans of,is,is an,is an endangered,is the,is the king,jungle,king,king of,king of the,lifespans,lifespans of,lifespans of decade,lion,lion is,lion is an,lion is the,lions,lions have,lions have lifespans,of,of decade,of the,of the jungle,species,the,the jungle,the king,the king of,the lion,the lion is
0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,1,1,1,1,0,0,0,1,1,0,1,0,0,0,1,0,1,1,0,3,1,1,1,1,1
1,0,0,0,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0
2,1,1,1,0,1,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,1,1,0,0,0,1,1


# Building n-grams models

In [31]:
# Reminder of the review data the n-gram classifier is trained on
movie_reviews.head()

Unnamed: 0,review,sentiment
0,this anime series starts out great interesting...,0
1,some may go for a film like this but i most as...,0
2,i ve seen this piece of perfection during the ...,1
3,this movie is likely the worst movie i ve ever...,0
4,it ll soon be 10 yrs since this movie was rele...,1


In [32]:
# Review text as the feature, sentiment label as the target
X, y = movie_reviews['review'], movie_reviews['sentiment']

# Same 70/30 split and seed as the bag-of-words experiment
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)

In [33]:
# Count unigrams and bigrams: ngram_range=(1, 2)
vectorizer_ng = CountVectorizer(ngram_range=(1, 2))

# Learn the n-gram vocabulary from the training reviews only ...
X_train_ng = vectorizer_ng.fit_transform(X_train)

# ... and encode the test reviews with that same vocabulary
X_test_ng = vectorizer_ng.transform(X_test)

# Show both shapes: (number of reviews, number of n-gram features)
(X_train_ng.shape, X_test_ng.shape)

((700, 104352), (300, 104352))

In [34]:
# Train a multinomial Naive Bayes classifier on the n-gram counts.
# fit() returns the fitted estimator, so creation and training can be chained.
clf_ng = MultinomialNB().fit(X_train_ng, y_train)

# Mean accuracy on the held-out test reviews
accuracy = clf_ng.score(X_test_ng, y_test)
print("The accuracy of the classifier on the test set is %.3f" % accuracy)

The accuracy of the classifier on the test set is 0.770


In [36]:
# Predict the sentiment of a negative review whose polarity hinges on "not good"
review = "The movie was not good. The plot had several holes and the acting lacked panache."

# transform() expects an iterable of documents, hence the one-element list.
# predict() returns an array with one label per document.
prediction = clf_ng.predict(vectorizer_ng.transform([review]))

# Index out the single label before formatting: "%i" % <1-D array> relies on
# implicit array-to-scalar conversion, which NumPy has deprecated.
print("The sentiment predicted by the classifier is %i" % prediction[0])

The sentiment predicted by the classifier is 0
