In [1]:
# Import libraries
import pandas as pd
import numpy as np
import spacy
import time

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Global configuration
SEED = 42
np.random.seed(SEED) 

In [3]:
# Load the en_core_web_sm model
nlp = spacy.load('en_core_web_sm')
nlp_es = spacy.load('es_core_news_lg')

# Get list of stopwords in english
stopwords = spacy.lang.en.stop_words.STOP_WORDS

In [4]:
# Read data
movie_overviews = pd.read_csv('data/movie_overviews.csv', index_col=0)
print(f'Head of movie_overviews: \n{movie_overviews.head()}')

spam = pd.read_csv('data/spam.csv', encoding='utf-8', usecols=['v1', 'v2'])
print(f'\n\nHead of spam: \n{spam.head()}')

movie_reviews_clean = pd.read_csv('data/movie_reviews_clean.csv')
print(f'\n\nHead of movie_reviews_clean: \n{movie_reviews_clean.head()}')
print('\nValues of sentiment column:')
print(movie_reviews_clean.sentiment.value_counts()) #movie_reviews_clean.sentiment.unique()

Head of movie_overviews: 
                             title  \
id                                   
862                      Toy Story   
8844                       Jumanji   
15602             Grumpier Old Men   
31357            Waiting to Exhale   
11862  Father of the Bride Part II   

                                                overview  \
id                                                         
862    Led by Woody, Andy's toys live happily in his ...   
8844   When siblings Judy and Peter discover an encha...   
15602  A family wedding reignites the ancient feud be...   
31357  Cheated on, mistreated and stepped on, the wom...   
11862  Just when George Banks has recovered from his ...   

                                                 tagline  
id                                                        
862                                                  NaN  
8844           Roll the dice and unleash the excitement!  
15602  Still Yelling. Still Fighting. Still Ready 

In [5]:
# Global functions
def preprocess(text, model=nlp, stopwords=stopwords):
    """Lemmatize a text and return it after cleaning stopwords and not alphanumerics tokens."""
    # Create Doc object without ner and parser
    # ner: EntityRecognizer, parser: owers the sentence boundary detection
    # Return lemmas without stopwords and non-alphabetic characters
    return ' '.join([token.lemma_ for token in model(text.lower(), disable=['ner', 'parser']) 
                     if token.lemma_.isalpha() and token.lemma_ not in stopwords])  

# 3. N-Gram models

Learn about n-gram modeling and use it to perform sentiment analysis on movie reviews.

## 3.1 Building a bag of words model

1. Building a bag of words model
>In this chapter, we will cover vectorization which is, as you may recall, the process of converting text into vectors.

2. Recap of data format for ML algorithms
>Recall that for any ML algorithm to run properly, data fed into it must be in tabular form and all the training features must be numerical. This is clearly not the case for textual data. In this lesson, we will learn a technique called bag of words that converts text documents into vectors.

3. Bag of words model
>The bag of words model is a procedure of extracting word tokens from a text document (henceforth, we will refer to this as just document), computing the frequency of these word tokens and constructing a word vector based on these frequencies and the vocabulary of the entire corpus of documents. This is best explained with the help of an example.

4. Bag of words model example
>Consider a corpus of three documents. The lion is the king of the jungle. Lions have an average lifespan of 15 years. And, the lion is an endangered species.

5. Bag of words model example
>We now extract the unique word tokens that occur in this corpus of documents. This will be the vocabulary of our model. In this example, the following 15 word tokens will constitute our vocabulary. Since there are 15 words in our vocabulary, our word vectors will have 15 dimensions and each dimension's value will correspond to the frequency of the word token corresponding to that dimension. For instance, the second dimension will correspond to the number of times the second word in the vocabulary, an, occurs in the document. Let's now convert our documents into word vectors using this bag of words model. The lion is the king of the jungle is converted to the following vector. Similarly, the other two sentences have the following word vector representations.

6. Text preprocessing
>As we were constructing this model, you may have noticed how text preprocessing would have been extremely useful in creating arguably better models. We would usually want Lions and lion to mean the same thing and therefore, counted as the same thing. The same applies to 'the' with different cases. We would also want to remove punctuations and stopwords as they are extremely common and don't really contribute much to the character of the document. Performing text preprocessing usually leads to smaller vocabularies, which is a good thing. While working with vectorization, it is routine to form word vectors running into thousands of dimensions and keeping this to a minimum helps improve performance.

7. Bag of words model using sklearn
>To construct the bag of words model in Python, we will use the scikit-learn library. We will use the corpus from before, consisting of the three sentences on lions. Let's ignore text preprocessing for now.

8. Bag of words model using sklearn
>We import the CountVectorizer class from sklearn.feature_extraction.text. This is the class that will help us build our bag of words model. Next, we instantiate a CountVectorizer object vectorizer. We finally create our matrix of word vectors by passing corpus to the fit_transform method of vectorizer. This is stored in bow_matrix. This bow_matrix is a sparse matrix and we can print out its 2D array form using bow matrix dot toarray(). This gives us the following output. Notice how this is different from the word vectors we generated. This is because CountVectorizer automatically lowercases words and ignores single character tokens such as 'a'. Also, it doesn't necessarily index the vocabulary in alphabetical order. We will learn how to map the vocabulary to the indices in the exercises. We can now use this bow_matrix as our training features in ML models.

9. Let's practice!
>We've covered a lot of theory in this lesson. Let us practice this in the exercises.

In [6]:
# Bag of words model using sklearn
lcorpus = pd.Series([
'The lion is the king of the jungle',
'Lions have lifespans of a decade',
'The lion is an endangered species'
])
print(f'Corpus to analize: \n{lcorpus}')

# Create CountVectorizer object
vectorizer = CountVectorizer()

# Generate matrix of word vectors
bow_matrix = vectorizer.fit_transform(lcorpus)
features = vectorizer.get_feature_names()
matrix = bow_matrix.toarray()

print(f'\nFeatures: \n{features}')
print(f'\nVectorized data: \n{matrix}')

Corpus to analize: 
0    The lion is the king of the jungle
1      Lions have lifespans of a decade
2     The lion is an endangered species
dtype: object

Features: 
['an', 'decade', 'endangered', 'have', 'is', 'jungle', 'king', 'lifespans', 'lion', 'lions', 'of', 'species', 'the']

Vectorized data: 
[[0 0 0 0 1 1 1 0 1 0 1 0 3]
 [0 1 0 1 0 0 0 1 0 1 1 0 0]
 [1 0 1 0 1 0 0 0 1 0 0 1 1]]


## 3.2 Word vectors with a given vocabulary

You have been given a corpus of documents and you have computed the vocabulary of the corpus to be the following: V: a, an, and, but, can, come, evening, forever, go, i, men, may, on, the, women

**Instructions**

Which of the following corresponds to the bag of words vector for the document _"men may come and men may go but i go on forever"_?

**Possible Answers**

1. __(0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 2, 2, 1, 0, 0)__ Correct!
2. (0, 1, 0, 1, 1, 1, 2, 0, 2, 1, 0, 0, 0, 2, 0)
3. (2, 1, 0, 0, 2, 1, 0, 0, 0, 1)
4. (0, 0, 1, 2, 1, 2, 1, 1, 1, 0, 0, 1, 1, 1, 1)

**Results**

<font color=darkgreen>Good job! That is, indeed, the correct answer. Each value in the vector corresponds to the frequency of the corresponding word in the vocabulary.</font>

## 3.3 BoW model for movie taglines

In this exercise, you have been provided with a __corpus__ of more than 7000 movie tag lines. Your job is to generate the bag of words representation __bow_matrix__ for these taglines. For this exercise, we will ignore the text preprocessing step and generate bow_matrix directly.

We will also investigate the shape of the resultant __bow_matrix__. The first five taglines in __corpus__ have been printed to the console for you to examine.

**Instructions**

1. Import the CountVectorizer class from sklearn.
2. Instantiate a CountVectorizer object. Name it vectorizer.
3. Using fit_transform(), generate bow_matrix for corpus.

**Results**

<font color=darkgreen>Excellent! You now know how to generate a bag of words representation for a given corpus of documents. Notice that the word vectors created have more than 6600 dimensions. However, most of these dimensions have a value of zero since most words do not occur in a particular tagline.</font>

In [7]:
# Read data
corpus = movie_overviews[movie_overviews.tagline.notnull()].tagline
print(corpus.head())

# Generate matrix of word vectors
bow_matrix = vectorizer.fit_transform(corpus)
features = vectorizer.get_feature_names()

# Print the features in the bag of words
n = 25
print(f'\nFirst {n} word in bow: \n{features[:n]}')

# Print the shape of bow_matrix
print(f'\nShape of bow: {bow_matrix.shape}')

id
8844             Roll the dice and unleash the excitement!
15602    Still Yelling. Still Fighting. Still Ready for...
31357    Friends are the people who let you be yourself...
11862    Just When His World Is Back To Normal... He's ...
949                               A Los Angeles Crime Saga
Name: tagline, dtype: object

First 25 word in bow: 
['000', '007', '05', '06', '08', '09', '10', '100', '1000', '1001', '10pm', '11', '11th', '12', '121', '123', '13', '130', '1350', '135th', '13th', '14', '1408', '141', '15']

Shape of bow: (7033, 6614)


## 3.4 Analyzing dimensionality and preprocessing

In this exercise, you have been provided with a __lem_corpus__ which contains the pre-processed versions of the movie taglines from the previous exercise. In other words, the taglines have been lowercased and lemmatized, and stopwords have been removed.

Your job is to generate the bag of words representation __bow_lem_matrix__ for these lemmatized taglines and compare its shape with that of __bow_matrix__ obtained in the previous exercise. The first five lemmatized taglines in __lem_corpus__ have been printed to the console for you to examine.

**Instructions**

1. Import the CountVectorizer class from sklearn.
2. Instantiate a CountVectorizer object. Name it vectorizer.
3. Using fit_transform(), generate bow_lem_matrix for lem_corpus.

**Results**

<font color=darkgreen>Good job! Notice how the number of features have reduced significantly from around 6600 to around 5223 for pre-processed movie taglines. The reduced number of dimensions on account of text preprocessing usually leads to better performance when conducting machine learning and it is a good idea to consider it. However, as mentioned in a previous lesson, the final decision always depends on the nature of the application.</font>

In [8]:
# Read data
lem_corpus = corpus.apply(preprocess)
print(lem_corpus.head())

# Generate matrix of word vectors
bow_lem_matrix = vectorizer.fit_transform(lem_corpus)
lem_features = vectorizer.get_feature_names()

# Print the features in the bag of words
n = 25
print(f'\nFirst {n} word in bow: \n{lem_features[:n]}')

# Print the shape of bow_lem_matrix
print(f'\nShape of bow: {bow_lem_matrix.shape}')

id
8844     roll dice unleash excitement
15602           yell fight ready love
31357    friend people let let forget
11862      world normal surprise life
949            los angeles crime saga
Name: tagline, dtype: object

First 25 word in bow: 
['aaargh', 'aaron', 'abandon', 'abby', 'abduction', 'ability', 'able', 'aboard', 'abracatastic', 'absence', 'absolute', 'absolutely', 'absorbent', 'abyss', 'academy', 'accentuate', 'accept', 'access', 'accident', 'acclaim', 'acclaimed', 'accomplice', 'accord', 'account', 'accountancy']

Shape of bow: (7033, 4964)


## 3.5 Mapping feature indices with feature names

In the lesson video, we had seen that __CountVectorizer__ doesn't necessarily index the vocabulary in alphabetical order. In this exercise, we will learn to map each feature index to its corresponding feature name from the vocabulary.

We will use the same three sentences on lions from the video. The sentences are available in a list named corpus and has already been printed to the console.

**Instructions**

1. Instantiate a CountVectorizer object. Name it vectorizer.
2. Using fit_transform(), generate bow_matrix for corpus.
3. Using the get_feature_names() method, map the column names to the corresponding word in the vocabulary.

**Results**

<font color=darkgreen>Great job! Observe that the column names refer to the token whose frequency is being recorded. Therefore, since the first column name is an, the first feature represents the number of times the word 'an' occurs in a particular sentence. get_feature_names() essentially gives us a list which represents the mapping of the feature indices to the feature name in the vocabulary.</font>

In [9]:
# Showing data
print('Corpus: \n{lcorpus}')

# Generate matrix of word vectors
bow_matrix = vectorizer.fit_transform(lcorpus)

# Convert bow_matrix into a DataFrame
bow_df = pd.DataFrame(bow_matrix.toarray())

# Map the column names to vocabulary 
bow_df.columns = vectorizer.get_feature_names()

# Print bow_df
print(bow_df)

Corpus: 
{lcorpus}
   an  decade  endangered  have  is  jungle  king  lifespans  lion  lions  of  \
0   0       0           0     0   1       1     1          0     1      0   1   
1   0       1           0     1   0       0     0          1     0      1   1   
2   1       0           1     0   1       0     0          0     1      0   0   

   species  the  
0        0    3  
1        0    0  
2        1    1  


## 3.6 Building a BoW Naive Bayes classifier

1. Building a BoW Naive Bayes classifier
>In this lesson, we will walk through a machine learning problem that utilizes feature engineering techniques we've learned, to arrive at a desired result.

2. Spam filtering
>Let's take a look at the spam filtering problem. We're given a dataset of messages that have been labelled as spam or ham. Here, you can see a typical spam and ham message. Our task is to train an ML model that can predict the label given a particular text.

3. Steps
>There are 3 steps involved. The first is to preprocess the text. Next, we proceed to build the bag-of-words model. Finally, we conduct predictive modeling using the generated BoW vectors. Note that although we use the term 'modeling' in the context of both BoW and machine learning, they mean two different things.

4. Text preprocessing using CountVectorizer
>We've already learned how to conduct text preprocessing using spaCy. However, it is also possible to do this using CountVectorizer. CountVectorizer takes in a number of arguments to perform preprocessing. The lowercase argument, when set to True, converts words to lowercase. The strip_accents argument can convert accented characters according to unicode or ASCII mapping. Passing in a stopwords argument will lead to CountVectorizer ignoring stopwords. You can pass in a custom list or the string 'english' to use scikit-learn's list of English stopwords. You can specify tokenization using a regular expression as the value of the token_pattern argument. Tokenization can also be specified using a tokenizer argument. Here, you can pass a function that takes a string as an argument and returns a list of tokens. This way, CountVectorizer allows usage of spaCy's tokenization techniques. CountVectorizer cannot perform certain steps such as lemmatization automatically. This is where spaCy is useful. Although it performs tokenization and preprocessing, CountVectorizer's main job is to convert a corpus into a matrix of numerical vectors.

5. Building the BoW model
>As usual, we import CountVectorizer from scikit-learn. We then instantiate a CountVectorizer object called vectorizer. We perform accent stripping using ASCII mapping and remove English stopwords. We also set the lowercase argument to False. This is because spam messages usually tend to abuse all-capital words and we might want to preserve this information for the ML step. The dataset has been already been loaded into the dataframe df. We split this dataset into training and test sets using scikit-learn's train test split function.

6. Building the BoW model
>We now fit the vectorizer on the training set and transform it into its bag-of-words representation. We can perform both these steps together using the fit transform method. Next, we transform the test set into its BoW representation. Note, that we do not fit the vectorizer with the test data. It is possible that there are some words in the test data that is not in the vocabulary of the vectorizer. In such cases, CountVectorizer simply ignores these words.

7. Training the Naive Bayes classifier
>We're now in a good position to train an ML model. We will use the Multinomial Naive Bayes classifier for this task. We import the Multinomial NB class from scikit-learn and create an object named clf. We then fit the training BoW vectors and their corresponding labels to clf. We can now test the performance of our model. We compute the accuracy of the model on the test set using clf dot score. In this case, our model registered an accuracy of 76% on the test set.

8. Let's practice!
>We've covered a lot of ground in building a spam filter in this lesson. In the exercises, we will perform similar steps to perform sentiment analysis on movie reviews. Let's practice!

In [10]:
# Read data
df = spam.copy(deep=True)
df.columns = ['label', 'message']
print(df.head())

# Create CountVectorizer object
vectorizer = CountVectorizer(strip_accents='ascii', stop_words='english', lowercase=False)

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['message'], df['label'], test_size=0.25, random_state=SEED)

# Generate training Bow vectors
X_train_bow = vectorizer.fit_transform(X_train)

# Generate test BoW vectors
X_test_bow = vectorizer.transform(X_test)

# Create MultinomialNB object
clf = MultinomialNB()

# Train clf
clf.fit(X_train_bow, y_train)

# Compute accuracy on test set
accuracy = clf.score(X_test_bow, y_test)
print(f'\n\nAccuracy of the created model to detect spam: {accuracy}')

  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


Accuracy of the created model to detect spam: 0.9870782483847811


## 3.7 BoW vectors for movie reviews

In this exercise, you have been given two pandas Series, __X_train__ and __X_test__, which consist of movie reviews. They represent the training and the test review data respectively. Your task is to preprocess the reviews and generate BoW vectors for these two sets using __CountVectorizer__.

Once we have generated the BoW vector matrices __X_train_bow__ and __X_test_bow__, we will be in a very good position to apply a machine learning model to it and conduct sentiment analysis.

**Instructions**

1. Import CountVectorizer from the sklearn library.
2. Instantiate a CountVectorizer object named vectorizer. Ensure that all words are converted to lowercase and english stopwords are removed.
3. Using X_train, fit vectorizer and then use it to transform X_train to generate the set of BoW vectors X_train_bow.
4. Transform X_test using vectorizer to generate the set of BoW vectors X_test_bow.

**Results**

<font color=darkgreen>Great job! You now have a good idea of preprocessing text and transforming them into their bag-of-words representation using CountVectorizer. In this exercise, you have set the lowercase argument to True. However, note that this is the default value of lowercase and passing it explicitly is not necessary. Also, note that both X_train_bow and X_test_bow have 8158 features. There were words present in X_test that were not in X_train. CountVectorizer chose to ignore them in order to ensure that the dimensions of both sets remain the same.</font>

In [11]:
# Create a CountVectorizer object
vectorizer = CountVectorizer(lowercase=True, stop_words='english')

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(movie_reviews_clean['review'], movie_reviews_clean['sentiment'], 
                                                    test_size=0.25, random_state=SEED)

# Fit and transform X_train
X_train_bow = vectorizer.fit_transform(X_train)

# Transform X_test
X_test_bow = vectorizer.transform(X_test)

# Print shape of X_train_bow and X_test_bow
print(X_train_bow.shape)
print(X_test_bow.shape)

(750, 15251)
(250, 15251)


## 3.8 Predicting the sentiment of a movie review

In the previous exercise, you generated the bag-of-words representations for the training and test movie review data. In this exercise, we will use this model to train a Naive Bayes classifier that can detect the sentiment of a movie review and compute its accuracy. Note that since this is a binary classification problem, the model is only capable of classifying a review as either positive (1) or negative (0). It is incapable of detecting neutral reviews.

In case you don't recall, the training and test BoW vectors are available as __X_train_bow__ and __X_test_bow__ respectively. The corresponding labels are available as __y_train__ and __y_test__ respectively. Also, for you reference, the original movie review dataset is available as __df__.

**Instructions**

1. Instantiate an object of MultinomialNB. Name it clf.
2. Fit clf using X_train_bow and y_train.
3. Measure the accuracy of clf using X_test_bow and y_test.

**Results**

<font color=darkgreen>Excellent work! You have successfully performed basic sentiment analysis. Note that the accuracy of the classifier is 73.2%. Considering the fact that it was trained on only 750 reviews, this is reasonably good performance. The classifier also correctly predicts the sentiment of a mini negative review which we passed into it.</font>

In [12]:
# Create a MultinomialNB object
clf = MultinomialNB()

# Fit the classifier
clf.fit(X_train_bow, y_train)

# Measure the accuracy
accuracy = clf.score(X_test_bow, y_test)
print("The accuracy of the classifier on the test set is %.3f" % accuracy)

# Predict the sentiment of a negative review
review = "The movie was terrible. The music was underwhelming and the acting mediocre."
prediction = clf.predict(vectorizer.transform([review]))[0]
print("The sentiment predicted by the classifier is %i" % (prediction))

The accuracy of the classifier on the test set is 0.784
The sentiment predicted by the classifier is 0


## 3.9 Building n-gram models

1. Building n-gram models
>We already know how to build bag-of-words representations of our documents and use it to conduct various machine learning tasks.

2. BoW shortcomings
>Consider the following mini reviews. One is a positive review which states that the movie was good and not boring. The other is negative; commenting that the movie was not good and boring. If we were to construct BoW vectors for these reviews, we would get identical vectors since both reviews contain exactly the same words. And here in lies the biggest shortcoming of the bag-of-words model: context of the words is lost. In this example, the position of the word 'not' changes the entire sentiment of the review. Therefore, in this lesson, we will study techniques that will allow us to model this.

3. n-grams
>An n-gram is a contiguous sequence of n elements (or words) in a given document. The bag-of-words model that we've explored so far is nothing but an n-gram model where n is equal to one. Let's now explore n-grams when n is greater than one. Consider the sentence 'for you a thousand times over'. If we set n to 2, then the n-grams (called bigrams in this case) would be for you, you a, a thousand, thousand times and times over.

4. n-grams
>Similarly, for n equal to 3, the n-grams (or trigrams) will be for you a, you a thousand, a thousand times, thousand times over. Therefore, we can use these n-grams to capture more context and account for cases like 'not'.

5. Applications
>Apart from capturing more context, n-grams have a host of other useful applications. They are used in sentence completion, spelling correction and machine translation correction. In all these cases, the model computes the probability of n words occurring contiguously to perform the above processes.

6. Building n-gram models using scikit-learn
>Building these n-gram models using scikit-learn is extremely simple, now that we know how to use CountVectorizer. CountVectorizer takes in an argument ngram range which is a tuple containing the lower and upper bound for the range of n-values. For instance, passing 2,2 as the ngram_range will generate only bigrams. On the other hand, passing in 1,3 will generate n-grams where n is equal to 1, 2 and 3.

7. Shortcomings
>While on the surface, it may seem lucrative to generate n-grams of high orders to capture more and more context, it comes with caveats. We've already seen that the BoW vectors run into thousands of dimensions. Adding higher order n-grams increases the number of dimensions even more and while performing machine learning, leads to a problem known as the curse of dimensionality. Additionally, n-grams for n greater than 3 become exceedingly rare to find in multiple documents. So that feature becomes effectively useless. For these reasons, it is often a good idea to restrict yourself to n-grams where n is small.

8. Let's practice!
>Great! Let's now build these advanced n-gram models and discover more insights in the exercises.

In [13]:
# Data example
corpus = [
    'The movie was good and not boring',
    'The movie was not good and boring'
]
print(f'Corpus: \n{corpus}')

ngrams_totest = {(1, 1): 'Default', (2, 2): 'Bigrams', (1, 3): 'Unigrams, bigrams and trigrams'}
for ngram in ngrams_totest:
    # Create CountVectorizer object 
    print(f'\nCountVectorizer with ngram_range = {ngram} - {ngrams_totest[ngram]}')
    vectorizer = CountVectorizer(ngram_range=ngram)
    
    # Generate matrix of word vectors
    bow_matrix = vectorizer.fit_transform(corpus)
    features = vectorizer.get_feature_names()
    matrix = bow_matrix.toarray()
    print(f'Features: \n{features}')
    print(f'Vectorized data: \n{matrix} \n{matrix.shape}')

Corpus: 
['The movie was good and not boring', 'The movie was not good and boring']

CountVectorizer with ngram_range = (1, 1) - Default
Features: 
['and', 'boring', 'good', 'movie', 'not', 'the', 'was']
Vectorized data: 
[[1 1 1 1 1 1 1]
 [1 1 1 1 1 1 1]] 
(2, 7)

CountVectorizer with ngram_range = (2, 2) - Bigrams
Features: 
['and boring', 'and not', 'good and', 'movie was', 'not boring', 'not good', 'the movie', 'was good', 'was not']
Vectorized data: 
[[0 1 1 1 1 0 1 1 0]
 [1 0 1 1 0 1 1 0 1]] 
(2, 9)

CountVectorizer with ngram_range = (1, 3) - Unigrams, bigrams and trigrams
Features: 
['and', 'and boring', 'and not', 'and not boring', 'boring', 'good', 'good and', 'good and boring', 'good and not', 'movie', 'movie was', 'movie was good', 'movie was not', 'not', 'not boring', 'not good', 'not good and', 'the', 'the movie', 'the movie was', 'was', 'was good', 'was good and', 'was not', 'was not good']
Vectorized data: 
[[1 0 1 1 1 1 1 0 1 1 1 1 0 1 1 0 0 1 1 1 1 1 1 0 0]
 [1 1 0 0 

## 3.10 n-gram models for movie tag lines

In this exercise, we have been provided with a __corpus__ of more than 9000 movie tag lines. Our job is to generate n-gram models up to n equal to 1, n equal to 2 and n equal to 3 for this data and discover the number of features for each model.

We will then compare the number of features generated for each model.

**Instructions**

1. Generate an n-gram model with n-grams up to n=1. Name it ng1
2. Generate an n-gram model with n-grams up to n=2. Name it ng2
3. Generate an n-Gram Model with n-grams up to n=3. Name it ng3
4. Print the number of features for each model.

**Results**

<font color=darkgreen>Good job! You now know how to generate n-gram models containing higher order n-grams. Notice that ng2 has over 37,000 features whereas ng3 has over 76,000 features. This is much greater than the 6,000 dimensions obtained for ng1. As the n-gram range increases, so does the number of features, leading to increased computational costs and a problem known as the curse of dimensionality.</font>

In [14]:
# Read data
corpus = movie_overviews[movie_overviews.tagline.notnull()].tagline
print(corpus.head())

# Generate n-grams upto n=1
vectorizer_ng1 = CountVectorizer(ngram_range=(1,1))
ng1 = vectorizer_ng1.fit_transform(corpus)

# Generate n-grams upto n=2
vectorizer_ng2 = CountVectorizer(ngram_range=(1,2))
ng2 = vectorizer_ng2.fit_transform(corpus)

# Generate n-grams upto n=3
vectorizer_ng3 = CountVectorizer(ngram_range=(1, 3))
ng3 = vectorizer_ng3.fit_transform(corpus)

# Print the number of features for each model
print("\nng1, ng2 and ng3 have %i, %i and %i features respectively" % (ng1.shape[1], ng2.shape[1], ng3.shape[1]))

id
8844             Roll the dice and unleash the excitement!
15602    Still Yelling. Still Fighting. Still Ready for...
31357    Friends are the people who let you be yourself...
11862    Just When His World Is Back To Normal... He's ...
949                               A Los Angeles Crime Saga
Name: tagline, dtype: object

ng1, ng2 and ng3 have 6614, 37100 and 76881 features respectively


## 3.11 Higher order n-grams for sentiment analysis

Similar to a previous exercise, we are going to build a classifier that can detect if the review of a particular movie is positive or negative. However, this time, we will use n-grams up to n=2 for the task.

The n-gram training reviews are available as __X_train_ng__. The corresponding test reviews are available as __X_test_ng__. Finally, use __y_train__ and __y_test__ to access the training and test sentiment classes respectively.

**Instructions**

1. Define an instance of MultinomialNB. Name it clf_ng
2. Fit the classifier on X_train_ng and y_train.
3. Measure accuracy on X_test_ng and y_test the using score() method.

**Results**

<font color=darkgreen>Excellent job! You're now adept at performing sentiment analysis using text. Notice how this classifier performs slightly better than the BoW version. Also, it succeeds at correctly identifying the sentiment of the mini-review as negative. In the next chapter, we will learn more complex methods of vectorizing textual data.</font>

In [15]:
# Create a CountVectorizer object
ng_vectorizer = CountVectorizer(lowercase=True, stop_words='english', ngram_range=(1, 2))

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(movie_reviews_clean['review'], movie_reviews_clean['sentiment'], 
                                                    test_size=0.25, random_state=SEED)

# Fit and transform X_train
X_train_ng = ng_vectorizer.fit_transform(X_train)

# Transform X_test
X_test_ng = ng_vectorizer.transform(X_test)

# Print shape of X_train_bow and X_test_bow
print(X_train_bow.shape)
print(X_test_bow.shape)

# Define an instance of MultinomialNB 
clf_ng = MultinomialNB()

# Fit the classifier 
clf_ng.fit(X_train_ng, y_train)

# Measure the accuracy 
accuracy = clf_ng.score(X_test_ng, y_test)
print("The accuracy of the classifier on the test set is %.3f" % accuracy)

# Predict the sentiment of a negative review
review = "The movie was not good. The plot had several holes and the acting lacked panache."
prediction = clf_ng.predict(ng_vectorizer.transform([review]))[0]
print("The sentiment predicted by the classifier is %i" % (prediction))

(750, 15251)
(250, 15251)
The accuracy of the classifier on the test set is 0.768
The sentiment predicted by the classifier is 0


## 3.12 Comparing performance of n-gram models

You now know how to conduct sentiment analysis by converting text into various n-gram representations and feeding them to a classifier. In this exercise, we will conduct sentiment analysis for the same movie reviews from before using two n-gram models: unigrams and n-grams upto n equal to 3.

We will then compare the performance using three criteria: accuracy of the model on the test set, time taken to execute the program and the number of features created when generating the n-gram representation.

**Instructions**

1. Initialize a CountVectorizer object such that it generates unigrams.
2. Initialize a CountVectorizer object such that it generates ngrams upto n=3.

**Results**

<font color=darkgreen>Amazing work! The program took around 0.2 seconds in the case of the unigram model and more than 10 times longer for the higher order n-gram model. The unigram model had over 12,000 features whereas the n-gram model for upto n=3 had over 178,000! Despite taking higher computation time and generating more features, the classifier only performs marginally better in the latter case, producing an accuracy of 77% in comparison to the 75% for the unigram model.</font>

In [16]:
# Data example
ngrams_totest = {(1, 1): 'Unigram', (1, 3): 'Unigrams, bigrams and trigrams'}
for ngram in ngrams_totest:
    start_time = time.time()
    # Splitting the data into training and test sets
    train_X, test_X, train_y, test_y = train_test_split(movie_reviews_clean['review'], movie_reviews_clean['sentiment'], 
                                                        test_size=0.25, random_state=SEED, 
                                                        stratify=movie_reviews_clean['sentiment'])
    # Generating ngrams
    print(f'\nCountVectorizer with ngram_range = {ngram} - {ngrams_totest[ngram]}')
    vectorizer = CountVectorizer(lowercase=True, stop_words='english', ngram_range=ngram)
    
    # Generate matrix of word vectors
    train_X = vectorizer.fit_transform(train_X)
    test_X = vectorizer.transform(test_X)
    
    # Fit classifier
    clf = MultinomialNB()
    clf.fit(train_X, train_y)
    
    # Print accuracy, time and number of dimensions
    print("""
    The program took %.3f seconds to complete. 
    The accuracy on the test set is %.2f. 
    The ngram representation had %i features.
    """ % (time.time() - start_time, clf.score(test_X, test_y), train_X.shape[1])
    )


CountVectorizer with ngram_range = (1, 1) - Unigram

    The program took 0.177 seconds to complete. 
    The accuracy on the test set is 0.79. 
    The ngram representation had 15027 features.
    

CountVectorizer with ngram_range = (1, 3) - Unigrams, bigrams and trigrams

    The program took 0.724 seconds to complete. 
    The accuracy on the test set is 0.82. 
    The ngram representation had 162285 features.
    


# Aditional material

- Datacamp course: https://learn.datacamp.com/courses/feature-engineering-for-nlp-in-python
- POS annotations in spaCy: https://spacy.io/api/annotation#pos-tagging
- NER annotations in spaCy: https://spacy.io/api/annotation#named-entities