# Please find the comment: "# your code here" and fill in your code. 
- Please keep the comment and do not delete it.
- The expected output has shown below each cell

# Build a n-gram language model using the brown corpus
- In this task, you are going to build a n-gram language model using the brown corpus; 
- You will build a unigram model, a bigram model, and a trigram model.
- You will then need to calculate the probability for a given sentence based on the your language model.

In [1]:
import nltk
from nltk.corpus import brown 
from nltk import bigrams, trigrams
from collections import Counter, defaultdict

In [2]:
nltk.download('brown')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


True

In [3]:
# the sentence you are going to calculate the probability for
test_sent = "Congratulations you have made it"

In [4]:
# each sentence is stored as a list of tokens
brown.sents()[0]

['The',
 'Fulton',
 'County',
 'Grand',
 'Jury',
 'said',
 'Friday',
 'an',
 'investigation',
 'of',
 "Atlanta's",
 'recent',
 'primary',
 'election',
 'produced',
 '``',
 'no',
 'evidence',
 "''",
 'that',
 'any',
 'irregularities',
 'took',
 'place',
 '.']

- Build the unigram model: P(w1)

In [5]:
# Count frequency of unigrams
unigram_model = defaultdict(lambda: 0)
for sentence in brown.sents():
    for w1 in sentence:
        unigram_model[w1] += 1
        
# change counts to probability: P(w1) = count(w1)/total counts of all unigrams
for w1 in unigram_model:
    total_count = sum(unigram_model.values())# your code here 
    unigram_model[w1] = unigram_model[w1]/total_count # your code here 

In [6]:
# check the unigram model
print(unigram_model['Chicago'])
print(unigram_model['you']) 

0.0003902803894980554
0.00566839451208122


In [7]:
# calculate the sentence probability using the unigram model
# P('Congratulations you have made it') = P('Congratulations')*P(you)*P(have)*P(made)*P(it)
P_sent = 1
for wd in test_sent.split():
    print("P(%s): %.5f" % (wd, unigram_model[wd]))
    P_sent = P_sent*unigram_model[wd]

print("P(%s) based on unigram model is:" % (test_sent),P_sent)

P(Congratulations): 0.00002
P(you): 0.00567
P(have): 0.00519
P(made): 0.00202
P(it): 0.00872
P(Congratulations you have made it) based on unigram model is: 1.0170928774991854e-14


- Build the bigram model: P(w2|w1)

In [8]:
# Create a dict of dict to store bigram frequency: dict[key] = {key_1:value_1; key_2:value_2;...}
bigram_model = defaultdict(lambda: defaultdict(lambda: 0))

# Count frequency of bigrams
for sentence in brown.sents():
    for w1, w2 in bigrams(sentence, pad_right=True, pad_left=True):
        bigram_model[w1][w2] += 1
        
# Transform the counts to probabilities: P(w2|w1) = count(w1_w2)/total counts of all bigrams that start with w1: (w1, wi)
for w1 in bigram_model:
    total_count = float(sum(bigram_model[w1].values()))
    for w2 in bigram_model[w1]:
        bigram_model[w1][w2] /= total_count

In [9]:
# check the bigram model: P('job'|'good')
bigram_model['good']['(']

0.001303780964797914

In [10]:
# check all the possible w2 in the bigram ('good', w2)
dict(bigram_model['good'])

{"''": 0.010430247718383311,
 '(': 0.001303780964797914,
 ',': 0.0469361147327249,
 '--': 0.001303780964797914,
 '.': 0.05997392438070404,
 ':': 0.002607561929595828,
 ';': 0.003911342894393742,
 '?': 0.002607561929595828,
 'American': 0.001303780964797914,
 'Baptist': 0.001303780964797914,
 'British': 0.001303780964797914,
 'Catholic': 0.001303780964797914,
 'Catholics': 0.001303780964797914,
 'Democrat': 0.001303780964797914,
 'English': 0.002607561929595828,
 'Japanese': 0.001303780964797914,
 'Jew': 0.001303780964797914,
 'Lord': 0.001303780964797914,
 'Mayor': 0.001303780964797914,
 None: 0.001303780964797914,
 'Protestant': 0.001303780964797914,
 'Virgin': 0.001303780964797914,
 '``': 0.001303780964797914,
 'a': 0.003911342894393742,
 'abaringe': 0.001303780964797914,
 'address': 0.001303780964797914,
 'advantage': 0.001303780964797914,
 'agreement': 0.001303780964797914,
 'although': 0.001303780964797914,
 'am': 0.001303780964797914,
 'an': 0.001303780964797914,
 'and': 0.029986

In [11]:
# manually assign a bigram probability for P(you|Congratulations)
bigram_model['Congratulations']['you'] = 0.05678

In [12]:
# calculate the sentence probability using the bigram model
# P('Congratulations you have made it') = P('Congratulations')*P(you|Congratulations)*P(have|you)*P(made|have)*P(it|made)

P_sent = 1
wd_list = test_sent.split()
for wi,wd in enumerate(wd_list):
    if(wi == 0):
        P_sent = P_sent*(unigram_model[wd]) # your code here
        print(wd, float("%.5f" % unigram_model[wd]))
    else:
        P_sent =  P_sent*(bigram_model[wd_list[wi-1]][wd_list[wi]]) # your code here
        print("P(%s|%s): %.5f" % (wd_list[wi], wd_list[wi-1], bigram_model[wd_list[wi-1]][wd_list[wi]]))  
    
print("P(%s) based on bigram model is:" % (test_sent),P_sent)

Congratulations 2e-05
P(you|Congratulations): 0.05678
P(have|you): 0.03868
P(made|have): 0.00797
P(it|made): 0.04100
P(Congratulations you have made it) based on bigram model is: 1.407282960001504e-11


- Build the trigram model: P(w1,w2,w3)

In [13]:
# Create a dict of dict to store trigram frequency: dict[key] = {key_1:value_1; key_2:value_2;...}
trigram_model = defaultdict(lambda: defaultdict(lambda: 0))

# Count frequency of trigrams
for sentence in brown.sents():
    for w1, w2, w3 in trigrams(sentence, pad_right=True, pad_left=True):
        trigram_model[w1,w2][w3] += 1

# Transform the counts to probabilities: count(w1_w2_w3)/total counts of all trigrams that start with w1_w2: (w1, w2, wi)
for w1,w2 in trigram_model:
    total_count = sum(trigram_model[w1,w2][w3] for w3 in trigram_model[w1,w2])# your code here
    for w3 in trigram_model[w1,w2]:
        trigram_model[w1,w2][w3] /= total_count

In [14]:
print(trigram_model['an','investigation']['of']) 

0.5714285714285714


In [15]:
# check all the possible w3 in the trigram ('an', 'investigation', w3)
dict(trigram_model['an','investigation'])

{'.': 0.14285714285714285,
 'of': 0.5714285714285714,
 'which': 0.14285714285714285,
 'with': 0.14285714285714285}

In [16]:
# manually assign a bigram probability for P(you,Congratulations)
trigram_model['Congratulations','you']['have'] = 0.08765

In [17]:
# calculate the sentence probability using the trigram model
# P('Congratulations you have made it') = P('Congratulations')*P(you|Congratulations)*P(have|Congratulations,you)*P(made|you,have)*P(it|have,made)

P_sent = 1
wd_list = test_sent.split()
for wi,wd in enumerate(wd_list):
    if(wi == 0):
        P_sent = P_sent*(unigram_model[wd]) # your code here
        print("P(%s): %.5f" % (wd, unigram_model[wd]))
    elif(wi == 1):
        P_sent =  P_sent*(bigram_model[wd_list[wi-1]][wd_list[wi]]) # your code here
        print("P(%s|%s): %.5f" % (wd_list[wi], wd_list[wi-1], bigram_model[wd_list[wi-1]][wd_list[wi]]))  
    else:
        P_sent = P_sent*(trigram_model[wd_list[wi-2], wd_list[wi-1]][wd_list[wi]]) # your code here
        print("P(%s|%s,%s): %.5f" % (wd_list[wi], wd_list[wi-2], wd_list[wi-1], trigram_model[wd_list[wi-2],wd_list[wi-1]][wd_list[wi]]))  
    
print("P(%s) based on trigram model is:" % (test_sent),P_sent)

P(Congratulations): 0.00002
P(you|Congratulations): 0.05678
P(have|Congratulations,you): 0.08765
P(made|you,have): 0.00935
P(it|have,made): 0.06452
P(Congratulations you have made it) based on trigram model is: 5.887520532622753e-11


# Compare the performance of Logistic Regression classifier and Naive Bayes classifier

In [18]:
# import google.colab
from google.colab import files
uploaded = files.upload()

Saving df_data.csv to df_data.csv


In [19]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report

In [20]:
df_data = pd.read_csv('df_data.csv')
df_data.shape, display(df_data.head(2))

Unnamed: 0,review,label
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1


((49582, 2), None)

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(lowercase=True, stop_words='english', max_df=0.8, min_df=5, 
                             ngram_range=(1,1), binary=True)

X = vectorizer.fit_transform(df_data.review)
y = df_data.label.values

print("X.shape : ",X.shape)
print("y.shape : ",y.shape)

X.shape :  (49582, 36669)
y.shape :  (49582,)


In [23]:
# train test split
import random
from sklearn.model_selection import train_test_split
random.seed(42)

def split_train_test(train_fraction, test_fraction):
    df_data_reidx = df_data.reset_index(drop=True) # reset index in case there are gaps
    train_idx, test_idx = train_test_split(np.arange(df_data_reidx.shape[0]), 
                                           train_size=train_fraction, test_size=test_fraction, 
                                           shuffle=True, random_state=42)

    # len(train_idx), len(test_idx)
    # print("Number of training examples:{}".format(len(train_idx)))
    # print("Number of testing examples:{}".format(len(test_idx)))

    X_train = X[train_idx]
    y_train = y[train_idx]

    X_test = X[test_idx]
    y_test = y[test_idx]

    print("Training data: X_train : {}, y_train : {}".format(X_train.shape, y_train.shape))
    print("Testing data: X_test : {}, y_test : {}".format(X_test.shape, y_test.shape))
    
    return X_train, y_train, X_test, y_test

In [24]:
X_train, y_train, X_test, y_test = split_train_test(train_fraction=0.8, test_fraction=0.2)

Training data: X_train : (39665, 36669), y_train : (39665,)
Testing data: X_test : (9917, 36669), y_test : (9917,)


- Fit a Logistic Regression classifier and check its performance

In [33]:
from sklearn.linear_model import LogisticRegression

def evaluate_LR():
    LR_clf = LogisticRegression(solver='liblinear')
    LR_clf.fit(X_train, y_train)
    y_LRpred_test = LR_clf.predict(X_test)
    print(classification_report(y_test, y_LRpred_test)) # write your code inside the function

evaluate_LR()

              precision    recall  f1-score   support

           0       0.89      0.87      0.88      4939
           1       0.87      0.89      0.88      4978

    accuracy                           0.88      9917
   macro avg       0.88      0.88      0.88      9917
weighted avg       0.88      0.88      0.88      9917



- Fit a Naive Bayes classifier and check its performance

In [34]:
from sklearn.naive_bayes import BernoulliNB

def evaluate_NB():
    NB_clf = BernoulliNB()
    NB_clf.fit(X_train, y_train)
    y_NBpred_test = NB_clf.predict(X_test)# your code here: apply the classifier to make prediction on test set
    print(classification_report(y_test, y_NBpred_test))

evaluate_NB()

              precision    recall  f1-score   support

           0       0.83      0.87      0.85      4939
           1       0.87      0.83      0.85      4978

    accuracy                           0.85      9917
   macro avg       0.85      0.85      0.85      9917
weighted avg       0.85      0.85      0.85      9917



- Select a small number of training data, re-fit the two classifiers and compare their performance

In [35]:
X_train, y_train, X_test, y_test = split_train_test(train_fraction=0.3, test_fraction=0.2)

Training data: X_train : (14874, 36669), y_train : (14874,)
Testing data: X_test : (9917, 36669), y_test : (9917,)


In [39]:
# evaluate the LR classifier performance on new training and testing data
evaluate_LR() # your code here (one line)

              precision    recall  f1-score   support

           0       0.87      0.85      0.86      4939
           1       0.85      0.87      0.86      4978

    accuracy                           0.86      9917
   macro avg       0.86      0.86      0.86      9917
weighted avg       0.86      0.86      0.86      9917



In [41]:
# evaluate the NB classifier performance on new training and testing data
evaluate_NB() # your code here (one line)

              precision    recall  f1-score   support

           0       0.83      0.87      0.85      4939
           1       0.86      0.82      0.84      4978

    accuracy                           0.85      9917
   macro avg       0.85      0.85      0.85      9917
weighted avg       0.85      0.85      0.85      9917

