# MP2: Text Categorization

Hope McIntyre (hm7zg)

In [1]:
# Base
import numpy as np
import pandas as pd
import json
import re
import string
from os import listdir
import math
import timeit
# Natural Language Processing
import nltk
from nltk.stem.snowball import EnglishStemmer # load the stemmer module from NLTK
# Get an instance of SnowballStemmer for English
stemmer = EnglishStemmer() 
# Plotting
import matplotlib.pyplot as plt
%matplotlib inline
# iPython Notebook
from IPython.display import display, HTML

In [2]:
# Load Stopword List
stopwords = open('stopwords.txt', 'r').read().split()

In [3]:
# Read in JSON

# Get File Names for Test and Train set
files_test = listdir('yelp/test')
files_train = listdir('yelp/train')

In [4]:
def load_json(fileList,filePath):
    dataDF = pd.DataFrame()
    for file in fileList:
        #json_data = open(filePath+file, encoding = "ISO-8859-1").read()
        json_data = open(filePath+file, errors = "ignore").read()
        data = json.loads(json_data)

        # Move Reviews only to DataFrame
        dataDF = dataDF.append(pd.DataFrame.from_dict(data['Reviews']))
        
    return dataDF

In [5]:
dataDF_train = load_json(files_train, 'yelp/train/')

text_train = dataDF_train['Content']
reviewID_train = dataDF_train['ReviewID']
print("Num of rows: ",len(text_train))

Num of rows:  38688


In [6]:
def clean_words(text):
    tokenizer = nltk.tokenize.treebank.TreebankWordTokenizer()
    words = tokenizer.tokenize(text)

    # Convert to Lowercase
    # words = words.map(str.lower)
    cleanWords = [t.lower() for t in words]

    # Normalize (remove punctuation)
    # punc = string.punctuation
    # cleanWords = [t for t in cleanWords if t not in punc]
    cleanWords = [re.sub('[^0-9a-z]', "", x) for x in cleanWords]
    
    # Remove Empty Vectors
    cleanWords = [x for x in cleanWords if x != '']
 
    # Identify Digits & Convert to Num
    cleanWords = [re.sub("\d+", "NUM", x) for x in cleanWords]

    # Stem Words
    cleanWords = [stemmer.stem(x) for x in cleanWords] # call stemmer to stem the input
    
    return cleanWords

In [7]:
# Tokenize & Clean Train Corpus
tokens_train = []
tokens_train = [clean_words(i) for i in text_train]

# 2.1 MLE for statistical language models with proper smoothing

### Unigram Model

In [8]:
def term_freq(textList):
    TF = {}
    for row in textList:
        #print(row)
        for word in row:
            # print(word)
            if word in TF:
                TF[word] += 1
            else:
                TF[word] = 1
    return TF

In [9]:
TTF_train = term_freq(tokens_train)

In [12]:
u_theta = pd.DataFrame.from_dict(TTF_train, orient = "index")
u_theta.columns = ['TTF']
u_theta.sort('TTF', ascending = False)[0:10]

Unnamed: 0,TTF
the,261229
and,162202
i,144286
a,129623
to,103806
was,95743
it,85929
of,79493
is,60543
for,55931


In [258]:
# Total Number of Words in Training Corpus
nWords_train = u_theta['TTF'].sum()
nWords_train

4948882

In [259]:
# Number of Unique Words in Training Corpus
vSize_train = len(u_theta['TTF'])
vSize_train

64512

In [13]:
# Calculate Probabilty of Each Word by TTF/N
u_theta['p'] = u_theta/nWords_train
u_theta.sort('TTF', ascending = False)[0:10]

Unnamed: 0,TTF,p
the,261229,0.052785
and,162202,0.032775
i,144286,0.029155
a,129623,0.026192
to,103806,0.020976
was,95743,0.019346
it,85929,0.017363
of,79493,0.016063
is,60543,0.012234
for,55931,0.011302


In [260]:
# Check that Probability Sums to 1
u_theta['p'].sum()

0.9999999999993633

### Bigram with Linear Interpolation

In [98]:
# Create Dict of Dict with the Instance of Every nextWord
def count_nextWordFreq(textList):
    NWF = {}
    for tokenList in textList:
        for i in range(0,len(tokenList)-1):
            word = tokenList[i]
            nextWord = tokenList[i+1]
            #print(word)
            #print(nextWord)
            # print(word)
            if word in NWF:
                if nextWord in NWF[word]:
                    NWF[word][nextWord] += 1
                else:
                    NWF[word][nextWord] = 1
            else:
                NWF[word] = {}
                NWF[word][nextWord] = 1
            #print(NWF)
    return NWF

In [99]:
NWF = count_nextWordFreq(tokens_train)

In [97]:
# Lambda Value
l = 0.9

In [100]:
# Smoothed Probability with Linear Interpolation
def calc_pSmoothLI(wordi,givenWord, l):
    if wordi in NWF[givenWord]:
        prob = NWF[givenWord][wordi]/sum(NWF[givenWord].values())
    else:
        prob = 0
    probSmooth = l*prob + (1-l)*u_theta['p'].ix[wordi]
    return probSmooth

In [101]:
probSmoothLI_good = [calc_pSmoothLI(word, 'good', 0.9) for word in u_theta.index]

In [102]:
probSmoothLI_good = pd.DataFrame(probSmoothLI_good)
probSmoothLI_good.index = u_theta.index
probSmoothLI_good.columns = ['p_smooth']
print("The 10 Most Likely Words Following Good: Linear Interpolation")
probSmoothLI_good.sort('p_smooth',ascending = False)[0:10]

The 10 Most Likely Words Following Good: Linear Interpolation


Unnamed: 0,p_smooth
but,0.086325
and,0.062767
the,0.054971
i,0.052512
as,0.034262
food,0.02704
it,0.01877
thing,0.01708
for,0.016851
too,0.014835


In [103]:
# Check to make sure the Probabilities add to 1
sum(probSmoothLI_good['p_smooth'])

1.0000000000009288

### Bigram Absolute Discount

In [268]:
def calc_pSmoothAD(wordi,givenWord, delta):
    #smooth_NWF = {}
    givenWordCount = sum(NWF[givenWord].values())
    uniqueWords = len(NWF[givenWord])
    l = (d*uniqueWords)/(givenWordCount)
    #print(givenWordCount)
    # calculate lambda value, from w(i-1)
    # L = (d*len(u_theta))/sum(u_theta['TTF'])
    if wordi in NWF[givenWord]:
        #print(NWF[givenWord][wordi])
        #print(max(NWF[givenWord][wordi]-delta,0))
        probSmooth = (max(NWF[givenWord][wordi]-delta,0)/givenWordCount) + l*u_theta['p'].ix[wordi]
        #print(probSmooth)
    else: 
        probSmooth = l*u_theta['p'].ix[wordi]
        #print(probSmooth)
    return probSmooth

In [270]:
# delta - Given
d = 0.1

In [333]:
probSmoothAD_good = [calc_pSmoothAD(word, 'good', d) for word in u_theta.index]

In [334]:
probSmoothAD_good = pd.DataFrame(probSmoothAD_good)
probSmoothAD_good.index = u_theta.index
probSmoothAD_good.columns = ['p_smooth']
print("The 10 Most Likely Words Following Good: Absolute Discount Smooth")
probSmoothAD_good.sort('p_smooth',ascending = False)[0:10]

The 10 Most Likely Words Following Good: Absolute Discount Smooth


Unnamed: 0,p_smooth
but,0.094913
and,0.066339
the,0.055601
i,0.055319
as,0.037631
food,0.029526
it,0.019051
thing,0.018827
for,0.017548
too,0.01627


In [335]:
# Check to make sure the Probabilities add to 1
sum(probSmoothAD_good['p_smooth'])

0.99999999999883449

### 3. Are those top 10 words the same from these two bigram language models?

A: Yes, the words are the same from the two bigram language models, though the probability of each word does vary slightly.

# 2.2 Generate text documents from a language model

In [None]:
# Generate 15 Word Sentence from Unigram Model

In [17]:
import random
random.seed()

### Generated Sentences - Unigram

In [327]:
# Randomly Sample 15 word sentences from the Probability Distribution of the Unigram Language Model
print("Unigram Model Generated Sentences:\n")
genSentences_u = []
genSentencesProb_u = []
for i in range(0,10): # Number of Sentences
    genSent_u = []
    genSent_prob = 1
    for j in range(0,15): # Number of Words in Sentences
        # Randomly Sample Word from Unigram Distribution
        # np.random.choice randomly samples from a set of values given a vector of probabilities (p)
        selectedWord = np.random.choice(u_theta.index, size = 1, p = u_theta['p'])[0]
        genSent_u.append(selectedWord)
        # Fetch Probability of Selected Word
        prob_selectedWord = u_theta['p'].ix[selectedWord]
        genSent_prob = genSent_prob*prob_selectedWord
    print(i+1, genSent_u)
    print('Probability of Generated Sentence = ',genSent_prob)
    # Store Just in Case
    genSentences_u.append(genSent_u)
    genSentencesProb_u.append(genSent_prob)

Unigram Model Generated Sentences:

1 ['mass', 'were', 'bit', 'had', 'a', 'when', 'came', 'dog', 'restaur', 'man', 'health', 'on', 'tri', 'need', 'grew']
Probability of Generated Sentence =  9.64946525588e-47
2 ['the', 's', 'long', 'food', 'been', 'so', 'toward', 'the', 'the', 'and', 'quick', 'for', 'pretti', 'and', 'stuf']
Probability of Generated Sentence =  5.66744040088e-36
3 ['fri', 'outsid', 'billeh', 'old', 'corn', 'up', 'around', 'hostess', 's', 'marin', 'tender', 'like', 'with', 'small', 'fuzzi']
Probability of Generated Sentence =  3.05381355006e-52
4 ['tri', 'flavor', 'my', 'drink', 'everi', 'super', 'and', 'michigan', 'the', 'love', 'love', 'in', 's', 'she', 'this']
Probability of Generated Sentence =  3.07117627767e-39
5 ['end', 'best', 'are', 'europ', 'rare', 'a', 'northsid', 'time', 'realli', 'from', 'roster', 'time', 'admit', 'for', 'there']
Probability of Generated Sentence =  1.24002414962e-49
6 ['di', 'the', 'tast', 'spain', 'lamb', 'the', 'when', 'ononc', 'get', 'ju

### Generated Sentences - Bigram Linear Interpolation

In [328]:
# Randomly Sample 15 word sentences from the Probability Distribution of the Bigram Linear Interpolation Model
# NEED TO ADD SMOOTHING, BUT WHERE?
print("Bigram L.I. Model Generated Sentences:\n")

genSentences_biLI = []
genSentencesProb_biLI = []
for i in range(0,10): # Number of Sentences
    genSent = []
    for j in range(0,15): # Number of Words in Sentences
        if j == 0:
            # Randomly Sample Word from Unigram Distribution
            # np.random.choice randomly samples from a set of values given a vector of probabilities (p)
            selectedWord = np.random.choice(u_theta.index, size = 1, p = u_theta['p'])[0]
            genSent.append(selectedWord)
            genSent_prob = u_theta['p'].ix[selectedWord]
        else:
            prob = [calc_pSmoothLI(word, genSent[j-1], 0.9) for word in u_theta.index]
            prob = pd.DataFrame(prob)
            prob.index = u_theta.index
            prob.columns = ['p_smooth']
            selectedWord = np.random.choice(prob.index, size = 1, p = prob['p_smooth'])[0]
            genSent.append(selectedWord)
            genSent_prob = genSent_prob*prob['p_smooth'].ix[selectedWord]
    print(i+1, genSent)
    print('Probability of Generated Sentence = ',genSent_prob)
    # Store Just in Case
    genSentences_biLI.append(genSent)
    genSentencesProb_biLI.append(genSent_prob)

Bigram L.I. Model Generated Sentences:

1 ['not', 'on', 'this', 'i', 'crisp', 'to', 'welcom', 'a', 'runni', 'for', 'visit', 'all', 'of', 'chai', 'tast']
Probability of Generated Sentence =  5.88608659982e-41
2 ['bar', 'and', 'that', 'even', 'in', 'the', 'for', 'gotten', 'oh', 'that', 'price', 'at', 'a', 'nice', 'staff']
Probability of Generated Sentence =  3.29121339835e-35
3 ['servic', 'was', 'expect', 'to', 'the', 'menu', 'were', 'good', 'but', 'it', 'from', 'what', 'to', 'maxim', 'the']
Probability of Generated Sentence =  1.44576715016e-27
4 ['m', 'there', 'is', 'one', 'friend', 'and', 'wait', 'and', 'cycl', 'but', 'i', 've', 'ever', 'in', 'the']
Probability of Generated Sentence =  3.40935628966e-29
5 ['person', 'pizza', 'and', 'possibl', 'the', 'rich', 'flavor', 'and', 'the', 'chicken', 'and', 'ham', 'saute', 'whole', 'tabl']
Probability of Generated Sentence =  4.19077500239e-34
6 ['crowd', 'dure', 'blackhawk', 'won', 'a', 'hot', 'order', 'asap', 'love', 'fri', 'are', 'appropri'

### Generated Sentences - Bigram Absolute Discount Smoothing

In [329]:
print("Bigram A.D. Model Generated Sentences:\n")

genSentences_biAD = []
genSentencesProb_biAD = []

for i in range(0,10): # Number of Sentences
    genSent = []
    for j in range(0,15): # Number of Words in Sentences
        if j == 0:
            # Randomly Sample Word from Unigram Distribution
            # np.random.choice randomly samples from a set of values given a vector of probabilities (p)
            selectedWord = np.random.choice(u_theta.index, size = 1, p = u_theta['p'])[0]
            genSent.append(selectedWord)
            genSent_prob = u_theta['p'].ix[selectedWord]
        else:
            prob = [calc_pSmoothAD(word, genSent[j-1], d) for word in u_theta.index]
            prob = pd.DataFrame(prob)
            prob.index = u_theta.index
            prob.columns = ['p_smooth']
            selectedWord = np.random.choice(prob.index, size = 1, p = prob['p_smooth'])[0]
            genSent.append(selectedWord)
            genSent_prob = genSent_prob*prob['p_smooth'].ix[selectedWord]
    print(i+1, genSent)
    print('Probability of Generated Sentence = ',genSent_prob)
    # Store Just in Case
    genSentences_biLI.append(genSent)
    genSentencesProb_biLI.append(genSent_prob)

Bigram A.D. Model Generated Sentences:

1 ['went', 'here', 'has', 'grill', 'chees', 'was', 'feel', 'special', 'was', 'much', 'much', 'enjoy', 'thati', 'think', 'i']
Probability of Generated Sentence =  3.76118445877e-34
2 ['their', 'food', 'came', 'back', 'in', 'their', 'sauc', 'and', 'cash', 'regist', 'the', 'date', 'were', 'wonder', 'i']
Probability of Generated Sentence =  2.36565416869e-29
3 ['the', 'server', 'preemptiv', 'brought', 'it', 'was', 'pretti', 'good', 'these', 'textur', 'to', 'get', 'my', '13', 'day']
Probability of Generated Sentence =  2.49112136875e-32
4 ['of', 'sushi', 'rice', 'these', 'kind', 'and', 'matin', 'martini', 'glass', 'of', 'the', 'price', 'the', 'wait', 'for']
Probability of Generated Sentence =  2.78654883325e-28
5 ['thought', 'about', 'this', 'place', 'for', 'the', 'waitress', 'split', 'an', 'avid', 'riesl', 'i', 'opt', 'to', 'wait']
Probability of Generated Sentence =  3.98046725781e-27
6 ['which', 'was', 'occupi', 'the', 'wine', 'select', 'and', 'one

## 2.3 Language model evaluation

In [18]:
dataDF_test = load_json(files_test, 'yelp/test/')

# Extract Only Text Values in PD Series
text_test = dataDF_test['Content']
reviewID_test = dataDF_test['ReviewID']
print("Num of rows: ",len(text_test))

Num of rows:  19803


In [19]:
# Tokenize & Clean Train Corpus
tokens_test = []
tokens_test = [clean_words(i) for i in text_test]

### Unigram Model Perplexity Evaluation

In [160]:
def calc_perplexity(tokenList):
    if tokenList != []:
        doc_length = len(tokenList)
        pSmooth = calc_pSmoothAdditive(tokenList, 0.1, vSize_train, nWords_train)
        #print(doc_length)
        perp = 1
        for word in tokenList:
            probs = pSmooth.ix[word]
            # Modified Perplexity Calculation to Mathematical Equivalent - Raising the Inverse Liklihood 
            # of Each Probability to the Sqrt of N then Multiplying Values for Each Word Together
            perp = perp*((1/probs)**(1/doc_length))
        return perp
    else:
        return np.NAN

In [159]:
def calc_pSmoothAdditive(tokenList, d, vSize_train, nWords_train):
    unseenWords = list(set(tokenList) - set(u_theta.index))
    #print(len(unseenWords))
    if len(unseenWords) == 0:
        return u_theta['p']
    else:
        # Build Series with all unique words in training set + unseen words from test document
        pSmooth = u_theta['TTF'].append(pd.Series(([0]*len(unseenWords)), index = unseenWords))
        nWords_train += len(unseenWords)
        vSize_train += len(unseenWords)
        f = lambda x: ((x + d) / (nWords_train + d*vSize_train))
        pSmooth = pSmooth.map(f)
        return pSmooth

In [176]:
perp_test = []
perp_test = [calc_perplexity(i) for i in tokens_test]

In [177]:
# Add Document By Document Perplexity to Data DataFrame
dataDF_test['perp_u'] = perp_test

In [None]:
dataDF_test.sort('perp_u', ascending = False)[0:10]

In [181]:
# Mean Perplexity
mean_perp_u = dataDF_test['perp_u'].mean()
mean_perp_u

1759.6233519870582

In [182]:
# Std. Deviation Perplextiy
stddev_perp_u = np.std(dataDF_test['perp_u'])
stddev_perp_u

55800.82900697847

### Bigram Model - L.I. Perplexity Evaluation

In [304]:
def calc_perplexityLI(tokenList):
    if tokenList != []:
        pSmooth_uni = calc_pSmoothAdditive(tokenList, d, vSize_train, nWords_train)

        doc_length = len(tokenList)
        
        perp = 1
        
        for i in range(1,len(tokenList)):
            # This needs to use the smoothed with unseen words dist
            # print(tokenList[i],tokenList[i-1])
            probs = calc_pSmoothLI_perpMod(tokenList[i], tokenList[i-1], 0.9, pSmooth_uni)
            # Modified Perplexity Calculation to Mathematical Equivalent - Raising the Inverse Liklihood 
            # of Each Probability to the Sqrt of N then Multiplying Values for Each Word Together
            perp = perp*((1/probs)**(1/doc_length))
        return perp
    else:
        return np.NAN

In [305]:
def calc_pSmoothLI_perpMod(wordi,givenWord, l, uni_probDist):
    if givenWord in NWF.keys():
        if wordi in NWF[givenWord]:
            prob = NWF[givenWord][wordi]/sum(NWF[givenWord].values())
        else:
            prob = 0
    else: # Case where givenWord is unseen in Training
        prob = 1/vSize_train
    probSmooth = l*prob + (1-l)*uni_probDist.ix[wordi]
    return probSmooth

In [307]:
perpLI_test = []
perpLI_test = [calc_perplexityLI(i) for i in tokens_test]

In [308]:
# Add Document By Document Perplexity to Data DataFrame
dataDF_test['perp_biLI'] = perpLI_test

In [None]:
dataDF_test.sort('perp_biLI', ascending = False)[0:10]

In [310]:
# Mean Perplexity
mean_perp_biLI = dataDF_test['perp_biLI'].mean()
mean_perp_biLI

249.80536916631573

In [311]:
# Std. Deviation Perplextiy
stddev_perp_biLI = np.std(dataDF_test['perp_biLI'])
stddev_perp_biLI

2032.001430748753

### Bigram Model - A.D. Perplexity Evaluation

In [336]:
def calc_perplexityAD(tokenList):
    if tokenList != []:
        pSmooth_uni = calc_pSmoothAdditive(tokenList, d, vSize_train, nWords_train)
        doc_length = len(tokenList)
        
        perp = 1
        for i in range(1,len(tokenList)):
            probs = calc_pSmoothAD_perpMod(tokenList[i], tokenList[i-1], 0.1, pSmooth_uni)
            # Modified Perplexity Calculation to Mathematical Equivalent - Raising the Inverse Liklihood 
            # of Each Probability to the Sqrt of N then Multiplying Values for Each Word Together
            perp = perp*((1/probs)**(1/doc_length))
        return perp
    else:
        return np.NAN

In [337]:
def calc_pSmoothAD_perpMod(wordi,givenWord, delta, uni_probDist):
    if givenWord in NWF.keys():
        givenWordCount = sum(NWF[givenWord].values())
        uniqueWords = len(NWF[givenWord])
        l = (d*uniqueWords)/(givenWordCount)
        if wordi in NWF[givenWord]:
            probSmooth = (max(NWF[givenWord][wordi]-delta,0)/givenWordCount) + l*uni_probDist.ix[wordi]
        else: 
            probSmooth = l*uni_probDist.ix[wordi]
    else: # Case where givenWord is unseen in Training
        probSmooth = 1/vSize_train
    return probSmooth

In [339]:
perpAD_test = []
perpAD_test = [calc_perplexityAD(i) for i in tokens_test]

In [340]:
# Add Document By Document Perplexity to Data DataFrame
dataDF_test['perp_biAD'] = perpAD_test

In [None]:
dataDF_test.sort('perp_biAD', ascending = False)[0:10]

In [342]:
# Mean Perplexity
mean_perp_biAD = dataDF_test['perp_biAD'].mean()
mean_perp_biAD

379.10818626482325

In [343]:
# Std. Deviation Perplextiy
stddev_perp_biAD = np.std(dataDF_test['perp_biAD'])
stddev_perp_biAD

6665.424039280692

### 3. Can you conclude which language model predicts the data in test folder better? And why

A: The bigram Linear Interpolation model predicts the data in the test folder the best. I can tell this because the mean and standard deviation of the perplexity calculated on the Test Set is the lowest.