In [1]:
# import libs
import dill as dl
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt



In [2]:
sections = [('arts', 'Ch4/Arts.csv'), 
            ('business','Ch4/Business.csv'), 
            ('obituaries','Ch4/Obituaries.csv'), 
            ('sports','Ch4/Sports.csv'), 
            ('world', 'Ch4/World.csv')]
for section,file in sections:
    with open(file) as f:
        content = f.readlines()
        content = [x.split(sep='\t', maxsplit=2) for x in content] 
        globals()[section] = pd.DataFrame(content, columns=['url', 'title', 'body'])

arts.describe()


Unnamed: 0,url,title,body
count,1000,1000,1000
unique,1000,963,1000
top,https://www.nytimes.com/2018/03/05/style/vanit...,11 of Our Best Weekend Reads,"<!DOCTYPE html><html lang=""en"" itemId=""https:/..."
freq,1,15,1


In [3]:
# pre-process title and body. tokenize and remove stop-words
from bs4 import BeautifulSoup
from nltk import word_tokenize
from nltk.corpus import stopwords
stop_words = list(stopwords.words('english'))

def get_tokenized_text(text):
    # remove style and script elements
    soup = BeautifulSoup(text, 'lxml')
    for script in soup(["script", "style"]):
        script.decompose()

    # tokenize and remove stop words
    tokens = word_tokenize(soup.get_text())
    # remove single quotes, lower the strings
    tokens = list(map(lambda w: w.replace("'", "").lower(), tokens))
    tokens = [w for w in tokens if not w in stop_words]
    return ','.join(tokens)

# tokenize title and body
for t,s in sections:
    globals()[t]['title_tokenized'] = globals()[t].apply(lambda row: get_tokenized_text(row['title']), axis = 1)
    globals()[t]['body_tokenized'] = globals()[t].apply(lambda row: get_tokenized_text(row['body']), axis = 1)

    

In [5]:
sports.describe()
sports.head(5)

Unnamed: 0,url,title,body,title_tokenized,body_tokenized
0,https://www.nytimes.com/2018/04/26/arts/design...,10 Galleries to Visit Now on the Upper East Side,<!DOCTYPE html><!--[if (gt IE 9)|!(IE)]> <!-->...,"10,galleries,visit,upper,east,side","10,galleries,visit,upper,east,side,-,new,york,..."
1,https://www.nytimes.com/aponline/2018/04/26/sp...,"The Latest: Lions, Bengals Each Draft Centers",<!DOCTYPE html><!--[if (gt IE 9)|!(IE)]> <!-->...,"latest,:,lions,,,bengals,draft,centers","latest,:,packers,cowboys,add,defensive,stars,-..."
2,https://www.nytimes.com/2018/04/26/sports/nfl-...,NFL Draft 2018 Live: Round 1 Pick-by-Pick Updates,<!DOCTYPE html><!--[if (gt IE 9)|!(IE)]> <!-->...,"nfl,draft,2018,live,:,round,1,pick-by-pick,upd...","nfl,draft,2018,live,:,round,1,pick-by-pick,upd..."
3,https://www.nytimes.com/2018/04/26/nyregion/la...,"The Lawyer at the Side of de Blasio, Cuomo and...",<!DOCTYPE html><!--[if (gt IE 9)|!(IE)]> <!-->...,"lawyer,side,de,blasio,,,cuomo,conor,mcgregor","lawyer,side,de,blasio,,,cuomo,conor,mcgregor,-..."
4,https://www.nytimes.com/aponline/2018/04/26/sp...,"Game 6s on Tap: LeBron, Raptors, Jazz Look to ...",<!DOCTYPE html><!--[if (gt IE 9)|!(IE)]> <!-->...,"game,6s,tap,:,lebron,,,raptors,,,jazz,look,adv...","game,6s,tap,:,lebron,,,raptors,,,jazz,look,adv..."


In [52]:
# test-train split
from sklearn.model_selection import train_test_split

for df,file in sections:
    train, test = train_test_split(globals()[df], test_size=0.5)
    globals()[df + "_train"] = train
    globals()[df + "_test"] = test

print(sports_train.head(2))
print(business_test.head(2))

                                                   url  \
369  https://www.nytimes.com/2018/04/10/sports/hock...   
573  https://www.nytimes.com/2018/03/18/world/europ...   

                                                 title  \
369  Built to Win, the Golden Knights Did So Sooner...   
573  A Wrestling Culture That Helps Keep Boys Away ...   

                                                  body  \
369  <!DOCTYPE html><!--[if (gt IE 9)|!(IE)]> <!-->...   
573  <!DOCTYPE html><!--[if (gt IE 9)|!(IE)]> <!-->...   

                                     title_tokenized  \
369       built,win,,,golden,knights,sooner,expected   
573  wrestling,culture,helps,keep,boys,away,fighting   

                                        body_tokenized  
369  built,win,,,golden,knights,sooner,expected,-,n...  
573  wrestling,culture,helps,keep,boys,away,fightin...  
                                                   url  \
467  https://www.nytimes.com/2018/04/21/pageoneplus...   
410  https://www.ny

In [65]:
# create word vectors, across documents, for each section
from functools import reduce
def combine_words(w1, w2):
    ret = list(set(  (w1.split(',') if (w1 is not None) else []) + 
                     (w2.split(',') if (w2 is not None) else []) ))
    ret.sort()
    return ",".join(ret)

def reduce_words(dataframe):
    return reduce(lambda w1, w2: combine_words(w1,w2), dataframe.loc[:,'body_tokenized'])

for section,file in sections:
    globals()[section + "_words_train"] = reduce_words(globals()[section+"_train"])


In [73]:
print(type(business_words_train))

<class 'str'>


In [75]:
# clean up word tokens
#  remove any tokens less than 3 chars in size
#  remove quote char from word beginnings
tokens_vars = ['arts_words', 'business_words', 'obituaries_words', 'sports_words', 'world_words']
for tokens in tokens_vars:
    globals()[tokens + "_train_fil"] = list(filter(lambda word: len(word) > 3, globals()[tokens + "_train"].split(",")))
    
len(business_words_train_fil)

35222

In [76]:
for s,f in sections:
    print(len(globals()[s + "_words_train_fil"]))

49169
35222
33489
45287
37053


In [77]:
# collapse all section training words into a single vector
words_train = reduce(lambda v1,v2: v1 + v2, list(map(lambda s: globals()[s[0] + "_words_train_fil"], sections)))

len(words_train)

200220

In [None]:
# calculate probabilities from training sets
def calc_word_priors(alpha, beta, word_tokens, dataframe):
    word_priors = []
    num_documents = dataframe.size
    for word in word_tokens:
        count = (dataframe['title_tokenized'].str.contains("," + word + ",", regex = False)).sum()
        count = count + (dataframe['body_tokenized'].str.contains("," + word + ",", regex = False)).sum()
        prob = (count + alpha)/(num_documents + beta)
        word_priors.append(prob)
    return word_priors


import datetime
# find priors for all training words
words_priors = [[]] 
for section, file in sections:
    print(datetime.datetime.now().time(), ' calculating priors for ' + section)
    words_priors.append(calc_word_priors(1, 2, words_train, globals()[section + "_train"]))

20:36:54.324593  calculating priors for arts
20:58:50.816766  calculating priors for business
21:17:29.807737  calculating priors for obituaries


In [14]:
import math

def calc_log_odds(class_prior, base_class_prior):
    log_odds = math.log( (class_prior * (1 - base_class_prior)) / ( base_class_prior * (1 - class_prior) ) ) if (class_prior > 0.0) & (base_class_prior > 0.0) else 0
    return log_odds

def calc_base_class_weights(class_prior, base_class_prior):
    return math.log( (1 - class_prior) / (1 - base_class_prior) ) if (class_prior > 0.0) & (base_class_prior > 0.0) else 0
    
# calculate weights
words_theta = words_priors

base_class_theta = words_theta[1]

words_weights = [[calc_log_odds(theta, base_class_theta[index]) for index,theta in enumerate(class_theta)] for class_theta in words_theta[2:6]]

words_weights_base_class = [[calc_base_class_weights(theta, base_class_theta[index]) for index,theta in enumerate(class_theta)] for class_theta in words_theta[2:6]]
words_weights_base_class = list(map(sum, words_weights_base_class))

In [17]:
# test
def calc_class_prob(document_words, words_train, words_weights, words_weights_base_class):
    prob = []
    for doc_class in range(5):
        class_prob = []
        for word in words_train:
            word_index = document_words.index(word)
            if( word_index != -1):
                class_prob.append(words_weights[doc_class][word_index])
            else:
                class_prob.append(1 - words_weights[doc_class][word_index])
aaa        prob.ppend(sum(cl))
arts_words_test

SyntaxError: invalid syntax (<ipython-input-17-abb8b9e6f51f>, line 12)

In [26]:
print(len(words_weights[1]))

8507


In [29]:
# test
def calc_class_prob(document_words, words_train, words_theta):
    prob = []
    for doc_class in range(5):
        class_prob = []
        for word in words_train:
            try:
                word_index = words_train.index(word)
                if( word not in document_words):
                    class_prob.append(math.log(words_theta[doc_class + 1][word_index]))
                else:
                    class_prob.append(math.log(1 - words_theta[doc_class + 1][word_index]))
            except IndexError:
                print(doc_class, word_index)
        prob.append(sum(class_prob))
    return prob

print(calc_class_prob(arts_words_train, words_train, words_theta))

[-30014.99153768913, -29751.01455324569, -31099.8956275988, -28846.16115197452, -29871.516564088706]
