In [19]:
import os, csv, math, random
import pandas as pd
import numpy as np

from collections import Counter

inaugural = pd.read_csv('data/inaugural_complete.csv')
inaugural.tail()

Unnamed: 0,urls,years,pres,texts,wordcounts,party,change,new counts,party_indices,change_indices
53,http://avalon.law.yale.edu/21st_century/gbush1...,2001,George W. Bush,"of ; January 20, 2001 President ...",1595.0,Republican,New_Party,1595,4,0
54,http://avalon.law.yale.edu/21st_century/gbush2...,2005,George W. Bush,"of ; January 20, 2005 Vice Presi...",2073.0,Republican,Incumbent_Party,2073,4,1
55,http://avalon.law.yale.edu/21st_century/obama.asp,2009,Barack Obama,"of ; January 20, 2009 My fellow ...",2417.0,Democrat,New_Party,2417,2,0
56,,2013,Barack Obama,Thank you. Thank you so much. Vice President ...,,Democrat,Incumbent_Party,2090,2,1
57,,2017,Donald J. Trump,"Chief Justice Roberts, President Carter, Presi...",,Republican,"New_Party,",1456,4,0


## perform a bayes analysis, but first simplify data frame

In [11]:
def inaugural_test(a_data_frame, rowidx):
    if 'Incumbent_Party' in a_data_frame['change'][rowidx]:
        return 'incumbent'
    elif 'New_Party' in a_data_frame['change'][rowidx]:
        return 'new_party'
    else:
        return 'other'
    
address_text = inaugural['texts']

party_change = []
fold = []
for idx in inaugural.index:
    party_change.append(inaugural_test(inaugural, idx))
    fold.append(random.sample(list(range(5)), 1)[0])
party_change = pd.Series(party_change, index = inaugural.index)
fold = pd.Series(fold, index = inaugural.index)

# print(address_text)
# print(fold)
# print(party_change)

inaug_df = pd.concat([address_text, party_change, fold], axis = 1)
inaug_df.columns = ['text', 'change', 'fold']

# limit the dataframe to columns with either android or iphone;
# exclude 'other'
inaug_df = inaug_df[(inaug_df['change'] == 'incumbent') | (inaug_df['change'] == 'new_party')]

inaug_df.head()


Unnamed: 0,text,change,fold
0,"of THE CITY OF NEW YORK THURSDAY, APRIL 3...",new_party,1
1,"of THE CITY OF PHILADELPHIA MONDAY, MARCH...",incumbent,0
2,© 2008 Lillian Goldman Law Library 127 Wall St...,new_party,3
3,"March 4, 1801 FRIENDS AND FELLOW-...",incumbent,4
4,"March 4, 1805 Proceeding, fellow citi...",incumbent,4


In [12]:
# make training set and test set

testset = inaug_df[inaug_df['fold'] == 4]
trainingset = inaug_df[inaug_df['fold'] != 4]
print('Training set includes ' + str(trainingset.shape[0]))
print('Test set includes ' + str(testset.shape[0]))

Training set includes 48
Test set includes 10


In [13]:
def tokenize(astring):
    ''' Breaks a string into words, and counts them.
    Designed so it strips punctuation and lowercases everything,
    but doesn't separate hashtags and at-signs.
    '''
    wordcounts = Counter()
    # create a counter to hold the counts
    
    tokens = astring.split()
    for t in tokens:
        word = t.strip(',.!?:;-—()<>[]/"\'').lower()
        wordcounts[word] += 1
        
    return wordcounts

def create_vocab(seq_of_strings, n):
    ''' Given a sequence of text snippets, this function
    returns the n most common words. We'll use this to
    create a limited 'vocabulary'.
    '''
    vocab = Counter()
    for astring in seq_of_strings:
        counts = tokenize(astring)
        vocab = vocab + counts
    topn = [x[0] for x in vocab.most_common(n)]
    return topn

In [16]:
def categorize(df, rowidx):
    if df.loc[rowidx, 'change'] == 'new_party':
        return 'positive'
    elif df.loc[rowidx, 'change'] == 'incumbent':
        return 'negative'
    else:
        print('error: neither new_party nor incumbent')
        return 'other'

def get_priors(df):
    source_counts = df.groupby('change').count()['text']
    print(source_counts)
    positive_odds = source_counts['new_party'] / source_counts['incumbent']
    negative_odds = source_counts['incumbent'] / source_counts['new_party']
    return math.log(positive_odds), math.log(negative_odds)

def train_nb_model(df, p):
    vocab = create_vocab(df['text'], p)
    vocabset = set(vocab)
    # we make a set because membership-checking is faster
    # in sets; but we also hold onto the list, which is ordered
    
    positive_prior, negative_prior = get_priors(df)
    
    positive_counts = Counter()
    negative_counts = Counter()
    
    for i in df.index:
        address = df['text'][i]
        address_counts = tokenize(address)
        category = categorize(df, i)
        if category == 'negative':
            negative_counts = negative_counts + address_counts
        elif category == 'positive':
            positive_counts = positive_counts + address_counts
    
    # Now let's organize these Counters into a DataFrame
    
    negative = pd.Series(1, index = vocab)
    positive = pd.Series(1, index = vocab)
    # notice initializing to 1 -- Laplacian smoothing
    
    for word, count in positive_counts.items():
        if word in vocabset:
            positive[word] += count
    
    for word, count in negative_counts.items():
        if word in vocabset:
            negative[word] += count
    
    all_prob = (negative + positive) / (np.sum(negative) + np.sum(positive))
    
    negative_prob = negative / np.sum(negative)
    positive_prob = positive / np.sum(positive)
    
    # note that when we sum up the negative and positive
    # columns, we are also summing up all the Laplacian 1's
    # we initially added to them
    
    model = pd.concat([negative, positive, all_prob, 
                       negative_prob, positive_prob], axis = 1) 
        
    model.columns = ['neg', 'pos', 'all_prob', 'neg_prob', 'pos_prob']
    
    # The next step is unnecessary, and will not be found in
    # most published versions of naive Bayes. I'm providing it
    # because it may help you understand the logic of the
    # algorithm.
    
    model['neg_norm'] = negative_prob / all_prob
    model['pos_norm'] = positive_prob / all_prob
    
    
    model['log_neg'] = [math.log(x) for x in model['neg_norm']]
    model['log_pos'] = [math.log(x) for x in model['pos_norm']]
    return vocab, positive_prior, negative_prior, model

vocab, positive_prior, negative_prior, model = train_nb_model(trainingset, 1500)
model.head() 
        

change
incumbent    33
new_party    15
Name: text, dtype: int64


Unnamed: 0,neg,pos,all_prob,neg_prob,pos_prob,neg_norm,pos_norm,log_neg,log_pos
the,5493,3040,0.086479,0.086432,0.086565,0.999451,1.000994,-0.00055,0.000994
of,3803,2200,0.060839,0.05984,0.062646,0.983584,1.029708,-0.016552,0.029275
and,2802,1444,0.043032,0.044089,0.041119,1.02457,0.955536,0.024273,-0.045483
to,2349,1341,0.037397,0.036961,0.038186,0.988349,1.021086,-0.01172,0.020866
in,1561,792,0.023847,0.024562,0.022553,1.029993,0.945721,0.029552,-0.055808


In [17]:
pd.options.mode.chained_assignment = None

def apply_model(vocab, positive_prior, negative_prior, model, testset):
    right = 0
    wrong = 0
    vocabset = set(vocab)
    odds_pos = []
    odds_neg = []

    for i in testset.index:
        odds_positive = positive_prior
        odds_negative = negative_prior
        tweet = testset['text'][i]
        tweet_counts = tokenize(tweet)
        for word, count in tweet_counts.items():
            if word not in vocabset:
                continue
            odds_positive += model.loc[word, 'log_pos']
            odds_negative += model.loc[word, 'log_neg']
            
        if odds_positive > odds_negative:
            prediction = 'positive'
        else:
            prediction = 'negative'
        
        odds_pos.append(odds_positive)
        odds_neg.append(odds_negative)

        reality = categorize(testset, i)
        if reality != 'positive' and reality != 'negative':
            continue
        elif prediction == reality:
            right += 1
        else:
            wrong += 1

    print("Got " + str(right) + " rows right, and " + str(wrong) + " wrong.")
    accuracy = (right / (wrong + right)) * 100
    print("Accuracy was {0:.2f}%".format(accuracy))
    
    resultset = testset.copy()
    resultset['odds_positive'] = odds_pos
    resultset['odds_negative'] = odds_neg
    resultset = resultset.sort_values(by = 'odds_positive')
    
    return resultset, accuracy

newtestset, accuracy = apply_model(vocab, positive_prior, 
                         negative_prior, model, testset)

Got 2 rows right, and 8 wrong.
Accuracy was 20.00%


In [18]:
newtestset.tail()

Unnamed: 0,text,change,fold,odds_positive,odds_negative
57,"Chief Justice Roberts, President Carter, Presi...",new_party,4,-16.818832,-2.523418
4,"March 4, 1805 Proceeding, fellow citi...",incumbent,4,-7.696757,-16.587717
16,"of FRIDAY, MARCH 4, 1853 My Countrymen: ...",new_party,4,-7.309068,-29.751812
3,"March 4, 1801 FRIENDS AND FELLOW-...",incumbent,4,-2.619296,-18.475073
15,"of MONDAY, MARCH 5, 1849 Elected by the A...",new_party,4,1.784165,-14.894785
