In [1]:
%matplotlib inline
import csv, requests, os
import pandas as pd
import numpy as np

## Get data from Google sheets

In [2]:
def make_regular_gsheet_url(doc_id, sheet_id):
    return f"https://docs.google.com/spreadsheets/d/{doc_id}/edit#gid={sheet_id}"

def make_csv_gsheet_url(doc_id, sheet_id):
    return f"https://docs.google.com/spreadsheets/d/{doc_id}/export?format=csv&id={doc_id}&gid={sheet_id}"


GOOGLE_SHEET_ID = '1bvRKCfu9iGllHsOolDjMtbGA_2COddQFoZ7I45Lyn6o'
print("Querying Doc:", make_regular_gsheet_url(GOOGLE_SHEET_ID, "0"))
response = requests.get(make_csv_gsheet_url(GOOGLE_SHEET_ID, "0"))
reader = csv.reader(response.text.splitlines())
header = next(reader)
df = pd.DataFrame(list(reader), columns=header, dtype=int)

# Remove rows when N/A is a filename
df = df[df['Filename'] != 'N/A']
df['filepath'] = 'speeches/' + df.Filename
df['file_exists'] = df['filepath'].apply(lambda x: os.path.isfile(x))
df.head()

Querying Doc: https://docs.google.com/spreadsheets/d/1bvRKCfu9iGllHsOolDjMtbGA_2COddQFoZ7I45Lyn6o/edit#gid=0


  df = pd.DataFrame(list(reader), columns=header, dtype=int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['filepath'] = 'speeches/' + df.Filename
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['file_exists'] = df['filepath'].apply(lambda x: os.path.isfile(x))


Unnamed: 0,Filename,State,Governor,Gender,Party,Type of Speech,New Gov?,2020 Contender?,Region,Trifecta Status,Trifecta,Best Transcript URL,Selector,Note,Lesser Transcript URL,New Best Transcript URL,filepath,file_exists
0,Alabama_Inaugural.txt,Alabama,Kay Ivey,Female,R,Inaugural,No,No,South,R trifecta,Trifecta,https://governor.alabama.gov/remarks-speeches/...,,,https://www.al.com/news/2019/01/the-full-text-...,,speeches/Alabama_Inaugural.txt,True
1,Alabama_SOTS.txt,Alabama,Kay Ivey,Female,R,State of the state,No,No,South,R trifecta,Trifecta,https://governor.alabama.gov/remarks-speeches/...,,,,https://governor.alabama.gov/remarks-speeches/...,speeches/Alabama_SOTS.txt,True
3,Alaska_SOTS.txt,Alaska,Mike Dunleavy,Male,R,State of the state,Yes,No,West,Divided government,Divided,https://gov.alaska.gov/newsroom/2019/01/22/201...,,,https://www.adn.com/politics/2019/01/23/watch-...,https://gov.alaska.gov/newsroom/2019/01/22/201...,speeches/Alaska_SOTS.txt,True
4,Arizona_Inaugural.txt,Arizona,Doug Ducey,Male,R,Inaugural,No,No,West,R trifecta,Trifecta,https://azgovernor.gov/governor/news/2019/01/g...,,,,,speeches/Arizona_Inaugural.txt,True
5,Arizona_SOTS.txt,Arizona,Doug Ducey,Male,R,State of the state,No,No,West,R trifecta,Trifecta,https://azgovernor.gov/governor/news/2019/01/g...,,,,https://azgovernor.gov/governor/news/2019/01/g...,speeches/Arizona_SOTS.txt,True


## Filter Data

In [3]:
df = df[df['Type of Speech'].isin(['State of the state','Both'])]
f"Dataset is {len(df)} speeches"

'Dataset is 50 speeches'

## Read Speeches

In [4]:
def get_speeches(df):
    speeches = []
    for path in df['filepath']:
        with open(path) as f:
            text = f.read()
            speeches.append(text)
    return speeches

speeches = get_speeches(df)

## Turn Speeches Into Vectors!

In [115]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

## YOU CAN EDIT THESE
y_columns = ['Party', 'Trifecta']
BINARY=False
NGRAM_RANGE=(1,1)
MIN_DF=0

vectorizer = CountVectorizer(
        stop_words='english', # 'english' if not custom list
        ngram_range=NGRAM_RANGE,
        binary=BINARY,
        min_df=MIN_DF
    )
X = vectorizer.fit_transform(speeches)
y = np.array(df['Party'])
# 1 is Republican
y = (y == 'R').astype('int') 

# Train Classifier
clf = MultinomialNB(alpha=1.0e-10, class_prior=None, fit_prior=True)
clf.fit(X, y)

# Test Classifier
# 5-fold cross-validation
scoring = ['accuracy', 'precision', 'recall', 'f1']
scores = cross_validate(clf, X, y, scoring=scoring, cv=4)
display(pd.DataFrame(scores).round(2))

pd.DataFrame(scores)[
    ['test_accuracy','test_precision','test_recall','test_f1']]\
    .mean().round(2)

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1
0,0.0,0.0,0.62,0.6,0.86,0.71
1,0.0,0.0,0.54,0.54,1.0,0.7
2,0.0,0.0,0.75,0.83,0.71,0.77
3,0.0,0.0,0.75,0.67,1.0,0.8


test_accuracy     0.66
test_precision    0.66
test_recall       0.89
test_f1           0.74
dtype: float64

In [116]:
df.Party.value_counts()

R    27
D    23
Name: Party, dtype: int64

## Peek inside the model (coeffeicients on each word)

https://fivethirtyeight.com/features/what-americas-governors-are-talking-about/

see "state-of-the-states.ipynb"

More on this to come as we learn about bayesian classifiers!

In [117]:
# Prior Probabilities
df.Party.value_counts()

R    27
D    23
Name: Party, dtype: int64

In [114]:
pd.DataFrame(np.concatenate((clf.feature_count_, clf.feature_log_prob_), axis=0),
            index=['d_count', 'r_count', 'd_log_proba', 'r_log_proba'],
            columns=vectorizer.get_feature_names_out()
            )\
    .T.sort_values(by='r_log_proba')\
    .head(10)

Unnamed: 0,d_count,r_count,d_log_proba,r_log_proba
invisible,2.0,0.0,-9.970655,-33.884369
compromises,3.0,0.0,-9.56519,-33.884369
comptroller,9.0,0.0,-8.466578,-33.884369
selves,2.0,0.0,-9.970655,-33.884369
conquer,3.0,0.0,-9.56519,-33.884369
consequences,8.0,0.0,-8.584361,-33.884369
screenings,3.0,0.0,-9.56519,-33.884369
scientists,4.0,0.0,-9.277508,-33.884369
consultant,2.0,0.0,-9.970655,-33.884369
convention,2.0,0.0,-9.970655,-33.884369


# Comparing Models

Vectorizing again here for convenience. Let's play with the parameters and see what it does to the performance of the classifier!

In [118]:
vectorizer = CountVectorizer(
        stop_words='english', # 'english' if not custom list
        ngram_range=(1,2),
        binary=False,
        min_df=0
    )

X = vectorizer.fit_transform(speeches)

Below, I run multiple kinds of classifiers.

In [119]:
# Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X,y)
scores = cross_validate(clf, X, y, scoring=scoring, cv=4)
pd.DataFrame(scores).describe().round(2)[1:3]

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1
mean,0.01,0.0,0.84,0.89,0.81,0.84
std,0.0,0.0,0.12,0.16,0.15,0.12


In [120]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(C=1e9, solver='lbfgs', max_iter=4000)
clf.fit(X,y)
scores = cross_validate(clf, X, y, scoring=scoring, cv=4)
pd.DataFrame(scores).describe().round(2)[1:3]

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1
mean,0.4,0.0,0.8,0.85,0.8,0.8
std,0.06,0.0,0.06,0.11,0.24,0.1


In [121]:
# Linear Support Vector Classification.
from sklearn.svm import LinearSVC
clf = LinearSVC()
clf.fit(X, y)
scores = cross_validate(clf, X, y, scoring=scoring, cv=4)
pd.DataFrame(scores).describe().round(2)[1:3]

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1
mean,0.01,0.0,0.76,0.78,0.77,0.77
std,0.0,0.0,0.07,0.02,0.19,0.09


In [122]:
# Multi-layer perceptron (a type of Neural Network ¯\_(ツ)_/¯)
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier()
clf.fit(X,y)
scores = cross_validate(clf, X, y, scoring=scoring, cv=4)
pd.DataFrame(scores).describe().round(2)[1:3]

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1
mean,4.25,0.01,0.58,0.56,1.0,0.72
std,0.16,0.0,0.03,0.02,0.0,0.02
