In [79]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import re
from collections import defaultdict

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to /home/lnnersji/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/lnnersji/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/lnnersji/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/lnnersji/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [80]:
# I'm combining the training and testing data bc the algorithm I used splits it in a way I don't understand lol
df = pd.read_csv("train.tsv", header=None, sep="\t", names=[
    'ID', 'Label', 'Statement', 'Subject', 'Speaker', 'Job', 'State', 'Party',
    'Barely True', 'False', 'Half True', 'Mostly True', 'Pants On Fire', 'Context'])
test_df = df = pd.read_csv("test.tsv", header=None, sep="\t", names=[
    'ID', 'Label', 'Statement', 'Subject', 'Speaker', 'Job', 'State', 'Party',
    'Barely True', 'False', 'Half True', 'Mostly True', 'Pants On Fire', 'Context'])
df = pd.concat([df, test_df])
df = df.dropna()
df.head()

Unnamed: 0,ID,Label,Statement,Subject,Speaker,Job,State,Party,Barely True,False,Half True,Mostly True,Pants On Fire,Context
0,11972.json,true,Building a wall on the U.S.-Mexico border will...,immigration,rick-perry,Governor,Texas,republican,30,30,42,23,18,Radio interview
1,11685.json,false,Wisconsin is on pace to double the number of l...,jobs,katrina-shankland,State representative,Wisconsin,democrat,2,1,0,0,0,a news conference
2,11096.json,false,Says John McCain has done nothing to help the ...,"military,veterans,voting-record",donald-trump,President-Elect,New York,republican,63,114,51,37,61,comments on ABC's This Week.
3,5209.json,half-true,Suzanne Bonamici supports a plan that will cut...,"medicare,message-machine-2012,campaign-adverti...",rob-cornilles,consultant,Oregon,republican,1,1,3,1,1,a radio show
4,9524.json,pants-fire,When asked by a reporter whether hes at the ce...,"campaign-finance,legal-issues,campaign-adverti...",state-democratic-party-wisconsin,,Wisconsin,democrat,5,7,2,2,7,a web video


In [83]:
# pre-processing statements for classification algorithms

corpus = df['Statement'].copy()  # sample list
vectorizer = TfidfVectorizer()
lemmatizer = WordNetLemmatizer()
corpus_normalized = []

# normalize the data
for s in corpus:
    s = re.sub(r'[^a-zA-Z\s]', ' ', s, re.I|re.A)  # remove special characters
    s = s.lower()  # switch to lowercase
    s = s.strip()
    tokens = word_tokenize(s)
    tokens_normalized = []
    for t in tokens:
        if t not in stopwords.words():
            tokens_normalized.append(lemmatizer.lemmatize(t, pos='v'))
    s = ' '.join(tokens_normalized)
    corpus_normalized.append(s)
    
target = []
for l in df['Label']:
    if l == 'pants-fire': target.append(0)
    elif l == 'false': target.append(1)
    elif l == 'barely-true': target.append(2)
    elif l == 'half-true': target.append(3)
    elif l == 'mostly-true': target.append(4)
    elif l == 'true': target.append(5)
    
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(corpus_normalized,target,test_size=0.2)
    
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

Tfidf_vect = TfidfVectorizer()
Tfidf_vect.fit(corpus_normalized)
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [84]:
# Naive Bayes Classification Algorithm
# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)

# predict the labels on validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf)

# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)

Naive Bayes Accuracy Score ->  57.98816568047337


In [12]:
#  Truth breakdown of each speaker
list_speakers = list(dict.fromkeys(df.dropna()['Speaker']))

for s in list_speakers:
    is_s = df['Speaker'] == s
    
    score = 0
    total_count = 0
    
    for l in df[is_s]['Label']:
        total_count = total_count + 5
        if l == 'pants-fire': pass
        elif l == 'false': score = score + 1
        elif l == 'barely-true': score = score + 2
        elif l == 'half-true': score = score + 3
        elif l == 'mostly-true': score = score + 4
        elif l == 'true': score = score + 5
            
    truth_score = (score / total_count) * 100
    
    print('{}: {}%'.format(s, truth_score))

mary-landrieu: 60.0%
tom-steyer: 40.0%
gwen-moore: 58.82352941176471%
david-dewhurst: 45.51724137931035%
butch-conway: 100.0%
garnet-coleman: 50.0%
rafael-cruz: 40.0%
dean-cannon: 70.0%
mike-fasano: 80.0%
gary-schaer: 100.0%
sondy-pope: 53.333333333333336%
bill-cassidy: 20.0%
reid-ribble: 10.0%
christopher-blazejewski: 73.33333333333333%
texans-economic-development: 80.0%
jill-stein: 40.0%
robin-schimminger: 20.0%
gretchen-carlson: 20.0%
national-taxpayers-union: 40.0%
tim-curtis: 40.0%
martin-omalley: 60.0%
ed-lindsey: 95.0%
eddie-lucio-jr: 80.0%
al-gore: 70.0%
eric-eisnaugle: 20.0%
national-organization-marriage: 10.0%
raymond-lahood: 53.333333333333336%
betty-mccollum: 100.0%
dennis-baxley: 60.0%
scott-randolph: 40.0%
renee-unterman: 60.0%
terry-gorman: 70.0%
jimmy-massie: 40.0%
our-city-our-safety-our-choice: 60.0%
david-sewell: 60.0%
charlie-gonzalez: 40.0%
bruce-starr: 20.0%
patrick-mchenry: 60.0%
andy-gardiner: 20.0%
donna-edwards: 20.0%
eleanor-holmes-norton: 60.0%
john-boehner

dave-hunt: 100.0%
dave-joyce: 100.0%
nancy-pelosi: 48.69565217391305%
roberto-dasilva: 60.0%
dan-micciche: 80.0%
arthur-cyr: 20.0%
shirley-turner: 100.0%
elliott-naishtat: 80.0%
colleen-conley: 60.0%
robert-bennett: 60.0%
lou-ann-zelenik: 20.0%
now-or-never-pac: 20.0%
holly-turner: 60.0%
dwight-jones: 50.0%
brian-kemp: 60.0%
armond-budish: 68.0%
dennis-moss: 20.0%
charlotte-lehan: 50.0%
alan-grayson: 63.33333333333333%
kevin-coughlin: 20.0%
kirk-cox: 100.0%
ann-kirkpatrick: 60.0%
babur-lateef: 60.0%
bob-hackworth: 20.0%
scott-hassett: 20.0%
bob-tiernan: 60.0%
rob-teilhet: 30.0%
mark-daniels: 20.0%
richard-morrison: 60.0%
mike-collins: 70.0%
bill-post: 0.0%
reince-priebus: 44.21052631578947%
michele-walsh: 100.0%
texas-liberty-pac: 20.0%
alfredo-gutierrez: 80.0%
diane-black: 100.0%
our-oregon: 80.0%
steny-hoyer: 65.0%
mike-keown: 60.0%
louie-gohmert: 27.500000000000004%
samuel-wurzelbacher: 10.0%
jeff-morales: 80.0%
senate-republican-conference: 50.0%
centers-disease-control: 80.0%
roge

joe-morrissey: 60.0%
basic-rights-oregon: 20.0%
johnny-isakson: 63.07692307692307%
delia-garza: 20.0%
susan-collins: 100.0%
vincent-fort: 48.0%
linda-finn: 80.0%
nicholas-kettle: 40.0%
charles-koch: 30.0%
alison-littell-mchose: 20.0%
patrick-leahy: 60.0%
doug-whitsett: 66.66666666666666%
martavius-jones: 100.0%
bob-wirch: 50.0%
bill-white: 60.0%
sandra-williams: 60.0%
hetty-rosenstein: 60.0%
bill-montford: 80.0%
ernest-moniz: 80.0%
gary-lambert: 0.0%
chris-koster: 90.0%
mike-dovilla: 60.0%
jeff-judson: 80.0%
gerard-robinson: 20.0%
mary-taylor: 50.0%
tom-leppert: 90.0%
daniel-grace: 20.0%
richard-lugar: 80.0%
zoe-lofgren: 80.0%
david-porter: 60.0%
scott-maddox: 60.0%
robert-walsh: 0.0%
robert-hurt: 36.0%
sharron-angle: 36.0%
hillary-clinton: 66.94560669456067%
jim-francesconi: 50.0%
planned-parenthood-action-fund: 40.0%
scott-fitzgerald: 30.0%
howard-dean: 42.5%
charlie-hales: 64.0%
ron-paul: 54.83870967741935%
ellen-troxclair: 0.0%
frank-luntz: 60.0%
joseph-mcnamara: 33.33333333333333%

albio-sires: 80.0%
mary-margaret-oliver: 80.0%
patti-doyle: 80.0%
randy-neugebauer: 0.0%
ron-maag: 30.0%
dannel-malloy: 60.0%
jay-wiley: 80.0%
kasim-reed: 70.58823529411765%
jill-chambers: 0.0%
saxby-chambliss: 63.63636363636363%
john-skvarla: 40.0%
jorge-ramos: 60.0%
donna-nesselbush: 60.0%
eugene-mckenna: 90.0%
kevin-otoole: 20.0%
matt-wingard: 60.0%
jim-keffer: 100.0%
robert-barber: 20.0%
lon-burnam: 30.0%
maciver-institute: 60.0%
rob-portman: 70.58823529411765%
tamara-holder: 80.0%
carlos-curbelo: 50.0%
doreen-costa: 60.0%
tavis-smiley: 40.0%
anthony-bucco: 40.0%
xavier-becerra: 90.0%
james-florio: 90.0%
clay-pell: 70.0%
katherine-cloonen: 0.0%
bryan-underwood: 80.0%
dan-gecker: 60.0%
chris-devaney: 60.0%
ted-cruz: 44.516129032258064%
state-rep-brian-clem-d-salem: 100.0%
rudy-giuliani: 52.22222222222223%
ted-kanavas: 60.0%
matt-caldwell: 60.0%
united-fair-economy: 80.0%
mindy-montford: 20.0%
national-association-manufacturers: 60.0%
frank-annunziato: 60.0%
workers-voice: 10.0%
ther

kate-brown: 40.0%
richard-pan: 20.0%
andy-craig: 60.0%
robert-singer: 40.0%
loren-collins: 100.0%
steve-latourette: 73.33333333333333%
tom-mechler: 60.0%
robby-mook: 80.0%
david-rivera: 20.0%
tom-harkin: 53.333333333333336%
joe-lieberman: 60.0%
yvette-mcgee-brown: 100.0%
jay-nixon: 70.0%
julie-pace: 80.0%
miscellany-blue: 80.0%
kris-jordan: 90.0%
diane-hurley: 100.0%
jim-cooper: 100.0%
jackie-speier: 80.0%
rick-kriseman: 50.0%
dave-schwartz: 80.0%
baker-harrell: 80.0%
doug-collins: 80.0%
leon-panetta: 60.0%
lamar-smith: 60.0%
leo-berman: 20.0%
david-wu: 80.0%
kathleen-ford: 50.0%
cory-gardner: 40.0%
jack-trammell: 40.0%
jack-bogdanski: 100.0%
steve-kagen: 0.0%
gordon-challstrom: 60.0%
jeff-flake: 100.0%
robert-healey: 70.0%
leticia-van-de-putte: 54.0%
hugh-thompson: 100.0%
david-scott: 60.0%
ian-bremmer: 80.0%
joe-barton: 48.0%
bob-schieffer: 20.0%
john-boccieri: 100.0%
kurt-browning: 40.0%
mike-scott: 100.0%
george-turner: 100.0%
mary-jordan: 80.0%
jon-husted: 43.333333333333336%
rick