<a href="https://colab.research.google.com/github/joealcantara/NLP/blob/master/pres.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Presidents Code - Move to Python

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import numpy as np
import pandas as pd

from scipy import stats

import matplotlib
import matplotlib.pyplot as plt

import os
import nltk
from nltk import word_tokenize, sent_tokenize

from nltk.stem import *
from nltk.stem.snowball import SnowballStemmer
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# Initalise stemmer and lemmatizer
stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()

from collections import Counter

from datetime import date

# To show plots in notebook
%matplotlib inline  

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format', lambda x: '%.5f' % x)
pd.set_option('mode.chained_assignment','warn')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [0]:
def wordCount(text):
    wordDict = {}
    for sentence in text:
        for entry in sentence:
            keyTitle = entry[1]
            if keyTitle not in wordDict:
                wordDict[keyTitle] = 1
            else:
                count = wordDict[keyTitle]
                count = count + 1
                wordDict[keyTitle] = count
    return wordDict

In [0]:
def meanLengthSentence(text):
    total_length = 0
    for sent in text:
        total_length = total_length+len(sent)
    return total_length / len(text)

In [0]:
def preprocess(text):
    sentences = sent_tokenize(text)
    sentences = [word_tokenize(sent) for sent in sentences]
    sentences = [nltk.pos_tag(sent) for sent in sentences]
    return sentences

In [0]:
def strip_punctuation(text):
    table = str.maketrans(dict.fromkeys(':,.?'))
    s = text.translate(table)
    return s

In [0]:
def lexical_diversity(text):
    return len(set(text)) / len(text)

In [0]:
path = "/content/drive/My Drive/Data/presidents/data/"

pathReagan = path + 'ReaganSpeeches/'
pathBush = path + 'BushSpeeches/'
pathTrump = path + 'TrumpSpeeches/'

In [0]:
# Create Data Frames for the datasets. I am including one data frame for 2 terms of Reagan
# and 2 separate dataframes for each term.
dfReagan = pd.DataFrame()
dfReaganTerm1 = pd.DataFrame()
dfReaganTerm2 = pd.DataFrame()
dfBush = pd.DataFrame()
dfTrump = pd.DataFrame()
LIWC = pd.DataFrame()

In [0]:
for filename in os.listdir(pathReagan):
    if filename.endswith('txt'):
        f = open(pathReagan + filename)
        raw = f.read()
        # Clear raw of punctuation and tokenize for word counts.
        wordsNoPunct = strip_punctuation(raw)
        #hesitations = wordsNoPunct.count('—')
        wordsNoPunct.replace("—", ' ')
        wordsNoPunct = word_tokenize(wordsNoPunct)
        words = word_tokenize(raw)
        tokens_stemmed = [stemmer.stem(x) for x in words]
        newlist = []
        for word in words:
          x = lemmatizer.lemmatize(word)
          newlist.append(x)
        UniqueLemmas = len(set(newlist))
    
        # Word Counts for certain words
        c = Counter(words)
        Fillers = c['well'] + c['so'] + c['basically'] + c['actually'] + c['literally'] + c['um'] + c['ah']
        NSNouns = c['something'] + c['anything'] + c['thing'] + c['everything']
        LIVerbs = c['be'] + c['come'] + c['do'] + c['get'] + c['give'] + c['go'] + c['have'] + c['know'] + c['look']
        + c['make'] + c['see'] + c['tell'] + c['think'] + c['want']
        
        sents = sent_tokenize(raw)
        processed = preprocess(raw)
        lex = lexical_diversity(wordsNoPunct)
        mls = meanLengthSentence(processed)
        wordDict = wordCount(processed)
        thetuple = {'Filename': filename, 'TTR': lex,
                    'WordCount':len(wordsNoPunct), 
                    'UniqueWords':len(set(wordsNoPunct)),
                    'UniqueStems':len(set(tokens_stemmed)),
                    'UniqueLemmas':UniqueLemmas,
                    'MLU': mls, 'Fillers': Fillers,
                   'NSNouns': NSNouns, 'LIVerbs': LIVerbs}
        finalDict = {**thetuple, **wordDict}
        dfReagan = dfReagan.append(finalDict, ignore_index = True)
        reaganWords = words

In [0]:
for filename in os.listdir(pathBush):
    if filename.endswith('txt'):
        f = open(pathBush + filename)
        raw = f.read()
        # Clear raw of punctuation and tokenize for word counts.
        wordsNoPunct = strip_punctuation(raw)
        #hesitations = wordsNoPunct.count('-')
        wordsNoPunct.replace("-", ' ')
        wordsNoPunct = word_tokenize(wordsNoPunct)
        
        words = word_tokenize(raw)
        tokens_stemmed = [stemmer.stem(x) for x in words]
        newlist = []
        for word in words:
          x = lemmatizer.lemmatize(word)
          newlist.append(x)
        UniqueLemmas = len(set(newlist))

        # Word Counts for certain words
        c = Counter(words)
        Fillers = c['well'] + c['so'] + c['basically'] + c['actually'] + c['literally'] + c['um'] + c['ah']
        NSNouns = c['something'] + c['anything'] + c['thing']
        LIVerbs = c['be'] + c['come'] + c['do'] + c['get'] + c['give'] + c['go'] + c['have'] + c['know'] + c['look']
        + c['make'] + c['see'] + c['tell'] + c['think'] + c['want']
        
        sents = sent_tokenize(raw)
        processed = preprocess(raw)
        lex = lexical_diversity(wordsNoPunct)
        mls = meanLengthSentence(processed)
        wordDict = wordCount(processed)
        thetuple = {'Filename': filename, 'TTR': lex,
                    'WordCount':len(wordsNoPunct), 
                    'UniqueWords':len(set(wordsNoPunct)),
                    'UniqueStems':len(set(tokens_stemmed)),
                    'UniqueLemmas':UniqueLemmas,
                    'MLU': mls, 'Fillers': Fillers,
                   'NSNouns': NSNouns, 'LIVerbs': LIVerbs}
        finalDict = {**thetuple, **wordDict}
        dfBush = dfBush.append(finalDict, ignore_index = True)
        bushWords = words

In [0]:
for filename in os.listdir(pathTrump):
    if filename.endswith('txt'):
        f = open(pathTrump + filename)
        raw = f.read()
        # Clear raw of punctuation and tokenize for word counts.
        wordsNoPunct = strip_punctuation(raw)
        #hesitations = wordsNoPunct.count('-')
        wordsNoPunct.replace("-", ' ')
        wordsNoPunct = word_tokenize(wordsNoPunct)
        
        words = word_tokenize(raw)
        tokens_stemmed = [stemmer.stem(x) for x in words]
        newlist = []
        for word in words:
          x = lemmatizer.lemmatize(word)
          newlist.append(x)
        UniqueLemmas = len(set(newlist))

        # Word Counts for certain words
        c = Counter(words)
        Fillers = c['well'] + c['so'] + c['basically'] + c['actually'] + c['literally'] + c['um'] + c['ah']
        NSNouns = c['something'] + c['anything'] + c['thing']
        LIVerbs = c['be'] + c['come'] + c['do'] + c['get'] + c['give'] + c['go'] + c['have'] + c['know'] + c['look']
        + c['make'] + c['see'] + c['tell'] + c['think'] + c['want']
        
        sents = sent_tokenize(raw)
        processed = preprocess(raw)
        lex = lexical_diversity(wordsNoPunct)
        mls = meanLengthSentence(processed)
        wordDict = wordCount(processed)
        thetuple = {'Filename': filename, 'TTR': lex,
                    'WordCount':len(wordsNoPunct), 
                    'UniqueWords':len(set(wordsNoPunct)),
                    'UniqueStems':len(set(tokens_stemmed)),
                    'UniqueLemmas':UniqueLemmas,
                    'MLU': mls, 'Fillers': Fillers,
                   'NSNouns': NSNouns, 'LIVerbs': LIVerbs}
        finalDict = {**thetuple, **wordDict}
        dfTrump = dfTrump.append(finalDict, ignore_index = True)
        trumpWords = words

In [0]:
LIWC = pd.read_csv(path + "LIWC2015Results.csv")

In [0]:
dfReagan = pd.merge(dfReagan, LIWC, on='Filename', how='inner')
dfTrump = pd.merge(dfTrump, LIWC, on='Filename', how='inner')
dfBush = pd.merge(dfBush, LIWC, on='Filename', how='inner')

In [0]:
# Rearranging Columns
inserted_cols = ['Filename', 'TTR','WordCount', 'UniqueWords', 'UniqueStems', 'UniqueLemmas', 'MLU', 'Fillers', 'NSNouns', 'LIVerbs']
cols = ([col for col in inserted_cols if col in dfReagan] 
        + [col for col in dfReagan if col not in inserted_cols])
dfReagan = dfReagan[cols]

In [0]:
# Rearranging Columns
inserted_cols = ['Filename', 'TTR','WordCount', 'UniqueWords', 'UniqueStems', 'UniqueLemmas', 'MLU', 'Fillers', 'NSNouns', 'LIVerbs']
cols = ([col for col in inserted_cols if col in dfBush] 
        + [col for col in dfBush if col not in inserted_cols])
dfBush = dfBush[cols]

In [0]:
# Rearranging Columns
inserted_cols = ['Filename', 'TTR','WordCount', 'UniqueWords', 'UniqueStems', 'UniqueLemmas', 'MLU', 'Fillers', 'NSNouns', 'LIVerbs']
cols = ([col for col in inserted_cols if col in dfTrump] 
        + [col for col in dfTrump if col not in inserted_cols])
dfTrump = dfTrump[cols]

In [0]:
ReaganDates = pd.read_csv(path + "ReaganDates.csv")
BushDates = pd.read_csv(path + "BushDates.csv")
TrumpDates = pd.read_csv(path + "TrumpDates.csv")

In [0]:
import datetime

ReaganDates['JDate'] = [datetime.datetime.strptime(x, '%d/%m/%Y') for x in ReaganDates['Date']]
BushDates['JDate'] = [datetime.datetime.strptime(x, '%d/%m/%Y') for x in BushDates['Date']]
TrumpDates['JDate'] = [datetime.datetime.strptime(x, '%d/%m/%Y') for x in TrumpDates['Date']]
da = datetime.datetime(1981, 1, 29)
da = np.datetime64(da)
dc = datetime.datetime(1989, 1, 27)
dc = np.datetime64(dc)
de = datetime.datetime(2017, 1, 27)
de = np.datetime64(de)

In [0]:
ReaganDates['Days'] = ReaganDates['JDate'] - da
ReaganDates['Days'] = ReaganDates['Days'].dt.days
BushDates['Days'] = BushDates['JDate'] - dc
BushDates['Days'] = BushDates['Days'].dt.days
TrumpDates['Days'] = TrumpDates['JDate'] - de
TrumpDates['Days'] = TrumpDates['Days'].dt.days

In [0]:
dfReagan = ReaganDates.merge(dfReagan, on='Filename')
dfBush = BushDates.merge(dfBush, on='Filename')
dfTrump = TrumpDates.merge(dfTrump, on='Filename')

In [0]:
dfReagan = dfReagan.sort_values(by=['Days'])
dfBush = dfBush.sort_values(by=['Days'])
dfTrump = dfTrump.sort_values(by=['Days'])

In [0]:
# Calculate some new aggregate columns
dfReagan['Nouns'] = dfReagan['NN'] + dfReagan['NNS']+ dfReagan['NNP'] + dfReagan['NNPS']
dfReagan['Nouns/100'] = dfReagan['Nouns'] / 100
dfReagan['NounsNormalised'] = dfReagan['Nouns'] / dfReagan['WordCount']
dfReagan['Adjectives'] = dfReagan['JJ'] + dfReagan['JJR'] + dfReagan['JJS']
dfReagan['Adjectives/100'] = dfReagan['Adjectives'] / 100
dfReagan['AdjectivesNormalised'] = dfReagan['Adjectives'] / dfReagan['WordCount']
dfReagan['Adverbs'] = dfReagan['RB'] + dfReagan['RBR'] + dfReagan['RBS']
dfReagan['Adverbs/100'] = dfReagan['Adverbs'] / 100
dfReagan['AdverbsNormalised'] = dfReagan['Adverbs'] / dfReagan['WordCount']
dfReagan['Verbs'] = dfReagan['VB'] + dfReagan['VBD'] + dfReagan['VBG'] + dfReagan['VBN'] + dfReagan['VBP'] + dfReagan['VBZ']
dfReagan['Verbs/100'] = dfReagan['Verbs'] / 100
dfReagan['VerbsNormalised'] = dfReagan['Verbs'] / dfReagan['WordCount']
dfReagan['Pronouns'] = dfReagan['PRP'] + dfReagan['PRP$']
dfReagan['PronounsNormalised'] = dfReagan['Pronouns'] / dfReagan['WordCount']
dfReagan['UniqueWordsNormalised'] = dfReagan['UniqueWords'] / dfReagan['WordCount']
dfReagan['UniqueStemsNormalised'] = dfReagan['UniqueStems'] / dfReagan['WordCount']
dfReagan['UniqueLemmasNormalised'] = dfReagan['UniqueLemmas'] / dfReagan['WordCount']

In [0]:
# Calculate some new aggregate columns
dfBush['Nouns'] = dfBush['NN'] + dfBush['NNS']+ dfBush['NNP'] + dfBush['NNPS']
dfBush['Nouns/100'] = dfBush['Nouns'] / 100
dfBush['NounsNormalised'] = dfBush['Nouns'] / dfBush['WordCount']
dfBush['Adjectives'] = dfBush['JJ'] + dfBush['JJR'] + dfBush['JJS']
dfBush['Adjectives/100'] = dfBush['Adjectives'] / 100
dfBush['AdjectivesNormalised'] = dfBush['Adjectives'] / dfBush['WordCount']
dfBush['Adverbs'] = dfBush['RB'] + dfBush['RBR'] + dfBush['RBS']
dfBush['Adverbs/100'] = dfBush['Adverbs'] / 100
dfBush['AdverbsNormalised'] = dfBush['Adverbs'] / dfBush['WordCount']
dfBush['Verbs'] = dfBush['VB'] + dfBush['VBD'] + dfBush['VBG'] + dfBush['VBN'] + dfBush['VBP'] + dfBush['VBZ']
dfBush['Verbs/100'] = dfBush['Verbs'] / 100
dfBush['VerbsNormalised'] = dfBush['Verbs'] / dfBush['WordCount']
dfBush['Pronouns'] = dfBush['PRP'] + dfBush['PRP$']
dfBush['PronounsNormalised'] = dfBush['Pronouns'] / dfBush['WordCount']
dfBush['UniqueWordsNormalised'] = dfReagan['UniqueWords'] / dfReagan['WordCount']
dfBush['UniqueStemsNormalised'] = dfReagan['UniqueStems'] / dfReagan['WordCount']
dfBush['UniqueLemmasNormalised'] = dfBush['UniqueLemmas'] / dfBush['WordCount']

In [0]:
# Calculate some new aggregate columns
dfTrump['Nouns'] = dfTrump['NN'] + dfTrump['NNS']+ dfTrump['NNP'] + dfTrump['NNPS']
dfTrump['Nouns/100'] = dfTrump['Nouns'] / 100
dfTrump['NounsNormalised'] = dfTrump['Nouns'] / dfTrump['WordCount']
dfTrump['Adjectives'] = dfTrump['JJ'] + dfTrump['JJR'] + dfTrump['JJS']
dfTrump['Adjectives/100'] = dfTrump['Adjectives'] / 100
dfTrump['AdjectivesNormalised'] = dfTrump['Adjectives'] / dfTrump['WordCount']
dfTrump['Adverbs'] = dfTrump['RB'] + dfTrump['RBR'] + dfTrump['RBS']
dfTrump['Adverbs/100'] = dfTrump['Adverbs'] / 100
dfTrump['AdverbsNormalised'] = dfTrump['Adverbs'] / dfTrump['WordCount']
dfTrump['Verbs'] = dfTrump['VB'] + dfTrump['VBD'] + dfTrump['VBG'] + dfTrump['VBN'] + dfTrump['VBP'] + dfTrump['VBZ']
dfTrump['Verbs/100'] = dfTrump['Verbs'] / 100
dfTrump['VerbsNormalised'] = dfTrump['Verbs'] / dfTrump['WordCount']
dfTrump['Pronouns'] = dfTrump['PRP'] + dfTrump['PRP$']
dfTrump['PronounsNormalised'] = dfTrump['Pronouns'] / dfTrump['WordCount']
dfTrump['UniqueWordsNormalised'] = dfTrump['UniqueWords'] / dfTrump['WordCount']
dfTrump['UniqueStemsNormalised'] = dfTrump['UniqueStems'] / dfTrump['WordCount']
dfTrump['UniqueLemmasNormalised'] = dfTrump['UniqueLemmas'] / dfTrump['WordCount']

In [0]:
dfReaganTerm1 = dfReagan[dfReagan.JDate < pd.Timestamp(1985, 1, 31)]
dfReaganTerm2 = dfReagan[dfReagan.JDate > pd.Timestamp(1985, 1, 31)]

In [0]:
resultsReagan = pd.DataFrame()
columnsReagan = list(dfReagan)
resultsBush = pd.DataFrame()
columnsBush = list(dfBush)
resultsTrump = pd.DataFrame()
columnsTrump = list(dfTrump)

In [0]:
columnsReagan.remove('Filename')
columnsReagan.remove('Date')
columnsReagan.remove('JDate')

columnsBush.remove('Filename')
columnsBush.remove('Date')
columnsBush.remove('JDate')

columnsTrump.remove('Filename')
columnsTrump.remove('Date')
columnsTrump.remove('JDate')

# Fill NA's with 0s as in this dataset, NAN represent the feature NOT occuring in a particular document.
dfReagan = dfReagan.fillna(0)
dfBush = dfBush.fillna(0)
dfTrump = dfTrump.fillna(0)

In [0]:
from scipy.stats import pearsonr

for i in columnsReagan:
    r, p = pearsonr(dfReagan[i], dfReagan['Days'])
    pearsonResults = {'Feature': i, 'RSquared':r, 'P-Value': p}
    resultsReagan = resultsReagan.append(pearsonResults, ignore_index=True)

for i in columnsTrump:
    r, p = pearsonr(dfTrump[i], dfTrump['Days'])
    pearsonResults = {'Feature': i, 'RSquared':r, 'P-Value': p}
    resultsTrump = resultsTrump.append(pearsonResults, ignore_index=True)

for i in columnsBush:
    r, p = pearsonr(dfBush[i], dfBush['Days'])
    pearsonResults = {'Feature': i, 'RSquared':r, 'P-Value': p}
    resultsBush = resultsBush.append(pearsonResults, ignore_index=True)



In [0]:
def processForFDR(df):
  df = df.dropna()
  indexes = df[df['RSquared'] == 1].index
  df.drop(indexes, inplace=True)  
  return df

In [0]:
resultsReagan = processForFDR(resultsReagan)
resultsBush = processForFDR(resultsBush)
resultsTrump = processForFDR(resultsTrump)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [0]:
from statsmodels.stats.multitest import (multipletests, fdrcorrection,
                                         fdrcorrection_twostage,
                                         NullDistribution,
                                         local_fdr)

test = multipletests(resultsReagan['P-Value'], alpha=0.05, method = 'bonferroni', is_sorted=False, returnsorted=False)
resultsReagan['BN Accepted'] = test[0]
resultsReagan['BN Adjusted P-value'] = test[1]

test = multipletests(resultsReagan['P-Value'], alpha=0.05, method = 'holm', is_sorted=False, returnsorted=False)
resultsReagan['Holm Accepted'] = test[0]
resultsReagan['Holm Adjusted P-value'] = test[1]

test = multipletests(resultsReagan['P-Value'], alpha=0.05, method = 'fdr_bh', is_sorted=False, returnsorted=False)
resultsReagan['BH Accepted'] = test[0]
resultsReagan['BH Adjusted P-value'] = test[1]

test = multipletests(resultsReagan['P-Value'], alpha=0.05, method = 'fdr_by', is_sorted=False, returnsorted=False)
resultsReagan['BY Accepted'] = test[0]
resultsReagan['BY Adjusted P-value'] = test[1]

test = multipletests(resultsReagan['P-Value'], alpha=0.05, method = 'hommel', is_sorted=False, returnsorted=False)
resultsReagan['HM Accepted'] = test[0]
resultsReagan['HM Adjusted P-value'] = test[1]

test = multipletests(resultsReagan['P-Value'], alpha=0.05, method = 'simes-hochberg', is_sorted=False, returnsorted=False)
resultsReagan['HO Accepted'] = test[0]
resultsReagan['HO Adjusted P-value'] = test[1]

In [0]:
test = multipletests(resultsBush['P-Value'], alpha=0.05, method = 'bonferroni', is_sorted=False, returnsorted=False)
resultsBush['BN Accepted'] = test[0]
resultsBush['BN Adjusted P-value'] = test[1]

test = multipletests(resultsBush['P-Value'], alpha=0.05, method = 'holm', is_sorted=False, returnsorted=False)
resultsBush['Holm Accepted'] = test[0]
resultsBush['Holm Adjusted P-value'] = test[1]

test = multipletests(resultsBush['P-Value'], alpha=0.05, method = 'fdr_bh', is_sorted=False, returnsorted=False)
resultsBush['BH Accepted'] = test[0]
resultsBush['BH Adjusted P-value'] = test[1]

test = multipletests(resultsBush['P-Value'], alpha=0.05, method = 'fdr_by', is_sorted=False, returnsorted=False)
resultsBush['BY Accepted'] = test[0]
resultsBush['BY Adjusted P-value'] = test[1]

test = multipletests(resultsBush['P-Value'], alpha=0.05, method = 'hommel', is_sorted=False, returnsorted=False)
resultsBush['HM Accepted'] = test[0]
resultsBush['HM Adjusted P-value'] = test[1]

test = multipletests(resultsBush['P-Value'], alpha=0.05, method = 'simes-hochberg', is_sorted=False, returnsorted=False)
resultsBush['HO Accepted'] = test[0]
resultsBush['HO Adjusted P-value'] = test[1]

In [0]:
test = multipletests(resultsTrump['P-Value'], alpha=0.05, method = 'bonferroni', is_sorted=False, returnsorted=False)
resultsTrump['BN Accepted'] = test[0]
resultsTrump['BN Adjusted P-value'] = test[1]

test = multipletests(resultsTrump['P-Value'], alpha=0.05, method = 'holm', is_sorted=False, returnsorted=False)
resultsTrump['Holm Accepted'] = test[0]
resultsTrump['Holm Adjusted P-value'] = test[1]

test = multipletests(resultsTrump['P-Value'], alpha=0.05, method = 'fdr_bh', is_sorted=False, returnsorted=False)
resultsTrump['BH Accepted'] = test[0]
resultsTrump['BH Adjusted P-value'] = test[1]

test = multipletests(resultsTrump['P-Value'], alpha=0.05, method = 'fdr_by', is_sorted=False, returnsorted=False)
resultsTrump['BY Accepted'] = test[0]
resultsTrump['BY Adjusted P-value'] = test[1]

test = multipletests(resultsTrump['P-Value'], alpha=0.05, method = 'hommel', is_sorted=False, returnsorted=False)
resultsTrump['HM Accepted'] = test[0]
resultsTrump['HM Adjusted P-value'] = test[1]

test = multipletests(resultsTrump['P-Value'], alpha=0.05, method = 'simes-hochberg', is_sorted=False, returnsorted=False)
resultsTrump['HO Accepted'] = test[0]
resultsTrump['HO Adjusted P-value'] = test[1]

In [0]:
subsetReagan = pd.DataFrame()
subsetReagan = resultsReagan.loc[resultsReagan['P-Value'] < 0.05]
subsetReagan1 = resultsReagan.loc[resultsReagan['RSquared'] < -0.40]
subsetReagan2 = resultsReagan.loc[resultsReagan['RSquared'] > 0.40]
subsetReagan1 = subsetReagan1.sort_values(by=['RSquared'])
subsetReagan2 = subsetReagan2.sort_values(by=['RSquared'])
subsetReagan1.append(subsetReagan2)
ReaganNames = subsetReagan1['Feature'].values.tolist()

In [0]:
subsetBush = resultsBush.loc[resultsBush['P-Value'] < 0.05]
subsetBush1 = resultsBush.loc[resultsBush['RSquared'] < -0.40]
subsetBush2 = resultsBush.loc[resultsBush['RSquared'] > 0.40]
subsetBush1 = subsetBush1.sort_values(by=['RSquared'])
subsetBush2 = subsetBush2.sort_values(by=['RSquared'])
subsetBush1.append(subsetBush2)
BushNames = subsetBush1['Feature'].values.tolist()

In [0]:
subsetTrump = resultsTrump.loc[resultsTrump['P-Value'] < 0.05]
subsetTrump1 = resultsTrump.loc[resultsTrump['RSquared'] < -0.40]
subsetTrump2 = resultsTrump.loc[resultsTrump['RSquared'] > 0.40]
subsetTrump1 = subsetTrump1.sort_values(by=['RSquared'])
subsetTrump2 = subsetTrump2.sort_values(by=['RSquared'])
subsetTrump1.append(subsetTrump2)
TrumpNames = subsetTrump1['Feature'].values.tolist()

In [0]:
import plotly.express as px

for name in ReaganNames:
  fig = px.scatter(dfReagan, x = "Days", y = name, trendline="ols")
  fig.show()

In [0]:
for name in BushNames:
  fig = px.scatter(dfBush, x = "Days", y = name, trendline="ols")
  fig.show()

In [0]:
for name in TrumpNames:
  fig = px.scatter(dfTrump, x = "Days", y = name, trendline="ols", title='test')
  fig.show()

In [0]:
!pip install pygam
from pygam import LogisticGAM, LinearGAM

Collecting pygam
[?25l  Downloading https://files.pythonhosted.org/packages/13/be/775033ef08a8945bec6ad7973b161ca909f852442e0d7cfb8d1a214de1ac/pygam-0.8.0-py2.py3-none-any.whl (1.8MB)
[K     |████████████████████████████████| 1.8MB 2.4MB/s 
Installing collected packages: pygam
Successfully installed pygam-0.8.0


In [0]:
set_diff = set(dfBush.columns) - set(dfReagan.columns)

In [0]:
print(set_diff)

set()


In [0]:
dfBush = dfBush.drop(['SYM'], axis=1)

In [0]:
listOfColumns = list(dfBush.columns)

In [0]:
listOfColumns = listOfColumns[4:]

In [0]:
def tTests(columns, df1, df2):
  tTestResults = pd.DataFrame()
  for column in columns:
    t, p = stats.ttest_ind(df1[column], df2[column])
    atuple = {"column": column,
              "T-Value": t,
              "P-Value": p}
    tTestResults = tTestResults.append(atuple, ignore_index = True)
  return tTestResults

In [161]:
# T-Tests Reagan v Bush
b = tTests(listOfColumns, dfReagan, dfBush)
b

Unnamed: 0,P-Value,T-Value,column
0,0.05405,-1.93948,TTR
1,2e-05,4.42886,WordCount
2,0.0,6.7625,UniqueWords
3,0.0,7.56438,UniqueStems
4,0.0,6.88157,UniqueLemmas
5,0.0,13.45332,MLU
6,0.00702,2.72807,Fillers
7,0.0,9.74883,NSNouns
8,0.00801,2.68257,LIVerbs
9,0.0,8.56951,$


In [162]:
# T-Tests Reagan v Bush
c = tTests(listOfColumns, dfReagan, dfTrump)
c

Unnamed: 0,P-Value,T-Value,column
0,0.00012,-4.07653,TTR
1,0.0,6.48818,WordCount
2,0.0,11.27915,UniqueWords
3,0.0,11.86182,UniqueStems
4,0.0,11.24538,UniqueLemmas
5,0.0,25.88053,MLU
6,0.0194,2.39147,Fillers
7,5e-05,4.34139,NSNouns
8,0.00103,3.42242,LIVerbs
9,0.05866,1.92122,$


In [163]:
# T-Tests Reagan v Bush
d = tTests(listOfColumns, dfBush, dfTrump)
d

Unnamed: 0,P-Value,T-Value,column
0,0.13959,-1.48474,TTR
1,0.00427,2.89893,WordCount
2,6e-05,4.13634,UniqueWords
3,4e-05,4.22946,UniqueStems
4,6e-05,4.12376,UniqueLemmas
5,0.0,11.54463,MLU
6,0.41747,0.81293,Fillers
7,0.56631,-0.57469,NSNouns
8,0.10545,1.62824,LIVerbs
9,0.00322,-2.9917,$
