# EDA and Election Predictions

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [2]:
# Load the raw governor and senate poll tables. Later cells read the columns
# party, poll_id, fte_grade, state, office_type, created_at, answer and pct.
governor = pd.read_csv('polls/governor_polls.csv')
senate = pd.read_csv('polls/senate_polls.csv')

## Merging into combined polls dataframes

In [3]:
# Governor polls split by party. The Republican side carries the poll metadata;
# the Democratic side keeps only what is needed to join back on poll_id.
rep_gov = (
    governor
    .loc[governor['party'] == 'REP',
         ['poll_id', 'fte_grade', 'state', 'office_type', 'created_at', 'answer', 'pct']]
    .rename(columns={'answer': 'rep_candidate', 'pct': 'rep_pct'})
)
dem_gov = (
    governor
    .loc[governor['party'] == 'DEM', ['poll_id', 'answer', 'pct']]
    .rename(columns={'answer': 'dem_candidate', 'pct': 'dem_pct'})
)

In [4]:
# Senate polls split by party. The Republican side carries the poll metadata;
# the Democratic side keeps only what is needed to join back on poll_id.
rep_sen = (
    senate
    .loc[senate['party'] == 'REP',
         ['poll_id', 'fte_grade', 'state', 'office_type', 'created_at', 'answer', 'pct']]
    .rename(columns={'answer': 'rep_candidate', 'pct': 'rep_pct'})
)
dem_sen = (
    senate
    .loc[senate['party'] == 'DEM', ['poll_id', 'answer', 'pct']]
    .rename(columns={'answer': 'dem_candidate', 'pct': 'dem_pct'})
)

In [5]:
# Pair each Republican row with the Democratic row(s) sharing its poll_id, then
# stack the governor and senate results into one frame.
# NOTE(review): the join key is poll_id alone, so a poll_id that carries several
# candidates per party would yield every REP x DEM pairing - confirm this is intended.
combined_polls = pd.concat([
    rep_gov.merge(dem_gov, on='poll_id', how='inner'),
    rep_sen.merge(dem_sen, on='poll_id', how='inner'),
])

In [6]:
# mean function that places greater weight on higher quality polls
def _grade_weight(grade):
    """Return the averaging weight for a single pollster grade.

    A missing grade counts once, any grade containing 'A' counts three times,
    any other grade containing 'B' counts twice, and everything else counts once.
    """
    if pd.isna(grade):
        return 1
    if 'A' in grade:
        return 3
    if 'B' in grade:
        return 2
    return 1


def weighted_mean(df):
    """Grade-weighted mean of the Republican and Democratic poll percentages.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``fte_grade``, ``rep_pct`` and ``dem_pct``.

    Returns
    -------
    tuple of float
        ``(rep_mean, dem_mean)``. Both values are NaN when ``df`` has no rows
        (for example, a candidate with no matching polls) instead of raising
        ZeroDivisionError.
    """
    # One weight per row, computed in a single pass over the grade column.
    weights = np.array([_grade_weight(grade) for grade in df['fte_grade']], dtype=float)
    total_weight = weights.sum()

    # No rows means no weight to divide by; report "no data" as NaN.
    if total_weight == 0:
        return (float('nan'), float('nan'))

    rep_mean = np.dot(df['rep_pct'].to_numpy(dtype=float), weights) / total_weight
    dem_mean = np.dot(df['dem_pct'].to_numpy(dtype=float), weights) / total_weight
    return (float(rep_mean), float(dem_mean))

## Sentiment Analysis

In [7]:
# Load the social media posts; later cells read its `text` and `candidate` columns.
text_data = pd.read_csv('datasets/social_media_data.csv')

In [8]:
# Score every post with the VADER analyzer and keep its 'compound' value.
sa = SentimentIntensityAnalyzer()


def _compound_score(post):
    """Return the analyzer's 'compound' polarity score for one piece of text."""
    return sa.polarity_scores(post)['compound']


text_data['sentiment'] = text_data['text'].apply(_compound_score)

In [9]:
# Per-candidate mean of the numeric columns (the `sentiment` score).
# numeric_only=True is needed on pandas >= 2.0, where GroupBy.mean no longer
# silently drops non-numeric columns such as `text` and raises TypeError instead.
avg_sent = text_data.groupby('candidate').mean(numeric_only=True)

In [10]:
# Keep last names only so the index can be compared with the poll candidate
# columns; 'Cortez Masto' is the one exception and is kept whole.
short_names = []
for full_name in avg_sent.index:
    if full_name != 'Cortez Masto':
        full_name = full_name.split()[-1]
    short_names.append(full_name)
avg_sent.index = short_names

In [11]:
# Hand-entered party label ('d' or 'r') for each of the 40 rows of avg_sent,
# assigned by position in the frame's current row order.
# NOTE(review): nothing ties a label to a candidate name - verify the list
# against avg_sent.index whenever the input data changes.
avg_sent['party'] = ['d', 'r', 'r', 'd', 'd', 'r', 'r', 'd', 'r', 'd', 'd', 'r', 'r', 'd', 'd', 'd', 'r', 'r', 'r', 'r', 
                     'd', 'd', 'd', 'r', 'r', 'r', 'r', 'r', 'r', 'd', 'd', 'd', 'd', 'r', 'd', 'd', 'r', 'd', 'd', 'r']

In [12]:
# Order the rows by party label so every 'd' row comes before every 'r' row.
avg_sent = avg_sent.sort_values(by='party')

In [13]:
def _party_poll_means(party, candidate_col, position):
    """Weighted poll mean for each candidate of `party`, in avg_sent row order.

    `candidate_col` is the combined_polls column holding that party's candidate
    names; `position` picks the party's slot in weighted_mean's (rep, dem) tuple.
    """
    names = avg_sent[avg_sent['party'] == party].index
    means = []
    for name in names:
        candidate_polls = combined_polls[combined_polls[candidate_col] == name]
        means.append(weighted_mean(candidate_polls)[position])
    return means


dem_means = _party_poll_means('d', 'dem_candidate', 1)
rep_means = _party_poll_means('r', 'rep_candidate', 0)

In [14]:
# avg_sent is sorted with every 'd' row ahead of every 'r' row, so the Democratic
# means followed by the Republican means line up with the row order.
# Build a new list instead of extending dem_means in place: re-running this cell
# would otherwise append rep_means a second time and fail on a length mismatch.
avg_sent['weighted_mean'] = dem_means + rep_means

In [15]:
# Hand-entered `result` value for each of the 40 rows, assigned by position in the
# party-sorted row order (the first 20 values land on the 'd' rows, the last 20
# on the 'r' rows).
# NOTE(review): presumably each candidate's final vote share in percent - confirm
# the source, and re-check the ordering against avg_sent.index if the data changes.
avg_sent['result'] = [55.9, 51.4, 49.5, 53.6, 37.2, 56.4, 56.5, 58.5, 52.9, 50.3, 45.9, 47.4, 51.2, 41.3, 46.7, 41.6, 48.9, 40.0, 51.4, 51.2,
                      50.5, 57.1, 57.7, 53.3, 43.2, 48.8, 47.8, 41.7, 48.0, 53.4, 49.7, 48.6, 39.2, 46.3, 62.8, 59.4, 44.4, 46.5, 41.3, 47.1]

In [16]:
# per-party average of every numeric column (sentiment, weighted_mean, result)
avg_sent.groupby('party').mean()

Unnamed: 0_level_0,sentiment,weighted_mean,result
party,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
d,0.127317,45.587761,49.39
r,0.074693,43.910246,49.34


In [17]:
# Export the ready-to-use frame (sentiment, party, weighted_mean and result per
# candidate, with the candidate name as the index) for reuse outside this notebook.
avg_sent.to_csv('datasets/processed_data.csv')

## Model Building

In [18]:
# Baseline linear model: the weighted poll average is the only feature.
X, y = avg_sent[['weighted_mean']], avg_sent['result']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,  # fixed seed keeps the split reproducible
)

In [19]:
# Fit ordinary least squares on the training split. fit() stays the last
# expression so the fitted estimator is displayed as the cell's output.
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

MSE and r-squared for baseline model that only uses poll averages

In [20]:
# Held-out performance: mean squared error first, then the score reported by
# the fitted estimator.
test_predictions = lr.predict(X_test)
print(metrics.mean_squared_error(y_test, test_predictions))
print(lr.score(X_test, y_test))

9.67032190604391
0.6586796634896843


In [21]:
# Second model: add the average sentiment score as a feature next to the polls.
X, y = avg_sent[['weighted_mean', 'sentiment']], avg_sent['result']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,  # same seed as the baseline cell
)

In [22]:
# Refit on the two-feature training split; this rebinds `lr`, replacing the
# baseline model. fit() stays last so the estimator is displayed as output.
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

MSE and r-squared for model incorporating social media data

In [23]:
# Held-out performance of the model that includes sentiment: mean squared error
# first, then the score reported by the fitted estimator.
test_predictions = lr.predict(X_test)
print(metrics.mean_squared_error(y_test, test_predictions))
print(lr.score(X_test, y_test))

9.69505803350916
0.6578065856922303


In [24]:
# excluding Tom Sherman
# NOTE(review): the exclusion is written as a sentiment cutoff (< 0.4) rather than
# by name - presumably Sherman is the only row at or above 0.4; confirm.
without_outlier = avg_sent[avg_sent['sentiment'] < 0.4]
X, y = without_outlier[['weighted_mean']], without_outlier['result']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Polls-only model refitted on the reduced data set.
lr = LinearRegression().fit(X_train, y_train)

test_predictions = lr.predict(X_test)
print(metrics.mean_squared_error(y_test, test_predictions))
print(lr.score(X_test, y_test))

7.290767873346304
0.8531747786607021
