In [2]:
from functions import percentage_within_threshold

import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
data = pd.read_csv("final_data.csv")
data = data.dropna(axis=0)
data.head()
# features for training
train_columns = ['Year', 'State FIPS', 'District', 'Voter Turnout %', 'Median Age', 'Median income',
                 'Male Population %', 'Female Population %', 'White %', 'White (non Hispanic or Latino) %',
                 'Hispanic or Latino %', 'Black %', 'American Indian %', 'Asian %', 'Pacific %', 'Multiracial %',
                 'Less than HS %', 'HS grad %', 'Some college %', "Bachelor's degree %", "Graduate degree %",
                 'Below poverty line %', 'Income under 25k %', 'Income 25k to 50k %', 'Income 50k to 75k %',
                 'Income above 75k %']

X = data[data['Year'] != 2020][train_columns].values
y_democratic = data[data['Year'] != 2020]['Democratic %'].values
y_republican = data[data['Year'] != 2020]['Republican %'].values

scaler = StandardScaler()
X = scaler.fit_transform(X)

X_train, X_val, y_train_democratic, y_val_democratic, y_train_republican, y_val_republican = train_test_split(
    X, y_democratic, y_republican, test_size=0.2, random_state=42
)

In [4]:
rf_democratic = RandomForestRegressor(random_state=42)
rf_democratic.fit(X_train, y_train_democratic)

rf_republican = RandomForestRegressor(random_state=42)
rf_republican.fit(X_train, y_train_republican)

predictions_democratic_rf = rf_democratic.predict(X_val)
predictions_republican_rf = rf_republican.predict(X_val)

predictions_democratic_rf = np.clip(predictions_democratic_rf, 0, 100)
predictions_republican_rf = np.clip(predictions_republican_rf, 0, 100)

mse_democratic = mean_squared_error(y_val_democratic, predictions_democratic_rf)
mse_republican = mean_squared_error(y_val_republican, predictions_republican_rf)

accuracy_within_5_percent_democratic = percentage_within_threshold(y_val_democratic, predictions_democratic_rf, threshold_percent=5)
accuracy_within_10_percent_democratic = percentage_within_threshold(y_val_democratic, predictions_democratic_rf, threshold_percent=10)
accuracy_within_5_percent_republican = percentage_within_threshold(y_val_republican, predictions_republican_rf, threshold_percent=5)
accuracy_within_10_percent_republican = percentage_within_threshold(y_val_republican, predictions_republican_rf, threshold_percent=10)

print("RF Mean Squared Error for Democratic % on Validation Set:", mse_democratic)
print("RF Mean Squared Error for Republican % on Validation Set:", mse_republican)
print(f"RF Percentage of predictions within 5% for Democratic %: {accuracy_within_5_percent_democratic:.2f}%")
print(f"RF Percentage of predictions within 10% for Democratic %: {accuracy_within_10_percent_democratic:.2f}%")
print(f"RF Percentage of predictions within 5% for Republican %: {accuracy_within_5_percent_republican:.2f}%")
print(f"RF Percentage of predictions within 10% for Republican %: {accuracy_within_10_percent_republican:.2f}%")

RF Mean Squared Error for Democratic % on Validation Set: 247.1330302277992
RF Mean Squared Error for Republican % on Validation Set: 231.2745419953668
RF Percentage of predictions within 5% for Democratic %: 20.08%
RF Percentage of predictions within 10% for Democratic %: 40.93%
RF Percentage of predictions within 5% for Republican %: 25.10%
RF Percentage of predictions within 10% for Republican %: 42.86%


In [5]:
lr_democratic = LinearRegression()
lr_democratic.fit(X_train, y_train_democratic)

lr_republican = LinearRegression()
lr_republican.fit(X_train, y_train_republican)

predictions_democratic_lr = lr_democratic.predict(X_val)
predictions_republican_lr = lr_republican.predict(X_val)

predictions_democratic_lr = np.clip(predictions_democratic_lr, 0, 100)
predictions_republican_lr = np.clip(predictions_republican_lr, 0, 100)

mse_democratic = mean_squared_error(y_val_democratic, predictions_democratic_lr)
mse_republican = mean_squared_error(y_val_republican, predictions_republican_lr)

accuracy_within_5_percent_democratic = percentage_within_threshold(y_val_democratic, predictions_democratic_rf, threshold_percent=5)
accuracy_within_10_percent_democratic = percentage_within_threshold(y_val_democratic, predictions_democratic_rf, threshold_percent=10)
accuracy_within_5_percent_republican = percentage_within_threshold(y_val_republican, predictions_republican_lr, threshold_percent=5)
accuracy_within_10_percent_republican = percentage_within_threshold(y_val_republican, predictions_republican_lr, threshold_percent=10)

print("LR Mean Squared Error for Democratic % on Validation Set:", mse_democratic)
print("LR Mean Squared Error for Republican % on Validation Set:", mse_republican)
print(f"LR Percentage of predictions within 5% for Democratic %: {accuracy_within_5_percent_democratic:.2f}%")
print(f"LR Percentage of predictions within 10% for Democratic %: {accuracy_within_10_percent_democratic:.2f}%")
print(f"LR Percentage of predictions within 5% for Republican %: {accuracy_within_5_percent_republican:.2f}%")
print(f"LR Percentage of predictions within 10% for Republican %: {accuracy_within_10_percent_republican:.2f}%")

LR Mean Squared Error for Democratic % on Validation Set: 372.97121910310517
LR Mean Squared Error for Republican % on Validation Set: 368.0229674466167
LR Percentage of predictions within 5% for Democratic %: 20.08%
LR Percentage of predictions within 10% for Democratic %: 40.93%
LR Percentage of predictions within 5% for Republican %: 17.76%
LR Percentage of predictions within 10% for Republican %: 29.73%


In [6]:
gb_democratic = GradientBoostingRegressor(random_state=42)
gb_democratic.fit(X_train, y_train_democratic)

gb_republican = GradientBoostingRegressor(random_state=42)
gb_republican.fit(X_train, y_train_republican)

predictions_democratic_gb = gb_democratic.predict(X_val)
predictions_republican_gb = gb_republican.predict(X_val)

predictions_democratic_gb = np.clip(predictions_democratic_gb, 0, 100)
predictions_republican_gb = np.clip(predictions_republican_gb, 0, 100)

mse_democratic = mean_squared_error(y_val_democratic, predictions_democratic_gb)
mse_republican = mean_squared_error(y_val_republican, predictions_republican_gb)

accuracy_within_5_percent_democratic = percentage_within_threshold(y_val_democratic, predictions_democratic_gb, threshold_percent=5)
accuracy_within_10_percent_democratic = percentage_within_threshold(y_val_democratic, predictions_democratic_gb, threshold_percent=10)
accuracy_within_5_percent_republican = percentage_within_threshold(y_val_republican, predictions_republican_gb, threshold_percent=5)
accuracy_within_10_percent_republican = percentage_within_threshold(y_val_republican, predictions_republican_gb, threshold_percent=10)

print("GB Mean Squared Error for Democratic % on Validation Set:", mse_democratic)
print("GB Mean Squared Error for Republican % on Validation Set:", mse_republican)
print(f"GB Percentage of predictions within 5% for Democratic %: {accuracy_within_5_percent_democratic:.2f}%")
print(f"GB Percentage of predictions within 10% for Democratic %: {accuracy_within_10_percent_democratic:.2f}%")
print(f"GB Percentage of predictions within 5% for Republican %: {accuracy_within_5_percent_republican:.2f}%")
print(f"GB Percentage of predictions within 10% for Republican %: {accuracy_within_10_percent_republican:.2f}%")

GB Mean Squared Error for Democratic % on Validation Set: 218.68812287981285
GB Mean Squared Error for Republican % on Validation Set: 283.4572613769281
GB Percentage of predictions within 5% for Democratic %: 20.46%
GB Percentage of predictions within 10% for Democratic %: 36.29%
GB Percentage of predictions within 5% for Republican %: 14.67%
GB Percentage of predictions within 10% for Republican %: 33.98%


In [8]:
data = pd.read_csv("final_data.csv")
data = data.dropna(axis=0)
data.head()
# features for training
# take out: White, Multiracial, Pacific, Male, Female
train_columns = ['Year', 'State FIPS', 'District', 'Voter Turnout %', 'Median Age', 'Median income',
                 'White (non Hispanic or Latino) %', 'Hispanic or Latino %', 'Black %', 'American Indian %', 'Asian %', 
                 'Less than HS %', 'HS grad %', 'Some college %', "Bachelor's degree %", "Graduate degree %",
                 'Below poverty line %', 'Income under 25k %', 'Income 25k to 50k %', 'Income 50k to 75k %','Income above 75k %']

X = data[data['Year'] != 2020][train_columns].values
y_democratic = data[data['Year'] != 2020]['Democratic %'].values
y_republican = data[data['Year'] != 2020]['Republican %'].values

scaler = StandardScaler()
X = scaler.fit_transform(X)

X_train, X_val, y_train_democratic, y_val_democratic, y_train_republican, y_val_republican = train_test_split(
    X, y_democratic, y_republican, test_size=0.2, random_state=42
)

In [9]:
rf_democratic = RandomForestRegressor(random_state=42)
rf_democratic.fit(X_train, y_train_democratic)

rf_republican = RandomForestRegressor(random_state=42)
rf_republican.fit(X_train, y_train_republican)

predictions_democratic_rf = rf_democratic.predict(X_val)
predictions_republican_rf = rf_republican.predict(X_val)

predictions_democratic_rf = np.clip(predictions_democratic_rf, 0, 100)
predictions_republican_rf = np.clip(predictions_republican_rf, 0, 100)

mse_democratic = mean_squared_error(y_val_democratic, predictions_democratic_rf)
mse_republican = mean_squared_error(y_val_republican, predictions_republican_rf)

accuracy_within_5_percent_democratic = percentage_within_threshold(y_val_democratic, predictions_democratic_rf, threshold_percent=5)
accuracy_within_10_percent_democratic = percentage_within_threshold(y_val_democratic, predictions_democratic_rf, threshold_percent=10)
accuracy_within_5_percent_republican = percentage_within_threshold(y_val_republican, predictions_republican_rf, threshold_percent=5)
accuracy_within_10_percent_republican = percentage_within_threshold(y_val_republican, predictions_republican_rf, threshold_percent=10)

print("RF Mean Squared Error for Democratic % on Validation Set:", mse_democratic)
print("RF Mean Squared Error for Republican % on Validation Set:", mse_republican)
print(f"RF Percentage of predictions within 5% for Democratic %: {accuracy_within_5_percent_democratic:.2f}%")
print(f"RF Percentage of predictions within 10% for Democratic %: {accuracy_within_10_percent_democratic:.2f}%")
print(f"RF Percentage of predictions within 5% for Republican %: {accuracy_within_5_percent_republican:.2f}%")
print(f"RF Percentage of predictions within 10% for Republican %: {accuracy_within_10_percent_republican:.2f}%")

RF Mean Squared Error for Democratic % on Validation Set: 254.34187636266404
RF Mean Squared Error for Republican % on Validation Set: 246.08595490277992
RF Percentage of predictions within 5% for Democratic %: 20.46%
RF Percentage of predictions within 10% for Democratic %: 38.61%
RF Percentage of predictions within 5% for Republican %: 18.92%
RF Percentage of predictions within 10% for Republican %: 40.15%
