In [3]:
from final_data import get_final_data
from functions import percentage_within_threshold

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [4]:
# Load the prepared dataset and keep only the rows without any missing value.
data = pd.read_csv('final_data.csv').dropna(axis=0)
data.shape

(1728, 30)

In [27]:
# Baseline experiment: train on every candidate feature (26 columns).
train_columns = [
    'Year', 'State FIPS', 'District',
    'Voter Turnout %', 'Median Age', 'Median income',
    'Male Population %', 'Female Population %',
    'White %', 'White (non Hispanic or Latino) %', 'Hispanic or Latino %', 'Black %',
    'American Indian %', 'Asian %', 'Pacific %', 'Multiracial %',
    'Less than HS %', 'HS grad %', 'Some college %', "Bachelor's degree %", "Graduate degree %",
    'Below poverty line %',
    'Income under 25k %', 'Income 25k to 50k %', 'Income 50k to 75k %', 'Income above 75k %',
]

# Fit on the 2014, 2016 and 2018 elections; 2020 is held out as the test set.
train_data = data.loc[data['Year'].isin([2014, 2016, 2018])]
test_data = data.loc[data['Year'] == 2020].copy()

# Standardize the features; the scaler is fitted on the training years only
# and then reused unchanged for the 2020 rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(train_data[train_columns].values)
X_test = scaler.transform(test_data[train_columns].values)

# One 80/20 split shared by both targets (same rows for Democratic and Republican).
# Only the 80% part is used for fitting; the validation part is not evaluated in this cell.
(X_train_dem, X_val_dem,
 y_train_dem, y_val_dem,
 y_train_rep, y_val_rep) = train_test_split(
    X_train,
    train_data['Democratic %'].values,
    train_data['Republican %'].values,
    test_size=0.2,
    random_state=42,
)
X_train_rep, X_val_rep = X_train_dem, X_val_dem

# One random forest per party.
rf_dem = RandomForestRegressor(random_state=42).fit(X_train_dem, y_train_dem)
rf_rep = RandomForestRegressor(random_state=42).fit(X_train_rep, y_train_rep)

# Predict 2020 and clamp every predicted share to the 0-100 range.
predict_dem_rf = np.clip(rf_dem.predict(X_test), 0, 100)
predict_rep_rf = np.clip(rf_rep.predict(X_test), 0, 100)

# Rescale each pair so the two shares sum to 100 (this scales up as well as down).
# The denominator is floored at 1 so a zero sum cannot cause a division by zero.
sum_predictions = predict_dem_rf + predict_rep_rf
scaling_factor = 100 / np.maximum(sum_predictions, 1)
predict_dem_rf = predict_dem_rf * scaling_factor
predict_rep_rf = predict_rep_rf * scaling_factor

# Score against the actual 2020 results: MSE, plus what percentage_within_threshold
# reports for thresholds 5 and 10.
# NOTE(review): the recorded MSE (~7.4k) cannot occur for values bounded by 0-100 while
# ~37% of predictions are within 10 - check the 2020 target columns for out-of-range values.
actual_dem = test_data['Democratic %'].values
actual_rep = test_data['Republican %'].values

mse_dem = mean_squared_error(actual_dem, predict_dem_rf)
mse_rep = mean_squared_error(actual_rep, predict_rep_rf)
acc_5_dem, acc_10_dem = [
    percentage_within_threshold(actual_dem, predict_dem_rf, threshold_percent=pct) for pct in (5, 10)
]
acc_5_rep, acc_10_rep = [
    percentage_within_threshold(actual_rep, predict_rep_rf, threshold_percent=pct) for pct in (5, 10)
]

print("Results for 2020 Democratic %:")
print("Mean Squared Error:", mse_dem)
print(f"Percentage of predictions within 5%: {acc_5_dem:.2f}%")
print(f"Percentage of predictions within 10%: {acc_10_dem:.2f}% \n")

print("Results for 2020 Republican %:")
print("Mean Squared Error:", mse_rep)
print(f"Percentage of predictions within 5%: {acc_5_rep:.2f}%")
print(f"Percentage of predictions within 10%: {acc_10_rep:.2f}%")

Results for 2020 Democratic %:
Mean Squared Error: 7426.354108351091
Percentage of predictions within 5%: 18.20%
Percentage of predictions within 10%: 37.10% 

Results for 2020 Republican %:
Mean Squared Error: 7009.9214681455205
Percentage of predictions within 5%: 18.20%
Percentage of predictions within 10%: 34.10%


In [11]:
# Keep the features that scored more than 0.025 in feature exploration.
# Dropped relative to the full list: Male, Female, White, Pacific, Multiracial.
train_columns = [
    'Year', 'State FIPS', 'District',
    'Voter Turnout %', 'Median Age', 'Median income',
    'White (non Hispanic or Latino) %', 'Hispanic or Latino %', 'Black %',
    'American Indian %', 'Asian %',
    'Less than HS %', 'HS grad %', 'Some college %', "Bachelor's degree %", "Graduate degree %",
    'Below poverty line %',
    'Income under 25k %', 'Income 25k to 50k %', 'Income 50k to 75k %', 'Income above 75k %',
]

# Fit on the 2014, 2016 and 2018 elections; 2020 is held out as the test set.
train_data = data.loc[data['Year'].isin([2014, 2016, 2018])]
test_data = data.loc[data['Year'] == 2020].copy()

# Standardize the features; the scaler is fitted on the training years only
# and then reused unchanged for the 2020 rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(train_data[train_columns].values)
X_test = scaler.transform(test_data[train_columns].values)

# One 80/20 split shared by both targets (same rows for Democratic and Republican).
# Only the 80% part is used for fitting; the validation part is not evaluated in this cell.
(X_train_dem, X_val_dem,
 y_train_dem, y_val_dem,
 y_train_rep, y_val_rep) = train_test_split(
    X_train,
    train_data['Democratic %'].values,
    train_data['Republican %'].values,
    test_size=0.2,
    random_state=42,
)
X_train_rep, X_val_rep = X_train_dem, X_val_dem

# One random forest per party.
rf_dem = RandomForestRegressor(random_state=42).fit(X_train_dem, y_train_dem)
rf_rep = RandomForestRegressor(random_state=42).fit(X_train_rep, y_train_rep)

# Predict 2020 and clamp every predicted share to the 0-100 range.
predict_dem_rf = np.clip(rf_dem.predict(X_test), 0, 100)
predict_rep_rf = np.clip(rf_rep.predict(X_test), 0, 100)

# Rescale each pair so the two shares sum to 100 (this scales up as well as down).
# The denominator is floored at 1 so a zero sum cannot cause a division by zero.
sum_predictions = predict_dem_rf + predict_rep_rf
scaling_factor = 100 / np.maximum(sum_predictions, 1)
predict_dem_rf = predict_dem_rf * scaling_factor
predict_rep_rf = predict_rep_rf * scaling_factor

# Score against the actual 2020 results: MSE, plus what percentage_within_threshold
# reports for thresholds 5 and 10.
actual_dem = test_data['Democratic %'].values
actual_rep = test_data['Republican %'].values

mse_dem = mean_squared_error(actual_dem, predict_dem_rf)
mse_rep = mean_squared_error(actual_rep, predict_rep_rf)
acc_5_dem, acc_10_dem = [
    percentage_within_threshold(actual_dem, predict_dem_rf, threshold_percent=pct) for pct in (5, 10)
]
acc_5_rep, acc_10_rep = [
    percentage_within_threshold(actual_rep, predict_rep_rf, threshold_percent=pct) for pct in (5, 10)
]

print("Results for 2020 Democratic %:")
print("Mean Squared Error:", mse_dem)
print(f"Percentage of predictions within 5%: {acc_5_dem:.2f}%")
print(f"Percentage of predictions within 10%: {acc_10_dem:.2f}% \n")

print("Results for 2020 Republican %:")
print("Mean Squared Error:", mse_rep)
print(f"Percentage of predictions within 5%: {acc_5_rep:.2f}%")
print(f"Percentage of predictions within 10%: {acc_10_rep:.2f}%")

Results for 2020 Democratic %:
Mean Squared Error: 7407.362494916851
Percentage of predictions within 5%: 21.66%
Percentage of predictions within 10%: 39.17% 

Results for 2020 Republican %:
Mean Squared Error: 6995.772403905346
Percentage of predictions within 5%: 18.89%
Percentage of predictions within 10%: 41.47%


In [16]:
# Features that scored more than 0.025 in feature exploration, without the identifier columns.
# Dropped relative to the full list: Male, Female, White, Pacific, Multiracial,
# plus Year, State FIPS and District.
train_columns = [
    'Voter Turnout %', 'Median Age', 'Median income',
    'White (non Hispanic or Latino) %', 'Hispanic or Latino %', 'Black %',
    'American Indian %', 'Asian %',
    'Less than HS %', 'HS grad %', 'Some college %', "Bachelor's degree %", "Graduate degree %",
    'Below poverty line %',
    'Income under 25k %', 'Income 25k to 50k %', 'Income 50k to 75k %', 'Income above 75k %',
]

# Fit on the 2014, 2016 and 2018 elections; 2020 is held out as the test set.
train_data = data.loc[data['Year'].isin([2014, 2016, 2018])]
test_data = data.loc[data['Year'] == 2020].copy()

# Standardize the features; the scaler is fitted on the training years only
# and then reused unchanged for the 2020 rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(train_data[train_columns].values)
X_test = scaler.transform(test_data[train_columns].values)

# One 80/20 split shared by both targets (same rows for Democratic and Republican).
# Only the 80% part is used for fitting; the validation part is not evaluated in this cell.
(X_train_dem, X_val_dem,
 y_train_dem, y_val_dem,
 y_train_rep, y_val_rep) = train_test_split(
    X_train,
    train_data['Democratic %'].values,
    train_data['Republican %'].values,
    test_size=0.2,
    random_state=42,
)
X_train_rep, X_val_rep = X_train_dem, X_val_dem

# One random forest per party.
rf_dem = RandomForestRegressor(random_state=42).fit(X_train_dem, y_train_dem)
rf_rep = RandomForestRegressor(random_state=42).fit(X_train_rep, y_train_rep)

# Predict 2020 and clamp every predicted share to the 0-100 range.
predict_dem_rf = np.clip(rf_dem.predict(X_test), 0, 100)
predict_rep_rf = np.clip(rf_rep.predict(X_test), 0, 100)

# Rescale each pair so the two shares sum to 100 (this scales up as well as down).
# The denominator is floored at 1 so a zero sum cannot cause a division by zero.
sum_predictions = predict_dem_rf + predict_rep_rf
scaling_factor = 100 / np.maximum(sum_predictions, 1)
predict_dem_rf = predict_dem_rf * scaling_factor
predict_rep_rf = predict_rep_rf * scaling_factor

# Score against the actual 2020 results: MSE, plus what percentage_within_threshold
# reports for thresholds 5 and 10.
actual_dem = test_data['Democratic %'].values
actual_rep = test_data['Republican %'].values

mse_dem = mean_squared_error(actual_dem, predict_dem_rf)
mse_rep = mean_squared_error(actual_rep, predict_rep_rf)
acc_5_dem, acc_10_dem = [
    percentage_within_threshold(actual_dem, predict_dem_rf, threshold_percent=pct) for pct in (5, 10)
]
acc_5_rep, acc_10_rep = [
    percentage_within_threshold(actual_rep, predict_rep_rf, threshold_percent=pct) for pct in (5, 10)
]

print("Results for 2020 Democratic %:")
print("Mean Squared Error:", mse_dem)
print(f"Percentage of predictions within 5%: {acc_5_dem:.2f}%")
print(f"Percentage of predictions within 10%: {acc_10_dem:.2f}% \n")

print("Results for 2020 Republican %:")
print("Mean Squared Error:", mse_rep)
print(f"Percentage of predictions within 5%: {acc_5_rep:.2f}%")
print(f"Percentage of predictions within 10%: {acc_10_rep:.2f}%")

Results for 2020 Democratic %:
Mean Squared Error: 7417.889585020755
Percentage of predictions within 5%: 21.66%
Percentage of predictions within 10%: 38.48% 

Results for 2020 Republican %:
Mean Squared Error: 7005.467544728875
Percentage of predictions within 5%: 19.35%
Percentage of predictions within 10%: 39.86%


In [29]:
# Keep the features that scored more than 0.035 in feature exploration.
# Dropped relative to the 0.025 list: below poverty, bachelor's, hs grad, black, median age.
train_columns = [
    'Year', 'State FIPS', 'District',
    'Voter Turnout %', 'Median income',
    'White (non Hispanic or Latino) %', 'Hispanic or Latino %', 'American Indian %', 'Asian %',
    'Less than HS %', 'Some college %', "Graduate degree %",
    'Income under 25k %', 'Income 25k to 50k %', 'Income 50k to 75k %', 'Income above 75k %',
]

# Fit on the 2014, 2016 and 2018 elections; 2020 is held out as the test set.
train_data = data.loc[data['Year'].isin([2014, 2016, 2018])]
test_data = data.loc[data['Year'] == 2020].copy()

# Standardize the features; the scaler is fitted on the training years only
# and then reused unchanged for the 2020 rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(train_data[train_columns].values)
X_test = scaler.transform(test_data[train_columns].values)

# One 80/20 split shared by both targets (same rows for Democratic and Republican).
# Only the 80% part is used for fitting; the validation part is not evaluated in this cell.
(X_train_dem, X_val_dem,
 y_train_dem, y_val_dem,
 y_train_rep, y_val_rep) = train_test_split(
    X_train,
    train_data['Democratic %'].values,
    train_data['Republican %'].values,
    test_size=0.2,
    random_state=42,
)
X_train_rep, X_val_rep = X_train_dem, X_val_dem

# One random forest per party.
rf_dem = RandomForestRegressor(random_state=42).fit(X_train_dem, y_train_dem)
rf_rep = RandomForestRegressor(random_state=42).fit(X_train_rep, y_train_rep)

# Predict 2020 and clamp every predicted share to the 0-100 range.
predict_dem_rf = np.clip(rf_dem.predict(X_test), 0, 100)
predict_rep_rf = np.clip(rf_rep.predict(X_test), 0, 100)

# Rescale each pair so the two shares sum to 100 (this scales up as well as down).
# The denominator is floored at 1 so a zero sum cannot cause a division by zero.
sum_predictions = predict_dem_rf + predict_rep_rf
scaling_factor = 100 / np.maximum(sum_predictions, 1)
predict_dem_rf = predict_dem_rf * scaling_factor
predict_rep_rf = predict_rep_rf * scaling_factor

# Score against the actual 2020 results: MSE, plus what percentage_within_threshold
# reports for thresholds 5 and 10.
actual_dem = test_data['Democratic %'].values
actual_rep = test_data['Republican %'].values

mse_dem = mean_squared_error(actual_dem, predict_dem_rf)
mse_rep = mean_squared_error(actual_rep, predict_rep_rf)
acc_5_dem, acc_10_dem = [
    percentage_within_threshold(actual_dem, predict_dem_rf, threshold_percent=pct) for pct in (5, 10)
]
acc_5_rep, acc_10_rep = [
    percentage_within_threshold(actual_rep, predict_rep_rf, threshold_percent=pct) for pct in (5, 10)
]

print("Results for 2020 Democratic %:")
print("Mean Squared Error:", mse_dem)
print(f"Percentage of predictions within 5%: {acc_5_dem:.2f}%")
print(f"Percentage of predictions within 10%: {acc_10_dem:.2f}% \n")

print("Results for 2020 Republican %:")
print("Mean Squared Error:", mse_rep)
print(f"Percentage of predictions within 5%: {acc_5_rep:.2f}%")
print(f"Percentage of predictions within 10%: {acc_10_rep:.2f}%")

Results for 2020 Democratic %:
Mean Squared Error: 7414.519298596086
Percentage of predictions within 5%: 18.43%
Percentage of predictions within 10%: 37.33% 

Results for 2020 Republican %:
Mean Squared Error: 7005.2749148605
Percentage of predictions within 5%: 18.20%
Percentage of predictions within 10%: 35.25%


In [13]:
# Features that scored more than 0.035 in feature exploration, without the identifier columns.
# Dropped relative to the 0.025 list: below poverty, bachelor's, hs grad, black, median age,
# plus Year, State FIPS and District.
train_columns = [
    'Voter Turnout %', 'Median income',
    'White (non Hispanic or Latino) %', 'Hispanic or Latino %', 'American Indian %', 'Asian %',
    'Less than HS %', 'Some college %', "Graduate degree %",
    'Income under 25k %', 'Income 25k to 50k %', 'Income 50k to 75k %', 'Income above 75k %',
]

# Fit on the 2014, 2016 and 2018 elections; 2020 is held out as the test set.
train_data = data.loc[data['Year'].isin([2014, 2016, 2018])]
test_data = data.loc[data['Year'] == 2020].copy()

# Standardize the features; the scaler is fitted on the training years only
# and then reused unchanged for the 2020 rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(train_data[train_columns].values)
X_test = scaler.transform(test_data[train_columns].values)

# One 80/20 split shared by both targets (same rows for Democratic and Republican).
# Only the 80% part is used for fitting; the validation part is not evaluated in this cell.
(X_train_dem, X_val_dem,
 y_train_dem, y_val_dem,
 y_train_rep, y_val_rep) = train_test_split(
    X_train,
    train_data['Democratic %'].values,
    train_data['Republican %'].values,
    test_size=0.2,
    random_state=42,
)
X_train_rep, X_val_rep = X_train_dem, X_val_dem

# One random forest per party.
rf_dem = RandomForestRegressor(random_state=42).fit(X_train_dem, y_train_dem)
rf_rep = RandomForestRegressor(random_state=42).fit(X_train_rep, y_train_rep)

# Predict 2020 and clamp every predicted share to the 0-100 range.
predict_dem_rf = np.clip(rf_dem.predict(X_test), 0, 100)
predict_rep_rf = np.clip(rf_rep.predict(X_test), 0, 100)

# Rescale each pair so the two shares sum to 100 (this scales up as well as down).
# The denominator is floored at 1 so a zero sum cannot cause a division by zero.
sum_predictions = predict_dem_rf + predict_rep_rf
scaling_factor = 100 / np.maximum(sum_predictions, 1)
predict_dem_rf = predict_dem_rf * scaling_factor
predict_rep_rf = predict_rep_rf * scaling_factor

# Score against the actual 2020 results: MSE, plus what percentage_within_threshold
# reports for thresholds 5 and 10.
actual_dem = test_data['Democratic %'].values
actual_rep = test_data['Republican %'].values

mse_dem = mean_squared_error(actual_dem, predict_dem_rf)
mse_rep = mean_squared_error(actual_rep, predict_rep_rf)
acc_5_dem, acc_10_dem = [
    percentage_within_threshold(actual_dem, predict_dem_rf, threshold_percent=pct) for pct in (5, 10)
]
acc_5_rep, acc_10_rep = [
    percentage_within_threshold(actual_rep, predict_rep_rf, threshold_percent=pct) for pct in (5, 10)
]

print("Results for 2020 Democratic %:")
print("Mean Squared Error:", mse_dem)
print(f"Percentage of predictions within 5%: {acc_5_dem:.2f}%")
print(f"Percentage of predictions within 10%: {acc_10_dem:.2f}% \n")

print("Results for 2020 Republican %:")
print("Mean Squared Error:", mse_rep)
print(f"Percentage of predictions within 5%: {acc_5_rep:.2f}%")
print(f"Percentage of predictions within 10%: {acc_10_rep:.2f}%")

Results for 2020 Democratic %:
Mean Squared Error: 7427.0591369485655
Percentage of predictions within 5%: 19.82%
Percentage of predictions within 10%: 36.41% 

Results for 2020 Republican %:
Mean Squared Error: 7015.488223986176
Percentage of predictions within 5%: 19.35%
Percentage of predictions within 10%: 37.56%


In [6]:
# Keep the features that scored more than 0.05 in feature exploration.
# Dropped relative to the 0.035 list: Median income, White (non Hispanic or Latino),
# Hispanic or Latino, American Indian, Less than HS, Income above 75k
# (on top of below poverty, bachelor's, hs grad, black, median age).
train_columns = [
    'Year', 'State FIPS', 'District',
    'Voter Turnout %', 'Asian %',
    'Some college %', "Graduate degree %",
    'Income under 25k %', 'Income 25k to 50k %', 'Income 50k to 75k %',
]

# Fit on the 2014, 2016 and 2018 elections; 2020 is held out as the test set.
train_data = data.loc[data['Year'].isin([2014, 2016, 2018])]
test_data = data.loc[data['Year'] == 2020].copy()

# Standardize the features; the scaler is fitted on the training years only
# and then reused unchanged for the 2020 rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(train_data[train_columns].values)
X_test = scaler.transform(test_data[train_columns].values)

# One 80/20 split shared by both targets (same rows for Democratic and Republican).
# Only the 80% part is used for fitting; the validation part is not evaluated in this cell.
(X_train_dem, X_val_dem,
 y_train_dem, y_val_dem,
 y_train_rep, y_val_rep) = train_test_split(
    X_train,
    train_data['Democratic %'].values,
    train_data['Republican %'].values,
    test_size=0.2,
    random_state=42,
)
X_train_rep, X_val_rep = X_train_dem, X_val_dem

# One random forest per party.
rf_dem = RandomForestRegressor(random_state=42).fit(X_train_dem, y_train_dem)
rf_rep = RandomForestRegressor(random_state=42).fit(X_train_rep, y_train_rep)

# Predict 2020 and clamp every predicted share to the 0-100 range.
predict_dem_rf = np.clip(rf_dem.predict(X_test), 0, 100)
predict_rep_rf = np.clip(rf_rep.predict(X_test), 0, 100)

# Rescale each pair so the two shares sum to 100 (this scales up as well as down).
# The denominator is floored at 1 so a zero sum cannot cause a division by zero.
sum_predictions = predict_dem_rf + predict_rep_rf
scaling_factor = 100 / np.maximum(sum_predictions, 1)
predict_dem_rf = predict_dem_rf * scaling_factor
predict_rep_rf = predict_rep_rf * scaling_factor

# Score against the actual 2020 results: MSE, plus what percentage_within_threshold
# reports for thresholds 5 and 10.
actual_dem = test_data['Democratic %'].values
actual_rep = test_data['Republican %'].values

mse_dem = mean_squared_error(actual_dem, predict_dem_rf)
mse_rep = mean_squared_error(actual_rep, predict_rep_rf)
acc_5_dem, acc_10_dem = [
    percentage_within_threshold(actual_dem, predict_dem_rf, threshold_percent=pct) for pct in (5, 10)
]
acc_5_rep, acc_10_rep = [
    percentage_within_threshold(actual_rep, predict_rep_rf, threshold_percent=pct) for pct in (5, 10)
]

print("Results for 2020 Democratic %:")
print("Mean Squared Error:", mse_dem)
print(f"Percentage of predictions within 5%: {acc_5_dem:.2f}%")
print(f"Percentage of predictions within 10%: {acc_10_dem:.2f}% \n")

print("Results for 2020 Republican %:")
print("Mean Squared Error:", mse_rep)
print(f"Percentage of predictions within 5%: {acc_5_rep:.2f}%")
print(f"Percentage of predictions within 10%: {acc_10_rep:.2f}%")

Results for 2020 Democratic %:
Mean Squared Error: 7455.916259321049
Percentage of predictions within 5%: 17.28%
Percentage of predictions within 10%: 34.56% 

Results for 2020 Republican %:
Mean Squared Error: 7061.805565308369
Percentage of predictions within 5%: 19.12%
Percentage of predictions within 10%: 34.56%


In [14]:
# Features that scored more than 0.05 in feature exploration, without the identifier columns.
# Dropped relative to the 0.035 list: Median income, White (non Hispanic or Latino),
# Hispanic or Latino, American Indian, Less than HS, Income above 75k
# (on top of below poverty, bachelor's, hs grad, black, median age),
# plus Year, State FIPS and District.
train_columns = [
    'Voter Turnout %', 'Asian %',
    'Some college %', "Graduate degree %",
    'Income under 25k %', 'Income 25k to 50k %', 'Income 50k to 75k %',
]

# Fit on the 2014, 2016 and 2018 elections; 2020 is held out as the test set.
train_data = data.loc[data['Year'].isin([2014, 2016, 2018])]
test_data = data.loc[data['Year'] == 2020].copy()

# Standardize the features; the scaler is fitted on the training years only
# and then reused unchanged for the 2020 rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(train_data[train_columns].values)
X_test = scaler.transform(test_data[train_columns].values)

# One 80/20 split shared by both targets (same rows for Democratic and Republican).
# Only the 80% part is used for fitting; the validation part is not evaluated in this cell.
(X_train_dem, X_val_dem,
 y_train_dem, y_val_dem,
 y_train_rep, y_val_rep) = train_test_split(
    X_train,
    train_data['Democratic %'].values,
    train_data['Republican %'].values,
    test_size=0.2,
    random_state=42,
)
X_train_rep, X_val_rep = X_train_dem, X_val_dem

# One random forest per party.
rf_dem = RandomForestRegressor(random_state=42).fit(X_train_dem, y_train_dem)
rf_rep = RandomForestRegressor(random_state=42).fit(X_train_rep, y_train_rep)

# Predict 2020 and clamp every predicted share to the 0-100 range.
predict_dem_rf = np.clip(rf_dem.predict(X_test), 0, 100)
predict_rep_rf = np.clip(rf_rep.predict(X_test), 0, 100)

# Rescale each pair so the two shares sum to 100 (this scales up as well as down).
# The denominator is floored at 1 so a zero sum cannot cause a division by zero.
sum_predictions = predict_dem_rf + predict_rep_rf
scaling_factor = 100 / np.maximum(sum_predictions, 1)
predict_dem_rf = predict_dem_rf * scaling_factor
predict_rep_rf = predict_rep_rf * scaling_factor

# Score against the actual 2020 results: MSE, plus what percentage_within_threshold
# reports for thresholds 5 and 10.
actual_dem = test_data['Democratic %'].values
actual_rep = test_data['Republican %'].values

mse_dem = mean_squared_error(actual_dem, predict_dem_rf)
mse_rep = mean_squared_error(actual_rep, predict_rep_rf)
acc_5_dem, acc_10_dem = [
    percentage_within_threshold(actual_dem, predict_dem_rf, threshold_percent=pct) for pct in (5, 10)
]
acc_5_rep, acc_10_rep = [
    percentage_within_threshold(actual_rep, predict_rep_rf, threshold_percent=pct) for pct in (5, 10)
]

print("Results for 2020 Democratic %:")
print("Mean Squared Error:", mse_dem)
print(f"Percentage of predictions within 5%: {acc_5_dem:.2f}%")
print(f"Percentage of predictions within 10%: {acc_10_dem:.2f}% \n")

print("Results for 2020 Republican %:")
print("Mean Squared Error:", mse_rep)
print(f"Percentage of predictions within 5%: {acc_5_rep:.2f}%")
print(f"Percentage of predictions within 10%: {acc_10_rep:.2f}%")

Results for 2020 Democratic %:
Mean Squared Error: 7484.539447194714
Percentage of predictions within 5%: 15.67%
Percentage of predictions within 10%: 31.57% 

Results for 2020 Republican %:
Mean Squared Error: 7095.194784183141
Percentage of predictions within 5%: 20.28%
Percentage of predictions within 10%: 34.79%


In [33]:
# Same pipeline split into reusable functions: train the models, predict 2020, predict one user-supplied district.

def create_models(data):
    """Fit one random forest per party on the 2014, 2016 and 2018 rows of `data`.

    The 18 feature columns listed below are standardized with a StandardScaler
    fitted on all training-year rows. An 80/20 split (random_state=42) is then
    taken and both forests are fitted on the same 80% part; the remaining 20%
    is not used by this function.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'Year', 'Democratic %', 'Republican %' and every feature
        column listed below.

    Returns
    -------
    tuple
        (rf_dem, rf_rep, scaler): the fitted forest for 'Democratic %', the
        fitted forest for 'Republican %', and the fitted StandardScaler.
    """
    # Column order matters: the scaler and both forests see features in this order.
    feature_names = [
        'Voter Turnout %', 'Median Age', 'Median income', 'Below poverty line %',
        'White (non Hispanic or Latino) %', 'Hispanic or Latino %', 'Black %',
        'American Indian %', 'Asian %',
        'Less than HS %', 'HS grad %', 'Some college %', "Bachelor's degree %", "Graduate degree %",
        'Income under 25k %', 'Income 25k to 50k %', 'Income 50k to 75k %', 'Income above 75k %',
    ]

    # Training years only; 2020 is never seen here.
    history = data[data['Year'].isin([2014, 2016, 2018])]

    scaler = StandardScaler()
    features = scaler.fit_transform(history[feature_names].values)

    # A single call splits the features and both targets on the same row indices,
    # so the two forests are trained on identical rows.
    fit_X, _, fit_dem, _, fit_rep, _ = train_test_split(
        features,
        history['Democratic %'].values,
        history['Republican %'].values,
        test_size=0.2,
        random_state=42,
    )

    rf_dem = RandomForestRegressor(random_state=42).fit(fit_X, fit_dem)
    rf_rep = RandomForestRegressor(random_state=42).fit(fit_X, fit_rep)

    return rf_dem, rf_rep, scaler

def predict_2020(data, rf_dem, rf_rep, scaler, output_path='predictions_2020.csv'):
    """Predict the 2020 Democratic / Republican shares and compare them with the actual results.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'Year', 'State', 'State Abbr', 'State FIPS', 'District',
        'Democratic %', 'Republican %' and every feature column listed below.
    rf_dem, rf_rep : fitted regressors
        Objects with a `predict(X)` method returning one value per row.
    scaler : fitted scaler
        Object with a `transform(X)` method; applied to the feature matrix
        before prediction.
    output_path : str or path-like or None, default 'predictions_2020.csv'
        Where the result table is written as CSV (without the index).
        Pass None to skip writing a file.

    Returns
    -------
    pandas.DataFrame
        One row per 2020 district with the actual shares, the predicted shares
        (rounded to 2 decimals), and the actual and predicted winner.

    Raises
    ------
    ValueError
        If `data` has no rows with Year == 2020.
    """
    # Must list the same columns, in the same order, that the scaler and models were fitted on.
    train_columns = ['Voter Turnout %', 'Median Age', 'Median income', 'Below poverty line %',
                     'White (non Hispanic or Latino) %', 'Hispanic or Latino %', 'Black %', 'American Indian %', 'Asian %',
                     'Less than HS %', 'HS grad %', 'Some college %', "Bachelor's degree %", "Graduate degree %",
                     'Income under 25k %', 'Income 25k to 50k %', 'Income 50k to 75k %', 'Income above 75k %']

    test_data = data[data['Year'] == 2020].copy()
    if test_data.empty:
        # Fail with a clear message instead of an error from deep inside the scaler.
        raise ValueError("data contains no rows with Year == 2020")

    # testing features, normalized with the already-fitted scaler
    X_test = scaler.transform(test_data[train_columns].values)

    # predict and clamp each share to the 0-100 range
    predict_dem = np.clip(rf_dem.predict(X_test), 0, 100)
    predict_rep = np.clip(rf_rep.predict(X_test), 0, 100)

    # Rescale each pair so the two shares sum to 100 (scales up as well as down).
    # The denominator is floored at 1 so a zero sum cannot cause a division by zero.
    total = predict_dem + predict_rep
    scaling_factor = 100 / np.maximum(total, 1)
    predict_dem = np.round(predict_dem * scaling_factor, 2)
    predict_rep = np.round(predict_rep * scaling_factor, 2)

    # add predictions and winners; a tie is labelled 'Republican' because the test is a strict '>'
    test_data['Predicted Democratic %'] = predict_dem
    test_data['Predicted Republican %'] = predict_rep
    test_data['Winner'] = np.where(test_data['Democratic %'] > test_data['Republican %'],
                                   'Democratic', 'Republican')
    test_data['Predicted Winner'] = np.where(test_data['Predicted Democratic %'] > test_data['Predicted Republican %'],
                                             'Democratic', 'Republican')

    result_df = test_data[['Year', 'State', 'State Abbr', 'State FIPS', 'District', 'Democratic %', 'Republican %',
                           'Predicted Democratic %', 'Predicted Republican %', 'Winner', 'Predicted Winner']]
    if output_path is not None:
        result_df.to_csv(output_path, index=False)
    return result_df

def predict_user_cd(user_cd, rf_dem, rf_rep, scaler):
    """Predict the Democratic and Republican share for one district.

    Parameters
    ----------
    user_cd : sequence of numbers
        Feature values for a single district, in the column order the scaler
        and models were fitted on.
    rf_dem, rf_rep : fitted regressors
        Objects with a `predict(X)` method.
    scaler : fitted scaler
        Object with a `transform(X)` method.

    Returns
    -------
    tuple
        (democratic_share, republican_share), each rounded to 2 decimals.
    """
    # A single district becomes a one-row feature matrix.
    row = scaler.transform(np.array(user_cd).reshape(1, -1))

    # [democratic, republican], each clamped to the 0-100 range.
    shares = np.clip(
        np.array([rf_dem.predict(row)[0], rf_rep.predict(row)[0]]),
        0,
        100,
    )

    # Rescale so the two shares sum to 100; the denominator is floored at 1
    # so a zero sum cannot cause a division by zero.
    shares = np.round(shares * (100 / np.maximum(shares.sum(), 1)), 2)

    return shares[0], shares[1]

In [34]:
# Reload the dataset, keeping only the rows without any missing value.
data = pd.read_csv('final_data.csv').dropna(axis=0)

# Train both forests, then score the 2020 districts.
rf_dem, rf_rep, scaler = create_models(data)
pred_2020 = predict_2020(data, rf_dem, rf_rep, scaler)

# Feature values for one example district.
# The labels follow the train_columns order used in create_models.
user_cd = [
    45.52,    # Voter Turnout %
    38.0,     # Median Age
    28884.0,  # Median income
    10.12,    # Below poverty line %
    66.85,    # White (non Hispanic or Latino) %
    10.77,    # Hispanic or Latino %
    7.2,      # Black %
    0.36,     # American Indian %
    2.89,     # Asian %
    7.68,     # Less than HS %
    18.55,    # HS grad %
    19.72,    # Some college %
    12.23,    # Bachelor's degree %
    7.03,     # Graduate degree %
    51.68,    # Income under 25k %
    23.29,    # Income 25k to 50k %
    12.07,    # Income 50k to 75k %
    11.67,    # Income above 75k %
]
dem, rep = predict_user_cd(user_cd, rf_dem, rf_rep, scaler)
print(dem, rep)

42.38 57.62


In [31]:
# Attach the predictions and the actual / predicted winner to the 2020 rows, then export.
# NOTE(review): test_data, predict_dem_rf and predict_rep_rf are not defined in this cell;
# they are whatever the most recently executed experiment cell left in the kernel.
dem_won = test_data['Democratic %'] > test_data['Republican %']
dem_predicted = predict_dem_rf > predict_rep_rf

test_data['Predicted Democratic %'] = predict_dem_rf
test_data['Predicted Republican %'] = predict_rep_rf
# A tie is labelled 'Republican' because the comparison is a strict '>'.
test_data['Winner'] = np.where(dem_won, 'Democratic', 'Republican')
test_data['Predicted Winner'] = np.where(dem_predicted, 'Democratic', 'Republican')

result_columns = [
    'Year', 'State', 'State Abbr', 'State FIPS', 'District',
    'Democratic %', 'Republican %',
    'Predicted Democratic %', 'Predicted Republican %',
    'Winner', 'Predicted Winner',
]
result_df = test_data[result_columns]
result_df.to_csv('predictions.csv', index=False)

In [26]:
from sklearn.ensemble import RandomForestRegressor

# Alternative: one multi-output forest that predicts both shares at once.
# NOTE(review): X_train, X_test, train_data and test_data are not defined in this cell;
# they come from whichever experiment cell ran last. X_train there is the full scaled
# training matrix (not the 80% split), so this forest is fitted on all training rows.

# Prepare target variables as a 2D array: column 0 = Democratic %, column 1 = Republican %
y_train = train_data[['Democratic %', 'Republican %']].values
y_test = test_data[['Democratic %', 'Republican %']].values

# Train the model
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)

# Predict and clamp each share to the 0-100 range, as the per-party experiments do
predictions = np.clip(rf.predict(X_test), 0, 100)

# Normalize each row so the two shares sum to 100. The denominator is floored at 1
# (same guard as the per-party experiments): a zero row sum would otherwise yield NaN,
# which mean_squared_error rejects.
predictions_sum = predictions.sum(axis=1).reshape(-1, 1)
normalized_predictions = predictions / np.maximum(predictions_sum, 1) * 100

# Evaluate the model
mse_dem = mean_squared_error(y_test[:, 0], normalized_predictions[:, 0])
mse_rep = mean_squared_error(y_test[:, 1], normalized_predictions[:, 1])

print("MSE for Democratic %:", mse_dem)
print("MSE for Republican %:", mse_rep)


MSE for Democratic %: 7314.122881329195
MSE for Republican %: 7059.4580478018925
