## Predicting Voting Behavior ##

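This notebook cleans the nonvoters survey dataset (`nonvoters_data.csv`), explores participant demographics and missing-value patterns, and trains an H2O random forest model to predict each participant's `voter_category`.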

### Set up ###

In [None]:
# Install missingno and h2o libraries
!python -m pip install missingno
!python -m pip install h2o

# Verify that java is installed (needed for h2o)
!java -version

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import missingno as msno
from scipy.stats import chi2_contingency
import h2o


# Settings: never truncate the column list when a DataFrame is displayed
pd.options.display.max_columns = None

In [None]:
# Helper functions
def choose_random_color():
    """Return one matplotlib color name drawn at random from a fixed palette."""
    palette = (
        'b c g m r y pink turquoise navy lime '
        'darkcyan firebrick slateblue blueviolet'
    ).split()
    return random.choice(palette)

def chisquare_test(table, alpha=0.05):
    """Run a chi-square test of independence and report a significant result.

    Args:
        table: Contingency table of observed frequencies (counts), in any
            form accepted by ``scipy.stats.chi2_contingency``.
        alpha: Significance level the p-value is compared against.

    Prints a message when the p-value is at or below ``alpha`` and prints
    nothing otherwise. Yates' continuity correction is disabled.
    """
    # Only the p-value (second element of the result) is needed here
    p_value = chi2_contingency(table, correction=False)[1]
    if p_value <= alpha:
        print('There is a relationship between the variables.')

### Data cleaning and preprocessing ###

In [None]:
# Load the nonvoters survey responses and record how many participants there are
non_voters = pd.read_csv('nonvoters_data.csv')
sample_size = non_voters.shape[0]

# Summarize every feature: its data type and its number of non-null entries
non_voters.info(verbose=True, show_counts=True)

In [None]:
# Every column is categorical except RespId, weight, and ppage, so cast all the others in one pass
keep_as_is = ('RespId', 'weight', 'ppage')
non_voters = non_voters.astype(
    {name: 'category' for name in non_voters.columns if name not in keep_as_is}
)

# Income and education have an inherent ordering; encode it in the categorical dtype
income_levels = ['Less than $40k', '$40-75k', '$75-125k', '$125k or more']
education_levels = ['High school or less', 'Some college', 'College']
non_voters['income_cat'] = pd.Categorical(non_voters['income_cat'], categories=income_levels, ordered=True)
non_voters['educ'] = pd.Categorical(non_voters['educ'], categories=education_levels, ordered=True)

# Summary statistics of participant age
age_summary = non_voters['ppage'].describe()
print(age_summary)

# Histogram of the raw ages, drawn on the current axes
age_ax = plt.gca()
age_ax.hist(non_voters['ppage'])
age_ax.set_title('Age Distribution of Participants in Sample')
age_ax.set_xlabel('age')
age_ax.set_ylabel('count')
plt.show()
plt.clf()

# Convert the age feature to be categorical instead of quantitative to improve interpretability.
# pd.cut builds right-closed intervals, so each edge must be the LAST age of its group:
#   (0, 35] -> '35 or younger', (35, 65] -> '36-65', (65, inf] -> '66 or older'
# (edges of 36 and 66 would put 36-year-olds in '35 or younger' and 66-year-olds in '36-65').
# With labels given, pd.cut already returns an ordered categorical, so no extra conversion is needed.
age_labels = ['35 or younger', '36-65', '66 or older']
non_voters['ppage'] = pd.cut(non_voters['ppage'], bins=[0, 35, 65, np.inf], labels=age_labels)

# Examine the new distribution
non_voters['ppage'].value_counts().sort_index().plot(kind='bar', color=choose_random_color())
plt.title('Age Distribution of Participants in Sample')
plt.ylabel('count')
plt.show()

In [None]:
# Demographic features to examine, paired with the title of their chart
panels = [
    ('ppage', 'Participant Age'),
    ('educ', 'Participant Education'),
    ('race', 'Participant Race'),
    ('gender', 'Participant Gender'),
    ('income_cat', 'Participant Income Category'),
]

# Work on a copy holding only the demographic columns and summarize it
voter_demographics = non_voters[['ppage', 'educ', 'race', 'gender', 'income_cat']].copy()
print(voter_demographics.describe(include='all'))

# Draw one bar chart per feature on a 3x2 grid, filling it row by row
fig, ax = plt.subplots(3, 2, figsize=(10, 10))
for position, (feature, title) in enumerate(panels):
    row, col = divmod(position, 2)
    panel = ax[row, col]
    category_counts = voter_demographics[feature].value_counts().sort_index()
    category_counts.plot(ax=panel, kind='bar', color=choose_random_color())
    panel.set_title(title)
    # Only the left-hand column of charts carries the y-axis label
    if col == 0:
        panel.set_ylabel('count')

# Five features on six axes: hide the unused bottom-right one
ax[2, 1].axis('off')

plt.tight_layout()
plt.show()
    

In [None]:
# Question 1 carries no information: every participant answered yes
# (prerequisite for being included in the sample), so remove its column
non_voters.drop(columns='Q1', inplace=True)

# Question 19 has a different format than the other questions; remove all ten of its columns
q19_columns = [f'Q19_{item}' for item in range(1, 11)]
non_voters.drop(columns=q19_columns, inplace=True)

# Collect the features that contain at least one NA value
na_columns = [name for name in non_voters.columns if non_voters[name].isna().sum() > 0]
na_feature_count = len(na_columns)

print(f'{na_feature_count} features have NA values.')
print(na_columns)

# Columns with NA values represent questions asked to only a subset of participants, so drop them all
non_voters.drop(columns=na_columns, inplace=True)

In [None]:
# Replace the remaining -1 missing-value codes with NaN so that pandas' missing-data
# tools (isna, fillna) recognize them in the analysis below
non_voters = non_voters.replace(-1, np.nan)

# Use a missingno matrix to visually inspect if missing values are MCAR (missing completely at random)
# NOTE(review): the call below is commented out, so no matrix is drawn when this cell runs
# msno.matrix(non_voters)

# Since the missingness looks like it might not be MCAR, use chi-square tests of independence to check
# whether there is a significant relationship between participant demographics and missing values,
# which could indicate that there is additional information in the missing values.
#
# chi2_contingency expects OBSERVED FREQUENCIES (counts). Row-normalized proportions shrink the apparent
# sample size to the number of rows, which makes nearly every test look non-significant. The tests
# therefore run on raw counts; proportions are computed separately and used for display only.

# Significance level for interpreting p-values
alpha = 0.05

# Choose one variable pair to test
q2_1_missing = non_voters['Q2_1'].isna()
contingency_table = pd.crosstab(non_voters['educ'], q2_1_missing)
proportions = pd.crosstab(non_voters['educ'], q2_1_missing, normalize='index')
print(proportions.rename(columns={False: "Answered", True: "Did Not Answer"}))
chi2, p, dof, expected = chi2_contingency(contingency_table)
print(f'chi statistic: {chi2}')
print(f'p-value: {p}')

# Test the other variables
# NOTE(review): many tests are run at alpha = 0.05 without a multiple-comparison correction, so a few
# significant results are expected by chance alone; consider a correction before drawing conclusions.
questions = non_voters.filter(like='Q')
significant_count = 0
for q in questions.columns:
    is_missing = non_voters[q].isna()
    # With no variation in missingness (all answered or none answered) there is nothing to test
    if is_missing.nunique() < 2:
        continue
    for d in voter_demographics.columns:
        # Contingency table of counts: demographic group x (answered / did not answer)
        contingency_table = pd.crosstab(non_voters[d], is_missing)

        # Perform chi-square test
        chi2, p, dof, expected = chi2_contingency(contingency_table)
        if p <= alpha:
            print(f'There is a relationship between the {d} and {q} variables.')
            significant_count += 1

if not significant_count:
    print('There was no significant relationship found between demographic variables and missing values.')

# Perform mode imputation on the remaining missing values. This assumes the values are MCAR, i.e. that
# the chi-square tests above found no significant relationship between demographics and missingness.
for column in non_voters.columns:
    if non_voters[column].isna().any():
        modes = non_voters[column].mode()
        # A column that is entirely NaN has no mode; leave it untouched instead of failing
        if modes.empty:
            continue
        # Assign the result back to the DataFrame: calling fillna(inplace=True) on the selected column
        # is chained assignment, which is deprecated and does not update the DataFrame under
        # pandas copy-on-write.
        non_voters[column] = non_voters[column].fillna(modes.iloc[0])



### Training the model ###

In [None]:
# Initialize the h2o cluster
h2o.init()

# Convert the cleaned pandas DataFrame into an h2o Frame
non_voters_h2o = h2o.H2OFrame(non_voters)

# Output column the model is trained to predict
response_column = 'voter_category'

# Input columns: everything except the respondent id, the survey weight, and the output column
excluded_columns = ('RespId', 'weight', 'voter_category')
training_columns = [name for name in non_voters.columns if name not in excluded_columns]
print(training_columns)

# Split the cleaned dataset: 80% for training, 20% for testing (seeded so the split is repeatable)
train, test = non_voters_h2o.split_frame(ratios=[0.8], seed=42)

# Check the dimensions of the splits
print("Training set shape:", train.shape)
print("Test set shape:", test.shape)

# Define model: a random forest of 50 trees with a maximum depth of 20 and 10-fold cross-validation.
# The seed is fixed, like the seed of the train/test split, so that repeated runs build the same
# forest and report comparable metrics.
model = h2o.estimators.H2ORandomForestEstimator(ntrees=50, max_depth=20, nfolds=10, seed=42)

# Train model, weighting each participant by the survey weight column
model.train(x=training_columns, y=response_column, training_frame=train, weights_column='weight')

# Model performance on the held-out test set
performance = model.model_performance(test_data=test)

print(performance)