Here we run a Monte Carlo simulation that uses the provincial-level win probabilities to produce national-level results.

National-level outputs (per scenario and forecast date):
- mean expected vote share per party
- probability of winning per party

The results of this notebook are written to `../results/` as `national_forecast_<scenario>.csv`.

In [1]:
import os
import pandas as pd
import numpy as np
import random
from matplotlib import pyplot as plt

In [2]:
# load the provincial results

# NOTE(review): both paths below are absolute and machine-specific, while the
# outputs further down are written relative to the working directory
# ("../results/"). Consider deriving all of them from one configurable root.
directory = '/Users/kbeebe/Desktop/economist_forecasting/election_forecaster/election_forecaster/results/'

# Bind every file whose name starts with 'provincial' to a notebook-level
# variable named after the part of the file name before the first '.'
# (later cells reference e.g. `provincial_forecast_A` by name).
# Assigning through globals() instead of exec() avoids building Python source
# out of file names and paths, which breaks on quotes/backslashes in the path.
filelist = [file for file in os.listdir(directory) if file.startswith('provincial')]
for file in filelist:
    globals()[file.split('.')[0]] = pd.read_csv(os.path.join(directory, file))

# load dataland state electoral votes
electoral_college_votes = pd.read_csv('/Users/kbeebe/Desktop/economist_forecasting/election_forecaster/raw_data/dataland_demographics.csv')[['province', 'electoral_college_votes']]
ecv = np.array(electoral_college_votes.electoral_college_votes) # array for calculations later

# number of Monte Carlo draws simulated per forecast date
# NOTE(review): no random seed is set anywhere in this notebook, so the
# simulated win probabilities differ slightly between runs.
draws = 1000

In [3]:
# distinct forecast dates available in each scenario's provincial results
dates_sc_A = provincial_forecast_A['date'].unique()
dates_sc_B = provincial_forecast_B['date'].unique()
dates_sc_C = provincial_forecast_C['date'].unique()
dates_sc_D = provincial_forecast_D['date'].unique()
dates_sc_E = provincial_forecast_E['date'].unique()

In [4]:
def calculate_mean_vote_share(df_input, dates):
    """Average every ``*mean_vote_share*`` column over all rows of each date.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Provincial results with a ``date`` column and one or more columns
        whose name contains ``'mean_vote_share'``.
    dates : iterable
        Dates to report, in the order they should appear in the output.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``dates`` with a leading ``date`` column followed
        by the per-date mean of each vote-share column. The index is a plain
        0..n-1 range. An empty ``dates`` yields an empty frame with the same
        columns instead of raising.
    """
    # select the vote-share columns once; they do not depend on the date
    share_cols = df_input.columns[df_input.columns.str.contains('mean_vote_share')]

    records = []
    for date in dates:
        # NOTE(review): this is a simple unweighted mean over the rows of the
        # date (presumably one row per province) -- confirm that provinces are
        # not meant to be weighted, e.g. by population.
        means = df_input.loc[df_input['date'] == date, share_cols].mean()
        records.append({'date': date, **means.to_dict()})

    # build the frame in one go: avoids per-date frames, duplicate index
    # labels and the pd.concat([]) failure on empty input
    return pd.DataFrame(records, columns=['date', *share_cols])

def run_prob_simulation(df_input, dates, n_draws=None, province_votes=None, rng=None):
    """Monte Carlo estimate of each party's national win probability per date.

    For every date, each row's win probabilities are used to draw a winning
    party for that row in every simulation run; the row's electoral votes go
    to the drawn party, and the party with the most votes wins the run.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Provincial results with a ``date`` column and one column per party
        whose name contains ``'win_probability'``.
    dates : iterable
        Dates to simulate, in output order.
    n_draws : int, optional
        Simulation runs per date. Defaults to the notebook-level ``draws``.
    province_votes : array-like, optional
        Electoral votes per row of a single date, in the same row order as
        ``df_input``. Defaults to the notebook-level ``ecv``.
    rng : numpy.random.Generator, optional
        Source of randomness (pass ``np.random.default_rng(seed)`` for
        reproducible results). Defaults to NumPy's global random state.

    Returns
    -------
    pandas.DataFrame
        One row per date: ``date`` followed by the share of runs won by each
        party, under the original ``*win_probability*`` column names.

    Raises
    ------
    ValueError
        If there is no win-probability column, or the number of rows for a
        date does not match the number of electoral-vote entries.
    """
    n_draws = draws if n_draws is None else n_draws
    votes = np.asarray(ecv if province_votes is None else province_votes)
    sampler = np.random if rng is None else rng

    prob_cols = df_input.columns[df_input.columns.str.contains('win_probability')]
    n_parties = len(prob_cols)
    if n_parties == 0:
        raise ValueError("df_input has no column containing 'win_probability'")

    records = []
    for date in dates:
        probs = df_input.loc[df_input['date'] == date, prob_cols].to_numpy(dtype=float)
        n_provinces = probs.shape[0]
        # NOTE(review): rows are matched to electoral votes purely by position;
        # this assumes both tables list provinces in the same order -- confirm.
        if votes.shape != (n_provinces,):
            raise ValueError(
                "date %r has %d rows but %d electoral-vote entries were given"
                % (date, n_provinces, votes.size)
            )

        # Cumulative probabilities per row (missing values count as zero),
        # rescaled so the last entry is exactly 1. Without the rescaling a
        # draw above a row total < 1 matched no party and argmax silently
        # credited the row to the first party.
        cumulative = np.nancumsum(probs, axis=1)
        row_total = cumulative[:, -1:]
        # rows with no positive probability mass are left unscaled; they still
        # fall back to the first party, as before
        cumulative = cumulative / np.where(row_total > 0, row_total, 1.0)

        # one uniform draw in [0, 1) per row and simulation run
        rolls = sampler.random((n_provinces, n_draws))

        # (rows, runs): first party whose cumulative probability exceeds the
        # draw, i.e. party k wins when cumulative[k-1] <= roll < cumulative[k]
        province_winner = (
            cumulative[:, np.newaxis, :] > rolls[:, :, np.newaxis]
        ).argmax(axis=2)

        # (parties, runs): electoral votes collected by each party in each run
        party_votes = np.stack([
            np.where(province_winner == party, votes[:, np.newaxis], 0).sum(axis=0)
            for party in range(n_parties)
        ])

        # ties go to the party listed first (argmax returns the first maximum)
        national_winner = party_votes.argmax(axis=0)

        # minlength keeps a slot for parties that never win a run
        win_probabilities = np.bincount(national_winner, minlength=n_parties) / n_draws

        records.append({'date': date, **dict(zip(prob_cols, win_probabilities))})

    return pd.DataFrame(records, columns=['date', *prob_cols])
    

In [5]:
# Scenario A: national mean vote shares and simulated win probabilities, joined on date
df_national_mean_vote_shares_A = calculate_mean_vote_share(df_input=provincial_forecast_A, dates=dates_sc_A)
df_national_win_probs_A = run_prob_simulation(df_input=provincial_forecast_A, dates=dates_sc_A)
df_national_results_A = pd.merge(
    df_national_mean_vote_shares_A,
    df_national_win_probs_A,
    on='date',
)
# persist the combined national forecast for this scenario
df_national_results_A.to_csv("../results/national_forecast_A.csv", index=False)


In [6]:
# Scenario B: national mean vote shares and simulated win probabilities, joined on date
df_national_mean_vote_shares_B = calculate_mean_vote_share(df_input=provincial_forecast_B, dates=dates_sc_B)
df_national_win_probs_B = run_prob_simulation(df_input=provincial_forecast_B, dates=dates_sc_B)
df_national_results_B = pd.merge(
    df_national_mean_vote_shares_B,
    df_national_win_probs_B,
    on='date',
)
# persist the combined national forecast for this scenario
df_national_results_B.to_csv("../results/national_forecast_B.csv", index=False)

In [7]:
# Scenario C: national mean vote shares and simulated win probabilities, joined on date
df_national_mean_vote_shares_C = calculate_mean_vote_share(df_input=provincial_forecast_C, dates=dates_sc_C)
df_national_win_probs_C = run_prob_simulation(df_input=provincial_forecast_C, dates=dates_sc_C)
df_national_results_C = pd.merge(
    df_national_mean_vote_shares_C,
    df_national_win_probs_C,
    on='date',
)
# persist the combined national forecast for this scenario
df_national_results_C.to_csv("../results/national_forecast_C.csv", index=False)

In [8]:
# Scenario D: national mean vote shares and simulated win probabilities, joined on date
df_national_mean_vote_shares_D = calculate_mean_vote_share(df_input=provincial_forecast_D, dates=dates_sc_D)
df_national_win_probs_D = run_prob_simulation(df_input=provincial_forecast_D, dates=dates_sc_D)
df_national_results_D = pd.merge(
    df_national_mean_vote_shares_D,
    df_national_win_probs_D,
    on='date',
)
# persist the combined national forecast for this scenario
df_national_results_D.to_csv("../results/national_forecast_D.csv", index=False)

In [9]:
# Scenario E: national mean vote shares and simulated win probabilities, joined on date
df_national_mean_vote_shares_E = calculate_mean_vote_share(df_input=provincial_forecast_E, dates=dates_sc_E)
df_national_win_probs_E = run_prob_simulation(df_input=provincial_forecast_E, dates=dates_sc_E)
df_national_results_E = pd.merge(
    df_national_mean_vote_shares_E,
    df_national_win_probs_E,
    on='date',
)
# persist the combined national forecast for this scenario
df_national_results_E.to_csv("../results/national_forecast_E.csv", index=False)