Download the kaggle dataset for Polymarket Election results by state.

Download public trade data from Kalshi for the whole nation.

In [76]:
import pandas as pd
import numpy as np
import os, json, requests
from tqdm import tqdm
import datetime

### GPT generated following state electoral votes for weighting
state_electoral_votes = {
    'NE': 5, 'RI': 4, 'NH': 4, 'NV': 6, 'OR': 8, 'WI': 10, 'GA': 16, 'AZ': 11, 'MS': 6,
    'NC': 16, 'ID': 4, 'IN': 11, 'MT': 4, 'ND': 3, 'MA': 11, 'WV': 4, 'TX': 40, 'DE': 3,
    'CO': 10, 'VA': 13, 'OK': 7, 'VT': 3, 'AK': 3, 'KY': 8, 'HI': 4, 'MO': 10, 'FL': 30,
    'UT': 6, 'NJ': 14, 'TN': 11, 'SD': 3, 'MD': 10, 'PA': 19, 'CA': 54, 'OH': 17, 'MI': 15,
    'NY': 28, 'CT': 7, 'SC': 9, 'WY': 3, 'IL': 19, 'WA': 12, 'MN': 10, 'IA': 6, 'AR': 6,
    'LA': 8, 'ME': 4, 'AL': 9, 'KS': 6, 'NM': 5, 'DC': 3
}

In [5]:
### state data; only 10-04 and 10-05 overlap with the Kalshi data
directory = 'archive/csv_day'
file_names = os.listdir(directory)

dataframes = []
for file_name in file_names:
    file_path = os.path.join(directory, file_name)
    df = pd.read_csv(file_path)
    state = file_name.split('_')[0]
    df['state'] = state
    df['date'] = pd.to_datetime(df['Date (UTC)']).dt.strftime("%Y-%m-%d")
    dataframes.append(df)

# Now you have a list of dataframes, one for each file in the directory
df_state = pd.concat(dataframes)
states = df_state['state'].unique()
dates = df_state['Date (UTC)'].unique()

In [68]:
states

array(['NE', 'RI', 'NH', 'NV', 'OR', 'WI', 'GA', 'AZ', 'MS', 'NC', 'ID',
       'IN', 'MT', 'ND', 'MA', 'WV', 'TX', 'DE', 'CO', 'VA', 'OK', 'VT',
       'AK', 'KY', 'HI', 'MO', 'FL', 'UT', 'NJ', 'TN', 'SD', 'MD', 'PA',
       'CA', 'OH', 'MI', 'NY', 'CT', 'SC', 'WY', 'IL', 'WA', 'MN', 'IA',
       'AR', 'LA', 'ME', 'AL', 'KS', 'NM'], dtype=object)

In [6]:
### national data
def get_date(date):
    URL = f"https://kalshi-public-docs.s3.amazonaws.com/reporting/market_data_{date}.json"
    response = requests.get(URL)
    jsondata = response.json()
    data = json.dumps(jsondata)
    df = pd.DataFrame(eval(data))
    # df = df[df['ticker_name'] == 'US National']
    df = df[df.ticker_name=='PRES-2024-DJT']
    return df
dates_national = sorted(list(set([i.split(' ')[0] for i in dates])))[-2:]
for i,s in enumerate(dates_national):
    month,day,year = s.split('-')
    dates_national[i] = year + '-' + month + '-' + day
dfs = []
for date in tqdm(dates_national):
    df = get_date(date)
    df['Date (UTC)'] = date
    dfs.append(df)
df_national = pd.concat(dfs)

100%|██████████| 2/2 [00:02<00:00,  1.17s/it]


In [7]:
df_national

Unnamed: 0,date,ticker_name,report_ticker,payout_type,open_interest,daily_volume,block_volume,high,low,status,Date (UTC)
15062,2024-10-04,PRES-2024-DJT,PRES,Binary Option,151640.0,257284,0,51,49,active,2024-10-04
13109,2024-10-05,PRES-2024-DJT,PRES,Binary Option,167088.0,19956,0,50,49,active,2024-10-05


In [54]:
df_state[(df_state['state'] == 'NE') & (df_state['date'] == '2024-10-05')]

Unnamed: 0,Date (UTC),Timestamp (UTC),Republican,Democrat,Other,state,date
190,10-05-2024 00:00,1728086402,0.9785,0.0135,0.0035,NE,2024-10-05
191,10-05-2024 15:05,1728140702,0.9775,0.0135,0.0035,NE,2024-10-05


In [26]:
date = dates_national[0]

In [55]:
### binomial approximation
votes_rep = []
votes_dem = []
weights = []
for state in states:
    df = df_state[df_state['state'] == state]
    df = df[df['date'] == date]
    if len(df) == 0:
        continue
    ### whether to throw away the third party votes
    p_rep = df['Republican'].values[0]
    p_dem = df['Democrat'].values[0]
    p_rep, p_dem = p_rep / (p_rep + p_dem), p_dem / (p_rep + p_dem)
    votes_rep.append(p_rep)
    votes_dem.append(p_dem)
    weights.append(state_electoral_votes[state])
    # price = df['Price'].values[0]
    # votes.append(price)
    # weights.append(state_electoral_votes[state])
votes_rep = np.array(votes_rep)
votes_dem = np.array(votes_dem)
weights = np.array(weights)
weights = weights / np.sum(weights)

In [92]:
### MC simulation
votes_rep = {}
votes_dem = {}
for state in states:
    df = df_state[df_state['state'] == state]
    df = df[df['date'] == date]
    if len(df) == 0:
        continue
    ### whether to throw away the third party votes
    p_rep = df['Republican'].values[0]
    p_dem = df['Democrat'].values[0]
    p_rep, p_dem = p_rep / (p_rep + p_dem), p_dem / (p_rep + p_dem)
    votes_rep[state] = p_rep
    votes_dem[state] = p_dem
votes_dem['DC'] = 1
votes_rep['DC'] = 0

# # Define the probability of winning each state (this is just an example list)
# win_probabilities = {
#     'NE': 0.8, 'RI': 0.6, 'NH': 0.5, 'NV': 0.55, 'OR': 0.7, 'WI': 0.52, 'GA': 0.48, 'AZ': 0.51,
#     'MS': 0.9, 'NC': 0.49, 'ID': 0.95, 'IN': 0.75, 'MT': 0.85, 'ND': 0.88, 'MA': 0.99, 'WV': 0.92,
#     'TX': 0.45, 'DE': 0.98, 'CO': 0.6, 'VA': 0.58, 'OK': 0.88, 'VT': 0.99, 'AK': 0.82, 'KY': 0.9,
#     'HI': 0.98, 'MO': 0.75, 'FL': 0.47, 'UT': 0.9, 'NJ': 0.95, 'TN': 0.85, 'SD': 0.92, 'MD': 0.98,
#     'PA': 0.5, 'CA': 0.99, 'OH': 0.48, 'MI': 0.52, 'NY': 0.99, 'CT': 0.98, 'SC': 0.48, 'WY': 0.95,
#     'IL': 0.99, 'WA': 0.97, 'MN': 0.55, 'IA': 0.48, 'AR': 0.9, 'LA': 0.9, 'ME': 0.55, 'AL': 0.9,
#     'KS': 0.88, 'NM': 0.58
# }
def simu_prob(win_probabilities):
    # Define the number of simulations
    num_simulations = 100000

    # Initialize a counter for wins
    win_counter = 0

    # Run the Monte Carlo simulation
    for _ in tqdm(range(num_simulations)):
        total_electoral_votes = 0
        # For each state, simulate the win/loss based on the probability of winning
        for state, prob in win_probabilities.items():
            if np.random.rand() < prob:
                total_electoral_votes += state_electoral_votes[state]
        # Check if the candidate won the election (needs at least 270 electoral votes)
        if total_electoral_votes >= 270:
            win_counter += 1

    # Calculate the probability of winning based on the simulations
    win_probability = win_counter / num_simulations

    # Print the result
    print(f"Estimated probability of winning: {win_probability * 100:.2f}%")
    return win_probability

In [88]:
np.array(list(votes_rep.values())) + np.array(list(votes_dem.values()))

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [89]:
sum(state_electoral_votes.values())

538

In [99]:
print('Rep:')
p1 = simu_prob(votes_rep)
print('Dem:')
p2 = simu_prob(votes_dem)
print(f'Tie with probability: {100*(1 - p1 - p2):.2f}%')

Rep:


100%|██████████| 100000/100000 [00:00<00:00, 100305.75it/s]


Estimated probability of winning: 41.97%
Dem:


100%|██████████| 100000/100000 [00:00<00:00, 103370.51it/s]

Estimated probability of winning: 56.44%
Tie with probability: 1.59%





In [31]:
np.sum(votes_rep * weights), np.sum(votes_dem * weights)

(0.48993457943925234, 0.5077177570093458)

In [12]:
votes

[0.9785,
 0.017,
 0.115,
 0.475,
 0.0295,
 0.435,
 0.63,
 0.635,
 0.978,
 0.575,
 0.9795,
 0.9745,
 0.977,
 0.982,
 0.0165,
 0.9815,
 0.885,
 0.024,
 0.031,
 0.125,
 0.9885,
 0.014,
 0.9,
 0.9775,
 0.0185,
 0.983,
 0.845,
 0.974,
 0.0315,
 0.976,
 0.9855,
 0.0095,
 0.515,
 0.0175,
 0.925,
 0.415,
 0.0265,
 0.0145,
 0.9745,
 0.987,
 0.021,
 0.0155,
 0.075,
 0.915,
 0.9875,
 0.9765,
 0.105,
 0.9785,
 0.978,
 0.08]

In [13]:
weights

[5,
 4,
 4,
 6,
 8,
 10,
 16,
 11,
 6,
 16,
 4,
 11,
 4,
 3,
 11,
 4,
 40,
 3,
 10,
 13,
 7,
 3,
 3,
 8,
 4,
 10,
 30,
 6,
 14,
 11,
 3,
 10,
 19,
 54,
 17,
 15,
 28,
 7,
 9,
 3,
 19,
 12,
 10,
 6,
 6,
 8,
 4,
 9,
 6,
 5]

In [10]:
df

Unnamed: 0,Date (UTC),Timestamp (UTC),Republican,Democrat,Other,state,date
189,10-04-2024 00:00,1728000002,0.08,0.915,0.0025,NM,2024-10-04


In [90]:
df_state['state'].unique()

array(['NE', 'RI', 'NH', 'NV', 'OR', 'WI', 'GA', 'AZ', 'MS', 'NC', 'ID',
       'IN', 'MT', 'ND', 'MA', 'WV', 'TX', 'DE', 'CO', 'VA', 'OK', 'VT',
       'AK', 'KY', 'HI', 'MO', 'FL', 'UT', 'NJ', 'TN', 'SD', 'MD', 'PA',
       'CA', 'OH', 'MI', 'NY', 'CT', 'SC', 'WY', 'IL', 'WA', 'MN', 'IA',
       'AR', 'LA', 'ME', 'AL', 'KS', 'NM'], dtype=object)

In [33]:
dates

array(['03-29-2024 00:00', '03-30-2024 00:00', '03-31-2024 00:00',
       '04-01-2024 00:00', '04-02-2024 00:00', '04-03-2024 00:00',
       '04-04-2024 00:00', '04-05-2024 00:00', '04-06-2024 00:00',
       '04-07-2024 00:00', '04-08-2024 00:00', '04-09-2024 00:00',
       '04-10-2024 00:00', '04-11-2024 00:00', '04-12-2024 00:00',
       '04-13-2024 00:00', '04-14-2024 00:00', '04-15-2024 00:00',
       '04-16-2024 00:00', '04-17-2024 00:00', '04-18-2024 00:00',
       '04-19-2024 00:00', '04-20-2024 00:00', '04-21-2024 00:00',
       '04-22-2024 00:00', '04-23-2024 00:00', '04-24-2024 00:00',
       '04-25-2024 00:00', '04-26-2024 00:00', '04-27-2024 00:00',
       '04-28-2024 00:00', '04-29-2024 00:00', '04-30-2024 00:00',
       '05-01-2024 00:00', '05-02-2024 00:00', '05-03-2024 00:00',
       '05-04-2024 00:00', '05-05-2024 00:00', '05-06-2024 00:00',
       '05-07-2024 00:00', '05-08-2024 00:00', '05-09-2024 00:00',
       '05-10-2024 00:00', '05-11-2024 00:00', '05-12-2024 00:

In [28]:
df.head()

Unnamed: 0,date,ticker_name,report_ticker,payout_type,open_interest,daily_volume,block_volume,high,low,status
0,2022-05-15,TURKEU-22JUN29,TURKEU,Binary Option,9994.0,0,0,1,1,finalized
1,2022-05-15,RECSS-22JUL,RECSS,Binary Option,68267.0,965,0,94,91,finalized
2,2022-05-15,CPTPP-23JAN4,CPTPP,Binary Option,2392.0,0,0,5,5,active
3,2022-05-15,MOON-25,MOON,Binary Option,5834.0,150,0,17,16,active
4,2022-05-15,MANCHIN-22JUL05,MANCHIN,Binary Option,7356.0,0,0,4,4,finalized


In [27]:
[i for i in df['ticker_name'].unique() if 'DJT' in i]

[]

In [17]:
data = []
for state in states:
    for date in dates:
        state_df = df[df['state'] == state]
        date_df = state_df[state_df['Date (UTC)'] == date]
        if len(date_df) == 0:
            continue
        data.append({
            'state': state,
            'date': date,
            'mean': date_df['Mean'].mean(),
            'median': date_df['Median'].mean(),
            'std': date_df['Std'].mean(),
        })

NameError: name 'data' is not defined

In [11]:
df.head()

Unnamed: 0,Date (UTC),Timestamp (UTC),Republican,Democrat,Other,state
0,03-29-2024 00:00,1711670402,0.915,0.08,0.011,NE
1,03-30-2024 00:00,1711756802,0.91,0.085,0.009,NE
2,03-31-2024 00:00,1711843203,0.915,0.085,0.0085,NE
3,04-01-2024 00:00,1711929602,0.915,0.085,0.0085,NE
4,04-02-2024 00:00,1712016003,0.915,0.085,0.0065,NE


In [6]:
dataframes[0].head()

Unnamed: 0,Date (UTC),Timestamp (UTC),Republican,Democrat,Other
0,03-29-2024 00:00,1711670402,0.915,0.08,0.011
1,03-30-2024 00:00,1711756802,0.91,0.085,0.009
2,03-31-2024 00:00,1711843203,0.915,0.085,0.0085
3,04-01-2024 00:00,1711929602,0.915,0.085,0.0085
4,04-02-2024 00:00,1712016003,0.915,0.085,0.0065


In [7]:
file_name

'NM_daily.csv'