In [11]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
from matplotlib.ticker import FuncFormatter
import os

'''
The N-Weight Index is a candidate's aggregate % support across polls, with each poll weighted by its sample size.
The Equal-Weight Index is a candidate's aggregate % support across polls, with every poll weighted equally.
'''

# Data engineering
# Source: https://projects.fivethirtyeight.com/polls/president-general/2024/national/

# Columns that together identify one poll: pollster, field period and sample size.
index = ['pollster_rating_name', 'start_date', 'end_date', 'sample_size']
cols = index + ['candidate_name', 'pct']
incumbent = 'Kamala Harris'
challenger = 'Donald Trump'
# Sort rank of the two tracked candidates: incumbent first, challenger second.
candidates = {incumbent: 0, challenger: 1}

path = os.path.join('data', 'president_polls.csv')
data = pd.read_csv(path, usecols=cols)

# Order the rows so that the incumbent's rows come first and the challenger's
# second; every other candidate maps to NaN and is sorted to the end.
data = data.sort_values(by='candidate_name', key=lambda x: x.map(candidates))


def _first_two(values):
    """Return the first two entries of *values* as a tuple, padding with None."""
    second = values.iloc[1] if len(values) > 1 else None
    return (values.iloc[0], second)


# Collapse every poll to one row holding (first, second) tuples of the
# candidate names and of their percentages, in the sorted order above.
data = data.groupby(index, as_index=False).agg({
            'candidate_name': _first_two,
            'pct': _first_two})

# Drop polls that listed a single candidate (their tuples were padded with None).
data = data[data['candidate_name'].apply(lambda x: None not in x) & data['pct'].apply(lambda x: None not in x)]

# Unpack the tuples into flat columns; pct_1 belongs to candidate_0 and
# pct_2 to candidate_1.
data = data.assign(
    candidate_0 = data['candidate_name'].apply(lambda x: x[0]),
    candidate_1 = data['candidate_name'].apply(lambda x: x[1]),
    pct_1 = data['pct'].apply(lambda x: x[0]),
    pct_2 = data['pct'].apply(lambda x: x[1]))

data = data.drop(columns=['candidate_name', 'pct'])

# Keep only polls whose two leading rows really are the incumbent and the
# challenger. Merely requiring the two names to differ let polls without one
# of them through (e.g. the challenger against a third candidate), and the
# other candidate's percentage was then counted as pct_1 / pct_2.
# NOTE(review): a poll whose group contains the incumbent more than once
# (presumably several questions in one poll) still has the incumbent in both
# leading slots and is dropped here, as before - confirm that is intended.
data = data[(data['candidate_0'] == incumbent) & (data['candidate_1'] == challenger)]

# Index computing

# Parse both date columns; format='mixed' infers the format of each value
# individually.
data[['start_date', 'end_date']] = data[['start_date', 'end_date']].apply(pd.to_datetime, format='mixed')
data = data.sort_values(by='start_date')

# The series starts on a fixed date (not at the earliest poll) and runs through
# the latest poll end date. The maximum is taken explicitly: rows are sorted by
# start date, so the last row does not necessarily hold the latest end date.
start = pd.Timestamp('2024-07-21')
end = data['end_date'].max()
# Without any polls `end` is NaT, which pd.date_range rejects; use no dates then.
dates = [] if pd.isna(end) else pd.date_range(start=start, end=end).tolist()

index1_List, index2_List, dateList = [], [], []

for date in dates:
    # Polls that were in the field on this date (both endpoints inclusive).
    in_field = (data['start_date'] <= date) & (data['end_date'] >= date)
    active = data[in_field]
    weights = active['sample_size']
    N = weights.sum()
    if N == 0:
        # No respondents cover this date, so no index value is recorded for it.
        continue
    dateList.append(date)
    # Each entry is (sample-size-weighted mean, unweighted mean) of the support.
    index1_List.append(((active['pct_1'] * weights).sum() / N, active['pct_1'].mean()))
    index2_List.append(((active['pct_2'] * weights).sum() / N, active['pct_2'].mean()))

x = dateList
# Unpacking zip(*[]) would fail, so fall back to empty series when no date had polls.
y1, z1 = zip(*index1_List) if index1_List else ((), ())
y2, z2 = zip(*index2_List) if index2_List else ((), ())

# Storing results

# One row per date: the 'nw_*' columns hold the N-Weight Index and the 'ew_*'
# columns the Equal-Weight Index, for the incumbent and the challenger.
result = pd.DataFrame(
    {
        'date': x,
        'nw_incumbent': y1,
        'ew_incumbent': z1,
        'nw_challenger': y2,
        'ew_challenger': z2,
    }
)
result.to_csv(path_or_buf='indices.csv')

# Plotting results

# Two vertically stacked panels sharing the date axis: one per index.
fig, (ax1, ax2) = plt.subplots(2, figsize=(10, 10), sharex=True)

axs = (ax1, ax2)
plots = ('N-Weight Index', 'Equal-Weight Index')
# Campaign milestones used as x-axis tick positions, with their labels.
x_ticks = [
    pd.Timestamp(day)
    for day in ('2024-07-21', '2024-08-19', '2024-09-10', '2024-11-05')
]
x_labels = ['Post Biden', 'Democratic Convention', 'Second Debate', 'Election Day']


def _percent_label(value, pos):
    """Format a y-axis tick value as a whole-number percentage."""
    return f'{value:.0f}%'


# Shared cosmetics for both panels.
for ax, title in zip(axs, plots):
    # NOTE(review): the explicit tick labels passed to set_xticks below
    # presumably take precedence over this date formatter - confirm.
    ax.xaxis.set_major_formatter(DateFormatter('%m-%d-%Y'))
    ax.yaxis.set_major_formatter(FuncFormatter(_percent_label))
    ax.grid(axis='y', ls='--')
    ax.set_title(title, fontweight='bold')
    ax.set_xticks(ticks=x_ticks, labels=x_labels)
    ax.set_facecolor('#f0f0f0')

# Legend entries use each candidate's last name.
incumbent_label = incumbent.split()[-1]
challenger_label = challenger.split()[-1]

# Top panel: sample-size-weighted index.
ax1.plot(x, y1, c='blue', label=incumbent_label)
ax1.plot(x, y2, c='red', label=challenger_label)
ax1.legend(loc='best')

# Bottom panel: equal-weight index.
ax2.plot(x, z1, c='blue', label=incumbent_label)
ax2.plot(x, z2, c='red', label=challenger_label)
ax2.legend(loc='best')

fig.savefig('indices.png')
plt.close(fig)