In [1]:
import pandas as pd

In [3]:
# Load the dataset
df = pd.read_csv("../../SharedData/dataset-generation/1976-2020-senate.csv")

# Filter to general elections only and valid total vote counts
df_gen = df[(df['stage'] == 'gen') & (df['totalvotes'].notna())]

# Sum Republican votes by year and state
rep_votes = df_gen[df_gen['party_simplified'] == 'REPUBLICAN'].groupby(
    ['year', 'state_po']
)['candidatevotes'].sum().reset_index(name='republican_votes')

# Sum total votes by year and state
total_votes = df_gen.groupby(
    ['year', 'state_po']
)['totalvotes'].sum().reset_index(name='total_votes')

# Merge and calculate the percentage
merged = pd.merge(rep_votes, total_votes, on=['year', 'state_po'])
merged['republican_vote_pct'] = (merged['republican_votes'] / merged['total_votes']) * 100
merged['republican_vote_pct'] = merged['republican_vote_pct'].round(2)

merged = merged.rename(columns={'state_po':'state'})

merged['date'] = pd.to_datetime(merged['year'].astype(str) + '-12-01')
merged = merged.drop(columns='year')

# Display the result
print(merged.head())

  state  republican_votes  total_votes  republican_vote_pct       date
0    AZ            321236      3706050                 8.67 1976-12-01
1    CA           3748973     37352930                10.04 1976-12-01
2    CT            785683      5446664                14.43 1976-12-01
3    DE            125454      1123975                11.16 1976-12-01
4    FL           1057886      8572602                12.34 1976-12-01


In [4]:
# export new_df to a .csv
merged.to_csv('../../SharedData/dataset-generation-final/senate_votes.csv',index=False)