In [5]:
import pandas as pd

In [6]:
# Load the dataset
df = pd.read_csv("../../SharedData/dataset-generation/senate-1976-2020.csv")

# Filter to general elections only and valid total vote counts
df_gen = df[(df['stage'] == 'gen') & (df['totalvotes'].notna())]

# Sum Republican votes by year and state
rep_votes = df_gen[df_gen['party_simplified'] == 'REPUBLICAN'].groupby(
    ['year', 'state_po']
)['candidatevotes'].sum().reset_index(name='republican_votes')

# Sum total votes by year and state
total_votes = df_gen.groupby(
    ['year', 'state_po']
)['totalvotes'].sum().reset_index(name='total_votes')

# Merge and calculate the percentage
merged = pd.merge(rep_votes, total_votes, on=['year', 'state_po'])
merged['republican_vote_pct'] = (merged['republican_votes'] / merged['total_votes']) * 100
merged['republican_vote_pct'] = merged['republican_vote_pct'].round(2)

merged = merged.rename(columns={'state_po':'state'})

merged['date'] = pd.to_datetime(merged['year'].astype(str) + '-12-01')
merged = merged[['state', 'date', 'republican_vote_pct']]

# Display the result
print(merged.head())

  state       date  republican_vote_pct
0    AZ 1976-12-01                 8.67
1    CA 1976-12-01                10.04
2    CT 1976-12-01                14.43
3    DE 1976-12-01                11.16
4    FL 1976-12-01                12.34


In [7]:
# export new_df to a .csv
merged.to_csv('../../SharedData/dataset-generation-final/republican-votes-by-state.csv',index=False)