In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Relative path to the county-level presidential returns CSV.
pres_url = "../../data/countypres_2000-2020.csv"

In [None]:
# Load the presidential returns into a DataFrame.
pres = pd.read_csv(pres_url)

In [None]:
# Relative path to the county geography CSV.
geo_url = "../../data/geo.csv"

In [None]:
# Load the county geography table into a DataFrame.
geo = pd.read_csv(geo_url)

In [None]:
# Preview the first two rows of the raw geography table.
geo.head(2)

In [None]:
# Preview the first two rows of the raw presidential returns.
pres.head(2)

In [None]:
#pres cleanup
# Rename the join key to "FIPS", keep only the five states under study, and
# drop every row that has a missing value in any column. The steps are chained
# so nothing is mutated in place on a filtered slice (which triggers
# SettingWithCopyWarning).
pres = (
    pres.rename(columns={"county_fips": "FIPS"})
    .loc[lambda frame: frame["state_po"].isin(["TX", "FL", "CA", "IL", "MI"])]
    .dropna()
)
# Both names refer to the same cleaned frame, as before.
pres_cleaned = pres

In [None]:
# Preview the first two rows of the cleaned presidential returns.
pres_cleaned.head(2)

In [None]:
#geo cleanup
def _parse_signed_degrees(text):
    """Convert a coordinate string to a signed float.

    The first character is treated as the sign marker (an en dash "–" means
    negative) and the last two characters are discarded before conversion.
    NOTE(review): presumably the trailing characters are a degree symbol plus
    one more character — confirm against the raw data.
    """
    magnitude = float(text[1:-2])
    return -magnitude if text[0] == "–" else magnitude


# Strip thousands separators, parse coordinates and FIPS codes, keep only the
# five states under study, and drop the columns that are not used downstream.
# The state filter uses geo's own "State" column for every state; the Michigan
# test previously read pres["state_po"], a different frame whose index does not
# line up with geo.
geo = (
    geo.replace(",", "", regex=True)
    .assign(
        Longitude=lambda frame: frame["Longitude"].map(_parse_signed_degrees),
        Latitude=lambda frame: frame["Latitude"].map(_parse_signed_degrees),
        FIPS=lambda frame: frame["FIPS"].map(lambda code: int("0" + str(code))),
    )
    .loc[lambda frame: frame["State"].isin(["TX", "FL", "CA", "IL", "MI"])]
    .drop(
        columns=[
            "Land Areakm²",
            "Total Areakm²",
            "Water Areakm²",
            "County Seat(s) [3]",
            "Sort [1]",
            "County [2]",
            "State",
        ]
    )
)
# Both names refer to the same cleaned frame, as before.
geo_clean = geo

In [None]:
# Preview the first two rows of the cleaned geography table.
geo_clean.head(2)

In [None]:
# Every distinct (year, FIPS) pair present in the cleaned returns.
pres_groups = list(pres_cleaned.groupby(['year', 'FIPS']).groups.keys())

In [None]:
# Column layout of the one-row-per-(year, FIPS) table.
row_cols = ["year", "FIPS", "office", "state", "dem_votes", "rep_votes"]

In [None]:
# Start from an empty frame that has the target columns.
pres_clean = pd.DataFrame(columns = row_cols)

In [None]:
# Collapse the party-level rows of each (year, FIPS) group into a single row
# holding the Democratic and Republican vote counts.
error_group = None  # set to the offending (year, FIPS) key if a lookup fails


def _first_votes_for(county_rows, party):
    """Return the first ``candidatevotes`` value whose ``party`` equals ``party``.

    Raises:
        IndexError: if ``county_rows`` contains no row for ``party``.
    """
    votes = county_rows.loc[county_rows["party"] == party, "candidatevotes"]
    if votes.empty:
        raise IndexError(f"no {party} row in this (year, FIPS) group")
    return votes.iloc[0]


# Collect plain records and build the frame once at the end:
# DataFrame.append was removed in pandas 2.0 and appending inside a loop
# re-copies the whole frame on every iteration.
county_records = []
for group in pres_groups:
    year_key, fips_key = group
    county_rows = pres_cleaned[
        (pres_cleaned["year"] == year_key) & (pres_cleaned["FIPS"] == fips_key)
    ]
    first_row = county_rows.iloc[0]
    try:
        dem_votes = _first_votes_for(county_rows, "DEMOCRAT")
        rep_votes = _first_votes_for(county_rows, "REPUBLICAN")
    except IndexError:
        # Keep the failing key around so it can be inspected after the error.
        error_group = group
        raise
    county_records.append(
        [
            first_row["year"],
            first_row["FIPS"],
            first_row["office"],
            first_row["state_po"],
            dem_votes,
            rep_votes,
        ]
    )

pres_clean = pd.DataFrame(county_records, columns=row_cols)

In [None]:
#additional pres_clean cleaning:
# Two-party total: only the Democratic and Republican counts are summed.
pres_clean["total_votes"] = pres_clean["dem_votes"] + pres_clean["rep_votes"]


In [None]:
# Create target column
# 0 when the Democratic count is strictly larger than the Republican count,
# 1 in every other case (ties included).
pres_clean["target"] = (
    ~(pres_clean["dem_votes"] > pres_clean["rep_votes"])
).astype("int64")

In [None]:
def win_percentage(row):
    """Return the winning party's share of ``total_votes`` as a percentage.

    ``row["target"] == 0`` selects ``dem_votes``; any other value selects
    ``rep_votes``. The result is rounded to two decimals.
    """
    winner_column = "dem_votes" if row["target"] == 0 else "rep_votes"
    return round(row[winner_column] / row["total_votes"] * 100, 2)

In [None]:
def win_margin(row):
    """Return how far the winning party's percentage lies above 50.

    ``row["target"] == 0`` selects ``dem_votes``; any other value selects
    ``rep_votes``. The selected count is expressed as a percentage of
    ``total_votes``, 50 is subtracted, and the result is rounded to two
    decimals.
    """
    winner_votes = row["dem_votes"] if row["target"] == 0 else row["rep_votes"]
    winner_pct = winner_votes / row["total_votes"] * 100
    return round(winner_pct - 50, 2)

In [None]:
# Row-wise: winner's percentage of the two-party total, and that percentage
# minus 50.
pres_clean["win_percentage"] = pres_clean.apply(win_percentage, axis=1)
pres_clean["win_margin"] = pres_clean.apply(win_margin, axis=1)

In [None]:
# Spot check: first two rows that are from 2016 OR from California.
# NOTE(review): `|` keeps rows matching either condition; if only 2016
# California rows were intended this should be `&` — confirm.
pres_clean[(pres_clean["year"]==2016) | (pres_clean["state"]=="CA")].head(2)

In [None]:
# Attach geography to each county-year row. The inner join keeps only FIPS
# codes present in both tables.
j = pres_clean.merge(geo_clean, on="FIPS", how="inner")

In [None]:
# (rows, columns) of the 2016 slice of the merged table.
j[j["year"]==2016].shape

In [None]:
# Export the 2020 rows of the merged table to the working directory. With the
# to_csv defaults the DataFrame index is written as the first CSV column.
j[j["year"]==2020].to_csv("pres2020.csv")

## Visualizations

#### Target encoding: 1 = Republican win, 0 = Democratic win

In [None]:
# Total Republican votes across every row of the merged table.
sum_rep = j.rep_votes.sum()

In [None]:
# Total Democratic votes across every row of the merged table.
sum_dem = j.dem_votes.sum()

In [None]:
# Combined two-party vote total.
sum_total_votes = sum_rep + sum_dem

In [None]:
def vote_inferential(df):
    """Return the Republican and Democratic shares of the two-party vote.

    Args:
        df: DataFrame with numeric ``rep_votes`` and ``dem_votes`` columns.

    Returns:
        Tuple ``(rep_vote_per, dem_vote_per)``: each party's percentage of the
        combined Republican + Democratic total, rounded to two decimals.
    """
    # Aggregate the frame that was passed in rather than a notebook global, so
    # the function gives the right answer for any subset it is called with.
    sum_rep = df["rep_votes"].sum()
    sum_dem = df["dem_votes"].sum()
    sum_total = sum_rep + sum_dem
    rep_vote_per = round(sum_rep / sum_total * 100, 2)
    dem_vote_per = round(sum_dem / sum_total * 100, 2)
    return (rep_vote_per, dem_vote_per)

In [None]:
# (Republican %, Democratic %) of the two-party vote.
vote_inferential(j)

In [None]:
# Fraction of county-year rows for each target value (1 = Republican, 0 = Democratic).
j.target.value_counts(normalize=True)

In [None]:
# Compare each party's share of the two-party vote with its share of county
# wins. The bar heights are computed from `j` instead of being typed in by
# hand, so the chart cannot drift from the data (the previous literal vote
# shares summed to 104).
two_party_total = j["rep_votes"].sum() + j["dem_votes"].sum()
county_share = j["target"].value_counts(normalize=True)  # 1 = Republican, 0 = Democratic
bar_labels = ["R. Votes", "D. Votes", "R. Counties Won", "D. Counties Won"]
bar_heights = [
    round(j["rep_votes"].sum() / two_party_total * 100, 2),
    round(j["dem_votes"].sum() / two_party_total * 100, 2),
    round(county_share.get(1, 0) * 100, 2),
    round(county_share.get(0, 0) * 100, 2),
]

fig, ax = plt.subplots(figsize=(8,8))
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
sns.barplot(x=bar_labels, y=bar_heights, palette=["red", "blue", "red", "blue"], ax=ax)
ax.set_title("Total Votes v. Total Counties Won", size=20)
ax.tick_params(axis='both', labelsize=12)
ax.set_ylabel("Percentage", size=15);

In [None]:
# Take an independent copy so that columns added to `wm` are not written into
# a slice of `j` (which raises SettingWithCopyWarning).
wm = j[["win_margin"]].copy()

In [None]:
# Label each row: "unlikely" when the margin is above 30, otherwise "likely".
wm["likelihood"] = np.where(wm["win_margin"] > 30, "unlikely", "likely")

In [None]:
# Histogram of win margins, coloured by the `likelihood` label, with the mean
# marked by a vertical line. The mean is computed once and the annotation text
# is derived from it so the label always matches the line.
mean_margin = wm["win_margin"].mean()

fig, ax = plt.subplots(figsize=(8,8))
sns.histplot(data=wm, x="win_margin", bins=20, ax=ax, palette = ["#ffc300", "#003566"], hue="likelihood")
ax.text(mean_margin + 1, 150, "Mean", color = "black", size=15, fontweight='bold')
ax.text(mean_margin + 1, 137, f"({mean_margin:.0f}%)", color = "black", size=15, fontweight='bold')
ax.vlines(mean_margin, 0, 210, color="black", linewidth=6)
ax.set_title("Win Margin Percentage Distribution", size=20)
ax.set_ylabel("County Elections", size=15)
ax.set_xlabel("Win Margin(Percentage)", size=15);