In [35]:
#import dependencies
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
from scipy.stats import sem
from scipy.stats import linregress
from pprint import pprint

# Input files (paths are relative to the notebook's working directory)
election_data_2004_csv = "Resources/election_data2004.csv"      # 2004 election results
census_data_2004_csv = "Resources/population2000-2016ALL.csv"   # census population data

In [36]:
# Load the election and census CSV files into DataFrames
election_data_2004 = pd.read_csv(filepath_or_buffer=election_data_2004_csv)
census_data_2004 = pd.read_csv(filepath_or_buffer=census_data_2004_csv)


In [37]:
# Build a combined "State, County" key in a new 'Location' column to merge on.
# The county name alone is not precise enough: some county names are repeated
# in multiple states.
election_data_2004["Location"] = (
    election_data_2004["state"] + ", " + election_data_2004["county"]
)
election_data_2004

Unnamed: 0,year,state,state_code,county,candidate,party,candidate_votes,total_votes,Location
0,2004,Alabama,AL,Autauga,John Kerry,democrat,4758.0,20081,"Alabama, Autauga"
1,2004,Alabama,AL,Autauga,George W. Bush,republican,15196.0,20081,"Alabama, Autauga"
2,2004,Alabama,AL,Baldwin,John Kerry,democrat,15599.0,69320,"Alabama, Baldwin"
3,2004,Alabama,AL,Baldwin,George W. Bush,republican,52971.0,69320,"Alabama, Baldwin"
4,2004,Alabama,AL,Barbour,John Kerry,democrat,4832.0,10777,"Alabama, Barbour"
...,...,...,...,...,...,...,...,...,...
6303,2004,Alaska,AK,District 38,George W. Bush,republican,2459.0,5075,"Alaska, District 38"
6304,2004,Alaska,AK,District 39,John Kerry,democrat,2365.0,5568,"Alaska, District 39"
6305,2004,Alaska,AK,District 39,George W. Bush,republican,2881.0,5568,"Alaska, District 39"
6306,2004,Alaska,AK,District 40,John Kerry,democrat,2328.0,5788,"Alaska, District 40"


In [4]:
# Remove the text ' County' from the census county names and store the cleaned
# names in a new 'new_county' column (the original 'County' column is kept).
# regex=False makes this an explicit literal-substring replacement; the default
# of `regex` changed in pandas 2.0, so spelling it out keeps the behaviour the
# same on every version.
# NOTE(review): the later comparison and merge cells key on the census
# 'Location' column, not on 'new_county', so this column does not by itself
# create more location matches -- confirm whether 'Location' should be rebuilt
# from it.
census_data_2004['new_county'] = census_data_2004['County'].str.replace(' County', '', regex=False)


In [34]:
# Confirm which political parties are present for this year
# (expecting no green/other/NaN entries).
# dropna=False makes missing parties show up in the counts; by default
# value_counts() leaves NaN out, so it could not confirm their absence.
election_data_2004['party'].value_counts(dropna=False)

democrat      3154
republican    3154
Name: party, dtype: int64

In [6]:
# Keep only the democrat rows so that there are fewer repeated locations
# when the election and census data are compared.
is_democrat = election_data_2004["party"] == "democrat"
election_data_2004_11 = election_data_2004[is_democrat]
election_data_2004_11

Unnamed: 0,year,state,state_code,county,candidate,party,candidate_votes,total_votes,Location
0,2004,Alabama,AL,Autauga,John Kerry,democrat,4758.0,20081,"Alabama, Autauga"
2,2004,Alabama,AL,Baldwin,John Kerry,democrat,15599.0,69320,"Alabama, Baldwin"
4,2004,Alabama,AL,Barbour,John Kerry,democrat,4832.0,10777,"Alabama, Barbour"
6,2004,Alabama,AL,Bibb,John Kerry,democrat,2089.0,7600,"Alabama, Bibb"
8,2004,Alabama,AL,Blount,John Kerry,democrat,3938.0,21504,"Alabama, Blount"
...,...,...,...,...,...,...,...,...,...
6298,2004,Alaska,AK,District 36,John Kerry,democrat,3245.0,9241,"Alaska, District 36"
6300,2004,Alaska,AK,District 37,John Kerry,democrat,1970.0,5208,"Alaska, District 37"
6302,2004,Alaska,AK,District 38,John Kerry,democrat,2366.0,5075,"Alaska, District 38"
6304,2004,Alaska,AK,District 39,John Kerry,democrat,2365.0,5568,"Alaska, District 39"


In [7]:
# Extract the Location column of each DataFrame as a plain Python list.
# NOTE(review): no visible cell creates a 'Location' column on the census data;
# presumably it already exists in the census CSV -- confirm.
csv2 = election_data_2004_11["Location"].tolist()
csv3 = census_data_2004["Location"].tolist()

In [24]:
# Election locations that do not appear in the census locations.
# Membership is tested against a set, so each lookup is constant-time instead
# of a linear scan over the whole census list. The order of csv2 and any
# repeated entries in it are preserved in the result.
census_location_set = set(csv3)
list_difference = [item for item in csv2 if item not in census_location_set]

In [38]:
#show the election locations that were not found in the census locations
print(list_difference)

['Florida, Desoto', 'Louisiana, Acadia', 'Louisiana, Allen', 'Louisiana, Ascension', 'Louisiana, Assumption', 'Louisiana, Avoyelles', 'Louisiana, Beauregard', 'Louisiana, Bienville', 'Louisiana, Bossier', 'Louisiana, Caddo', 'Louisiana, Calcasieu', 'Louisiana, Caldwell', 'Louisiana, Cameron', 'Louisiana, Catahoula', 'Louisiana, Claiborne', 'Louisiana, Concordia', 'Louisiana, De Soto', 'Louisiana, East Baton Rouge', 'Louisiana, East Carroll', 'Louisiana, East Feliciana', 'Louisiana, Evangeline', 'Louisiana, Franklin', 'Louisiana, Grant', 'Louisiana, Iberia', 'Louisiana, Iberville', 'Louisiana, Jackson', 'Louisiana, Jefferson', 'Louisiana, Jefferson Davis', 'Louisiana, Lafayette', 'Louisiana, Lafourche', 'Louisiana, La Salle', 'Louisiana, Lincoln', 'Louisiana, Livingston', 'Louisiana, Madison', 'Louisiana, Morehouse', 'Louisiana, Natchitoches', 'Louisiana, Orleans', 'Louisiana, Ouachita', 'Louisiana, Plaquemines', 'Louisiana, Pointe Coupee', 'Louisiana, Rapides', 'Louisiana, Red River', 

In [39]:
# Reverse direction: census locations that do not appear in the election
# locations.
# Membership is tested against a set, so each lookup is constant-time instead
# of a linear scan over the whole election list. The order of csv3 and any
# repeated entries in it are preserved in the result.
election_location_set = set(csv2)
list_difference2 = [item for item in csv3 if item not in election_location_set]

In [40]:
#show the census locations that were not found in the election locations
print(list_difference2)

['Alaska, Aleutians East Borough', 'Alaska, Aleutians West Census Area', 'Alaska, Anchorage Municipality', 'Alaska, Bethel Census Area', 'Alaska, Bristol Bay Borough', 'Alaska, Denali Borough', 'Alaska, Dillingham Census Area', 'Alaska, Fairbanks North Star Borough', 'Alaska, Haines Borough', 'Alaska, Hoonah-Angoon Census Area', 'Alaska, Juneau City and Borough', 'Alaska, Kenai Peninsula Borough', 'Alaska, Ketchikan Gateway Borough', 'Alaska, Kodiak Island Borough', 'Alaska, Lake and Peninsula Borough', 'Alaska, Matanuska-Susitna Borough', 'Alaska, Nome Census Area', 'Alaska, North Slope Borough', 'Alaska, Northwest Arctic Borough', 'Alaska, Petersburg Census Area', 'Alaska, Prince of Wales-Hyder Census Area', 'Alaska, Sitka City and Borough', 'Alaska, Skagway Municipality', 'Alaska, Southeast Fairbanks Census Area', 'Alaska, Valdez-Cordova Census Area', 'Alaska, Wade Hampton Census Area', 'Alaska, Wrangell City and Borough', 'Alaska, Yakutat City and Borough', 'Alaska, Yukon-Koyukuk C

In [41]:
#count the census locations that have no match in the election locations
len(list_difference2)

143

In [42]:
#count the election locations that have no match in the census locations
len(list_difference)

148

In [14]:
# Turn each location list into a DataFrame whose only column is 'location'
location_columns = ["location"]
election_locations_2004 = pd.DataFrame(data=csv2, columns=location_columns)
census_locations_2004 = pd.DataFrame(data=csv3, columns=location_columns)

In [30]:
#census_locations_2004

In [31]:
# Inner-join the election and census location lists so that only the locations
# present in both remain (a shorter, but matching, set); sorted by location.
result_locations_2004 = election_locations_2004.merge(
    census_locations_2004,
    how="inner",
    on="location",
    sort=True,
)
            

In [32]:
#view results, 3006 rows.
result_locations_2004

Unnamed: 0,location
0,"Alabama, Autauga"
1,"Alabama, Baldwin"
2,"Alabama, Barbour"
3,"Alabama, Bibb"
4,"Alabama, Blount"
...,...
3001,"Wyoming, Sweetwater"
3002,"Wyoming, Teton"
3003,"Wyoming, Uinta"
3004,"Wyoming, Washakie"


In [18]:
# Join the common locations back onto the full election data (all parties),
# matching the lowercase 'location' key against the election 'Location' column.
election_result_locations_2004 = result_locations_2004.merge(
    election_data_2004,
    how="inner",
    left_on="location",
    right_on="Location",
    sort=True,
)

In [71]:
# Drop the lowercase 'location' merge key; the election data's own 'Location'
# column holds the same value and is kept.
election_result_locations_2004_drop = election_result_locations_2004.drop(columns=["location"])

# Preview the result. A frame is only rendered when it is the last expression
# of the cell, so the preview goes after the assignment.
election_result_locations_2004_drop.head()


In [72]:
# Write the matched ELECTION data to CSV (header row included, no index column)
election_result_locations_2004_drop.to_csv(
    "Resources/ELECTION_results_FINAL_LOCATIONS_2004.csv",
    index=False,
    header=True,
)


In [21]:
# Join the common locations back onto the census data, matching the lowercase
# 'location' key against the census 'Location' column.
census_result_locations_2004 = result_locations_2004.merge(
    census_data_2004,
    how="inner",
    left_on="location",
    right_on="Location",
    sort=True,
)

In [73]:
# Keep only the 2004 population: drop the lowercase 'location' merge key and
# every other yearly column ('2000' through '2016', except '2004').
other_year_columns = [str(year) for year in range(2000, 2017) if year != 2004]
census_result_locations_2004_drop = census_result_locations_2004.drop(
    columns=["location"] + other_year_columns
)

# Preview the result. A frame is only rendered when it is the last expression
# of the cell, so the preview goes after the assignment.
census_result_locations_2004_drop.head()


In [74]:
#CENSUS data to CSV
census_result_locations_2004_drop.to_csv(r'Resources/CENSUS_results_FINAL_LOCATIONS_2004.csv', index = False, header=True)


In [75]:
# Combine the matched election rows with the matched census rows on their
# shared 'Location' column.
census_AND_election_2004_1 = election_result_locations_2004_drop.merge(
    census_result_locations_2004_drop,
    how="inner",
    on="Location",
    sort=True,
)

In [76]:
#view the combined election and census data
census_AND_election_2004_1

Unnamed: 0,year,state,state_code,county,candidate,party,candidate_votes,total_votes,Location,State,County,2004,new_county
0,2004,Alabama,AL,Autauga,John Kerry,democrat,4758.0,20081,"Alabama, Autauga",Alabama,Autauga,48366,Autauga
1,2004,Alabama,AL,Autauga,George W. Bush,republican,15196.0,20081,"Alabama, Autauga",Alabama,Autauga,48366,Autauga
2,2004,Alabama,AL,Baldwin,John Kerry,democrat,15599.0,69320,"Alabama, Baldwin",Alabama,Baldwin,156266,Baldwin
3,2004,Alabama,AL,Baldwin,George W. Bush,republican,52971.0,69320,"Alabama, Baldwin",Alabama,Baldwin,156266,Baldwin
4,2004,Alabama,AL,Barbour,John Kerry,democrat,4832.0,10777,"Alabama, Barbour",Alabama,Barbour,28287,Barbour
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6067,2004,Wyoming,WY,Uinta,George W. Bush,republican,6081.0,8081,"Wyoming, Uinta",Wyoming,Uinta,19470,Uinta
6068,2004,Wyoming,WY,Washakie,John Kerry,democrat,855.0,4114,"Wyoming, Washakie",Wyoming,Washakie,7960,Washakie
6069,2004,Wyoming,WY,Washakie,George W. Bush,republican,3200.0,4114,"Wyoming, Washakie",Wyoming,Washakie,7960,Washakie
6070,2004,Wyoming,WY,Weston,John Kerry,democrat,580.0,3392,"Wyoming, Weston",Wyoming,Weston,6646,Weston


In [77]:
# Remove the lowercase 'state' / 'county' columns and the 'new_county' column;
# the capitalised 'State' / 'County' columns stay in the frame.
redundant_columns = ["state", "county", "new_county"]
census_AND_election_2004_drop = census_AND_election_2004_1.drop(columns=redundant_columns)


In [78]:
# Give the remaining lowercase columns capitalised names and label the '2004'
# column as the 2004 county population.
column_renames = {
    "year": "Year",
    "state_code": "State_Code",
    "candidate": "Candidate",
    "party": "Party",
    "candidate_votes": "Candidate_Votes",
    "total_votes": "Total_Votes",
    "2004": "County_Pop_2004",
}
census_AND_election_2004_drop_rename = census_AND_election_2004_drop.rename(columns=column_renames)




In [79]:
# Select the columns in their final output order
final_column_order = [
    "County",
    "State",
    "State_Code",
    "Location",
    "Year",
    "Party",
    "Candidate",
    "Candidate_Votes",
    "Total_Votes",
    "County_Pop_2004",
]
census_AND_election_2004_reorder = census_AND_election_2004_drop_rename[final_column_order]




In [80]:
#view the renamed and reordered election and census data
census_AND_election_2004_reorder

Unnamed: 0,County,State,State_Code,Location,Year,Party,Candidate,Candidate_Votes,Total_Votes,County_Pop_2004
0,Autauga,Alabama,AL,"Alabama, Autauga",2004,democrat,John Kerry,4758.0,20081,48366
1,Autauga,Alabama,AL,"Alabama, Autauga",2004,republican,George W. Bush,15196.0,20081,48366
2,Baldwin,Alabama,AL,"Alabama, Baldwin",2004,democrat,John Kerry,15599.0,69320,156266
3,Baldwin,Alabama,AL,"Alabama, Baldwin",2004,republican,George W. Bush,52971.0,69320,156266
4,Barbour,Alabama,AL,"Alabama, Barbour",2004,democrat,John Kerry,4832.0,10777,28287
...,...,...,...,...,...,...,...,...,...,...
6067,Uinta,Wyoming,WY,"Wyoming, Uinta",2004,republican,George W. Bush,6081.0,8081,19470
6068,Washakie,Wyoming,WY,"Wyoming, Washakie",2004,democrat,John Kerry,855.0,4114,7960
6069,Washakie,Wyoming,WY,"Wyoming, Washakie",2004,republican,George W. Bush,3200.0,4114,7960
6070,Weston,Wyoming,WY,"Wyoming, Weston",2004,democrat,John Kerry,580.0,3392,6646


In [83]:
# Write the final combined frame to CSV (header row included, no index column)
census_AND_election_2004_reorder.to_csv(
    "Resources/Analysis Files/2004/CENSUS_and_POPULATION_FINAL_2004.csv",
    index=False,
    header=True,
)
