In [104]:
%matplotlib inline

# Import Dependencies
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

In [105]:

#Files to load
census_data_to_load = os.path.join("..","Resources","Data","demographic_data.csv")
gini_file_to_load = os.path.join("..","Resources","Data","Table_B19083_2020","Table_B19083_2020_data_with_overlays.csv")


In [106]:
# Read CSV file into DataFrame
census_df = pd.read_csv(census_data_to_load,low_memory=False)
census_df

Unnamed: 0,FILEID,STUSAB,SUMLEV,GEOVAR,GEOCOMP,CHARITER,CIFSN_x,LOGRECNO,GEOID,GEOCODE,...,P0050001,P0050002,P0050003,P0050004,P0050005,P0050006,P0050007,P0050008,P0050009,P0050010
0,PLST,MO,50,0,0,0,0,2,0500000US29001,29001,...,2637,279,50,20,194,15,2358,2332,0,26
1,PLST,MO,50,0,0,0,0,3,0500000US29003,29003,...,129,129,0,0,129,0,0,0,0,0
2,PLST,MO,50,0,0,0,0,4,0500000US29005,29005,...,90,77,9,0,68,0,13,0,0,13
3,PLST,MO,50,0,0,0,0,5,0500000US29007,29007,...,1896,1489,1242,0,225,22,407,0,0,407
4,PLST,MO,50,0,0,0,0,6,0500000US29009,29009,...,236,236,44,0,192,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3098,PLST,MD,50,0,0,0,0,24,0500000US24047,24047,...,745,622,218,10,387,7,123,0,0,123
3099,PLST,MD,50,0,0,0,0,25,0500000US24510,24510,...,18001,5865,2568,122,3067,108,12136,10098,0,2038
3100,PLST,DE,50,0,0,0,0,2,0500000US10001,10001,...,4081,979,111,42,764,62,3102,2813,93,196
3101,PLST,DE,50,0,0,0,0,3,0500000US10003,10003,...,15899,6348,3390,72,2747,139,9551,8232,0,1319


In [107]:
#We are interested in the keeping columns relating to Population, Housing Count, 
# "NAME":"County Name"
# GEOID - code with last 5 digits equal to state fips + 0(0) + county fips
# "P0010001":"Total Population"
# "P0010003": "White alone"
# "P0010004": Black alone
# "P0010005": American Indian and Alaska Native Alone
# "P0010006": Asian Alone
# "P0010007": Native Hawaiian and Other Pacific Islander alone
# "P0010008":  Some other race alone
# "P0010009":"Population of 2 or more races"
# "P0020002":"Total Population: Hispanic Or Latino"
# "P0020003":"Total Population: Not Hispanic or Latino"

# See more: https://www2.census.gov/census_2010/01-Redistricting_File--PL_94-171/0FILE_STRUCTURE.pdf

col=["NAME","GEOID","GEOCODE","P0010001","P0010003","P0010004","P0010005","P0010006",
                "P0010007","P0010008","P0010009","P0020002","P0020003"]

df = census_df.loc[:,col]
df.head()

Unnamed: 0,NAME,GEOID,GEOCODE,P0010001,P0010003,P0010004,P0010005,P0010006,P0010007,P0010008,P0010009,P0020002,P0020003
0,Adair County,0500000US29001,29001,25314,21843,1279,51,687,13,230,1211,661,24653
1,Andrew County,0500000US29003,29003,18135,16907,156,53,89,1,82,847,391,17744
2,Atchison County,0500000US29005,29005,5305,5008,23,25,8,0,30,211,73,5232
3,Audrain County,0500000US29007,29007,24962,21648,1307,89,114,2,374,1428,731,24231
4,Barry County,0500000US29009,29009,34534,28655,106,453,811,73,1693,2743,3333,31201


In [108]:
def add_one(num):
    return(num+1)

In [109]:
# replace zero values with a 1 so Shannon index calculates without giving NaNs
df[["P0010004","P0010005","P0010006","P0010007","P0010008","P0010009"]] = df[["P0010004","P0010005","P0010006","P0010007","P0010008","P0010009"]].apply(add_one)

In [110]:
# Calculate non-white population
df["Non-White"] = df[["P0010004","P0010005","P0010006","P0010007","P0010008","P0010009"]].sum(axis=1)


In [111]:
# df.drop(columns=["P0010004","P0010005","P0010006","P0010007","P0010008","P0010009"],inplace=True)
# df

In [112]:
# Rename the columns
# "P0010001":"Total Population"
# "P0010002" - Population of one race
# "P0010003": "White alone"
# "P0010004": Black alone
# "P0010005": American Indian and Alaska Native Alone
# "P0010006": Asian Alone
# "P0010007": Native Hawaiian and Other Pacific Islander alone
# "P0010008":  Some other race alone
# "P0010009":"Population of 2 or more races"
# "P0020002":"Total Population: Hispanic Or Latino"
# "P0020003":"Total Population: Not Hispanic or Latino"
# "P0020005": Total Population: Not Hispanic or Latino: White alone
# "P0020006":  Total Population: Not Hispanic or Latino: Black or African American alone
# "P0020007":  Total Population: Not Hispanic or Latino: American Indian and Alaska Native alone
# "P0020008":  Total Population: Not Hispanic or Latino: Asian alone
# "P0020009":  Total Population: Not Hispanic or Latino: Native Hawaiian and Other Pacific Islander alone 
# "P0020010": Total Population: Not Hispanic or Latino: Some other race alone
# "P0020011": Total Population: Not Hispanic or Latino: Population of two or more races
df.rename(columns={"NAME": "County",
                                        "P0010001":"Total Population",
                                        "P0010003":"White",
                                        "P0010004": "Black",
                                        "P0010005": "Native",
                                        "P0010006": "Asian",
                                        "P0010007": "Pacific Islander",
                                        "P0010008":"Other",
                                        "P0010009":"2+ Races",
                                        "P0020002":"Hispanic",
                                        "P0020003":"Not Hispanic",
                                        },inplace=True)


In [113]:
df.head()

Unnamed: 0,County,GEOID,GEOCODE,Total Population,White,Black,Native,Asian,Pacific Islander,Other,2+ Races,Hispanic,Not Hispanic,Non-White
0,Adair County,0500000US29001,29001,25314,21843,1280,52,688,14,231,1212,661,24653,3477
1,Andrew County,0500000US29003,29003,18135,16907,157,54,90,2,83,848,391,17744,1234
2,Atchison County,0500000US29005,29005,5305,5008,24,26,9,1,31,212,73,5232,303
3,Audrain County,0500000US29007,29007,24962,21648,1308,90,115,3,375,1429,731,24231,3320
4,Barry County,0500000US29009,29009,34534,28655,107,454,812,74,1694,2744,3333,31201,5885


In [114]:
# Add columns for percentage of each race/ethnicity
df["% White"] = df["White"]/df["Total Population"]
df["% Black"] = df["Black"]/df["Total Population"]
df["% Native"] = df["Native"]/df["Total Population"]
df["% Asian"] = df["Asian"]/df["Total Population"]
df["% Pacific Islander"] = df["Pacific Islander"]/df["Total Population"]
df["% Other"] = df["Other"]/df["Total Population"]
df["% Non-White"] = df["Non-White"]/df["Total Population"]
df["% Hispanic"] = df["Hispanic"]/df["Total Population"]
df["% Not Hispanic"] = df["Not Hispanic"]/df["Total Population"]

In [115]:
# Simpson's diversity index = 1-(sum(n(n-1))/N(N-1))
df['Simpson Race DI'] = 1-(df['White']*(df['White']-1)+df['Black']*(df['Black']-1)+df['Native']*(df['Native']-1)+df['Asian']*(df['Asian']-1)+df['Pacific Islander']*(df['Pacific Islander']-1)+df['Other']*(df['Other']-1))/(df['Total Population']*(df['Total Population']-1))


In [116]:
df['Simpson Ethnic DI'] = 1-(df['Hispanic']*(df['Hispanic']-1) + df['Not Hispanic']*(df['Not Hispanic']-1))/(df['Total Population']*(df['Total Population']-1))


In [117]:
# Shannon diversity index = -1* sum((n/N) * ln(n/N))
df['Shannon Race DI']=(-1)*((df['% White'] * np.log(df['% White'])) +(df['% Hispanic'] * np.log(df['% Hispanic'])) + (df['% Black'] * np.log(df['% Black'])) + (df['% Native'] * np.log(df['% Native'])) + (df['% Asian'] * np.log(df['% Asian'])) + (df['% Pacific Islander'] * np.log(df['% Pacific Islander'])) + (df['% Other'] * np.log(df['% Other'])))

In [118]:
# 
df['Shannon Ethnic DI']=(-1)*((df['% Not Hispanic'] * np.log(df['% Not Hispanic'])) + (df['% Hispanic'] * np.log(df['% Hispanic'])))

In [119]:
# show final dataframe
#oregon_counties_pop_df = oregon_counties_pop_df[["County","GEOCODE","Total Population","White","% White","Non-White","% Non-White","Hispanic","% Hispanic","Not Hispanic","% Not Hispanic"]]
df.sample(n=10)

Unnamed: 0,County,GEOID,GEOCODE,Total Population,White,Black,Native,Asian,Pacific Islander,Other,...,% Asian,% Pacific Islander,% Other,% Non-White,% Hispanic,% Not Hispanic,Simpson Race DI,Simpson Ethnic DI,Shannon Race DI,Shannon Ethnic DI
1468,Newton County,0500000US48351,48351,12217,9375,2087,61,25,1,112,...,0.002046,8.2e-05,0.009168,0.233118,0.028157,0.971843,0.38187,0.054734,0.688494,0.128278
2178,Cloud County,0500000US20029,20029,9032,8338,124,45,41,10,78,...,0.004539,0.001107,0.008636,0.077502,0.035319,0.964681,0.147474,0.06815,0.350241,0.152771
1469,Nolan County,0500000US48353,48353,14738,10235,713,109,105,6,1551,...,0.007124,0.000407,0.105238,0.305944,0.363279,0.636721,0.504224,0.462646,1.079227,0.655281
2845,Lawrence County,0500000US21127,21127,16293,15787,37,3,24,5,47,...,0.001473,0.000307,0.002885,0.031425,0.009084,0.990916,0.061135,0.018003,0.117639,0.051747
2200,Greenwood County,0500000US20073,20073,6016,5494,29,36,37,2,53,...,0.00615,0.000332,0.00881,0.087766,0.03391,0.96609,0.165851,0.06553,0.329652,0.14808
2689,Toole County,0500000US30101,30101,4971,4268,33,347,38,2,47,...,0.007644,0.000402,0.009455,0.142627,0.035808,0.964192,0.257818,0.069065,0.553727,0.154384
1278,Raleigh County,0500000US54081,54081,74591,64025,5848,137,832,17,410,...,0.011154,0.000228,0.005497,0.141733,0.016048,0.983952,0.256938,0.03158,0.489256,0.08223
2000,Bristol city,0500000US51520,51520,17219,14754,1052,58,163,5,104,...,0.009466,0.00029,0.00604,0.143504,0.026424,0.973576,0.261959,0.051455,0.49569,0.122084
1370,Foard County,0500000US48155,48155,1095,924,20,1,6,2,82,...,0.005479,0.001826,0.074886,0.161644,0.179909,0.820091,0.282173,0.295353,0.765516,0.471255
2265,Wichita County,0500000US20203,20203,2152,1547,4,24,3,3,274,...,0.001394,0.001394,0.127323,0.283922,0.295539,0.704461,0.467041,0.416585,0.940112,0.607037


## Read Gini index data

In [120]:
gini_df = pd.read_csv(gini_file_to_load)

In [121]:
gini_df.head()

Unnamed: 0,Gini Index,Margin of Error!!Gini Index,GEOID,Geographic Area Name
0,0.4552,0.0326,0500000US01001,"Autauga County, Alabama"
1,0.4566,0.0119,0500000US01003,"Baldwin County, Alabama"
2,0.5047,0.0252,0500000US01005,"Barbour County, Alabama"
3,0.45,0.0408,0500000US01007,"Bibb County, Alabama"
4,0.4685,0.0247,0500000US01009,"Blount County, Alabama"


In [122]:
gini_df.drop(columns=["Margin of Error!!Gini Index",'Geographic Area Name'],inplace=True)
gini_df

Unnamed: 0,Gini Index,GEOID
0,0.4552,0500000US01001
1,0.4566,0500000US01003
2,0.5047,0500000US01005
3,0.4500,0500000US01007
4,0.4685,0500000US01009
...,...,...
3216,0.4942,0500000US72145
3217,0.4471,0500000US72147
3218,0.5419,0500000US72149
3219,0.4987,0500000US72151


## Merge the dataframes

In [123]:
gini_df.shape

(3221, 2)

In [124]:
df.shape

(3103, 27)

In [125]:
#movies_with_ratings_df = pd.merge(movies_df, rating_counts, left_on = 'kaggle_id', right_index = True, how = 'left')
data_frame = pd.merge(df,gini_df,how='inner',on='GEOID')

In [126]:
data_frame.shape

(3103, 28)

In [127]:
data_frame

Unnamed: 0,County,GEOID,GEOCODE,Total Population,White,Black,Native,Asian,Pacific Islander,Other,...,% Pacific Islander,% Other,% Non-White,% Hispanic,% Not Hispanic,Simpson Race DI,Simpson Ethnic DI,Shannon Race DI,Shannon Ethnic DI,Gini Index
0,Adair County,0500000US29001,29001,25314,21843,1280,52,688,14,231,...,0.000553,0.009125,0.137355,0.026112,0.973888,0.252059,0.050862,0.531060,0.120956,0.5398
1,Andrew County,0500000US29003,29003,18135,16907,157,54,90,2,83,...,0.000110,0.004577,0.068045,0.021561,0.978439,0.130719,0.042194,0.258520,0.104052,0.4138
2,Atchison County,0500000US29005,29005,5305,5008,24,26,9,1,31,...,0.000189,0.005844,0.057116,0.013761,0.986239,0.108767,0.027148,0.206341,0.072643,0.4299
3,Audrain County,0500000US29007,29007,24962,21648,1308,90,115,3,375,...,0.000120,0.015023,0.133002,0.029285,0.970715,0.244900,0.056856,0.490666,0.132246,0.4439
4,Barry County,0500000US29009,29009,34534,28655,107,454,812,74,1694,...,0.002143,0.049053,0.170412,0.096514,0.903486,0.308356,0.174402,0.704583,0.317354,0.4703
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3098,Worcester County,0500000US24047,24047,52460,41845,6237,137,754,9,841,...,0.000172,0.016031,0.202459,0.039611,0.960389,0.349146,0.076086,0.705663,0.166706,0.4880
3099,Baltimore city,0500000US24510,24510,585708,163026,338479,2313,21211,187,28047,...,0.000319,0.047886,0.721670,0.078413,0.921587,0.584942,0.144529,1.162605,0.274876,0.5123
3100,Kent County,0500000US10001,10001,181851,107685,46999,1150,4430,126,5659,...,0.000693,0.031119,0.407872,0.076882,0.923118,0.580951,0.141942,1.092749,0.271086,0.4191
3101,New Castle County,0500000US10003,10003,570719,314231,146545,2157,35201,176,28715,...,0.000308,0.050314,0.449423,0.111291,0.888709,0.624572,0.197811,1.267836,0.349206,0.4607


In [128]:
data_frame.columns

Index(['County', 'GEOID', 'GEOCODE', 'Total Population', 'White', 'Black',
       'Native', 'Asian', 'Pacific Islander', 'Other', '2+ Races', 'Hispanic',
       'Not Hispanic', 'Non-White', '% White', '% Black', '% Native',
       '% Asian', '% Pacific Islander', '% Other', '% Non-White', '% Hispanic',
       '% Not Hispanic', 'Simpson Race DI', 'Simpson Ethnic DI',
       'Shannon Race DI', 'Shannon Ethnic DI', 'Gini Index'],
      dtype='object')