In [67]:
# Import Dependencies
import pandas as pd
import numpy as np
import os


In [68]:

#Files to load
census_data_to_load = os.path.join("..","Resources","Data","demographic_data.csv")
gini_file_to_load = os.path.join("..","Resources","Data","Table_B19083_2020","Table_B19083_2020_data_with_overlays.csv")


In [69]:
# Read CSV file into DataFrame
census_df = pd.read_csv(census_data_to_load,low_memory=False)
census_df

Unnamed: 0,FILEID,STUSAB,SUMLEV,GEOVAR,GEOCOMP,CHARITER,CIFSN_x,LOGRECNO,GEOID,GEOCODE,...,P0050001,P0050002,P0050003,P0050004,P0050005,P0050006,P0050007,P0050008,P0050009,P0050010
0,PLST,MO,50,0,0,0,0,2,0500000US29001,29001,...,2637,279,50,20,194,15,2358,2332,0,26
1,PLST,MO,50,0,0,0,0,3,0500000US29003,29003,...,129,129,0,0,129,0,0,0,0,0
2,PLST,MO,50,0,0,0,0,4,0500000US29005,29005,...,90,77,9,0,68,0,13,0,0,13
3,PLST,MO,50,0,0,0,0,5,0500000US29007,29007,...,1896,1489,1242,0,225,22,407,0,0,407
4,PLST,MO,50,0,0,0,0,6,0500000US29009,29009,...,236,236,44,0,192,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3098,PLST,MD,50,0,0,0,0,24,0500000US24047,24047,...,745,622,218,10,387,7,123,0,0,123
3099,PLST,MD,50,0,0,0,0,25,0500000US24510,24510,...,18001,5865,2568,122,3067,108,12136,10098,0,2038
3100,PLST,DE,50,0,0,0,0,2,0500000US10001,10001,...,4081,979,111,42,764,62,3102,2813,93,196
3101,PLST,DE,50,0,0,0,0,3,0500000US10003,10003,...,15899,6348,3390,72,2747,139,9551,8232,0,1319


In [70]:
#We are interested in the keeping columns relating to Population, Housing Count, 
# "NAME":"County Name"
# GEOID - code with last 5 digits equal to state fips + 0(0) + county fips
# "P0010001":"Total Population"
# "P0010003": "White alone"
# "P0010004": Black alone
# "P0010005": American Indian and Alaska Native Alone
# "P0010006": Asian Alone
# "P0010007": Native Hawaiian and Other Pacific Islander alone
# "P0010008":  Some other race alone
# "P0010009":"Population of 2 or more races"
# "P0020002":"Total Population: Hispanic Or Latino"
# "P0020003":"Total Population: Not Hispanic or Latino"

# See more: https://www2.census.gov/census_2010/01-Redistricting_File--PL_94-171/0FILE_STRUCTURE.pdf

col=["NAME","GEOID","GEOCODE","P0010001","P0010003","P0010004","P0010005","P0010006",
                "P0010007","P0010008","P0010009","P0020002","P0020003"]

df = census_df.loc[:,col]
df.head()

Unnamed: 0,NAME,GEOID,GEOCODE,P0010001,P0010003,P0010004,P0010005,P0010006,P0010007,P0010008,P0010009,P0020002,P0020003
0,Adair County,0500000US29001,29001,25314,21843,1279,51,687,13,230,1211,661,24653
1,Andrew County,0500000US29003,29003,18135,16907,156,53,89,1,82,847,391,17744
2,Atchison County,0500000US29005,29005,5305,5008,23,25,8,0,30,211,73,5232
3,Audrain County,0500000US29007,29007,24962,21648,1307,89,114,2,374,1428,731,24231
4,Barry County,0500000US29009,29009,34534,28655,106,453,811,73,1693,2743,3333,31201


In [71]:
# Create a function to add 1 to a number
def add_one(num):
    return(num+1)

In [72]:
# replace zero values in population columns with a 1 so Shannon index calculates without giving NaNs
df[["P0010004","P0010005","P0010006","P0010007","P0010008","P0010009"]] = df[["P0010004","P0010005","P0010006","P0010007","P0010008","P0010009"]].apply(add_one)

In [73]:
# Calculate non-white population
df["Not White"] = df[["P0010004","P0010005","P0010006","P0010007","P0010008","P0010009"]].sum(axis=1)


In [74]:
# Rename the columns
# "P0010001":"Total Population"
# "P0010002" - Population of one race
# "P0010003": "White alone"
# "P0010004": Black alone
# "P0010005": American Indian and Alaska Native Alone
# "P0010006": Asian Alone
# "P0010007": Native Hawaiian and Other Pacific Islander alone
# "P0010008":  Some other race alone
# "P0010009":"Population of 2 or more races"
# "P0020002":"Total Population: Hispanic Or Latino"
# "P0020003":"Total Population: Not Hispanic or Latino"
# "P0020005": Total Population: Not Hispanic or Latino: White alone
# "P0020006":  Total Population: Not Hispanic or Latino: Black or African American alone
# "P0020007":  Total Population: Not Hispanic or Latino: American Indian and Alaska Native alone
# "P0020008":  Total Population: Not Hispanic or Latino: Asian alone
# "P0020009":  Total Population: Not Hispanic or Latino: Native Hawaiian and Other Pacific Islander alone 
# "P0020010": Total Population: Not Hispanic or Latino: Some other race alone
# "P0020011": Total Population: Not Hispanic or Latino: Population of two or more races
df.rename(columns={"NAME": "County",
                    "GEOCODE": "county_FIPS",
                    "P0010001":"Total Population",
                    "P0010003":"White",
                    "P0010004": "Black",
                    "P0010005": "Native",
                    "P0010006": "Asian",
                    "P0010007": "Pacific Islander",
                    "P0010008":"Other",
                    "P0010009":"Two or more Races",
                    "P0020002":"Hispanic",
                    "P0020003":"Not Hispanic",
                                        },inplace=True)


In [75]:
df.head()

Unnamed: 0,County,GEOID,county_FIPS,Total Population,White,Black,Native,Asian,Pacific Islander,Other,Two or more Races,Hispanic,Not Hispanic,Not White
0,Adair County,0500000US29001,29001,25314,21843,1280,52,688,14,231,1212,661,24653,3477
1,Andrew County,0500000US29003,29003,18135,16907,157,54,90,2,83,848,391,17744,1234
2,Atchison County,0500000US29005,29005,5305,5008,24,26,9,1,31,212,73,5232,303
3,Audrain County,0500000US29007,29007,24962,21648,1308,90,115,3,375,1429,731,24231,3320
4,Barry County,0500000US29009,29009,34534,28655,107,454,812,74,1694,2744,3333,31201,5885


In [76]:
# Add columns for percentage of each race/ethnicity
df["pct White"] = df["White"]/df["Total Population"]
df["pct Black"] = df["Black"]/df["Total Population"]
df["pct Native"] = df["Native"]/df["Total Population"]
df["pct Asian"] = df["Asian"]/df["Total Population"]
df["pct Pacific Islander"] = df["Pacific Islander"]/df["Total Population"]
df["pct Other"] = df["Other"]/df["Total Population"]
df["pct Not White"] = df["Not White"]/df["Total Population"]
df["pct Hispanic"] = df["Hispanic"]/df["Total Population"]
df["pct Not Hispanic"] = df["Not Hispanic"]/df["Total Population"]
df["pct Two or more Races"] = df["Two or more Races"]/df["Total Population"]

In [77]:
# Simpson's diversity index = 1-(sum(n(n-1))/N(N-1))
df['Simpson Race DI'] = 1 - \
    (
        df['White']*(df['White']-1) + \
        df['Black']*(df['Black']-1) + \
        df['Native']*(df['Native']-1) + \
        df['Asian']*(df['Asian']-1) + \
        df['Pacific Islander']*(df['Pacific Islander']-1) + \
        df['Other']*(df['Other']-1) + \
        df['Two or more Races'] * (df['Two or more Races']-1)
    ) \
    /(df['Total Population']*(df['Total Population']-1))


In [78]:
df['Simpson Ethnic DI'] = 1 - \
        (
            df['Hispanic']*(df['Hispanic']-1) + \
            df['Not Hispanic']*(df['Not Hispanic']-1)
        ) \
        /(df['Total Population']*(df['Total Population']-1))


In [79]:
# Shannon diversity index = -1* sum((n/N) * ln(n/N))
df['Shannon Race DI']=(-1) * \
    (
        (df['pct White'] * np.log(df['pct White'])) + \
        (df['pct Hispanic'] * np.log(df['pct Hispanic'])) + \
        (df['pct Black'] * np.log(df['pct Black'])) + \
        (df['pct Native'] * np.log(df['pct Native'])) + \
        (df['pct Asian'] * np.log(df['pct Asian'])) + \
        (df['pct Pacific Islander'] * np.log(df['pct Pacific Islander'])) + \
        (df['pct Other'] * np.log(df['pct Other'])) + \
        (df['pct Two or more Races'] * np.log(df['pct Two or more Races']))
    )

In [80]:
# Shannon diversity index for Hispanic/Not Hispanic
df['Shannon Ethnic DI']=(-1) * \
    (
        (df['pct Not Hispanic'] * np.log(df['pct Not Hispanic'])) + \
        (df['pct Hispanic'] * np.log(df['pct Hispanic']))
    )

In [81]:
# show final dataframe
#oregon_counties_pop_df = oregon_counties_pop_df[["County","GEOCODE","Total Population","White","% White","Non-White","% Non-White","Hispanic","% Hispanic","Not Hispanic","% Not Hispanic"]]
df.sample(n=10)

Unnamed: 0,County,GEOID,county_FIPS,Total Population,White,Black,Native,Asian,Pacific Islander,Other,...,pct Pacific Islander,pct Other,pct Not White,pct Hispanic,pct Not Hispanic,pct Two or more Races,Simpson Race DI,Simpson Ethnic DI,Shannon Race DI,Shannon Ethnic DI
2048,Grant County,0500000US41023,41023,7233,6523,14,66,24,11,70,...,0.001521,0.009678,0.098991,0.039126,0.960874,0.073414,0.181128,0.075201,0.540358,0.165157
1448,McMullen County,0500000US48311,48311,600,411,4,2,4,6,26,...,0.01,0.043333,0.325,0.373333,0.626667,0.255,0.464463,0.468692,1.243344,0.660706
1838,Burnett County,0500000US55013,55013,16526,14756,66,722,54,11,82,...,0.000666,0.004962,0.107467,0.01537,0.98463,0.05089,0.198199,0.030269,0.525611,0.079425
2090,Beaver County,0500000US40007,40007,5049,3811,5,47,16,8,798,...,0.001584,0.158051,0.246385,0.258269,0.741731,0.073282,0.399901,0.383208,1.123891,0.571239
2197,Grant County,0500000US20067,20067,7352,4223,11,165,15,6,1511,...,0.000816,0.205522,0.426415,0.525299,0.474701,0.194097,0.589719,0.498788,1.413409,0.691867
2028,Richmond city,0500000US51760,51760,226610,98140,91654,899,6260,116,15442,...,0.000512,0.068144,0.566948,0.104792,0.895208,0.062244,0.639563,0.187623,1.445759,0.335487
2242,Republic County,0500000US20157,20157,4674,4437,16,14,12,10,12,...,0.002139,0.002567,0.05199,0.019469,0.980531,0.038297,0.097357,0.038189,0.331645,0.095967
2380,Pickens County,0500000US13227,13227,33216,30400,302,142,195,7,478,...,0.000211,0.014391,0.084959,0.036067,0.963933,0.05112,0.159418,0.069534,0.511951,0.155237
2985,Sangamon County,0500000US17167,17167,196343,151998,25959,497,4336,66,1887,...,0.000336,0.009611,0.225885,0.027039,0.972961,0.059111,0.37914,0.052617,0.877167,0.124295
1052,Taos County,0500000US35055,35055,34489,19084,174,2290,211,16,5131,...,0.000464,0.148772,0.446838,0.505379,0.494621,0.220041,0.618814,0.499957,1.530447,0.693089


## Read Gini index data
##### https://www.census.gov/topics/income-poverty/income-inequality/about/metrics/gini-index.html 
##### The Gini Index is a summary measure of income inequality. The Gini coefficient incorporates the detailed shares data into a single statistic, which summarizes the dispersion of income across the entire income distribution. The Gini coefficient ranges from 0, indicating perfect equality (where everyone receives an equal share), to 1, perfect inequality (where only one recipient or group of recipients receives all the income). The Gini is based on the difference between the Lorenz curve (the observed cumulative income distribution) and the notion of a perfectly equal income distribution.

In [82]:
# Read gini file 
gini_df = pd.read_csv(gini_file_to_load)

In [83]:
gini_df.head()

Unnamed: 0,Gini Index,Margin of Error!!Gini Index,GEOID,Geographic Area Name
0,0.4552,0.0326,0500000US01001,"Autauga County, Alabama"
1,0.4566,0.0119,0500000US01003,"Baldwin County, Alabama"
2,0.5047,0.0252,0500000US01005,"Barbour County, Alabama"
3,0.45,0.0408,0500000US01007,"Bibb County, Alabama"
4,0.4685,0.0247,0500000US01009,"Blount County, Alabama"


In [84]:
gini_df.drop(columns=["Margin of Error!!Gini Index"],inplace=True)
gini_df

Unnamed: 0,Gini Index,GEOID,Geographic Area Name
0,0.4552,0500000US01001,"Autauga County, Alabama"
1,0.4566,0500000US01003,"Baldwin County, Alabama"
2,0.5047,0500000US01005,"Barbour County, Alabama"
3,0.4500,0500000US01007,"Bibb County, Alabama"
4,0.4685,0500000US01009,"Blount County, Alabama"
...,...,...,...
3216,0.4942,0500000US72145,"Vega Baja Municipio, Puerto Rico"
3217,0.4471,0500000US72147,"Vieques Municipio, Puerto Rico"
3218,0.5419,0500000US72149,"Villalba Municipio, Puerto Rico"
3219,0.4987,0500000US72151,"Yabucoa Municipio, Puerto Rico"


## Merge the dataframes

In [85]:
gini_df.shape

(3221, 3)

In [86]:
df.shape

(3103, 28)

In [87]:
#movies_with_ratings_df = pd.merge(movies_df, rating_counts, left_on = 'kaggle_id', right_index = True, how = 'left')
data_frame = pd.merge(df,gini_df,how='inner',on='GEOID')

In [88]:
data_frame.shape

(3103, 30)

In [89]:
data_frame

Unnamed: 0,County,GEOID,county_FIPS,Total Population,White,Black,Native,Asian,Pacific Islander,Other,...,pct Not White,pct Hispanic,pct Not Hispanic,pct Two or more Races,Simpson Race DI,Simpson Ethnic DI,Shannon Race DI,Shannon Ethnic DI,Gini Index,Geographic Area Name
0,Adair County,0500000US29001,29001,25314,21843,1280,52,688,14,231,...,0.137355,0.026112,0.973888,0.047879,0.249769,0.050862,0.676567,0.120956,0.5398,"Adair County, Missouri"
1,Andrew County,0500000US29003,29003,18135,16907,157,54,90,2,83,...,0.068045,0.021561,0.978439,0.046760,0.128535,0.042194,0.401734,0.104052,0.4138,"Andrew County, Missouri"
2,Atchison County,0500000US29005,29005,5305,5008,24,26,9,1,31,...,0.057116,0.013761,0.986239,0.039962,0.107177,0.027148,0.335013,0.072643,0.4299,"Atchison County, Missouri"
3,Audrain County,0500000US29007,29007,24962,21648,1308,90,115,3,375,...,0.133002,0.029285,0.970715,0.057247,0.241625,0.056856,0.654415,0.132246,0.4439,"Audrain County, Missouri"
4,Barry County,0500000US29009,29009,34534,28655,107,454,812,74,1694,...,0.170412,0.096514,0.903486,0.079458,0.302044,0.174402,0.905812,0.317354,0.4703,"Barry County, Missouri"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3098,Worcester County,0500000US24047,24047,52460,41845,6237,137,754,9,841,...,0.202459,0.039611,0.960389,0.050381,0.346609,0.076086,0.856209,0.166706,0.4880,"Worcester County, Maryland"
3099,Baltimore city,0500000US24510,24510,585708,163026,338479,2313,21211,187,28047,...,0.721670,0.078413,0.921587,0.055405,0.581873,0.144529,1.322896,0.274876,0.5123,"Baltimore city, Maryland"
3100,Kent County,0500000US10001,10001,181851,107685,46999,1150,4430,126,5659,...,0.407872,0.076882,0.923118,0.086928,0.573395,0.141942,1.305087,0.271086,0.4191,"Kent County, Delaware"
3101,New Castle County,0500000US10003,10003,570719,314231,146545,2157,35201,176,28715,...,0.449423,0.111291,0.888709,0.076570,0.618709,0.197811,1.464587,0.349206,0.4607,"New Castle County, Delaware"


In [90]:
# Replace spaces with underscore for exporting to sql
data_frame.columns = [ column.replace(' ','_') for column in data_frame.columns]

In [91]:
data_frame.columns

Index(['County', 'GEOID', 'county_FIPS', 'Total_Population', 'White', 'Black',
       'Native', 'Asian', 'Pacific_Islander', 'Other', 'Two_or_more_Races',
       'Hispanic', 'Not_Hispanic', 'Not_White', 'pct_White', 'pct_Black',
       'pct_Native', 'pct_Asian', 'pct_Pacific_Islander', 'pct_Other',
       'pct_Not_White', 'pct_Hispanic', 'pct_Not_Hispanic',
       'pct_Two_or_more_Races', 'Simpson_Race_DI', 'Simpson_Ethnic_DI',
       'Shannon_Race_DI', 'Shannon_Ethnic_DI', 'Gini_Index',
       'Geographic_Area_Name'],
      dtype='object')

In [92]:
data_frame = data_frame[['county_FIPS', 'Geographic_Area_Name', 'County', 'GEOID', 'Total_Population', 'White', 'Black',
       'Native', 'Asian', 'Pacific_Islander', 'Other', 'Two_or_more_Races',
       'Hispanic', 'Not_Hispanic', 'Not_White', 'pct_White', 'pct_Black',
       'pct_Native', 'pct_Asian', 'pct_Pacific_Islander', 'pct_Other',
       'pct_Not_White', 'pct_Hispanic', 'pct_Not_Hispanic',
       'pct_Two_or_more_Races', 'Simpson_Race_DI', 'Simpson_Ethnic_DI',
       'Shannon_Race_DI', 'Shannon_Ethnic_DI', 'Gini_Index'
       ]]



In [93]:
data_frame

Unnamed: 0,county_FIPS,Geographic_Area_Name,County,GEOID,Total_Population,White,Black,Native,Asian,Pacific_Islander,...,pct_Other,pct_Not_White,pct_Hispanic,pct_Not_Hispanic,pct_Two_or_more_Races,Simpson_Race_DI,Simpson_Ethnic_DI,Shannon_Race_DI,Shannon_Ethnic_DI,Gini_Index
0,29001,"Adair County, Missouri",Adair County,0500000US29001,25314,21843,1280,52,688,14,...,0.009125,0.137355,0.026112,0.973888,0.047879,0.249769,0.050862,0.676567,0.120956,0.5398
1,29003,"Andrew County, Missouri",Andrew County,0500000US29003,18135,16907,157,54,90,2,...,0.004577,0.068045,0.021561,0.978439,0.046760,0.128535,0.042194,0.401734,0.104052,0.4138
2,29005,"Atchison County, Missouri",Atchison County,0500000US29005,5305,5008,24,26,9,1,...,0.005844,0.057116,0.013761,0.986239,0.039962,0.107177,0.027148,0.335013,0.072643,0.4299
3,29007,"Audrain County, Missouri",Audrain County,0500000US29007,24962,21648,1308,90,115,3,...,0.015023,0.133002,0.029285,0.970715,0.057247,0.241625,0.056856,0.654415,0.132246,0.4439
4,29009,"Barry County, Missouri",Barry County,0500000US29009,34534,28655,107,454,812,74,...,0.049053,0.170412,0.096514,0.903486,0.079458,0.302044,0.174402,0.905812,0.317354,0.4703
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3098,24047,"Worcester County, Maryland",Worcester County,0500000US24047,52460,41845,6237,137,754,9,...,0.016031,0.202459,0.039611,0.960389,0.050381,0.346609,0.076086,0.856209,0.166706,0.4880
3099,24510,"Baltimore city, Maryland",Baltimore city,0500000US24510,585708,163026,338479,2313,21211,187,...,0.047886,0.721670,0.078413,0.921587,0.055405,0.581873,0.144529,1.322896,0.274876,0.5123
3100,10001,"Kent County, Delaware",Kent County,0500000US10001,181851,107685,46999,1150,4430,126,...,0.031119,0.407872,0.076882,0.923118,0.086928,0.573395,0.141942,1.305087,0.271086,0.4191
3101,10003,"New Castle County, Delaware",New Castle County,0500000US10003,570719,314231,146545,2157,35201,176,...,0.050314,0.449423,0.111291,0.888709,0.076570,0.618709,0.197811,1.464587,0.349206,0.4607


# Connect to SQL DB 
### "../Database/database.sqlite3"

In [94]:
import sqlite3

In [95]:
# Create a SQL connection to the existing SQLite database
conn = sqlite3.connect('/Users/jennadodge/uofo-virt-data-pt-12-2021-u-b/Water_Quality_Analysis/Database/database.sqlite3')


In [96]:
cur = conn.cursor()

In [97]:
dtypes_dict= {'county_FIPS': 'INTEGER PRIMARY KEY',
        'Geographic_Area_Name':'TEXT',
        'County':'TEXT',
        'GEOID':'TEXT',
        'Total_Population':'INTEGER',
        'White':'INTEGER',
        'pct_White':'REAL',
        'Black':'INTEGER',
        'pct_Black':'REAL',
        'Native':'INTEGER',
        'pct_Native':'REAL',
        'Asian':'INTEGER',
        'pct_Asian':'REAL',
        'Pacific_Islander':'INTEGER',
        'pct_Pacific_Islander':'REAL',
        'Other':'INTEGER',
        'pct_Other':'REAL',
        'Two_or_more_Races':'INTEGER',
        'pct_Two_or_more_Races':'REAL',
        'Not_White':'INTEGER',
        'pct_Not_White':'REAL',
        'Hispanic':'INTEGER',
        'pct_Hispanic':'REAL',
        'Not_Hispanic':'INTEGER',
        'pct_Not_Hispanic':'REAL',
        'Simpson_Race_DI':'REAL',
        'Simpson_Ethnic_DI':'REAL',
        'Shannon_Race_DI':'REAL',
        'Shannon_Ethnic_DI':'REAL',
        'Gini_Index':'REAL'}

In [98]:
# Dataframe.to_sql
data_frame.to_sql("Census_Data", conn, if_exists="replace",index=False, dtype=dtypes_dict)

In [99]:
# Close the connection
conn.close()