# STL LaunchCode Women+ Capstone Project
by Hafsa and Nicole

In [46]:
# Dependencies and modules:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import time
import pprint as pp
import seaborn as sns
import statistics
import math
from scipy import stats
import json
import html
import lxml

from api import my_api

# Audible cell-completion notification:
# The Audio object is the last expression of the cell, so it is what the cell
# displays; autoplay=True asks the front end to start playback as soon as it renders.
# NOTE(review): 'Bicycle-bell.mp3' is a relative path — presumably resolved against
# the notebook's working directory; confirm the file is shipped with the notebook.
from IPython.display import Audio
Audio('Bicycle-bell.mp3', autoplay=True)

## API Call to Federal Census Database for American Community Survey 5-Year Data (2009-2019)

In [47]:
# Census API key, imported from the local `api` module:
api_key = my_api

# ACS 5-year (2019) Data Profile endpoint: every variable in group DP03,
# for every county in every state:
base_url_pov = 'https://api.census.gov/data/2019/acs/acs5/profile?get=group(DP03)&for=county:*&in=state:*'

# Request the data. The key is appended as a `key` query parameter only when one
# is configured, and a timeout keeps a stalled connection from hanging the kernel.
req1 = requests.get(
    base_url_pov,
    params={'key': api_key} if api_key else None,
    timeout=120,
)
print(req1)  # expect <Response [200]>

# Stop here with a clear HTTP error rather than a confusing JSON decode error below:
req1.raise_for_status()

# Parsed response body: a list of rows whose first row holds the column names.
pov_data = req1.json()
Audio('Bicycle-bell.mp3', autoplay=True)

<Response [200]>


In [48]:
# Load the raw API rows; at this point the header is still sitting in row 0.
census_rows = pd.DataFrame(pov_data)
# Promote row 0 to column labels...
census_rows.columns = census_rows.iloc[0]
# ...and keep everything after it as the data.
poverty = census_rows.iloc[1:]
# Quick look at the result:
poverty.head()

Unnamed: 0,DP03_0001E,DP03_0001EA,DP03_0001M,DP03_0001MA,DP03_0001PE,DP03_0001PEA,DP03_0001PM,DP03_0001PMA,DP03_0002E,DP03_0002EA,...,DP03_0137M,DP03_0137MA,DP03_0137PE,DP03_0137PEA,DP03_0137PM,DP03_0137PMA,GEO_ID,NAME,state,county
1,17630,,131,,17630,,-888888888,(X),9285,,...,-888888888,(X),27.2,,4.5,,0500000US17051,"Fayette County, Illinois",17,51
2,24117,,83,,24117,,-888888888,(X),13168,,...,-888888888,(X),20.6,,3.3,,0500000US17107,"Logan County, Illinois",17,107
3,19422,,92,,19422,,-888888888,(X),10718,,...,-888888888,(X),31.8,,4.3,,0500000US17165,"Saline County, Illinois",17,165
4,551803,,565,,551803,,-888888888,(X),380960,,...,-888888888,(X),18.8,,1.0,,0500000US17097,"Lake County, Illinois",17,97
5,11427,,107,,11427,,-888888888,(X),5895,,...,-888888888,(X),31.8,,5.1,,0500000US17127,"Massac County, Illinois",17,127


### Creating dictionary of variable definitions:

In [49]:
# Displaying variable labels to determine features to keep.
# The downloaded table is transposed so that each variable code becomes a row,
# and the codes are then moved out of the index into a regular "index" column.
var_df = pd.read_csv("ACSDP5YSPT2010.DP03_data_with_overlays_2022-02-02T201846.csv")

# Show every row and the full label text. The canonical option names are used
# here; abbreviated or pattern-style names only work through pandas' regex
# matching of option keys and can break when similarly named options are added.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

trans_var = var_df.T.reset_index()
trans_var

Unnamed: 0,index,0,1
0,DP03_0001E,Estimate!!EMPLOYMENT STATUS!!Population 16 years and over,238733844
1,DP03_0001M,Estimate Margin of Error!!EMPLOYMENT STATUS!!Population 16 years and over,14879
2,DP03_0001PE,Percent!!EMPLOYMENT STATUS!!Population 16 years and over,238733844
3,DP03_0001PM,Percent Margin of Error!!EMPLOYMENT STATUS!!Population 16 years and over,(X)
4,DP03_0002E,Estimate!!EMPLOYMENT STATUS!!In labor force,155163977
5,DP03_0002M,Estimate Margin of Error!!EMPLOYMENT STATUS!!In labor force,102484
6,DP03_0002PE,Percent!!EMPLOYMENT STATUS!!In labor force,65.0
7,DP03_0002PM,Percent Margin of Error!!EMPLOYMENT STATUS!!In labor force,0.1
8,DP03_0003E,Estimate!!EMPLOYMENT STATUS!!In labor force!!Civilian labor force,154037474
9,DP03_0003M,Estimate Margin of Error!!EMPLOYMENT STATUS!!In labor force!!Civilian labor force,94897


In [50]:
# Keeping only the percent-estimate rows (variable codes containing "PE").
# This drops the count estimates ("E") and both margin-of-error variants
# ("M" and "PM"). "PE" is matched as a literal substring, and a missing code
# counts as "no match" instead of propagating NaN into the mask.
is_percent_estimate = trans_var["index"].str.contains("PE", regex=False, na=False)
trans_vars = trans_var[is_percent_estimate]

trans_vars

Unnamed: 0,index,0,1
2,DP03_0001PE,Percent!!EMPLOYMENT STATUS!!Population 16 years and over,238733844
6,DP03_0002PE,Percent!!EMPLOYMENT STATUS!!In labor force,65.0
10,DP03_0003PE,Percent!!EMPLOYMENT STATUS!!In labor force!!Civilian labor force,64.5
14,DP03_0004PE,Percent!!EMPLOYMENT STATUS!!In labor force!!Civilian labor force!!Employed,59.4
18,DP03_0005PE,Percent!!EMPLOYMENT STATUS!!In labor force!!Civilian labor force!!Unemployed,5.1
22,DP03_0006PE,Percent!!EMPLOYMENT STATUS!!In labor force!!Armed Forces,0.5
26,DP03_0007PE,Percent!!EMPLOYMENT STATUS!!Not in labor force,35.0
30,DP03_0008PE,Percent!!EMPLOYMENT STATUS!!Civilian labor force,154037474
34,DP03_0009PE,Percent!!EMPLOYMENT STATUS!!Percent Unemployed,7.9
38,DP03_0010PE,Percent!!EMPLOYMENT STATUS!!Females 16 years and over,122702038


In [4]:
# Variable metadata was downloaded from census.gov as JSON; parse the file
# straight from the open handle into a dict:
with open("variables.json","r") as variables_file:
    poverty_variables = json.load(variables_file)
poverty_variables

{'variables': {'for': {'label': "Census API FIPS 'for' clause",
   'concept': 'Census API Geography Specification',
   'predicateType': 'fips-for',
   'group': 'N/A',
   'limit': 0,
   'predicateOnly': True},
  'in': {'label': "Census API FIPS 'in' clause",
   'concept': 'Census API Geography Specification',
   'predicateType': 'fips-in',
   'group': 'N/A',
   'limit': 0,
   'predicateOnly': True},
  'ucgid': {'label': 'Uniform Census Geography Identifier clause',
   'concept': 'Census API Geography Specification',
   'predicateType': 'ucgid',
   'group': 'N/A',
   'limit': 0,
   'predicateOnly': True,
   'hasGeoCollectionSupport': True},
  'DP02_0126E': {'label': 'Estimate!!ANCESTRY!!Total population!!Czech',
   'concept': 'SELECTED SOCIAL CHARACTERISTICS IN THE UNITED STATES',
   'predicateType': 'int',
   'group': 'DP02',
   'limit': 0,
   'attributes': 'DP02_0126EA,DP02_0126M,DP02_0126MA'},
  'DP05_0050PE': {'label': 'Percent!!RACE!!Total population!!One race!!Asian!!Vietnamese',
 

In [None]:
# Reads each item of `data` with pd.read_json and stacks the resulting frames row-wise.
# NOTE(review): `data` is not defined in any cell visible here and this cell has
# never been executed (In [None]); on Restart & Run All it will raise NameError
# unless `data` is defined elsewhere — confirm the intent or remove the cell.
pd.concat((pd.read_json(d) for d in data), axis=0)

In [5]:
# Column labels of the `poverty` frame (the variable codes returned by the API
# plus GEO_ID, NAME, state and county); used as the lookup keys for the legend.
the_variables = poverty.columns
the_variables

Index(['DP03_0001E', 'DP03_0001EA', 'DP03_0001M', 'DP03_0001MA', 'DP03_0001PE',
       'DP03_0001PEA', 'DP03_0001PM', 'DP03_0001PMA', 'DP03_0002E',
       'DP03_0002EA',
       ...
       'DP03_0137M', 'DP03_0137MA', 'DP03_0137PE', 'DP03_0137PEA',
       'DP03_0137PM', 'DP03_0137PMA', 'GEO_ID', 'NAME', 'state', 'county'],
      dtype='object', name=0, length=1100)

In [6]:
# Creating a key for column names with their definition.
# The three lists are kept parallel: entry i of each list describes the same column.

variable_legend = {'column_name':[],
             'definition':[],
             'concept':[]}

# Per-variable metadata, keyed by variable code:
variable_catalog = poverty_variables['variables']

# Loop over the dataframe's columns and record the metadata for each one found.
for variable in the_variables:

    # Resolve both fields BEFORE appending anything, so a column with missing
    # metadata cannot leave the three lists with different lengths.
    try:
        entry = variable_catalog[variable]
        definition = entry['label']
        concept = entry['concept']
    except KeyError:
        # No metadata entry (or no label/concept) for this column.
        print(variable+ ' Not Found')
        continue

    # Store the column's own name, not the whole metadata dict.
    variable_legend['column_name'].append(variable)
    variable_legend['definition'].append(definition)
    variable_legend['concept'].append(concept)

    print("Processing variable:" + variable)


Processing variable:DP03_0001E
DP03_0001EA Not Found
DP03_0001M Not Found
DP03_0001MA Not Found
Processing variable:DP03_0001PE
DP03_0001PEA Not Found
DP03_0001PM Not Found
DP03_0001PMA Not Found
Processing variable:DP03_0002E
DP03_0002EA Not Found
DP03_0002M Not Found
DP03_0002MA Not Found
Processing variable:DP03_0002PE
DP03_0002PEA Not Found
DP03_0002PM Not Found
DP03_0002PMA Not Found
Processing variable:DP03_0003E
DP03_0003EA Not Found
DP03_0003M Not Found
DP03_0003MA Not Found
Processing variable:DP03_0003PE
DP03_0003PEA Not Found
DP03_0003PM Not Found
DP03_0003PMA Not Found
Processing variable:DP03_0004E
DP03_0004EA Not Found
DP03_0004M Not Found
DP03_0004MA Not Found
Processing variable:DP03_0004PE
DP03_0004PEA Not Found
DP03_0004PM Not Found
DP03_0004PMA Not Found
Processing variable:DP03_0005E
DP03_0005EA Not Found
DP03_0005M Not Found
DP03_0005MA Not Found
Processing variable:DP03_0005PE
DP03_0005PEA Not Found
DP03_0005PM Not Found
DP03_0005PMA Not Found
Processing variable:

In [10]:
# Inspect the labels collected for the columns that were found in the metadata:
variable_legend['definition']

['Estimate!!EMPLOYMENT STATUS!!Population 16 years and over',
 'Percent!!EMPLOYMENT STATUS!!Population 16 years and over',
 'Estimate!!EMPLOYMENT STATUS!!Population 16 years and over!!In labor force',
 'Percent!!EMPLOYMENT STATUS!!Population 16 years and over!!In labor force',
 'Estimate!!EMPLOYMENT STATUS!!Population 16 years and over!!In labor force!!Civilian labor force',
 'Percent!!EMPLOYMENT STATUS!!Population 16 years and over!!In labor force!!Civilian labor force',
 'Estimate!!EMPLOYMENT STATUS!!Population 16 years and over!!In labor force!!Civilian labor force!!Employed',
 'Percent!!EMPLOYMENT STATUS!!Population 16 years and over!!In labor force!!Civilian labor force!!Employed',
 'Estimate!!EMPLOYMENT STATUS!!Population 16 years and over!!In labor force!!Civilian labor force!!Unemployed',
 'Percent!!EMPLOYMENT STATUS!!Population 16 years and over!!In labor force!!Civilian labor force!!Unemployed',
 'Estimate!!EMPLOYMENT STATUS!!Population 16 years and over!!In labor force!!Arme

In [21]:
# passing data to dataframe:
# NOTE(review): as the output below shows, this yields a single 'variables'
# column whose cells are the raw per-variable dicts, indexed by variable code.
# If one column per attribute (label, concept, ...) is wanted,
# pd.DataFrame.from_dict(poverty_variables['variables'], orient='index') may be
# the intended call — confirm against downstream cells before changing.
variables_df = pd.DataFrame(poverty_variables)
variables_df

Unnamed: 0,variables
AIANHH,"{'label': 'Geography', 'group': 'N/A', 'limit'..."
AIHHTL,"{'label': 'Geography', 'group': 'N/A', 'limit'..."
AIRES,"{'label': 'Geography', 'group': 'N/A', 'limit'..."
ANRC,"{'label': 'Geography', 'group': 'N/A', 'limit'..."
CBSA,"{'label': 'Geography', 'group': 'N/A', 'limit'..."
...,...
UA,"{'label': 'Geography', 'group': 'N/A', 'limit'..."
ZCTA,"{'label': 'Geography', 'group': 'N/A', 'limit'..."
for,"{'label': 'Census API FIPS 'for' clause', 'con..."
in,"{'label': 'Census API FIPS 'in' clause', 'conc..."


In [27]:
# EPA Envirofacts query: rows of SDW_CONTAM_VIOL_CITY for the water system with
# PWSID IL3141937, returned as JSON.
base_url_wat = 'https://enviro.epa.gov/enviro/efservice/SDW_CONTAM_VIOL_CITY/PWSID/IL3141937/JSON'

# A timeout keeps a stalled connection from hanging the kernel.
req2 = requests.get(base_url_wat, timeout=60)
print(req2)  # expect <Response [200]>

# Stop here with a clear HTTP error rather than a confusing JSON decode error below:
req2.raise_for_status()

json_data_wat = req2.json()

<Response [200]>


In [28]:
# Turn the parsed EPA response back into JSON text
json_object_wat = json.dumps(json_data_wat)

# Save that text to water.json in the working directory
with open("water.json", "w") as water_file:
    water_file.write(json_object_wat)

In [29]:
# Read the saved EPA response back from disk, parsing directly from the file handle
with open("water.json","r") as water_file:
    water_data = json.load(water_file)

# Build the dataframe from the parsed records and display it
water_df = pd.DataFrame(water_data)
water_df

Unnamed: 0,PWSID,PWSNAME,STATE,COUNTYSERVED,GEOLOCATION_CITY,VIOID,CCODE,CNAME,SOURCES,DEFINITION,...,CTYPE,VCODE,VNAME,VTYPE,VIOLMEASURE,ENFACTIONTYPE,ENFACTIONNAME,ENFDATE,COMPPERBEGINDATE,COMPPERENDDATE
0,IL3141937,TIMBER PNTE OTDR CNTR-POOL BHS,IL,MCLEAN,MERNA,1,3100,Coliform (TCR),Coliforms are naturally present in the environ...,Coliforms are bacteria that are naturally pres...,...,TCR,22,"MCL, Monthly (TCR)",MCL,Maximum Contaminant Level Violation,SFJ,St Formal NOV issued,19-JUL-10,01-APR-10,30-JUN-10
1,IL3141937,TIMBER PNTE OTDR CNTR-POOL BHS,IL,MCLEAN,MERNA,1,3100,Coliform (TCR),Coliforms are naturally present in the environ...,Coliforms are bacteria that are naturally pres...,...,TCR,22,"MCL, Monthly (TCR)",MCL,Maximum Contaminant Level Violation,SIF,St Public Notif received,19-JUL-10,01-APR-10,30-JUN-10
2,IL3141937,TIMBER PNTE OTDR CNTR-POOL BHS,IL,MCLEAN,GRIDLEY,1,3100,Coliform (TCR),Coliforms are naturally present in the environ...,Coliforms are bacteria that are naturally pres...,...,TCR,22,"MCL, Monthly (TCR)",MCL,Maximum Contaminant Level Violation,SFJ,St Formal NOV issued,19-JUL-10,01-APR-10,30-JUN-10
3,IL3141937,TIMBER PNTE OTDR CNTR-POOL BHS,IL,MCLEAN,GRIDLEY,1,3100,Coliform (TCR),Coliforms are naturally present in the environ...,Coliforms are bacteria that are naturally pres...,...,TCR,22,"MCL, Monthly (TCR)",MCL,Maximum Contaminant Level Violation,SIF,St Public Notif received,19-JUL-10,01-APR-10,30-JUN-10
4,IL3141937,TIMBER PNTE OTDR CNTR-POOL BHS,IL,MCLEAN,BELLFLOWER,1,3100,Coliform (TCR),Coliforms are naturally present in the environ...,Coliforms are bacteria that are naturally pres...,...,TCR,22,"MCL, Monthly (TCR)",MCL,Maximum Contaminant Level Violation,SFJ,St Formal NOV issued,19-JUL-10,01-APR-10,30-JUN-10
5,IL3141937,TIMBER PNTE OTDR CNTR-POOL BHS,IL,MCLEAN,BELLFLOWER,1,3100,Coliform (TCR),Coliforms are naturally present in the environ...,Coliforms are bacteria that are naturally pres...,...,TCR,22,"MCL, Monthly (TCR)",MCL,Maximum Contaminant Level Violation,SIF,St Public Notif received,19-JUL-10,01-APR-10,30-JUN-10
6,IL3141937,TIMBER PNTE OTDR CNTR-POOL BHS,IL,MCLEAN,BLOOMINGTON,1,3100,Coliform (TCR),Coliforms are naturally present in the environ...,Coliforms are bacteria that are naturally pres...,...,TCR,22,"MCL, Monthly (TCR)",MCL,Maximum Contaminant Level Violation,SFJ,St Formal NOV issued,19-JUL-10,01-APR-10,30-JUN-10
7,IL3141937,TIMBER PNTE OTDR CNTR-POOL BHS,IL,MCLEAN,BLOOMINGTON,1,3100,Coliform (TCR),Coliforms are naturally present in the environ...,Coliforms are bacteria that are naturally pres...,...,TCR,22,"MCL, Monthly (TCR)",MCL,Maximum Contaminant Level Violation,SIF,St Public Notif received,19-JUL-10,01-APR-10,30-JUN-10
8,IL3141937,TIMBER PNTE OTDR CNTR-POOL BHS,IL,MCLEAN,CROPSEY,1,3100,Coliform (TCR),Coliforms are naturally present in the environ...,Coliforms are bacteria that are naturally pres...,...,TCR,22,"MCL, Monthly (TCR)",MCL,Maximum Contaminant Level Violation,SFJ,St Formal NOV issued,19-JUL-10,01-APR-10,30-JUN-10
9,IL3141937,TIMBER PNTE OTDR CNTR-POOL BHS,IL,MCLEAN,CROPSEY,1,3100,Coliform (TCR),Coliforms are naturally present in the environ...,Coliforms are bacteria that are naturally pres...,...,TCR,22,"MCL, Monthly (TCR)",MCL,Maximum Contaminant Level Violation,SIF,St Public Notif received,19-JUL-10,01-APR-10,30-JUN-10
