DSC540 Term Project
Kurt Stoneburner

California COVID-19 Ethnicity Data
https://data.ca.gov/dataset/covid-19-cases/resource/7e477adb-d7ab-4d4b-a198-dc4c6dc634c9

API Example: https://data.ca.gov/api/3/action/datastore_search?resource_id=7e477adb-d7ab-4d4b-a198-dc4c6dc634c9&limit=5

Requests Documentation: https://www.w3schools.com/python/ref_requests_response.asp

In [4]:
import json
from urllib.parse import urljoin

import pandas as pd
import requests

In [5]:
#//*** Global settings are collected in a single dictionary named g.
#//*** Keeping them in one place keeps the notebook namespace tidy and makes scope obvious.
g = {
    #//*** Settings for the API calls
    "api" : {
        #//*** Scheme and host that every resource path below is appended to
        "url" : "https://data.ca.gov",
    }, #//*** CLOSE api
} #//*** CLOSE g

#//*** Resource: COVID-19 cases / deaths by race and ethnicity
g["api"]["ethnic"] = {
    "url" : "/api/3/action/datastore_search?resource_id=7e477adb-d7ab-4d4b-a198-dc4c6dc634c9",
    #//*** Column names, empty until filled from an API response
    "colnames" : [],
    #//*** Extra per-column attributes. key = column name, value = attributes
    #//*** Probably not needed, but kept so nothing from the API is lost.
    "attrib" : {},
} #//*** end Ethnic

#//*** Resource: county COVID numbers by date
g["api"]["cases"] = {
    "url" : "/api/3/action/datastore_search?resource_id=926fd08f-cc91-4828-af38-bd45de97f8c3",
    #//*** Column names, empty until filled from an API response
    "colnames" : [],
    #//*** Extra per-column attributes. key = column name, value = attributes
    "attrib" : {},
} #//*** end Cases

In [6]:
#//*** Get the whole database 100 records at a time.
#//*** This request gets the first 100 records. Future calls are handled in a loop
#response = requests.get(g['api']['url']+g["api"]["ethnic"]["url"])

#print(response)
#print(cases_url)


In [7]:
#//*** Build a data frame returning all values from a California Data Source API
def build_df_from_CA_API(url):
    """Download every record of a datastore_search resource into a DataFrame.

    The first response supplies the column definitions (result['fields']).
    Records are then collected page by page, following result['_links']['next']
    until a page comes back with fewer records than result['limit'].

    Parameters
    ----------
    url : str
        Full URL of the first datastore_search request. Follow-up page links
        returned by the API are resolved against this URL.

    Returns
    -------
    tuple
        (df, colnames, attrib_dict)
        df          - pandas DataFrame with one column per API field
        colnames    - list of field ids in the order the API reported them
        attrib_dict - key = column name, value = dict holding the field's
                      'info' entries (when present) plus its 'type'
        If the first request fails, an empty DataFrame, an empty list and an
        empty dict are returned so callers can always unpack or index the result.
    """

    #//*** Request the first page
    response = requests.get(url)

    #//*** Check for valid response
    if not response.ok:
        #//*** Trouble with API. Report it and hand back an empty result with the
        #//*** usual shape, instead of None which breaks callers on [0]
        print("Trouble fetching API data")
        print(response)
        return pd.DataFrame(), [], {}

    #//*** Valid Response
    #//*** Convert response.content to a dictionary using JSON
    rawOBJ = json.loads(response.content)

    #//*************************************
    #//*** Column names and attributes
    #//*************************************

    #//*** colnames: list of column names
    #//*** attrib_dict: key = column name, value = dictionary of attributes
    colnames = []
    attrib_dict = {}

    #//*** Each entry of [result][fields] describes one column
    for field in rawOBJ["result"]["fields"]:

        #//*** All columns have an info field except _id.
        #//*** Copy it, so adding 'type' below does not modify the parsed response
        field_attrib = dict(field.get("info") or {})
        field_attrib["type"] = field["type"]

        #//*** The column name is the id field
        colnames.append(field["id"])
        attrib_dict[field["id"]] = field_attrib

    #//*************************************
    #//*** Process the row and column data
    #//*************************************

    #//*** Raw data (rd): one empty list per column, filled as pages arrive
    rd = {col: [] for col in colnames}

    #//################################################################################################
    #//*** Keep going while the API reports success. Each pass handles one page of records.
    #//################################################################################################
    while rawOBJ["success"]:

        result = rawOBJ["result"]
        rawRecords = result["records"]

        #//*** Print a visual note for each loop iteration / API call
        print(f"Processing {len(rawRecords)}")

        #//*** Each record is an object keyed by column name.
        #//*** Append each value to the list of its column.
        for record in rawRecords:
            for col in colnames:
                rd[col].append(record[col])

        #//*** A page shorter than the limit is the last page
        if len(rawRecords) < result["limit"]:
            print("Quitting Loop")
            break

        #//*** next contains the URL path of the next request. No link, nothing more to fetch
        nextCall = result.get("_links", {}).get("next")
        if not nextCall:
            break

        #//*** Resolve the next link against the URL this function was called with,
        #//*** so paging does not depend on the global settings dictionary
        response = requests.get(urljoin(url, nextCall))

        #//*** A failed page request may not contain JSON. Stop and keep what was gathered.
        if not response.ok:
            print("Trouble fetching API data")
            print(response)
            break

        #//*** The while condition ends the loop if this page reports success == False
        rawOBJ = json.loads(response.content)

    ########################################################
    #//*** END while rawOBJ['success']
    #//*** Data gathered in the rd dictionary
    ########################################################

    #//*** Build the dataframe in one step, columns in API order
    df = pd.DataFrame(rd, columns=colnames)

    #//*** return the dataframe, column names, attribute dictionary
    return df, colnames, attrib_dict

#//*** END build_df_from_CA_API
        


In [8]:
####################################################################################################
#//*** Build covid_ethnic_df from the API
#//*** The download itself is handled by build_df_from_CA_API to keep this cell short
####################################################################################################

ethnic_url = g['api']['url']+g["api"]["ethnic"]["url"]

#//*** build_df_from_CA_API returns (dataframe, column names, column attributes).
#//*** Keep the column names and attributes in the slots reserved for them in g
#//*** instead of throwing them away.
(covid_ethnic_df,
 g['api']['ethnic']['colnames'],
 g['api']['ethnic']['attrib']) = build_df_from_CA_API(ethnic_url)


Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 40
Quitting Loop


In [9]:
####################################################################################################
#//*** Build the covid_cases_df from the API. These are the county COVID numbers by date
#//*** The download itself is handled by build_df_from_CA_API to keep this cell short
####################################################################################################
cases_url = g['api']['url']+g["api"]["cases"]["url"]

#//*** build_df_from_CA_API returns (dataframe, column names, column attributes).
#//*** Keep the column names and attributes in the slots reserved for them in g
#//*** instead of throwing them away.
(covid_cases_df,
 g['api']['cases']['colnames'],
 g['api']['cases']['attrib']) = build_df_from_CA_API(cases_url)


Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing

In [10]:
#//*** Preview the first 20 rows of the county case data as a sanity check on the download
covid_cases_df.head(20)

Unnamed: 0,_id,county,totalcountconfirmed,totalcountdeaths,newcountconfirmed,newcountdeaths,date
0,1,Santa Clara,151.0,6.0,151,6,2020-03-18T00:00:00
1,2,Santa Clara,183.0,8.0,32,2,2020-03-19T00:00:00
2,3,Santa Clara,246.0,8.0,63,0,2020-03-20T00:00:00
3,4,Santa Clara,269.0,10.0,23,2,2020-03-21T00:00:00
4,5,Santa Clara,284.0,13.0,15,3,2020-03-22T00:00:00
5,6,Santa Clara,336.0,13.0,52,0,2020-03-23T00:00:00
6,7,Santa Clara,389.0,17.0,53,4,2020-03-24T00:00:00
7,8,Santa Clara,452.0,20.0,63,3,2020-03-25T00:00:00
8,9,Santa Clara,487.0,20.0,35,0,2020-03-26T00:00:00
9,10,Santa Clara,557.0,25.0,70,5,2020-03-27T00:00:00


In [11]:
#//*** Quick look at the ethnicity data, one item per line:
#//*** column names, distinct race_ethnicity labels, distinct _id values, first 10 rows
print(
    covid_ethnic_df.columns,
    covid_ethnic_df['race_ethnicity'].unique(),
    covid_ethnic_df['_id'].unique(),
    covid_ethnic_df.head(10),
    sep="\n",
)


Index(['_id', 'race_ethnicity', 'cases', 'case_percentage', 'deaths',
       'death_percentage', 'percent_ca_population', 'date'],
      dtype='object')
['Latino' 'White' 'Asian' 'Black' 'Multiracial'
 'American Indian or Alaska Native' 'Native Hawaiian or Pacific Islander'
 'Other' 'Multi-Race' 'Native Hawaiian and other Pacific Islander']
[   1    2    3 ... 2238 2239 2240]
   _id race_ethnicity  cases  case_percentage  deaths  death_percentage  \
0    1         Latino   5276            35.99     170             28.38   
1    2         Latino   5910            37.18     203             29.72   
2    3         Latino   6433            37.80     226             29.70   
3    4         Latino   7013            38.51     254             29.85   
4    5         Latino   7627            39.41     281             30.58   
5    6         Latino   8195            40.28     314             31.24   
6    7         Latino   8397            40.37     326             31.38   
7    8         Latino

In [12]:
#//*** Process Flat File: California Ethnicity demographics - cc-est2019-alldata-06.csv
#//*** The file holds several YEAR values and several AGEGRP values per county.
#//*** Two filters are applied one after the other:
#//***   1. keep only the rows with the highest YEAR value (most recent data)
#//***   2. of those rows, keep only the lowest AGEGRP value (AGEGRP == 0, totals of all ages)
raw_ethnic_pop_df = (
    pd.read_csv("cc-est2019-alldata-06.csv")
    .loc[lambda frame: frame['YEAR'] == frame['YEAR'].max()]
    .loc[lambda frame: frame['AGEGRP'] == frame['AGEGRP'].min()]
)

raw_ethnic_pop_df.head(20)


Unnamed: 0,SUMLEV,STATE,COUNTY,STNAME,CTYNAME,YEAR,AGEGRP,TOT_POP,TOT_MALE,TOT_FEMALE,...,HWAC_MALE,HWAC_FEMALE,HBAC_MALE,HBAC_FEMALE,HIAC_MALE,HIAC_FEMALE,HAAC_MALE,HAAC_FEMALE,HNAC_MALE,HNAC_FEMALE
209,50,6,1,California,Alameda County,12,0,1671329,823247,848082,...,166972,162697,10421,10961,12411,12008,9100,9346,2296,2392
437,50,6,3,California,Alpine County,12,0,1129,609,520,...,50,41,4,2,28,26,3,1,0,1
665,50,6,5,California,Amador County,12,0,39752,21638,18114,...,3275,1939,101,49,318,225,94,53,35,21
893,50,6,7,California,Butte County,12,0,219186,108473,110713,...,17400,16812,502,554,1702,1780,406,399,167,159
1121,50,6,9,California,Calaveras County,12,0,45905,22847,23058,...,2720,2644,65,65,261,286,106,76,26,20
1349,50,6,11,California,Colusa County,12,0,21547,10975,10572,...,6477,6024,65,54,242,202,68,48,42,39
1577,50,6,13,California,Contra Costa County,12,0,1153526,564187,589339,...,136151,136348,6839,7116,7438,7178,6960,6950,1570,1491
1805,50,6,15,California,Del Norte County,12,0,27812,15186,12626,...,3167,1642,74,52,499,392,56,40,11,13
2033,50,6,17,California,El Dorado County,12,0,192843,96158,96685,...,11740,11679,313,241,1065,936,390,354,113,107
2261,50,6,19,California,Fresno County,12,0,999101,498648,500453,...,248241,242710,7502,7891,15658,15487,6049,6036,1187,1155


In [42]:
#//*** Translate the federal census columns into the categories used by the California data.
#//*** Federal column descriptions:
#//*** https://www2.census.gov/programs-surveys/popest/technical-documentation/file-layouts/2010-2019/cc-est2019-alldata.pdf

#//*** The federal census treats Hispanic as an ethnicity, not a race. A person can be
#//*** Hispanic White, Hispanic Black, Hispanic Asian and so on.
#//*** California counts all Hispanics as Latino, so the other categories use the
#//*** "Not Hispanic" (NH...) federal columns.

#//*** California also has the columns Multiracial, Other and Multi-Race. I could not find a good
#//*** definition for these. They represent less than 5% of the population, small but not small
#//*** enough to ignore. They will be combined into a single attribute Other, together with
#//*** NHTOM_MALE, NHTOM_FEMALE (Not Hispanic, Two or more races).

#//*** key = California category name, value = [male, female] federal columns to add together
#//*** These are the easy 1:1 columns.
#//*** Hawaiian and Other will need adjustment on the California side of the dataset.
clean_cols = {
    'Latino' : ['H_MALE', 'H_FEMALE'],                                  #//*** Hispanic
    'White' : ['NHWA_MALE', 'NHWA_FEMALE'],                             #//*** Not Hispanic White
    'Asian' : ['NHAA_MALE', 'NHAA_FEMALE'],                             #//*** Not Hispanic Asian
    'Black' : ['NHBA_MALE', 'NHBA_FEMALE'],                             #//*** Not Hispanic Black
    'American Indian or Alaska Native' : ['NHIA_MALE','NHIA_FEMALE'],   #//*** Not Hispanic, American Indian
    'Hawaiian' : ['NHNA_MALE', 'NHNA_FEMALE'],                          #//*** Not Hispanic, Hawaiian
    'Other' : ['NHTOM_MALE', 'NHTOM_FEMALE'],                           #//*** Not Hispanic, Two or more races
}

#//*** New data frame for the sanitized values.
#//*** County name is the common attribute that links to the time series data,
#//*** so the trailing " County" is removed from it.
pop_attrib_df = pd.DataFrame({
    'county' : raw_ethnic_pop_df['CTYNAME'].str.replace(" County",""),
    'population' : raw_ethnic_pop_df['TOT_POP'],
})

#//*** California column = federal male column + federal female column
for ca_name,fed_names in clean_cols.items():
    pop_attrib_df[ca_name] = raw_ethnic_pop_df[fed_names[0]].add(raw_ethnic_pop_df[fed_names[1]])

#//*** California labels still to be reconciled:
#              'Native Hawaiian or Pacific Islander' :
#              'Native Hawaiian and other Pacific Islander'
#            'Other'


                county  population   Latino    White    Asian   Black  \
209            Alameda     1671329   373055   512134   529698  169954   
437             Alpine        1129      139      692       18       4   
665             Amador       39752     5753    30742      575     994   
893              Butte      219186    37731   155415    10573    3526   
1121         Calaveras       45905     5967    36672      719     420   
1349            Colusa       21547    13018     7344      266     220   
1577      Contra Costa     1153526   300420   492393   204045  100798   
1805         Del Norte       27812     5596    17236      802     917   
2033         El Dorado      192843    25378   148903     8974    1696   
2261            Fresno      999101   537180   286049   103430   46274   
2489             Glenn       28393    12079    14334      752     240   
2717          Humboldt      135558    16354   100078     3671    1686   
2945          Imperial      181215   154088    1816

In [14]:
#//*** Inspect a single county row to see which male columns exist
thisRow = raw_ethnic_pop_df.iloc[0]
totmale = thisRow['TOT_MALE']

#//*** Every *_MALE column except the TOT_MALE total and the *AC_MALE columns
male_cols = [
    label
    for label in thisRow.index
    if "_MALE" in label and "TOT_MALE" not in label and "AC_MALE" not in label
]

print(f"Total Male: {totmale}")
print(f"{male_cols}")

print(thisRow)

#Total Hispanic - H_MALE, H_FEMALE

#//*** Value of each selected male column for this row
for index in male_cols:
    print(f" {index} : {thisRow[index]}")
#print(raw_ethnic_pop_df.head(20))


Total Male: 823247
['WA_MALE', 'BA_MALE', 'IA_MALE', 'AA_MALE', 'NA_MALE', 'TOM_MALE', 'NH_MALE', 'NHWA_MALE', 'NHBA_MALE', 'NHIA_MALE', 'NHAA_MALE', 'NHNA_MALE', 'NHTOM_MALE', 'H_MALE', 'HWA_MALE', 'HBA_MALE', 'HIA_MALE', 'HAA_MALE', 'HNA_MALE', 'HTOM_MALE']
SUMLEV                     50
STATE                       6
COUNTY                      1
STNAME             California
CTYNAME        Alameda County
                    ...      
HIAC_FEMALE             12008
HAAC_MALE                9100
HAAC_FEMALE              9346
HNAC_MALE                2296
HNAC_FEMALE              2392
Name: 209, Length: 80, dtype: object
 WA_MALE : 414416
 BA_MALE : 88167
 IA_MALE : 9048
 AA_MALE : 259991
 NA_MALE : 7534
 TOM_MALE : 44091
 NH_MALE : 634636
 NHWA_MALE : 256400
 NHBA_MALE : 81150
 NHIA_MALE : 1957
 NHAA_MALE : 254719
 NHNA_MALE : 6423
 NHTOM_MALE : 33987
 H_MALE : 188611
 HWA_MALE : 158016
 HBA_MALE : 7017
 HIA_MALE : 7091
 HAA_MALE : 5272
 HNA_MALE : 1111
 HTOM_MALE : 10104
