DSC540 Term Project
Kurt Stoneburner

California COVID-19 Ethnicity Data
https://data.ca.gov/dataset/covid-19-cases/resource/7e477adb-d7ab-4d4b-a198-dc4c6dc634c9

API Example: https://data.ca.gov/api/3/action/datastore_search?resource_id=7e477adb-d7ab-4d4b-a198-dc4c6dc634c9&limit=5

Requests Documentation: https://www.w3schools.com/python/ref_requests_response.asp

In [1]:
import json
from urllib.parse import urljoin

import pandas as pd
import requests

In [2]:
#//*** Global settings live in a single nested dictionary so that every
#//*** cell reads its configuration from one clearly scoped place.
g = dict(
    #//*** Everything needed to call the API
    api=dict(
        #//*** Scheme + host shared by all endpoints
        url="https://data.ca.gov",
        #//*** Settings for the ethnicity data resource
        ethnic=dict(
            #//*** Path + query string, appended to the base url above
            url="/api/3/action/datastore_search?resource_id=7e477adb-d7ab-4d4b-a198-dc4c6dc634c9",
            #//*** Column names (starts empty)
            colnames=[],
            #//*** Extra per-column attributes: key = column name, value = attributes.
            #//*** Probably not needed, but a slot is kept for them anyway.
            attrib={},
        ),  #//*** END api.ethnic
    ),  #//*** END api
)  #//*** END g

In [None]:
#//*** Get the whole database 100 records at a time.
#//*** This request gets the first 100 records. Future calls are handled in a loop
#response = requests.get(g['api']['url']+g["api"]["ethnic"]["url"])

#print(response)


In [30]:
#//*** Build a data frame returning all values from a California Data Source API
def build_df_from_CA_API(url):
    """Download every record behind a datastore_search URL into a DataFrame.

    The first response supplies the column definitions (``result.fields``).
    Records are then collected page by page by following
    ``result._links.next`` until a page holds fewer records than
    ``result.limit``, no ``next`` link is present, or the API reports
    ``success`` as false.

    Parameters
    ----------
    url : str
        Full URL of the first datastore_search request. Relative ``next``
        links returned by the API are resolved against this URL, so the
        function does not depend on any notebook-level globals.

    Returns
    -------
    tuple or None
        ``(df, colnames, attrib_dict)`` where ``df`` holds one column per
        field, ``colnames`` is the list of field ids in API order, and
        ``attrib_dict`` maps each field id to its attributes (the field's
        ``info`` entries, when present, plus its ``type``).
        ``None`` is returned when the very first request fails. If a later
        page fails, a message is printed and the rows gathered so far are
        returned.
    """
    #//*** Request the first page
    response = requests.get(url)

    #//*** Check for valid response
    if not response.ok:
        #//*** Trouble with API: report it and signal failure to the caller
        print("Trouble fetching API data")
        print(response)
        return None

    #//*** Valid Response: convert response.content to a dictionary using JSON
    rawOBJ = json.loads(response.content)

    #//*************************************
    #//*** Process the column definitions
    #//*************************************
    colnames = []     #//*** Column names, in API order
    attrib_dict = {}  #//*** key = column name, value = dictionary of attributes

    #//*** Each field carries a column name (id), a type and, optionally, an info dictionary
    for field in rawOBJ["result"]["fields"]:
        #//*** Copy info so adding 'type' does not modify the parsed response itself.
        #//*** Not every field has an info entry, so fall back to an empty dictionary.
        attribs = dict(field["info"]) if "info" in field else {}
        attribs["type"] = field["type"]

        colnames.append(field["id"])
        attrib_dict[field["id"]] = attribs

    #//*************************************
    #//*** Process the row and column data
    #//*************************************

    #//*** Raw data (rd): one list of values per column
    rd = {col: [] for col in colnames}

    #//################################################################################################
    #//*** Keep looping while the API reports success, requesting one page of records per pass.
    #//################################################################################################
    while rawOBJ["success"]:

        #//*** Records arrive as a list of objects keyed by column name
        rawRecords = rawOBJ["result"]["records"]

        #//*** Print a visual note for each loop iteration / API call
        print(f"Processing {len(rawRecords)}")

        #//*** Pivot the row objects into per-column lists
        for record in rawRecords:
            for col in colnames:
                rd[col].append(record[col])

        #//*** A page shorter than the limit is the last page
        if len(rawRecords) < rawOBJ["result"]["limit"]:
            print("Quitting Loop")
            break

        #//*** 'next' contains the URL of the following page; without it we are done
        nextCall = rawOBJ["result"]["_links"].get("next")
        if not nextCall:
            break

        #//*** Resolve the next link against the URL we were given
        response = requests.get(urljoin(url, nextCall))
        if not response.ok:
            #//*** Stop paging; rows collected so far are still returned
            print("Trouble fetching API data")
            print(response)
            break

        rawOBJ = json.loads(response.content)
    ########################################################
    #//*** END while rawOBJ['success']
    #//*** Data gathered in the rd dictionary
    ########################################################

    #//*** Build the dataframe in one step, keeping the API's column order
    df = pd.DataFrame(rd, columns=colnames)

    #//*** return the dataframe, column names, attribute dictionary
    return df, colnames, attrib_dict

#//*** END build_df_from_CA_API
        
####################################################################################################
#//*** Build ethnic_df from the API
#//*** The download itself lives in build_df_from_CA_API to keep this cell short
####################################################################################################

#//*** Start from an empty DataFrame instance. Note the parentheses: a bare
#//*** pd.DataFrame would bind the class itself rather than an empty frame.
ethnic_df = pd.DataFrame()

ethnic_url = g['api']['url']+g["api"]["ethnic"]["url"]

#//*** build_df_from_CA_API returns None when the request fails, so only
#//*** take element [0] (the dataframe) when a result actually came back.
ethnic_result = build_df_from_CA_API(ethnic_url)
if ethnic_result is not None:
    ethnic_df = ethnic_result[0]
#//*** Alternative that also keeps the column names and attributes in g:
#ethnic_df, g['api']['ethnic']['colnames'], g['api']['ethnic']['attrib'] = build_df_from_CA_API(ethnic_url)


Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 100
Processing 32
Quitting Loop


In [29]:
#//*** Process Flat File: 

   _id race_ethnicity  cases  case_percentage  deaths  death_percentage  \
0    1         Latino   5276            35.99     170             28.38   
1    2         Latino   5910            37.18     203             29.72   
2    3         Latino   6433            37.80     226             29.70   
3    4         Latino   7013            38.51     254             29.85   
4    5         Latino   7627            39.41     281             30.58   
5    6         Latino   8195            40.28     314             31.24   
6    7         Latino   8397            40.37     326             31.38   
7    8         Latino   9090            40.52     337             31.09   
8    9         Latino   9701            41.03     364             31.11   
9   10         Latino  10385            42.05     406             32.04   

   percent_ca_population                 date  
0                   38.9  2020-04-13T00:00:00  
1                   38.9  2020-04-14T00:00:00  
2                   38.9  2020