In [1]:
import pandas as pd
import numpy as np
import os
import sqlite3
import datetime

#//*** display and HTML are part of IPython's public IPython.display module.
#//*** Importing them from IPython.core.display is deprecated (since IPython 7.14)
#//*** and newer IPython releases may not provide them there at all.
from IPython.display import display, HTML

#//*** Inject a CSS rule so the notebook's .container element uses the full page width
display(HTML("<style>.container { width:100% !important; }</style>"))

#//*** Width, in characters, that pandas may use when printing a DataFrame.
#//*** 80 is the normal output width; 1000 keeps wide frames on a single line.
output_width = 1000
pd.set_option("display.width", output_width)

#//*** Print at most 100 rows, but never hide columns or truncate cell contents
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)


Work with the Data for Term Project

Data is downloaded using DSC540 StoneburnerKurt TermProject _ Load_PreProcess_Data,
which I suppose should be added as an external library if this were a production project.

Code is separated to keep the project more manageable.

In [2]:
#//*** Load all data from a database
#//*** Creates covid_ethnic_df, covid_cases_df, pop_attrib_df and covid_project_df,
#//*** one DataFrame per table of the same name.
db_filename = 'covid_data.sqldb'

#//*** Verify database instance exists
if not os.path.exists(db_filename):
    #//*** Warn if the database does not exist. None of the DataFrames are created in that case,
    #//*** so any later cell that uses them will stop with a NameError.
    print(f"Trouble Connecting to database: {db_filename}")
else:
    #//*** Start a database connection
    con = sqlite3.connect(db_filename)

    try:
        #//*** covid_ethnic_df is read as stored; its saved 'index' column stays a regular column
        covid_ethnic_df = pd.read_sql("SELECT * FROM covid_ethnic_df", con=con)

        #//*** The remaining tables get their saved 'index' column restored as the DataFrame index
        covid_cases_df = pd.read_sql("SELECT * FROM covid_cases_df", con=con).set_index('index')
        pop_attrib_df = pd.read_sql("SELECT * FROM pop_attrib_df", con=con).set_index('index')
        covid_project_df = pd.read_sql("SELECT * FROM covid_project_df", con=con).set_index('index')

        #//*** Parse the project dates into datetime values
        covid_project_df['date'] = pd.to_datetime(covid_project_df['date'])
    finally:
        #//*** Always release the connection, even when one of the reads fails
        con.close()


In [3]:
#//*** Set the index column as the index.
#//*** set_index() returns a new DataFrame, so assigning its result to the loop variable
#//*** would leave the original frames untouched. inplace=True changes each frame itself.
#//*** Frames without an 'index' column (already indexed) are skipped.
for df in [covid_ethnic_df, covid_cases_df, pop_attrib_df, covid_project_df]:
    if 'index' in df.columns:
        df.set_index('index', inplace=True)

In [4]:
#//*************************************
#//*** Clean up the loaded dataframes
#//*************************************

#//*** Columns to remove if they are present. They are leftovers from importing CSVs
#//*** (the frames used to be read from z_*_df.csv files), so a simple delete list is used.
del_cols = ['Unnamed: 0', '_id']

#//***********************************************************************************
#//*** Remove excess columns
#//*** Each frame is checked on its own because not every frame carries every column
#//***********************************************************************************
for frame in (covid_cases_df, covid_ethnic_df, pop_attrib_df, covid_project_df):
    extra_cols = [col for col in del_cols if col in frame.columns]
    if extra_cols:
        frame.drop(columns=extra_cols, inplace=True)

print(covid_cases_df.head())

#//*** Drop the counties labeled as unassigned.
#//*** We don't have data on those individuals
print(f"Length Before removing Unassigned County: {len(covid_cases_df)}")
is_unassigned = covid_cases_df['county'].isin(['Unassigned'])
covid_cases_df = covid_cases_df[~is_unassigned]
print(f"Length After removing Unassigned County: {len(covid_cases_df)}")

#//*** Remove the 'Out Of Country' listings
print(f"Length Before removing Out Of Country County: {len(covid_cases_df)}")
is_out_of_country = covid_cases_df['county'].isin(['Out Of Country'])
covid_cases_df = covid_cases_df[~is_out_of_country]
print(f"Length After removing Out Of Country County: {len(covid_cases_df)}")

#//*** Drop the rows with Other listed as ethnicity. This represents less than .1 % of the population
#//*** and doesn't have an equivalent in the federal data.
print(f"Length Before removing Other Race: {len(covid_ethnic_df)}")
is_other_race = covid_ethnic_df['race_ethnicity'].isin(['Other'])
covid_ethnic_df = covid_ethnic_df[~is_other_race]
print(f"Length After removing Other Race: {len(covid_ethnic_df)}")



            county  totalcountconfirmed  totalcountdeaths  newcountconfirmed  newcountdeaths                 date
index                                                                                                            
0      Santa Clara                151.0               6.0                151               6  2020-03-18T00:00:00
14978   Sacramento                 46.0               3.0                 46               3  2020-03-18T00:00:00
14642  Los Angeles                 47.0               0.0                 47               0  2020-03-18T00:00:00
14308         Napa                  0.0               0.0                  0               0  2020-03-18T00:00:00
19651         Yolo                  4.0               0.0                  4               0  2020-03-18T00:00:00
Length Before removing Unassigned County: 19985
Length After removing Unassigned County: 19651
Length Before removing Out Of Country County: 19651
Length After removing Out Of Country County: 19342
Leng

In [5]:
#//*** List the distinct race/ethnicity labels so spelling variants of the same group can be spotted
race_labels = covid_ethnic_df['race_ethnicity'].unique()
print(race_labels)

['Latino' 'Native Hawaiian or Pacific Islander'
 'American Indian or Alaska Native' 'Asian' 'White' 'Black' 'Multiracial'
 'Native Hawaiian and other Pacific Islander' 'Multi-Race']


In [6]:
#//*** Collapse spelling variants of the same group into one short label each.
#//*** Both Pacific Islander / Hawaiian spellings become 'Hawaiian'. The author flagged that label as
#//*** non-inclusive and a candidate for renaming once everything works.
#//*** The replacements are applied in the order listed.
race_label_fixes = [
    ('Native Hawaiian or Pacific Islander', 'Hawaiian'),
    ('Native Hawaiian and other Pacific Islander', 'Hawaiian'),
    ('Multi-Race', 'Multiracial'),
    ('American Indian or Alaska Native', 'Native'),
]

for old_label, new_label in race_label_fixes:
    covid_ethnic_df['race_ethnicity'] = covid_ethnic_df['race_ethnicity'].str.replace(old_label, new_label)

In [7]:
#//*** Render the county population attributes as an HTML table instead of a plain-text print.
#//*** The HTML object is the last expression of the cell, so the notebook displays it.
pop_attrib_html = pop_attrib_df.to_html()
HTML(pop_attrib_html)

Unnamed: 0_level_0,cty_fibs,county,population,Latino,White,Asian,Black,American Indian or Alaska Native,Hawaiian,Multiracial,0rf_num,0rf_rate,0rf_err,1-2rf_num,1-2rf_rate,1-2rf_err,3plrf_num,3plrf_rate,3plrf_err
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,1,Alameda,1671329,373055,512134,529698,169954,4157,13474,68857,52135,3.16,1.99,1159312,70.37,6.98,436060,26.47,6.84
1,3,Alpine,1129,139,692,18,4,243,0,33,241,21.89,8.4,579,52.59,10.35,281,25.52,9.31
2,5,Amador,39752,5753,30742,575,994,606,83,999,7169,20.35,5.97,17817,50.59,7.16,10235,29.06,6.49
3,7,Butte,219186,37731,155415,10573,3526,3390,465,8086,44433,19.46,5.32,122804,53.79,6.88,61078,26.75,6.02
4,9,Calaveras,45905,5967,36672,719,420,562,98,1467,9940,21.93,6.38,22900,50.53,7.46,12478,27.53,6.6
5,11,Colusa,21547,13018,7344,266,220,293,73,333,4223,19.65,5.75,12343,57.45,7.1,4920,22.9,6.19
6,13,Contra Costa,1153526,300420,492393,204045,100798,3126,5379,47365,150886,13.16,3.91,710710,61.99,6.99,284932,24.85,6.49
7,15,Del Norte,27812,5596,17236,802,917,2059,42,1160,5189,20.77,6.47,13874,55.54,7.66,5915,23.68,6.47
8,17,El Dorado,192843,25378,148903,8974,1696,1500,328,6064,55253,29.06,6.34,94483,49.69,7.03,40421,21.26,5.76
9,19,Fresno,999101,537180,286049,103430,46274,5967,1437,18764,68407,6.95,3.13,612805,62.24,7.1,303388,30.81,6.79


In [8]:

#//*** Combine covid_project State numbers with covid_cases_df. This gives us State wide numbers to reference.
#//*** Result: covid_cases_df gains a 'state_confirmed' column holding, for every county row,
#//*** the covid_project_df 'confirmed' value of the same date.

#//*** Sort both dataframes by date
covid_cases_df = covid_cases_df.sort_values(by='date')
covid_project_df = covid_project_df.sort_values(by='date')

#//********************************************************
#//*** Trim COVID project dates to match covid_cases
#//********************************************************
#//*** The first date of covid_cases_df is the target for the start of the slice
cases_start_date = covid_cases_df['date'].iloc[0]

#//*** Position of the first covid_project_df row on that date; everything before it is dropped
first_project_row = covid_project_df[covid_project_df['date'] == cases_start_date].iloc[0]
project_start_iloc = covid_project_df.index.get_loc(first_project_row.name)
covid_project_df = covid_project_df[project_start_iloc:]

#//*** Synchronize the end date: keep one covid_project_df row per distinct covid_cases_df date.
#//*** If COVID Project is longer it is trimmed to size.
#//*** If Cases are longer, code to trim the other way would be needed.
last_iloc = covid_cases_df['date'].nunique()
covid_project_df = covid_project_df[:last_iloc]

#//*** Sanity check: after trimming there should be exactly one project row per case date.
#//*** False means covid_project_df ran out of rows before the case dates did.
print(f"These Should be equal: {len(covid_project_df) == last_iloc}")

#//*** Look up the state total once per date (not once per county row)
state_confirmed_by_date = {}
for date in covid_cases_df['date'].unique():
    date_matches = covid_project_df.loc[covid_project_df['date'] == date, 'confirmed']
    if date_matches.empty:
        #//*** IndexError is what a bare .iloc[0] on the empty selection would raise
        raise IndexError(f"covid_project_df has no row for date {date}")
    state_confirmed_by_date[date] = date_matches.iloc[0]

#//*** Map every county row to its date's state total. Matching is by date value,
#//*** so the result does not depend on the row order of covid_cases_df.
state_confirmed = covid_cases_df['date'].map(state_confirmed_by_date).tolist()

print(f"These Should be Equal: {len(state_confirmed)} {len(covid_cases_df)}")

#//*** Add the State Totals to the County Values
covid_cases_df['state_confirmed'] = state_confirmed


These Should be equal: True
These Should be Equal: 19342 19342


In [9]:
#//*** Print covid_cases_df to inspect it with the state_confirmed column in place.
#//*** pandas abbreviates the printout of long frames (display.max_rows is set to 100 in the first cell).
print(covid_cases_df)

                county  totalcountconfirmed  totalcountdeaths  newcountconfirmed  newcountdeaths                 date  state_confirmed
index                                                                                                                                 
0          Santa Clara                151.0               6.0                151               6  2020-03-18T00:00:00              611
18983           Amador                  0.0               0.0                  0               0  2020-03-18T00:00:00              611
1004          Tuolumne                  0.0               0.0                  0               0  2020-03-18T00:00:00              611
18649           Plumas                  0.0               0.0                  0               0  2020-03-18T00:00:00              611
2005   San Luis Obispo                  5.0               0.0                  5               0  2020-03-18T00:00:00              611
...                ...                  ...            

In [10]:
#//*** Sort both time series by date
covid_cases_df = covid_cases_df.sort_values(by='date')
covid_ethnic_df = covid_ethnic_df.sort_values(by='date')

#//*** Get the first covid_ethnic_df date
ethnic_start_date = covid_ethnic_df['date'].iloc[0]
print(f"Ethinic State: {ethnic_start_date}")

#//*************************************************************************************************
#//*** Find the position (iloc) of the first covid_cases_df row that falls on the ethnic start date
#//*** 1. Select the rows whose date matches the ethnic start date and take the first one
#//*** 2. Its name is the index label of that row
#//*** 3. get_loc turns the label into a position, which is where covid_cases_df gets sliced
#//*************************************************************************************************
first_case_row = covid_cases_df[covid_cases_df['date'] == ethnic_start_date].iloc[0]
covid_start_iloc = covid_cases_df.index.get_loc(first_case_row.name)

#//*** Drop the county rows that come before the ethnic start date
covid_cases_df = covid_cases_df.iloc[covid_start_iloc:]

#//*** Build the big table (bt_df):
#//***   ethnic rows  <- county rows of the same date  <- population attributes of the county
#//*** Both are left joins, so every covid_ethnic_df row is kept.
bt_df = (
    covid_ethnic_df
    .merge(covid_cases_df, how='left', on='date')
    .merge(pop_attrib_df, how='left', on='county')
)

#//*** A leftover 'index' column carries no information in the merged table
if 'index' in bt_df.columns:
    bt_df = bt_df.drop(columns=['index'])

print(f"Total Rows: {len(bt_df)}")

#for group in bt_df.groupby('date'):
#    print(group[1])
#    break

Ethinic State: 2020-04-13T00:00:00
Total Rows: 125048


In [11]:
#//*** Rename Columns
#//*** NOTE(review): bt_df already has a 'deaths' column before this rename, so mapping
#//*** totalcountdeaths to 'deaths' leaves two columns named 'deaths' (both show up in the
#//*** printed column list). bt_df['deaths'] then selects both. Consider a distinct name for the
#//*** county total after checking every cell that reads these columns.
bt_rename_pairs = [
    ("race_ethnicity", "race"),
    ("case_percentage", "case_percent"),
    ("death_percentage", "death_percent"),
    ("percent_ca_population", "percent_ca_pop"),
    ("totalcountconfirmed", "confirmed"),
    ("totalcountdeaths", "deaths"),
    ("newcountconfirmed", "newconfirm"),
    ("newcountdeaths", "newdeath"),
    ("population", "pop"),
    ("American Indian or Alaska Native", "Native"),
]
g = {"bt_rename": dict(bt_rename_pairs)}

bt_df = bt_df.rename(columns=g["bt_rename"])

cols = bt_df.columns.tolist()

#//*** Verify values in the race column match the attribute columns.
#//*** Double checking the renaming work: every race label should also be a column name.
print(cols)
for race in bt_df['race'].unique():
    has_matching_column = race in cols
    print(f"{race} {has_matching_column}")

#//*** After much retcon the columns and data match



['race', 'cases', 'case_percent', 'deaths', 'death_percent', 'percent_ca_pop', 'date', 'county', 'confirmed', 'deaths', 'newconfirm', 'newdeath', 'state_confirmed', 'cty_fibs', 'pop', 'Latino', 'White', 'Asian', 'Black', 'Native', 'Hawaiian', 'Multiracial', '0rf_num', '0rf_rate', '0rf_err', '1-2rf_num', '1-2rf_rate', '1-2rf_err', '3plrf_num', '3plrf_rate', '3plrf_err']
Latino True
Hawaiian True
Native True
Asian True
White True
Black True
Multiracial True


In [12]:
#//*** Reduce Excess Columns.
#//*** Each daily entry is by race and county, so only one race value is applicable per row.
#//*** Two columns are derived from the population column named by the row's 'race' value:
#//***   pop_race        - county population of the row's race
#//***   percent_cty_pop - pop_race divided by the county's total population ('pop')

#//*** Every race label needs a population column of the same name. Stop with a clear error
#//*** instead of leaving partly filled columns behind (this also catches re-running the cell
#//*** after the race columns have already been removed).
race_names = list(bt_df['race'].unique())
missing_race_cols = [race for race in race_names if race not in bt_df.columns]
if missing_race_cols:
    raise KeyError(f"bt_df has no population column for race value(s): {missing_race_cols}")

#//*** Vectorized per-row lookup:
#//*** race_pop_matrix holds one column per race, race_col_pos says which of those columns applies
#//*** to each row, and the paired index picks exactly one value per row.
race_pop_matrix = bt_df[race_names].to_numpy()
race_col_pos = pd.Index(race_names).get_indexer(bt_df['race'])
cty_ethnic_pop = race_pop_matrix[np.arange(len(bt_df)), race_col_pos]

#//*** Share of the county population that belongs to the row's race
cty_ethnic_percent = cty_ethnic_pop / bt_df['pop'].to_numpy()

#//*** Store the results in the Latino and White columns so the new values keep
#//*** those columns' place in the column order; they are renamed below.
bt_df['Latino'] = cty_ethnic_pop
bt_df['White'] = cty_ethnic_percent

#//*** Remove excess columns
excess_cols = ['cty_fibs', 'Asian', 'Black', 'Native', 'Hawaiian', 'Multiracial']
bt_df = bt_df.drop(columns=[col for col in excess_cols if col in bt_df.columns])

#//*** Rename Latino to pop_race and White to percent_cty_pop
bt_df = bt_df.rename(columns={'Latino': 'pop_race', 'White': 'percent_cty_pop'})



In [13]:
#//*** Total row count, followed by the first five rows of every date
#//*** to get a feel for the total data
print(len(bt_df))
for _, day_rows in bt_df.groupby('date'):
    print(day_rows.head(5))

    


125048
     race  cases  case_percent  deaths  death_percent  percent_ca_pop                 date       county  confirmed  deaths  newconfirm  newdeath  state_confirmed       pop  pop_race  percent_cty_pop  0rf_num  0rf_rate  0rf_err  1-2rf_num  1-2rf_rate  1-2rf_err  3plrf_num  3plrf_rate  3plrf_err
0  Latino   5276         35.99     170          28.38            38.9  2020-04-13T00:00:00      Ventura      311.0     9.0           5         0            22348    846006    365835         0.432426    79671      9.42     3.35     535084       63.26       7.11     231114       27.32       6.76
1  Latino   5276         35.99     170          28.38            38.9  2020-04-13T00:00:00  Los Angeles     9367.0   320.0         234        24            22348  10039107   4881970         0.486295   294048      2.94     2.00    6698377       66.87       7.15    3024086       30.19       7.03
2  Latino   5276         35.99     170          28.38            38.9  2020-04-13T00:00:00         Napa     

             race  cases  case_percent  deaths  death_percent  percent_ca_pop                 date         county  confirmed  deaths  newconfirm  newdeath  state_confirmed      pop  pop_race  percent_cty_pop  0rf_num  0rf_rate  0rf_err  1-2rf_num  1-2rf_rate  1-2rf_err  3plrf_num  3plrf_rate  3plrf_err
8932  Multiracial    339           0.9       9            0.4             2.2  2020-05-05T00:00:00      Riverside     4309.0   190.0          88         6            56212  2470546     58137         0.023532   284387     11.71     4.28    1508350       62.10       7.16     636217       26.19       6.58
8933  Multiracial    339           0.9       9            0.4             2.2  2020-05-05T00:00:00         Shasta       31.0     4.0           0         0            56212   180080      6770         0.037594    47610     26.62     6.43      89878       50.24       7.04      41394       23.14       5.74
8934  Multiracial    339           0.9       9            0.4             2.2  2020-05-0

19492  Black   4172           5.2     399           10.0             6.0  2020-05-31T00:00:00           Napa      114.0     3.0           1         0           110583  137744      2859         0.020756    15295     11.08     3.81      81947       59.35       7.36      40841       29.58       7.04
        race  cases  case_percent  deaths  death_percent  percent_ca_pop                 date         county  confirmed  deaths  newconfirm  newdeath  state_confirmed      pop  pop_race  percent_cty_pop  0rf_num  0rf_rate  0rf_err  1-2rf_num  1-2rf_rate  1-2rf_err  3plrf_num  3plrf_rate  3plrf_err
19894  Black   4218           5.2     408           10.0             6.0  2020-06-01T00:00:00           Yuba       26.0     1.0           0         0           113006    78668      2841         0.036114    12440     16.02     5.10      46124       59.39       7.42      19094       24.59       6.50
19895  Black   4218           5.2     408           10.0             6.0  2020-06-01T00:00:00           

30454  Latino  80875          56.6    2348           41.1            38.9  2020-06-27T00:00:00     Santa Cruz      350.0     3.0           4         0           206433  273213     92922         0.340108    33624     12.74     3.78     160164       60.68       7.33      70169       26.58       6.90
        race  cases  case_percent  deaths  death_percent  percent_ca_pop                 date     county  confirmed  deaths  newconfirm  newdeath  state_confirmed     pop  pop_race  percent_cty_pop  0rf_num  0rf_rate  0rf_err  1-2rf_num  1-2rf_rate  1-2rf_err  3plrf_num  3plrf_rate  3plrf_err
30856  White  24775          16.9    1857           32.3            36.6  2020-06-28T00:00:00     Nevada      103.0     1.0           1         0           211243   99755     84530         0.847376    25819     25.98     6.04      48361       48.67       6.95      25184       25.35       6.11
30857  White  24775          16.9    1857           32.3            36.6  2020-06-28T00:00:00    Ventura     2747

         race  cases  case_percent  deaths  death_percent  percent_ca_pop                 date      county  confirmed  deaths  newconfirm  newdeath  state_confirmed      pop  pop_race  percent_cty_pop  0rf_num  0rf_rate  0rf_err  1-2rf_num  1-2rf_rate  1-2rf_err  3plrf_num  3plrf_rate  3plrf_err
41818  Native    662           0.2      31            0.4             0.5  2020-07-25T00:00:00      Orange    33978.0   562.0         448         6           445400  3175692      6216         0.001957   140901      4.45     2.41    2100896       66.41       6.90     921576       29.13       6.72
41819  Native    662           0.2      31            0.4             0.5  2020-07-25T00:00:00       Modoc        0.0     0.0           0         0           445400     8841       259         0.029295     2204     25.46     6.76       4222       48.78       7.95       2230       25.76       7.06
41820  Native    662           0.2      31            0.4             0.5  2020-07-25T00:00:00  Santa Cruz   

54408  Latino  266430          59.5    5786           48.2            38.9  2020-08-25T00:00:00       Sierra        6.0     0.0           0         0           673095     3005       381         0.126789      592     19.82     8.62       1418       47.47      10.31        977       32.71       9.45
        race  cases  case_percent  deaths  death_percent  percent_ca_pop                 date      county  confirmed  deaths  newconfirm  newdeath  state_confirmed     pop  pop_race  percent_cty_pop  0rf_num  0rf_rate  0rf_err  1-2rf_num  1-2rf_rate  1-2rf_err  3plrf_num  3plrf_rate  3plrf_err
54810  Black  19321           4.3     951            7.8             6.0  2020-08-26T00:00:00      Solano     5131.0    46.0          17         0           679099  447643     61264         0.136859    46521     10.61     3.88     281603       64.25       7.21     110154       25.13       6.75
54811  Black  19321           4.3     951            7.8             6.0  2020-08-26T00:00:00        Napa     1

66182  Latino  335051          61.1    7282           48.5            38.9  2020-09-23T00:00:00          Modoc       24.0     0.0           0         0           787470     8841      1291         0.146024     2204     25.46     6.76       4222       48.78       7.95       2230       25.76       7.06
        race  cases  case_percent  deaths  death_percent  percent_ca_pop                 date           county  confirmed  deaths  newconfirm  newdeath  state_confirmed     pop  pop_race  percent_cty_pop  0rf_num  0rf_rate  0rf_err  1-2rf_num  1-2rf_rate  1-2rf_err  3plrf_num  3plrf_rate  3plrf_err
66584  Asian  30456           5.5    1765           11.7            15.4  2020-09-24T00:00:00  San Luis Obispo     3480.0    29.0          33         0           790640  283111     10259         0.036237    44264     16.33     4.84     154658       57.07       7.12      72085       26.60       6.49
66585  Asian  30456           5.5    1765           11.7            15.4  2020-09-24T00:00:00      

78362  Latino  385597          61.1    8296           48.6            38.9  2020-10-23T00:00:00  Trinity       26.0     0.0           0         0           886865   12285       907         0.073830     2279     18.41     6.88       6164       49.78       8.03       3939       31.81       7.12
         race   cases  case_percent  deaths  death_percent  percent_ca_pop                 date       county  confirmed  deaths  newconfirm  newdeath  state_confirmed      pop  pop_race  percent_cty_pop  0rf_num  0rf_rate  0rf_err  1-2rf_num  1-2rf_rate  1-2rf_err  3plrf_num  3plrf_rate  3plrf_err
78764  Latino  387923          61.1    8324           48.6            38.9  2020-10-24T00:00:00  Santa Clara    23978.0   388.0         156         0           892810  1927852    482298         0.250174    74471      3.88     2.11    1373320       71.51       6.70     472797       24.62       6.55
78765  Latino  387923          61.1    8324           48.6            38.9  2020-10-24T00:00:00         Yolo

90136  Black  32998           4.2    1369            7.4             6.0  2020-11-21T00:00:00     Monterey    13886.0   113.0         148         0          1087714  434061     10919         0.025155    31346      7.42     3.24     269307       63.71       7.29     122057       28.87       6.98
        race  cases  case_percent  deaths  death_percent  percent_ca_pop                 date  county  confirmed  deaths  newconfirm  newdeath  state_confirmed     pop  pop_race  percent_cty_pop  0rf_num  0rf_rate  0rf_err  1-2rf_num  1-2rf_rate  1-2rf_err  3plrf_num  3plrf_rate  3plrf_err
90538  Black  33234           4.2    1370            7.4             6.0  2020-11-22T00:00:00    Yuba     1672.0    10.0          29         0          1102033   78668      2841         0.036114    12440     16.02     5.10      46124       59.39       7.42      19094       24.59       6.50
90539  Black  33234           4.2    1370            7.4             6.0  2020-11-22T00:00:00  Madera     5745.0    80.0  

            race  cases  case_percent  deaths  death_percent  percent_ca_pop                 date       county  confirmed  deaths  newconfirm  newdeath  state_confirmed      pop  pop_race  percent_cty_pop  0rf_num  0rf_rate  0rf_err  1-2rf_num  1-2rf_rate  1-2rf_err  3plrf_num  3plrf_rate  3plrf_err
101906  Hawaiian   7466           0.6     115            0.5             0.3  2020-12-20T00:00:00  San Joaquin    38268.0   561.0         928         0          1854456   762148      4641         0.006089    34459      4.63     2.37     489260       65.78       7.08     220112       29.59       6.89
101907  Hawaiian   7466           0.6     115            0.5             0.3  2020-12-20T00:00:00         Yolo     7121.0   104.0          96         0          1854456   220500       928         0.004209    16149      7.64     3.39     139576       66.00       7.09      55765       26.37       6.68
101908  Hawaiian   7466           0.6     115            0.5             0.3  2020-12-20T00:00:00

113278  Latino  1222272          54.9   15018           46.2            38.9  2021-01-17T00:00:00   Calaveras     1608.0    23.0          12         0          2942475    45905      5967         0.129986     9940     21.93     6.38      22900       50.53       7.46      12478       27.53       6.60
         race  cases  case_percent  deaths  death_percent  percent_ca_pop                 date       county  confirmed  deaths  newconfirm  newdeath  state_confirmed      pop  pop_race  percent_cty_pop  0rf_num  0rf_rate  0rf_err  1-2rf_num  1-2rf_rate  1-2rf_err  3plrf_num  3plrf_rate  3plrf_err
113680  Black  89707           4.0    2173            6.6             6.0  2021-01-18T00:00:00    Mendocino     2919.0    32.0           7         0          2973174    86749       708         0.008161    16030     18.43     5.23      44837       51.55       7.10      26110       30.02       6.42
113681  Black  89707           4.0    2173            6.6             6.0  2021-01-18T00:00:00       Tul

124646  Multiracial  38562           1.5     569            1.2             2.2  2021-02-14T00:00:00  Merced    27601.0   375.0          60         0          3399878  277680      5051         0.018190    26224      9.72     4.19     170334       63.15       7.32      73172       27.13       6.75


In [14]:
#//*** Disabled experiment: the code below sits inside a string literal, so none of it executes.
#//*** Being the only expression in the cell, the string itself is echoed as the cell output.
#//*** NOTE(review): as written, the snippet reads hier_df.columns before hier_df is assigned,
#//*** so the statements would need reordering before the code could be re-enabled.
"""

cols = list(hier_df.columns)

for x in range(0,len(cols)):
    print(x)

hier_df = bt_df.set_index([bt_df['date'],bt_df['county']])
print(hier_df)

#HTML(hier_df.to_html())
"""

"\n\ncols = list(hier_df.columns)\n\nfor x in range(0,len(cols)):\n    print(x)\n\nhier_df = bt_df.set_index([bt_df['date'],bt_df['county']])\nprint(hier_df)\n\n#HTML(hier_df.to_html())\n"

In [15]:
#for col in covid_project_df:
#    print(f'{covid_project_df[col].str.replace("[","").str.replace("]","")}')

In [16]:
#//*** Distinct race labels and the total row count of the big table
race_list = bt_df['race'].unique()
print(race_list)
print(len(bt_df))

#//*** Race dict: maps each race label to the bt_df rows of that race
rd = {race: bt_df[bt_df['race'] == race] for race in race_list}

#//*** Show the first ten rows of every per-race table, in race_list order
for race_rows in rd.values():
    print(race_rows.head(10))


['Latino' 'Hawaiian' 'Native' 'Asian' 'White' 'Black' 'Multiracial']
125048
     race  cases  case_percent  deaths  death_percent  percent_ca_pop                 date         county  confirmed  deaths  newconfirm  newdeath  state_confirmed       pop  pop_race  percent_cty_pop  0rf_num  0rf_rate  0rf_err  1-2rf_num  1-2rf_rate  1-2rf_err  3plrf_num  3plrf_rate  3plrf_err
0  Latino   5276         35.99     170          28.38            38.9  2020-04-13T00:00:00        Ventura      311.0     9.0           5         0            22348    846006    365835         0.432426    79671      9.42     3.35     535084       63.26       7.11     231114       27.32       6.76
1  Latino   5276         35.99     170          28.38            38.9  2020-04-13T00:00:00    Los Angeles     9367.0   320.0         234        24            22348  10039107   4881970         0.486295   294048      2.94     2.00    6698377       66.87       7.15    3024086       30.19       7.03
2  Latino   5276         35.99   

In [17]:
#//*** Let's run some correlations for funsies
#//*** Each population column of pop_attrib_df is correlated with each of the three rf rate columns.
#//*** NOTE(review): 'Multiracial' is the one population column not listed here - confirm that is intended.
pop_cols = ['population', 'Latino','White','Asian','Black','American Indian or Alaska Native', 'Hawaiian' ]

#//*** Only the rate columns are used; the matching count columns are '0rf_num', '1-2rf_num', '3plrf_num'
rf_cols = ['0rf_rate', '1-2rf_rate',  '3plrf_rate']

for pop_col in pop_cols:
    for rf_col in rf_cols:
        #//*** 2x2 correlation matrix of the pair; the off-diagonal entry is their correlation
        pair_corr = pop_attrib_df[[pop_col, rf_col]].corr()
        print(f"{pop_col} {rf_col} - {pair_corr.iloc[0, 1] }")

population 0rf_rate - -0.4567961826407012
population 1-2rf_rate - 0.45180422307614304
population 3plrf_rate - 0.1425255103467081
Latino 0rf_rate - -0.40905527183588614
Latino 1-2rf_rate - 0.3837575774517347
Latino 3plrf_rate - 0.1806483757865183
White 0rf_rate - -0.464199120421134
White 1-2rf_rate - 0.4797158976341499
White 3plrf_rate - 0.09234516802071475
Asian 0rf_rate - -0.5087228664495507
Asian 1-2rf_rate - 0.5231641998293637
Asian 3plrf_rate - 0.1079399281808924
Black 0rf_rate - -0.38279510407369116
Black 1-2rf_rate - 0.3729504695343319
Black 3plrf_rate - 0.13387953801240074
American Indian or Alaska Native 0rf_rate - -0.3905801458839876
American Indian or Alaska Native 1-2rf_rate - 0.3961581308641285
American Indian or Alaska Native 3plrf_rate - 0.09669106495836434
Hawaiian 0rf_rate - -0.5532497192513318
Hawaiian 1-2rf_rate - 0.5874851204300123
Hawaiian 3plrf_rate - 0.0700536245231904
