In [95]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### New York State Trends

In [104]:
#pull in the comprehensive csv
covid_df = pd.read_csv('Team_work/data/covid_comperhensive.csv')
#Select new york state (StateFIPS: 36) as the state to analyze
covid_df_ny = covid_df[covid_df['StateFIPS'] == 36]
covid_df_ny

Unnamed: 0,countyFIPS,County Name,State,StateFIPS,2023-01-10_confirmed,2023-01-11_confirmed,2023-01-12_confirmed,2023-01-13_confirmed,2023-01-14_confirmed,2023-01-15_confirmed,2023-01-16_confirmed,2023-01-10_death,2023-01-11_death,2023-01-12_death,2023-01-13_death,2023-01-14_death,2023-01-15_death,2023-01-16_death,population
1632,0,Statewide Unallocated,NY,36,0,0,0,0,0,0,0,546,546,548,548,548,548,548,0
1633,0,Statewide Unallocated,NY,36,0,0,0,0,0,0,0,546,546,548,548,548,548,548,0
1634,0,Statewide Unallocated,NY,36,0,0,0,0,0,0,0,546,546,548,548,548,548,548,0
1635,0,Statewide Unallocated,NY,36,0,0,0,0,0,0,0,546,546,548,548,548,548,548,0
1636,0,Statewide Unallocated,NY,36,0,0,0,0,0,0,0,546,546,548,548,548,548,548,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4486,36115,Washington County,NY,36,15439,15492,15492,15492,15492,15492,15492,136,136,136,136,136,136,136,61204
4487,36117,Wayne County,NY,36,21502,21581,21581,21581,21581,21581,21581,202,202,202,202,202,202,202,89918
4488,36119,Westchester County,NY,36,328129,329596,329596,329596,329596,329596,329596,2940,2940,2947,2947,2947,2947,2947,967506
4489,36121,Wyoming County,NY,36,9802,9817,9817,9817,9817,9817,9817,90,90,90,90,90,90,90,39859


In [105]:
#group the data together in one row by summing the similarly named columns together
covid_df_ny = covid_df_ny.groupby('State').sum()
covid_df_ny

Unnamed: 0_level_0,countyFIPS,StateFIPS,2023-01-10_confirmed,2023-01-11_confirmed,2023-01-12_confirmed,2023-01-13_confirmed,2023-01-14_confirmed,2023-01-15_confirmed,2023-01-16_confirmed,2023-01-10_death,2023-01-11_death,2023-01-12_death,2023-01-13_death,2023-01-14_death,2023-01-15_death,2023-01-16_death,population
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
NY,2235844,4068,6473463,6499315,6499315,6499315,6499315,6499315,6499315,102387,102387,102770,102770,102770,102770,102770,19453561


In [106]:
#print only the confirmed cases column
covid_df_ny.iloc[:, 2:9].T

State,NY
2023-01-10_confirmed,6473463
2023-01-11_confirmed,6499315
2023-01-12_confirmed,6499315
2023-01-13_confirmed,6499315
2023-01-14_confirmed,6499315
2023-01-15_confirmed,6499315
2023-01-16_confirmed,6499315


#### Observation

The confirmed number of cases in NY state have increased over the last week of data.

In [107]:
#print only the deaths column
covid_df_ny.iloc[:, 9:16].T

State,NY
2023-01-10_death,102387
2023-01-11_death,102387
2023-01-12_death,102770
2023-01-13_death,102770
2023-01-14_death,102770
2023-01-15_death,102770
2023-01-16_death,102770


#### Observations

Likewise, we can see that the number of deaths are also increasing in the last week of data.

Something that should be noted between the two sets of data, is that there are very jarring jumps between dates, and that they remain static for some time. This is because, according to NY government, covid confirmation and death data is on a weekly data update schedule. The data updates every thursday, and contributes to why we see rapid jumps between cases and deaths.

This information can be found on www.nyc.gov by following this [link](https://www.nyc.gov/site/doh/covid/covid-19-data.page)


# Adding Enrichment Dataset

The enrichment dataset described here can be found at this [link](https://data.census.gov/table?q=dp&tid=ACSDP1Y2018.DP05)

Here we are looking to see if race demographics are related in any way to our main COVID-19 datasets. We also want to drop any irrelevant columns.

In [100]:
race_df = pd.read_csv('Team_work/data/race_demographics.csv')
for col_name in race_df.columns:
    if col_name.endswith('Estimate') or col_name.endswith('Margin of Error'):
        race_df = race_df.drop(col_name, axis=1)
for col_name in race_df.columns:
    if col_name.endswith('Percent'):
        race_df.rename(columns = {col_name : col_name[:-9]}, inplace=True)
race_df

Unnamed: 0,Label,Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,District of Columbia,...,Tennessee,Texas,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming,Puerto Rico
0,White,67.7%,64.4%,78.0%,76.5%,59.5%,84.1%,75.2%,68.2%,42.2%,...,77.3%,73.5%,85.7%,94.1%,67.4%,74.8%,93.0%,85.3%,91.5%,65.1%
1,Black or African American,26.7%,3.4%,4.7%,15.2%,5.8%,4.2%,11.0%,22.5%,45.5%,...,16.8%,12.3%,1.3%,1.2%,19.2%,3.9%,3.8%,6.4%,0.6%,12.5%
2,American Indian and Alaska Native,0.5%,15.1%,4.6%,0.7%,0.8%,1.0%,0.3%,0.5%,0.3%,...,0.3%,0.5%,1.1%,0.3%,0.3%,1.3%,0.1%,0.9%,2.8%,0.1%
3,Cherokee tribal grouping,N,0.1%,0.1%,N,0.0%,0.1%,N,N,N,...,0.1%,0.1%,0.0%,N,0.1%,0.0%,N,0.0%,0.1%,N
4,Chippewa tribal grouping,N,0.1%,0.0%,N,0.0%,0.0%,N,N,N,...,0.0%,0.0%,0.0%,N,0.0%,0.0%,N,0.3%,0.0%,N
5,Navajo tribal grouping,N,0.0%,2.1%,N,0.0%,0.1%,N,N,N,...,0.0%,0.0%,0.5%,N,0.0%,0.0%,N,0.0%,0.2%,N
6,Sioux tribal grouping,N,0.1%,0.0%,N,0.0%,0.1%,N,N,N,...,0.0%,0.0%,0.1%,N,0.0%,0.0%,N,0.0%,0.2%,N
7,Asian,1.3%,6.3%,3.3%,1.6%,14.7%,3.2%,4.6%,4.0%,3.9%,...,1.8%,5.0%,2.4%,1.9%,6.5%,8.8%,0.7%,2.8%,1.0%,0.2%
8,Asian Indian,0.3%,0.2%,0.8%,0.3%,2.1%,0.6%,2.0%,1.6%,0.9%,...,0.4%,1.6%,0.4%,0.1%,1.8%,1.7%,0.1%,0.6%,0.1%,N
9,Chinese,0.2%,0.4%,0.7%,0.2%,4.1%,0.6%,1.0%,0.9%,1.2%,...,0.3%,0.8%,0.5%,0.6%,0.9%,2.0%,0.2%,0.5%,0.3%,N


In order to be able to merge this dataframe with the covid dataframe, we need to make a common column between the two. The name 'State' is present in both, but we'll have to convert the full names to the abreviation in order to merge the two together.

In [101]:
states = [
        'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL', 'GA', 'HI', 'ID',
        'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO',
        'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA',
        'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY', 'PR'
         ]

In [102]:
#create a dictionary of the state abbreviations
states_acronyms = {'State' : states}
#make a new dataframe using the state abbreviations as the first column
race_final_df = pd.DataFrame(data=states_acronyms)
#Give the new dataframe the percentage information from race_df
for cell, row in race_df.iterrows():
    race_final_df[row.values[0]] = row.values[1:]
race_final_df

Unnamed: 0,State,White,Black or African American,American Indian and Alaska Native,Cherokee tribal grouping,Chippewa tribal grouping,Navajo tribal grouping,Sioux tribal grouping,Asian,Asian Indian,...,Japanese,Korean,Vietnamese,Other Asian,Native Hawaiian and Other Pacific Islander,Native Hawaiian,Guamanian or Chamorro,Samoan,Other Pacific Islander,Some other race
0,AL,67.7%,26.7%,0.5%,N,N,N,N,1.3%,0.3%,...,0.1%,0.2%,0.2%,0.2%,0.0%,0.0%,0.0%,0.0%,0.0%,1.7%
1,AK,64.4%,3.4%,15.1%,0.1%,0.1%,0.0%,0.1%,6.3%,0.2%,...,0.1%,0.5%,0.1%,1.1%,1.1%,0.1%,0.2%,0.6%,0.2%,1.3%
2,AZ,78.0%,4.7%,4.6%,0.1%,0.0%,2.1%,0.0%,3.3%,0.8%,...,0.2%,0.2%,0.4%,0.5%,0.2%,0.1%,0.0%,0.0%,0.1%,5.2%
3,AR,76.5%,15.2%,0.7%,N,N,N,N,1.6%,0.3%,...,0.1%,0.1%,0.3%,0.5%,0.3%,N,N,N,N,2.8%
4,CA,59.5%,5.8%,0.8%,0.0%,0.0%,0.0%,0.0%,14.7%,2.1%,...,0.7%,1.2%,1.7%,1.7%,0.4%,0.1%,0.1%,0.1%,0.2%,13.8%
5,CO,84.1%,4.2%,1.0%,0.1%,0.0%,0.1%,0.1%,3.2%,0.6%,...,0.2%,0.4%,0.4%,0.6%,0.1%,0.0%,0.1%,0.0%,0.0%,3.5%
6,CT,75.2%,11.0%,0.3%,N,N,N,N,4.6%,2.0%,...,0.1%,0.3%,0.2%,0.7%,0.0%,N,N,N,N,5.5%
7,DE,68.2%,22.5%,0.5%,N,N,N,N,4.0%,1.6%,...,0.1%,0.2%,0.2%,0.7%,0.1%,N,N,N,N,2.0%
8,DC,42.2%,45.5%,0.3%,N,N,N,N,3.9%,0.9%,...,0.2%,0.5%,0.2%,0.4%,0.1%,N,N,N,N,4.4%
9,FL,74.6%,16.0%,0.3%,0.0%,0.0%,0.0%,0.0%,2.8%,0.8%,...,0.1%,0.1%,0.3%,0.4%,0.1%,0.0%,0.0%,0.0%,0.0%,3.3%


In [103]:
#Combine the COVID-19 comprehensive csv with the race data
combined_df = covid_df.merge(race_final_df, on=['State'], how='left')
combined_df

Unnamed: 0,countyFIPS,County Name,State,StateFIPS,2023-01-10_confirmed,2023-01-11_confirmed,2023-01-12_confirmed,2023-01-13_confirmed,2023-01-14_confirmed,2023-01-15_confirmed,...,Japanese,Korean,Vietnamese,Other Asian,Native Hawaiian and Other Pacific Islander,Native Hawaiian,Guamanian or Chamorro,Samoan,Other Pacific Islander,Some other race
0,0,Statewide Unallocated,AL,1,0,0,0,0,0,0,...,0.1%,0.2%,0.2%,0.2%,0.0%,0.0%,0.0%,0.0%,0.0%,1.7%
1,0,Statewide Unallocated,AL,1,0,0,0,0,0,0,...,0.1%,0.2%,0.2%,0.2%,0.0%,0.0%,0.0%,0.0%,0.0%,1.7%
2,0,Statewide Unallocated,AL,1,0,0,0,0,0,0,...,0.1%,0.2%,0.2%,0.2%,0.0%,0.0%,0.0%,0.0%,0.0%,1.7%
3,0,Statewide Unallocated,AL,1,0,0,0,0,0,0,...,0.1%,0.2%,0.2%,0.2%,0.0%,0.0%,0.0%,0.0%,0.0%,1.7%
4,0,Statewide Unallocated,AL,1,0,0,0,0,0,0,...,0.1%,0.2%,0.2%,0.2%,0.0%,0.0%,0.0%,0.0%,0.0%,1.7%
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5738,56037,Sweetwater County,WY,56,12430,12430,12437,12437,12437,12437,...,0.2%,0.1%,0.0%,0.2%,0.2%,N,N,N,N,1.5%
5739,56039,Teton County,WY,56,12035,12035,12045,12045,12045,12045,...,0.2%,0.1%,0.0%,0.2%,0.2%,N,N,N,N,1.5%
5740,56041,Uinta County,WY,56,6318,6318,6333,6333,6333,6333,...,0.2%,0.1%,0.0%,0.2%,0.2%,N,N,N,N,1.5%
5741,56043,Washakie County,WY,56,2727,2727,2731,2731,2731,2731,...,0.2%,0.1%,0.0%,0.2%,0.2%,N,N,N,N,1.5%
