# Get April cases for covid and relevant columns
* required datasets: United_States_COVID-19_Cases_and_Deaths_by_State_over_Time.csv, us_state_population.csv
* also adds a column of abbreviated state names

In [16]:
# importing libraries
import pandas as pd 
import numpy as np

In [17]:
# Read the state-level COVID-19 cases/deaths time series into a dataframe,
# then preview its first 50 rows.
us_covid = pd.read_csv(
    filepath_or_buffer="United_States_COVID-19_Cases_and_Deaths_by_State_over_Time.csv"
)
us_covid.head(n=50)

Unnamed: 0,submission_date,state,tot_cases,conf_cases,prob_cases,new_case,pnew_case,tot_death,conf_death,prob_death,new_death,pnew_death,created_at,consent_cases,consent_deaths
0,12/08/2020,NM,109947,,,0,0.0,1756,,,0,0.0,12/09/2020 02:45:40 PM,,Not agree
1,08/21/2020,MD,103523,,,624,0.0,3685,3546.0,139.0,11,1.0,08/22/2020 02:21:09 PM,,Agree
2,03/05/2020,GA,2,,,-5,,0,,,0,,03/26/2020 04:22:39 PM,Agree,Agree
3,02/08/2020,WA,1,,,0,,0,,,0,,03/26/2020 04:22:39 PM,,
4,04/11/2020,ND,293,,,15,,7,,,1,,04/10/2020 04:22:39 PM,Agree,Not agree
5,11/14/2020,VA,201961,183455.0,18506.0,1161,191.0,3800,3527.0,273.0,1,0.0,11/15/2020 03:12:13 PM,Agree,Agree
6,12/02/2020,NY,346492,,,5775,0.0,10117,,,56,0.0,12/03/2020 03:21:08 PM,Not agree,Not agree
7,01/23/2020,WY,0,,,0,,0,,,0,,03/26/2020 04:22:39 PM,Agree,Agree
8,04/10/2020,MA,20845,,,1904,,599,,,96,,04/09/2020 04:22:39 PM,Agree,Agree
9,11/15/2020,WA,130040,,,2309,0.0,2519,,,0,0.0,11/16/2020 06:40:02 PM,,


We want to remove the columns that are unnecessary and clean the data set. We are concerned only with the new cases, total cases, new deaths and total deaths.

In [18]:
# Drop the columns we do not analyse and rename the submission date column.
# The frame is reassigned instead of mutated with inplace=True, and
# errors='ignore' is passed to drop(), so this cell can be re-run safely:
# a second run no longer raises KeyError for columns that are already gone.
to_drop = ['conf_cases','prob_cases','pnew_case','conf_death','prob_death','pnew_death','created_at','consent_cases','consent_deaths']
us_covid = (
    us_covid
    .drop(columns=to_drop, errors='ignore')
    .rename(columns={'submission_date': 'date'})
)
us_covid.head()

Unnamed: 0,date,state,tot_cases,new_case,tot_death,new_death
0,12/08/2020,NM,109947,0,1756,0
1,08/21/2020,MD,103523,624,3685,11
2,03/05/2020,GA,2,-5,0,0
3,02/08/2020,WA,1,0,0,0
4,04/11/2020,ND,293,15,7,1


We will now keep only the rows dated in April 2020 for each state and discard all other dates, as April is the month covered by our Sentiment Analysis of Tweets.

In [19]:
# Keep only the April 2020 rows for each state.
# The 'date' column holds MM/DD/YYYY strings, so comparing it as text is
# lexicographic and effectively ignores the year (e.g. '04/15/2021' sorts
# inside the '04/01/2020'..'04/30/2020' range). Parse the strings to real
# datetimes for the filter only; the column itself is left untouched so the
# exported values keep their original format.
submission_dates = pd.to_datetime(us_covid['date'], format='%m/%d/%Y')
in_april_2020 = submission_dates.between(
    pd.Timestamp('2020-04-01'), pd.Timestamp('2020-04-30')  # inclusive on both ends
)
april = us_covid[in_april_2020]
april.head()

Unnamed: 0,date,state,tot_cases,new_case,tot_death,new_death
4,04/11/2020,ND,293,15,7,1
8,04/10/2020,MA,20845,1904,599,96
16,04/07/2020,MS,2003,88,67,8
27,04/19/2020,MI,31424,707,2391,84
34,04/14/2020,ND,341,10,9,1


Export the cleaned data set to a csv file.

In [20]:
# Write the April-only rows to disk, leaving out the dataframe index
april.to_csv(path_or_buf='april_covid_deaths_cases.csv', index=False)

Load the dataset containing the 2020 population estimate for each US state. If the file is not in the notebook's working directory, replace the file name with the full path to the file.

In [10]:
# Load the per-state population table and give the estimate column a short name
population = pd.read_csv("us_state_population.csv").rename(
    columns={'Population Estimate (as of July 1 2020)': 'population_estimate'}
)
print(population)

                   State  population_estimate
0                 Alaska               734002
1                Alabama              4908621
2               Arkansas              3038999
3                Arizona              7378494
4             California             39937489
5               Colorado              5845526
6            Connecticut              3563077
7   District of Columbia               720687
8               Delaware               982895
9                Florida             21992985
10               Georgia             10736059
11                Hawaii              1412687
12                  Iowa              3179849
13                 Idaho              1826156
14              Illinois             12659682
15               Indiana              6745354
16                Kansas              2910357
17              Kentucky              4499692
18             Louisiana              4645184
19         Massachusetts              6976597
20              Maryland          

The following is a dictionary sourced from a GitHub repository with the abbreviated names for each state.

In [11]:
# Lookup table: full name of each US state, district or territory -> its
# two-letter postal code. Entries are kept in alphabetical order by name,
# grouped here by initial letter.
#
# Source: https://gist.github.com/rogerallen/1583593
# (United States of America Python Dictionary to translate States,
# Districts & Territories to Two-Letter codes and vice versa.)
#
# Dedicated to the public domain.  To the extent possible under law,
# Roger Allen has waived all copyright and related or neighboring
# rights to this code.

us_state_abbrev = {
    "Alabama": "AL", "Alaska": "AK", "American Samoa": "AS", "Arizona": "AZ", "Arkansas": "AR",
    "California": "CA", "Colorado": "CO", "Connecticut": "CT",
    "Delaware": "DE", "District of Columbia": "DC",
    "Florida": "FL",
    "Georgia": "GA", "Guam": "GU",
    "Hawaii": "HI",
    "Idaho": "ID", "Illinois": "IL", "Indiana": "IN", "Iowa": "IA",
    "Kansas": "KS", "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME", "Maryland": "MD", "Massachusetts": "MA", "Michigan": "MI",
    "Minnesota": "MN", "Mississippi": "MS", "Missouri": "MO", "Montana": "MT",
    "Nebraska": "NE", "Nevada": "NV", "New Hampshire": "NH", "New Jersey": "NJ",
    "New Mexico": "NM", "New York": "NY", "North Carolina": "NC", "North Dakota": "ND",
    "Northern Mariana Islands": "MP",
    "Ohio": "OH", "Oklahoma": "OK", "Oregon": "OR",
    "Pennsylvania": "PA", "Puerto Rico": "PR",
    "Rhode Island": "RI",
    "South Carolina": "SC", "South Dakota": "SD",
    "Tennessee": "TN", "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT", "Virgin Islands": "VI", "Virginia": "VA",
    "Washington": "WA", "West Virginia": "WV", "Wisconsin": "WI", "Wyoming": "WY",
}
print(us_state_abbrev)

{'Alabama': 'AL', 'Alaska': 'AK', 'American Samoa': 'AS', 'Arizona': 'AZ', 'Arkansas': 'AR', 'California': 'CA', 'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE', 'District of Columbia': 'DC', 'Florida': 'FL', 'Georgia': 'GA', 'Guam': 'GU', 'Hawaii': 'HI', 'Idaho': 'ID', 'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA', 'Kansas': 'KS', 'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD', 'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN', 'Mississippi': 'MS', 'Missouri': 'MO', 'Montana': 'MT', 'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH', 'New Jersey': 'NJ', 'New Mexico': 'NM', 'New York': 'NY', 'North Carolina': 'NC', 'North Dakota': 'ND', 'Northern Mariana Islands': 'MP', 'Ohio': 'OH', 'Oklahoma': 'OK', 'Oregon': 'OR', 'Pennsylvania': 'PA', 'Puerto Rico': 'PR', 'Rhode Island': 'RI', 'South Carolina': 'SC', 'South Dakota': 'SD', 'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT', 'Vermont': 'VT', 'Virgin Islands': 'VI', 'Virginia': 'VA', 'Washingt

Add a column with the state abbreviations to the dataframe from the dictionary.


In [12]:
# Look up each state's two-letter code in the dictionary and store it
# in a new 'state_abbreviation' column, then show the result and its schema.
population = population.assign(
    state_abbreviation=population['State'].map(us_state_abbrev)
)
print(population)
population.info()

                   State  population_estimate state_abbreviation
0                 Alaska               734002                 AK
1                Alabama              4908621                 AL
2               Arkansas              3038999                 AR
3                Arizona              7378494                 AZ
4             California             39937489                 CA
5               Colorado              5845526                 CO
6            Connecticut              3563077                 CT
7   District of Columbia               720687                 DC
8               Delaware               982895                 DE
9                Florida             21992985                 FL
10               Georgia             10736059                 GA
11                Hawaii              1412687                 HI
12                  Iowa              3179849                 IA
13                 Idaho              1826156                 ID
14              Illinois 

In [13]:
# Persist the population table (now including the abbreviation column) without the index
population.to_csv(path_or_buf='us_population_abbreviated_state_name.csv', index=False)