In [1]:
import pandas as pd
from sqlalchemy import create_engine

In [2]:
# Path to the raw per-state COVID CSV, relative to the notebook's working directory.
data = "data/covid_deaths_by_state.csv"

# Read the raw file into a DataFrame and preview the first five rows.
death_df = pd.read_csv(filepath_or_buffer=data)
death_df.head(n=5)

Unnamed: 0,submission_date,state,tot_cases,conf_cases,prob_cases,new_case,pnew_case,tot_death,conf_death,prob_death,new_death,pnew_death,created_at,consent_cases,consent_deaths
0,01/22/2020,CO,0,,,0,,0,,,0,,03/26/2020 04:22:39 PM,Agree,Agree
1,01/23/2020,CO,0,,,0,,0,,,0,,03/26/2020 04:22:39 PM,Agree,Agree
2,01/24/2020,CO,0,,,0,,0,,,0,,03/26/2020 04:22:39 PM,Agree,Agree
3,01/25/2020,CO,0,,,0,,0,,,0,,03/26/2020 04:22:39 PM,Agree,Agree
4,01/26/2020,CO,0,,,0,,0,,,0,,03/26/2020 04:22:39 PM,Agree,Agree


In [5]:
# Remove the columns that are not needed downstream, leaving the submission
# date, state, total cases, new cases and total deaths.
updated_df = death_df.drop(
    labels=[
        'created_at',
        'prob_cases',
        'pnew_case',
        'prob_death',
        'pnew_death',
        'consent_cases',
        'consent_deaths',
        'new_death',
        'conf_death',
        'conf_cases',
    ],
    axis='columns',
)
updated_df.head()

Unnamed: 0,submission_date,state,tot_cases,new_case,tot_death
0,01/22/2020,CO,0,0,0
1,01/23/2020,CO,0,0,0
2,01/24/2020,CO,0,0,0
3,01/25/2020,CO,0,0,0
4,01/26/2020,CO,0,0,0


In [6]:
# Rename the remaining columns to human-readable names.
# Only columns that still exist after the earlier drop step are mapped here:
# 'conf_cases' and 'conf_death' were removed by that step, so mapping them
# had no effect and has been left out.
renamed_df = updated_df.rename(columns={
    'submission_date': 'Submission Date',
    'state': 'State',
    'tot_cases': 'Total Cases',
    'new_case': 'New Cases',
    'tot_death': 'Total Deaths',
})
renamed_df.head()

Unnamed: 0,Submission Date,State,Total Cases,New Cases,Total Deaths
0,01/22/2020,CO,0,0,0
1,01/23/2020,CO,0,0,0
2,01/24/2020,CO,0,0,0
3,01/25/2020,CO,0,0,0
4,01/26/2020,CO,0,0,0


In [7]:
# Drop every row that has a missing value in any column.
# NOTE: dropna only removes rows containing NaN. It does NOT remove the
# zero-case rows that precede a state's first reported case; those are
# removed by the `Total Cases != 0` filter applied afterwards.
# how="any" matches the earlier thresh=5 on this five-column frame, without
# hardcoding the column count.
clean_df = renamed_df.dropna(how="any")
clean_df.head()

Unnamed: 0,Submission Date,State,Total Cases,New Cases,Total Deaths
0,01/22/2020,CO,0,0,0
1,01/23/2020,CO,0,0,0
2,01/24/2020,CO,0,0,0
3,01/25/2020,CO,0,0,0
4,01/26/2020,CO,0,0,0


In [8]:
# Keep only the rows whose total case count is non-zero (Total Cases != 0);
# rows with zero total cases are discarded.
# NOTE: this rebinds `updated_df`, which an earlier cell used for the
# column-trimmed frame, so run the cells in order.
updated_df = clean_df[clean_df['Total Cases'].ne(0)]
updated_df.head()

Unnamed: 0,Submission Date,State,Total Cases,New Cases,Total Deaths
43,03/05/2020,CO,1,1,0
44,03/06/2020,CO,2,1,0
45,03/07/2020,CO,8,6,0
46,03/08/2020,CO,8,0,0
47,03/09/2020,CO,12,4,0


In [9]:
# List the distinct values in the State column so that any abbreviation that
# is not one of the 50 states can be spotted; nothing is dropped in this cell.
updated_df.loc[:, 'State'].unique()


array(['CO', 'FL', 'AZ', 'SC', 'CT', 'NE', 'IA', 'NM', 'KY', 'WY', 'ND',
       'WA', 'TN', 'MA', 'PA', 'NYC', 'OH', 'AL', 'VA', 'MI', 'CA', 'NJ',
       'MS', 'IL', 'TX', 'GA', 'LA', 'WI', 'NV', 'IN', 'PR', 'MD', 'OR',
       'NY', 'OK', 'NC', 'ID', 'UT', 'AR', 'MO', 'DE', 'MN', 'WV', 'RI',
       'DC', 'ME', 'KS', 'SD', 'NH', 'HI', 'MT', 'AK', 'VT', 'GU', 'VI',
       'MP'], dtype=object)

In [10]:
# Postal abbreviations of the 50 US states (DC and the territories are
# intentionally absent), in the same order as before.
state_list = (
    "AL AK AZ AR CA CO CT DE FL GA "
    "HI ID IL IN IA KS KY LA ME MD "
    "MA MI MN MS MO MT NE NV NH NJ "
    "NM NY NC ND OH OK OR PA RI SC "
    "SD TN TX UT VT VA WA WV WI WY"
).split()

In [11]:
# Keep only the rows whose State is in state_list; territories and any other
# abbreviation outside the 50 states are filtered out.
final_df = updated_df.loc[updated_df['State'].isin(state_list), :]

# Show the distinct states that remain, to check that only the 50 states are left.
final_df.loc[:, 'State'].unique()

array(['CO', 'FL', 'AZ', 'SC', 'CT', 'NE', 'IA', 'NM', 'KY', 'WY', 'ND',
       'WA', 'TN', 'MA', 'PA', 'OH', 'AL', 'VA', 'MI', 'CA', 'NJ', 'MS',
       'IL', 'TX', 'GA', 'LA', 'WI', 'NV', 'IN', 'MD', 'OR', 'NY', 'OK',
       'NC', 'ID', 'UT', 'AR', 'MO', 'DE', 'MN', 'WV', 'RI', 'ME', 'KS',
       'SD', 'NH', 'HI', 'MT', 'AK', 'VT'], dtype=object)

In [12]:
# Preview the first five rows of the final, filtered dataframe.
final_df.head(n=5)

Unnamed: 0,Submission Date,State,Total Cases,New Cases,Total Deaths
43,03/05/2020,CO,1,1,0
44,03/06/2020,CO,2,1,0
45,03/07/2020,CO,8,6,0
46,03/08/2020,CO,8,0,0
47,03/09/2020,CO,12,4,0


In [13]:
# Create the database connection (currently disabled: the code below is commented out).
# NOTE(review): the connection string embeds the database username and password.
# If this cell is re-enabled, read the credentials from the environment instead
# of hardcoding them in the notebook.
#connection_string = "postgres:postgres@localhost:5432/covid_death_db"
#engine = create_engine(f'postgresql://{connection_string}')

In [14]:
# Confirm which tables exist in the database (currently disabled).
# NOTE(review): Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed
# in 2.0; sqlalchemy.inspect(engine).get_table_names() is the documented
# replacement. Confirm against the installed SQLAlchemy version before re-enabling.
#engine.table_names()

In [15]:
# Load the final dataframe into the `covid_deaths` table (currently disabled).
# NOTE(review): if_exists='append' adds the rows to the table on every run, so
# re-running this cell would insert the same data again.
#final_df.to_sql(name='covid_deaths', con=engine, if_exists='append',index=False)