In [1]:
import json
import os

import pandas as pd
import pycountry
import requests
from sqlalchemy import create_engine
from sqlalchemy import inspect

In [2]:
######### COUNTRY TABLE ###############

In [3]:
#Country population data downloaded from https://worldpopulationreview.com/
file = "Resources/countries.json"

In [4]:
#Opening the file and saving the parsed JSON in "data".
#The encoding is stated explicitly so the result does not depend on the platform's
#default encoding (country names can contain non-ASCII characters).
with open(file, encoding='utf-8') as f:
    data = json.load(f)

In [5]:
#Collecting the id, name and 2020 population of every record under data['data'].
#Only the three lists consumed by the DataFrame below are built; the previously declared
#area/density/growth/world_percent/rank lists were never filled and have been dropped.
#NOTE(review): the displayed values (e.g. 1439323.776 for China) suggest pop2020 is
#expressed in thousands - confirm the unit expected by the 'country' table.
country_records = data['data']
country_id = [record['cca2'] for record in country_records]
name = [record['name'] for record in country_records]
pop = [float(record['pop2020']) for record in country_records]

In [6]:
#Assembling the three lists into a DataFrame (one row per country)
country_df = pd.DataFrame(
    dict(country_id=country_id, country_name=name, population=pop)
)

In [7]:
#Indexing the frame by country_id and previewing the first rows
country_df = country_df.set_index(keys='country_id')
country_df.head()

Unnamed: 0_level_0,country_name,population
country_id,Unnamed: 1_level_1,Unnamed: 2_level_1
CN,China,1439323.776
IN,India,1380004.385
US,United States,331002.651
ID,Indonesia,273523.615
PK,Pakistan,220892.34


In [8]:
######### STATE TABLE ###############

In [9]:
#Wikipedia page whose html table holds the population of each U.S. state
url = (
    'https://en.wikipedia.org/wiki/'
    'List_of_states_and_territories_of_the_United_States_by_population'
)

In [10]:
#Parsing every html table found at the url into a list of DataFrames
state_info = pd.read_html(io=url)

In [11]:
#Storing the first element of the list as a DataFrame.
#A copy is taken so that reassigning the columns later does not alter state_info[0];
#this cell can then be re-run to get the original two-level header back.
state_info_df = state_info[0].copy()
state_info_df.head()

Unnamed: 0_level_0,Rank,Rank,State,Census population,Census population,"Change, 2010–2019","Change, 2010–2019",Total U.S. House of Representatives Seats,"Estimated population per electoral vote, 2019[note 1]",Census population per House seat,Census population per House seat,"Percent of the total U.S. population, 2019[note 2]"
Unnamed: 0_level_1,Current,2010,State,"Estimate, July 1, 2019[5]","April 1, 2010[6]",Percent[note 3],Absolute,Total U.S. House of Representatives Seats,"Estimated population per electoral vote, 2019[note 1]","Estimated, 2019",2010,"Percent of the total U.S. population, 2019[note 2]"
0,1.0,1.0,California,39512223,37254523,6.1%,2257700,53,718404,745514,702885,11.91%
1,2.0,2.0,Texas,28995881,25145561,15.3%,3850320,36,763050,805441,698503,8.74%
2,3.0,4.0,Florida,21477737,18801310,14.2%,2676427,27,740611,795472,696468,6.47%
3,4.0,3.0,New York,19453561,19378102,0.4%,75459,27,670812,720502,717707,5.86%
4,5.0,6.0,Pennsylvania,12801989,12702379,0.8%,99610,18,640099,711222,705715,3.86%


In [12]:
#Flattening the two-level header: only the second level's labels are kept
state_info_df.columns = state_info_df.columns.get_level_values(level=1)

In [13]:
#Deleting every column except the state name and the 2019 population estimate.
#The label '2010' appears twice in the flattened header; listing it once drops both columns.
columns_to_drop = [
    'Current',
    '2010',
    'April 1, 2010[6]',
    'Percent[note 3]',
    'Absolute',
    'Total U.S. House of Representatives Seats',
    'Estimated population per electoral vote, 2019[note 1]',
    'Estimated, 2019',
    'Percent of the total U.S. population, 2019[note 2]',
]
state_info_df = state_info_df.drop(columns=columns_to_drop)


In [14]:
#Renaming the two remaining columns
state_column_names = {
    'State': 'state_name',
    'Estimate, July 1, 2019[5]': 'population',
}
state_info_df = state_info_df.rename(columns=state_column_names)

In [15]:
#Previewing the first rows of the cleaned DataFrame
state_info_df.head(n=5)

Unnamed: 0,state_name,population
0,California,39512223
1,Texas,28995881
2,Florida,21477737
3,New York,19453561
4,Pennsylvania,12801989


In [16]:
#Two-letter abbreviation -> state name for the 50 U.S. states, used to add ids to the table
states = dict(
    AL="Alabama", AK="Alaska", AZ="Arizona", AR="Arkansas", CA="California",
    CO="Colorado", CT="Connecticut", DE="Delaware", FL="Florida", GA="Georgia",
    HI="Hawaii", ID="Idaho", IL="Illinois", IN="Indiana", IA="Iowa",
    KS="Kansas", KY="Kentucky", LA="Louisiana", ME="Maine", MD="Maryland",
    MA="Massachusetts", MI="Michigan", MN="Minnesota", MS="Mississippi", MO="Missouri",
    MT="Montana", NE="Nebraska", NV="Nevada", NH="New Hampshire", NJ="New Jersey",
    NM="New Mexico", NY="New York", NC="North Carolina", ND="North Dakota", OH="Ohio",
    OK="Oklahoma", OR="Oregon", PA="Pennsylvania", RI="Rhode Island", SC="South Carolina",
    SD="South Dakota", TN="Tennessee", TX="Texas", UT="Utah", VT="Vermont",
    VA="Virginia", WA="Washington", WV="West Virginia", WI="Wisconsin", WY="Wyoming",
)

In [17]:
#Turning the dictionary into a two-column DataFrame (abbreviation, name)
state_abrv_df = pd.DataFrame(
    list(states.items()),
    columns=['state_id', 'state_name'],
)
#Previewing the first rows
state_abrv_df.head()

Unnamed: 0,state_id,state_name
0,AL,Alabama
1,AK,Alaska
2,AZ,Arizona
3,AR,Arkansas
4,CA,California


In [18]:
#Left-joining the populations onto the abbreviation table by state name
state_df = pd.merge(state_abrv_df, state_info_df, how='left', on='state_name')

In [19]:
#Setting state_id as the index and previewing the DataFrame
state_df = state_df.set_index(keys='state_id')
state_df.head()

Unnamed: 0_level_0,state_name,population
state_id,Unnamed: 1_level_1,Unnamed: 2_level_1
AL,Alabama,4903185
AK,Alaska,731545
AZ,Arizona,7278717
AR,Arkansas,3017825
CA,California,39512223


In [20]:
######### COUNTRY CASES ###############

In [21]:
#Loading the per-country case reports and previewing the first rows
countries_df = pd.read_csv(filepath_or_buffer='Resources/Country_Cases.csv')
countries_df.head()

Unnamed: 0,iso_code,location,date,total_cases,new_cases,total_deaths,new_deaths,total_cases_per_million,new_cases_per_million,total_deaths_per_million,new_deaths_per_million,total_tests,new_tests,total_tests_per_thousand,new_tests_per_thousand,tests_units
0,ABW,Aruba,3/13/2020,2,2,0,0,18.733,18.733,0.0,0.0,,,,,
1,ABW,Aruba,3/20/2020,4,2,0,0,37.465,18.733,0.0,0.0,,,,,
2,ABW,Aruba,3/24/2020,12,8,0,0,112.395,74.93,0.0,0.0,,,,,
3,ABW,Aruba,3/25/2020,17,5,0,0,159.227,46.831,0.0,0.0,,,,,
4,ABW,Aruba,3/26/2020,19,2,0,0,177.959,18.733,0.0,0.0,,,,,


In [22]:
#Adding an empty placeholder column for the two-letter country id
countries_df = countries_df.assign(Country_ID="")

In [23]:
#Translating each ISO alpha-3 code into its alpha-2 code.
#Every distinct code is looked up once instead of once per row, and codes that pycountry
#cannot resolve are reported explicitly instead of failing with an AttributeError on None.
def _alpha2_or_none(alpha_3):
    """Return the alpha-2 code pycountry gives for `alpha_3`, or None when there is no match."""
    try:
        match = pycountry.countries.get(alpha_3=alpha_3)
    except LookupError:
        #Guard in case the installed pycountry raises on a miss rather than returning None
        return None
    return None if match is None else match.alpha_2


iso_codes = countries_df['iso_code']
alpha2_by_alpha3 = {code: _alpha2_or_none(code) for code in iso_codes.dropna().unique()}

unresolved_codes = sorted(code for code, alpha2 in alpha2_by_alpha3.items() if alpha2 is None)
rows_without_code = int(iso_codes.isna().sum())
if unresolved_codes or rows_without_code:
    raise ValueError(
        f"Cannot derive Country_ID: unknown iso_code values {unresolved_codes}; "
        f"{rows_without_code} row(s) have no iso_code"
    )

country_id = iso_codes.map(alpha2_by_alpha3).tolist()
countries_df['Country_ID'] = country_id
#Keeping the row number as an explicit id column
countries_df["id"] = countries_df.index
countries_df.head()

Unnamed: 0,iso_code,location,date,total_cases,new_cases,total_deaths,new_deaths,total_cases_per_million,new_cases_per_million,total_deaths_per_million,new_deaths_per_million,total_tests,new_tests,total_tests_per_thousand,new_tests_per_thousand,tests_units,Country_ID,id
0,ABW,Aruba,3/13/2020,2,2,0,0,18.733,18.733,0.0,0.0,,,,,,AW,0
1,ABW,Aruba,3/20/2020,4,2,0,0,37.465,18.733,0.0,0.0,,,,,,AW,1
2,ABW,Aruba,3/24/2020,12,8,0,0,112.395,74.93,0.0,0.0,,,,,,AW,2
3,ABW,Aruba,3/25/2020,17,5,0,0,159.227,46.831,0.0,0.0,,,,,,AW,3
4,ABW,Aruba,3/26/2020,19,2,0,0,177.959,18.733,0.0,0.0,,,,,,AW,4


In [24]:
#Keeping only the listed columns, in this order
case_columns = [
    'id', 'date', 'Country_ID',
    'total_cases', 'new_cases',
    'total_deaths', 'new_deaths',
    'total_cases_per_million', 'new_cases_per_million',
    'total_deaths_per_million', 'new_deaths_per_million',
    'total_tests', 'new_tests',
    'total_tests_per_thousand', 'new_tests_per_thousand',
]
countries_df = countries_df[case_columns]

In [25]:
#Renaming date -> report_date and Country_ID -> country_id.
#The result is reassigned instead of using inplace=True, so a frame that was derived
#from a column selection is never mutated in place.
countries_df = countries_df.rename(columns={"date": "report_date",
                                            "Country_ID": "country_id"})

In [26]:
# Connecting to the DB.
# The user:password@host:port/database part can be supplied through the
# COVID19_DB_CONNECTION environment variable so real credentials stay out of the notebook;
# when the variable is unset the previous local-development value is used.
connection_string = os.environ.get(
    "COVID19_DB_CONNECTION",
    "postgres:postgres@localhost:5432/covid19_db",
)
engine = create_engine(f'postgresql://{connection_string}')

In [27]:
#Retrieve the names of the existing tables.
#Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0;
#the inspector API returns the same information.
inspect(engine).get_table_names()

['state',
 'us_states_cases',
 'country',
 'hospital_beds',
 'index_prices',
 'country_cases',
 'us_unemployment_stats',
 'gas_price']

In [28]:
#Append the country DataFrame (with its country_id index) to the 'country' table
country_df.to_sql(con=engine, name='country', index=True, if_exists='append')


In [29]:
#Append the state DataFrame (with its state_id index) to the 'state' table
state_df.to_sql(con=engine, name='state', index=True, if_exists='append')

In [30]:
# Read the first rows back from the 'country' table
pd.read_sql_query(sql='select * from country', con=engine).head()

Unnamed: 0,country_id,country_name,population
0,CN,China,1439324
1,IN,India,1380004
2,US,United States,331003
3,ID,Indonesia,273524
4,PK,Pakistan,220892


In [31]:
# Read the first rows back from the 'state' table
pd.read_sql_query(sql='select * from state', con=engine).head()

Unnamed: 0,state_id,state_name,population
0,AL,Alabama,4903185
1,AK,Alaska,731545
2,AZ,Arizona,7278717
3,AR,Arkansas,3017825
4,CA,California,39512223


In [32]:
#Append the case reports to the 'country_cases' table (the DataFrame index is not written)
countries_df.to_sql(con=engine, name='country_cases', index=False, if_exists='append')

In [33]:
# Read the first rows back from the 'country_cases' table
pd.read_sql_query(sql='select * from country_cases', con=engine).head()

Unnamed: 0,id,report_date,country_id,total_cases,new_cases,total_deaths,new_deaths,total_cases_per_million,new_cases_per_million,total_deaths_per_million,new_deaths_per_million,total_tests,new_tests,total_tests_per_thousand,new_tests_per_thousand
0,0,2020-03-13,AW,2,2,0,0,18.733,18.733,0.0,0.0,,,,
1,1,2020-03-20,AW,4,2,0,0,37.465,18.733,0.0,0.0,,,,
2,2,2020-03-24,AW,12,8,0,0,112.395,74.93,0.0,0.0,,,,
3,3,2020-03-25,AW,17,5,0,0,159.227,46.831,0.0,0.0,,,,
4,4,2020-03-26,AW,19,2,0,0,177.959,18.733,0.0,0.0,,,,
