# Covid-19 to SQL

- Updates data from Johns Hopkins
- Populates CovidLocs table
- (Re-)populates CovidCases table

In [1]:
import getpass
import time
from os.path import basename
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import requests
import sqlalchemy as sal

In [2]:
# Connection settings for the PostgreSQL database used by the cells below.
endpoint = "capstone.clihskgj8i7s.us-west-2.rds.amazonaws.com"
user="group3"
db="db1"
# Ask for the password interactively so it is never stored in the notebook.
pw=getpass.getpass("Enter database password")

Enter database password········


In [3]:
# Percent-encode the credentials before building the URL: reserved characters
# such as "@", ":" or "/" in a password would otherwise be parsed as URL
# delimiters and corrupt the connection string.
engine = sal.create_engine(
    'postgresql://%s:%s@%s/%s' % (quote_plus(user), quote_plus(pw), endpoint, db)
)

In [4]:
# Both CSVs live in the same directory of the CSSEGISandData/COVID-19 repository;
# only the file name differs.
_CSSE_TIME_SERIES = (
    "https://github.com/CSSEGISandData/COVID-19/raw/master/"
    "csse_covid_19_data/csse_covid_19_time_series/"
)
url1 = _CSSE_TIME_SERIES + "time_series_covid19_deaths_US.csv"     # deaths
url2 = _CSSE_TIME_SERIES + "time_series_covid19_confirmed_US.csv"  # confirmed cases


In [5]:
# Download each CSV into the working directory, named after the last URL segment.
for url in [url1, url2]:
    # A timeout keeps the cell from hanging forever on a stalled connection.
    r = requests.get(url, timeout=60)
    # Fail loudly on an HTTP error instead of silently saving the error page
    # under a .csv name and parsing it later.
    r.raise_for_status()

    with open(basename(url), "wb") as fid:
        fid.write(r.content)

In [6]:
!ls *.csv

time_series_covid19_confirmed_US.csv  time_series_covid19_deaths_US.csv


In [7]:
# Load the downloaded files from the working directory:
# dfd <- file named after url1 (deaths), dfc <- file named after url2 (confirmed).
dfd=pd.read_csv(basename(url1))
dfc=pd.read_csv(basename(url2))

In [8]:
dfd.columns[0:13]

Index(['UID', 'iso2', 'iso3', 'code3', 'FIPS', 'Admin2', 'Province_State',
       'Country_Region', 'Lat', 'Long_', 'Combined_Key', 'Population',
       '1/22/20'],
      dtype='object')

In [9]:
dfc.columns[0:13]

Index(['UID', 'iso2', 'iso3', 'code3', 'FIPS', 'Admin2', 'Province_State',
       'Country_Region', 'Lat', 'Long_', 'Combined_Key', '1/22/20', '1/23/20'],
      dtype='object')

In [10]:
dfc["Combined_Key"].head()

0    Autauga, Alabama, US
1    Baldwin, Alabama, US
2    Barbour, Alabama, US
3       Bibb, Alabama, US
4     Blount, Alabama, US
Name: Combined_Key, dtype: object

In [11]:
pd.read_sql('select * from CovidLocs limit 1', engine)

Unnamed: 0,uid,iso2,iso3,code3,fips,admin2,province_state,country_region,latitude,longitude,combined_key,population
0,84001001,US,USA,840,1001.0,Autauga,Alabama,US,32.539527,-86.644082,"Autauga, Alabama, US",55869.0


In [13]:
#CovidLocs["latlon"] = CovidLocs[["Lat","Long_"]].apply(lambda x: tuple([*x]), axis=1)
#CovidLocs=CovidLocs.drop(columns=["Lat","Long_"], axis=1)

In [14]:
#CovidLocs=CovidLocs[["iso2","iso3","code3","fips","admin2","province_state", \
#                   "country_region","combined_key","latlon","population"]]

In [64]:
#CovidLocs[CovidLocs["admin2"] == np.nan]
#CovidLocs[CovidLocs["iso3"] == "ASM"]
#CovidLocs.loc[84070002,:]

In [66]:
def get_covidlocs(indf):
    """Build the location table from the leading columns of a time-series frame.

    The first 12 columns of ``indf`` are kept, their names are lower-cased,
    the columns at positions 8 and 9 are renamed to ``latitude`` and
    ``longitude``, and the frame is indexed by ``uid``. Placeholder values are
    then replaced by NaN so that they end up as NULL in the database:

    * ``admin2`` entries equal to ``"Unassigned"``
    * zeros in ``latitude``, ``longitude`` and ``population``

    Parameters
    ----------
    indf : pandas.DataFrame
        Wide frame whose first 12 columns are the location attributes
        (in this notebook it is called with the deaths table, which includes
        a ``Population`` column at position 11).

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by ``uid``; ``indf`` itself is left unmodified.
    """
    # Work on an explicit copy: the edits below must never write through to
    # the caller's frame, and a copy avoids SettingWithCopy warnings.
    locs = indf.iloc[:, 0:12].copy()

    cols = list(locs.columns.str.lower())
    cols[8] = "latitude"
    cols[9] = "longitude"
    locs.columns = cols

    locs = locs.set_index("uid", drop=True)

    # Set 'Unassigned' admin2 regions to NaN so they become NULL in the database.
    locs["admin2"] = locs["admin2"].mask(locs["admin2"] == "Unassigned", np.nan)

    # Zero latitude / longitude / population are placeholders as well. mask()
    # upcasts integer columns to float when needed, unlike assigning NaN into
    # an integer column through .loc.
    for col in ["latitude", "longitude", "population"]:
        locs[col] = locs[col].mask(locs[col] == 0.0, np.nan)

    return locs

In [67]:
CovidLocs=get_covidlocs(dfd)

In [68]:
CovidLocs.head(n=5)

Unnamed: 0_level_0,iso2,iso3,code3,fips,admin2,province_state,country_region,latitude,longitude,combined_key,population
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
84001001,US,USA,840,1001.0,Autauga,Alabama,US,32.539527,-86.644082,"Autauga, Alabama, US",55869.0
84001003,US,USA,840,1003.0,Baldwin,Alabama,US,30.72775,-87.722071,"Baldwin, Alabama, US",223234.0
84001005,US,USA,840,1005.0,Barbour,Alabama,US,31.868263,-85.387129,"Barbour, Alabama, US",24686.0
84001007,US,USA,840,1007.0,Bibb,Alabama,US,32.996421,-87.125115,"Bibb, Alabama, US",22394.0
84001009,US,USA,840,1009.0,Blount,Alabama,US,33.982109,-86.567906,"Blount, Alabama, US",57826.0


Admin2, fips, latitude and longitude now contain NaN (null) values. I had to remove the 'not null' constraints from these attributes to make the code below work.

We might not want to update the `CovidLocs` table each day, assuming these values remain unchanged.  So I commented out the cells below.

In [69]:
#drop all values from covidlocs
#res=engine.execute("delete from covidlocs")

In [70]:
#CovidLocs.to_sql("covidlocs", engine, if_exists='append');

In [71]:
pd.read_sql('select * from covidlocs order by uid desc limit 5', engine, index_col="uid")

Unnamed: 0_level_0,iso2,iso3,code3,fips,admin2,province_state,country_region,latitude,longitude,combined_key,population
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
84099999,US,USA,840,99999.0,,Grand Princess,US,,,"Grand Princess, US",
84090056,US,USA,840,90056.0,,Wyoming,US,,,"Unassigned, Wyoming, US",
84090055,US,USA,840,90055.0,,Wisconsin,US,,,"Unassigned, Wisconsin, US",
84090054,US,USA,840,90054.0,,West Virginia,US,,,"Unassigned, West Virginia, US",
84090053,US,USA,840,90053.0,,Washington,US,,,"Unassigned, Washington, US",


There are a lot of undefined values and zeros in the data.  Checking that these were also present in the dataframe (sorting the same way):

In [72]:
CovidLocs.sort_values(by="uid", ascending=False).head()

Unnamed: 0_level_0,iso2,iso3,code3,fips,admin2,province_state,country_region,latitude,longitude,combined_key,population
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
84099999,US,USA,840,99999.0,,Grand Princess,US,,,"Grand Princess, US",
84090056,US,USA,840,90056.0,,Wyoming,US,,,"Unassigned, Wyoming, US",
84090055,US,USA,840,90055.0,,Wisconsin,US,,,"Unassigned, Wisconsin, US",
84090054,US,USA,840,90054.0,,West Virginia,US,,,"Unassigned, West Virginia, US",
84090053,US,USA,840,90053.0,,Washington,US,,,"Unassigned, Washington, US",


Now create table CovidCases with uid, datestamp, cases and type:

In [73]:
pd.read_sql('select * from covidcases limit 1', engine)

Unnamed: 0,cid,cloc,rdate,ncas,ctype
0,1,84001001,2020-01-22,0,C


My first 'naive' attempt at converting the data into one row per day and location.  Would take way too long.

In [34]:
"""
#start column for cases and deaths, respectively
nd_i={"C": 11, "D": 12}
df={"C": dfc, "D": dfd}

c=0

for ct in nd_i.keys():
    adf=df[ct]
    for l in range(nloc):
        print("\rprocessing loc %d out of %d" % (l+1,nloc), end="")
        for n in range(nd_i[ct],nd):
            uid=adf.loc[l,"UID"]
            date=adf.columns[n]
            ncas=adf.iloc[l,n]
            covidcases.loc[c,"cloc"] = int(uid)
            covidcases.loc[c,"rdate"] = str(date)
            covidcases.loc[c,"ncas"] = int(ncas)
            covidcases.loc[c,"ctype"] = str(ct)
            c+=1
    print(" ok.")
""";

A much faster way to form the data into one row per date and location:

In [35]:
#xi=list([0])+list(range(11,nd))
def covid19_col2row(indf, ctype="C"):
    """Reshape a wide time-series frame into one row per location and date.

    ``ctype`` selects the table layout: ``"C"`` (confirmed) or ``"D"``
    (deaths). Once ``UID`` has been moved into the index, the date columns
    start at position 10 for ``"C"`` and at position 11 for ``"D"``.

    Returns a DataFrame with the columns ``cloc`` (the UID), ``rdate`` (the
    original column label), ``ncas`` (the cell value) and ``ctype``.
    """
    assert ctype in ("C", "D"), "unknown ctype"
    first_date_col = {"C": 10, "D": 11}[ctype]

    by_uid = indf.set_index("UID", drop=True)
    # stack() turns the date columns into a second index level; resetting both
    # levels yields plain (UID, date label, value) columns.
    long_form = by_uid.iloc[:, first_date_col:].stack().reset_index(level=[0, 1])
    long_form.columns = ["cloc", "rdate", "ncas"]
    long_form["ctype"] = ctype

    return long_form

In [36]:
stc=covid19_col2row(dfc, ctype="C")
std=covid19_col2row(dfd, ctype="D")

In [37]:
covidcases=pd.concat([stc,std], axis=0)

In [53]:
nrows=len(covidcases)
print("Number of rows in covidcases: %d" % nrows)

Number of rows in covidcases: 2585160


In [56]:
#covidcases.to_sql?

The pandas `to_sql()` command crashes when trying to push all 2.5 million rows to the database, even when using a small value for the chunksize.  My workaround was to write only a few thousand rows at a time, loading the data in a loop:

**Update**: It seems to work when using `method="multi"` instead of the default `method=None`.  It takes a while, though.  Maybe a beefier EC2 and/or RDS instance would help.

In [59]:
import time

In [62]:
# Empty the table inside an explicit transaction: engine.begin() commits when
# the block exits normally and rolls back if the statement raises.
# Engine.execute() with a raw SQL string is deprecated in SQLAlchemy 1.4 and
# removed in 2.0, so the statement goes through a connection and sal.text().
with engine.begin() as conn:
    res = conn.execute(sal.text("delete from covidcases"))

In [63]:
# Time the bulk upload. perf_counter() is a monotonic clock, so the elapsed
# time cannot be skewed by system clock adjustments during the upload.
t1 = time.perf_counter()
covidcases.to_sql("covidcases", engine, if_exists="append",
                  index=False, chunksize=1000, method="multi")
t2 = time.perf_counter()
# Report the size of the frame that was actually written.
print("Time for writing %d rows: %5.1fs" % (len(covidcases), t2 - t1))

Time for writing 2585160 rows: 224.6s


In [54]:
#from ipywidgets import IntProgress
#from IPython.display import display

In [58]:
"""
step=10000
wb=IntProgress(min=0, max=nrows)
display(wb)
for n in range(0, nrows, step):
    wb.value=n
    covidcases.iloc[n:n+step,:].to_sql("covidcases", engine, if_exists="append", 
                            index=False, chunksize=100)
    print("\rWriting rows %d to %d (total %d)" % (n, n+step, nrows), end="")
""";