# Covid-19 to SQL

- Updates data from Johns Hopkins
- Populates CovidLocs table
- (Re-)populates CovidCases table

In [1]:
import getpass
from os.path import basename
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import requests
import sqlalchemy as sal

In [3]:
# Database connection settings.  The password is requested interactively so
# that it is never stored in the notebook itself.
endpoint = "capstone.clihskgj8i7s.us-west-2.rds.amazonaws.com"
user = "group3"
db = "db1"
pw = getpass.getpass(prompt="Enter database password")

Enter database password········


In [4]:
# Build the PostgreSQL connection URL.  User name and password are
# percent-encoded so that characters such as '@', ':' or '/' in the password
# cannot be mis-parsed as URL delimiters.
engine = sal.create_engine(
    "postgresql://%s:%s@%s/%s" % (quote_plus(user), quote_plus(pw), endpoint, db)
)

In [5]:
# Download locations of the two US time-series files:
# url1 = deaths, url2 = confirmed cases.
_jhu_base = (
    "https://github.com/CSSEGISandData/COVID-19/raw/master/"
    "csse_covid_19_data/csse_covid_19_time_series/"
)
url1 = _jhu_base + "time_series_covid19_deaths_US.csv"
url2 = _jhu_base + "time_series_covid19_confirmed_US.csv"


In [6]:
# Download each CSV into the working directory under its original file name.
for url in [url1, url2]:
    # A timeout keeps a stalled connection from hanging the notebook forever.
    r = requests.get(url, timeout=60)
    # Stop on HTTP errors instead of saving an error page as if it were CSV.
    r.raise_for_status()

    with open(basename(url), "wb") as fid:
        fid.write(r.content)

In [7]:
!ls *.csv

time_series_covid19_confirmed_US.csv  time_series_covid19_deaths_US.csv


In [8]:
# Load the two downloaded files: dfd = deaths (url1), dfc = confirmed (url2).
dfd, dfc = (pd.read_csv(basename(src)) for src in (url1, url2))

In [9]:
dfd.columns[0:13]

Index(['UID', 'iso2', 'iso3', 'code3', 'FIPS', 'Admin2', 'Province_State',
       'Country_Region', 'Lat', 'Long_', 'Combined_Key', 'Population',
       '1/22/20'],
      dtype='object')

In [10]:
dfc.columns[0:13]

Index(['UID', 'iso2', 'iso3', 'code3', 'FIPS', 'Admin2', 'Province_State',
       'Country_Region', 'Lat', 'Long_', 'Combined_Key', '1/22/20', '1/23/20'],
      dtype='object')

In [11]:
dfc["Combined_Key"].head()

0    Autauga, Alabama, US
1    Baldwin, Alabama, US
2    Barbour, Alabama, US
3       Bibb, Alabama, US
4     Blount, Alabama, US
Name: Combined_Key, dtype: object

In [19]:
pd.read_sql('select * from CovidLocs limit 1', engine)

Unnamed: 0,uid,iso2,iso3,code3,fips,admin2,province_state,country_region,latitude,longitude,combined_key,population
0,84001001,US,USA,840,1001.0,Autauga,Alabama,US,32.539527,-86.644082,"Autauga, Alabama, US",55869.0


In [13]:
# The first 12 columns of the deaths file (UID ... Population) are the
# per-location attributes.  Take an explicit copy: later cells rename columns
# and assign NaNs, which on a bare slice of dfd triggers pandas'
# SettingWithCopyWarning.
CovidLocs = dfd.iloc[:, 0:12].copy()

In [14]:
#CovidLocs["latlon"] = CovidLocs[["Lat","Long_"]].apply(lambda x: tuple([*x]), axis=1)
#CovidLocs=CovidLocs.drop(columns=["Lat","Long_"], axis=1)

In [None]:
#CovidLocs=CovidLocs[["iso2","iso3","code3","fips","admin2","province_state", \
#                   "country_region","combined_key","latlon","population"]]

In [None]:
# Lower-case every column name and give the coordinate columns descriptive
# names.  Renaming by name instead of by position keeps the cell correct when
# it is re-run (e.g. after "uid" has been moved into the index, which shifts
# every column position by one).
renames = {"lat": "latitude", "long_": "longitude"}
cols = [renames.get(name, name) for name in CovidLocs.columns.str.lower()]
CovidLocs.columns = cols

In [None]:
# Use the location UID as the index.  Guarded so that re-running the cell
# after "uid" has already become the index does not raise a KeyError.
if CovidLocs.index.name != "uid":
    CovidLocs = CovidLocs.set_index("uid", drop=True)

In [None]:
# Spot-check a single location by its UID.
# (Missing values cannot be located with `== np.nan`, which is never true;
# use .isna() for that kind of check.)
CovidLocs.loc[84070002, :]

In [None]:
# Replace the placeholder 'Unassigned' in admin2 with NaN so that these
# entries are written to the database as NULL.
is_unassigned = CovidLocs["admin2"].eq("Unassigned")
CovidLocs.loc[is_unassigned, "admin2"] = np.nan

In [None]:
# Zeros in latitude, longitude and population are treated as missing and
# replaced by NaN.
for name in ("latitude", "longitude", "population"):
    CovidLocs.loc[CovidLocs[name].eq(0.0), name] = np.nan

In [None]:
CovidLocs.head(n=5)

Admin2, fips, latitude and longitude now contain NaN (null) values. I had to remove the 'not null' constraints from these attributes to make the code below work:

In [None]:
# Append the location rows to table "covidlocs".  The trailing ';' suppresses
# the cell's output.
CovidLocs.to_sql(name="covidlocs", con=engine, if_exists="append");

In [8]:
pd.read_sql('select * from covidlocs order by uid desc limit 5', engine, index_col="uid")

Unnamed: 0_level_0,iso2,iso3,code3,fips,admin2,province_state,country_region,latitude,longitude,combined_key,population
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
84099999,US,USA,840,99999.0,,Grand Princess,US,,,"Grand Princess, US",
84090056,US,USA,840,90056.0,,Wyoming,US,,,"Unassigned, Wyoming, US",
84090055,US,USA,840,90055.0,,Wisconsin,US,,,"Unassigned, Wisconsin, US",
84090054,US,USA,840,90054.0,,West Virginia,US,,,"Unassigned, West Virginia, US",
84090053,US,USA,840,90053.0,,Washington,US,,,"Unassigned, Washington, US",


There are a lot of undefined values and zeros in the data.  Checking that these were also present in the dataframe (sorting the same way):

In [None]:
CovidLocs.sort_values(by="uid", ascending=False).head()

Now create table CovidCases with uid, datestamp, cases and type:

In [47]:
pd.read_sql('select * from covidcases', engine)

Unnamed: 0,cid,cloc,rdate,ncas,ctype


In [125]:
# Start from an empty frame with the covidcases columns that this notebook
# fills in, then display it.
case_columns = ["cloc", "rdate", "ncas", "ctype"]
covidcases = pd.DataFrame(columns=case_columns)
covidcases

Unnamed: 0,cloc,rdate,ncas,ctype


In [38]:
# Row count and column count of the confirmed-cases frame.
nloc, nd = len(dfc.index), len(dfc.columns)

In [35]:
#rs=engine.execute("insert into covidcases(cloc,rdate,ctype) values(84099999,'01/22/20','C');")

In [23]:
# NOTE(review): `rs` is only assigned in the commented-out engine.execute(...)
# line of the preceding cell, so this raises NameError on a fresh kernel.
rs.fetchall()

[]

In [42]:
# Write a single value into row 0, column "cloc"
# (full-row variant kept for reference: [0,'01/22/20','C']).
covidcases.loc[0, "cloc"] = 0

My first 'naive' attempt at converting the data into one row per day and location.  Would take way too long.

In [88]:
"""
#start column for cases and deaths, respectively
nd_i={"C": 11, "D": 12}
df={"C": dfc, "D": dfd}

c=0

for ct in nd_i.keys():
    adf=df[ct]
    for l in range(nloc):
        print("\rprocessing loc %d out of %d" % (l+1,nloc), end="")
        for n in range(nd_i[ct],nd):
            uid=adf.loc[l,"UID"]
            date=adf.columns[n]
            ncas=adf.iloc[l,n]
            covidcases.loc[c,"cloc"] = int(uid)
            covidcases.loc[c,"rdate"] = str(date)
            covidcases.loc[c,"ncas"] = int(ncas)
            covidcases.loc[c,"ctype"] = str(ct)
            c+=1
    print(" ok.")
""";

In [85]:
# Reshape the wide confirmed-cases table into long form: one row per
# (UID, date column).  With UID moved into the index, the ten remaining
# non-date columns come first, so the date columns start at position 10.
tmp = dfc.set_index("UID", drop=True)
st = tmp.iloc[:, 10:].stack().reset_index(level=[0, 1])

In [87]:
st.tail()

Unnamed: 0,UID,level_1,0
1292575,84056045,2/7/21,616
1292576,84056045,2/8/21,618
1292577,84056045,2/9/21,618
1292578,84056045,2/10/21,617
1292579,84056045,2/11/21,617


In [76]:
# NOTE(review): `mui` is not defined in any visible cell; unless it is defined
# elsewhere, this raises NameError on Restart & Run All.
mui.labels

  """Entry point for launching an IPython kernel.


FrozenList([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, ...]])

In [70]:
#pd.DataFrame(st, columns=["ncas"])

Unnamed: 0_level_0,Unnamed: 1_level_0,ncas
UID,Unnamed: 1_level_1,Unnamed: 2_level_1
84001001,1/22/20,0
84001001,1/23/20,0
84001001,1/24/20,0
84001001,1/25/20,0
84001001,1/26/20,0
84001001,1/27/20,0
84001001,1/28/20,0
84001001,1/29/20,0
84001001,1/30/20,0
84001001,1/31/20,0


In [133]:
type(covidcases.loc[0,"cloc"])

int

In [117]:
#covidcases.to_sql?

In [136]:
# Append the rows to table "covidcases" without writing the frame's index.
# Rows are sent in batched multi-row INSERTs; the previous chunksize=1 issued
# one statement per row, which is impractically slow for large frames.
covidcases.to_sql(
    "covidcases",
    engine,
    if_exists="append",
    index=False,
    chunksize=5000,
    method="multi",
)

In [53]:
nd*nloc

1325980