In [9]:
%load_ext autoreload
%autoreload 2

In [195]:
%matplotlib inline
pd.set_option('display.max_columns', 500)  
pd.set_option('display.max_rows', 500)   

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io
import os
import sys
import time
import json
from IPython.display import display
from time import strftime, gmtime
import boto3
import re
from datetime import datetime

import sagemaker
from sagemaker import get_execution_role
from sagemaker.predictor import csv_serializer
from sagemaker.debugger import rule_configs, Rule, DebuggerHookConfig
from sagemaker.model_monitor import DataCaptureConfig, DatasetFormat, DefaultModelMonitor
from sagemaker.s3 import S3Uploader, S3Downloader

from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent
from smexperiments.tracker import Tracker

%cd /root/predicting-coronavirus
%pip install mpu
import src.data_import as di

/root/predicting-coronavirus
Note: you may need to restart the kernel to use updated packages.


In [4]:
sess = boto3.Session()
sm = sess.client('sagemaker')
role = sagemaker.get_execution_role()

In [15]:
!cd data/covid-19-data;git pull origin master

From https://github.com/nytimes/covid-19-data
 * branch            master     -> FETCH_HEAD
Already up to date.


In [7]:
account_id = sess.client('sts', region_name=sess.region_name).get_caller_identity()["Account"]
bucket = 'sagemaker-studio-{}-{}'.format(sess.region_name, account_id)
prefix = 'capstone2'

try:
    if sess.region_name == "us-east-1":
        sess.client('s3').create_bucket(Bucket=bucket)
    else:
        sess.client('s3').create_bucket(Bucket=bucket, 
                                        CreateBucketConfiguration={'LocationConstraint': sess.region_name})
except Exception as e:
    print("Looks like you already have a bucket of this name. That's good. Uploading the data files...")

# Return the URLs of the uploaded file, so they can be reviewed or used elsewhere
s3url = S3Uploader.upload('data/covid-19-data/us-counties.csv', 's3://{}/{}/{}'.format(bucket, prefix,'nyt'))
print(s3url)


s3://sagemaker-studio-us-east-1-752222400982/capstone2/nyt/us-counties.csv


In [238]:
nyt_df = di.extract_nyt()

In [239]:
nyt_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 128256 entries, 0 to 129746
Data columns (total 7 columns):
 #   Column  Non-Null Count   Dtype         
---  ------  --------------   -----         
 0   date    128256 non-null  datetime64[ns]
 1   county  128256 non-null  object        
 2   state   128256 non-null  object        
 3   fips    128256 non-null  int64         
 4   cases   128256 non-null  int64         
 5   deaths  128256 non-null  int64         
 6   sc      128256 non-null  object        
dtypes: datetime64[ns](1), int64(3), object(3)
memory usage: 7.8+ MB


In [13]:
airports_df = di.extract_airports()
counties_df = di.extract_geography()

In [244]:
hhi = di.extract_hhi()
edu = di.extract_edu()
sip = di.extract_sip()
pubtrans = di.extract_public_transport()
housing = di.extract_housing()
election = di.extract_election()
geo = di.extract_geography()

In [215]:
hhi = di.extract_hhi()
hhi

Unnamed: 0,sc,fips,households,mean_hhi,median_hhi
0,Alabama:Shelby,1117,80944,71230,96481
1,Alabama:Talladega,1121,31219,40315,58110
2,Alabama:Tuscaloosa,1125,74053,54459,71000
3,Arizona:Pinal,4021,147936,59058,69864
4,California:Mendocino,6045,33794,51830,66691
...,...,...,...,...,...
833,California:Butte,6007,88636,51652,72948
834,California:El Dorado,6017,72774,82742,108803
835,California:Imperial,6025,41764,48984,60686
836,California:Kern,6029,273167,51579,71441


In [251]:
big_df = pd.merge(hhi, geo, how='inner', left_on=['fips'], right_on=['fips'])
big_df = pd.merge(edu, big_df, how='inner', left_on=['fips'], right_on=['fips'])
big_df = pd.merge(housing, big_df, how='inner', left_on=['fips'], right_on=['fips'])
big_df = pd.merge(pubtrans, big_df, how='inner', left_on=['fips'], right_on=['fips'])

big_df = pd.merge(big_df, sip, how='left', left_on=['state'], right_on=['state'])



In [252]:
big_df.head()

Unnamed: 0,sc,fips,percent_commuter,sc_x,median_house_price,median_rent,percent_big_buildings,sc_y,pop_over_25,hs,ba_plus,ma_plus,pop_over_65,sc_x.1,households,mean_hhi,median_hhi,sc_y.1,state,population,area-km,SIP,lifted
0,Alabama:Shelby,1117,0.0,Alabama:Shelby,217900,1034,3.5,Alabama:Shelby,149389,19.3,26.9,13.8,33295,Alabama:Shelby,80944,71230,96481,Alabama:Shelby,Alabama,195085,2032.96,2020-04-03 00:00:00,2020-04-30 00:00:00
1,Alabama:Talladega,1121,N,Alabama:Talladega,115800,650,2.9,Alabama:Talladega,56275,34.8,9.2,5.5,14422,Alabama:Talladega,31219,40315,58110,Alabama:Talladega,Alabama,82291,1908.24,2020-04-03 00:00:00,2020-04-30 00:00:00
2,Alabama:Tuscaloosa,1125,0.2,Alabama:Tuscaloosa,173800,852,8.7,Alabama:Tuscaloosa,130662,31.2,17.7,12.2,27472,Alabama:Tuscaloosa,74053,54459,71000,Alabama:Tuscaloosa,Alabama,194656,3423.329,2020-04-03 00:00:00,2020-04-30 00:00:00
3,Arizona:Pinal,4021,0.2,Arizona:Pinal,200200,1059,1.1,Arizona:Pinal,309013,29.3,12.3,7.4,90825,Arizona:Pinal,147936,59058,69864,Arizona:Pinal,Arizona,375770,13896.87,2020-03-30 00:00:00,2020-05-15 00:00:00
4,California:Mendocino,6045,0.0,California:Mendocino,381900,1100,3.0,California:Mendocino,61385,25.3,14.9,8.8,19072,California:Mendocino,33794,51830,66691,California:Mendocino,California,87841,9081.386,2020-03-19 00:00:00,2020-05-15 00:00:00


In [247]:
big_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 827 entries, 0 to 826
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   sc                     827 non-null    object 
 1   fips                   827 non-null    int64  
 2   percent_commuter       827 non-null    object 
 3   sc_x                   827 non-null    object 
 4   median_house_price     827 non-null    int64  
 5   median_rent            827 non-null    int64  
 6   percent_big_buildings  827 non-null    object 
 7   sc_y                   827 non-null    object 
 8   pop_over_25            827 non-null    int64  
 9   hs                     827 non-null    float64
 10  ba_plus                827 non-null    float64
 11  ma_plus                827 non-null    float64
 12  pop_over_65            827 non-null    int64  
 13  sc_x                   827 non-null    object 
 14  households             827 non-null    int64  
 15  mean_h

In [250]:
big_df[big_df['state'] == 'New York'].head(100)

Unnamed: 0,sc,fips,percent_commuter,sc_x,median_house_price,median_rent,percent_big_buildings,sc_y,pop_over_25,hs,ba_plus,ma_plus,pop_over_65,sc_x.1,households,mean_hhi,median_hhi,sc_y.1,state,population,area-km
14,New York:Rockland,36087,7.3,New York:Rockland,454500,1469,7.7,New York:Rockland,203545,23.5,23.2,19.3,51109,New York:Rockland,99502,89812,118960,New York:Rockland,New York,311687,449.493
15,New York:Sullivan,36105,0.8,New York:Sullivan,179900,888,1.7,New York:Sullivan,53383,35.4,12.8,9.2,14198,New York:Sullivan,28900,52274,72044,New York:Sullivan,New York,77547,2507.45
46,New York:Ulster,36111,3.9,New York:Ulster,243700,1163,3.4,New York:Ulster,130157,29.7,17.9,13.8,35125,New York:Ulster,69154,63889,88816,New York:Ulster,New York,182493,2911.756
74,New York:Madison,36053,0.3,New York:Madison,133800,760,2.2,New York:Madison,47705,30.8,13.5,13.5,13087,New York:Madison,26127,59678,74332,New York:Madison,New York,73442,1696.033
97,New York:Queens,36081,51.4,New York:Queens,577400,1588,32.2,New York:Queens,1646542,26.9,21.0,12.4,357630,New York:Queens,788110,69320,89633,New York:Queens,New York,2230722,281.097
98,New York:Westchester,36119,22.8,New York:Westchester,560800,1483,24.9,New York:Westchester,670570,19.4,23.7,25.4,165337,New York:Westchester,352498,94811,151637,New York:Westchester,New York,949113,1114.982
142,New York:Warren,36113,1.3,New York:Warren,195200,876,3.8,New York:Warren,48290,32.1,16.7,15.1,14361,New York:Warren,28007,56482,75644,New York:Warren,New York,65707,2245.396
143,New York:Wayne,36117,0.1,New York:Wayne,130600,719,3.0,New York:Wayne,64140,33.3,14.3,10.4,17292,New York:Wayne,35927,61515,79920,New York:Wayne,New York,93772,1563.903
164,New York:Bronx,36005,58.4,New York:Bronx,436100,1188,63.4,New York:Bronx,934391,27.7,13.4,7.3,183165,New York:Bronx,507370,38467,57617,New York:Bronx,New York,1385108,109.029
192,New York:Broome,36007,4.5,New York:Broome,119500,761,5.6,New York:Broome,129231,32.9,15.7,12.4,36793,New York:Broome,75539,51125,68268,New York:Broome,New York,200600,1827.926


In [205]:
hhi = di.extract_hhi()
#hhi
a = hhi[hhi['state'] == 'Virginia']
for row in hhi:
    print('{}'.format(row))

sc
fips
state
county
households
mean_hhi
median_hhi
