<a href="https://colab.research.google.com/github/aarsanjani/meansquares/blob/master/NY_LabelingData.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Workflow

This colab aims in labeling the NY data for super spread week, safe week and improvement week as per the 'number of cases' increasing or decreasing every week.

Idea: 
* compute 7 -day moving average
* 14 days data compare with next 14 days rolling window
* split the % as buckets 
* label the data as per bucket

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import os
from pandas import Series, datetime
from pandas.plotting import scatter_matrix, autocorrelation_plot
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, TimeSeriesSplit
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import roc_curve, auc
import random
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima_model import ARIMA
from xgboost import XGBClassifier
from sklearn.mixture import GaussianMixture

  import pandas.util.testing as tm


In [3]:
os.environ["TF_CPP_MIN_LOG_LEVEL"]="2"
import warnings; warnings.simplefilter('ignore')


In [4]:
!ls  '/content/drive/Shared drives/CMPE 295- Master Project/Covid19-data/'

CA_Covid-19.csv     Newyork_combined.csv  queens_all_combined.csv
mask_rule.csv	    NY_Covid-19.csv	  queens_all_filtered.csv
mask_rule_data.csv  NY-CovidAug21.csv	  Queens_county.csv


# Loading NY data

In [5]:
NY_combined_data = pd.read_csv('/content/drive/Shared drives/CMPE 295- Master Project/Covid19-data/Newyork_combined.csv',low_memory=False)
NY_combined_data.head()

Unnamed: 0,Date,fips_x,County Name,retail and recreation,grocery and pharmacy,parks,transit stations,workplaces,residential,driving,m50,m50_index,population_density,mask_rule_active,mask_wearing_percent,New cases
0,2020-03-01,36001,Albany County,10.0,13.0,19.0,4.0,7.0,-1.0,-4.87,2.616,54,584.364958,0,0.0,0
1,2020-03-02,36001,Albany County,11.0,15.0,29.0,5.0,3.0,0.0,4.02,4.68,97,584.364958,0,0.0,0
2,2020-03-03,36001,Albany County,8.0,15.0,41.0,8.0,3.0,-1.0,12.09,4.778,100,584.364958,0,0.0,0
3,2020-03-04,36001,Albany County,7.0,8.0,6.0,2.0,3.0,0.0,11.97,4.486,93,584.364958,0,0.0,0
4,2020-03-05,36001,Albany County,5.0,13.0,18.0,2.0,3.0,-1.0,18.74,5.129,107,584.364958,0,0.0,0


## Computing rolling average

In [6]:
NY_combined_data.iloc[:,15]

0        0
1        0
2        0
3        0
4        0
        ..
10195    1
10196    0
10197    0
10198    0
10199    0
Name: New cases, Length: 10200, dtype: int64

In [7]:
df = pd.DataFrame(columns=['Date','fips_x','County Name','retail and recreation','grocery and pharmacy','parks','transit stations','workplaces','residential','driving','m50','m50_index','population_density','mask_rule_active','mask_wearing_percent','New cases','rolling_avg_new_cases'])


In [8]:
NY_counties = NY_combined_data['County Name'].unique()

for county in NY_counties:
  county_data = NY_combined_data[NY_combined_data['County Name'] == county]
  #print(county_data.iloc[:,15])
  county_data['rolling_avg_new_cases'] = county_data.iloc[:,15].rolling(window=7).mean()
  df = df.append(county_data)

df.head()

Unnamed: 0,Date,fips_x,County Name,retail and recreation,grocery and pharmacy,parks,transit stations,workplaces,residential,driving,m50,m50_index,population_density,mask_rule_active,mask_wearing_percent,New cases,rolling_avg_new_cases
0,2020-03-01,36001,Albany County,10.0,13.0,19.0,4.0,7.0,-1.0,-4.87,2.616,54,584.364958,0,0.0,0,
1,2020-03-02,36001,Albany County,11.0,15.0,29.0,5.0,3.0,0.0,4.02,4.68,97,584.364958,0,0.0,0,
2,2020-03-03,36001,Albany County,8.0,15.0,41.0,8.0,3.0,-1.0,12.09,4.778,100,584.364958,0,0.0,0,
3,2020-03-04,36001,Albany County,7.0,8.0,6.0,2.0,3.0,0.0,11.97,4.486,93,584.364958,0,0.0,0,
4,2020-03-05,36001,Albany County,5.0,13.0,18.0,2.0,3.0,-1.0,18.74,5.129,107,584.364958,0,0.0,0,


In [9]:
print(df.shape)
NY_combined_data.shape

(10200, 17)


(10200, 16)

In [10]:
NY_combined_data = df.copy()
NY_combined_data

Unnamed: 0,Date,fips_x,County Name,retail and recreation,grocery and pharmacy,parks,transit stations,workplaces,residential,driving,m50,m50_index,population_density,mask_rule_active,mask_wearing_percent,New cases,rolling_avg_new_cases
0,2020-03-01,36001,Albany County,10.0,13.0,19.0,4.0,7.0,-1.0,-4.87,2.616,54,584.364958,0,0.000,0,
1,2020-03-02,36001,Albany County,11.0,15.0,29.0,5.0,3.0,0.0,4.02,4.680,97,584.364958,0,0.000,0,
2,2020-03-03,36001,Albany County,8.0,15.0,41.0,8.0,3.0,-1.0,12.09,4.778,100,584.364958,0,0.000,0,
3,2020-03-04,36001,Albany County,7.0,8.0,6.0,2.0,3.0,0.0,11.97,4.486,93,584.364958,0,0.000,0,
4,2020-03-05,36001,Albany County,5.0,13.0,18.0,2.0,3.0,-1.0,18.74,5.129,107,584.364958,0,0.000,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10195,2020-08-15,36123,Yates County,0.0,0.0,0.0,0.0,4.0,0.0,519.68,7.904,113,73.676584,1,0.884,1,0.428571
10196,2020-08-16,36123,Yates County,0.0,0.0,0.0,0.0,2.0,0.0,356.46,4.966,71,73.676584,1,0.884,0,0.285714
10197,2020-08-17,36123,Yates County,0.0,14.0,0.0,0.0,-23.0,0.0,281.57,4.181,60,73.676584,1,0.884,0,0.285714
10198,2020-08-18,36123,Yates County,0.0,0.0,0.0,0.0,0.0,0.0,276.14,3.576,51,73.676584,1,0.884,0,0.285714


In [11]:
NY_combined_data[NY_combined_data['Date'] == '2020-08-01']

Unnamed: 0,Date,fips_x,County Name,retail and recreation,grocery and pharmacy,parks,transit stations,workplaces,residential,driving,m50,m50_index,population_density,mask_rule_active,mask_wearing_percent,New cases,rolling_avg_new_cases
151,2020-08-01,36001,Albany County,-33.0,-10.0,0.0,-27.0,-14.0,1.0,35.56,2.26,47,584.364958,1,0.788,12,11.0
321,2020-08-01,36003,Allegany County,-6.0,14.0,0.0,0.0,-12.0,0.0,138.45,5.929,105,44.778541,1,0.771,0,0.0
491,2020-08-01,36007,Broome County,-21.0,-4.0,0.0,1.0,-14.0,1.0,91.4,2.238,49,269.900959,1,0.812,11,11.714286
661,2020-08-01,36009,Cattaraugus County,-13.0,51.0,0.0,0.0,-13.0,0.0,146.27,4.348,79,58.177858,1,0.793,0,1.142857
831,2020-08-01,36011,Cayuga County,-6.0,8.0,0.0,55.0,-7.0,-1.0,156.89,3.552,85,110.726163,1,0.732,1,0.714286
1001,2020-08-01,36013,Chautauqua County,-5.0,20.0,0.0,0.0,-13.0,0.0,156.76,2.867,73,119.69384,1,0.703,3,2.142857
1171,2020-08-01,36015,Chemung County,-21.0,0.0,0.0,-67.0,-2.0,-1.0,95.11,3.664,62,204.875414,1,0.594,0,0.428571
1341,2020-08-01,36017,Chenango County,-8.0,30.0,0.0,0.0,-5.0,0.0,137.06,5.034,65,52.830843,1,0.735,2,0.857143
1511,2020-08-01,36019,Clinton County,-19.0,4.0,0.0,-35.0,-10.0,-2.0,103.6,2.287,61,77.549742,1,0.773,1,1.142857
1681,2020-08-01,36021,Columbia County,-10.0,3.0,0.0,24.0,-10.0,0.0,154.44,3.446,37,93.682154,1,0.848,3,2.714286


In [12]:
NY_combined_data.head(8)

Unnamed: 0,Date,fips_x,County Name,retail and recreation,grocery and pharmacy,parks,transit stations,workplaces,residential,driving,m50,m50_index,population_density,mask_rule_active,mask_wearing_percent,New cases,rolling_avg_new_cases
0,2020-03-01,36001,Albany County,10.0,13.0,19.0,4.0,7.0,-1.0,-4.87,2.616,54,584.364958,0,0.0,0,
1,2020-03-02,36001,Albany County,11.0,15.0,29.0,5.0,3.0,0.0,4.02,4.68,97,584.364958,0,0.0,0,
2,2020-03-03,36001,Albany County,8.0,15.0,41.0,8.0,3.0,-1.0,12.09,4.778,100,584.364958,0,0.0,0,
3,2020-03-04,36001,Albany County,7.0,8.0,6.0,2.0,3.0,0.0,11.97,4.486,93,584.364958,0,0.0,0,
4,2020-03-05,36001,Albany County,5.0,13.0,18.0,2.0,3.0,-1.0,18.74,5.129,107,584.364958,0,0.0,0,
5,2020-03-06,36001,Albany County,6.0,10.0,12.0,7.0,3.0,0.0,37.03,5.428,113,584.364958,0,0.0,0,
6,2020-03-07,36001,Albany County,12.0,12.0,78.0,11.0,6.0,-1.0,21.45,3.992,83,584.364958,0,0.0,0,0.0
7,2020-03-08,36001,Albany County,9.0,13.0,186.0,7.0,2.0,-1.0,-6.01,2.55,53,584.364958,0,0.0,0,0.0


## clean data - fill NA 

In [13]:
NY_combined_data = NY_combined_data.fillna(0)
NY_combined_data.head(8)

Unnamed: 0,Date,fips_x,County Name,retail and recreation,grocery and pharmacy,parks,transit stations,workplaces,residential,driving,m50,m50_index,population_density,mask_rule_active,mask_wearing_percent,New cases,rolling_avg_new_cases
0,2020-03-01,36001,Albany County,10.0,13.0,19.0,4.0,7.0,-1.0,-4.87,2.616,54,584.364958,0,0.0,0,0.0
1,2020-03-02,36001,Albany County,11.0,15.0,29.0,5.0,3.0,0.0,4.02,4.68,97,584.364958,0,0.0,0,0.0
2,2020-03-03,36001,Albany County,8.0,15.0,41.0,8.0,3.0,-1.0,12.09,4.778,100,584.364958,0,0.0,0,0.0
3,2020-03-04,36001,Albany County,7.0,8.0,6.0,2.0,3.0,0.0,11.97,4.486,93,584.364958,0,0.0,0,0.0
4,2020-03-05,36001,Albany County,5.0,13.0,18.0,2.0,3.0,-1.0,18.74,5.129,107,584.364958,0,0.0,0,0.0
5,2020-03-06,36001,Albany County,6.0,10.0,12.0,7.0,3.0,0.0,37.03,5.428,113,584.364958,0,0.0,0,0.0
6,2020-03-07,36001,Albany County,12.0,12.0,78.0,11.0,6.0,-1.0,21.45,3.992,83,584.364958,0,0.0,0,0.0
7,2020-03-08,36001,Albany County,9.0,13.0,186.0,7.0,2.0,-1.0,-6.01,2.55,53,584.364958,0,0.0,0,0.0


## Rolling window to compare every 2 weeks with next 2 weeks

We don't have data from March 2 - march 

In [14]:
NY_combined_data = NY_combined_data.sort_values(by=['Date'])
NY_combined_data.head()

Unnamed: 0,Date,fips_x,County Name,retail and recreation,grocery and pharmacy,parks,transit stations,workplaces,residential,driving,m50,m50_index,population_density,mask_rule_active,mask_wearing_percent,New cases,rolling_avg_new_cases
0,2020-03-01,36001,Albany County,10.0,13.0,19.0,4.0,7.0,-1.0,-4.87,2.616,54,584.364958,0,0.0,0,0.0
5270,2020-03-01,36067,Onondaga County,20.0,7.0,17.0,5.0,7.0,-1.0,-1.02,2.66,53,591.641722,0,0.0,0,0.0
1530,2020-03-01,36021,Columbia County,21.0,4.0,0.0,25.0,4.0,0.0,19.75,5.337,58,93.682154,0,0.0,0,0.0
10030,2020-03-01,36123,Yates County,31.0,33.0,0.0,0.0,7.0,0.0,28.14,5.004,72,73.676584,0,0.0,0,0.0
9520,2020-03-01,36117,Wayne County,20.0,12.0,0.0,0.0,7.0,-2.0,7.88,5.013,44,148.912773,0,0.0,0,0.0


In [58]:
NY_combined_data.tail(10)

Unnamed: 0,Date,fips_x,County Name,retail and recreation,grocery and pharmacy,parks,transit stations,workplaces,residential,driving,m50,m50_index,population_density,mask_rule_active,mask_wearing_percent,New cases,rolling_avg_new_cases
6629,2020-08-19,36081,Queens County,0.0,0.0,0.0,0.0,0.0,0.0,22.0,0.689,18,20767.142726,1,0.751,0,0.0
6459,2020-08-19,36079,Putnam County,0.0,0.0,0.0,0.0,0.0,0.0,56.16,7.098,61,426.90287,1,0.852,3,2.285714
6289,2020-08-19,36077,Otsego County,0.0,0.0,0.0,0.0,0.0,0.0,113.64,3.666,92,59.392034,1,0.779,2,0.571429
6119,2020-08-19,36075,Oswego County,0.0,0.0,0.0,0.0,0.0,0.0,117.49,5.173,66,123.07466,1,0.766,2,2.428571
5949,2020-08-19,36073,Orleans County,0.0,0.0,0.0,0.0,0.0,0.0,114.94,6.126,54,103.133466,1,0.693,1,0.714286
5779,2020-08-19,36071,Orange County,0.0,0.0,0.0,0.0,0.0,0.0,59.78,5.159,57,474.245094,1,0.746,13,13.0
5609,2020-08-19,36069,Ontario County,0.0,0.0,0.0,0.0,0.0,0.0,107.49,6.899,78,170.442654,1,0.849,0,1.142857
5439,2020-08-19,36067,Onondaga County,0.0,0.0,0.0,0.0,0.0,0.0,73.19,3.621,72,591.641722,1,0.756,16,14.142857
7479,2020-08-19,36091,Saratoga County,0.0,0.0,0.0,0.0,0.0,0.0,92.83,5.423,60,283.788489,1,0.751,0,4.571429
10199,2020-08-19,36123,Yates County,0.0,0.0,0.0,0.0,0.0,0.0,350.22,6.347,91,73.676584,1,0.884,0,0.285714


In [74]:
NY_combined_data[NY_combined_data['fips_x'] == 36081]

Unnamed: 0,Date,fips_x,County Name,retail and recreation,grocery and pharmacy,parks,transit stations,workplaces,residential,driving,m50,m50_index,population_density,mask_rule_active,mask_wearing_percent,New cases,rolling_avg_new_cases
6460,2020-03-01,36081,Queens County,3.0,1.0,14.0,-4.0,1.0,0.0,-2.82,1.339,35,20767.142726,0,0.000,0,0.0
6461,2020-03-02,36081,Queens County,4.0,5.0,25.0,-4.0,6.0,0.0,3.05,3.813,99,20767.142726,0,0.000,0,0.0
6462,2020-03-03,36081,Queens County,2.0,4.0,9.0,-3.0,4.0,0.0,5.10,3.747,98,20767.142726,0,0.000,0,0.0
6463,2020-03-04,36081,Queens County,7.0,8.0,28.0,-4.0,2.0,0.0,6.67,3.822,99,20767.142726,0,0.000,0,0.0
6464,2020-03-05,36081,Queens County,8.0,8.0,31.0,-2.0,3.0,0.0,10.04,4.052,106,20767.142726,0,0.000,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6625,2020-08-15,36081,Queens County,-26.0,-4.0,218.0,-39.0,-23.0,4.0,44.04,1.208,31,20767.142726,1,0.751,0,0.0
6626,2020-08-16,36081,Queens County,-38.0,-19.0,27.0,-51.0,-29.0,7.0,0.58,0.038,0,20767.142726,1,0.751,0,0.0
6627,2020-08-17,36081,Queens County,-23.0,-5.0,117.0,-49.0,-45.0,14.0,24.26,0.733,19,20767.142726,1,0.751,0,0.0
6628,2020-08-18,36081,Queens County,0.0,0.0,0.0,0.0,0.0,0.0,29.77,1.154,30,20767.142726,1,0.751,0,0.0


In [15]:
minDate = NY_combined_data['Date'].min()
print(minDate)

dateList = NY_combined_data['Date'].unique()

2020-03-01


In [16]:
import datetime

#dt_object = datetime.fromtimestamp(date)
day_name= ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday','Sunday']

day = datetime.datetime.strptime(minDate, '%Y-%m-%d').weekday()
if day == 6:
  day =0
elif day < 6:
  day += 1

print(day)

0


In [17]:
index = 0
count = 0
lst = []
while index < len(dateList):
  row = []
  j = 14 - day;
  # print(index,j)
  # print(dateList[index:index+j])
  weekRange = dateList[index:index+j]
  #print(weekRange)
  index = index+j -7
  day = 0
  count +=1
  row.append(count)
  row.append(weekRange.min())
  row.append(weekRange.max())
  print(row)
  lst.append(row)
  


[1, '2020-03-01', '2020-03-14']
[2, '2020-03-08', '2020-03-21']
[3, '2020-03-15', '2020-03-28']
[4, '2020-03-22', '2020-04-04']
[5, '2020-03-29', '2020-04-11']
[6, '2020-04-05', '2020-04-18']
[7, '2020-04-12', '2020-04-26']
[8, '2020-04-19', '2020-05-03']
[9, '2020-04-27', '2020-05-10']
[10, '2020-05-04', '2020-05-17']
[11, '2020-05-11', '2020-05-24']
[12, '2020-05-18', '2020-06-01']
[13, '2020-05-25', '2020-06-08']
[14, '2020-06-02', '2020-06-15']
[15, '2020-06-09', '2020-06-22']
[16, '2020-06-16', '2020-06-29']
[17, '2020-06-23', '2020-07-06']
[18, '2020-06-30', '2020-07-13']
[19, '2020-07-07', '2020-07-20']
[20, '2020-07-14', '2020-07-27']
[21, '2020-07-21', '2020-08-03']
[22, '2020-07-28', '2020-08-10']
[23, '2020-08-04', '2020-08-17']
[24, '2020-08-11', '2020-08-19']
[25, '2020-08-18', '2020-08-19']


In [18]:
week_df = pd.DataFrame(lst,index=None,columns=['WeekNumber','startDate','endDate'])
week_df.head(2)

Unnamed: 0,WeekNumber,startDate,endDate
0,1,2020-03-01,2020-03-14
1,2,2020-03-08,2020-03-21


## Slice data for the week dataframe

In [45]:

average_newCases_df = pd.DataFrame(columns=['County Name',  'fips_x',  'average_per_week','startDate','endDate'])

for index, row in week_df.iterrows():
    startDate, endDate = row['startDate'], row['endDate']
    df_index = (NY_combined_data['Date'] >= startDate) & (NY_combined_data['Date'] <= endDate)
    df_weekData =  NY_combined_data.loc[df_index]
    df_base = (
      df_weekData
      .pipe(lambda x: x.assign(gains_pctg=x["rolling_avg_new_cases"]))
      .groupby(['County Name','fips_x'])
      .agg({"gains_pctg": "mean"})
      .reset_index()
      .rename(columns={"gains_pctg": "average_per_week"})
    )
    df_base['startDate'] = startDate
    df_base['endDate'] = endDate
  
    #print(startDate,endDate,df_base['average_per_week'])
    average_newCases_df = average_newCases_df.append(df_base)


In [73]:
average_newCases_df[average_newCases_df['fips_x'] == 36081]

Unnamed: 0,County Name,fips_x,average_per_week,startDate,endDate
38,Queens County,36081,0.0,2020-03-01,2020-03-14
38,Queens County,36081,0.0,2020-03-08,2020-03-21
38,Queens County,36081,0.0,2020-03-15,2020-03-28
38,Queens County,36081,0.0,2020-03-22,2020-04-04
38,Queens County,36081,0.0,2020-03-29,2020-04-11
38,Queens County,36081,0.0,2020-04-05,2020-04-18
38,Queens County,36081,0.0,2020-04-12,2020-04-26
38,Queens County,36081,0.0,2020-04-19,2020-05-03
38,Queens County,36081,0.0,2020-04-27,2020-05-10
38,Queens County,36081,0.0,2020-05-04,2020-05-17


In [46]:
average_newCases_df.head(10)

Unnamed: 0,County Name,fips_x,average_per_week,startDate,endDate
0,Albany County,36001,0.112245,2020-03-01,2020-03-14
1,Allegany County,36003,0.0,2020-03-01,2020-03-14
2,Broome County,36007,0.020408,2020-03-01,2020-03-14
3,Cattaraugus County,36009,0.0,2020-03-01,2020-03-14
4,Cayuga County,36011,0.0,2020-03-01,2020-03-14
5,Chautauqua County,36013,0.0,2020-03-01,2020-03-14
6,Chemung County,36015,0.0,2020-03-01,2020-03-14
7,Chenango County,36017,0.0,2020-03-01,2020-03-14
8,Clinton County,36019,0.0,2020-03-01,2020-03-14
9,Columbia County,36021,0.0,2020-03-01,2020-03-14


## Finding the % growth or reduction in 'new cases' on every 2 week average

In [61]:
average_newCases_df_ = average_newCases_df.copy()

In [62]:
average_newCases_df = average_newCases_df.sort_values(by=['startDate'])

print(average_newCases_df.shape)
temp = average_newCases_df.groupby(['fips_x','startDate'])['average_per_week']
temp = temp.sum().diff().reset_index()

print(temp.shape)

average_newCases_df2 = average_newCases_df.merge(temp,on=['fips_x','startDate'])
print(average_newCases_df2.shape)

(1500, 5)
(1500, 3)
(1500, 6)


In [64]:
average_newCases_df2[average_newCases_df2['fips_x'] == 36071].head(5)

Unnamed: 0,County Name,fips_x,average_per_week_x,startDate,endDate,average_per_week_y
2,Orange County,36071,0.112245,2020-03-01,2020-03-14,-1.102041
62,Orange County,36071,3.693878,2020-03-08,2020-03-21,3.581633
128,Orange County,36071,46.153061,2020-03-15,2020-03-28,42.459184
189,Orange County,36071,130.234694,2020-03-22,2020-04-04,84.081633
251,Orange County,36071,242.632653,2020-03-29,2020-04-11,112.397959


In [53]:
average_newCases_df2.head()

Unnamed: 0,County Name,fips_x,average_per_week_x,startDate,endDate,average_per_week_y
0,Albany County,36001,0.112245,2020-03-01,2020-03-14,
1,Schoharie County,36095,0.0,2020-03-01,2020-03-14,-7.785714
2,Broome County,36007,0.020408,2020-03-01,2020-03-14,0.020408
3,Cattaraugus County,36009,0.0,2020-03-01,2020-03-14,-5.214286
4,Cayuga County,36011,0.0,2020-03-01,2020-03-14,-0.642857


In [54]:
average_newCases_df2 = average_newCases_df2.fillna(0)

In [55]:
average_newCases_df2.head()

Unnamed: 0,County Name,fips_x,average_per_week_x,startDate,endDate,average_per_week_y
0,Albany County,36001,0.112245,2020-03-01,2020-03-14,0.0
1,Schoharie County,36095,0.0,2020-03-01,2020-03-14,-7.785714
2,Broome County,36007,0.020408,2020-03-01,2020-03-14,0.020408
3,Cattaraugus County,36009,0.0,2020-03-01,2020-03-14,-5.214286
4,Cayuga County,36011,0.0,2020-03-01,2020-03-14,-0.642857


In [69]:
average_newCases_df2['percent_newcases'] = average_newCases_df2.apply(lambda x: (x.average_per_week_y) / x.average_per_week_x if x.average_per_week_x > 0 else 0 , axis=1)


average_newCases_df2.head()



Unnamed: 0,County Name,fips_x,average_per_week_x,startDate,endDate,average_per_week_y,percent_newcases
0,Albany County,36001,0.112245,2020-03-01,2020-03-14,,
1,Erie County,36029,0.030612,2020-03-01,2020-03-14,-11.683673,-381.666667
2,Orange County,36071,0.112245,2020-03-01,2020-03-14,-1.102041,-9.818182
3,Orleans County,36073,0.0,2020-03-01,2020-03-14,-12.857143,0.0
4,Oswego County,36075,0.0,2020-03-01,2020-03-14,-0.642857,0.0


In [72]:
average_newCases_df2[average_newCases_df2['County Name'] == 'Queens County']

Unnamed: 0,County Name,fips_x,average_per_week_x,startDate,endDate,average_per_week_y,percent_newcases
7,Queens County,36081,0.0,2020-03-01,2020-03-14,-2.142857,0.0
65,Queens County,36081,0.0,2020-03-08,2020-03-21,0.0,0.0
133,Queens County,36081,0.0,2020-03-15,2020-03-28,0.0,0.0
185,Queens County,36081,0.0,2020-03-22,2020-04-04,0.0,0.0
249,Queens County,36081,0.0,2020-03-29,2020-04-11,0.0,0.0
300,Queens County,36081,0.0,2020-04-05,2020-04-18,0.0,0.0
371,Queens County,36081,0.0,2020-04-12,2020-04-26,0.0,0.0
426,Queens County,36081,0.0,2020-04-19,2020-05-03,0.0,0.0
493,Queens County,36081,0.0,2020-04-27,2020-05-10,0.0,0.0
551,Queens County,36081,0.0,2020-05-04,2020-05-17,0.0,0.0


# unused code


In [None]:
agg_gains_df = (
    average_newCases_df2
    .pipe(lambda x: x.assign(gains_pctg=x.average_per_week_y/x.average_per_week_x))
    .groupby(['County Name', 'fips_x','startDate'])
    .agg({"gains_pctg": "mean"})
    .reset_index()
    .rename(columns={"gains_pctg": "weekly_growth"})
)

In [None]:
week_df.head()

In [None]:
NY_combined_data.dtypes

In [None]:
start_date_df = (
    NY_combined_data
    .merge(week_df, left_on=['Date'],right_on=['startDate'], how='inner')
    .pipe(lambda x: x.assign(date=x.Date))
    .rename(columns={"starting_rolling_avg_new_cases": "rolling_avg_new_cases"})
    .reset_index(drop=True)

    [['Date','fips_x','County Name','retail and recreation','grocery and pharmacy','parks','transit stations','workplaces','residential','driving','m50','m50_index','population_density','mask_rule_active','mask_wearing_percent','New cases','startDate','WeekNumber','rolling_avg_new_cases']]
    .pivot_table(values='rolling_avg_new_cases', columns='WeekNumber', index='County Name', aggfunc='sum')
    .rename_axis(None, axis=1)
    .reset_index()
)

start_date_df

In [None]:
end_date_df = (
    NY_combined_data
    .merge(week_df, on='Date', how='inner')
    .pipe(lambda x: x.assign(date=x.Date))
    .rename(columns={"Confirmed": "end_date_Confirmed"})
    .reset_index(drop=True)

    [['County Name', 'fips_x', 'end_date_Confirmed','endDate','WeekNumber']]
    .pivot_table(values='end_date_Confirmed', columns='WeekNumber', index='County Name', aggfunc='sum')
    .rename_axis(None, axis=1)
    .reset_index()
)