<a href="https://colab.research.google.com/github/aarsanjani/meansquares/blob/master/CA_LabelingData.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Workflow

This Colab notebook labels the CA data as super-spread, safe, or improvement weeks, according to whether the 'number of cases' increases or decreases from week to week.

Idea: 
* compute 7-day moving average
* 14 days data compare with next 14 days rolling window
* split the % as buckets 
* label the data as per bucket

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd

from tqdm import tqdm
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import os
from pandas import Series, datetime
from pandas.plotting import scatter_matrix, autocorrelation_plot
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, TimeSeriesSplit
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import roc_curve, auc
import random
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima_model import ARIMA
from xgboost import XGBClassifier
from sklearn.mixture import GaussianMixture

In [None]:
os.environ["TF_CPP_MIN_LOG_LEVEL"]="2"
import warnings; warnings.simplefilter('ignore')


In [None]:
!ls  '/content/drive/Shared drives/CMPE 295- Master Project/Covid19-data/'

# 1.1 Loading CA data

In [None]:
CA_combined_data = pd.read_csv('/content/drive/Shared drives/CMPE 295- Master Project/Covid19-data/CA_combined_Sep25.csv',low_memory=False)
CA_combined_data.head()

## Computing rolling average

In [None]:
CA_combined_data.iloc[:,15]

In [None]:
df = pd.DataFrame(columns=['Date','fips_x','County Name','retail and recreation','grocery and pharmacy','parks','transit stations','workplaces','residential','driving','m50','m50_index','population_density','mask_rule_active','mask_wearing_percent','New cases','rolling_avg_new_cases'])


In [None]:
CA_counties = CA_combined_data['County Name'].unique()

# Compute a 7-row rolling mean separately for every county so that a window
# never mixes rows belonging to two different counties.
county_frames = []
for county in CA_counties:
  # .copy() gives an independent frame, so adding the new column does not
  # write into a filtered view of CA_combined_data (SettingWithCopyWarning).
  county_data = CA_combined_data[CA_combined_data['County Name'] == county].copy()
  # NOTE(review): column position 15 is presumably 'New cases', and the rows
  # are presumably already in chronological order per county -- confirm
  # against the CSV, since the window is taken over rows in file order.
  county_data['rolling_avg_new_cases'] = county_data.iloc[:,15].rolling(window=7).mean()
  county_frames.append(county_data)

# Concatenate once at the end: DataFrame.append copied the whole accumulated
# frame on every iteration and was removed in pandas 2.0. `df` (the empty
# template frame) stays first so its column order is preserved.
df = pd.concat([df] + county_frames)

df.head()

In [None]:
print(df.shape)
CA_combined_data.shape

In [None]:
CA_combined_data = df.copy()
CA_combined_data

In [None]:
CA_combined_data[CA_combined_data['Date'] == '2020-08-01']

In [None]:
CA_combined_data.head(8)

## clean data - fill NA 

In [None]:
CA_combined_data = CA_combined_data.fillna(0)
CA_combined_data.head(8)

In [None]:
CA_combined_data['County Name'].unique()

## verifying Rolling average with visualizations -Los Angeles County

In [None]:
los_angeles_county = CA_combined_data[CA_combined_data['County Name'] == 'Los Angeles County']
print(los_angeles_county.shape)
los_angeles_county.head()

In [None]:
fig,ax = plt.subplots()
# Left axis: rolling average of new cases, drawn in blue.
ax.plot(los_angeles_county.Date, los_angeles_county['rolling_avg_new_cases'], color="blue")
ax.set_xlabel("Date",fontsize=14)
ax.set_ylabel("rolling_avg_new_cases",color="blue",fontsize=14)
# Right axis (shares the x-axis): the 'New cases' column, drawn in red.
ax2=ax.twinx()
ax2.plot(los_angeles_county.Date, los_angeles_county["New cases"],color="red")
# The axis label uses the same colour as the series it describes; it was
# previously blue, which made both axes look like they belonged to the blue line.
ax2.set_ylabel("New cases",color="red",fontsize=14)
plt.title("'rolling_avg_new_cases' Vs New cases")

plt.show()

In [None]:
suffolk_county = CA_combined_data[CA_combined_data['County Name'] == 'Santa Clara County']

In [None]:
fig,ax = plt.subplots()
# NOTE: despite its name, `suffolk_county` holds the 'Santa Clara County' rows.
# Left axis: rolling average of new cases, drawn in blue.
ax.plot(suffolk_county.Date, suffolk_county['rolling_avg_new_cases'], color="blue")
ax.set_xlabel("Date",fontsize=14)
ax.set_ylabel("rolling_avg_new_cases",color="blue",fontsize=14)
# Right axis (shares the x-axis): the 'New cases' column, drawn in red.
ax2=ax.twinx()
ax2.plot(suffolk_county.Date, suffolk_county["New cases"],color="red")
# The axis label uses the same colour as the series it describes; it was
# previously blue, which made both axes look like they belonged to the blue line.
ax2.set_ylabel("New cases",color="red",fontsize=14)
plt.title("'rolling_avg_new_cases' Vs New cases")

plt.show()

## Rolling window to compare every 2 weeks with next 2 weeks

In [None]:
CA_combined_data = CA_combined_data.sort_values(by=['Date'])
CA_combined_data.head()

In [None]:
CA_combined_data[CA_combined_data['fips_x'] == 6037]

In [None]:
minDate = CA_combined_data['Date'].min()
print(minDate)

dateList = CA_combined_data['Date'].unique()

In [None]:
import datetime

# Names in the order used by date.weekday() (0 = Monday ... 6 = Sunday).
# Kept for reference; not used by the computation below.
day_name= ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday','Sunday']

# Re-base the weekday of the earliest date so that Sunday becomes 0 and
# Saturday becomes 6 (weekday() itself counts from Monday = 0).
first_weekday = datetime.datetime.strptime(minDate, '%Y-%m-%d').weekday()
day = (first_weekday + 1) % 7

print(day)

In [None]:
# Split dateList into consecutive chunks and record, for each chunk, a running
# number plus its smallest and largest date. The result is collected in `lst`
# as [number, startDate, endDate] rows.
#
# The first chunk holds (14 - day) entries, where `day` is the Sunday-based
# weekday offset of the earliest date; every later chunk holds 14 entries
# because `day` is reset to 0 inside the loop. The final chunk may be shorter.
#
# NOTE(review): chunks are taken by position in dateList, not by calendar
# arithmetic, so they only span exactly 14 calendar days if dateList is sorted
# and has no missing dates -- confirm for this dataset.
index = 0
count = 0
lst = []
while index < len(dateList):
  row = []
  # Number of dateList entries in this chunk.
  j = 14 - day;
  # print(index,j)
  # print(dateList[index:index+j])
  weekRange = dateList[index:index+j]
  #print(weekRange)
  index = index+j 
  # Only the first chunk is shortened by the weekday offset.
  day = 0
  count +=1
  row.append(count)
  # Dates are compared via min()/max() on the slice values.
  row.append(weekRange.min())
  row.append(weekRange.max())
  print(row)
  lst.append(row)
  


In [None]:
week_df = pd.DataFrame(lst,index=None,columns=['WeekNumber','startDate','endDate'])
week_df.head()

## Slice data for the week dataframe

In [None]:

average_newCases_df = pd.DataFrame(columns=['County Name',  'fips_x',  'average_per_week','startDate','endDate'])

# For every period in week_df, average 'rolling_avg_new_cases' per county over
# the rows whose Date falls inside [startDate, endDate] (both ends inclusive).
# Dates are compared as values of the 'Date' column; with 'YYYY-MM-DD' strings
# this ordering matches chronological order.
period_frames = []
for index, row in week_df.iterrows():
    startDate, endDate = row['startDate'], row['endDate']
    df_index = (CA_combined_data['Date'] >= startDate) & (CA_combined_data['Date'] <= endDate)
    df_weekData =  CA_combined_data.loc[df_index]
    # One row per (County Name, fips_x) holding the mean rolling average.
    df_base = (
      df_weekData
      .groupby(['County Name','fips_x'])['rolling_avg_new_cases']
      .mean()
      .reset_index()
      .rename(columns={"rolling_avg_new_cases": "average_per_week"})
    )
    df_base['startDate'] = startDate
    df_base['endDate'] = endDate

    period_frames.append(df_base)

# Concatenate once: DataFrame.append re-copied the accumulated frame on every
# iteration and was removed in pandas 2.0. The empty template frame stays
# first so the column order is unchanged.
average_newCases_df = pd.concat([average_newCases_df] + period_frames)


In [None]:
average_newCases_df.shape

In [None]:
average_newCases_df[average_newCases_df['fips_x'] == 6037]

## Verifying average per week data Eg., Los Angeles County

In [None]:
los_angeles_county[(los_angeles_county['Date'] >= '2020-03-01') & (los_angeles_county['Date'] <= '2020-03-14') ]['rolling_avg_new_cases']



In [None]:
average_newCases_df.head(10)

## Finding the % growth or reduction in 'new cases' on every 2 week average

In [None]:
average_newCases_df_ = average_newCases_df.copy()

In [None]:
albany_avg_newCases = average_newCases_df[average_newCases_df['County Name'] == 'Los Angeles County']
albany_avg_newCases

In [None]:
# average_newCases_df = average_newCases_df.sort_values(by=['startDate','endDate'])

# print(average_newCases_df.shape)
# temp = average_newCases_df.groupby(['County Name','fips_x','startDate','endDate'])['average_per_week']
# temp = temp.sum().diff().reset_index()
# print("-------------")
# print(temp)
# print(temp.shape)

# average_newCases_df2 = average_newCases_df.merge(temp,on=['County Name','fips_x','startDate','endDate'])
# print(average_newCases_df2.shape)

In [None]:
average_newCases_df.head()

In [None]:
# average_newCases_df = average_newCases_df.sort_values(by=['startDate','endDate'])

# print(average_newCases_df.shape)
# temp = average_newCases_df.groupby(['County Name','fips_x','startDate','endDate'])['average_per_week']
# temp = temp.sum().diff().reset_index()
# print("-------------")
# print(temp)
# print(temp.shape)

# average_newCases_df2 = average_newCases_df.merge(temp,on=['County Name','fips_x','startDate','endDate'])
# print(average_newCases_df2.shape)

In [None]:
average_newCases_df2 = average_newCases_df.copy()

average_newCases_df2 = average_newCases_df2.reset_index(drop=True)
average_newCases_df2.head()

In [None]:
average_newCases_df2['diff_avg_x'] = average_newCases_df2.sort_values(['startDate','endDate']).groupby(['County Name', 'fips_x'])['average_per_week'].diff().fillna(0)

print(average_newCases_df2.shape)
average_newCases_df2.head()


In [None]:
average_newCases_df2[average_newCases_df2['County Name'] == 'Santa Clara County']

In [None]:
average_newCases_df2[average_newCases_df2['County Name'] == 'Alameda County']

In [None]:
average_newCases_df2[average_newCases_df2['fips_x'] == 6037].head(5)

In [None]:
average_newCases_df2.head()

In [None]:
average_newCases_df2 = average_newCases_df2.fillna(0)

In [None]:
average_newCases_df2.head()

In [None]:
#not working
#average_newCases_df2['pct_change_newcases'] = average_newCases_df2.sort_values(['startDate','endDate']).groupby(['County Name', 'fips_x','startDate','endDate']).average_per_week_y.pct_change()

Shift the `average_per_week` column by one period within each county, so that each row also carries the previous period's value. This previous value is needed for the percent-change formula.

In [None]:
average_newCases_df2['shift_avg_x'] = average_newCases_df2.sort_values(['startDate','endDate']).groupby(['County Name', 'fips_x'])['average_per_week'].shift()

In [None]:
average_newCases_df2.groupby(by=['County Name', 'fips_x','startDate','endDate']).sum()

In [None]:
average_newCases_df2[average_newCases_df2['County Name'] == 'Los Angeles County']

In [None]:
# Relative change versus the previous period of the same county:
# diff_avg_x / shift_avg_x. The result is a fraction (1.0 means +100%), it is
# not multiplied by 100.
# Rows whose previous average is missing (first period of a county) or not
# greater than 0 get 0, which also avoids dividing by zero.
# NOTE(review): a county going from 0 to any positive average is therefore
# recorded as "no change" -- confirm this is intended.
average_newCases_df2['percent_newcases'] = average_newCases_df2.apply(lambda x: (x.diff_avg_x) / x.shift_avg_x if x.shift_avg_x > 0.0 else 0 , axis=1)


average_newCases_df2.tail()


In [None]:
average_newCases_df2[average_newCases_df2['County Name'] == 'Los Angeles County']

# 1.2 Labeling data as per % change

## Validation

In [None]:
test_ca_combined = average_newCases_df2.copy()

In [None]:
test_ca_combined[(test_ca_combined['fips_x']== 6001) & (test_ca_combined['startDate']=='2020-08-18')]

## Actual Work

In [None]:
average_newCases_df2.describe()

In [None]:
los_angeles_county = average_newCases_df2[average_newCases_df2['County Name'] == 'Los Angeles County']

In [None]:
fig,ax = plt.subplots()
# make a plot
ax.plot(los_angeles_county.startDate, los_angeles_county['percent_newcases'], color="blue")
# set x-axis label
ax.set_xlabel("Date",fontsize=14)
# set y-axis label
ax.set_ylabel("percent_newcases",color="blue",fontsize=14)
# ax2=ax.twinx()
# make a plot with different y-axis using second axis object
# ax2.plot(los_angeles_county.startDate, los_angeles_county["percent_newcases"],color="red")
# ax2.set_ylabel("New cases",color="blue",fontsize=14)
plt.title("'percent_newcases'")

plt.show()

In [None]:
data = average_newCases_df2['percent_newcases'].to_list()

print(data)


In [None]:
average_newCases_df3 = average_newCases_df2.copy().reset_index(drop=True)

In [None]:
average_newCases_df3.head()

In [None]:
new_cases_list = average_newCases_df2['percent_newcases'].values

## Determining growth label for covid cases

#### Getting diff array

In [None]:
len(new_cases_list)

In [None]:
# np.diff returns the differences between consecutive entries of
# new_cases_list, i.e. an array one element shorter than its input.
# NOTE(review): new_cases_list is taken in the row order of
# average_newCases_df2, which appears to be period-major (all counties of one
# period, then the next period). If so, consecutive entries belong to
# different counties and these differences are not a per-county change over
# time -- confirm.
diffArr = np.diff(new_cases_list)
print(len(diffArr))
diffArr[0]

In [None]:
growth_values =[]
growth_values.append(0.0) # making first value as 0, as np.diff leaves first values as NA
print('growth_values : ',len(growth_values))
growth_values.extend(diffArr)
print('growth_values : ',len(growth_values))


In [None]:
len(growth_values)

In [None]:
# Change of 'percent_newcases' from one period to the next, computed within
# each county. The previous version assigned np.diff over the whole frame,
# which subtracts whatever row happens to precede in frame order (a different
# county) rather than the same county's previous period.
# The first period of every county has no predecessor and is set to 0.
# Assignment aligns on the frame index, so the temporary sort does not
# reorder average_newCases_df2 itself.
average_newCases_df2['growth_per_day'] = (
  average_newCases_df2
  .sort_values(['startDate','endDate'])
  .groupby(['County Name', 'fips_x'])['percent_newcases']
  .diff()
  .fillna(0)
)

In [None]:
average_newCases_df2.dtypes

### Checking mean, std dev and variance for growth data

In [None]:
# Summary statistics of all consecutive differences in diffArr.
stdDev = np.std(diffArr)
variance = np.var(diffArr)
mean = np.mean(diffArr)
# Stored as min_value / max_value so the built-in min() and max() are not
# overwritten for the rest of the notebook session.
min_value = np.min(diffArr)
max_value = np.max(diffArr)


print("min ",min_value)
print("max ",max_value)

print("mean ",mean)
print("std. deviation ",stdDev)
print("variance ",variance)

#### Getting positive cases

In [None]:
print('length diffArr: ',len(diffArr))

# Keep only the strictly positive differences (zeros and negatives dropped).
non_zero_pos_diff = [value for value in diffArr if value > 0]

print('non_zero_pos_diff length: ',len(non_zero_pos_diff))

In [None]:
# Summary statistics of the strictly positive differences only.
stdDev = np.std(non_zero_pos_diff)
variance = np.var(non_zero_pos_diff)
mean = np.mean(non_zero_pos_diff)
# Stored as min_value / max_value so the built-in min() and max() are not
# overwritten for the rest of the notebook session.
min_value = np.min(non_zero_pos_diff)
max_value = np.max(non_zero_pos_diff)


print("min ",min_value)
print("max ",max_value)

print("mean ",mean)
print("std. deviation ",stdDev)
print("variance ",variance)

#### Getting negative cases

In [None]:
print('length diffArr: ',len(diffArr))

# Keep only the strictly negative differences (zeros and positives dropped).
non_zero_neg_diff = [value for value in diffArr if value < 0]
print("non_zero_neg_diff length : ",len(non_zero_neg_diff))


In [None]:
# Summary statistics of the strictly negative differences only.
stdDev = np.std(non_zero_neg_diff)
variance = np.var(non_zero_neg_diff)
mean = np.mean(non_zero_neg_diff)
# Stored as min_value / max_value so the built-in min() and max() are not
# overwritten for the rest of the notebook session.
min_value = np.min(non_zero_neg_diff)
max_value = np.max(non_zero_neg_diff)


print("min ",min_value)
print("max ",max_value)

print("mean ",mean)
print("std. deviation ",stdDev)
print("variance ",variance)

### Trying natural breaks

In [None]:
!pip install jenkspy


In [None]:
import jenkspy

In [None]:
# Jenks natural breaks, computed separately for the positive and the negative
# differences, with two classes each. Each call returns the list of break
# values that is printed below.
# NOTE(review): newer jenkspy releases appear to have renamed the `nb_class`
# keyword to `n_classes`; since jenkspy is installed unpinned above, confirm
# which keyword the installed version accepts.
pos_breaks = jenkspy.jenks_breaks(non_zero_pos_diff, nb_class=2)
print(pos_breaks)
neg_breaks = jenkspy.jenks_breaks(non_zero_neg_diff, nb_class=2)
print(neg_breaks)

##### **Note**: 
From the positive and negative breaks, the growth values are grouped into the following ranges:
* [ -0.012359, 0.000374 ] - flat growth
* ( 0.000374, 21.813186 ] - mild growth
* ( 21.813186, 95.421053 ] - drastic growth
* [ -21.813186, -0.012359 ) - mild decrement
* [ -95.421053, -21.813186 ) - drastic decrement

In [None]:
# original range for CA combined data

'''
def determine_growth_label(x):
  if x >= -0.012359 and x <= 0.000374:
    return 'flat_growth'
  if x > 0.000374 and x <= 21.813186:
    return 'mild_growth'
  if x > 21.813186 and x <= 95.421053:
    return 'drastic_growth'
  if x >= -21.813186  and x <= -0.012359:
    return 'mild_decrement'
  if x >= -95.421053 and x < -21.813186:
    return 'drastic_decrement'
'''


In [None]:
# temporarily using NY combined data range for CA
def determine_growth_label(x):
  """Map a growth value to one of five growth labels.

  The cut points are the hard-coded break values below. The intervals are
  contiguous, so every finite number receives a label:

    x < -39.5                    -> 'drastic_decrement'
    -39.5 <= x < -0.002109       -> 'mild_decrement'
    -0.002109 <= x <= 0.00011    -> 'flat_growth'
    0.00011 < x <= 39.5          -> 'mild_growth'
    x > 39.5                     -> 'drastic_growth'

  Previously the two outer classes were capped (at 410.1818 and -410.18), so
  any value beyond those caps silently fell through and returned None.

  Args:
    x: numeric growth value.

  Returns:
    The label as a string, or None when x is NaN.
  """
  # NaN compares False with everything; return None explicitly instead of
  # letting it fall into the last branch.
  if x != x:
    return None
  if x < -39.5:
    return 'drastic_decrement'
  if x < -0.002109:
    return 'mild_decrement'
  if x <= 0.00011:
    return 'flat_growth'
  if x <= 39.5:
    return 'mild_growth'
  return 'drastic_growth'

In [None]:
average_newCases_df2['growth_label'] = average_newCases_df2['growth_per_day'].apply(lambda x: determine_growth_label(x) )

In [None]:
average_newCases_df2.head()

In [None]:
average_newCases_df2_growth0 = average_newCases_df2[average_newCases_df2['growth_label'] == 'drastic_decrement']
average_newCases_df2_growth1 = average_newCases_df2[average_newCases_df2['growth_label'] == 'drastic_growth']
average_newCases_df2_growth2 = average_newCases_df2[average_newCases_df2['growth_label'] == 'flat_growth']
average_newCases_df2_growth3 = average_newCases_df2[average_newCases_df2['growth_label'] == 'mild_decrement']
average_newCases_df2_growth4 = average_newCases_df2[average_newCases_df2['growth_label'] == 'mild_growth']

print(average_newCases_df2_growth0.shape)
print(average_newCases_df2_growth1.shape)
print(average_newCases_df2_growth2.shape)
print(average_newCases_df2_growth3.shape)
print(average_newCases_df2_growth4.shape)

## Jenks Natural breaks 

In [None]:
breaks = jenkspy.jenks_breaks(average_newCases_df2['percent_newcases'], nb_class=3)
print(breaks)

In [None]:
breaks = jenkspy.jenks_breaks(average_newCases_df2['percent_newcases'], nb_class=8)
print(breaks)

### Labeling criteria

We have applied Jenks natural breaks to label the county data according to the percent change in cases.

Using the natural breaks algorithm, we have split the data as follows:

* -1.136 to 3.642 is **label 1** (least chance of spreading) 

* 3.642 to 13.625 is **label 2** (Below threshold but chances to reach super spread level)

* 13.625 to 410.181818 is **label 3** (Super spread week)

In [None]:
# Assign each row to one of the 8 Jenks classes delimited by `breaks`.
# pd.cut intervals are right-closed and, by default, open on the left, so a
# value equal to the first break would fall outside every bin and be labelled
# NaN. include_lowest=True closes the first interval on the left so the
# smallest value is placed in 'LessSpread'.
average_newCases_df2['labels'] = pd.cut(average_newCases_df2['percent_newcases'],
                        bins=breaks,
                        labels=['LessSpread','Spread', 'SuperSpread1','SuperSpread2','SuperSpread3','SuperSpread4','SuperSpread5','SuperSpread6'],
                        include_lowest=True)

In [None]:
print('Number of records in LessSpread:',average_newCases_df2[average_newCases_df2['labels'] == 'LessSpread']['County Name'].count())
print('Number of records in Spread:',average_newCases_df2[average_newCases_df2['labels'] == 'Spread']['County Name'].count())
print('Number of records in SuperSpread:',average_newCases_df2[average_newCases_df2['labels'] == 'SuperSpread1']['County Name'].count())
print('Number of records in SuperSpread2:',average_newCases_df2[average_newCases_df2['labels'] == 'SuperSpread2']['County Name'].count())


print('Number of records in SuperSpread3:',average_newCases_df2[average_newCases_df2['labels'] == 'SuperSpread3']['County Name'].count())

print('Number of records in SuperSpread4:',average_newCases_df2[average_newCases_df2['labels'] == 'SuperSpread4']['County Name'].count())

print('Number of records in SuperSpread5:',average_newCases_df2[average_newCases_df2['labels'] == 'SuperSpread5']['County Name'].count())

print('Number of records in SuperSpread6:',average_newCases_df2[average_newCases_df2['labels'] == 'SuperSpread6']['County Name'].count())




In [None]:
import jenkspy

breaks = jenkspy.jenks_breaks(average_newCases_df2['percent_newcases'], nb_class=4)
print(breaks)

In [None]:
# Assign each row to one of the 4 Jenks classes delimited by `breaks`.
# pd.cut intervals are right-closed and, by default, open on the left, so a
# value equal to the first break would fall outside every bin and be labelled
# NaN. include_lowest=True closes the first interval on the left so the
# smallest value is placed in 'LessSpread'.
average_newCases_df2['labels'] = pd.cut(average_newCases_df2['percent_newcases'],
                        bins=breaks,
                        labels=['LessSpread','Spread', 'SuperSpread','SuperSpread2'],
                        include_lowest=True)

In [None]:
print('Number of records in LessSpread:',average_newCases_df2[average_newCases_df2['labels'] == 'LessSpread']['County Name'].count())
print('Number of records in Spread:',average_newCases_df2[average_newCases_df2['labels'] == 'Spread']['County Name'].count())
print('Number of records in SuperSpread:',average_newCases_df2[average_newCases_df2['labels'] == 'SuperSpread']['County Name'].count())

print('Number of records in SuperSpread:',average_newCases_df2[average_newCases_df2['labels'] == 'SuperSpread2']['County Name'].count())

In [None]:
average_newCases_df2.head()

# 1.3 Labeling data for CA combined

In [None]:
CA_combined_data.head(2)

In [None]:
CA_combined_data_ = CA_combined_data.copy()

In [None]:
CA_combined_data = CA_combined_data_.copy().reset_index(drop=True)

In [None]:
#average_newCases_df = pd.DataFrame(columns=['County Name',  'fips_x',  'average_per_week','startDate','endDate'])

# Copy the per-(county, period) labels onto every daily row of
# CA_combined_data that belongs to that county (matched on fips_x) and whose
# Date lies inside the period [startDate, endDate], both ends inclusive.
# The 'label' and 'growth_label' columns are created on first assignment.
for index, row in (average_newCases_df2.iterrows()):
    startDate, endDate = row['startDate'], row['endDate']
    fips = row['fips_x']
    df_index = (CA_combined_data['fips_x'] == fips) & (CA_combined_data['Date'] >= startDate) & (CA_combined_data['Date'] <= endDate)
    # The mask is used directly; the previous version also materialised the
    # matching rows into a frame on every iteration without ever reading it.
    CA_combined_data.loc[df_index,'label'] = row['labels']
    CA_combined_data.loc[df_index,'growth_label'] = row['growth_label']




In [None]:
CA_combined_data.head()

In [None]:
CA_combined_data[CA_combined_data['label'] == 'SuperSpread2'].sort_values(by=['fips_x','Date']).count()

In [None]:
CA_combined_data.shape

In [None]:
CA_combined_data['label'] = CA_combined_data['label'].apply(lambda x: x if x != 'SuperSpread2' else 'SuperSpread')

In [None]:
CA_combined_data.dtypes

In [None]:
CA_combined_data[CA_combined_data['label'] == 'LessSpread'].count()

In [None]:
CA_combined_data[CA_combined_data['growth_label'] == 'flat_growth'].count()

## Computing New Cases per 1k Population

Formula: 'New Cases/1k population' = (New Cases * 1000)/population

In [None]:
CA_combined_data['New Cases/1k population'] = CA_combined_data['New cases']*1000/ CA_combined_data['population']
CA_combined_data.tail(2)

In [None]:
new_cases_10k_list = CA_combined_data['New Cases/1k population'].values

In [None]:
# Differences between consecutive entries of new_cases_10k_list (which holds
# the 'New Cases/1k population' values despite the "10k" in its name).
# NOTE(review): the values are taken in the row order of CA_combined_data,
# which was sorted by Date earlier in the notebook, so consecutive rows
# presumably belong to different counties rather than to consecutive days of
# one county -- confirm.
diffArr = np.diff(new_cases_10k_list)
print(len(diffArr))
diffArr[0]

# growth_values is diffArr with a leading 0.0 so that it has one entry per row.
# NOTE(review): growth_values is not read anywhere in the visible notebook.
growth_values =[]
growth_values.append(0.0) # np.diff returns one element fewer than its input; pad the first position with 0
print('growth_values : ',len(growth_values))
growth_values.extend(diffArr)
print('growth_values : ',len(growth_values))

In [None]:
# Summary statistics of all consecutive differences in diffArr.
stdDev = np.std(diffArr)
variance = np.var(diffArr)
mean = np.mean(diffArr)
# Stored as min_value / max_value so the built-in min() and max() are not
# overwritten for the rest of the notebook session.
min_value = np.min(diffArr)
max_value = np.max(diffArr)


print("min ",min_value)
print("max ",max_value)

print("mean ",mean)
print("std. deviation ",stdDev)
print("variance ",variance)

## Getting positive cases

In [None]:
print('length diffArr: ',len(diffArr))

# Keep only the strictly positive differences (zeros and negatives dropped).
non_zero_pos_diff = [value for value in diffArr if value > 0]

print('non_zero_pos_diff length: ',len(non_zero_pos_diff))


# Summary statistics of the strictly positive differences.
stdDev = np.std(non_zero_pos_diff)
variance = np.var(non_zero_pos_diff)
mean = np.mean(non_zero_pos_diff)
# Stored as min_value / max_value so the built-in min() and max() are not
# overwritten for the rest of the notebook session.
min_value = np.min(non_zero_pos_diff)
max_value = np.max(non_zero_pos_diff)


print("min ",min_value)
print("max ",max_value)

print("mean ",mean)
print("std. deviation ",stdDev)
print("variance ",variance)

## Getting negative cases


In [None]:
# Keep only the strictly negative differences (zeros and positives dropped).
non_zero_neg_diff = [value for value in diffArr if value < 0]
print("non_zero_neg_diff length : ",len(non_zero_neg_diff))

# Summary statistics of the strictly negative differences.
stdDev = np.std(non_zero_neg_diff)
variance = np.var(non_zero_neg_diff)
mean = np.mean(non_zero_neg_diff)
# Stored as min_value / max_value so the built-in min() and max() are not
# overwritten for the rest of the notebook session.
min_value = np.min(non_zero_neg_diff)
max_value = np.max(non_zero_neg_diff)


print("min ",min_value)
print("max ",max_value)

print("mean ",mean)
print("std. deviation ",stdDev)
print("variance ",variance)


In [None]:
CA_combined_data.head()

## Applying natural breaks

In [None]:
pos_breaks = jenkspy.jenks_breaks(non_zero_pos_diff, nb_class=2)
print(pos_breaks)
neg_breaks = jenkspy.jenks_breaks(non_zero_neg_diff, nb_class=2)
print(neg_breaks)

print(' ****** breaks *******')
breaks = pos_breaks + neg_breaks
breaks.sort()
breaks

## New Labeling

In [None]:
'''
****** breaks *******
[-4.811795376206633,
 -1.0427002573115152,
 -9.454894623110721e-06,
 1.1045153360346449e-05,
 0.835960519245697,
 4.811795376206633

 def determine_growth_label_1k(x):
  x = round(x,4)
  if x > 0.83596 and x <= 4.8118:
    return 'drastic_growth'
  if x > 0.00001 and x <= 0.8360:
    return 'mild_growth'
  if x >= -0.00000945 and x <= 0.00001:
    return 'flat_growth'
  if x >= -1.0427 and x < -0.00000945:
    return 'mild_decrement'
  if x >= -4.8118  and x <= -1.0427:
    return 'drastic_decrement'
'''

In [None]:
#[5.2680092775606036e-05, 0.5928496690142417, 3.960127989407119]
#[-3.8230281887128026, -0.5659584931410302, -8.310897482368892e-06]
def determine_growth_label_1k(x):
  """Map a per-1k-population growth value to one of five growth labels.

  The cut points are the hard-coded break values below. The intervals are
  contiguous, so every finite number receives a label:

    x < -0.56595                    -> 'drastic_decrement'
    -0.56595 <= x < -0.00000831     -> 'mild_decrement'
    -0.00000831 <= x <= 0.0000526   -> 'flat_growth'
    0.0000526 < x <= 0.59285        -> 'mild_growth'
    x > 0.59285                     -> 'drastic_growth'

  Previously the two outer classes were capped at +/-3.96013, so any value
  beyond those caps silently fell through and returned None.

  Args:
    x: numeric growth value.

  Returns:
    The label as a string, or None when x is NaN.
  """
  # NaN compares False with everything; return None explicitly instead of
  # letting it fall into the last branch.
  if x != x:
    return None
  if x < -0.56595:
    return 'drastic_decrement'
  if x < -0.00000831:
    return 'mild_decrement'
  if x <= 0.0000526:
    return 'flat_growth'
  if x <= 0.59285:
    return 'mild_growth'
  return 'drastic_growth'

In [None]:
# Label every daily row by passing its 'New Cases/1k population' value through
# determine_growth_label_1k.
# NOTE(review): the thresholds in determine_growth_label_1k were taken from
# breaks of the *differences* of this column (diffArr), but here the function
# is applied to the raw per-1k values, not to a difference -- confirm which
# quantity is meant to be labelled.
CA_combined_data['growth_label_1k'] = CA_combined_data['New Cases/1k population'].apply(lambda x: determine_growth_label_1k(x) )
CA_combined_data.head(4)

### Export Combined labeled Data

In [None]:
CA_combined_data_growth0 = CA_combined_data[CA_combined_data['growth_label'] == 'drastic_decrement']
CA_combined_data_growth1 = CA_combined_data[CA_combined_data['growth_label'] == 'drastic_growth']
CA_combined_data_growth2 = CA_combined_data[CA_combined_data['growth_label'] == 'flat_growth']
CA_combined_data_growth3 = CA_combined_data[CA_combined_data['growth_label'] == 'mild_decrement']
CA_combined_data_growth4 = CA_combined_data[CA_combined_data['growth_label'] == 'mild_growth']

print(CA_combined_data_growth0.shape)
print(CA_combined_data_growth1.shape)
print(CA_combined_data_growth2.shape)
print(CA_combined_data_growth3.shape)
print(CA_combined_data_growth4.shape)

In [None]:
from datetime import datetime
location = '/content/drive/Shared drives/CMPE 295- Master Project/Covid19-data/'
filename = 'CA_labeledData_'+datetime.now().strftime("%b%d")+'.csv' #%Y%m%d
print(filename)
CA_combined_data.to_csv(location+filename,index=False)

In [None]:
average_newCases_df2[average_newCases_df2['labels'] == 'SuperSpread']

## Kernel Density Estimation

In [None]:
%matplotlib inline

from numpy import array, linspace
# KernelDensity is imported from the public package path; the private module
# path sklearn.neighbors.kde was deprecated in scikit-learn 0.22 and removed
# in 0.24.
from sklearn.neighbors import KernelDensity
from matplotlib.pyplot import plot

# Fit a Gaussian kernel density estimate to the percent-change values in
# `data` (reshaped to one column, as scikit-learn expects 2-D input).
a = array(data).reshape(-1, 1)
kde = KernelDensity(kernel='gaussian', bandwidth=3).fit(a)
# Evaluate the log-density on linspace's default grid of 50 points over [0, 50].
s = linspace(0,50)
e = kde.score_samples(s.reshape(-1,1))
plot(s, e)

In [None]:


from scipy.signal import argrelextrema
mi, ma = argrelextrema(e, np.less)[0], argrelextrema(e, np.greater)[0]
print("Minima:", s[mi])
print("Maxima:", s[ma])

In [None]:
lower_range = s[mi][0]
upper_range = s[mi][1]

lower_range

In [None]:
# Split the data at the first two density minima. `mi` holds positions in the
# grid `s`, so the cut points are the grid values s[mi[0]] and s[mi[1]];
# comparing the data against the positions themselves used the wrong thresholds.
print(a[a < s[mi[0]]], a[(a >= s[mi[0]]) * (a <= s[mi[1]])], a[a >= s[mi[1]]])


[-1.1363636363636362, 39.5, 213.91325695581014, 410.1818181818182]

Minima: [19.3877551  33.67346939]
Maxima: [24.48979592 39.79591837]

In [None]:
# Draw the log-density in three colours, split at the first two minima, and
# mark the maxima (green dots) and minima (red dots). `mi` and `ma` are
# positions in the grid `s`, which is why they are used as slice bounds here.
plot(s[:mi[0]+1], e[:mi[0]+1], 'r',
     s[mi[0]:mi[1]+1], e[mi[0]:mi[1]+1], 'g',
     s[mi[1]:], e[mi[1]:], 'b',
     s[ma], e[ma], 'go',
     s[mi], e[mi], 'ro')

# Split the data values at the same two minima. The thresholds are the grid
# values s[mi[0]] and s[mi[1]]; comparing the data against the positions
# mi[0] / mi[1] themselves used the wrong thresholds.
print(a[a < s[mi[0]]], a[(a >= s[mi[0]]) * (a <= s[mi[1]])], a[a >= s[mi[1]]])


## Labeling Data with Kernel density results

In [None]:
average_newCases_df3.head()

In [None]:
print(lower_range, upper_range)

In [None]:
average_newCases_df3['label'] = average_newCases_df3['percent_newcases'].apply(lambda x: 'lessSpread' if x < lower_range  else 'SuperSpread')


In [None]:
average_newCases_df3['label'] = average_newCases_df3['percent_newcases'].apply(lambda x: 'lessSpread' if x < lower_range else ('Spread' if (x >= lower_range and x <= upper_range) else 'SuperSpread'))



#converter = lambda x : x*2 if x < 10 else (x*3 if x < 20 else x)



In [None]:
average_newCases_df3[average_newCases_df3['label'] == 'lessSpread'].count()

In [None]:
average_newCases_df3[average_newCases_df3['label'] == 'Spread'].count()

In [None]:
average_newCases_df3[average_newCases_df3['label'] == 'SuperSpread'].count()

In [None]:
average_newCases_df3.head()

# 2.1 Loading CA - Social Distancing Inertia Maryland Data

In [None]:
CA_combined_sd_inertia_data = pd.read_csv('/content/drive/Shared drives/CMPE 295- Master Project/Covid19-data/CA_socialDistancingInertiaData_Sep25.csv',low_memory=False)
CA_combined_sd_inertia_data.head()

In [None]:
CA_combined_sd_inertia_data.FIPS.unique()

## Computing rolling average

In [None]:
CA_combined_sd_inertia_data.columns

In [None]:
#new case column nume = 5
CA_combined_sd_inertia_data.iloc[:,5]

Removing columns:
'Confirmed', 'Deaths', 'New deaths', '#days: decreasing ILI cases', '#days: decreasing COVID cases', 'New cases/1000 people', 'Imported COVID cases', 'COVID death rate', 'Active cases/1000 people'

In [None]:
df = pd.DataFrame(columns=['Date', 'Province_State', 'FIPS',  'New cases',
        'mask_rule_active', 'CTFIPS', 'CTNAME', 'STFIPS',
       'Social distancing index', '% staying home', 'Trips/person',
       '% out-of-county trips', '% out-of-state trips', 'Miles/person',
       'Work trips/person', 'Non-work trips/person', 'New COVID cases',
       'Population', '% change in consumption', 'date', 'Transit mode share',
       '% people older than 60', 'Median income', '% African Americans',
       '% Hispanic Americans', '% Male', 'Population density',
       'Employment density', '# hot spots/1000 people',
       'Hospital beds/1000 people', 'ICUs/1000 people',
       '# contact tracing workers/1000 people',
       '# contact tracing workers/1000 people rank',
       '% people older than 60 rank', 'COVID exposure/1000 people',
        'Unemployment claims/1000 people',
       'Unemployment rate', '% working from home', 'Cumulative inflation rate',
        '% hospital bed utilization',
       'Testing capacity', 'Tests done/1000 people', '% ICU utilization',
       'Ventilator shortage'])


In [None]:
CA_counties = CA_combined_sd_inertia_data['CTNAME'].unique()

# Compute a 7-row rolling mean separately for every county so that a window
# never mixes rows belonging to two different counties.
county_frames = []
for county in CA_counties:
  # .copy() gives an independent frame, so adding the new column does not
  # write into a filtered view of the source frame (SettingWithCopyWarning).
  county_data = CA_combined_sd_inertia_data[CA_combined_sd_inertia_data['CTNAME'] == county].copy()
  # NOTE(review): column position 5 is presumably the new-cases column, and
  # the rows are presumably already in chronological order per county --
  # confirm against the CSV, since the window is taken over rows in file order.
  county_data['rolling_avg_new_cases'] = county_data.iloc[:,5].rolling(window=7).mean()
  county_frames.append(county_data)

# Concatenate once at the end: DataFrame.append copied the whole accumulated
# frame on every iteration and was removed in pandas 2.0. `df` (the empty
# template frame) stays first so its column order is preserved.
df = pd.concat([df] + county_frames)

df.head()

In [None]:
print(df.shape)
CA_combined_sd_inertia_data.shape

In [None]:
CA_combined_sd_inertia_data = df.copy()
CA_combined_sd_inertia_data

In [None]:
CA_combined_sd_inertia_data.head(3)

## clean data - fill NA 

In [None]:
CA_combined_sd_inertia_data = CA_combined_sd_inertia_data.fillna(0)
CA_combined_sd_inertia_data.head(4)

In [None]:
CA_combined_sd_inertia_data['CTNAME'].unique()

## verifying Rolling average with visualizations

In [None]:
los_angeles_county = CA_combined_sd_inertia_data[CA_combined_sd_inertia_data['CTNAME'] == 'Los Angeles County']
print(los_angeles_county.shape)
los_angeles_county.head()

In [None]:
fig,ax = plt.subplots()
# Left axis: rolling average of new cases, drawn in blue.
ax.plot(los_angeles_county.Date, los_angeles_county['rolling_avg_new_cases'], color="blue")
ax.set_xlabel("Date",fontsize=14)
ax.set_ylabel("rolling_avg_new_cases",color="blue",fontsize=14)
# Right axis (shares the x-axis): the 'New cases' column, drawn in red.
ax2=ax.twinx()
ax2.plot(los_angeles_county.Date, los_angeles_county["New cases"],color="red")
# The axis label uses the same colour as the series it describes; it was
# previously blue, which made both axes look like they belonged to the blue line.
ax2.set_ylabel("New cases",color="red",fontsize=14)
plt.title("'rolling_avg_new_cases' Vs New cases")

plt.show()

In [None]:
suffolk_county = CA_combined_sd_inertia_data[CA_combined_sd_inertia_data['CTNAME'] == 'Santa Clara County']

In [None]:
fig,ax = plt.subplots()
# NOTE: despite its name, `suffolk_county` holds the 'Santa Clara County' rows.
# Left axis: rolling average of new cases, drawn in blue.
ax.plot(suffolk_county.Date, suffolk_county['rolling_avg_new_cases'], color="blue")
ax.set_xlabel("Date",fontsize=14)
ax.set_ylabel("rolling_avg_new_cases",color="blue",fontsize=14)
# Right axis (shares the x-axis): the 'New cases' column, drawn in red.
ax2=ax.twinx()
ax2.plot(suffolk_county.Date, suffolk_county["New cases"],color="red")
# The axis label uses the same colour as the series it describes; it was
# previously blue, which made both axes look like they belonged to the blue line.
ax2.set_ylabel("New cases",color="red",fontsize=14)
plt.title("'rolling_avg_new_cases' Vs New cases")

plt.show()

## Rolling window to compare every 2 weeks with next 2 weeks

In [None]:
CA_combined_sd_inertia_data = CA_combined_sd_inertia_data.sort_values(by=['Date'])
CA_combined_sd_inertia_data.head()

In [None]:
CA_combined_sd_inertia_data[CA_combined_sd_inertia_data['FIPS'] == 6037]

In [None]:
minDate = CA_combined_sd_inertia_data['Date'].min()
print(minDate)

dateList = CA_combined_sd_inertia_data['Date'].unique()

In [None]:
import datetime

#dt_object = datetime.fromtimestamp(date)
day_name= ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday','Sunday']

# Re-base the weekday of the earliest date so that Sunday is 0 and Saturday
# is 6 (weekday() itself numbers Monday as 0 and Sunday as 6).
day = (datetime.datetime.strptime(minDate, '%Y-%m-%d').weekday() + 1) % 7

print(day)

In [None]:
# Cut dateList into consecutive windows and record, for each window, its
# running number plus its smallest and largest date. Windows are sliced by
# position in dateList, so they hold 14 *entries* each (the first one is
# shortened by the weekday offset `day`), not necessarily 14 calendar days.
index = 0
count = 0
lst = []
# Work on a copy of the offset: overwriting `day` itself would make a second
# run of this cell produce differently aligned windows.
window_offset = day
while index < len(dateList):
  window_len = 14 - window_offset
  weekRange = dateList[index:index + window_len]
  index += window_len
  window_offset = 0  # only the first window is shortened
  count += 1
  row = [count, weekRange.min(), weekRange.max()]
  print(row)
  lst.append(row)
  


[1, '2020-03-01', '2020-03-14']
[2, '2020-03-15', '2020-03-28']
[3, '2020-03-29', '2020-04-11']
[4, '2020-04-12', '2020-04-26']
[5, '2020-04-27', '2020-05-10']
[6, '2020-05-11', '2020-05-24']
[7, '2020-05-25', '2020-06-08']
[8, '2020-06-09', '2020-06-22']
[9, '2020-06-23', '2020-07-06']
[10, '2020-07-07', '2020-07-20']
[11, '2020-07-21', '2020-08-03']
[12, '2020-08-04', '2020-08-17']
[13, '2020-08-18', '2020-08-21']

In [None]:
week_df = pd.DataFrame(lst,index=None,columns=['WeekNumber','startDate','endDate'])
week_df.head()

## Slice data for the week dataframe

In [None]:

# For every window in week_df, compute each county's mean of
# `rolling_avg_new_cases` over the rows whose Date lies inside the window
# (both ends inclusive), tagged with the window's start and end date.
_result_columns = ['CTNAME', 'FIPS', 'average_per_week', 'startDate', 'endDate']
_window_frames = []

for _, row in week_df.iterrows():
    startDate, endDate = row['startDate'], row['endDate']
    in_window = (CA_combined_sd_inertia_data['Date'] >= startDate) & (CA_combined_sd_inertia_data['Date'] <= endDate)
    df_base = (
        CA_combined_sd_inertia_data.loc[in_window]
        .groupby(['CTNAME', 'FIPS'])['rolling_avg_new_cases']
        .mean()
        .reset_index()
        .rename(columns={'rolling_avg_new_cases': 'average_per_week'})
    )
    df_base['startDate'] = startDate
    df_base['endDate'] = endDate
    _window_frames.append(df_base)

# DataFrame.append no longer exists in pandas 2.x and re-copied the whole
# frame on every iteration; collect the pieces and concatenate once. The
# per-window row index is kept as-is, exactly as append did.
if _window_frames:
    average_newCases_df = pd.concat(_window_frames)[_result_columns]
else:
    average_newCases_df = pd.DataFrame(columns=_result_columns)


In [None]:
average_newCases_df.shape

In [None]:
average_newCases_df[average_newCases_df['FIPS'] == 6037]

## Verifying average per week data Eg., Los Angeles County

In [None]:
los_angeles_county[(los_angeles_county['Date'] >= '2020-03-01') & (los_angeles_county['Date'] <= '2020-03-14') ]['rolling_avg_new_cases']



In [None]:
average_newCases_df.head(10)

## Finding the % growth or reduction in 'new cases' on every 2 week average

In [None]:
average_newCases_df_ = average_newCases_df.copy()

In [None]:
albany_avg_newCases = average_newCases_df[average_newCases_df['CTNAME'] == 'Los Angeles County']
albany_avg_newCases

In [None]:
average_newCases_df.head()

In [None]:
average_newCases_df2 = pd.DataFrame()
average_newCases_df2 = average_newCases_df.copy()

average_newCases_df2 = average_newCases_df2.reset_index(drop=True)
average_newCases_df2.head()

In [None]:
# Change in average_per_week relative to the same county's previous window
# (windows ordered by startDate, endDate); the first window of a county has
# no predecessor and gets 0. The result is aligned back on the row index.
_avg_by_county = (
    average_newCases_df2
    .sort_values(['startDate', 'endDate'])
    .groupby(['CTNAME', 'FIPS'])['average_per_week']
)
average_newCases_df2['diff_avg_x'] = _avg_by_county.diff().fillna(0)

print(average_newCases_df2.shape)
average_newCases_df2.head()


In [None]:
average_newCases_df2[average_newCases_df2['CTNAME'] == 'Santa Clara County']

In [None]:
average_newCases_df2[average_newCases_df2['CTNAME'] == 'Alameda County']

In [None]:
average_newCases_df2[average_newCases_df2['FIPS'] == 6037].head(5)

In [None]:
average_newCases_df2.head()

In [None]:
average_newCases_df2 = average_newCases_df2.fillna(0)

In [None]:
average_newCases_df2.head()

Shift the `average_per_week` column by one window within each county, so that every row also carries the previous window's value; the percent-change formula below needs it.

In [None]:
# Previous window's average_per_week for the same county (NaN for a county's
# first window), with windows ordered by startDate, endDate.
_windows_in_order = average_newCases_df2.sort_values(['startDate', 'endDate'])
_avg_per_county = _windows_in_order.groupby(['CTNAME', 'FIPS'])['average_per_week']
average_newCases_df2['shift_avg_x'] = _avg_per_county.shift()

In [None]:
average_newCases_df2.groupby(by=['CTNAME', 'FIPS','startDate','endDate']).sum()

In [None]:
average_newCases_df2[average_newCases_df2['CTNAME'] == 'Los Angeles County']

In [None]:
# Relative change versus the previous window: diff_avg_x / shift_avg_x.
# Rows whose previous average is missing or not strictly positive get 0.
_previous_avg = average_newCases_df2['shift_avg_x']
_relative_change = average_newCases_df2['diff_avg_x'] / _previous_avg
average_newCases_df2['percent_newcases'] = _relative_change.where(_previous_avg > 0.0, 0)


average_newCases_df2.tail()


In [None]:
average_newCases_df2[average_newCases_df2['CTNAME'] == 'Santa Clara County']

# 2.2 Labeling data as per % change

## Validation

In [None]:
average_newCases_df2.head(3)

In [None]:
average_newCases_df2[(average_newCases_df2['FIPS']== 6001) & (average_newCases_df2['startDate']=='2020-08-02')]

In [None]:
test_ca_combined[(test_ca_combined['fips_x']== 6001) & (test_ca_combined['startDate']=='2020-08-04')]

## Actual Work

In [None]:
average_newCases_df2.describe()

In [None]:
los_angeles_county = average_newCases_df2[average_newCases_df2['CTNAME'] == 'Los Angeles County']

In [None]:
fig,ax = plt.subplots()
# make a plot
ax.plot(los_angeles_county.startDate, los_angeles_county['percent_newcases'], color="blue")
# set x-axis label
ax.set_xlabel("Date",fontsize=14)
# set y-axis label
ax.set_ylabel("percent_newcases",color="blue",fontsize=14)
# ax2=ax.twinx()
# make a plot with different y-axis using second axis object
# ax2.plot(los_angeles_county.startDate, los_angeles_county["percent_newcases"],color="red")
# ax2.set_ylabel("New cases",color="blue",fontsize=14)
plt.title("'percent_newcases'")

plt.show()

In [None]:
data = average_newCases_df2['percent_newcases'].to_list()

print(data)


In [None]:
average_newCases_df3 = average_newCases_df2.copy().reset_index(drop=True)

## Determining growth label

In [None]:
# First-order difference of the percent_newcases column, taken between
# neighbouring rows in the frame's current row order.
# NOTE(review): the frame is assembled window by window (all counties of one
# window, then the next window), so neighbouring rows appear to belong to
# different counties -- confirm a cross-county difference is intended here.
new_cases_list = average_newCases_df2['percent_newcases'].values
diffArr = np.diff(new_cases_list)
print(len(diffArr))
diffArr[0]

In [None]:
# np.diff returns one element fewer than its input, so seed the list with
# 0.0 for the first row before adding the differences.
growth_values = [0.0]

print('growth_values : ',len(growth_values))
growth_values += list(diffArr)
print('growth_values : ',len(growth_values))

In [None]:
# Change in percent_newcases relative to the *same county's* previous window.
# The frame stores all counties of one window before the next window, so a
# plain row-to-row difference would subtract one county from another; group
# by county and difference along the window order instead. A county's first
# window has no predecessor and gets 0. The result aligns on the row index.
average_newCases_df2['growth_per_day'] = (
    average_newCases_df2
    .sort_values(['startDate', 'endDate'])
    .groupby(['CTNAME', 'FIPS'])['percent_newcases']
    .diff()
    .fillna(0)
)


### Checking mean, std dev and variance for growth data


In [None]:
# Summary statistics of diffArr. The extremes are stored under min_value /
# max_value so the builtins min() and max() stay usable in later cells.
stdDev = np.std(diffArr)
variance = np.var(diffArr)
mean = np.mean(diffArr)
min_value = np.min(diffArr)
max_value = np.max(diffArr)


print("min ",min_value)
print("max ",max_value)

print("mean ",mean)
print("std. deviation ",stdDev)
print("variance ",variance)

### Getting positive cases

In [None]:
print('length diffArr: ',len(diffArr))

# Keep only the strictly positive differences.
non_zero_pos_diff = [delta for delta in diffArr if delta > 0]

print('non_zero_pos_diff length: ',len(non_zero_pos_diff))


In [None]:
# Summary statistics of the positive differences. The extremes are stored
# under min_value / max_value so the builtins min() and max() stay usable.
stdDev = np.std(non_zero_pos_diff)
variance = np.var(non_zero_pos_diff)
mean = np.mean(non_zero_pos_diff)
min_value = np.min(non_zero_pos_diff)
max_value = np.max(non_zero_pos_diff)


print("min ",min_value)
print("max ",max_value)

print("mean ",mean)
print("std. deviation ",stdDev)
print("variance ",variance)


### Getting negative cases

In [None]:
print('length diffArr: ',len(diffArr))

# Keep only the strictly negative differences.
non_zero_neg_diff = [delta for delta in diffArr if delta < 0]
print("non_zero_neg_diff length : ",len(non_zero_neg_diff))


In [None]:
# Summary statistics of the negative differences. The extremes are stored
# under min_value / max_value so the builtins min() and max() stay usable.
stdDev = np.std(non_zero_neg_diff)
variance = np.var(non_zero_neg_diff)
mean = np.mean(non_zero_neg_diff)
min_value = np.min(non_zero_neg_diff)
max_value = np.max(non_zero_neg_diff)


print("min ",min_value)
print("max ",max_value)

print("mean ",mean)
print("std. deviation ",stdDev)
print("variance ",variance)

### Applying Jenks Natural break

In [None]:

# This cell comes before the notebook's own `import jenkspy` cell, so import
# the module here to avoid a NameError on a top-to-bottom run. The package
# itself still has to be installed (see the `pip install jenkspy` cell).
import jenkspy

# Two-class Jenks natural breaks, computed separately for the positive and
# the negative differences.
pos_breaks = jenkspy.jenks_breaks(non_zero_pos_diff, nb_class=2)
print(pos_breaks)
neg_breaks = jenkspy.jenks_breaks(non_zero_neg_diff, nb_class=2)
print(neg_breaks)

#### **Note**: 

From the positive and negative breaks:
* [ -0.009978 , 0.001377 ] - flat growth
* [ 0.001377 , 157.486111 ] - mild growth
* [ 157.486111 , 377.00 ] - drastic growth
* [ -157.486111 , -0.009978 ] - mild decrement
* [ -377.00 , -157.486111 ] - drastic decrement

In [None]:
#original range for CA -inertia data
'''
def determine_growth_label_inertia(x):
  if x >= -0.009978 and x <= 0.001377:
    return 'flat_growth'
  if x > 0.001377 and x <= 157.486111:
    return 'mild_growth'
  if x > 157.486111 and x <= 377.00:
    return 'drastic_growth'
  if x >= -157.486111  and x <= -0.009978:
    return 'mild_decrement'
  if x >= -377.00 and x < -157.486111:
    return 'drastic_decrement'
'''

In [None]:
# temporarily using same label range of combined data for inertia data

'''
def determine_growth_label_inertia(x):
  if x >= -0.012359 and x <= 0.000374:
    return 'flat_growth'
  if x > 0.000374 and x <= 21.813186:
    return 'mild_growth'
  if x > 21.813186 and x <= 95.421053:
    return 'drastic_growth'
  if x >= -21.813186  and x <= -0.012359:
    return 'mild_decrement'
  if x >= -95.421053 and x < -21.813186:
    return 'drastic_decrement'
'''

In [None]:
# temporarily using NY combined data range for CA
def determine_growth_label_inertia(x, flat_low=-0.002109, flat_high=0.00011,
                                   growth_cutoff=39.5, decrement_cutoff=-39.5):
  """Map a growth value to one of five growth labels.

  Args:
    x: growth value to classify.
    flat_low: lower bound (inclusive) of the 'flat_growth' band.
    flat_high: upper bound (inclusive) of the 'flat_growth' band.
    growth_cutoff: largest value still labelled 'mild_growth'.
    decrement_cutoff: smallest value still labelled 'mild_decrement'.

  Returns:
    'flat_growth', 'mild_growth', 'drastic_growth', 'mild_decrement' or
    'drastic_decrement'; None when x is NaN.
  """
  # NaN is the only value that differs from itself; it has no label.
  if x != x:
    return None
  if flat_low <= x <= flat_high:
    return 'flat_growth'
  if x > flat_high:
    # The drastic band has no upper limit, so values above the largest
    # observed break are still labelled instead of falling through to None.
    return 'mild_growth' if x <= growth_cutoff else 'drastic_growth'
  # Here x < flat_low; the drastic band is likewise open-ended downwards.
  return 'mild_decrement' if x >= decrement_cutoff else 'drastic_decrement'

In [None]:
# Label every row by passing its growth_per_day value through
# determine_growth_label_inertia.
_growth_per_day = average_newCases_df2['growth_per_day']
average_newCases_df2['growth_label'] = _growth_per_day.apply(determine_growth_label_inertia)

In [None]:
average_newCases_df2.head(3)

In [None]:
# One sub-frame per growth label, followed by the shape of each.
_growth_col = average_newCases_df2['growth_label']
average_newCases_df2_growth0 = average_newCases_df2.loc[_growth_col.eq('drastic_decrement')]
average_newCases_df2_growth1 = average_newCases_df2.loc[_growth_col.eq('drastic_growth')]
average_newCases_df2_growth2 = average_newCases_df2.loc[_growth_col.eq('flat_growth')]
average_newCases_df2_growth3 = average_newCases_df2.loc[_growth_col.eq('mild_decrement')]
average_newCases_df2_growth4 = average_newCases_df2.loc[_growth_col.eq('mild_growth')]

for _subset in (
    average_newCases_df2_growth0,
    average_newCases_df2_growth1,
    average_newCases_df2_growth2,
    average_newCases_df2_growth3,
    average_newCases_df2_growth4,
):
    print(_subset.shape)

## Jenks Natural breaks 

In [None]:
!pip install jenkspy


In [None]:
import jenkspy

breaks = jenkspy.jenks_breaks(average_newCases_df2['percent_newcases'], nb_class=3)
print(breaks)

### Labeling criteria

We applied Jenks natural breaks to label the county data by 'percent change in cases'.

Based on the natural-breaks algorithm, we split the data as follows:

* -1.136 to 3.642 is **label 1** (least chance of spreading) 

* 3.642 to 13.625 is **label 2** (Below threshold but chances to reach super spread level)

* 13.625 to 410.181818 is **label 3** (Super spread week)

4 class labeling

In [None]:
import jenkspy

breaks = jenkspy.jenks_breaks(average_newCases_df2['percent_newcases'], nb_class=4)
print(breaks)

In [None]:
# Bin percent_newcases into the four Jenks classes. The first break is the
# smallest value in the data; include_lowest=True makes the first bin closed
# on the left so that row is labelled instead of becoming NaN.
average_newCases_df2['labels'] = pd.cut(
    average_newCases_df2['percent_newcases'],
    bins=breaks,
    labels=['LessSpread', 'Spread', 'SuperSpread', 'SuperSpread2'],
    include_lowest=True,
)

In [None]:
# Row count per label. The caption is built from the label itself, so the
# SuperSpread2 count is no longer printed under the "SuperSpread" caption.
for _label in ('LessSpread', 'Spread', 'SuperSpread', 'SuperSpread2'):
    _n_rows = average_newCases_df2[average_newCases_df2['labels'] == _label]['CTNAME'].count()
    print('Number of records in ' + _label + ':', _n_rows)

8 class labeling

In [None]:
breaks = jenkspy.jenks_breaks(average_newCases_df2['percent_newcases'], nb_class=8)
print(breaks)

In [None]:
# Bin percent_newcases into the eight Jenks classes. The first break is the
# smallest value in the data; include_lowest=True makes the first bin closed
# on the left so that row is labelled instead of becoming NaN.
average_newCases_df2['labels'] = pd.cut(
    average_newCases_df2['percent_newcases'],
    bins=breaks,
    labels=['LessSpread', 'Spread', 'SuperSpread1', 'SuperSpread2',
            'SuperSpread3', 'SuperSpread4', 'SuperSpread5', 'SuperSpread6'],
    include_lowest=True,
)

In [None]:
# Row count per label. The caption is built from the label itself, so the
# SuperSpread1 count is no longer printed under the "SuperSpread" caption.
for _label in ('LessSpread', 'Spread', 'SuperSpread1', 'SuperSpread2',
               'SuperSpread3', 'SuperSpread4', 'SuperSpread5', 'SuperSpread6'):
    _n_rows = average_newCases_df2[average_newCases_df2['labels'] == _label]['CTNAME'].count()
    print('Number of records in ' + _label + ':', _n_rows)

In [None]:
average_newCases_df2.head()

# 2.3 Labeling data for CA -Maryland Social Distancing Inertia combined

In [None]:
CA_combined_sd_inertia_data.head(2)

In [None]:
CA_combined_sd_inertia_data_ = CA_combined_sd_inertia_data.copy()

In [None]:
CA_combined_sd_inertia_data = CA_combined_sd_inertia_data_.copy().reset_index(drop=True)

In [None]:
#average_newCases_df = pd.DataFrame(columns=['County Name',  'fips_x',  'average_per_week','startDate','endDate'])

# Copy each (county, window) row's `labels` and `growth_label` onto every
# daily row of that county whose Date lies inside the window (both ends
# inclusive). Passing `total` lets tqdm show real progress, which it cannot
# infer from the iterrows() generator.
for _, row in tqdm(average_newCases_df2.iterrows(), total=len(average_newCases_df2)):
    startDate, endDate = row['startDate'], row['endDate']
    df_index = (
        (CA_combined_sd_inertia_data['FIPS'] == row['FIPS'])
        & (CA_combined_sd_inertia_data['Date'] >= startDate)
        & (CA_combined_sd_inertia_data['Date'] <= endDate)
    )
    CA_combined_sd_inertia_data.loc[df_index, 'label'] = row['labels']
    CA_combined_sd_inertia_data.loc[df_index, 'growth_label'] = row['growth_label']




In [None]:
CA_combined_sd_inertia_data.head(5)

In [None]:
CA_combined_sd_inertia_data[CA_combined_sd_inertia_data['label'] == 'SuperSpread2'].sort_values(by=['FIPS','Date']).count()

In [None]:
CA_combined_sd_inertia_data.shape

In [None]:
# Fold the 'SuperSpread2' label into 'SuperSpread'; every other value is
# left unchanged.
_spread_label = CA_combined_sd_inertia_data['label']
CA_combined_sd_inertia_data['label'] = _spread_label.mask(_spread_label == 'SuperSpread2', 'SuperSpread')

## Computing growth label per 1k population

In [None]:
newcases_1k_inertia = CA_combined_sd_inertia_data['New cases/1000 people'].values

In [None]:
diffArr = np.diff(newcases_1k_inertia)
print(len(diffArr))
diffArr[0]

growth_values =[]
growth_values.append(0.0) # making first value as 0, as np.diff leaves first values as NA
print('growth_values : ',len(growth_values))
growth_values.extend(diffArr)
print('growth_values : ',len(growth_values))

## Determining mean,stddev, variance

In [None]:
# Summary statistics of diffArr. The extremes are stored under min_value /
# max_value so the builtins min() and max() stay usable in later cells.
stdDev = np.std(diffArr)
variance = np.var(diffArr)
mean = np.mean(diffArr)
min_value = np.min(diffArr)
max_value = np.max(diffArr)


print("min ",min_value)
print("max ",max_value)

print("mean ",mean)
print("std. deviation ",stdDev)
print("variance ",variance)



## Getting positive cases

In [None]:
print('length diffArr: ',len(diffArr))

# Keep only the strictly positive differences.
non_zero_pos_diff = [delta for delta in diffArr if delta > 0]

print('non_zero_pos_diff length: ',len(non_zero_pos_diff))


# Summary statistics of the positive differences. The extremes are stored
# under min_value / max_value so the builtins min() and max() stay usable.
stdDev = np.std(non_zero_pos_diff)
variance = np.var(non_zero_pos_diff)
mean = np.mean(non_zero_pos_diff)
min_value = np.min(non_zero_pos_diff)
max_value = np.max(non_zero_pos_diff)


print("min ",min_value)
print("max ",max_value)

print("mean ",mean)
print("std. deviation ",stdDev)
print("variance ",variance)


## Getting negative cases

In [None]:
# Keep only the strictly negative differences.
non_zero_neg_diff = [delta for delta in diffArr if delta < 0]
print("non_zero_neg_diff length : ",len(non_zero_neg_diff))

# Summary statistics of the negative differences. The extremes are stored
# under min_value / max_value so the builtins min() and max() stay usable.
stdDev = np.std(non_zero_neg_diff)
variance = np.var(non_zero_neg_diff)
mean = np.mean(non_zero_neg_diff)
min_value = np.min(non_zero_neg_diff)
max_value = np.max(non_zero_neg_diff)


print("min ",min_value)
print("max ",max_value)

print("mean ",mean)
print("std. deviation ",stdDev)
print("variance ",variance)


## Applying natural breaks

In [None]:
pos_breaks = jenkspy.jenks_breaks(non_zero_pos_diff, nb_class=2)
print(pos_breaks)
neg_breaks = jenkspy.jenks_breaks(non_zero_neg_diff, nb_class=2)
print(neg_breaks)


## New Labeling

In [None]:

def determine_growth_label_inertia1k(x, flat_low=-0.000009, flat_high=0.00009,
                                     growth_cutoff=0.3248, decrement_cutoff=-0.3404):
  """Map a per-1000-people growth value to one of five growth labels.

  Args:
    x: growth value to classify.
    flat_low: lower bound (inclusive) of the 'flat_growth' band.
    flat_high: upper bound (inclusive) of the 'flat_growth' band.
    growth_cutoff: largest value still labelled 'mild_growth'.
    decrement_cutoff: smallest value still labelled 'mild_decrement'.

  Returns:
    'flat_growth', 'mild_growth', 'drastic_growth', 'mild_decrement' or
    'drastic_decrement'; None when x is NaN.
  """
  # NaN is the only value that differs from itself; it has no label.
  if x != x:
    return None
  if flat_low <= x <= flat_high:
    return 'flat_growth'
  if x > flat_high:
    # The drastic band has no upper limit, so values above the largest
    # observed break are still labelled instead of falling through to None.
    return 'mild_growth' if x <= growth_cutoff else 'drastic_growth'
  # Here x < flat_low; the drastic band is likewise open-ended downwards.
  return 'mild_decrement' if x >= decrement_cutoff else 'drastic_decrement'

In [None]:
# The cut-offs in determine_growth_label_inertia1k include negative bands,
# i.e. they describe *changes* in 'New cases/1000 people', so label each
# row's change from the same county's previous date rather than the raw
# level. A county's first date has no predecessor and counts as 0 change.
# The result aligns back on the row index.
_growth_1k = (
    CA_combined_sd_inertia_data
    .sort_values('Date')
    .groupby('FIPS')['New cases/1000 people']
    .diff()
    .fillna(0)
)
CA_combined_sd_inertia_data['growth_label_1k'] = _growth_1k.apply(determine_growth_label_inertia1k)

In [None]:
CA_combined_sd_inertia_data['growth_label_1k'].unique()
CA_combined_data['growth_label_1k'].unique()

In [None]:
# One sub-frame per growth label, followed by the shape of each.
_daily_growth_col = CA_combined_sd_inertia_data['growth_label']
CA_combined_sd_inertia_data_growth0 = CA_combined_sd_inertia_data.loc[_daily_growth_col.eq('drastic_decrement')]
CA_combined_sd_inertia_data_growth1 = CA_combined_sd_inertia_data.loc[_daily_growth_col.eq('drastic_growth')]
CA_combined_sd_inertia_data_growth2 = CA_combined_sd_inertia_data.loc[_daily_growth_col.eq('flat_growth')]
CA_combined_sd_inertia_data_growth3 = CA_combined_sd_inertia_data.loc[_daily_growth_col.eq('mild_decrement')]
CA_combined_sd_inertia_data_growth4 = CA_combined_sd_inertia_data.loc[_daily_growth_col.eq('mild_growth')]

for _subset in (
    CA_combined_sd_inertia_data_growth0,
    CA_combined_sd_inertia_data_growth1,
    CA_combined_sd_inertia_data_growth2,
    CA_combined_sd_inertia_data_growth3,
    CA_combined_sd_inertia_data_growth4,
):
    print(_subset.shape)


### Export Labeled Maryland Inertia Data

In [None]:
from datetime import datetime
# Write the labelled frame to the shared drive; the file name carries the
# current month abbreviation and day (strftime "%b%d").
location = '/content/drive/Shared drives/CMPE 295- Master Project/Covid19-data/'
_date_stamp = datetime.now().strftime("%b%d")  # alternative: %Y%m%d
filename = f'CA_sd_inertia_labeledData_{_date_stamp}.csv'
print(filename)
CA_combined_sd_inertia_data.to_csv(f'{location}{filename}', index=False)

In [None]:
average_newCases_df2[average_newCases_df2['labels'] == 'SuperSpread']

## Kernel Density Estimation

In [None]:
%matplotlib inline

from numpy import array, linspace
# KernelDensity is exposed by the public sklearn.neighbors package; the
# private sklearn.neighbors.kde module path no longer exists in current
# scikit-learn releases.
from sklearn.neighbors import KernelDensity
from matplotlib.pyplot import plot

# Fit a Gaussian kernel density (bandwidth 3) to the percent_newcases values
# collected in `data`, then evaluate its log-density on linspace's default
# 50 points between 0 and 50 and plot the curve.
a = array(data).reshape(-1, 1)
kde = KernelDensity(kernel='gaussian', bandwidth=3).fit(a)
s = linspace(0,50)
e = kde.score_samples(s.reshape(-1,1))
plot(s, e)

In [None]:


from scipy.signal import argrelextrema
mi, ma = argrelextrema(e, np.less)[0], argrelextrema(e, np.greater)[0]
print("Minima:", s[mi])
print("Maxima:", s[ma])

In [None]:
lower_range = s[mi][0]
upper_range = s[mi][1]

lower_range

In [None]:
# Split the data at the two density minima. `mi` holds *indices* into the
# grid `s`, so the thresholds are the grid positions s[mi[...]], not the
# indices themselves. The three groups are disjoint: below the first minimum,
# between the minima (inclusive), above the second minimum.
_low_cut, _high_cut = s[mi[0]], s[mi[1]]
print(a[a < _low_cut], a[(a >= _low_cut) & (a <= _high_cut)], a[a > _high_cut])


[-1.1363636363636362, 39.5, 213.91325695581014, 410.1818181818182]

Minima: [19.3877551  33.67346939]
Maxima: [24.48979592 39.79591837]

In [None]:
# Draw the log-density curve in three colours, split at the two minima, and
# mark the maxima (green dots) and minima (red dots).
plot(s[:mi[0]+1], e[:mi[0]+1], 'r',
     s[mi[0]:mi[1]+1], e[mi[0]:mi[1]+1], 'g',
     s[mi[1]:], e[mi[1]:], 'b',
     s[ma], e[ma], 'go',
     s[mi], e[mi], 'ro')

# Split the data at the two density minima. `mi` holds *indices* into the
# grid `s`, so the thresholds are the grid positions s[mi[...]], not the
# indices themselves. The three groups are disjoint.
_low_cut, _high_cut = s[mi[0]], s[mi[1]]
print(a[a < _low_cut], a[(a >= _low_cut) & (a <= _high_cut)], a[a > _high_cut])


## Labeling Data with Kernel density results

In [None]:
average_newCases_df3.head()

In [None]:
print(lower_range, upper_range)

In [None]:
average_newCases_df3['label'] = average_newCases_df3['percent_newcases'].apply(lambda x: 'lessSpread' if x < lower_range  else 'SuperSpread')


In [None]:
# Three-way label from the two KDE minima: below lower_range -> 'lessSpread',
# within [lower_range, upper_range] -> 'Spread', everything else (including
# missing values) -> 'SuperSpread'.
_pct_change = average_newCases_df3['percent_newcases']
average_newCases_df3['label'] = np.select(
    [_pct_change < lower_range,
     (_pct_change >= lower_range) & (_pct_change <= upper_range)],
    ['lessSpread', 'Spread'],
    default='SuperSpread',
)



#converter = lambda x : x*2 if x < 10 else (x*3 if x < 20 else x)



In [None]:
average_newCases_df3[average_newCases_df3['label'] == 'lessSpread'].count()

In [None]:
average_newCases_df3[average_newCases_df3['label'] == 'Spread'].count()

In [None]:
average_newCases_df3[average_newCases_df3['label'] == 'SuperSpread'].count()

In [None]:
# Counts the rows whose growth_label equals 'SuperSpread'.
# NOTE(review): in file order average_newCases_df3 is copied from
# average_newCases_df2 before 'growth_label' is added to that frame, and
# 'SuperSpread' is a value written to the 'label' column, not one returned by
# the growth-label functions -- presumably 'label' was meant here; confirm.
average_newCases_df3[average_newCases_df3['growth_label'] == 'SuperSpread'].count()

In [None]:
average_newCases_df3.head()

# unused code


In [None]:
average_newCases_df2[average_newCases_df2['County Name'] != 'Queens County']

In [None]:
average_newCases_df2.groupby(by=['County Name', 'fips_x','startDate','endDate']).sum()

In [None]:
week_df.head()

In [None]:
CA_combined_data.dtypes

In [None]:
# Listed under the notebook's "unused code" heading.
# Inner-joins CA_combined_data to week_df where Date equals a window's
# startDate, keeps a fixed column subset, and pivots rolling_avg_new_cases
# into one column per WeekNumber with one row per 'County Name' (sum).
# NOTE(review): the rename maps "starting_rolling_avg_new_cases" to
# "rolling_avg_new_cases"; if the source column is already called
# rolling_avg_new_cases this rename does nothing -- confirm the direction.
start_date_df = (
    CA_combined_data
    .merge(week_df, left_on=['Date'],right_on=['startDate'], how='inner')
    .pipe(lambda x: x.assign(date=x.Date))
    .rename(columns={"starting_rolling_avg_new_cases": "rolling_avg_new_cases"})
    .reset_index(drop=True)

    [['Date','fips_x','County Name','retail and recreation','grocery and pharmacy','parks','transit stations','workplaces','residential','driving','m50','m50_index','population_density','mask_rule_active','mask_wearing_percent','New cases','startDate','WeekNumber','rolling_avg_new_cases']]
    .pivot_table(values='rolling_avg_new_cases', columns='WeekNumber', index='County Name', aggfunc='sum')
    .rename_axis(None, axis=1)
    .reset_index()
)

start_date_df

In [None]:
# Listed under the notebook's "unused code" heading.
# Inner-joins CA_combined_data to week_df on 'Date', renames Confirmed to
# end_date_Confirmed, and pivots it into one column per WeekNumber with one
# row per 'County Name' (sum).
# NOTE(review): week_df as built earlier in this notebook has the columns
# WeekNumber, startDate and endDate only, so a merge on 'Date' presumably
# needs left_on/right_on (e.g. against endDate) -- confirm before reuse.
end_date_df = (
    CA_combined_data
    .merge(week_df, on='Date', how='inner')
    .pipe(lambda x: x.assign(date=x.Date))
    .rename(columns={"Confirmed": "end_date_Confirmed"})
    .reset_index(drop=True)

    [['County Name', 'fips_x', 'end_date_Confirmed','endDate','WeekNumber']]
    .pivot_table(values='end_date_Confirmed', columns='WeekNumber', index='County Name', aggfunc='sum')
    .rename_axis(None, axis=1)
    .reset_index()
)