<a href="https://colab.research.google.com/github/aarsanjani/meansquares/blob/master/Google_MobilityDataAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Google mobility Data

* Data downloaded from : https://www.google.com/covid19/mobility/

In [21]:
!pip install wget



In [22]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import os
import wget
from pandas import Series, datetime
from pandas.plotting import scatter_matrix, autocorrelation_plot
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, TimeSeriesSplit
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import roc_curve, auc
import random
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima_model import ARIMA
from xgboost import XGBClassifier
from sklearn.mixture import GaussianMixture

In [23]:
download_url = 'https://www.gstatic.com/covid19/mobility/Global_Mobility_Report.csv?cachebust=94537edba4db1128'
local_csv = 'Global_Mobility_Report.csv'

# wget.download never overwrites: when the target already exists it saves the
# new copy under a suffixed name such as 'Global_Mobility_Report (1).csv'.
# Reuse an existing download instead of re-fetching the large report on every
# run; delete the local file to force a fresh download.
if os.path.exists(local_csv):
  filename = local_csv
else:
  filename = wget.download(download_url, out=local_csv)

filename

'Global_Mobility_Report (1).csv'

In [24]:
# Read the whole report; low_memory=False is passed through to pandas unchanged.
read_options = {'low_memory': False}
mobility_data = pd.read_csv(filename, **read_options)
mobility_data.head(n=2)

Unnamed: 0,country_region_code,country_region,sub_region_1,sub_region_2,metro_area,iso_3166_2_code,census_fips_code,date,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline
0,AE,United Arab Emirates,,,,,,2020-02-15,0.0,4.0,5.0,0.0,2.0,1.0
1,AE,United Arab Emirates,,,,,,2020-02-16,1.0,4.0,4.0,1.0,2.0,1.0


In [25]:
# Keep only the rows reported for the United States.
is_united_states = mobility_data['country_region'].eq('United States')
US_mobility = mobility_data.loc[is_united_states]
US_mobility.head()

Unnamed: 0,country_region_code,country_region,sub_region_1,sub_region_2,metro_area,iso_3166_2_code,census_fips_code,date,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline
1089802,US,United States,,,,,,2020-02-15,6.0,2.0,15.0,3.0,2.0,-1.0
1089803,US,United States,,,,,,2020-02-16,7.0,1.0,16.0,2.0,0.0,-1.0
1089804,US,United States,,,,,,2020-02-17,6.0,0.0,28.0,-9.0,-24.0,5.0
1089805,US,United States,,,,,,2020-02-18,0.0,-1.0,6.0,1.0,0.0,1.0
1089806,US,United States,,,,,,2020-02-19,2.0,0.0,8.0,1.0,1.0,0.0


## The `sub_region_1` column holds the state name

In [26]:
# Distinct values of the state column (NaN marks the country-level rows).
pd.unique(US_mobility['sub_region_1'])

array([nan, 'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
       'Colorado', 'Connecticut', 'Delaware', 'District of Columbia',
       'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana',
       'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland',
       'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi',
       'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire',
       'New Jersey', 'New Mexico', 'New York', 'North Carolina',
       'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania',
       'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee',
       'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
       'West Virginia', 'Wisconsin', 'Wyoming'], dtype=object)

In [27]:
# Restrict the US data to California (state-level and county-level rows).
is_california = US_mobility['sub_region_1'].eq('California')
CA_mobility_data = US_mobility.loc[is_california]
CA_mobility_data.head(2)

Unnamed: 0,country_region_code,country_region,sub_region_1,sub_region_2,metro_area,iso_3166_2_code,census_fips_code,date,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline
1116351,US,United States,California,,,US-CA,,2020-02-15,1.0,1.0,19.0,1.0,-1.0,0.0
1116352,US,United States,California,,,US-CA,,2020-02-16,5.0,0.0,31.0,1.0,-1.0,-1.0


## 'sub_region_2' column has the county data

In [28]:
# Distinct county values; NaN is included and counts as one entry.
ca_sub_regions = CA_mobility_data['sub_region_2'].unique()
print(len(ca_sub_regions))
print(CA_mobility_data.shape)
ca_sub_regions

57
(9480, 14)


array([nan, 'Alameda County', 'Amador County', 'Butte County',
       'Calaveras County', 'Colusa County', 'Contra Costa County',
       'Del Norte County', 'El Dorado County', 'Fresno County',
       'Glenn County', 'Humboldt County', 'Imperial County',
       'Inyo County', 'Kern County', 'Kings County', 'Lake County',
       'Lassen County', 'Los Angeles County', 'Madera County',
       'Marin County', 'Mariposa County', 'Mendocino County',
       'Merced County', 'Modoc County', 'Mono County', 'Monterey County',
       'Napa County', 'Nevada County', 'Orange County', 'Placer County',
       'Plumas County', 'Riverside County', 'Sacramento County',
       'San Benito County', 'San Bernardino County', 'San Diego County',
       'San Francisco County', 'San Joaquin County',
       'San Luis Obispo County', 'San Mateo County',
       'Santa Barbara County', 'Santa Clara County', 'Santa Cruz County',
       'Shasta County', 'Siskiyou County', 'Solano County',
       'Sonoma County', 'St

#### Dropping nan from county data

In [29]:
# Rows without a county name are dropped so that only county-level records remain.
has_county = CA_mobility_data['sub_region_2'].notna()
CA_mobility_data = CA_mobility_data[has_county]
print(CA_mobility_data.shape)
CA_mobility_data.head(2)

(9312, 14)


Unnamed: 0,country_region_code,country_region,sub_region_1,sub_region_2,metro_area,iso_3166_2_code,census_fips_code,date,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline
1116519,US,United States,California,Alameda County,,,6001.0,2020-02-15,1.0,0.0,22.0,1.0,0.0,-1.0
1116520,US,United States,California,Alameda County,,,6001.0,2020-02-16,7.0,0.0,24.0,5.0,1.0,-2.0


## Analysing one county for understanding (Alameda County)

In [30]:
# Daily records for Alameda County only.
is_alameda = CA_mobility_data['sub_region_2'].eq('Alameda County')
alameda_mobility_data = CA_mobility_data.loc[is_alameda]
print(alameda_mobility_data.shape)
alameda_mobility_data.head()


(168, 14)


Unnamed: 0,country_region_code,country_region,sub_region_1,sub_region_2,metro_area,iso_3166_2_code,census_fips_code,date,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline
1116519,US,United States,California,Alameda County,,,6001.0,2020-02-15,1.0,0.0,22.0,1.0,0.0,-1.0
1116520,US,United States,California,Alameda County,,,6001.0,2020-02-16,7.0,0.0,24.0,5.0,1.0,-2.0
1116521,US,United States,California,Alameda County,,,6001.0,2020-02-17,13.0,1.0,55.0,-31.0,-53.0,11.0
1116522,US,United States,California,Alameda County,,,6001.0,2020-02-18,-2.0,3.0,21.0,3.0,0.0,0.0
1116523,US,United States,California,Alameda County,,,6001.0,2020-02-19,-1.0,0.0,14.0,1.0,0.0,0.0


In [31]:
# Daily records for Orange County only.
is_orange = CA_mobility_data['sub_region_2'].eq('Orange County')
orange_mobility_data = CA_mobility_data.loc[is_orange]
print(orange_mobility_data.shape)
orange_mobility_data.head()

(168, 14)


Unnamed: 0,country_region_code,country_region,sub_region_1,sub_region_2,metro_area,iso_3166_2_code,census_fips_code,date,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline
1121175,US,United States,California,Orange County,,,6059.0,2020-02-15,0.0,0.0,9.0,0.0,0.0,0.0
1121176,US,United States,California,Orange County,,,6059.0,2020-02-16,4.0,-1.0,22.0,-3.0,0.0,-1.0
1121177,US,United States,California,Orange County,,,6059.0,2020-02-17,10.0,1.0,34.0,-10.0,-33.0,6.0
1121178,US,United States,California,Orange County,,,6059.0,2020-02-18,1.0,-2.0,7.0,3.0,2.0,0.0
1121179,US,United States,California,Orange County,,,6059.0,2020-02-19,0.0,-2.0,8.0,2.0,2.0,0.0


### Research note:
We have county-wise COVID-19 data starting from 01/22/2020, whereas Google mobility data is available only from 02/15/2020. In that weekly data format, Google mobility data is therefore available only from week 5.

**Coding Logic:**
* Get the minimum date (Start date )
* Find the day for start date
* **Filter data**: If the start date falls on a Friday or later in the week, only 2-3 days of data remain for that week, which is too few to compute an average mobility. Hence drop those days from processing.
* Get the maximum date (End date)
* Find the day for end date
* **Filter data**: If the end date falls on or before Tuesday, drop the days of that last partial week from processing for the reason mentioned above.


In [56]:
from datetime import datetime, timedelta

date_format = '%Y-%m-%d'

startDate = alameda_mobility_data['date'].min()
endDate = alameda_mobility_data['date'].max()

print(startDate)
print(endDate)

# Weekly buckets run Sunday..Saturday. datetime.weekday() is Monday=0..Sunday=6,
# so (weekday + 1) % 7 gives the position inside a Sunday-based week
# (Sunday=0 .. Saturday=6).
new_starting_date = datetime.strptime(startDate, date_format)
start_position = (new_starting_date.weekday() + 1) % 7
if start_position >= 5:  # Friday or Saturday: at most 2 days left in that week
  # Skip ahead to the next Sunday.
  new_starting_date += timedelta(days=7 - start_position)

new_ending_date = datetime.strptime(endDate, date_format)
end_position = (new_ending_date.weekday() + 1) % 7
if end_position <= 2:  # Sunday, Monday or Tuesday: at most 3 days of that week
  # Step back to the preceding Saturday. The previous code subtracted
  # weekday() itself, which left Monday untouched and moved Tuesday to Monday,
  # so the trailing partial week was never actually removed.
  new_ending_date -= timedelta(days=end_position + 1)

new_starting_date = new_starting_date.strftime(date_format)
new_ending_date = new_ending_date.strftime(date_format)

print(new_starting_date)
print(new_ending_date)

2020-02-15
2020-07-31
2020-02-16
2020-07-31


In [57]:
# Keep the days inside the trimmed [start, end] window (both ends inclusive).
# Dates are ISO-formatted strings, so string comparison orders them correctly.
in_window = alameda_mobility_data['date'].between(new_starting_date, new_ending_date)
filtered_alameda_county = alameda_mobility_data[in_window]

print(filtered_alameda_county.shape)
filtered_alameda_county.head()


(167, 14)


Unnamed: 0,country_region_code,country_region,sub_region_1,sub_region_2,metro_area,iso_3166_2_code,census_fips_code,date,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline
1116520,US,United States,California,Alameda County,,,6001.0,2020-02-16,7.0,0.0,24.0,5.0,1.0,-2.0
1116521,US,United States,California,Alameda County,,,6001.0,2020-02-17,13.0,1.0,55.0,-31.0,-53.0,11.0
1116522,US,United States,California,Alameda County,,,6001.0,2020-02-18,-2.0,3.0,21.0,3.0,0.0,0.0
1116523,US,United States,California,Alameda County,,,6001.0,2020-02-19,-1.0,0.0,14.0,1.0,0.0,0.0
1116524,US,United States,California,Alameda County,,,6001.0,2020-02-20,1.0,1.0,13.0,2.0,0.0,0.0


In [58]:
# Inspect column dtypes. `date` is not parsed to datetime by read_csv here, so
# the date filters in this notebook compare it against formatted date strings.
filtered_alameda_county.dtypes

country_region_code                                    object
country_region                                         object
sub_region_1                                           object
sub_region_2                                           object
metro_area                                             object
iso_3166_2_code                                        object
census_fips_code                                      float64
date                                                   object
retail_and_recreation_percent_change_from_baseline    float64
grocery_and_pharmacy_percent_change_from_baseline     float64
parks_percent_change_from_baseline                    float64
transit_stations_percent_change_from_baseline         float64
workplaces_percent_change_from_baseline               float64
residential_percent_change_from_baseline              float64
dtype: object

# Data computation - Average mobility for the week

**Coding logic**:
* We have daily data; we convert it to a weekly mean so that we know, on average, how people have commuted within a county.

In [60]:
# Number of weeks covered by the filtered daily rows (fractional if the last week is partial).
len(filtered_alameda_county) / 7

23.857142857142858

In [88]:
# First/last day and the sorted list of distinct days in the filtered Alameda data.
alameda_dates = filtered_alameda_county['date']
minDate, maxDate = alameda_dates.min(), alameda_dates.max()
dateList = alameda_dates.unique()
dateList.sort()

for summary_value in (minDate, maxDate, len(dateList)):
  print(summary_value)

2020-02-16
2020-07-31
167


In [93]:
day_name = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# weekday() numbers days Monday=0..Sunday=6; rotate by one so that
# Sunday=0, Monday=1, ..., Saturday=6.
day = (datetime.strptime(minDate, '%Y-%m-%d').weekday() + 1) % 7

print(day)

0


In [73]:
# Number of distinct days available for the weekly aggregation.
dateList.size

167

In [100]:
# Aggregate Alameda County's daily rows into one record per week.
# `day` is the Sunday-based position of the first date, so the first week
# holds 7 - day days and every following week holds 7 days. `dateList` is used
# positionally, i.e. it is treated as a run of consecutive days.
metric_columns = [
  'retail_and_recreation_percent_change_from_baseline',
  'grocery_and_pharmacy_percent_change_from_baseline',
  'parks_percent_change_from_baseline',
  'transit_stations_percent_change_from_baseline',
  'workplaces_percent_change_from_baseline',
  'residential_percent_change_from_baseline',
]

lst = []
index = 0
count = 0
week_length = 7 - day  # kept in a local so re-running the cell does not depend on a clobbered `day`

# Stop only when fewer than `week_length` days remain. The previous
# `index+j >= len(dateList)` test also discarded a final week that was complete.
while index + week_length <= len(dateList):
  stop = index + week_length
  weekRange = dateList[index:stop]

  in_week = filtered_alameda_county['date'] >= dateList[index]
  if stop < len(dateList):
    in_week = in_week & (filtered_alameda_county['date'] < dateList[stop])
  # else: last week -- dateList covers every date of the frame, so no upper bound is needed
  df = filtered_alameda_county[in_week]

  row = [
    count,
    weekRange.min(),
    weekRange.max(),
    df['sub_region_2'].tolist()[0],
    df['census_fips_code'].tolist()[0],
  ]
  row.extend(df[column].mean() for column in metric_columns)
  lst.append(row)

  index = stop
  week_length = 7
  count += 1
  

In [101]:
# Column labels, in the same order in which each weekly row was assembled.
alameda_columns = [
  'Id', 'startDate', 'endDate', 'County', 'FIPS',
  'retail_and_recreation_percent_change_from_baseline',
  'grocery_and_pharmacy_percent_change_from_baseline',
  'parks_percent_change_from_baseline',
  'transit_stations_percent_change_from_baseline',
  'workplaces_percent_change_from_baseline',
  'residential_percent_change_from_baseline',
]
alameda_mobility_df = pd.DataFrame(lst, columns=alameda_columns)

alameda_mobility_df.head(2)

Unnamed: 0,Id,startDate,endDate,County,FIPS,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline
0,0,2020-02-16,2020-02-22,Alameda County,6001.0,2.857143,1.285714,21.857143,-3.285714,-6.428571,1.142857
1,1,2020-02-23,2020-02-29,Alameda County,6001.0,2.285714,3.428571,18.0,3.285714,3.0,-0.714286


In [108]:
# (rows, columns) of the weekly Alameda frame: one row per aggregated week.
alameda_mobility_df.shape

(23, 11)

## Converting the above work for all counties 

In [None]:

# Recompute the Sunday-based position (Sunday=0 .. Saturday=6) of the first date;
# weekday() itself counts Monday=0 .. Sunday=6.
day = (datetime.strptime(minDate, '%Y-%m-%d').weekday() + 1) % 7

print(day)

In [112]:
def compute_mobility_average(countyName, mobility_data=None, dates=None, rows=None):
  """Append one weekly-average mobility record per week for a county.

  Weeks run Sunday..Saturday. The first week may be shorter when the first
  date is not a Sunday; a trailing week with fewer days than expected is
  dropped. `dates` is used positionally, i.e. it is treated as a sorted run of
  consecutive days.

  Args:
    countyName: Value of `sub_region_2` to aggregate.
    mobility_data: Daily mobility frame with `sub_region_2`,
      `census_fips_code`, `date` (ISO 'YYYY-MM-DD' strings) and the six
      `*_percent_change_from_baseline` columns. Defaults to the notebook
      global `CA_mobility_data`.
    dates: Sorted sequence of ISO date strings defining the day grid.
      Defaults to the notebook global `dateList`.
    rows: List the records are appended to. Defaults to the notebook global
      `lst`, which keeps existing callers that rely on that side effect working.

  Returns:
    The `rows` list. Each appended record is
    [startDate, endDate, County, FIPS, retail, grocery, parks, transit,
    workplaces, residential]. Weeks in which the county has no rows are
    skipped (previously they raised IndexError).
  """
  if mobility_data is None:
    mobility_data = CA_mobility_data
  if dates is None:
    dates = dateList
  if rows is None:
    rows = lst

  metric_columns = [
    'retail_and_recreation_percent_change_from_baseline',
    'grocery_and_pharmacy_percent_change_from_baseline',
    'parks_percent_change_from_baseline',
    'transit_stations_percent_change_from_baseline',
    'workplaces_percent_change_from_baseline',
    'residential_percent_change_from_baseline',
  ]

  if len(dates) == 0:
    return rows

  # Filter the county once instead of on every week.
  county_data = mobility_data[mobility_data['sub_region_2'] == countyName]

  # weekday() is Monday=0..Sunday=6; rotate to Sunday=0..Saturday=6.
  day = (datetime.strptime(min(dates), '%Y-%m-%d').weekday() + 1) % 7

  index = 0
  week_length = 7 - day  # the first week may be partial
  # Stop only when fewer than `week_length` days remain. The previous
  # `index+j >= len(dateList)` test also discarded a complete final week.
  while index + week_length <= len(dates):
    stop = index + week_length
    week_dates = dates[index:stop]
    week_start, week_end = min(week_dates), max(week_dates)

    in_week = county_data['date'] >= dates[index]
    if stop < len(dates):
      in_week = in_week & (county_data['date'] < dates[stop])
    else:
      # Last week: there is no following date to use as an exclusive bound.
      in_week = in_week & (county_data['date'] <= week_end)
    week_data = county_data[in_week]

    if not week_data.empty:
      row = [
        week_start,
        week_end,
        week_data['sub_region_2'].iloc[0],
        week_data['census_fips_code'].iloc[0],
      ]
      row.extend(week_data[column].mean() for column in metric_columns)
      rows.append(row)

    index = stop
    week_length = 7
  return rows
  

In [113]:
# Start from an empty accumulator: compute_mobility_average appends its weekly
# rows to the global `lst`.
lst = []
county_names = CA_mobility_data['sub_region_2'].unique()
print(len(county_names))
for county in county_names:
  compute_mobility_average(county)


56


In [114]:
# Column labels, in the same order in which compute_mobility_average builds each row.
ca_avg_columns = [
  'startDate', 'endDate', 'County', 'FIPS',
  'retail_and_recreation_percent_change_from_baseline',
  'grocery_and_pharmacy_percent_change_from_baseline',
  'parks_percent_change_from_baseline',
  'transit_stations_percent_change_from_baseline',
  'workplaces_percent_change_from_baseline',
  'residential_percent_change_from_baseline',
]
CA_mobility_avg_df = pd.DataFrame(lst, columns=ca_avg_columns)

print(CA_mobility_avg_df.shape)
CA_mobility_avg_df.head(2)

(1288, 10)


Unnamed: 0,startDate,endDate,County,FIPS,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline
0,2020-02-16,2020-02-22,Alameda County,6001.0,2.857143,1.285714,21.857143,-3.285714,-6.428571,1.142857
1,2020-02-23,2020-02-29,Alameda County,6001.0,2.285714,3.428571,18.0,3.285714,3.0,-0.714286


In [110]:
# (rows, columns) of the weekly frame covering all California counties.
CA_mobility_avg_df.shape

(1288, 11)

In [116]:
# Export the weekly county averages to the working directory, without the row index.
CA_mobility_avg_df.to_csv("computed_CA_mobility_data.csv",index=False)

## Completed work

* We have downloaded data from Google mobility.
* We got 6 key factors from the people-mobility statistics:
  * retail_and_recreation_percent_change_from_baseline
  * parks_percent_change_from_baseline
  * grocery_and_pharmacy_percent_change_from_baseline 
  * transit_stations_percent_change_from_baseline
  * residential_percent_change_from_baseline
  * workplaces_percent_change_from_baseline

* We have computed the mean of each metric per weekly date range and exported the result as a CSV.