In [None]:
import functools
import json

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio
import sklearn as sk
import sklearn.covariance
import sklearn.manifold
import tslearn.clustering
import tslearn.utils
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
def flag(code):
    """Return the emoji flag for a two-letter country code.

    Each letter A-Z is shifted onto the matching Unicode regional-indicator
    symbol (U+1F1E6 is the indicator for "A"); two indicators side by side
    are rendered as a flag by most fonts.

    Args:
        code: Two ASCII letters, e.g. "CH". Lowercase input is accepted and
            upper-cased first.

    Returns:
        A two-character string of regional-indicator symbols.

    Raises:
        ValueError: If ``code`` is not a string of exactly two ASCII letters.
    """
    # Explicit validation instead of `assert`: asserts vanish under `python -O`,
    # and non-letter input would otherwise map to unrelated code points.
    if not isinstance(code, str) or len(code) != 2:
        raise ValueError(f"expected a two-letter country code, got {code!r}")
    normalized = code.upper()
    if len(normalized) != 2 or not all("A" <= letter <= "Z" for letter in normalized):
        raise ValueError(f"expected a two-letter country code, got {code!r}")
    offset = 0x1F1E6 - ord("A")
    return "".join(chr(ord(letter) + offset) for letter in normalized)

def getGeo(gj, country):
    """Look up a country's GeoJSON feature and return it serialized as JSON.

    ``gj`` is indexed as ``gj["features"][i]["properties"]["ADMIN"]``; the
    first feature whose ``ADMIN`` property equals ``country`` is used.
    "United States" is looked up as "United States of America".

    When nothing matches, an ``error: <name>`` line is printed and the
    (possibly renamed) country name is returned instead of a JSON string.
    """
    if country == "United States":
        country = "United States of America"

    features = gj["features"]
    matches = [
        position
        for position in range(len(features))
        if features[position]["properties"]["ADMIN"] == country
    ]

    if not matches:
        print(f"error: {country}")
        return country

    return json.dumps(features[matches[0]])

In [None]:
# data, notes

# data sources
# https://covid19.apple.com/mobility
# https://www.google.com/covid19/mobility
# https://www.google.com/covid19/mobility/data_documentation.html

# background reading on interpreting clustering results
# https://developers.google.com/machine-learning/clustering/interpret

# baselines
# apple baseline value = Jan 13 2020
# apple % change is relative to that single baseline value (day-of-week effects still need to be normalized)
# google baseline value = median value for the corresponding day of the week, during the 5-week period Jan 3-Feb 6 2020
# google % change is relative to the baseline value for that day of the week

# the apple URL points at one dated snapshot file; the google URL has no date in its path
apple_data_url = "https://covid19-static.cdn-apple.com/covid19-mobility-data/2025HotfixDev16/v3/en-us/applemobilitytrends-2021-01-28.csv"
google_data_url = "https://www.gstatic.com/covid19/mobility/Global_Mobility_Report.csv"

In [None]:
# download the raw datasets (requires network access)
_apple_df = pd.read_csv(apple_data_url)

# Treat only empty fields as missing. With pandas' default NA strings a literal
# "NA" value (e.g. an ISO country code) in country_region_code would be read as NaN.
# NOTE(review): assumes the google file marks missing values with empty fields only - confirm.
_google_df = pd.read_csv(
    google_data_url,
    parse_dates=["date"],
    keep_default_na=False,
    na_values=[""],
)

In [None]:
# Work on copies so the raw downloads stay untouched: a later cell assigns
# columns into google_df in place, which would otherwise also modify _google_df.
apple_df = _apple_df.copy()
google_df = _google_df.copy()

In [None]:
# dev
# start again from fresh copies of the raw downloads

apple_df, google_df = _apple_df.copy(), _google_df.copy()

In [None]:
# these countries are in google's dataset but not apple's
#
# Compare against `region` on apple's "country/region" rows: that is the column the
# rest of this notebook uses as the country name (it is what gets joined to google's
# country_region). The `country` column is not used for country-level rows.
# NOTE(review): presumably `country` is only filled for city / sub-region rows - confirm.

apple_countries = apple_df.loc[apple_df.geo_type == "country/region", "region"]
google_df.loc[~google_df.country_region.isin(apple_countries), "country_region"].unique()

In [None]:
# make (apple) data tidy
# every column that is not an identifier is treated as one day of data and
# becomes a (date, requests) row

apple_id_columns = [
    "geo_type",
    "region",
    "transportation_type",
    "alternative_name",
    "sub-region",
    "country",
]
apple_df = pd.melt(
    apple_df,
    id_vars=apple_id_columns,
    var_name="date",
    value_name="requests",
)

# the melted column headers are strings; convert them to timestamps
apple_df["date"] = pd.to_datetime(apple_df["date"])

In [None]:
# filter dates

# apple_df = apple_df.loc[apple_df.date.dt.year == 2021]
# google_df = google_df.loc[google_df.date.dt.year == 2021]

In [None]:
# latest date in each dataset (apple, google)
# Series.max() is vectorized; the builtin max() walks the column element by element

(apple_df["date"].max(), google_df["date"].max())

In [None]:
# apple: missing data for May 11-12
# fill in na values, in time series
# first sort the values (for every region, dates in ascending order)

apple_series_keys = ["geo_type", "transportation_type", "country", "region", "sub-region"]
apple_df = apple_df.sort_values(by=apple_series_keys + ["date"])

# Interpolate inside each individual series only. Interpolating the whole sorted
# column at once lets a gap at the start or end of one series be filled from the
# neighbouring (unrelated) series. Assigning the result back also avoids the chained
# `apple_df.requests.interpolate(inplace=True)` pattern, which does not reliably
# update the frame when pandas copy-on-write is active.
# The key columns may contain NaN (groupby drops NaN keys by default), so group on
# NaN-free copies of them.
apple_group_keys = [apple_df[key].fillna("") for key in apple_series_keys]
apple_df["requests"] = apple_df.groupby(apple_group_keys, sort=False)["requests"].transform(
    lambda series: series.interpolate(method="linear")
)

In [None]:
# google: use 100 as baseline instead of 0
# (same scale as apple)
# NOTE: the shifted values are written back into google_df, so running this cell
# twice on the same frame shifts the columns by 100 twice.

google_percent_change_df = google_df.filter(like="percent_change_from_baseline") + 100
google_percent_change_cols_count = google_percent_change_df.shape[1]
google_df[google_percent_change_df.columns] = google_percent_change_df

# google: calc avg % change (multiple mobility types)
# mean(axis=1) averages over the categories actually present in a row. The previous
# sum / column-count counted every missing category as 0, pulling the average toward 0;
# rows with no category at all are now NaN instead of 0.

google_df["requests"] = google_percent_change_df.mean(axis=1)

In [None]:
# keep only apple's country-level rows
apple_geo_type = "country/region"
apple_df = apple_df.loc[apple_df["geo_type"] == apple_geo_type]
apple_df

In [None]:
# apple: calc avg requests (multiple transportation types)
# one row per (region, date) holding the mean over all rows for that pair

apple_avg_df = apple_df.groupby(["region", "date"], as_index=False)["requests"].mean()
apple_avg_df

In [None]:
# country used for the single-country example frames and plots in the following cells
country = "Switzerland"

In [None]:
# apple's averaged series for the example country
apple_country_df = apple_avg_df[apple_avg_df["region"] == country]
apple_country_df

In [None]:
# explicit figure/axes with a title and axis labels so the plot stands on its own
fig, ax = plt.subplots()
ax.plot(apple_country_df.date, apple_country_df.requests)
ax.set(title=f"apple: {country}", xlabel="date", ylabel="requests (baseline = 100)");

In [None]:
# google rows for the example country with no sub-region and no metro area set
google_country_df = google_df.loc[
    (
        google_df.sub_region_1.isnull()
        & (google_df.metro_area.isnull())
        & (google_df.country_region == country)
    )
]

# explicit figure/axes with a title and axis labels so the plot stands on its own
fig, ax = plt.subplots()
ax.plot(google_country_df.date, google_country_df.requests)
ax.set(title=f"google: {country}", xlabel="date", ylabel="requests (baseline = 100)");

In [None]:
# (inner) join apple & google df
# l and r are tidy
# for each l, r df: #rows <= unique #dates * unique #regions

# google rows that carry neither a sub-region nor a metro area
is_country_level = google_df["sub_region_1"].isna() & google_df["metro_area"].isna()

l = apple_avg_df
r = google_df[is_country_level]
j = pd.merge(
    l,
    r,
    how="inner",
    left_on=["date", "region"],
    right_on=["date", "country_region"],
)
j

In [None]:
# test

# neither side has more rows than (distinct dates) x (distinct countries)
assert len(l) <= l["date"].nunique(dropna=False) * l["region"].nunique(dropna=False)
assert len(r) <= r["date"].nunique(dropna=False) * r["country_region"].nunique(dropna=False)

# every (date, country) pair occurs exactly once on each side
assert (l.groupby(["date", "region"]).size() == 1).all()
assert (r.groupby(["date", "country_region"]).size() == 1).all()

In [None]:
# visualize data
# apple (requests_x) and google (requests_y) series of the example country on one axis

f, a = plt.subplots()
df = j.loc[j["region"] == country]
for column, source in (("requests_x", "apple"), ("requests_y", "google")):
    a.plot(df["date"], df[column], label=source)
a.set_title(country)
a.legend()

In [None]:
# keep only the columns needed to compare the two sources
comparison_columns = ["date", "region", "requests_x", "requests_y"]
df = j.filter(items=comparison_columns)
df

In [None]:
# (number of rows, number of distinct regions)
(df.shape[0], df["region"].nunique())

In [None]:
# info() prints column dtypes and non-null counts; describe() is the cell's displayed result
df.info()
df.describe()

In [None]:
# spread of the row counts per date and per region (0 would mean perfectly balanced)
(df["date"].value_counts().std(), df["region"].value_counts().std())

In [None]:
# apple vs google values as a scatter plot, then both distributions as overlaid histograms
df.plot(kind="scatter", x="requests_x", y="requests_y")
df.filter(items=["requests_x", "requests_y"]).plot(kind="hist", alpha=0.5)

In [None]:
# drop every row that has a missing value in any column
df = df.dropna(axis="index", how="any")

In [None]:
# both request columns as a plain array; it must not contain NaN at this point
m = df[["requests_x", "requests_y"]].to_numpy(copy=True)
# m = m[~np.isnan(m).any(axis=1)]
assert not np.any(np.isnan(m))

In [None]:
# pivot: 1 country per row, dates are columns

# snapshot of the apple/google columns before the combined column is added
apple_google_df = df.copy()

# combined value = mean of the apple and google values of each row
df["requests"] = df[["requests_x", "requests_y"]].mean(axis=1)
pivot = pd.pivot_table(df, index="region", columns=["date"], values=["requests"])
# the pivot has two column levels (value name, date); keep only the dates
pivot.columns = list(pivot.columns.get_level_values(1))

In [None]:
# the 10 countries with the lowest single-day value
pivot.min(axis="columns").nsmallest(n=10)

In [None]:
# the 10 countries with the highest single-day value
pivot.max(axis="columns").nlargest(n=10)

In [None]:
# countries ranked by the standard deviation of their series (rank 1 = least variable)
pivot.std(axis="columns").rank().sort_values()

In [None]:
# skewness of each country's series, ascending
pivot.skew(axis="columns").sort_values()

In [None]:
# kurtosis of each country's series, ascending
pivot.kurt(axis="columns").sort_values()

In [None]:
# compare time series to baseline
# baseline frame: same shape and labels as pivot, every cell = 100

baseline_df = pd.DataFrame(100.0, index=pivot.index, columns=pivot.columns)

# mean deviation from the baseline per country, ascending
deviation_from_baseline = pivot - baseline_df
avg_pc = deviation_from_baseline.mean(axis=1).sort_values()
avg_pc

In [None]:
# clean na
# drop every date column in which at least one country has no value

pivot = pivot.dropna(axis="columns", how="any")

In [None]:
# reduce dimensions with pca, and
# cluster kmeans
#
# random_state pins both estimators so the cluster labels are reproducible from run
# to run; n_init=10 fixes the number of k-means restarts instead of relying on the
# default, which differs between scikit-learn versions.

pca = PCA(n_components=8, random_state=0)
kmeans = KMeans(n_clusters=4, n_init=10, random_state=0)
# label columns are dropped defensively in case an earlier run added them to pivot
reduced_data = pca.fit_transform(pivot.drop(columns=["labels", "ts_labels"], errors="ignore"))
kmeans.fit(reduced_data)

# negative within-cluster sum of squared distances (closer to 0 = tighter clusters)
kmeans.score(reduced_data)

In [None]:
# placeholder frame with one zero per country (the following cell rebinds _labels_df to a copy of pivot)
_labels_df = pd.DataFrame(np.zeros(len(pivot)))

In [None]:
# independent copy of pivot that will receive the cluster-label columns
_labels_df = pivot.copy(deep=True)

In [None]:
# make sure pivot itself carries no label columns (a no-op when they are absent)
label_columns = ["labels", "ts_labels"]
pivot = pivot.drop(columns=label_columns, errors="ignore")

In [None]:
# attach the k-means cluster id of each row; assumes _labels_df has the same row order as the data kmeans was fitted on
_labels_df["labels"] = kmeans.labels_

In [None]:
# one column per k-means cluster listing its member countries (shorter columns are
# padded with NaN). Built in a single constructor call instead of growing a frame
# with concat inside a loop, which starts from an empty frame and re-copies on
# every iteration.
labels_df = pd.DataFrame(
    {
        label: pd.Series(_labels_df.index[(_labels_df["labels"] == label).to_numpy()].tolist())
        for label in _labels_df["labels"].unique()
    }
)

labels_df

In [None]:
# visualize clusters: time series -> 2d
# every dot in the plot is a country
# color = label
#
# random_state makes the embedding (and therefore the picture) reproducible

tsne = sk.manifold.TSNE(random_state=0)
tsne_model = tsne.fit_transform(pivot)
tsne_model_df = pd.DataFrame(tsne_model, index=pivot.index)

fig, ax = plt.subplots()
ax.scatter(tsne_model_df.iloc[:, 0], tsne_model_df.iloc[:, 1], c=kmeans.labels_)
ax.set(title="t-SNE embedding, colored by k-means cluster", xlabel="t-SNE 1", ylabel="t-SNE 2");

In [None]:
# cluster time series kmeans
# (dynamic time warping as the distance between two series)
#
# random_state makes the resulting cluster labels reproducible from run to run

ts_data = tslearn.utils.to_time_series_dataset(pivot)
ts_kmeans = tslearn.clustering.TimeSeriesKMeans(metric="dtw", random_state=0)
ts_kmeans.fit(ts_data)

In [None]:
# attach the time-series k-means cluster id of each row; assumes _labels_df has the same row order as pivot
_labels_df["ts_labels"] = ts_kmeans.labels_

In [None]:
# one column per time-series cluster listing its member countries (shorter columns
# are padded with NaN). Built in a single constructor call instead of growing a
# frame with concat inside a loop, which starts from an empty frame and re-copies
# on every iteration.
ts_labels_df = pd.DataFrame(
    {
        label: pd.Series(_labels_df.index[(_labels_df["ts_labels"] == label).to_numpy()].tolist())
        for label in _labels_df["ts_labels"].unique()
    }
)

ts_labels_df

In [None]:
# geo map - choropleth
# using data from time series kmeans
#
# Cluster ids are categories, not quantities: passing them as strings makes plotly
# use one discrete color per cluster instead of a continuous color scale.

px.choropleth(
    pivot,
    locationmode="country names",
    locations=pivot.index,
    color=ts_kmeans.labels_.astype(str),
)

In [None]:
# set index to (country region code . flag)

# lookup table: country name -> country code (as found in google's data)
country_codes = google_df[["country_region_code", "country_region"]]
country_region_df = country_codes.set_index("country_region").drop_duplicates()

# relabel a copy of pivot as "<code> <flag emoji>"
_pivot = pivot.copy()
codes = pivot.index.map(country_region_df.country_region_code)
_pivot.index = codes + " " + codes.map(flag)
_pivot

In [None]:
# average percent change of mobility requests in 2020
#
# Plot from a frame built out of avg_pc itself. avg_pc is sorted by value, so its
# row order differs from pivot's; handing pivot to plotly together with
# avg_pc-ordered arrays only worked because the arrays were matched by position.

avg_pc_df = avg_pc.rename_axis("country").rename("avg_percent_change").reset_index()
f = px.choropleth(
    avg_pc_df,
    locationmode="country names",
    locations="country",
    color="avg_percent_change",
)
f

In [None]:
# save as pdf/eps/svg

# pio.write_image(f, "img.svg", width=1000, height=600)

In [None]:
# one horizontal box per country, drawn on an explicitly created 12x12 axes
fig, ax = plt.subplots(figsize=(12, 12))
pivot.T.boxplot(
    ax=ax,
    vert=False,
    boxprops={"color": "black"},
    flierprops={"alpha": 0.2},
    showcaps=False,
)

In [None]:
# get geojson country polygons data
# (downloaded over the network; getGeo reads the result through gj["features"])

gj = pd.read_json("https://datahub.io/core/geo-countries/r/countries.geojson")

In [None]:
# back to long form: one row per (date, country) with the combined value
pivot_by_date = pivot.T
pivot_df = pd.melt(
    pivot_by_date.assign(date=pivot_by_date.index),
    id_vars=["date"],
    var_name="geo",
    value_name="requests",
)

# memory+

# pivot_df.geo = pivot_df.geo.map(functools.partial(getGeo, gj))

In [None]:
# pivot_df.to_csv("geo_df.csv")

In [None]:
# covariance apple & google mobility data
# standardize both columns (zero mean, unit variance) before estimating the covariance

scaler = StandardScaler()
apple_google_array = scaler.fit_transform(apple_google_df[["requests_x", "requests_y"]])

In [None]:
# empirical covariance of the scaled apple / google columns
# (sk.covariance is only available once the sklearn.covariance submodule has been
# imported; the import cell now does that explicitly)
covariance_model = sk.covariance.EmpiricalCovariance()
covariance_model.fit(apple_google_array)

source_names = ["apple", "google"]
covariance_df = pd.DataFrame(
    covariance_model.covariance_,
    index=source_names,
    columns=source_names,
)
covariance_df