In [None]:
import numpy as np
import pandas as pd
import geopandas as gpd

from matplotlib import cm
import matplotlib.pyplot as plt
from models import estimate_knn_clusters, estimate_som_clusters, prepare_whole_year, prepare_TS
from utils.utils import generate_filename
from tslearn.clustering import TimeSeriesKMeans, silhouette_score
from IPython.display import clear_output
from tslearn.barycenters import dtw_barycenter_averaging

from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.svm import SVC


import plotly.express as px

from plotly.colors import qualitative
from matplotlib.colors import ListedColormap

import os

# Security: never commit the Mapbox token to the notebook. It is read from the
# MAPBOX_ACCESS_TOKEN environment variable instead. The maps in this notebook
# use the 'carto-positron' style, so the token is only registered when present.
# NOTE(review): the token that used to be hard-coded here should be revoked.
mapbox_token = os.environ.get("MAPBOX_ACCESS_TOKEN")
if mapbox_token:
    px.set_mapbox_access_token(mapbox_token)

In [None]:
# Run the project's SOM clustering helper for 2019-01-30 on a 2x3 grid with
# plot=True (presumably renders the clusters — TODO confirm in models.py).
estimate_som_clusters(
    year=2019,
    month=1,
    day=30,
    som_x=2,
    som_y=3,
    sigma=1,
    learning_rate=0.5,
    plot=True,
)

In [None]:
# Load the commute-time counts and express every count column as a share of 'Total'.
df_comtime = pd.read_csv('data/census/commute_times/commute_times_zc_il.csv')

# Columns from position 2 onward are treated as the commute-time count columns
# (presumably the first two are 'zip' and 'Total' — TODO confirm against the CSV).
cols_comtimes = list(df_comtime.columns)[2:]
cols_comtimes_perc = []

# A zero 'Total' would give inf/NaN shares; mapping it to NaN makes every share
# of such a row NaN so that the dropna() below removes the row.
total_commuters = df_comtime['Total'].replace(0, np.nan)

for col in cols_comtimes:
    new_col = f'{col}_perc'
    df_comtime[new_col] = df_comtime[col] / total_commuters
    cols_comtimes_perc.append(new_col)

# Cluster assignment per area from the SOM helper (same parameters as the plotting call).
df_clusters = estimate_som_clusters(year=2019, month=1,  day=30, som_x=2, som_y=3, sigma=1, learning_rate = 0.5, df=True)
df_clusters['zip'] = df_clusters['GEOID20'].astype(int)

# Join clusters with commute shares on ZIP code and drop rows with missing values.
df_data = pd.merge(df_clusters[['zip', 'Cluster']], df_comtime[['zip', 'Total']+cols_comtimes_perc], on='zip').dropna()
# 'Cluster_n' is the last character of the cluster label, kept as a string.
df_data['Cluster_n'] = df_data['Cluster'].str[-1]

In [None]:
# Hyper-parameter grid for a gradient-boosting search (criterion, learning rate,
# number of estimators, tree depth).
gb_params = dict(
    criterion=('friedman_mse', 'squared_error'),
    learning_rate=[0.0001, 0.001, 0.01, 0.1],
    n_estimators=[20, 40, 50, 60, 80, 90, 100, 150, 200],
    max_depth=[3, 5, 7, 9, 11, 15],
)

In [None]:
# Features are the commute-time shares; the target is the cluster id string.
X, y = df_data[cols_comtimes_perc], df_data['Cluster_n']

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1234,
)

In [None]:
# Mirror each feature column under a positional name ('0', '1', ...) in df_data
# and keep the list of those names in cols_numeric.
cols_numeric = [str(position) for position in range(len(X.columns))]
for numeric_name, source_col in zip(cols_numeric, X.columns):
    df_data[numeric_name] = df_data[source_col]

In [None]:
# ZIP codes belonging to each side of the split, looked up by row index.
train_zips = df_data.loc[X_train.index, 'zip']
test_zips = df_data.loc[X_test.index, 'zip']

In [None]:
# Gradient-boosting classifier with fixed hyper-parameters and seed.
model_gb = GradientBoostingClassifier(
    criterion='squared_error',
    learning_rate=0.1,
    n_estimators=60,
    max_depth=3,
    random_state=1234,
)
model_gb.fit(X_train, y_train)

# Mean accuracy on the training split, then on the held-out split.
for split_X, split_y in ((X_train, y_train), (X_test, y_test)):
    print(model_gb.score(split_X, split_y))

In [None]:
# Random-forest classifier with fixed hyper-parameters and seed.
model_rf = RandomForestClassifier(
    criterion='gini',
    n_estimators=30,
    max_depth=7,
    random_state=1234,
)
model_rf.fit(X_train, y_train)

# Mean accuracy on the training split, then on the held-out split.
for split_X, split_y in ((X_train, y_train), (X_test, y_test)):
    print(model_rf.score(split_X, split_y))

In [None]:
# Side-by-side importance views for model_gb: the model's built-in
# feature_importances_ (left) and permutation importance on the test split (right).
fig, (ax_mdi, ax_perm) = plt.subplots(1, 2, figsize=(12, 6))
feature_labels = np.array(cols_comtimes_perc)

# Left panel: horizontal bars, ordered from least to most important.
feature_importance = model_gb.feature_importances_
mdi_order = np.argsort(feature_importance)
bar_pos = np.arange(mdi_order.shape[0]) + 0.5
ax_mdi.barh(bar_pos, feature_importance[mdi_order], align="center")
ax_mdi.set_yticks(bar_pos)
ax_mdi.set_yticklabels(feature_labels[mdi_order])
ax_mdi.set_title("Feature Importance (MDI)")

# Right panel: distribution over 10 shuffles per feature, ordered by mean importance.
result = permutation_importance(
    model_gb,
    X_test,
    y_test,
    n_repeats=10,
    random_state=42,
    n_jobs=2,
)
perm_order = result.importances_mean.argsort()
ax_perm.boxplot(
    result.importances[perm_order].T,
    vert=False,
    labels=feature_labels[perm_order],
)
ax_perm.set_title("Permutation Importance (test set)")

fig.tight_layout()
plt.show()

In [None]:
# Same two importance views as the previous figure, but the permutation
# importance here is computed on the TRAINING split.
feature_importance = model_gb.feature_importances_
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + 0.5
fig = plt.figure(figsize=(12, 6))

# Left panel: the model's built-in importances, least to most important.
plt.subplot(1, 2, 1)
plt.barh(pos, feature_importance[sorted_idx], align="center")
plt.yticks(pos, np.array(cols_comtimes_perc)[sorted_idx])
plt.title("Feature Importance (MDI)")

# Right panel: permutation importance over 10 shuffles per feature.
result = permutation_importance(
    model_gb, X_train, y_train, n_repeats=10, random_state=42, n_jobs=2
)
sorted_idx = result.importances_mean.argsort()
plt.subplot(1, 2, 2)
plt.boxplot(
    result.importances[sorted_idx].T,
    vert=False,
    labels=np.array(cols_comtimes_perc)[sorted_idx],
)
# Fix: the data passed above is X_train/y_train, so the title must say "train set".
plt.title("Permutation Importance (train set)")
fig.tight_layout()
plt.show()

In [None]:
# Predict a cluster for every row (train and test rows alike) with model_gb.
predicted_labels = model_gb.predict(df_data[cols_comtimes_perc])
df_data['Predicted'] = predicted_labels

In [None]:
# 1 where the prediction matches the true label, 0 otherwise.
# Fix: model_gb was trained on df_data['Cluster_n'] (the last character of
# 'Cluster'), so its predictions live in that label space. Comparing against the
# full 'Cluster' label would never match whenever the label is longer than one
# character.
df_data['correct'] = (df_data['Cluster_n'] == df_data['Predicted']).astype(int)

In [None]:
# Load the ZIP-code polygons and attach the modelling table to them.
gdf_zip = gpd.read_file('data/geo/Chicago_ZC.geojson')
gdf_zip['zip'] = gdf_zip['GEOID20'].astype(int)

# Inner join on the integer ZIP code, then order rows by cluster label.
zip_with_data = pd.merge(gdf_zip, df_data, on='zip')
gdf_data = zip_with_data.sort_values('Cluster')

In [None]:
# Choropleth of the true cluster label per area.
cluster_map_options = dict(
    featureidkey='properties.GEOID20',
    locations='GEOID20',
    width=1000,
    height=700,
    center={'lat': 41.6, 'lon': -88.99},
    zoom=6,
    mapbox_style='carto-positron',
    opacity=0.65,
)
fig = px.choropleth_mapbox(
    gdf_data,
    geojson=gdf_data,
    color="Cluster",
    **cluster_map_options,
)
# NOTE(review): update_geos targets `geo` subplots; this is a mapbox figure, so
# this call likely has no effect — confirm before relying on it.
fig.update_geos(fitbounds="locations", visible=False)
fig.show()

In [None]:
# Choropleth of the predicted cluster per area, with rows ordered by prediction.
gdf_by_prediction = gdf_data.sort_values('Predicted')
predicted_map_options = dict(
    featureidkey='properties.GEOID20',
    locations='GEOID20',
    width=1000,
    height=700,
    center={'lat': 41.6, 'lon': -88.99},
    zoom=6,
    mapbox_style='carto-positron',
    opacity=0.65,
)
fig = px.choropleth_mapbox(
    gdf_by_prediction,
    geojson=gdf_by_prediction,
    color="Predicted",
    **predicted_map_options,
)
# NOTE(review): update_geos targets `geo` subplots; this is a mapbox figure, so
# this call likely has no effect — confirm before relying on it.
fig.update_geos(fitbounds="locations", visible=False)
fig.show()

In [None]:
# Choropleth of the 'correct' flag, restricted to areas in the test split.
gdf_test_only = gdf_data[gdf_data.zip.isin(test_zips)]
correct_map_options = dict(
    featureidkey='properties.GEOID20',
    locations='GEOID20',
    width=1000,
    height=700,
    center={'lat': 41.6, 'lon': -88.99},
    zoom=6,
    mapbox_style='carto-positron',
    opacity=0.65,
)
fig = px.choropleth_mapbox(
    gdf_test_only,
    geojson=gdf_test_only,
    color="correct",
    **correct_map_options,
)
# NOTE(review): update_geos targets `geo` subplots; this is a mapbox figure, so
# this call likely has no effect — confirm before relying on it.
fig.update_geos(fitbounds="locations", visible=False)
fig.show()

In [None]:
# SHAP is imported here, mid-notebook, rather than in the top import cell.
import shap
# Load SHAP's JavaScript support into the notebook (used by its interactive plots).
shap.initjs()

In [None]:
# XGBoost and the label encoder are imported here, mid-notebook, rather than in
# the top import cell.
import xgboost
from sklearn.preprocessing import LabelEncoder
# Encoder used below to turn the string cluster labels into integer codes.
le = LabelEncoder()


In [None]:
# Features and target for the XGBoost model: commute-time shares vs. the full
# cluster label. (The earlier assignments from cols_numeric / 'Cluster_n' were
# overwritten immediately and have been removed.)
X = df_data[cols_comtimes_perc]
y = df_data['Cluster']

# Stratified 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1234, stratify=y)

# Fit the encoder on the training labels only and reuse that mapping for the
# test labels. Refitting on y_test could assign different integer codes if the
# test split is missing a class.
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

In [None]:
# Multi-class XGBoost classifier.
# Fix: 'multi:softmaxc' is not a valid XGBoost objective (typo). 'multi:softprob'
# is the documented multi-class objective that yields per-class probabilities.
bst = xgboost.XGBClassifier(
    n_estimators=49,
    max_depth=6,
    learning_rate=0.1,
    objective='multi:softprob',
)

In [None]:
# Fit on the training split and display the training-set score as the cell output.
bst.fit(X_train, y_train).score(X_train, y_train)

In [None]:
# Score of the fitted XGBoost model on the held-out split (displayed as cell output).
bst.score(X_test, y_test)

In [None]:
# Build a SHAP explainer for the fitted XGBoost model, with the training
# features passed as the second argument, then compute SHAP values for the
# training features.
explainer = shap.Explainer(bst, X_train)
# NOTE(review): later cells index shap_values[k] per class, which assumes a
# per-class sequence; the return layout depends on the shap version — confirm.
shap_values = explainer.shap_values(X_train)

In [None]:
# XGBoost's built-in feature-importance plot for the fitted model.
xgboost.plot_importance(bst)

In [None]:
# Six-colour matplotlib colormap, passed as `color` to the SHAP summary plot below.
plotly_cm = ListedColormap(
    colors=[
        '#EF553B',
        '#19D3F3',
        '#FFA15A',
        '#AB63FA',
        '#00CC96',
        '#636EFA',
    ],
    name='my_colormap_name',
)

In [None]:
# Display names for the features: '_perc' becomes 'min %' and ' to ' becomes '-'.
show_cols = [
    feature.replace('_perc', 'min %').replace(' to ', '-')
    for feature in X_train.columns
]

In [None]:
# Summary plot over all classes at once, using the custom colormap and the
# display feature names.
cluster_names = [f'Cluster {number}' for number in range(1, 7)]
shap.summary_plot(
    shap_values,
    X_train,
    class_names=cluster_names,
    color=plotly_cm,
    feature_names=show_cols,
)

In [None]:
# One SHAP summary plot per class. The six copy-pasted cells differed only in
# the class index, the title and the colormap, so they are driven from one table.
# NOTE(review): indexing shap_values[k] assumes a per-class sequence; the layout
# depends on the shap version — confirm.
per_class_plots = [
    (0, 'Cluster 1', "cool_r"),
    (1, 'Cluster 2', "Reds"),
    (2, 'Cluster 3', "Greens"),
    (3, 'Cluster 4', "Purples"),
    (4, 'Cluster 5', "Oranges"),
    (5, 'Cluster 6', "Blues"),
]

for class_idx, plot_title, cmap_name in per_class_plots:
    # show=False defers rendering so the title can be added before plt.show().
    shap.summary_plot(
        shap_values[class_idx],
        X_train.values,
        feature_names=show_cols,
        cmap=plt.get_cmap(cmap_name),
        show=False,
    )
    plt.title(plot_title, fontsize=20)
    plt.show()