## Interpretability & Insights

In [6]:
import pandas as pd
import numpy as np

from pytorch_tabnet.tab_model import TabNetRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.preprocessing import LabelEncoder


#### Data Loading

In [2]:
df = pd.read_csv('../data/modelingData/modelingDataFrame.csv')

### TabNet - Interpretability

In [4]:
event_names = ['Astronomical Low Tide', 'Extreme Cold/Wind Chill', 'Flood','Winter Weather', 
               'Wildfire', 'Heavy Rain', 'Cold/Wind Chill', 'Dense Fog', 'Frost/Freeze', 'Strong Wind',
               'Lake-Effect Snow', 'Funnel Cloud', 'Flash Flood', 'Heavy Snow', 'Ice Storm', 
               'Thunderstorm Wind', 'Avalanche', 'Excessive Heat', 'Coastal Flood', 'Storm Surge/Tide', 
               'Sleet', 'Debris Flow', 'Winter Storm', 'Tropical Storm', 'Dust Storm', 'Drought', 
               'Blizzard', 'Lightning', 'Tornado', 'Hail', 'Rip Current', 'Heat', 'Freezing Fog', 
               'High Surf', 'High Wind']

In [7]:
df = df[df['ValidDataFlag'] == 1]
df = df[~df['Year'].isin([2015, 2016, 2017])]

zero_percentages = {}

for col in event_names:
    if col in df.columns:
        zero_count = (df[col] == 0).sum()
        total_count = len(df)
        zero_percentage = (zero_count / total_count) * 100
        zero_percentages[col] = zero_percentage

zero_percentages_df = pd.DataFrame.from_dict(zero_percentages, orient='index', columns=['%_zero'])
zero_percentages_df = zero_percentages_df.sort_values('%_zero', ascending=False)

selected_events_name = [
    col for col, perc in zero_percentages.items() if perc < 99.8
]

target = 'CustomersOut'

numeric_features = ['Tmin', 'Tmax', 'Tavg', 'Ppt', 'Lat', 'Lng']
categorical_features = ['Season', 'Region', 'Division', 'Month', 'StateName', 'CountyName']
event_features = [col for col in df.columns if col in selected_events_name]

X = df[numeric_features + categorical_features + event_features]
y = df[target]

X_encoded = X.copy()
for col in categorical_features:
    le = LabelEncoder()
    X_encoded[col] = le.fit_transform(X_encoded[col].astype(str))


X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42
)

# później podmień na lepszy model!

In [None]:
model_TabNet = TabNetRegressor()
model_TabNet.load_model('../models/tabnet_model.zip')
preds = model_TabNet.predict(X_test.values)



In [13]:
explain_matrix, masks = model_TabNet.explain(X_test.values)
feature_importance = np.mean(masks[0], axis=0)
feature_importance_df = pd.DataFrame({
    'feature': X_test.columns,
    'importance': feature_importance
}).sort_values('importance', ascending=False)

display(feature_importance_df[feature_importance_df['importance'] > 0.001])

Unnamed: 0,feature,importance
22,Tornado,0.905317
3,Ppt,0.093306
11,CountyName,0.001377


##### RMSE analysis per State

In [22]:
state_le = LabelEncoder()
state_le.fit(X['StateName'].astype(str))

state_names = state_le.inverse_transform(X_test['StateName'].values)

In [23]:
y_pred = model_TabNet.predict(X_test.values)

results_df = pd.DataFrame({
    'state': state_names,
    'y_true': y_test.values.flatten(),
    'y_pred': y_pred.flatten()
})

state_rmse = results_df.groupby('state').apply(lambda x: mean_squared_error(x['y_true'], x['y_pred'], squared=False)).reset_index()
state_rmse.columns = ['state', 'rmse']

display(state_rmse.sort_values(by='rmse').reset_index(drop=True))

Unnamed: 0,state,rmse
0,Wyoming,88.786033
1,South Dakota,114.264517
2,North Dakota,137.61744
3,New Mexico,163.164645
4,Montana,243.634474
5,Hawaii,283.975292
6,Minnesota,344.31013
7,Indiana,363.124483
8,Kansas,399.198023
9,Nevada,418.368477


In [70]:
state_le = LabelEncoder()
state_le.fit(X['CountyName'].astype(str))

county_names = state_le.inverse_transform(X_test['CountyName'].values)

y_pred = model_TabNet.predict(X_test.values)

results_df = pd.DataFrame({
    'county': county_names,
    'y_true': y_test.values.flatten(),
    'y_pred': y_pred.flatten()
})

county_rmse = results_df.groupby('county').apply(lambda x: mean_squared_error(x['y_true'], x['y_pred'], squared=False)).reset_index()
county_rmse.columns = ['county', 'rmse']

display(county_rmse.sort_values(by='rmse').reset_index(drop=True))

Unnamed: 0,county,rmse
0,Geary,50.122195
1,Moniteau,51.503167
2,Oglethorpe,51.520566
3,Rawlins,52.574169
4,Nolan,53.771529
...,...,...
1087,Arecibo,12400.186847
1088,Bexar,12403.481472
1089,Oklahoma,16117.510213
1090,Mayagüez,25013.326898


In [71]:
fips_lookup = df[['CountyName', 'Fips']].drop_duplicates()
county_rmse = county_rmse.merge(fips_lookup, left_on='county', right_on='CountyName', how='right')
county_rmse = county_rmse.drop(columns=['CountyName'])
display(county_rmse.head())

Unnamed: 0,county,rmse,Fips
0,Autauga,315.463842,1001
1,Baldwin,520.425824,1003
2,Blount,248.248212,1009
3,Butler,284.743544,1013
4,Calhoun,1185.539954,1015


In [56]:
uscounties = pd.read_csv('../data/population_data/uscounties.csv')

In [72]:
fips_lookup = uscounties[['county', 'county_fips', 'state_name', 'state_id']].drop_duplicates()
fips_lookup.shape

(3144, 4)

In [73]:
full_data = fips_lookup.merge(county_rmse,
                              left_on="county_fips", 
                              right_on="Fips",
                              how="left")

In [74]:
full_data

Unnamed: 0,county_x,county_fips,state_name,state_id,county_y,rmse,Fips
0,Los Angeles,6037,California,CA,Los angeles,2504.471975,6037.0
1,Cook,17031,Illinois,IL,Cook,1736.626155,17031.0
2,Harris,48201,Texas,TX,Harris,2390.180852,48201.0
3,Maricopa,4013,Arizona,AZ,Maricopa,1327.823272,4013.0
4,San Diego,6073,California,CA,San diego,1267.217889,6073.0
...,...,...,...,...,...,...,...
3139,Blaine,31009,Nebraska,NE,,,
3140,King,48269,Texas,TX,,,
3141,Loving,48301,Texas,TX,Loving,55.407063,48301.0
3142,Kenedy,48261,Texas,TX,,,


In [75]:
full_data.drop(columns=['county_y', 'Fips'], inplace=True)
full_data['county_fips'] = full_data['county_fips'].astype(str).str.zfill(5)
full_data = full_data.rename(columns={'county_x': 'county_name'})

In [85]:
fig = px.choropleth(full_data, 
                    geojson="https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json",
                    locations='county_fips', color='rmse',
                    color_continuous_scale="OrRd",
                    title=f"RMSE by County",
                    labels={"rmse": "RMSE"},
                    range_color=[0, full_data['rmse'].max()])

fig.update_geos(
    scope="usa",
    visible=False,
    fitbounds="locations"
)

fig.update_traces(marker_line_width=0.2, selector=dict(type='choropleth'))
fig.show()