``Mission : Error Analysis & Model metadata``

- Analyize past submissions and find out what went wrong! 
- One great heuristic is all I need now!


# Data and dependencies

In [259]:
import warnings
import sys
import os
import time
import joblib
import random
from tqdm import tqdm
from pprint import pprint

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import cv2

from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder

from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

warnings.filterwarnings('ignore')

In [260]:
metadata = pd.read_csv('../data/metadata.csv')
sub_format = pd.read_csv('../data/submission_format.csv')
train_labels = pd.read_csv('../data/train_labels.csv')

In [261]:
def rmse(y_true, y_pred):
    return mse(y_true, y_pred, squared=False)

# Add date fts

In [262]:
metadata.date = pd.to_datetime(metadata.date)
metadata['year'] = metadata.date.dt.year
metadata['month'] = metadata.date.dt.month
metadata['week'] = metadata.date.dt.isocalendar().week


seasons = {
    1: 1,
    2: 1,
    3: 2,
    4: 2,
    5: 2,
    6: 3,
    7: 3,
    8: 3,
    9: 4,
    10: 4,
    11: 4,
    12: 1
}

metadata['season'] = metadata.month.map(seasons)


region = pd.concat((train_labels, sub_format[['region', 'uid']]), axis=0)

data = pd.merge(metadata, region, on='uid', how='left')
data

Unnamed: 0,uid,latitude,longitude,date,split,year,month,week,season,region,severity,density
0,aabm,39.080319,-86.430867,2018-05-14,train,2018,5,20,2,midwest,1.0,585.0
1,aabn,36.559700,-121.510000,2016-08-31,test,2016,8,35,3,west,,
2,aacd,35.875083,-78.878434,2020-11-19,train,2020,11,47,4,south,1.0,290.0
3,aaee,35.487000,-79.062133,2016-08-24,train,2016,8,34,3,south,1.0,1614.0
4,aaff,38.049471,-99.827001,2019-07-23,train,2019,7,30,3,midwest,3.0,111825.0
...,...,...,...,...,...,...,...,...,...,...,...,...
23565,zzvv,36.708500,-121.749000,2014-12-02,test,2014,12,49,1,west,,
23566,zzwo,39.792190,-99.971050,2017-06-19,train,2017,6,25,3,midwest,2.0,48510.0
23567,zzwq,35.794000,-79.012551,2015-03-24,train,2015,3,13,2,south,1.0,1271.0
23568,zzyb,35.742000,-79.238600,2016-11-21,train,2016,11,47,4,south,1.0,9682.0


In [263]:
train_data = data[data.split == 'train']
train_data.shape, data.shape

((17060, 12), (23570, 12))

### Expanding sev

In [264]:
# exapnding sev by rs  @ 0.859434 vs 0.8798
ex_sev_rs = pd.read_csv('../submissions/expanding_sev_rs_preds.csv')
ex_sev_rs
ex_sev_rs.severity.value_counts(normalize=True)

2    0.463902
4    0.354531
1    0.169892
3    0.011674
Name: severity, dtype: float64

In [265]:
avg_sev_r = pd.read_csv('../submissions/avg_severity_by_region.csv')
avg_sev_r # shouldn't be used
avg_sev_r.severity.value_counts(normalize=True)


2    0.633333
4    0.366667
Name: severity, dtype: float64

In [266]:
#  ex sev by regweek reg ~ ex sev by reg  @ 0.8971
ex_sev_regweek_reg = pd.read_csv('../submissions/expndng_sev_by_rw_exreg_preds.csv')
ex_sev_regweek_reg
ex_sev_regweek_reg.severity.value_counts(normalize=True)

2    0.536098
4    0.367127
1    0.096774
Name: severity, dtype: float64

In [267]:
ex_sev_reg = pd.read_csv('../submissions/expndng_sev_by_reg_preds.csv')
ex_sev_reg.severity.value_counts(normalize=True)

2    0.536098
4    0.367127
1    0.096774
Name: severity, dtype: float64

In [268]:
#  holy heck both are same lol...

In [269]:
xgbreg_preds = pd.read_csv('../submissions/xgbreg_cvpreds_on_expanding_sev_rs_preds.csv')
xgbreg_preds.severity.value_counts(normalize=True)

2    0.410445
4    0.311674
1    0.151613
3    0.125960
5    0.000307
Name: severity, dtype: float64

In [270]:
xgbreg_preds = pd.read_csv('../submissions/modified_xgbreg_cvpreds_on_expanding_sev_rs_preds.csv')
xgbreg_preds.severity.value_counts(normalize=True)

1    0.410445
4    0.311674
2    0.151613
3    0.125960
5    0.000307
Name: severity, dtype: float64

In [271]:
#  all ones @ 1.6592 vs 1.6387

all_ones = np.full(train_labels.shape[0], 1)
rmse(train_labels.severity, all_ones)

1.6592135031404238

In [272]:
pow(1.6592135031404238, 2)
# mse 2.7529894490035174

2.7529894490035174

In [337]:
all_ones_but_few = all_ones.copy()
all_ones_but_few[:10] = 2
rmse(train_labels.severity, all_ones_but_few)

1.659178174792949

In [274]:
all_twos = np.full(train_labels.shape[0], 2)
rmse(train_labels.severity, all_twos)

1.2087508206024726

In [275]:
all_twos_but_few = all_twos.copy()
all_twos_but_few[:10] = 5
rmse(train_labels.severity, all_twos_but_few)

1.2115118010484747

## Is 0.76 possible with avgs?

- and wt. or latest picking heuristics is better than avg

In [276]:
grp_by_region = train_data.groupby('region').severity.expanding(1).mean()
grp_by_region = grp_by_region.map(np.round)
print(grp_by_region.isna().sum())
mse(train_data.severity.sort_index(), grp_by_region.droplevel(0).sort_index()[:train_data.shape[0]], squared=False)

0


0.8939027540207913

In [279]:
#  avg sev by region @ 0.89475/0.8976
grp_by_region = train_data.groupby('region').severity.mean()
grp_by_region = grp_by_region.map(np.round)
print(grp_by_region)
print(grp_by_region.isna().sum())

print("RMSE:", rmse(train_data.severity, grp_by_region[train_data.region]))

region
midwest      2.0
northeast    2.0
south        2.0
west         4.0
Name: severity, dtype: float64
0
RMSE: 0.894754808017972


In [283]:
grp_by_month = train_data.groupby('month').severity.expanding(1).mean()
grp_by_month = grp_by_month.map(np.round)     
print(grp_by_month.isna().sum())    # 8 missing values  1.202
mse(train_data.severity.sort_index(), grp_by_month.droplevel(0).sort_index()[:train_data.shape[0]], squared=False)

0


1.2023326292941634

In [285]:
#  avg sev by month @ 1.204
grp_by_month = data.groupby('month').severity.mean()
grp_by_month = grp_by_month.map(np.round)     
print(grp_by_month.isna().sum())  
print(grp_by_month)

print("RMSE:", rmse(train_data.severity, grp_by_month[train_data.month]))

0
month
1     3.0
2     3.0
3     2.0
4     2.0
5     2.0
6     2.0
7     2.0
8     2.0
9     2.0
10    2.0
11    2.0
12    2.0
Name: severity, dtype: float64
RMSE: 1.2048164380819388


In [287]:
# avg sev by season @ 1.208
grp_by_season = data.groupby('season').severity.mean()
grp_by_season = grp_by_season.map(np.round)
print(grp_by_season.isna().sum())

print("RMSE:", rmse(train_data.severity, grp_by_season[train_data.season]))

0
RMSE: 1.2087508206024726


In [290]:
grp_by_yr = train_data.groupby('year').severity.mean()
grp_by_yr = grp_by_yr.map(np.round)
print(grp_by_yr.isna().sum())

print("RMSE:", rmse(train_data.severity, grp_by_yr[train_data.year]))

0
RMSE: 1.2087508206024726


In [317]:
grp_by_rm = train_data.groupby(['region', 'month']).severity.expanding(1).mean()
grp_by_rm = grp_by_rm.map(np.round)
print(grp_by_rm.isna().sum())  # 30 missing values 0.8522090458996826
print(grp_by_rm)

mse(train_data.severity.sort_index(), grp_by_rm.droplevel(0).droplevel(0).sort_index()[:train_data.shape[0]], squared=False)

0
region   month       
midwest  1      4387     1.0
                5317     1.0
                5566     1.0
                6144     1.0
                13644    1.0
                        ... 
west     12     22176    4.0
                22363    4.0
                22467    4.0
                22572    4.0
                23269    4.0
Name: severity, Length: 17060, dtype: float64


0.8522090458996826

In [318]:
ex_rm = grp_by_rm.droplevel(0).droplevel(0).sort_index()[:train_data.shape[0]]
ex_rm

0        1.0
2        1.0
3        1.0
4        3.0
5        4.0
        ... 
23562    2.0
23564    2.0
23566    2.0
23567    1.0
23568    2.0
Name: severity, Length: 17060, dtype: float64

In [319]:
rmse(train_data.severity.sort_index(), ex_rm)

0.8522090458996826

In [320]:
grp_by_rm = train_data.groupby(['region', 'month']).severity.mean()
grp_by_rm = grp_by_rm.map(np.round)
print(grp_by_rm.isna().sum())  # only 1 msising

# print(grp_by_rm)
avg_rm = grp_by_rm[train_data.region.sort_index()][train_data.month.sort_index()]

print("RMSE:", rmse(train_data.severity.sort_index(), avg_rm))

0
RMSE: 1.4458981072190233


In [321]:
rmse(ex_rm, avg_rm)

1.2369838142124905

In [322]:
avg_rm.value_counts(normalize=True)

2.0    0.472802
3.0    0.353458
1.0    0.173740
Name: severity, dtype: float64

In [323]:
ex_rm.value_counts(normalize=True)

2.0    0.526964
1.0    0.225557
4.0    0.199414
3.0    0.048066
Name: severity, dtype: float64

In [304]:
data.region.value_counts(normalize=True)

south        0.486169
west         0.261179
midwest      0.159737
northeast    0.092915
Name: region, dtype: float64

In [305]:
sub_format.region.value_counts(normalize=True)

west         0.366667
midwest      0.240399
south        0.232104
northeast    0.160829
Name: region, dtype: float64

In [225]:
train_data.region.value_counts(normalize=True)

south        0.583118
west         0.220926
midwest      0.128957
northeast    0.066999
Name: region, dtype: float64

In [313]:
grp_by_ry = train_data.groupby(['region', 'year']).severity.expanding(1).mean()
grp_by_ry = grp_by_ry.map(np.round)
print(grp_by_ry.isna().sum())  # 23 missing values 0.8780573783482186
print(grp_by_ry)

ex_ry = grp_by_ry.droplevel(0).droplevel(0).sort_index()[:train_data.shape[0]]

mse(train_data.severity.sort_index(), ex_ry, squared=False)

0
region   year       
midwest  2013  89       1.0
               110      1.0
               198      1.0
               330      1.0
               368      1.0
                       ... 
west     2021  23072    4.0
               23343    4.0
               23358    4.0
               23367    4.0
               23454    4.0
Name: severity, Length: 17060, dtype: float64


0.8780573783482186

In [314]:
grp_by_ry = data.groupby(['region', 'year']).severity.mean()
grp_by_ry = grp_by_ry.map(np.round)
print(grp_by_ry.isna().sum())  # 0 missing values 
# print(grp_by_ry)

avg_ry = grp_by_ry[train_data.region.sort_index()][train_data.year.sort_index()]

rmse(train_data.severity.sort_index(), avg_ry)

0


1.875928843790099

In [311]:
avg_ry.value_counts(normalize=True)

4.0    0.597245
2.0    0.302755
3.0    0.100000
Name: severity, dtype: float64

In [315]:
ex_ry.value_counts(normalize=True)

2.0    0.669109
4.0    0.192028
1.0    0.091970
3.0    0.046835
5.0    0.000059
Name: severity, dtype: float64

In [325]:
grp_by_rs = train_data.groupby(['region', 'season']).severity.expanding(1).mean()
grp_by_rs = grp_by_rs.map(np.round)
print(grp_by_rs.isna().sum()) # 13     0.8584795305291978

mse(train_data.severity.sort_index(), grp_by_rs.droplevel(0).droplevel(0).sort_index()[:train_data.shape[0]], squared=False)

0


0.8584795305291978

In [334]:
grp_by_rs = train_data.groupby(['region', 'season']).severity.mean()
grp_by_rs = grp_by_rs.map(np.round)
print(grp_by_rs.isna().sum())
    
print(grp_by_rs)

rmse(train_data.severity.sort_index(), grp_by_rs[train_data.region][train_data.season])

0
region     season
midwest    1         1.0
           2         1.0
           3         2.0
           4         2.0
northeast  1         2.0
           2         1.0
           3         2.0
           4         2.0
south      1         1.0
           2         1.0
           3         2.0
           4         2.0
west       1         4.0
           2         4.0
           3         4.0
           4         4.0
Name: severity, dtype: float64


1.3952061619942009

In [344]:
grp_by_rs = train_data.groupby(['region', 'season']).severity.mean()
grp_by_rs = grp_by_rs.map(np.round)
print(grp_by_rs.isna().sum())

print(grp_by_rs)

rmse(train_data.severity.sort_index(), grp_by_rs[train_data.region][train_data.season])

0
region     season
midwest    1         1.0
           2         1.0
           3         2.0
           4         2.0
northeast  1         2.0
           2         1.0
           3         2.0
           4         2.0
south      1         1.0
           2         1.0
           3         2.0
           4         2.0
west       1         4.0
           2         4.0
           3         4.0
           4         4.0
Name: severity, dtype: float64


1.3952061619942009

In [422]:
grp_by_rd = train_data.groupby(['region', 'date']).severity.mean()
grp_by_rd = grp_by_rd.map(np.round)
print(grp_by_rd.isna().sum()) # 970 missing values 
  

0


In [423]:
from tqdm import tqdm


grp_by_rd_preds = []
for row in tqdm(train_data.sort_index().itertuples(), total=train_data.shape[0]):
    grp_by_rd_preds.append(grp_by_rd[row.region][row.date])

100%|██████████| 17060/17060 [00:04<00:00, 3867.88it/s]


In [424]:
rmse(train_data.severity.sort_index(), grp_by_rd_preds)

0.7077696421534655

# Sooo....

- Not satisfied with the avg severity score.

# Todos :
- Try ex and avg on density and see how it unfoldds?
- Think of custom heurstic to predict severity (try with both sevrity and density)
