``Mission : Error Analysis & Model metadata``

- Analyize past submissions and find out what went wrong! 
- One great heuristic is all I need now!


# Data and dependencies

In [259]:
import warnings
import sys
import os
import time
import joblib
import random
from tqdm import tqdm
from pprint import pprint

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import cv2

from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder

from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

warnings.filterwarnings('ignore')

In [260]:
metadata = pd.read_csv('../data/metadata.csv')
sub_format = pd.read_csv('../data/submission_format.csv')
train_labels = pd.read_csv('../data/train_labels.csv')

In [261]:
def rmse(y_true, y_pred):
    return mse(y_true, y_pred, squared=False)

In [45]:
def dens_to_sev(x: float)-> int:
    """takes in density value in cells/ml and returns severity category"""
    if (x < 20_000) : return 1
    elif (x >= 20_000) and (x < 100_000) : return 2
    elif (x >= 100_000) and (x < 1_000_000) : return 3
    elif (x >= 1_000_000) and (x < 10_000_000) : return 4
    elif x > 10_000_000 : return 5

# Add date fts

In [262]:
metadata.date = pd.to_datetime(metadata.date)
metadata['year'] = metadata.date.dt.year
metadata['month'] = metadata.date.dt.month
metadata['week'] = metadata.date.dt.isocalendar().week


seasons = {
    1: 1,
    2: 1,
    3: 2,
    4: 2,
    5: 2,
    6: 3,
    7: 3,
    8: 3,
    9: 4,
    10: 4,
    11: 4,
    12: 1
}

metadata['season'] = metadata.month.map(seasons)


region = pd.concat((train_labels, sub_format[['region', 'uid']]), axis=0)

data = pd.merge(metadata, region, on='uid', how='left')
data

Unnamed: 0,uid,latitude,longitude,date,split,year,month,week,season,region,severity,density
0,aabm,39.080319,-86.430867,2018-05-14,train,2018,5,20,2,midwest,1.0,585.0
1,aabn,36.559700,-121.510000,2016-08-31,test,2016,8,35,3,west,,
2,aacd,35.875083,-78.878434,2020-11-19,train,2020,11,47,4,south,1.0,290.0
3,aaee,35.487000,-79.062133,2016-08-24,train,2016,8,34,3,south,1.0,1614.0
4,aaff,38.049471,-99.827001,2019-07-23,train,2019,7,30,3,midwest,3.0,111825.0
...,...,...,...,...,...,...,...,...,...,...,...,...
23565,zzvv,36.708500,-121.749000,2014-12-02,test,2014,12,49,1,west,,
23566,zzwo,39.792190,-99.971050,2017-06-19,train,2017,6,25,3,midwest,2.0,48510.0
23567,zzwq,35.794000,-79.012551,2015-03-24,train,2015,3,13,2,south,1.0,1271.0
23568,zzyb,35.742000,-79.238600,2016-11-21,train,2016,11,47,4,south,1.0,9682.0


In [263]:
train_data = data[data.split == 'train']
train_data.shape, data.shape

((17060, 12), (23570, 12))

In [73]:
train_data.severity.mean()

2.1459554513481827

In [68]:
dens_to_sev(train_data.density.mean())

4

### Expanding density

In [75]:
# expanding density by reg
grp_by_region = train_data.groupby('region').density.expanding(1).mean()
grp_by_region = grp_by_region.map(dens_to_sev)
print(grp_by_region)
print(grp_by_region.isna().sum())  # 5

rmse(train_data.severity.sort_index(), grp_by_region.droplevel(0).sort_index()[:train_data.shape[0]])

region        
midwest  0        1
         4        2
         5        3
         13       3
         25       3
                 ..
west     23530    4
         23536    4
         23548    4
         23552    4
         23558    4
Name: density, Length: 17060, dtype: int64
0


1.1069957655083684

In [77]:
# expanding density by season
grp_by_season = train_data.groupby('season').density.expanding(1).mean()
grp_by_season = grp_by_season.map(dens_to_sev)
print(grp_by_season)
print(grp_by_season.isna().sum())  # 3

rmse(train_data.severity.sort_index(), grp_by_season.droplevel(0).sort_index()[:train_data.shape[0]])

season       
1       28       1
        36       4
        39       4
        42       4
        78       4
                ..
4       23543    4
        23546    4
        23554    4
        23555    4
        23568    4
Name: density, Length: 17060, dtype: int64
0


1.8149761252664516

In [66]:
grp_by_season.value_counts()

3    9472
4    7570
1      12
2       6
Name: density, dtype: int64

In [79]:
# expanding density by month
grp_by_month = train_data.groupby('month').density.expanding(1).mean()
grp_by_month = grp_by_month.map(dens_to_sev)
print(grp_by_month)
print(grp_by_month.isna().sum()) # 8

rmse(train_data.severity.sort_index(), grp_by_month.droplevel(0).sort_index()[:train_data.shape[0]])

month       
1      36       4
       39       4
       42       4
       106      4
       188      4
               ..
12     23281    4
       23365    4
       23425    4
       23481    4
       23550    4
Name: density, Length: 17060, dtype: int64
0


1.7001482591719168

In [64]:
grp_by_month.value_counts()

3    11698
4     5325
1       24
2       13
Name: density, dtype: int64

In [80]:
# expanding density by year
grp_by_year = train_data.groupby('year').density.expanding(1).mean()
grp_by_year = grp_by_year.map(dens_to_sev)
print(grp_by_year)
print(grp_by_year.isna().sum()) # 8

rmse(train_data.severity.sort_index(), grp_by_year.droplevel(0).sort_index()[:train_data.shape[0]])

year       
2013  10       2
      22       3
      27       3
      42       3
      55       4
              ..
2021  23503    4
      23510    4
      23529    4
      23546    4
      23561    4
Name: density, Length: 17060, dtype: int64
0


1.8505409367261816

In [84]:
# expanding density by region and month
grp_by_rm = train_data.groupby(['region', 'month']).density.expanding(1).mean()
grp_by_rm = grp_by_rm.map(dens_to_sev)
print(grp_by_rm.isna().sum())   # 30
print(grp_by_rm) 

mse(train_data.severity.sort_index(), grp_by_rm.droplevel(0).droplevel(0).sort_index()[:train_data.shape[0]], squared=False)

0
region   month       
midwest  1      4387     1
                5317     1
                5566     1
                6144     1
                13644    1
                        ..
west     12     22176    4
                22363    4
                22467    4
                22572    4
                23269    4
Name: density, Length: 17060, dtype: int64


1.107313426550492

In [86]:
# expanding density by region and season
grp_by_rs = train_data.groupby(['region', 'season']).density.expanding(1).mean()
grp_by_rs = grp_by_rs.map(dens_to_sev)
print(grp_by_rs.isna().sum())   # 30
print(grp_by_rs) 

mse(train_data.severity.sort_index(), grp_by_rs.droplevel(0).droplevel(0).sort_index()[:train_data.shape[0]], squared=False)

0
region   season       
midwest  1       815      3
                 3473     3
                 4387     3
                 5317     2
                 5566     2
                         ..
west     4       23437    4
                 23440    4
                 23500    4
                 23512    4
                 23521    4
Name: density, Length: 17060, dtype: int64


1.0950437207221835

In [89]:
# expanding density by region and year
grp_by_ry = train_data.groupby(['region', 'year']).density.expanding(1).mean()
grp_by_ry = grp_by_ry.map(dens_to_sev)
print(grp_by_ry.isna().sum())   # 23
print(grp_by_ry) 

mse(train_data.severity.sort_index(), grp_by_ry.droplevel(0).droplevel(0).sort_index()[:train_data.shape[0]], squared=False)

0
region   year       
midwest  2013  89       1
               110      1
               198      1
               330      1
               368      1
                       ..
west     2021  23072    4
               23343    4
               23358    4
               23367    4
               23454    4
Name: density, Length: 17060, dtype: int64


1.1178242557857554

In [90]:
# expanding density by month and year
grp_by_ym = train_data.groupby(['year', 'month']).density.expanding(1).mean()
grp_by_ym = grp_by_ym.map(dens_to_sev)
print(grp_by_ym.isna().sum())   # 23
print(grp_by_ym) 

mse(train_data.severity.sort_index(), grp_by_ym.droplevel(0).droplevel(0).sort_index()[:train_data.shape[0]], squared=False)

0
year  month       
2013  1      42       3
             188      3
             1126     4
             1178     4
             1989     4
                     ..
2021  12     20763    3
             21349    3
             22641    3
             23159    3
             23365    3
Name: density, Length: 17060, dtype: int64


1.726474767581316

In [96]:
# expanding density by region and week
grp_by_rw = train_data.groupby(['region', 'week']).density.expanding(1).mean()
grp_by_rw = grp_by_rw.map(dens_to_sev)
print(grp_by_rw.isna().sum())   # 254
print(grp_by_rw) 

mse(train_data.severity.sort_index(), grp_by_rw.droplevel(0).droplevel(0).sort_index()[:train_data.shape[0]], squared=False)

0
region   week       
midwest  1     4387     1
               5317     1
               5566     1
               6144     1
               13644    1
                       ..
west     52    9747     4
               13741    4
               16733    4
               20157    4
               20551    4
Name: density, Length: 17060, dtype: int64


1.0854737324853108

In [99]:
# expanding density by region and date
grp_by_rd = train_data.groupby(['region', 'date']).density.expanding(1).mean()
grp_by_rd = grp_by_rd.map(dens_to_sev)
print(grp_by_rd.isna().sum())   # 3816
print(grp_by_rd) 

mse(train_data.severity.sort_index(), grp_by_rd.droplevel(0).droplevel(0).sort_index()[:train_data.shape[0]], squared=False)

0
region   date             
midwest  2013-01-04  4387     1
                     5317     1
                     5566     1
                     6144     1
                     13644    1
                             ..
west     2021-10-26  18375    4
                     21479    4
         2021-10-28  19662    4
         2021-11-29  20822    4
         2021-12-13  5806     4
Name: density, Length: 17060, dtype: int64


0.8200305944597076

## Avg densities

- and wt. or latest picking heuristics is better than avg

In [111]:
#  avg density by region  
grp_by_region = train_data.groupby('region').density.mean()
grp_by_region = grp_by_region.map(dens_to_sev)
print(grp_by_region)
print(grp_by_region.isna().sum()) # 0

print("RMSE:", rmse(train_data.severity, grp_by_region[train_data.region]))

region
midwest      4
northeast    3
south        2
west         4
Name: density, dtype: int64
0
RMSE: 1.1442552367777556


In [109]:
#  avg density by month  
grp_by_month = train_data.groupby('month').density.mean()
grp_by_month = grp_by_month.map(dens_to_sev)
print(grp_by_month)
print(grp_by_month.isna().sum())  # 0

print("RMSE:", rmse(train_data.severity, grp_by_month[train_data.month]))

month
1     4
2     4
3     4
4     4
5     3
6     3
7     3
8     3
9     3
10    4
11    4
12    4
Name: density, dtype: int64
0
RMSE: 1.7213744882960893


In [107]:
#  avg density by year  
grp_by_year = train_data.groupby('year').density.mean()
grp_by_year = grp_by_year.map(dens_to_sev)
print(grp_by_year)
print(grp_by_year.isna().sum())  # 0

print("RMSE:", rmse(train_data.severity, grp_by_year[train_data.year]))

year
2013    3
2014    4
2015    4
2016    3
2017    3
2018    4
2019    4
2020    3
2021    4
Name: density, dtype: int64
0
RMSE: 1.8946614702243347


In [112]:
#  avg density by season  
grp_by_season = train_data.groupby('season').density.mean()
grp_by_season = grp_by_season.map(dens_to_sev)
print(grp_by_season)
print(grp_by_season.isna().sum())

print("RMSE:", rmse(train_data.severity, grp_by_season[train_data.season]))

season
1    4
2    4
3    3
4    4
Name: density, dtype: int64
0
RMSE: 1.8878273611136387


In [113]:
#  avg density by week  
grp_by_week = train_data.groupby('week').density.mean()
grp_by_week = grp_by_week.map(dens_to_sev)
print(grp_by_week)
print(grp_by_week.isna().sum())

print("RMSE:", rmse(train_data.severity, grp_by_week[train_data.week]))

week
1     4
2     4
3     4
4     4
5     3
6     4
7     4
8     4
9     4
10    4
11    4
12    4
13    3
14    3
15    4
16    4
17    3
18    3
19    3
20    4
21    3
22    3
23    3
24    4
25    3
26    3
27    3
28    4
29    4
30    3
31    3
32    3
33    4
34    3
35    3
36    3
37    4
38    3
39    3
40    3
41    4
42    3
43    4
44    4
45    3
46    3
47    4
48    3
49    4
50    3
51    4
52    4
Name: density, dtype: int64
0
RMSE: 1.7796887117984135


In [117]:
grp_by_rm = train_data.groupby(['region', 'month']).density.mean()
grp_by_rm = grp_by_rm.map(dens_to_sev)
print(grp_by_rm.isna().sum())  # 1 missing values 0.8522090458996826
print(grp_by_rm)

mse(train_data.severity.sort_index(), grp_by_rm[train_data.region.sort_index()][train_data.month.sort_index()], squared=False)

0
region     month
midwest    1        1
           2        2
           3        1
           4        1
           5        3
           6        4
           7        3
           8        3
           9        4
           10       4
           11       2
           12       3
northeast  3        1
           4        3
           5        4
           6        2
           7        3
           8        3
           9        3
           10       3
           11       2
           12       3
south      1        2
           2        2
           3        2
           4        2
           5        2
           6        2
           7        3
           8        3
           9        3
           10       3
           11       3
           12       2
west       1        4
           2        4
           3        4
           4        4
           5        4
           6        4
           7        4
           8        4
           9        4
           10       4
           11

1.8265657177389558

In [121]:
grp_by_rs = data.groupby(['region', 'season']).density.mean()
grp_by_rs = grp_by_rs.map(dens_to_sev)
print(grp_by_rs.isna().sum())  # 1 missing values 0.8522090458996826
print(grp_by_rs)

mse(train_data.severity.sort_index(), grp_by_rs[train_data.region.sort_index()][train_data.season.sort_index()], squared=False)

0
region     season
midwest    1         2
           2         3
           3         3
           4         4
northeast  1         3
           2         3
           3         3
           4         3
south      1         2
           2         2
           3         2
           4         3
west       1         4
           2         4
           3         4
           4         4
Name: density, dtype: int64


1.8264854880298358

In [123]:
grp_by_ry = data.groupby(['region', 'year']).density.mean()
grp_by_ry = grp_by_ry.map(dens_to_sev)
print(grp_by_ry.isna().sum())  # 1 missing values 0.8522090458996826
print(grp_by_ry)

mse(train_data.severity.sort_index(), grp_by_ry[train_data.region.sort_index()][train_data.year.sort_index()], squared=False)

0
region     year
midwest    2013    4
           2014    4
           2015    4
           2016    3
           2017    3
           2018    4
           2019    3
           2020    3
           2021    3
northeast  2013    2
           2014    2
           2015    3
           2016    3
           2017    3
           2018    3
           2019    3
           2020    3
           2021    3
south      2013    2
           2014    2
           2015    3
           2016    2
           2017    3
           2018    2
           2019    2
           2020    2
           2021    2
west       2013    4
           2014    4
           2015    4
           2016    4
           2017    4
           2018    4
           2019    4
           2020    4
           2021    4
Name: density, dtype: int64


1.946921699289753

In [132]:
grp_by_rm = train_data.groupby(['region', 'month']).density.mean()
grp_by_rm = grp_by_rm.map(dens_to_sev)
print(grp_by_rm.isna().sum())  # 1 missing values 0.8522090458996826
print(grp_by_rm)

mse(train_data.severity.sort_index(), grp_by_rm[train_data.region.sort_index()][train_data.month.sort_index()], squared=False)

0
region     month
midwest    1        1
           2        2
           3        1
           4        1
           5        3
           6        4
           7        3
           8        3
           9        4
           10       4
           11       2
           12       3
northeast  3        1
           4        3
           5        4
           6        2
           7        3
           8        3
           9        3
           10       3
           11       2
           12       3
south      1        2
           2        2
           3        2
           4        2
           5        2
           6        2
           7        3
           8        3
           9        3
           10       3
           11       3
           12       2
west       1        4
           2        4
           3        4
           4        4
           5        4
           6        4
           7        4
           8        4
           9        4
           10       4
           11

1.8265657177389558

In [137]:
grp_by_rd = train_data.groupby(['region', 'date']).density.mean()
grp_by_rd = grp_by_rd.map(dens_to_sev)
print(grp_by_rd.isna().sum())  # 970 missing values 0.8522090458996826
print(grp_by_rd)

# mse(train_data.severity.sort_index(), grp_by_rd[train_data.region.sort_index()][train_data.date.sort_index()], squared=False)

0
region   date      
midwest  2013-01-04    1
         2013-02-06    1
         2013-03-01    1
         2013-03-15    1
         2013-03-21    1
                      ..
west     2021-09-27    4
         2021-10-26    4
         2021-10-28    4
         2021-11-29    4
         2021-12-13    4
Name: density, Length: 2039, dtype: int64


In [138]:
from tqdm import tqdm


grp_by_rd_preds = []
for row in tqdm(train_data.sort_index().itertuples(), total=train_data.shape[0]):
    grp_by_rd_preds.append(grp_by_rd[row.region][row.date])

100%|██████████| 17060/17060 [00:03<00:00, 5662.79it/s]


In [139]:
rmse(train_data.severity.sort_index(), grp_by_rd_preds)
# holy heck!!

0.89380438783023

# Sooo....

- Am I doin this right??.
- exapnding density is worse than expainding severity
- avg density is worse than avg severity

- IN general modelling density is worse than severity

# Todos :

- Think of custom heurstic to predict severity (try with both sevrity and density)


In [142]:
#  Sanity check --> This should be 0
rmse(train_data.density.map(dens_to_sev).sort_index(), train_data.severity.sort_index())

0.0