``Mission : Error Analysis & Model metadata``

- Analyze past submissions and find out what went wrong!
- One great heuristic is all I need now!


# Data and dependencies

In [50]:
import warnings
import sys
import os
import time
import joblib
import random
from tqdm import tqdm
from pprint import pprint

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
from geopy.distance import geodesic

from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder

from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

warnings.filterwarnings('ignore')

In [2]:
# Read the three input CSVs from ../data, in the same order as the names on the left.
metadata, sub_format, train_labels = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/metadata.csv',
        '../data/submission_format.csv',
        '../data/train_labels.csv',
    )
)

In [3]:
def rmse(y_true, y_pred):
    """Root-mean-squared error between ``y_true`` and ``y_pred``.

    The square root is taken explicitly instead of passing ``squared=False``
    to sklearn's ``mean_squared_error`` (imported here as ``mse``): that
    keyword is deprecated in recent scikit-learn releases and later removed,
    while this form works on every version.

    Parameters
    ----------
    y_true, y_pred : array-like of equal length
        Ground-truth and predicted values.

    Returns
    -------
    float
        ``sqrt(mean((y_true - y_pred) ** 2))``.
    """
    return np.sqrt(mse(y_true, y_pred))

In [4]:
def dens_to_sev(x: float)-> int:
    """Map a density value in cells/ml to its severity category (1-5).

    Bins are closed on the left and open on the right:

    * ``x < 20_000``                  -> 1
    * ``20_000 <= x < 100_000``       -> 2
    * ``100_000 <= x < 1_000_000``    -> 3
    * ``1_000_000 <= x < 10_000_000`` -> 4
    * ``x >= 10_000_000``             -> 5

    A NaN density satisfies none of the comparisons and yields ``None``.
    """
    if x < 20_000:
        return 1
    if x < 100_000:
        return 2
    if x < 1_000_000:
        return 3
    if x < 10_000_000:
        return 4
    # ">=" rather than ">": a density of exactly 10_000_000 used to fall
    # through every branch and silently return None.
    if x >= 10_000_000:
        return 5
    # Only reached for NaN, where every comparison above is False.
    return None

## Add date features

In [5]:
# Parse the date column, then derive calendar features from it.
metadata['date'] = pd.to_datetime(metadata['date'])
metadata['year'] = metadata['date'].dt.year
metadata['month'] = metadata['date'].dt.month
metadata['week'] = metadata['date'].dt.isocalendar().week

# Month -> season code: Dec-Feb = 1, Mar-May = 2, Jun-Aug = 3, Sep-Nov = 4.
seasons = {month: (month % 12) // 3 + 1 for month in range(1, 13)}
metadata['season'] = metadata['month'].map(seasons)

# Stack the train labels on top of the (region, uid) columns of the submission
# template so that both train and test uids carry a region.
region = pd.concat([train_labels, sub_format[['region', 'uid']]], axis=0)

# Left join: every metadata row is kept; label columns stay NaN for test rows.
data = metadata.merge(region, on='uid', how='left')
data

Unnamed: 0,uid,latitude,longitude,date,split,year,month,week,season,region,severity,density
0,aabm,39.080319,-86.430867,2018-05-14,train,2018,5,20,2,midwest,1.0,585.0
1,aabn,36.559700,-121.510000,2016-08-31,test,2016,8,35,3,west,,
2,aacd,35.875083,-78.878434,2020-11-19,train,2020,11,47,4,south,1.0,290.0
3,aaee,35.487000,-79.062133,2016-08-24,train,2016,8,34,3,south,1.0,1614.0
4,aaff,38.049471,-99.827001,2019-07-23,train,2019,7,30,3,midwest,3.0,111825.0
...,...,...,...,...,...,...,...,...,...,...,...,...
23565,zzvv,36.708500,-121.749000,2014-12-02,test,2014,12,49,1,west,,
23566,zzwo,39.792190,-99.971050,2017-06-19,train,2017,6,25,3,midwest,2.0,48510.0
23567,zzwq,35.794000,-79.012551,2015-03-24,train,2015,3,13,2,south,1.0,1271.0
23568,zzyb,35.742000,-79.238600,2016-11-21,train,2016,11,47,4,south,1.0,9682.0


In [6]:
# Rows whose `split` column marks them as test samples.
test_data = data.loc[data['split'] == 'test']
(test_data.shape, data.shape)

((6510, 12), (23570, 12))

In [7]:
# Rows whose `split` column marks them as train samples.
train_data = data.loc[data['split'] == 'train']
(train_data.shape, data.shape)

((17060, 12), (23570, 12))

In [8]:
train_data.severity.mean()

2.1459554513481827

In [9]:
dens_to_sev(train_data.density.mean())

4

# My Guess

In [10]:
# Test rows in chronological order, to see which dates need a guess.
te_data = test_data.sort_values('date')
te_data

Unnamed: 0,uid,latitude,longitude,date,split,year,month,week,season,region,severity,density
6865,howw,37.0062,-120.600,2013-01-08,test,2013,1,2,1,west,,
3661,eamn,36.9818,-120.221,2013-01-08,test,2013,1,2,1,west,,
7668,imsv,36.9836,-120.500,2013-01-08,test,2013,1,2,1,west,,
20182,wgxq,33.8011,-117.205,2013-01-25,test,2013,1,4,1,west,,
16095,rsos,33.8892,-117.562,2013-01-25,test,2013,1,4,1,west,,
...,...,...,...,...,...,...,...,...,...,...,...,...
12443,nsoi,36.7368,-121.734,2021-12-29,test,2021,12,52,1,west,,
14254,prfi,36.7518,-121.742,2021-12-29,test,2021,12,52,1,west,,
6864,howu,36.7085,-121.749,2021-12-29,test,2021,12,52,1,west,,
6540,hfvr,36.7962,-121.782,2021-12-29,test,2021,12,52,1,west,,


In [11]:
# Hold out 15% of the train rows for validation; the seed is fixed so the split is repeatable.
tr_data, val_data = train_test_split(
    train_data,
    test_size=0.15,
    shuffle=True,
    random_state=144,
)
(tr_data.shape, val_data.shape)

((14501, 12), (2559, 12))

In [16]:
#  Utils
def get_data_by_date(date=None, data=tr_data):
    """Return the rows of ``data`` whose ``date`` column equals ``date``.

    ``data`` defaults to the ``tr_data`` frame that exists at the moment this
    cell is executed (default arguments are evaluated once, at definition).
    """
    on_that_day = data['date'] == date
    return data.loc[on_that_day]


def get_distance(lat1, lon1, lat2, lon2):
    """Geodesic distance in kilometres between two (latitude, longitude) points."""
    origin = (lat1, lon1)
    target = (lat2, lon2)
    return geodesic(origin, target).km

In [17]:
val_data.sort_values('date')

Unnamed: 0,uid,latitude,longitude,date,split,year,month,week,season,region,severity,density
13644,paev,44.822478,-93.367962,2013-01-04,train,2013,1,1,1,midwest,1.0,1884.0
5566,gdxr,44.877646,-93.557842,2013-01-04,train,2013,1,1,1,midwest,1.0,1416.0
1126,bgwz,37.413900,-121.014000,2013-01-08,train,2013,1,2,1,west,4.0,3740000.0
16227,rwkd,38.115600,-121.494000,2013-01-15,train,2013,1,3,1,west,4.0,1745249.0
13719,pceh,37.967400,-121.464000,2013-01-15,train,2013,1,3,1,west,3.0,985182.0
...,...,...,...,...,...,...,...,...,...,...,...,...
3499,dwkx,35.909960,-79.157569,2021-12-02,train,2021,12,48,1,south,1.0,17427.0
23365,ztug,35.686387,-79.200004,2021-12-02,train,2021,12,48,1,south,4.0,2111128.0
15266,quux,35.877009,-78.893845,2021-12-02,train,2021,12,48,1,south,1.0,16980.0
5806,gkeq,37.263900,-120.906000,2021-12-13,train,2021,12,50,1,west,4.0,6797500.0


In [51]:
# Accumulators filled as a side effect of make_guess1 (reset whenever this cell is re-run).
fill_2s = []        # uids that had no usable earlier observation and received the fallback severity 2
g_from_past = {}    # uid -> total number of days the lookup had to go back to find an observation


def make_guess1(row: pd.DataFrame, date=None, tr_data=tr_data):
    """Guess a sample's severity from the nearest same-region training sample.

    The guess is copied from the geographically closest training row that
    shares the sample's region and was taken on the sample's date. When the
    region has no training row on that date, the most recent earlier date
    that has one is used instead (never earlier than 2013-01-04).

    Parameters
    ----------
    row : pd.DataFrame
        One-row frame holding the sample; the columns ``uid``, ``region``,
        ``latitude``, ``longitude`` and ``date`` are read.
    date : datetime-like, optional
        Date to look up. Defaults to the ``date`` value of ``row``.
    tr_data : pd.DataFrame
        Labelled rows to search. NOTE: the default is the ``tr_data`` object
        that existed when this cell was executed; pass the frame explicitly
        after re-splitting, otherwise a stale split is searched.

    Returns
    -------
    tuple
        ``(nearest, severity)`` where ``nearest`` is the matched training row
        (a ``pd.Series``) and ``severity`` its label. When no observation is
        available the result is ``(None, 2)`` and the uid is appended to
        ``fill_2s``.

    Side effects
    ------------
    Appends to ``fill_2s`` and updates ``g_from_past``; prints a message when
    the fallback value is used.
    """
    # Pull plain scalars out of the one-row frame; passing 1-element arrays on
    # to the distance helper relies on implicit array-to-float conversion.
    uid = row.uid.values[0]
    region = row.region.values[0]
    lat = row.latitude.values[0]
    lon = row.longitude.values[0]

    if date is None:
        date = row.date.values[0]
    # Normalise numpy datetimes / strings so comparisons and arithmetic below are uniform.
    date = pd.Timestamp(date)

    # Same-region observations between the earliest usable date and the query
    # date. Selecting the latest of them is equivalent to stepping back one day
    # at a time, without one recursive call (and one full scan) per day.
    earliest = pd.to_datetime('2013-01-04')
    past = tr_data[
        (tr_data.region == region)
        & (tr_data.date >= earliest)
        & (tr_data.date <= date)
    ]

    if past.shape[0] == 0:
        # Report the uid itself, not the whole one-row Series.
        print(f'No previous data for this date filling in 2s .. for {uid}')
        fill_2s.append(uid)
        # Explicit None: the bare `_` only resolved inside IPython.
        return None, 2

    latest = past.date.max()
    days_back = (date - latest).days
    if days_back > 0:
        # The counter used to be incremented only for keys that already
        # existed, so it never recorded anything.
        g_from_past[uid] = g_from_past.get(uid, 0) + days_back

    rel_data = past[past.date == latest]
    dists = [
        get_distance(lat, lon, cand.latitude, cand.longitude)
        for cand in rel_data.itertuples()
    ]

    # Ties resolve to the first candidate, as np.argmin returns the first minimum.
    nearest = rel_data.iloc[int(np.argmin(dists))]
    return nearest, nearest.severity

In [56]:
# Guess every validation row from the training side of the *current* split.
# tr_data is passed explicitly: make_guess1's default was bound when the
# function was defined and goes stale as soon as the split is redone.
val_guesses = []
for pos, row in enumerate(tqdm(val_data.itertuples(), total=val_data.shape[0])):
    # iloc[[pos]] yields the one-row frame directly instead of scanning the
    # whole frame for the uid on every iteration.
    nearest, severity = make_guess1(val_data.iloc[[pos]], date=row.date, tr_data=tr_data)
    val_guesses.append(severity)

# Build a new frame rather than writing into a slice of train_data;
# the guesses are whole-number severities, so keep the column integer-typed.
val_data = val_data.assign(guess1=np.asarray(val_guesses).astype(int))

rmse(val_data.severity, val_data.guess1)

100%|██████████| 2559/2559 [00:24<00:00, 103.86it/s]


0.8536980219153129

In [88]:
def cv_loop(rand, splits=10):
    """Cross-validate the ``make_guess1`` heuristic on ``train_data``.

    ``train_data`` is split with a shuffled ``StratifiedKFold`` on severity.
    For each fold the held-out rows are guessed from the remaining rows only,
    and the fold RMSE plus the train / validation / predicted severity
    distributions are printed.

    Parameters
    ----------
    rand : int
        ``random_state`` handed to ``StratifiedKFold``.
    splits : int, default 10
        Number of folds.

    Returns
    -------
    rmses : list
        One RMSE per fold, in fold order.
    guess1_train_preds : np.ndarray
        Out-of-fold guess for every row of ``train_data``, in positional order.
    """
    # NOTE(review): folds are drawn at random, so samples taken on the same day
    # close to a validation row can sit on the training side -- confirm this
    # mirrors how the test rows relate to the train rows.
    skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=rand)
    rmses = []
    guess1_train_preds = np.zeros(train_data.shape[0])

    for fold, (train_idx, val_idx) in enumerate(skf.split(train_data, train_data.severity)):
        print(f"Fold: {fold}")
        tr_data = train_data.iloc[train_idx]
        # Copy so the new column is written to an independent frame, not to a slice of train_data.
        val_data = train_data.iloc[val_idx].copy()

        fold_preds = []
        for pos, row in enumerate(tqdm(val_data.itertuples(), total=val_data.shape[0])):
            # iloc[[pos]] gives the one-row frame without re-scanning the fold for the uid.
            _nearest, severity = make_guess1(val_data.iloc[[pos]], date=row.date, tr_data=tr_data)
            fold_preds.append(severity)

        # Guesses are whole-number severities; keep the column integer-typed.
        val_data['guess1'] = np.asarray(fold_preds).astype(int)
        guess1_train_preds[val_idx] = fold_preds

        errror = rmse(val_data.severity, val_data.guess1)
        rmses.append(errror)
        print("RMSE: ", errror)

        print('Train Distribution: ')
        print(tr_data.severity.value_counts(normalize=True))
        print('Val Distribution: ')
        print(val_data.severity.value_counts(normalize=True))
        print('Predicted Distribution: ')
        print(val_data.guess1.value_counts(normalize=True))

    print('----------------------------------------------------')

    return rmses, guess1_train_preds

In [90]:
# Run cv_loop with random_state 18952 and the default 10 folds; keep the
# per-fold RMSEs and the out-of-fold guesses for the cells below.
rmses, guess_train_preds = cv_loop(18952)

Fold: 0


100%|██████████| 1706/1706 [00:20<00:00, 82.72it/s] 


RMSE:  0.866871049336748
Train Distribution: 
1.0    0.439429
4.0    0.207894
2.0    0.189853
3.0    0.159372
5.0    0.003452
Name: severity, dtype: float64
Val Distribution: 
1.0    0.439625
4.0    0.208089
2.0    0.189918
3.0    0.159437
5.0    0.002931
Name: severity, dtype: float64
Predicted Distribution: 
1    0.429660
4    0.210434
2    0.189918
3    0.165885
5    0.004103
Name: guess1, dtype: float64
Fold: 1


100%|██████████| 1706/1706 [00:16<00:00, 104.29it/s]


RMSE:  0.864501157245755
Train Distribution: 
1.0    0.439429
4.0    0.207894
2.0    0.189853
3.0    0.159372
5.0    0.003452
Name: severity, dtype: float64
Val Distribution: 
1.0    0.439625
4.0    0.208089
2.0    0.189918
3.0    0.159437
5.0    0.002931
Name: severity, dtype: float64
Predicted Distribution: 
1    0.449590
4    0.206917
2    0.172919
3    0.167644
5    0.002931
Name: guess1, dtype: float64
Fold: 2


100%|██████████| 1706/1706 [00:14<00:00, 116.44it/s]


RMSE:  0.835538309888847
Train Distribution: 
1.0    0.439429
4.0    0.207894
2.0    0.189918
3.0    0.159372
5.0    0.003387
Name: severity, dtype: float64
Val Distribution: 
1.0    0.439625
4.0    0.208089
2.0    0.189332
3.0    0.159437
5.0    0.003517
Name: severity, dtype: float64
Predicted Distribution: 
1    0.440211
4    0.206331
2    0.188159
3    0.162954
5    0.002345
Name: guess1, dtype: float64
Fold: 3


100%|██████████| 1706/1706 [00:13<00:00, 123.05it/s]


RMSE:  0.8628043918078191
Train Distribution: 
1.0    0.439429
4.0    0.207959
2.0    0.189853
3.0    0.159372
5.0    0.003387
Name: severity, dtype: float64
Val Distribution: 
1.0    0.439625
4.0    0.207503
2.0    0.189918
3.0    0.159437
5.0    0.003517
Name: severity, dtype: float64
Predicted Distribution: 
1    0.432005
2    0.205744
4    0.197538
3    0.161782
5    0.002931
Name: guess1, dtype: float64
Fold: 4


100%|██████████| 1706/1706 [00:13<00:00, 123.87it/s]


RMSE:  0.8348364716214519
Train Distribution: 
1.0    0.439429
4.0    0.207959
2.0    0.189853
3.0    0.159372
5.0    0.003387
Name: severity, dtype: float64
Val Distribution: 
1.0    0.439625
4.0    0.207503
2.0    0.189918
3.0    0.159437
5.0    0.003517
Name: severity, dtype: float64
Predicted Distribution: 
1    0.440797
2    0.199297
4    0.198710
3    0.158851
5    0.002345
Name: guess1, dtype: float64
Fold: 5


100%|██████████| 1706/1706 [00:13<00:00, 122.11it/s]


RMSE:  0.8729355146064317
Train Distribution: 
1.0    0.439429
4.0    0.207959
2.0    0.189853
3.0    0.159372
5.0    0.003387
Name: severity, dtype: float64
Val Distribution: 
1.0    0.439625
4.0    0.207503
2.0    0.189918
3.0    0.159437
5.0    0.003517
Name: severity, dtype: float64
Predicted Distribution: 
1    0.428488
2    0.203986
4    0.203986
3    0.161196
5    0.002345
Name: guess1, dtype: float64
Fold: 6


100%|██████████| 1706/1706 [00:20<00:00, 85.12it/s] 


RMSE:  0.8672090766646026
Train Distribution: 
1.0    0.439429
4.0    0.207894
2.0    0.189853
3.0    0.159437
5.0    0.003387
Name: severity, dtype: float64
Val Distribution: 
1.0    0.439625
4.0    0.208089
2.0    0.189918
3.0    0.158851
5.0    0.003517
Name: severity, dtype: float64
Predicted Distribution: 
1    0.462485
4    0.195780
2    0.185229
3    0.152989
5    0.003517
Name: guess1, dtype: float64
Fold: 7


100%|██████████| 1706/1706 [00:14<00:00, 116.21it/s]


RMSE:  0.890223044617271
Train Distribution: 
1.0    0.439495
4.0    0.207894
2.0    0.189853
3.0    0.159372
5.0    0.003387
Name: severity, dtype: float64
Val Distribution: 
1.0    0.439039
4.0    0.208089
2.0    0.189918
3.0    0.159437
5.0    0.003517
Name: severity, dtype: float64
Predicted Distribution: 
1    0.436694
4    0.219226
2    0.185815
3    0.155334
5    0.002931
Name: guess1, dtype: float64
Fold: 8


100%|██████████| 1706/1706 [00:16<00:00, 106.01it/s]


RMSE:  0.8611042829799834
Train Distribution: 
1.0    0.439495
4.0    0.207894
2.0    0.189853
3.0    0.159372
5.0    0.003387
Name: severity, dtype: float64
Val Distribution: 
1.0    0.439039
4.0    0.208089
2.0    0.189918
3.0    0.159437
5.0    0.003517
Name: severity, dtype: float64
Predicted Distribution: 
1    0.430246
4    0.216295
2    0.197538
3    0.152989
5    0.002931
Name: guess1, dtype: float64
Fold: 9


100%|██████████| 1706/1706 [00:16<00:00, 105.53it/s]


RMSE:  0.8459960173843882
Train Distribution: 
1.0    0.439495
4.0    0.207894
2.0    0.189853
3.0    0.159372
5.0    0.003387
Name: severity, dtype: float64
Val Distribution: 
1.0    0.439039
4.0    0.208089
2.0    0.189918
3.0    0.159437
5.0    0.003517
Name: severity, dtype: float64
Predicted Distribution: 
1    0.443728
4    0.218640
2    0.185815
3    0.149472
5    0.002345
Name: guess1, dtype: float64
----------------------------------------------------


In [92]:
train_data.severity.value_counts(normalize=True)

1.0    0.439449
4.0    0.207913
2.0    0.189859
3.0    0.159379
5.0    0.003400
Name: severity, dtype: float64

In [91]:
pd.Series(guess_train_preds).value_counts(normalize=True)

1.0    0.439390
4.0    0.207386
2.0    0.191442
3.0    0.158910
5.0    0.002872
dtype: float64

In [93]:
rmse(train_data.severity, guess_train_preds)

0.8603551708042789

In [None]:
#  this 

In [31]:
np.mean(rmses), np.std(rmses)

(0.868199606838101, 0.017132723670223005)

In [32]:
max(rmses), min(rmses)

(0.8930717656411409, 0.8363563768264255)

In [316]:
# The first split (seed 867) gave a huge RMSE of about 0.91 -- look at how that split is distributed.
tr_data, val_data = train_test_split(
    train_data,
    test_size=0.15,
    shuffle=True,
    random_state=867,
)
print(tr_data.severity.mean(), val_data.severity.mean())

tuple(part.severity.value_counts(normalize=True) for part in (tr_data, val_data))

2.1466105785807876 2.1422430636967564


(1.0    0.439556
 4.0    0.209296
 2.0    0.189780
 3.0    0.158265
 5.0    0.003103
 Name: severity, dtype: float64,
 1.0    0.438843
 4.0    0.200078
 2.0    0.190309
 3.0    0.165690
 5.0    0.005080
 Name: severity, dtype: float64)

In [314]:
np.mean(rmses), np.std(rmses)

(0.866379185673449, 0.01958544441628999)

In [317]:
np.mean(rmses[1:]), np.std(rmses[1:])

(0.8611064739275047, 0.01217357021407055)

In [319]:
np.argmin(rmses)       # 133

7

In [320]:
# Same inspection for the split made with seed 133.
tr_data, val_data = train_test_split(
    train_data,
    test_size=0.15,
    shuffle=True,
    random_state=133,
)
print(tr_data.severity.mean(), val_data.severity.mean())

tuple(part.severity.value_counts(normalize=True) for part in (tr_data, val_data))

2.1469553823874215 2.1402891754591638


(1.0    0.439832
 4.0    0.207710
 2.0    0.188263
 3.0    0.160610
 5.0    0.003586
 Name: severity, dtype: float64,
 1.0    0.437280
 4.0    0.209066
 2.0    0.198906
 3.0    0.152403
 5.0    0.002345
 Name: severity, dtype: float64)

In [None]:
#  Is this trustworthy?
# There should be no data leakage here, so this is a true validation (at least in this context).


In [34]:
#  The guess1 predictions on val_data match the severity distribution of the train data.

In [94]:
#  Making submission with guess1
# Collect one guess per test uid, then write them into the submission template
# with a single keyed assignment instead of scanning both frames for every row.
test_guesses = {}
for pos, row in enumerate(tqdm(test_data.itertuples(), total=test_data.shape[0])):
    nearest, severity = make_guess1(test_data.iloc[[pos]], date=row.date, tr_data=train_data)   # use all train data for making test submission
    test_guesses[row.uid] = severity

# uids without a guess keep the placeholder 0; severities are whole numbers, so store them as int.
sub_format['severity'] = sub_format.uid.map(test_guesses).fillna(0).astype(int)

sub_format

 32%|███▏      | 2064/6510 [00:31<01:06, 67.02it/s]

No previous data for this date filling in 2s .. for 7466    igpa
Name: uid, dtype: object


 44%|████▍     | 2857/6510 [00:42<01:11, 50.81it/s]

No previous data for this date filling in 2s .. for 10422    lkpf
Name: uid, dtype: object


 58%|█████▊    | 3760/6510 [00:55<00:58, 46.76it/s]

No previous data for this date filling in 2s .. for 13645    paez
Name: uid, dtype: object


100%|██████████| 6510/6510 [01:33<00:00, 69.26it/s] 


Unnamed: 0,uid,region,severity
0,aabn,west,2
1,aair,west,4
2,aajw,northeast,2
3,aalr,midwest,4
4,aalw,west,4
...,...,...,...
6505,zzpn,northeast,5
6506,zzrv,west,4
6507,zzsx,south,2
6508,zzvv,west,4


In [95]:
# Show the share of each predicted severity, then write the submission file.
severity_share = sub_format['severity'].value_counts(normalize=True)
print(severity_share)

sub_format.to_csv('../submissions/to submit/guess1_preds.csv', index=False)

4    0.339785
1    0.311982
3    0.191705
2    0.153303
5    0.003226
Name: severity, dtype: float64


# Sooo....

- It's not that bad, but 0.86 is not good enough, and the random split with seed 867 could be hinting at something
- 
- Definitely need better guessing


# Todos :

- Find out what went wrong with guess1 