``Mission : Error Analysis & Model metadata``

- Error Analysis Guess1
- Design/Improve Guess1


# Data and dependencies

In [1]:
import warnings
import sys
import os
import time
import joblib
import random
from tqdm import tqdm
from pprint import pprint

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
from geopy.distance import geodesic

from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder

from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

warnings.filterwarnings('ignore')

In [2]:
metadata = pd.read_csv('../data/metadata.csv')
sub_format = pd.read_csv('../data/submission_format.csv')
train_labels = pd.read_csv('../data/train_labels.csv')

In [3]:
def rmse(y_true, y_pred):
    return mse(y_true, y_pred, squared=False)

In [4]:
def dens_to_sev(x: float)-> int:
    """takes in density value in cells/ml and returns severity category"""
    if (x < 20_000) : return 1
    elif (x >= 20_000) and (x < 100_000) : return 2
    elif (x >= 100_000) and (x < 1_000_000) : return 3
    elif (x >= 1_000_000) and (x < 10_000_000) : return 4
    elif x > 10_000_000 : return 5

## Add date fts

In [5]:
metadata.date = pd.to_datetime(metadata.date)
metadata['year'] = metadata.date.dt.year
metadata['month'] = metadata.date.dt.month
metadata['week'] = metadata.date.dt.isocalendar().week


seasons = {
    1: 1,
    2: 1,
    3: 2,
    4: 2,
    5: 2,
    6: 3,
    7: 3,
    8: 3,
    9: 4,
    10: 4,
    11: 4,
    12: 1
}

metadata['season'] = metadata.month.map(seasons)


region = pd.concat((train_labels, sub_format[['region', 'uid']]), axis=0)

data = pd.merge(metadata, region, on='uid', how='left')
data

Unnamed: 0,uid,latitude,longitude,date,split,year,month,week,season,region,severity,density
0,aabm,39.080319,-86.430867,2018-05-14,train,2018,5,20,2,midwest,1.0,585.0
1,aabn,36.559700,-121.510000,2016-08-31,test,2016,8,35,3,west,,
2,aacd,35.875083,-78.878434,2020-11-19,train,2020,11,47,4,south,1.0,290.0
3,aaee,35.487000,-79.062133,2016-08-24,train,2016,8,34,3,south,1.0,1614.0
4,aaff,38.049471,-99.827001,2019-07-23,train,2019,7,30,3,midwest,3.0,111825.0
...,...,...,...,...,...,...,...,...,...,...,...,...
23565,zzvv,36.708500,-121.749000,2014-12-02,test,2014,12,49,1,west,,
23566,zzwo,39.792190,-99.971050,2017-06-19,train,2017,6,25,3,midwest,2.0,48510.0
23567,zzwq,35.794000,-79.012551,2015-03-24,train,2015,3,13,2,south,1.0,1271.0
23568,zzyb,35.742000,-79.238600,2016-11-21,train,2016,11,47,4,south,1.0,9682.0


In [6]:
test_data = data[data.split == 'test']
test_data.shape, data.shape

((6510, 12), (23570, 12))

In [7]:
train_data = data[data.split == 'train']
train_data.shape, data.shape

((17060, 12), (23570, 12))

In [8]:
train_data.severity.mean()

2.1459554513481827

In [9]:
dens_to_sev(train_data.density.mean())

4

# Utils

In [78]:
#  Utils
def get_data_by_date( date=None, data=train_data):
    return data[data.date == date]


def get_distance(lat1, lon1, lat2, lon2):
    return geodesic((lat1, lon1), (lat2, lon2)).km

def analyize_matches(y_true, y_pred):
    print("Exact matches: ", sum(y_true == y_pred) / len(y_true))
    
    print("Missed by 1: ", sum(abs(y_true - y_pred) == 1) / len(y_true))
    print("Missed by 2: ", sum(abs(y_true - y_pred) == 2) / len(y_true))
    print("Missed by 3: ", sum(abs(y_true - y_pred) == 3) / len(y_true))
    print("Missed by 4: ", sum(abs(y_true - y_pred) == 4) / len(y_true))

# My Guess

In [10]:
te_data = test_data.sort_values(by='date')
te_data

Unnamed: 0,uid,latitude,longitude,date,split,year,month,week,season,region,severity,density
6865,howw,37.0062,-120.600,2013-01-08,test,2013,1,2,1,west,,
3661,eamn,36.9818,-120.221,2013-01-08,test,2013,1,2,1,west,,
7668,imsv,36.9836,-120.500,2013-01-08,test,2013,1,2,1,west,,
20182,wgxq,33.8011,-117.205,2013-01-25,test,2013,1,4,1,west,,
16095,rsos,33.8892,-117.562,2013-01-25,test,2013,1,4,1,west,,
...,...,...,...,...,...,...,...,...,...,...,...,...
12443,nsoi,36.7368,-121.734,2021-12-29,test,2021,12,52,1,west,,
14254,prfi,36.7518,-121.742,2021-12-29,test,2021,12,52,1,west,,
6864,howu,36.7085,-121.749,2021-12-29,test,2021,12,52,1,west,,
6540,hfvr,36.7962,-121.782,2021-12-29,test,2021,12,52,1,west,,


In [11]:
tr_data, val_data = train_test_split(train_data, test_size=0.15, random_state=144, shuffle=True)
tr_data.shape, val_data.shape

((14501, 12), (2559, 12))

## Guess Funcs

In [122]:
fill_2s = []
g_from_past = {}


def make_guess1(row: pd.Series, date=None, tr_data=tr_data) -> pd.Series:
    dists = []
    region = row.region
    date = date
    uid = row.uid
    tr_data = tr_data

    if date is None:
        date = row.date

    rel_data = tr_data[(tr_data.date == date) & (tr_data.region == region)]
    
    # check if cur date is past '2013-01-04'
    if date < pd.to_datetime('2013-01-04'):
        print(f'No previous data for this date filling in 2s .. for {row.uid}')
        fill_2s.append(uid)
        return 2

    if rel_data.shape[0] == 0:
        # print(f'No data for this date, trying previous day.. for {row.uid}')
        if g_from_past.get(uid) is not None:
            g_from_past[uid] += 1
        return make_guess1(row, date=date - pd.Timedelta(days=1), tr_data=tr_data)

    for some_row in rel_data.itertuples():
        dist = get_distance(row.latitude, row.longitude, some_row.latitude, some_row.longitude)
        dists.append(dist)
    
    nearest = rel_data.iloc[np.argmin(dists)]
    return nearest.severity


def make_guess2(row: pd.Series, date=None, tr_data=tr_data) -> pd.Series:
    """modified version of make_guess1, Uses mean/mode of severity for the region instead of nearest severity."""
    dists = []
    region = row.region
    date = date
    uid = row.uid
    tr_data = tr_data

    if date is None:
        date = row.date

    rel_data = tr_data[(tr_data.date == date) & (tr_data.region == region)]
    
    # check if cur date is past '2013-01-04'
    if date < pd.to_datetime('2013-01-04'):
        print(f'No previous data for this date filling in 2s .. for {row.uid}')
        fill_2s.append(uid)
        return 2

    if rel_data.shape[0] == 0:
        # print(f'No data for this date, trying previous day.. for {row.uid}')
        if g_from_past.get(uid) is not None:
            g_from_past[uid] += 1
        return make_guess1(row, date=date - pd.Timedelta(days=1), tr_data=tr_data)

    severty_mode = rel_data.severity.mode()[0]
    severty_mean = np.round(rel_data.severity.mean())

    return severty_mean



def cv_loop(rand, splits=10, guess_func=make_guess1):
    # print("Random Number: ", rand)
    skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=rand)
    rmses = []
    guess_train_preds = np.zeros((train_data.shape[0]))

    for fold, (train_idx, val_idx) in enumerate(skf.split(train_data, train_data.severity)):
        print(f"Fold: {fold}")
        tr_data, val_data = train_data.iloc[train_idx], train_data.iloc[val_idx]

        val_data['guess1'] = 0

        temp = []
        for row in tqdm(val_data.itertuples(), total=val_data.shape[0]):
            uid_series = val_data[val_data.uid == row.uid]
            severity = guess_func(uid_series.iloc[0], date=row.date, tr_data=tr_data)
            val_data.loc[val_data.uid == row.uid, f'guess1'] = severity
            temp.append(severity)
        
        guess_train_preds[val_idx] = temp
        

                
        errror = rmse(val_data.severity, val_data.guess1)
        rmses.append(errror)
        print("RMSE: ", errror)

        print('Train Distribution: ')
        print(tr_data.severity.value_counts(normalize=True))
        print('Val Distribution: ')
        print(val_data.severity.value_counts(normalize=True))
        print('Predicted Distribution: ')
        print(val_data.guess1.value_counts(normalize=True))

    
    print('----------------------------------------------------')

    return rmses, guess_train_preds


In [16]:
tr_data.sort_values(by='date', inplace=True)
val_data.sort_values(by='date', inplace=True)

In [17]:
tr_data

Unnamed: 0,uid,latitude,longitude,date,split,year,month,week,season,region,severity,density
4387,evep,44.847993,-93.476318,2013-01-04,train,2013,1,1,1,midwest,1.0,115.0
6144,guny,44.878889,-93.490833,2013-01-04,train,2013,1,1,1,midwest,1.0,558.0
5317,fwbt,44.850500,-93.515700,2013-01-04,train,2013,1,1,1,midwest,1.0,476.0
13283,oqcg,37.114500,-120.890000,2013-01-08,train,2013,1,2,1,west,4.0,4500000.0
13827,pfly,37.803400,-120.841000,2013-01-08,train,2013,1,2,1,west,4.0,2881767.5
...,...,...,...,...,...,...,...,...,...,...,...,...
6718,hkvs,36.030000,-78.706429,2021-12-14,train,2021,12,50,1,south,2.0,31769.0
14516,pykd,36.030000,-78.706927,2021-12-14,train,2021,12,50,1,south,2.0,51737.0
17778,tobi,36.030000,-78.705932,2021-12-14,train,2021,12,50,1,south,1.0,4357.0
23159,zoaj,36.060000,-78.760000,2021-12-14,train,2021,12,50,1,south,2.0,48233.0


In [23]:
val_data

Unnamed: 0,uid,latitude,longitude,date,split,year,month,week,season,region,severity,density
13644,paev,44.822478,-93.367962,2013-01-04,train,2013,1,1,1,midwest,1.0,1884.0
5566,gdxr,44.877646,-93.557842,2013-01-04,train,2013,1,1,1,midwest,1.0,1416.0
1126,bgwz,37.413900,-121.014000,2013-01-08,train,2013,1,2,1,west,4.0,3740000.0
16227,rwkd,38.115600,-121.494000,2013-01-15,train,2013,1,3,1,west,4.0,1745249.0
13719,pceh,37.967400,-121.464000,2013-01-15,train,2013,1,3,1,west,3.0,985182.0
...,...,...,...,...,...,...,...,...,...,...,...,...
3499,dwkx,35.909960,-79.157569,2021-12-02,train,2021,12,48,1,south,1.0,17427.0
23365,ztug,35.686387,-79.200004,2021-12-02,train,2021,12,48,1,south,4.0,2111128.0
15266,quux,35.877009,-78.893845,2021-12-02,train,2021,12,48,1,south,1.0,16980.0
5806,gkeq,37.263900,-120.906000,2021-12-13,train,2021,12,50,1,west,4.0,6797500.0


In [37]:
for row in tqdm(val_data.itertuples(), total=val_data.shape[0]):
    val_data.loc[row.Index, 'guess'] = make_guess1(row)

100%|██████████| 2559/2559 [00:26<00:00, 97.93it/s] 


In [38]:
rmse(val_data.severity, val_data.guess)

0.8548416232611225

In [42]:
analyize_matches(val_data.severity, val_data.guess)

Exact matches:  0.6010160218835483
Missed by 1:  0.30676045330207113
Missed by 2:  0.08284486127393513
Missed by 3:  0.008206330597889801
Missed by 4:  0.0011723329425556857


In [94]:
# 0.08284486127393513 + 0.008206330597889801 + 0.0011723329425556857
# 91% preds < 1 offs, 99% preds < 2 offs, 88% preds == 2 offs

0.09222352481438062

In [43]:
# what are the samples missing by 1

val_data[val_data.severity - val_data.guess == 1]

Unnamed: 0,uid,latitude,longitude,date,split,year,month,week,season,region,severity,density,guess
1126,bgwz,37.413900,-121.014000,2013-01-08,train,2013,1,2,1,west,4.0,3.740000e+06,3.0
16227,rwkd,38.115600,-121.494000,2013-01-15,train,2013,1,3,1,west,4.0,1.745249e+06,3.0
1178,bils,35.658042,-79.252651,2013-01-29,train,2013,1,5,1,south,2.0,5.184600e+04,1.0
629,ascv,35.794000,-79.004000,2013-02-12,train,2013,2,7,1,south,2.0,3.664900e+04,1.0
19900,vyle,32.384010,-104.145830,2013-03-28,train,2013,3,13,2,west,2.0,2.182287e+04,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8249,jdhb,39.077955,-96.880810,2021-09-20,train,2021,9,38,4,midwest,3.0,4.326020e+05,2.0
12915,ofvy,37.782398,-97.531050,2021-10-04,train,2021,10,40,4,midwest,3.0,1.587410e+05,2.0
376,aljv,39.211550,-97.005590,2021-10-11,train,2021,10,41,4,midwest,3.0,1.590030e+05,2.0
9495,klbq,35.876636,-78.890862,2021-10-12,train,2021,10,41,4,south,2.0,9.703900e+04,1.0


In [48]:
val_data[val_data.uid == 'bgwz']

Unnamed: 0,uid,latitude,longitude,date,split,year,month,week,season,region,severity,density,guess
1126,bgwz,37.4139,-121.014,2013-01-08,train,2013,1,2,1,west,4.0,3740000.0,3.0


In [55]:
some_data = tr_data[tr_data.date == '2013-01-08']
some_data

Unnamed: 0,uid,latitude,longitude,date,split,year,month,week,season,region,severity,density
13283,oqcg,37.1145,-120.89,2013-01-08,train,2013,1,2,1,west,4.0,4500000.0
13827,pfly,37.8034,-120.841,2013-01-08,train,2013,1,2,1,west,4.0,2881767.5
19257,vfgn,37.2953,-120.85,2013-01-08,train,2013,1,2,1,west,4.0,4265000.0
10515,lnth,37.2486,-120.851,2013-01-08,train,2013,1,2,1,west,4.0,4732500.0
20604,wtlv,37.2616,-120.906,2013-01-08,train,2013,1,2,1,west,4.0,4882500.0
15663,rgbz,37.3041,-120.901,2013-01-08,train,2013,1,2,1,west,4.0,5100000.0
2261,cnsa,37.4806,-121.031,2013-01-08,train,2013,1,2,1,west,4.0,3617736.5
16589,sgtc,37.3204,-120.983,2013-01-08,train,2013,1,2,1,west,4.0,4027500.0
8158,jalu,37.2764,-120.954,2013-01-08,train,2013,1,2,1,west,4.0,5187500.0
2366,cqge,37.1976,-120.488,2013-01-08,train,2013,1,2,1,west,4.0,3324651.5


In [53]:
# lets see bgwz

make_guess1(val_data[val_data.uid == 'bgwz'].iloc[0])

3.0

In [56]:
dists = []
for some_row in some_data.itertuples():
        dist = get_distance(37.4139, -121.014, some_row.latitude, some_row.longitude)
        dists.append(dist)

In [59]:
some_data.iloc[np.argmin(dists)]

uid                         pfsh
latitude                 37.4419
longitude               -121.003
date         2013-01-08 00:00:00
split                      train
year                        2013
month                          1
week                           2
season                         1
region                      west
severity                     3.0
density                 136538.0
Name: 13840, dtype: object

In [None]:
# Observations:

#  In this case the closest one is 3, but the actual value is 4
#  I guess it is far better if we search for the same location in the past or try mode of the region

In [64]:
# what are the samples missing by 2

val_data[val_data.severity - val_data.guess == 2]

Unnamed: 0,uid,latitude,longitude,date,split,year,month,week,season,region,severity,density,guess,guess2
10194,leit,35.657803,-79.253096,2013-03-13,train,2013,3,11,2,south,3.0,517495.0,1.0,1.0
12054,nhaw,35.859897,-78.756888,2013-06-04,train,2013,6,23,3,south,3.0,121991.0,1.0,1.0
2778,dbwj,39.628330,-99.580000,2013-06-24,train,2013,6,26,3,midwest,4.0,1204875.0,2.0,2.0
9775,ktdj,36.177000,-79.053877,2013-06-26,train,2013,6,26,3,south,3.0,613345.0,1.0,1.0
21898,ydti,36.115000,-78.909256,2013-07-29,train,2013,7,31,3,south,3.0,711616.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23233,zqbi,35.790000,-79.037119,2021-07-12,train,2021,7,28,3,south,3.0,117925.0,1.0,1.0
12415,nrrl,35.610000,-79.000975,2021-07-13,train,2021,7,28,3,south,3.0,239384.0,1.0,1.0
2241,cnei,41.550878,-86.361626,2021-08-30,train,2021,8,35,3,midwest,4.0,1694159.0,2.0,3.0
11467,mpss,37.746680,-97.779360,2021-09-13,train,2021,9,37,4,midwest,3.0,136045.0,1.0,3.0


In [65]:
make_guess1(val_data[val_data.uid == 'leit'].iloc[0])


1.0

In [66]:
leit_date = '2013-03-13'

leit_data = tr_data[tr_data.date == leit_date]
leit_data

Unnamed: 0,uid,latitude,longitude,date,split,year,month,week,season,region,severity,density
19154,vcho,35.859275,-78.751916,2013-03-13,train,2013,3,11,2,south,1.0,871.0
8679,jpte,35.794,-79.004,2013-03-13,train,2013,3,11,2,south,3.0,193734.0
4197,eqdu,35.657883,-79.252948,2013-03-13,train,2013,3,11,2,south,1.0,581.0
15152,qrhs,35.859306,-78.752165,2013-03-13,train,2013,3,11,2,south,1.0,2324.0
20379,wndy,35.910208,-79.159558,2013-03-13,train,2013,3,11,2,south,1.0,11909.0
7443,igat,35.91027,-79.160055,2013-03-13,train,2013,3,11,2,south,1.0,3970.0
4319,etfx,35.701531,-79.171876,2013-03-13,train,2013,3,11,2,south,1.0,1017.0
831,ayfy,35.701504,-79.171926,2013-03-13,train,2013,3,11,2,south,1.0,218.0
18806,usoo,35.910301,-79.160303,2013-03-13,train,2013,3,11,2,south,1.0,9159.0


In [None]:
dists = []
for some_row in some_data.itertuples():
        dist = get_distance(37.4139, -121.014, some_row.latitude, some_row.longitude)
        dists.append(dist)

### With guess2

In [61]:
for row in tqdm(val_data.itertuples(), total=val_data.shape[0]):
    val_data.loc[row.Index, 'guess2'] = make_guess2(row)

100%|██████████| 2559/2559 [00:09<00:00, 256.65it/s]


In [62]:
rmse(val_data.severity, val_data.guess2)
#  this is worse on first glance but..

0.8718156481869335

In [63]:
analyize_matches(val_data.severity, val_data.guess2)

#  much better at exact matches and 1 offs but worse at 2 offs so missed by 2s should have spiked up the error!

Exact matches:  0.6514263384134428
Missed by 1:  0.23563892145369286
Missed by 2:  0.10003907776475186
Missed by 3:  0.011723329425556858
Missed by 4:  0.0011723329425556857


In [96]:
0.6514263384134428 + 0.23563892145369286 

0.8870652598671356

In [None]:
# 88% preds <= 1 offs, 98% preds <= 2 offs, 11% preds >= 2 offs (this is bad)

In [83]:
#  with mean
for row in tqdm(val_data.itertuples(), total=val_data.shape[0]):
    val_data.loc[row.Index, 'guess2_mean'] = make_guess2(row)

100%|██████████| 2559/2559 [00:10<00:00, 233.24it/s]


In [86]:
rmse(val_data.severity, val_data.guess2_mean)
# ohh gawd!!

0.7887445815024979

In [87]:
analyize_matches(val_data.severity, val_data.guess2_mean)
#  this has less matches but better at 1 offs so better overall....

Exact matches:  0.5592028135990621
Missed by 1:  0.39234075810863617
Missed by 2:  0.04181320828448613
Missed by 3:  0.0062524423602969914
Missed by 4:  0.00039077764751856197


In [98]:
# 0.5592028135990621 + 0.39234075810863617 + 0.04181320828448613 + 0.0062524423602969914 + 0.00039077764751856197
# 0.5592028135990621 + 0.39234075810863617

# 95% preds <= 1 offs (thats why they are goood!)  4.8% >= 2 offs

0.04845642829230168

In [90]:
#  cv with mean..!!
rmses, guess2_mean = cv_loop(rand=1859, splits=10, guess_func=make_guess2)

Fold: 0


100%|██████████| 1706/1706 [00:09<00:00, 186.22it/s]


RMSE:  0.8015225839117173
Train Distribution: 
1.0    0.439429
4.0    0.207894
2.0    0.189853
3.0    0.159372
5.0    0.003452
Name: severity, dtype: float64
Val Distribution: 
1.0    0.439625
4.0    0.208089
2.0    0.189918
3.0    0.159437
5.0    0.002931
Name: severity, dtype: float64
Predicted Distribution: 
2    0.411489
1    0.315944
4    0.206917
3    0.064478
5    0.001172
Name: guess1, dtype: float64
Fold: 1


100%|██████████| 1706/1706 [00:08<00:00, 200.86it/s]


RMSE:  0.8040781167395744
Train Distribution: 
1.0    0.439429
4.0    0.207894
2.0    0.189853
3.0    0.159372
5.0    0.003452
Name: severity, dtype: float64
Val Distribution: 
1.0    0.439625
4.0    0.208089
2.0    0.189918
3.0    0.159437
5.0    0.002931
Name: severity, dtype: float64
Predicted Distribution: 
2    0.401524
1    0.321805
4    0.209261
3    0.065651
5    0.001758
Name: guess1, dtype: float64
Fold: 2


100%|██████████| 1706/1706 [00:08<00:00, 206.60it/s]


RMSE:  0.7978576237698913
Train Distribution: 
1.0    0.439429
4.0    0.207894
2.0    0.189918
3.0    0.159372
5.0    0.003387
Name: severity, dtype: float64
Val Distribution: 
1.0    0.439625
4.0    0.208089
2.0    0.189332
3.0    0.159437
5.0    0.003517
Name: severity, dtype: float64
Predicted Distribution: 
2    0.414420
1    0.311841
4    0.202227
3    0.071512
Name: guess1, dtype: float64
Fold: 3


100%|██████████| 1706/1706 [00:07<00:00, 232.78it/s]


RMSE:  0.797490201404669
Train Distribution: 
1.0    0.439429
4.0    0.207959
2.0    0.189853
3.0    0.159372
5.0    0.003387
Name: severity, dtype: float64
Val Distribution: 
1.0    0.439625
4.0    0.207503
2.0    0.189918
3.0    0.159437
5.0    0.003517
Name: severity, dtype: float64
Predicted Distribution: 
2    0.392732
1    0.322392
4    0.201055
3    0.082063
5    0.001758
Name: guess1, dtype: float64
Fold: 4


100%|██████████| 1706/1706 [00:08<00:00, 190.45it/s]


RMSE:  0.790476728551942
Train Distribution: 
1.0    0.439429
4.0    0.207959
2.0    0.189853
3.0    0.159372
5.0    0.003387
Name: severity, dtype: float64
Val Distribution: 
1.0    0.439625
4.0    0.207503
2.0    0.189918
3.0    0.159437
5.0    0.003517
Name: severity, dtype: float64
Predicted Distribution: 
2    0.405041
1    0.314771
4    0.210434
3    0.069754
Name: guess1, dtype: float64
Fold: 5


100%|██████████| 1706/1706 [00:07<00:00, 224.98it/s]


RMSE:  0.7923283982911188
Train Distribution: 
1.0    0.439429
4.0    0.207959
2.0    0.189853
3.0    0.159372
5.0    0.003387
Name: severity, dtype: float64
Val Distribution: 
1.0    0.439625
4.0    0.207503
2.0    0.189918
3.0    0.159437
5.0    0.003517
Name: severity, dtype: float64
Predicted Distribution: 
2    0.421454
1    0.305979
4    0.202814
3    0.069168
5    0.000586
Name: guess1, dtype: float64
Fold: 6


100%|██████████| 1706/1706 [00:07<00:00, 221.60it/s]


RMSE:  0.8344853311341532
Train Distribution: 
1.0    0.439429
4.0    0.207894
2.0    0.189853
3.0    0.159437
5.0    0.003387
Name: severity, dtype: float64
Val Distribution: 
1.0    0.439625
4.0    0.208089
2.0    0.189918
3.0    0.158851
5.0    0.003517
Name: severity, dtype: float64
Predicted Distribution: 
2    0.402110
1    0.307151
4    0.206917
3    0.083236
5    0.000586
Name: guess1, dtype: float64
Fold: 7


100%|██████████| 1706/1706 [00:07<00:00, 234.73it/s]


RMSE:  0.8181699763492282
Train Distribution: 
1.0    0.439495
4.0    0.207894
2.0    0.189853
3.0    0.159372
5.0    0.003387
Name: severity, dtype: float64
Val Distribution: 
1.0    0.439039
4.0    0.208089
2.0    0.189918
3.0    0.159437
5.0    0.003517
Name: severity, dtype: float64
Predicted Distribution: 
2    0.420868
1    0.303048
4    0.202227
3    0.073271
5    0.000586
Name: guess1, dtype: float64
Fold: 8


100%|██████████| 1706/1706 [00:07<00:00, 232.03it/s]


RMSE:  0.7743696146655247
Train Distribution: 
1.0    0.439495
4.0    0.207894
2.0    0.189853
3.0    0.159372
5.0    0.003387
Name: severity, dtype: float64
Val Distribution: 
1.0    0.439039
4.0    0.208089
2.0    0.189918
3.0    0.159437
5.0    0.003517
Name: severity, dtype: float64
Predicted Distribution: 
2    0.432591
1    0.296600
4    0.196952
3    0.072685
5    0.001172
Name: guess1, dtype: float64
Fold: 9


100%|██████████| 1706/1706 [00:07<00:00, 216.86it/s]

RMSE:  0.7952821024698044
Train Distribution: 
1.0    0.439495
4.0    0.207894
2.0    0.189853
3.0    0.159372
5.0    0.003387
Name: severity, dtype: float64
Val Distribution: 
1.0    0.439039
4.0    0.208089
2.0    0.189918
3.0    0.159437
5.0    0.003517
Name: severity, dtype: float64
Predicted Distribution: 
2    0.419109
1    0.298359
4    0.205744
3    0.076788
Name: guess1, dtype: float64
----------------------------------------------------





In [91]:
np.mean(rmses), np.std(rmses)
#  this is much better and seems trustable..

(0.8006060677287623, 0.015408674398557901)

In [93]:
pd.Series(guess2_mean).value_counts(normalize=True)

2.0    0.412134
1.0    0.309789
4.0    0.204455
3.0    0.072860
5.0    0.000762
dtype: float64

# Submission

In [123]:
#  Making submission with guess2 mean

sub_format['severity'] = 0

for row in tqdm(test_data.itertuples(), total=test_data.shape[0]):
    uid_series = test_data[test_data.uid == row.uid]
    severity = make_guess2(uid_series.iloc[0], date=row.date, tr_data=train_data)   # use all train data for making test submission
    sub_format.loc[sub_format.uid == row.uid, 'severity'] = severity

sub_format

 32%|███▏      | 2071/6510 [00:20<00:43, 103.05it/s]

No previous data for this date filling in 2s .. for igpa


 44%|████▍     | 2855/6510 [00:28<00:48, 75.03it/s] 

No previous data for this date filling in 2s .. for lkpf


 58%|█████▊    | 3764/6510 [00:37<00:33, 82.08it/s] 

No previous data for this date filling in 2s .. for paez


100%|██████████| 6510/6510 [01:03<00:00, 102.34it/s]


Unnamed: 0,uid,region,severity
0,aabn,west,2
1,aair,west,4
2,aajw,northeast,2
3,aalr,midwest,3
4,aalw,west,4
...,...,...,...
6505,zzpn,northeast,5
6506,zzrv,west,4
6507,zzsx,south,1
6508,zzvv,west,4


In [124]:
print(sub_format.severity.value_counts(normalize=True))

sub_format.to_csv('../submissions/to submit/guess2_mean_preds.csv', index=False)

4    0.326575
1    0.284178
2    0.214132
3    0.172197
5    0.002919
Name: severity, dtype: float64


# Sooo....

- 10 fold cv with guess2 yields 0.80 +/- 0.015
- guess1 is better than guess2 but guess2 has more


# Todos :

- Find out what went wrong with guess1
- Still where can I improve guess2?
- 
- How about modelling with boosted trees on these guessed severities?
-  