``Mission : Error Analysis & Model metadata``

- Why 0.80 vs 0.100???


# Data and dependencies

In [1]:
import warnings
import sys
import os
import time
import joblib
import random
from tqdm import tqdm
from pprint import pprint

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
from geopy.distance import geodesic

from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split, StratifiedKFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder

from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

warnings.filterwarnings('ignore')

In [2]:
metadata = pd.read_csv('../data/metadata.csv')
sub_format = pd.read_csv('../data/submission_format.csv')
train_labels = pd.read_csv('../data/train_labels.csv')

In [3]:
def rmse(y_true, y_pred):
    return mse(y_true, y_pred, squared=False)

In [4]:
def dens_to_sev(x: float)-> int:
    """takes in density value in cells/ml and returns severity category"""
    if (x < 20_000) : return 1
    elif (x >= 20_000) and (x < 100_000) : return 2
    elif (x >= 100_000) and (x < 1_000_000) : return 3
    elif (x >= 1_000_000) and (x < 10_000_000) : return 4
    elif x > 10_000_000 : return 5

## Add date fts

In [5]:
metadata.date = pd.to_datetime(metadata.date)
metadata['year'] = metadata.date.dt.year
metadata['month'] = metadata.date.dt.month
metadata['week'] = metadata.date.dt.isocalendar().week


seasons = {
    1: 1,
    2: 1,
    3: 2,
    4: 2,
    5: 2,
    6: 3,
    7: 3,
    8: 3,
    9: 4,
    10: 4,
    11: 4,
    12: 1
}

metadata['season'] = metadata.month.map(seasons)


region = pd.concat((train_labels, sub_format[['region', 'uid']]), axis=0)

data = pd.merge(metadata, region, on='uid', how='left')
data

Unnamed: 0,uid,latitude,longitude,date,split,year,month,week,season,region,severity,density
0,aabm,39.080319,-86.430867,2018-05-14,train,2018,5,20,2,midwest,1.0,585.0
1,aabn,36.559700,-121.510000,2016-08-31,test,2016,8,35,3,west,,
2,aacd,35.875083,-78.878434,2020-11-19,train,2020,11,47,4,south,1.0,290.0
3,aaee,35.487000,-79.062133,2016-08-24,train,2016,8,34,3,south,1.0,1614.0
4,aaff,38.049471,-99.827001,2019-07-23,train,2019,7,30,3,midwest,3.0,111825.0
...,...,...,...,...,...,...,...,...,...,...,...,...
23565,zzvv,36.708500,-121.749000,2014-12-02,test,2014,12,49,1,west,,
23566,zzwo,39.792190,-99.971050,2017-06-19,train,2017,6,25,3,midwest,2.0,48510.0
23567,zzwq,35.794000,-79.012551,2015-03-24,train,2015,3,13,2,south,1.0,1271.0
23568,zzyb,35.742000,-79.238600,2016-11-21,train,2016,11,47,4,south,1.0,9682.0


In [6]:
test_data = data[data.split == 'test']
test_data.shape, data.shape

((6510, 12), (23570, 12))

In [7]:
train_data = data[data.split == 'train']
train_data.shape, data.shape

((17060, 12), (23570, 12))

In [8]:
train_data.severity.mean()

2.1459554513481827

In [9]:
dens_to_sev(train_data.density.mean())

4

# Utils

In [10]:
#  Utils
def get_data_by_date( date=None, data=train_data):
    return data[data.date == date]


def get_distance(lat1, lon1, lat2, lon2):
    return geodesic((lat1, lon1), (lat2, lon2)).km

def analyize_matches(y_true, y_pred):
    print("Exact matches: ", sum(y_true == y_pred) / len(y_true))
    
    print("Missed by 1: ", sum(abs(y_true - y_pred) == 1) / len(y_true))
    print("Missed by 2: ", sum(abs(y_true - y_pred) == 2) / len(y_true))
    print("Missed by 3: ", sum(abs(y_true - y_pred) == 3) / len(y_true))
    print("Missed by 4: ", sum(abs(y_true - y_pred) == 4) / len(y_true))

# My Guess

In [11]:
te_data = test_data.sort_values(by='date')
te_data

Unnamed: 0,uid,latitude,longitude,date,split,year,month,week,season,region,severity,density
6865,howw,37.0062,-120.600,2013-01-08,test,2013,1,2,1,west,,
3661,eamn,36.9818,-120.221,2013-01-08,test,2013,1,2,1,west,,
7668,imsv,36.9836,-120.500,2013-01-08,test,2013,1,2,1,west,,
20182,wgxq,33.8011,-117.205,2013-01-25,test,2013,1,4,1,west,,
16095,rsos,33.8892,-117.562,2013-01-25,test,2013,1,4,1,west,,
...,...,...,...,...,...,...,...,...,...,...,...,...
12443,nsoi,36.7368,-121.734,2021-12-29,test,2021,12,52,1,west,,
14254,prfi,36.7518,-121.742,2021-12-29,test,2021,12,52,1,west,,
6864,howu,36.7085,-121.749,2021-12-29,test,2021,12,52,1,west,,
6540,hfvr,36.7962,-121.782,2021-12-29,test,2021,12,52,1,west,,


In [12]:
tr_data, val_data = train_test_split(train_data, test_size=0.15, random_state=144, shuffle=True)
tr_data.shape, val_data.shape

((14501, 12), (2559, 12))

## Guess Funcs

In [505]:
fill_2s = []
g_from_past = {}

n_times_called = {}

def make_guess1(row: pd.Series, date=None, tr_data=tr_data) -> pd.Series:

    dists = []
    region = row.region
    date = date
    uid = row.uid
    tr_data = tr_data

    if date is None:
        date = row.date

    rel_data = tr_data[(tr_data.date == date) & (tr_data.region == region)]
    
    # check if cur date is past '2013-01-04'
    if date < pd.to_datetime('2013-01-04'):
        print(f'No previous data for this date filling in 2s .. for {row.uid}')
        fill_2s.append(uid)
        return 2

    if rel_data.shape[0] == 0:
        # print(f'No data for this date, trying previous day.. for {row.uid}')
        if g_from_past.get(uid) is not None:
            g_from_past[uid] += 1
        return make_guess1(row, date=date - pd.Timedelta(days=1), tr_data=tr_data)

    for some_row in rel_data.itertuples():
        dist = get_distance(row.latitude, row.longitude, some_row.latitude, some_row.longitude)
        dists.append(dist)
    
    nearest = rel_data.iloc[np.argmin(dists)]
    return nearest.severity


def make_guess2(row: pd.Series, date=None, tr_data=tr_data, n_times_called=None) -> pd.Series:
    """modified version of make_guess1, Uses mean/mode of severity for the region instead of nearest severity."""
    dists = []
    region = row.region
    date = date
    uid = row.uid
    tr_data = tr_data
    
    if n_times_called is not None:
        if n_times_called.get(uid) is None:
            n_times_called[uid] = 1
        else:
            n_times_called[uid] += 1
        

    if date is None:
        date = row.date

    rel_data = tr_data[(tr_data.date == date) & (tr_data.region == region)]
    
    # check if cur date is past '2013-01-04'
    if date < pd.to_datetime('2013-01-04'):
        print(f'No previous data for this date filling in 2s .. for {row.uid}')
        fill_2s.append(uid)
        return 2

    if rel_data.shape[0] == 0:
        # print(f'No data for this date, trying previous day.. for {row.uid}')
        if g_from_past.get(uid) is not None:
            g_from_past[uid] += 1
        global count 
        count += 1
        return make_guess2(row, date=date - pd.Timedelta(days=1), tr_data=tr_data)

    severty_mode = rel_data.severity.mode()[0]
    severty_mean = np.round(rel_data.severity.mean())

    return severty_mean



def cv_loop(rand, splits=10, guess_func=make_guess1):
    # print("Random Number: ", rand)
    skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=rand)
    tscv = TimeSeriesSplit(n_splits=splits)
    rmses = []
    guess_train_preds = np.zeros((train_data.shape[0]))

    for fold, (train_idx, val_idx) in enumerate(tscv.split(train_data, train_data.severity)):
        print(f"Fold: {fold}")
        tr_data, val_data = train_data.iloc[train_idx], train_data.iloc[val_idx]

        val_data['guess1'] = 0

        temp = []
        for row in tqdm(val_data.itertuples(), total=val_data.shape[0]):
            uid_series = val_data[val_data.uid == row.uid]
            severity = guess_func(uid_series.iloc[0], date=row.date, tr_data=tr_data)
            val_data.loc[val_data.uid == row.uid, f'guess1'] = severity
            temp.append(severity)
        
        guess_train_preds[val_idx] = temp
        

                
        errror = rmse(val_data.severity, val_data.guess1)
        rmses.append(errror)
        print("RMSE: ", errror)

        print('Train Distribution: ')
        print(tr_data.severity.value_counts(normalize=True))
        print('Val Distribution: ')
        print(val_data.severity.value_counts(normalize=True))
        print('Predicted Distribution: ')
        print(val_data.guess1.value_counts(normalize=True))

    
    print('----------------------------------------------------')

    return rmses, guess_train_preds


In [122]:
tr_data.sort_values(by='date', inplace=True)
val_data.sort_values(by='date', inplace=True)

In [123]:
tr_data

Unnamed: 0,uid,latitude,longitude,date,split,year,month,week,season,region,severity,density
4387,evep,44.847993,-93.476318,2013-01-04,train,2013,1,1,1,midwest,1.0,115.0
6144,guny,44.878889,-93.490833,2013-01-04,train,2013,1,1,1,midwest,1.0,558.0
5317,fwbt,44.850500,-93.515700,2013-01-04,train,2013,1,1,1,midwest,1.0,476.0
13283,oqcg,37.114500,-120.890000,2013-01-08,train,2013,1,2,1,west,4.0,4500000.0
13827,pfly,37.803400,-120.841000,2013-01-08,train,2013,1,2,1,west,4.0,2881767.5
...,...,...,...,...,...,...,...,...,...,...,...,...
6718,hkvs,36.030000,-78.706429,2021-12-14,train,2021,12,50,1,south,2.0,31769.0
14516,pykd,36.030000,-78.706927,2021-12-14,train,2021,12,50,1,south,2.0,51737.0
17778,tobi,36.030000,-78.705932,2021-12-14,train,2021,12,50,1,south,1.0,4357.0
23159,zoaj,36.060000,-78.760000,2021-12-14,train,2021,12,50,1,south,2.0,48233.0


In [124]:
val_data

Unnamed: 0,uid,latitude,longitude,date,split,year,month,week,season,region,severity,density,guess,guess2
13644,paev,44.822478,-93.367962,2013-01-04,train,2013,1,1,1,midwest,1.0,1884.0,1.0,1.0
5566,gdxr,44.877646,-93.557842,2013-01-04,train,2013,1,1,1,midwest,1.0,1416.0,1.0,1.0
1126,bgwz,37.413900,-121.014000,2013-01-08,train,2013,1,2,1,west,4.0,3740000.0,3.0,4.0
16227,rwkd,38.115600,-121.494000,2013-01-15,train,2013,1,3,1,west,4.0,1745249.0,3.0,4.0
13719,pceh,37.967400,-121.464000,2013-01-15,train,2013,1,3,1,west,3.0,985182.0,3.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6435,hdjp,35.686281,-79.200202,2021-12-02,train,2021,12,48,1,south,1.0,9958.0,1.0,2.0
549,aqae,35.658149,-79.252453,2021-12-02,train,2021,12,48,1,south,1.0,290.0,1.0,2.0
15266,quux,35.877009,-78.893845,2021-12-02,train,2021,12,48,1,south,1.0,16980.0,1.0,2.0
5806,gkeq,37.263900,-120.906000,2021-12-13,train,2021,12,50,1,west,4.0,6797500.0,4.0,4.0


In [125]:
for row in tqdm(val_data.itertuples(), total=val_data.shape[0]):
    val_data.loc[row.Index, 'guess'] = make_guess1(row)

100%|██████████| 2559/2559 [00:42<00:00, 60.57it/s] 


In [126]:
rmse(val_data.severity, val_data.guess)

0.8548416232611225

In [19]:
analyize_matches(val_data.severity, val_data.guess)

Exact matches:  0.6053145760062525
Missed by 1:  0.3028526768268855
Missed by 2:  0.08245408362641657
Missed by 3:  0.008206330597889801
Missed by 4:  0.0011723329425556857


In [20]:
# 0.08284486127393513 + 0.008206330597889801 + 0.0011723329425556857
# 91% preds < 1 offs, 99% preds < 2 offs, 88% preds == 2 offs

In [21]:
# what are the samples missing by 1

val_data[val_data.severity - val_data.guess == 1]

Unnamed: 0,uid,latitude,longitude,date,split,year,month,week,season,region,severity,density,guess
1126,bgwz,37.413900,-121.014000,2013-01-08,train,2013,1,2,1,west,4.0,3.740000e+06,3.0
16227,rwkd,38.115600,-121.494000,2013-01-15,train,2013,1,3,1,west,4.0,1.745249e+06,3.0
1178,bils,35.658042,-79.252651,2013-01-29,train,2013,1,5,1,south,2.0,5.184600e+04,1.0
629,ascv,35.794000,-79.004000,2013-02-12,train,2013,2,7,1,south,2.0,3.664900e+04,1.0
19900,vyle,32.384010,-104.145830,2013-03-28,train,2013,3,13,2,west,2.0,2.182287e+04,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8249,jdhb,39.077955,-96.880810,2021-09-20,train,2021,9,38,4,midwest,3.0,4.326020e+05,2.0
12915,ofvy,37.782398,-97.531050,2021-10-04,train,2021,10,40,4,midwest,3.0,1.587410e+05,2.0
376,aljv,39.211550,-97.005590,2021-10-11,train,2021,10,41,4,midwest,3.0,1.590030e+05,2.0
9495,klbq,35.876636,-78.890862,2021-10-12,train,2021,10,41,4,south,2.0,9.703900e+04,1.0


In [22]:
val_data[val_data.uid == 'bgwz']

Unnamed: 0,uid,latitude,longitude,date,split,year,month,week,season,region,severity,density,guess
1126,bgwz,37.4139,-121.014,2013-01-08,train,2013,1,2,1,west,4.0,3740000.0,3.0


In [23]:
some_data = tr_data[tr_data.date == '2013-01-08']
some_data

Unnamed: 0,uid,latitude,longitude,date,split,year,month,week,season,region,severity,density
14259,prgf,37.4918,-120.684,2013-01-08,train,2013,1,2,1,west,4.0,4529556.5
2107,cizy,37.1366,-120.762,2013-01-08,train,2013,1,2,1,west,4.0,4482500.0
13499,owaj,37.3772,-121.058,2013-01-08,train,2013,1,2,1,west,4.0,3755000.0
10819,lwjc,37.515,-121.012,2013-01-08,train,2013,1,2,1,west,4.0,1054871.0
13840,pfsh,37.4419,-121.003,2013-01-08,train,2013,1,2,1,west,3.0,136538.0
5172,fryq,37.2583,-120.475,2013-01-08,train,2013,1,2,1,west,4.0,4135845.75
9919,kwua,37.6003,-121.224,2013-01-08,train,2013,1,2,1,west,2.0,86350.0
2366,cqge,37.1976,-120.488,2013-01-08,train,2013,1,2,1,west,4.0,3324651.5
20554,wrxx,37.3133,-120.892,2013-01-08,train,2013,1,2,1,west,4.0,4070390.0
16589,sgtc,37.3204,-120.983,2013-01-08,train,2013,1,2,1,west,4.0,4027500.0


In [24]:
# lets see bgwz

make_guess1(val_data[val_data.uid == 'bgwz'].iloc[0])

3.0

In [25]:
dists = []
for some_row in some_data.itertuples():
        dist = get_distance(37.4139, -121.014, some_row.latitude, some_row.longitude)
        dists.append(dist)

In [26]:
some_data.iloc[np.argmin(dists)]

uid                         pfsh
latitude                 37.4419
longitude               -121.003
date         2013-01-08 00:00:00
split                      train
year                        2013
month                          1
week                           2
season                         1
region                      west
severity                     3.0
density                 136538.0
Name: 13840, dtype: object

In [27]:
# Observations:

#  In this case the closest one is 3, but the actual value is 4
#  I guess it is far better if we search for the same location in the past or try mode of the region

In [28]:
# what are the samples missing by 2

val_data[val_data.severity - val_data.guess == 2]

Unnamed: 0,uid,latitude,longitude,date,split,year,month,week,season,region,severity,density,guess
10194,leit,35.657803,-79.253096,2013-03-13,train,2013,3,11,2,south,3.0,517495.0,1.0
12916,ofxt,35.976000,-78.712644,2013-05-14,train,2013,5,20,2,south,3.0,258059.0,1.0
12054,nhaw,35.859897,-78.756888,2013-06-04,train,2013,6,23,3,south,3.0,121991.0,1.0
2778,dbwj,39.628330,-99.580000,2013-06-24,train,2013,6,26,3,midwest,4.0,1204875.0,2.0
9775,ktdj,36.177000,-79.053877,2013-06-26,train,2013,6,26,3,south,3.0,613345.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14082,pmwk,35.892413,-79.017132,2021-07-12,train,2021,7,28,3,south,3.0,445365.0,1.0
23233,zqbi,35.790000,-79.037119,2021-07-12,train,2021,7,28,3,south,3.0,117925.0,1.0
2241,cnei,41.550878,-86.361626,2021-08-30,train,2021,8,35,3,midwest,4.0,1694159.0,2.0
11467,mpss,37.746680,-97.779360,2021-09-13,train,2021,9,37,4,midwest,3.0,136045.0,1.0


In [29]:
make_guess1(val_data[val_data.uid == 'leit'].iloc[0])


1.0

In [30]:
leit_date = '2013-03-13'

leit_data = tr_data[tr_data.date == leit_date]
leit_data

Unnamed: 0,uid,latitude,longitude,date,split,year,month,week,season,region,severity,density
4319,etfx,35.701531,-79.171876,2013-03-13,train,2013,3,11,2,south,1.0,1017.0
7443,igat,35.91027,-79.160055,2013-03-13,train,2013,3,11,2,south,1.0,3970.0
20379,wndy,35.910208,-79.159558,2013-03-13,train,2013,3,11,2,south,1.0,11909.0
831,ayfy,35.701504,-79.171926,2013-03-13,train,2013,3,11,2,south,1.0,218.0
4197,eqdu,35.657883,-79.252948,2013-03-13,train,2013,3,11,2,south,1.0,581.0
8679,jpte,35.794,-79.004,2013-03-13,train,2013,3,11,2,south,3.0,193734.0
19154,vcho,35.859275,-78.751916,2013-03-13,train,2013,3,11,2,south,1.0,871.0
15152,qrhs,35.859306,-78.752165,2013-03-13,train,2013,3,11,2,south,1.0,2324.0
18806,usoo,35.910301,-79.160303,2013-03-13,train,2013,3,11,2,south,1.0,9159.0


In [31]:
dists = []
for some_row in some_data.itertuples():
        dist = get_distance(37.4139, -121.014, some_row.latitude, some_row.longitude)
        dists.append(dist)

## With guess2

In [208]:
val_data_calls = {}
count = 0

In [209]:
for row in tqdm(val_data.itertuples(), total=val_data.shape[0]):
    val_data.loc[row.Index, 'guess2'] = make_guess2(row, n_times_called=val_data_calls)

  5%|▌         | 133/2559 [00:00<00:05, 467.89it/s]

No data for this date, trying previous day.. for vyle
No data for this date, trying previous day.. for veuv
No data for this date, trying previous day.. for zxjy


  7%|▋         | 180/2559 [00:00<00:05, 465.32it/s]

No data for this date, trying previous day.. for xdtz
No data for this date, trying previous day.. for xdtz
No data for this date, trying previous day.. for xdtz
No data for this date, trying previous day.. for xdtz
No data for this date, trying previous day.. for xdtz
No data for this date, trying previous day.. for xdtz
No data for this date, trying previous day.. for xdtz
No data for this date, trying previous day.. for xdtz
No data for this date, trying previous day.. for kxjx
No data for this date, trying previous day.. for kxjx
No data for this date, trying previous day.. for kxjx
No data for this date, trying previous day.. for kxjx
No data for this date, trying previous day.. for kxjx
No data for this date, trying previous day.. for kxjx
No data for this date, trying previous day.. for kxjx
No data for this date, trying previous day.. for hkyf
No data for this date, trying previous day.. for hkyf
No data for this date, trying previous day.. for hkyf
No data for this date, tryin

 19%|█▉        | 497/2559 [00:01<00:03, 532.48it/s]

No data for this date, trying previous day.. for msnp
No data for this date, trying previous day.. for opci
No data for this date, trying previous day.. for ltai


 24%|██▎       | 604/2559 [00:01<00:04, 472.29it/s]

No data for this date, trying previous day.. for reah
No data for this date, trying previous day.. for reah
No data for this date, trying previous day.. for reah
No data for this date, trying previous day.. for reah
No data for this date, trying previous day.. for reah
No data for this date, trying previous day.. for reah
No data for this date, trying previous day.. for reah
No data for this date, trying previous day.. for reah
No data for this date, trying previous day.. for reah
No data for this date, trying previous day.. for reah
No data for this date, trying previous day.. for reah
No data for this date, trying previous day.. for reah
No data for this date, trying previous day.. for reah
No data for this date, trying previous day.. for reah
No data for this date, trying previous day.. for reah
No data for this date, trying previous day.. for reah
No data for this date, trying previous day.. for reah
No data for this date, trying previous day.. for reah
No data for this date, tryin

 30%|██▉       | 763/2559 [00:01<00:03, 501.66it/s]

No data for this date, trying previous day.. for rbfi
No data for this date, trying previous day.. for wqrh
No data for this date, trying previous day.. for vmgb
No data for this date, trying previous day.. for ipze
No data for this date, trying previous day.. for ipze
No data for this date, trying previous day.. for ipze
No data for this date, trying previous day.. for ipze
No data for this date, trying previous day.. for ipze
No data for this date, trying previous day.. for ipze
No data for this date, trying previous day.. for ipze
No data for this date, trying previous day.. for ipze
No data for this date, trying previous day.. for ipze
No data for this date, trying previous day.. for ipze
No data for this date, trying previous day.. for ipze
No data for this date, trying previous day.. for ipze
No data for this date, trying previous day.. for ipze
No data for this date, trying previous day.. for ipze
No data for this date, trying previous day.. for ipze
No data for this date, tryin

 32%|███▏      | 814/2559 [00:01<00:06, 264.77it/s]

No data for this date, trying previous day.. for ipze
No data for this date, trying previous day.. for ipze
No data for this date, trying previous day.. for ipze
No data for this date, trying previous day.. for ipze
No data for this date, trying previous day.. for ipze
No data for this date, trying previous day.. for ipze
No data for this date, trying previous day.. for ipze
No data for this date, trying previous day.. for ipze
No data for this date, trying previous day.. for ipze
No data for this date, trying previous day.. for ipze
No data for this date, trying previous day.. for ipze
No data for this date, trying previous day.. for ipze
No data for this date, trying previous day.. for ipze
No data for this date, trying previous day.. for ipze
No data for this date, trying previous day.. for ipze
No data for this date, trying previous day.. for ipze
No data for this date, trying previous day.. for ipze
No data for this date, trying previous day.. for ipze
No data for this date, tryin

 33%|███▎      | 854/2559 [00:02<00:06, 274.91it/s]

No data for this date, trying previous day.. for kwhr
No data for this date, trying previous day.. for kwhr
No data for this date, trying previous day.. for kwhr
No data for this date, trying previous day.. for kwhr
No data for this date, trying previous day.. for kwhr
No data for this date, trying previous day.. for kwhr
No data for this date, trying previous day.. for kwhr
No data for this date, trying previous day.. for kwhr
No data for this date, trying previous day.. for kwhr
No data for this date, trying previous day.. for kwhr
No data for this date, trying previous day.. for kwhr
No data for this date, trying previous day.. for kwhr
No data for this date, trying previous day.. for kwhr
No data for this date, trying previous day.. for kwhr
No data for this date, trying previous day.. for wqqw
No data for this date, trying previous day.. for ckxf
No data for this date, trying previous day.. for ckxf
No data for this date, trying previous day.. for ckxf
No data for this date, tryin

 38%|███▊      | 975/2559 [00:02<00:05, 312.54it/s]

No data for this date, trying previous day.. for gpzd
No data for this date, trying previous day.. for gpzd
No data for this date, trying previous day.. for gpzd
No data for this date, trying previous day.. for gpzd
No data for this date, trying previous day.. for gpzd
No data for this date, trying previous day.. for gpzd
No data for this date, trying previous day.. for gpzd
No data for this date, trying previous day.. for gpzd
No data for this date, trying previous day.. for gpzd
No data for this date, trying previous day.. for gpzd
No data for this date, trying previous day.. for gpzd
No data for this date, trying previous day.. for gpzd
No data for this date, trying previous day.. for gpzd
No data for this date, trying previous day.. for gpzd
No data for this date, trying previous day.. for poau
No data for this date, trying previous day.. for qaqd


 42%|████▏     | 1067/2559 [00:02<00:04, 363.31it/s]

No data for this date, trying previous day.. for fqwk
No data for this date, trying previous day.. for fqwk
No data for this date, trying previous day.. for fqwk
No data for this date, trying previous day.. for fqwk
No data for this date, trying previous day.. for fqwk
No data for this date, trying previous day.. for fqwk
No data for this date, trying previous day.. for fqwk
No data for this date, trying previous day.. for fqwk
No data for this date, trying previous day.. for fqwk
No data for this date, trying previous day.. for fqwk
No data for this date, trying previous day.. for fqwk
No data for this date, trying previous day.. for fqwk
No data for this date, trying previous day.. for frrq
No data for this date, trying previous day.. for frrq
No data for this date, trying previous day.. for frrq
No data for this date, trying previous day.. for frrq
No data for this date, trying previous day.. for frrq
No data for this date, trying previous day.. for frrq
No data for this date, tryin

 47%|████▋     | 1191/2559 [00:03<00:03, 371.91it/s]

No data for this date, trying previous day.. for zafy
No data for this date, trying previous day.. for ygcr
No data for this date, trying previous day.. for ygcr
No data for this date, trying previous day.. for ygcr
No data for this date, trying previous day.. for ygcr
No data for this date, trying previous day.. for ygcr
No data for this date, trying previous day.. for ygcr
No data for this date, trying previous day.. for ldgb
No data for this date, trying previous day.. for ldgb
No data for this date, trying previous day.. for ldgb
No data for this date, trying previous day.. for ldgb
No data for this date, trying previous day.. for ldgb
No data for this date, trying previous day.. for ldgb
No data for this date, trying previous day.. for ldgb
No data for this date, trying previous day.. for ldgb
No data for this date, trying previous day.. for ldgb
No data for this date, trying previous day.. for ldgb
No data for this date, trying previous day.. for ldgb
No data for this date, tryin

 50%|█████     | 1280/2559 [00:03<00:03, 402.05it/s]

No data for this date, trying previous day.. for abdk
No data for this date, trying previous day.. for nute
No data for this date, trying previous day.. for bbsi
No data for this date, trying previous day.. for lmpf
No data for this date, trying previous day.. for hevf
No data for this date, trying previous day.. for mxos


 56%|█████▌    | 1429/2559 [00:03<00:02, 453.34it/s]

No data for this date, trying previous day.. for ojaz
No data for this date, trying previous day.. for jirh
No data for this date, trying previous day.. for zqah
No data for this date, trying previous day.. for wjjr
No data for this date, trying previous day.. for xabs
No data for this date, trying previous day.. for xabs
No data for this date, trying previous day.. for xabs
No data for this date, trying previous day.. for xabs
No data for this date, trying previous day.. for mghh
No data for this date, trying previous day.. for hefu
No data for this date, trying previous day.. for hefu
No data for this date, trying previous day.. for hefu
No data for this date, trying previous day.. for hefu
No data for this date, trying previous day.. for hefu
No data for this date, trying previous day.. for hefu
No data for this date, trying previous day.. for hefu
No data for this date, trying previous day.. for hefu
No data for this date, trying previous day.. for hefu
No data for this date, tryin

 62%|██████▏   | 1582/2559 [00:03<00:01, 490.80it/s]

No data for this date, trying previous day.. for tkdo
No data for this date, trying previous day.. for crcs
No data for this date, trying previous day.. for crcs


 66%|██████▌   | 1687/2559 [00:04<00:01, 469.98it/s]

No data for this date, trying previous day.. for bois
No data for this date, trying previous day.. for xwpr
No data for this date, trying previous day.. for uokp
No data for this date, trying previous day.. for xvok
No data for this date, trying previous day.. for xvok
No data for this date, trying previous day.. for xvok
No data for this date, trying previous day.. for xvok
No data for this date, trying previous day.. for akcy
No data for this date, trying previous day.. for yhdm
No data for this date, trying previous day.. for xkff


 68%|██████▊   | 1735/2559 [00:04<00:02, 403.57it/s]

No data for this date, trying previous day.. for dpre
No data for this date, trying previous day.. for dpre
No data for this date, trying previous day.. for dpre
No data for this date, trying previous day.. for dpre
No data for this date, trying previous day.. for dpre
No data for this date, trying previous day.. for dpre
No data for this date, trying previous day.. for dpre
No data for this date, trying previous day.. for dpre
No data for this date, trying previous day.. for dpre
No data for this date, trying previous day.. for dpre
No data for this date, trying previous day.. for dpre
No data for this date, trying previous day.. for dpre
No data for this date, trying previous day.. for dpre
No data for this date, trying previous day.. for qbua
No data for this date, trying previous day.. for qbua
No data for this date, trying previous day.. for qbua
No data for this date, trying previous day.. for pptw
No data for this date, trying previous day.. for pptw
No data for this date, tryin

 71%|███████   | 1816/2559 [00:04<00:02, 277.79it/s]

No data for this date, trying previous day.. for agqj
No data for this date, trying previous day.. for agqj
No data for this date, trying previous day.. for agqj
No data for this date, trying previous day.. for agqj
No data for this date, trying previous day.. for agqj
No data for this date, trying previous day.. for agqj
No data for this date, trying previous day.. for agqj
No data for this date, trying previous day.. for agqj
No data for this date, trying previous day.. for agqj
No data for this date, trying previous day.. for agqj
No data for this date, trying previous day.. for agqj
No data for this date, trying previous day.. for agqj
No data for this date, trying previous day.. for agqj
No data for this date, trying previous day.. for agqj
No data for this date, trying previous day.. for agqj
No data for this date, trying previous day.. for agqj
No data for this date, trying previous day.. for frxr
No data for this date, trying previous day.. for frxr
No data for this date, tryin

 74%|███████▍  | 1899/2559 [00:04<00:02, 314.78it/s]

No data for this date, trying previous day.. for egsc
No data for this date, trying previous day.. for egsc
No data for this date, trying previous day.. for egsc
No data for this date, trying previous day.. for egsc
No data for this date, trying previous day.. for egsc
No data for this date, trying previous day.. for egsc
No data for this date, trying previous day.. for egsc
No data for this date, trying previous day.. for egsc
No data for this date, trying previous day.. for egsc
No data for this date, trying previous day.. for egsc
No data for this date, trying previous day.. for egsc
No data for this date, trying previous day.. for egsc
No data for this date, trying previous day.. for egsc
No data for this date, trying previous day.. for egsc
No data for this date, trying previous day.. for egsc
No data for this date, trying previous day.. for egsc
No data for this date, trying previous day.. for egsc
No data for this date, trying previous day.. for egsc
No data for this date, tryin

 77%|███████▋  | 1982/2559 [00:05<00:01, 356.38it/s]

No data for this date, trying previous day.. for iqvz
No data for this date, trying previous day.. for iqvz
No data for this date, trying previous day.. for iqvz
No data for this date, trying previous day.. for iqvz
No data for this date, trying previous day.. for iqvz
No data for this date, trying previous day.. for iqvz
No data for this date, trying previous day.. for iqvz
No data for this date, trying previous day.. for qrsj
No data for this date, trying previous day.. for qrsj
No data for this date, trying previous day.. for qrsj
No data for this date, trying previous day.. for oesj
No data for this date, trying previous day.. for ylcj
No data for this date, trying previous day.. for ylcj
No data for this date, trying previous day.. for ylcj
No data for this date, trying previous day.. for ylcj
No data for this date, trying previous day.. for ylcj
No data for this date, trying previous day.. for ylcj
No data for this date, trying previous day.. for ylcj
No data for this date, tryin

 81%|████████  | 2072/2559 [00:05<00:01, 401.21it/s]

No data for this date, trying previous day.. for copu
No data for this date, trying previous day.. for copu
No data for this date, trying previous day.. for copu
No data for this date, trying previous day.. for copu
No data for this date, trying previous day.. for copu
No data for this date, trying previous day.. for cbnk
No data for this date, trying previous day.. for cbnk
No data for this date, trying previous day.. for cbnk
No data for this date, trying previous day.. for cbnk
No data for this date, trying previous day.. for cbnk
No data for this date, trying previous day.. for cbnk
No data for this date, trying previous day.. for cbnk


 85%|████████▍ | 2165/2559 [00:05<00:00, 427.50it/s]

No data for this date, trying previous day.. for tmsd
No data for this date, trying previous day.. for tmsd
No data for this date, trying previous day.. for tmsd
No data for this date, trying previous day.. for tmsd
No data for this date, trying previous day.. for tmsd
No data for this date, trying previous day.. for tmsd
No data for this date, trying previous day.. for tmsd
No data for this date, trying previous day.. for nnnd


 88%|████████▊ | 2264/2559 [00:05<00:00, 458.01it/s]

No data for this date, trying previous day.. for tgmt
No data for this date, trying previous day.. for lbby
No data for this date, trying previous day.. for lbby
No data for this date, trying previous day.. for lbby
No data for this date, trying previous day.. for lbby
No data for this date, trying previous day.. for aclb
No data for this date, trying previous day.. for aclb
No data for this date, trying previous day.. for aclb
No data for this date, trying previous day.. for aclb
No data for this date, trying previous day.. for aclb
No data for this date, trying previous day.. for aclb
No data for this date, trying previous day.. for aclb
No data for this date, trying previous day.. for aclb
No data for this date, trying previous day.. for aclb
No data for this date, trying previous day.. for aclb
No data for this date, trying previous day.. for aclb
No data for this date, trying previous day.. for aclb
No data for this date, trying previous day.. for aclb
No data for this date, tryin

 92%|█████████▏| 2356/2559 [00:05<00:00, 420.38it/s]

No data for this date, trying previous day.. for nrrp
No data for this date, trying previous day.. for gpdn


 98%|█████████▊| 2507/2559 [00:06<00:00, 461.56it/s]

No data for this date, trying previous day.. for lxjy
No data for this date, trying previous day.. for lxjy
No data for this date, trying previous day.. for lxjy
No data for this date, trying previous day.. for lxjy
No data for this date, trying previous day.. for lxjy
No data for this date, trying previous day.. for lxjy
No data for this date, trying previous day.. for htzk
No data for this date, trying previous day.. for qoyi
No data for this date, trying previous day.. for jvfg
No data for this date, trying previous day.. for fgio


100%|██████████| 2559/2559 [00:06<00:00, 400.20it/s]

No data for this date, trying previous day.. for gkeq
No data for this date, trying previous day.. for gkeq
No data for this date, trying previous day.. for gkeq
No data for this date, trying previous day.. for gkeq
No data for this date, trying previous day.. for gkeq
No data for this date, trying previous day.. for gkeq
No data for this date, trying previous day.. for gkeq
No data for this date, trying previous day.. for gkeq
No data for this date, trying previous day.. for gkeq
No data for this date, trying previous day.. for gkeq
No data for this date, trying previous day.. for gkeq
No data for this date, trying previous day.. for gkeq
No data for this date, trying previous day.. for gkeq
No data for this date, trying previous day.. for gkeq





In [211]:
count/val_data.shape[0]

0.28487690504103164

In [216]:
# this could be the reason! val data goes for previous dates about 1 in 4 samples. but the test data goes to previous date about 3 dates forward for every sample.!!

# Validate  val data only if it looks like test data

In [194]:
rmse(val_data.severity, val_data.guess2)
#  this is worse on first glance but..

0.7857663030948374

In [195]:
print(len(val_data_calls))

max(val_data_calls.values())

2559


1

In [129]:
analyize_matches(val_data.severity, val_data.guess2)

#  much better at exact matches and 1 offs but worse at 2 offs so missed by 2s should have spiked up the error!

Exact matches:  0.5607659241891364
Missed by 1:  0.3911684251660805
Missed by 2:  0.04181320828448613
Missed by 3:  0.005861664712778429
Missed by 4:  0.00039077764751856197


In [98]:
# 0.5592028135990621 + 0.39234075810863617 + 0.04181320828448613 + 0.0062524423602969914 + 0.00039077764751856197
# 0.5592028135990621 + 0.39234075810863617

# 95% preds <= 1 offs (thats why they are goood!)  4.8% >= 2 offs

0.04845642829230168

In [130]:
rmses, guess2_mean = cv_loop(rand=1859, splits=5,  guess_func=make_guess2)

Fold: 0


 12%|█▏        | 347/2843 [00:03<00:19, 131.06it/s]

No previous data for this date filling in 2s .. for evep


 35%|███▌      | 1009/2843 [00:11<00:22, 83.30it/s]

No previous data for this date filling in 2s .. for fvng
No previous data for this date filling in 2s .. for fwbt


 42%|████▏     | 1204/2843 [00:13<00:15, 108.99it/s]

No previous data for this date filling in 2s .. for gdxr


 49%|████▉     | 1399/2843 [00:15<00:25, 57.62it/s] 

No previous data for this date filling in 2s .. for gkvw


 58%|█████▊    | 1635/2843 [00:18<00:07, 156.57it/s]

No previous data for this date filling in 2s .. for guny


100%|██████████| 2843/2843 [00:29<00:00, 94.95it/s] 


RMSE:  0.9186005352544053
Train Distribution: 
1.0    0.437258
4.0    0.211248
2.0    0.184183
3.0    0.163796
5.0    0.003515
Name: severity, dtype: float64
Val Distribution: 
1.0    0.447415
4.0    0.206120
2.0    0.188533
3.0    0.154063
5.0    0.003869
Name: severity, dtype: float64
Predicted Distribution: 
2    0.347168
1    0.345762
4    0.212452
3    0.091453
5    0.003166
Name: guess1, dtype: float64
Fold: 1


 45%|████▍     | 1272/2843 [00:10<00:20, 78.23it/s] 

No previous data for this date filling in 2s .. for kmki


100%|██████████| 2843/2843 [00:28<00:00, 99.77it/s] 


RMSE:  0.8637377567389661
Train Distribution: 
1.0    0.442335
4.0    0.208685
2.0    0.186357
3.0    0.158931
5.0    0.003692
Name: severity, dtype: float64
Val Distribution: 
1.0    0.444953
4.0    0.212100
2.0    0.186423
3.0    0.153007
5.0    0.003517
Name: severity, dtype: float64
Predicted Distribution: 
2    0.371087
1    0.349279
4    0.221949
3    0.056630
5    0.001055
Name: guess1, dtype: float64
Fold: 2


100%|██████████| 2843/2843 [00:35<00:00, 79.06it/s] 


RMSE:  0.8151312611573267
Train Distribution: 
1.0    0.443207
4.0    0.209823
2.0    0.186379
3.0    0.156957
5.0    0.003634
Name: severity, dtype: float64
Val Distribution: 
1.0    0.451987
4.0    0.204010
2.0    0.185719
3.0    0.156877
5.0    0.001407
Name: severity, dtype: float64
Predicted Distribution: 
2    0.406964
1    0.320084
4    0.199085
3    0.073514
5    0.000352
Name: guess1, dtype: float64
Fold: 3


100%|██████████| 2843/2843 [00:22<00:00, 127.40it/s]


RMSE:  0.8493650770539654
Train Distribution: 
1.0    0.445402
4.0    0.208370
2.0    0.186214
3.0    0.156937
5.0    0.003077
Name: severity, dtype: float64
Val Distribution: 
1.0    0.429828
4.0    0.208934
2.0    0.198382
3.0    0.157580
5.0    0.005276
Name: severity, dtype: float64
Predicted Distribution: 
2    0.416110
1    0.301090
4    0.218079
3    0.064369
5    0.000352
Name: guess1, dtype: float64
Fold: 4


100%|██████████| 2843/2843 [00:23<00:00, 120.12it/s]

RMSE:  0.8001055153771268
Train Distribution: 
1.0    0.442287
4.0    0.208483
2.0    0.188647
3.0    0.157065
5.0    0.003517
Name: severity, dtype: float64
Val Distribution: 
1.0    0.425255
4.0    0.205065
2.0    0.195920
3.0    0.170946
5.0    0.002814
Name: severity, dtype: float64
Predicted Distribution: 
2    0.410482
1    0.310236
4    0.205065
3    0.073162
5    0.001055
Name: guess1, dtype: float64
----------------------------------------------------





In [131]:
np.mean(rmses), np.std(rmses)

(0.8493880291163581, 0.04146945508927233)

In [132]:
analyize_matches(train_data.severity, guess2_mean)

Exact matches:  0.45791324736225086
Missed by 1:  0.3899179366940211
Missed by 2:  0.0794841735052755
Missed by 3:  0.03622508792497069
Missed by 4:  0.03587338804220398


In [133]:
0.4587338804220399 + 0.3884525205158265

0.8471864009378665

In [46]:
#  Is there any difference between test and val sets?

te_data.region.value_counts(normalize=True)

west         0.366667
midwest      0.240399
south        0.232104
northeast    0.160829
Name: region, dtype: float64

In [50]:
val_data.region.value_counts(normalize=True)

south        0.579132
west         0.221180
midwest      0.130911
northeast    0.068777
Name: region, dtype: float64

In [51]:
train_data.region.value_counts(normalize=True)

south        0.583118
west         0.220926
midwest      0.128957
northeast    0.066999
Name: region, dtype: float64

In [52]:
test_data.date.value_counts(normalize=True)

2019-09-17    40
2014-07-30    38
2019-01-17    32
2019-08-27    30
2019-05-28    29
              ..
2020-01-17     1
2014-06-16     1
2018-05-23     1
2020-11-05     1
2013-11-21     1
Name: date, Length: 1278, dtype: int64

In [53]:
train_data.date.value_counts(normalize=True)

2016-08-10    0.008089
2015-07-07    0.006389
2015-06-22    0.006038
2015-08-24    0.005627
2015-06-10    0.005569
                ...   
2018-12-10    0.000059
2014-02-20    0.000059
2021-03-18    0.000059
2021-02-24    0.000059
2020-04-20    0.000059
Name: date, Length: 1255, dtype: float64

In [134]:
south = val_data[val_data.region == 'south']
west = val_data[val_data.region == 'west']
midwest = val_data[val_data.region == 'midwest']
northeast = val_data[val_data.region == 'northeast']

In [135]:
rmse(south.severity, south.guess2), rmse(south.severity, south.guess)

(0.8131842079168767, 0.9132404377632387)

In [136]:
rmse(west.severity, west.guess2), rmse(west.severity, west.guess)

(0.5670579955507383, 0.5763293130075913)

In [137]:
rmse(midwest.severity, midwest.guess2), rmse(midwest.severity, midwest.guess)

(0.9288092200038601, 0.9478963354115085)

In [138]:
rmse(northeast.severity, northeast.guess2), rmse(northeast.severity, northeast.guess)

(0.856127645538062, 0.9076693430779935)

In [139]:
rmse(val_data.guess2, val_data.guess)

0.6790564218619121

In [56]:
analyize_matches(val_data.guess2, val_data.guess)

Exact matches:  0.6228995701445877
Missed by 1:  0.3540445486518171
Missed by 2:  0.022274325908558032
Missed by 3:  0.0007815552950371239
Missed by 4:  0.0


In [142]:
rg = te_data.region.value_counts(normalize=True)
rg

west         0.366667
midwest      0.240399
south        0.232104
northeast    0.160829
Name: region, dtype: float64

In [143]:
0.8131842079168767 * rg['south']  + 0.5670579955507383 * rg['west'] + 0.9288092200038601 * rg['midwest'] + 0.856127645538062 * rg['northeast']

0.7576406832145016

In [114]:
west = train_data[train_data.region == 'west'].sample(366*2, random_state=42)
south = train_data[train_data.region == 'south'].sample(240*2, random_state=42)
midwest = train_data[train_data.region == 'midwest'].sample(232*2, random_state=42)
northeast = train_data[train_data.region == 'northeast'].sample(160*2, random_state=42)


new_val_data = pd.concat([west, south, midwest, northeast])
new_val_data.region.value_counts(normalize=True)

west         0.366733
south        0.240481
midwest      0.232465
northeast    0.160321
Name: region, dtype: float64

In [115]:
new_tr_data = train_data[~train_data.uid.isin(new_val_data.uid)]
new_tr_data.region.value_counts(normalize=True)

south        0.628518
west         0.201606
midwest      0.115242
northeast    0.054634
Name: region, dtype: float64

In [197]:
newval_calls = {}
for row in tqdm(new_val_data.itertuples(), total=new_val_data.shape[0]):
    new_val_data.loc[row.Index, 'guess2'] = make_guess2(row, tr_data=new_tr_data, n_times_called=newval_calls)
    

100%|██████████| 1996/1996 [00:05<00:00, 334.32it/s]


In [199]:
print(len(newval_calls))

max(newval_calls.values())

1996


1

In [198]:
rmse(new_val_data.severity, new_val_data.guess2)

0.7734315695558147

In [117]:
new_val_data.severity.value_counts(normalize=True)

4.0    0.342685
1.0    0.318136
3.0    0.166834
2.0    0.165832
5.0    0.006513
Name: severity, dtype: float64

In [118]:
new_val_data.guess2.value_counts(normalize=True)
#  4 1 2 3 5

4.0    0.334669
2.0    0.294088
1.0    0.233968
3.0    0.133267
5.0    0.004008
Name: guess2, dtype: float64

In [120]:
analyize_matches(new_val_data.severity, new_val_data.guess2)

Exact matches:  0.6277555110220441
Missed by 1:  0.31312625250501
Missed by 2:  0.04909819639278557
Missed by 3:  0.009519038076152305
Missed by 4:  0.000501002004008016


In [215]:
count/test_data.shape[0]

3.1178187403993856

In [204]:
print(len(test_calls))

max(test_calls.values())

6510


1

In [85]:
test_data

Unnamed: 0,uid,latitude,longitude,date,split,year,month,week,season,region,severity,density,guess2
1,aabn,36.559700,-121.510000,2016-08-31,test,2016,8,35,3,west,,,2.0
12,aair,33.042600,-117.076000,2014-11-01,test,2014,11,44,4,west,,,4.0
14,aajw,40.703968,-80.293050,2015-08-26,test,2015,8,35,3,northeast,,,2.0
15,aalr,38.972500,-94.672930,2019-08-26,test,2019,8,35,3,midwest,,,3.0
16,aalw,34.279000,-118.905000,2018-01-08,test,2018,1,2,1,west,,,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
23556,zzpn,40.136410,-80.473740,2019-07-08,test,2019,7,28,3,northeast,,,5.0
23560,zzrv,36.875400,-121.561000,2019-09-17,test,2019,9,38,4,west,,,4.0
23563,zzsx,34.210000,-78.929389,2019-07-16,test,2019,7,29,3,south,,,1.0
23565,zzvv,36.708500,-121.749000,2014-12-02,test,2014,12,49,1,west,,,4.0


In [112]:
train_data[(train_data.date == '2016-08-17') & (train_data.region == 'west')]

Unnamed: 0,uid,latitude,longitude,date,split,year,month,week,season,region,severity,density
1467,bqzw,37.6756,-121.264,2016-08-17,train,2016,8,33,3,west,4.0,1765920.0
10710,ltlm,38.2361,-121.419,2016-08-17,train,2016,8,33,3,west,4.0,1680760.0
11941,ncup,37.9718,-121.374,2016-08-17,train,2016,8,33,3,west,4.0,1596380.0
20573,wspe,38.307,-121.794,2016-08-17,train,2016,8,33,3,west,4.0,1534027.0
22972,zilg,38.3677,-121.521,2016-08-17,train,2016,8,33,3,west,4.0,1920500.0


In [148]:
len(set(val_data.date) - set(train_data.date) ), len(set(test_data.date) - set(train_data.date)), len(set(new_val_data.date) - set(train_data.date))

(0, 382, 0)

In [147]:
#  So this could be one possible reason for the difference in val and test set scores

0

In [169]:
data = pd.concat([train_data, test_data])
data.sort_index(inplace=True)
data

Unnamed: 0,uid,latitude,longitude,date,split,year,month,week,season,region,severity,density,guess2,guess1
0,aabm,39.080319,-86.430867,2018-05-14,train,2018,5,20,2,midwest,1.0,585.0,,
1,aabn,36.559700,-121.510000,2016-08-31,test,2016,8,35,3,west,,,2.0,2.0
2,aacd,35.875083,-78.878434,2020-11-19,train,2020,11,47,4,south,1.0,290.0,,
3,aaee,35.487000,-79.062133,2016-08-24,train,2016,8,34,3,south,1.0,1614.0,,
4,aaff,38.049471,-99.827001,2019-07-23,train,2019,7,30,3,midwest,3.0,111825.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23565,zzvv,36.708500,-121.749000,2014-12-02,test,2014,12,49,1,west,,,4.0,4.0
23566,zzwo,39.792190,-99.971050,2017-06-19,train,2017,6,25,3,midwest,2.0,48510.0,,
23567,zzwq,35.794000,-79.012551,2015-03-24,train,2015,3,13,2,south,1.0,1271.0,,
23568,zzyb,35.742000,-79.238600,2016-11-21,train,2016,11,47,4,south,1.0,9682.0,,


In [175]:
data.sort_values(by='date', inplace=True)
data

Unnamed: 0,uid,latitude,longitude,date,split,year,month,week,season,region,severity,density,guess2,guess1
4387,evep,44.847993,-93.476318,2013-01-04,train,2013,1,1,1,midwest,1.0,115.0,,
13644,paev,44.822478,-93.367962,2013-01-04,train,2013,1,1,1,midwest,1.0,1884.0,,
5566,gdxr,44.877646,-93.557842,2013-01-04,train,2013,1,1,1,midwest,1.0,1416.0,,
6144,guny,44.878889,-93.490833,2013-01-04,train,2013,1,1,1,midwest,1.0,558.0,,
5317,fwbt,44.850500,-93.515700,2013-01-04,train,2013,1,1,1,midwest,1.0,476.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12443,nsoi,36.736800,-121.734000,2021-12-29,test,2021,12,52,1,west,,,4.0,4.0
17559,thki,36.725400,-121.730000,2021-12-29,test,2021,12,52,1,west,,,4.0,4.0
17452,teuu,36.772300,-121.788000,2021-12-29,test,2021,12,52,1,west,,,4.0,4.0
14254,prfi,36.751800,-121.742000,2021-12-29,test,2021,12,52,1,west,,,4.0,4.0


In [167]:
# Ig the exact answer lies in How is the test set different from val set?

#  find difference in consecutive dates between test samples


te_data.date.diff().value_counts()

0 days     5232
1 days      784
4 days      112
2 days       93
5 days       85
3 days       62
6 days       45
7 days       43
8 days       12
13 days       7
11 days       6
10 days       4
17 days       4
9 days        4
19 days       3
12 days       3
16 days       3
26 days       2
20 days       2
14 days       1
15 days       1
18 days       1
Name: date, dtype: int64

In [168]:
val_data.date.diff().value_counts()


0 days     1737
1 days      371
5 days       69
2 days       68
6 days       68
4 days       64
7 days       53
3 days       39
8 days       21
14 days      11
13 days      10
10 days       7
19 days       6
9 days        6
12 days       4
20 days       4
15 days       3
11 days       3
34 days       2
16 days       2
23 days       1
35 days       1
18 days       1
24 days       1
26 days       1
17 days       1
30 days       1
21 days       1
33 days       1
39 days       1
Name: date, dtype: int64

In [229]:
#  Understanding how the data is split into test set is the key!


In [232]:
train_data['location'] = train_data.longitude.astype(str) + train_data.latitude.astype(str)
train_data.location.value_counts()

-78.96708942.027378          94
-120.8537.2953               87
-121.79438.307               84
-120.85137.2486              82
-121.52637.8717              80
                             ..
-84.1799937.54089             1
-78.969905045800436.00756     1
-79.106425726211636.15        1
-86.7025339.29887             1
-79.238600417233535.742       1
Name: location, Length: 10614, dtype: int64

In [228]:
tr_data['location'] = tr_data.longitude.astype(str) + tr_data.latitude.astype(str)
print(tr_data.location.value_counts())

val_data['location'] = val_data.longitude.astype(str) + val_data.latitude.astype(str)
val_data.location.value_counts()

-78.96708942.027378                  79
-121.79438.307                       77
-120.8537.2953                       77
-120.8937.1145                       68
-121.52637.8717                      68
                                     ..
-78.81306251168935.8669153379229      1
-79.229900752108535.742               1
-78.930880833668135.8816361964081     1
-78.930383709946635.8815740830812     1
-78.843387058696335.98                1
Name: location, Length: 9130, dtype: int64


-120.96737.4583                      17
-120.85137.2486                      16
-78.96708942.027378                  15
-120.75937.4125                      15
-121.49438.1156                      14
                                     ..
-78.758876026053135.976               1
-79.163274703268535.7061617064181     1
-79.146071708472135.7154236165606     1
-78.974876283014735.61133             1
-78.843138496835635.98                1
Name: location, Length: 1806, dtype: int64

In [226]:

te_data['location'] = te_data.longitude.astype(str) + te_data.latitude.astype(str)
te_data.location.value_counts()

-120.637.0062                75
-120.03637.4363              44
-95.7069703238.51613412      42
-84.966144444444539.61525    42
-84.986841666666739.4981     42
                             ..
-119.256346.95433             1
-95.3966145.91673             1
-96.0239435.74527             1
-95.8400246.41943             1
-79.119822311239835.04        1
Name: location, Length: 2067, dtype: int64

In [352]:
len(set(val_data.location).intersection(set(tr_data.location)))/val_data.date.nunique()

0.39172749391727496

In [351]:
len(set(te_data.location).intersection(set(tr_data.location)))/te_data.date.nunique()

#  This could be a reason !!!

0.0

In [378]:
# Check overlap of dates between train and test set

len(set(test_data.date).intersection(set(train_data.date)))/test_data.date.nunique()

0.701095461658842

In [464]:
len(set(val_data.date).intersection(set(tr_data.date)))/val_data.date.nunique()

0.9708029197080292

In [461]:
#  Check overlap of date and region between train and test set

train_data['date_reg'] = train_data.date.astype(str) + "_" +  train_data.region
test_data['date_reg'] = test_data.date.astype(str) + "_" +  test_data.region
val_data['date_reg'] = val_data.date.astype(str) + "_" +  val_data.region

tr_data['date_reg'] = tr_data.date.astype(str) + "_" +  tr_data.region


In [415]:
#  what percentage of date_reg present in test set are present in train set
len(set(test_data.date_reg).intersection(set(train_data.date_reg)))/test_data.date_reg.nunique()

0.5178926441351889

In [462]:
len(set(val_data.date_reg).intersection(set(tr_data.date_reg)))/val_data.date_reg.nunique()

0.9224880382775119

# New val data?

In [451]:
val_data2 = val_data.drop(columns=['guess', 'guess2'])
tr_data2 = tr_data.copy()

val_data2['date_reg'] = val_data2.date.astype(str) + "_" +  val_data2.region
tr_data2['date_reg'] = tr_data2.date.astype(str) + "_" +  tr_data2.region


te_data2 = test_data.copy()
te_data2.sort_values(by='date', inplace=True)

In [452]:
assert (val_data2.columns == tr_data2.columns).all()

In [453]:
te_data2.columns

Index(['uid', 'latitude', 'longitude', 'date', 'split', 'year', 'month',
       'week', 'season', 'region', 'severity', 'density', 'guess2', 'guess1',
       'date_reg'],
      dtype='object')

In [454]:
set(val_data2.uid).intersection(set(tr_data2.uid))

set()

In [463]:
len(set(val_data2.date_reg).intersection(set(tr_data2.date_reg)))/val_data2.date_reg.nunique()

0.9224880382775119

In [526]:
#  remove 50% of datereg from tr_data2 present in val_data2 (to match test set)

datereg_to_remove = val_data2.date_reg.sample(frac=0.40, random_state=42)
tr_data2_ = tr_data2[~tr_data2.date_reg.isin(datereg_to_remove)]


In [527]:
len(set(val_data2.date_reg).intersection(set(tr_data2_.date_reg)))/val_data2.date_reg.nunique()    # matching test set

0.37320574162679426

In [528]:
len(set(val_data2.date).intersection(set(tr_data2_.date)))/val_data2.date.nunique()
#  almost matchine test set!

0.6593673965936739

In [529]:
val_calls = {}
for row in tqdm(val_data2.itertuples(), total=val_data2.shape[0]):
    val_data2.loc[row.Index, 'guess_2_new'] = make_guess2(row, tr_data=tr_data2_, n_times_called=val_calls)


100%|██████████| 2559/2559 [00:37<00:00, 67.64it/s] 


In [530]:
rmse(val_data2.severity, val_data2.guess_2_new)

1.0007812501191629

In [531]:
analyize_matches(val_data2.severity, val_data2.guess_2_new)

Exact matches:  0.5162172723720203
Missed by 1:  0.3614693239546698
Missed by 2:  0.09378663540445487
Missed by 3:  0.027354435326299335
Missed by 4:  0.0011723329425556857


In [532]:
sum(val_calls.values())/len(val_data2)

1.0

In [None]:
val_calls = {}
for row in tqdm(val_data2.itertuples(), total=val_data2.shape[0]):
    val_data2.loc[row.Index, 'guess_2'] = make_guess2(row, tr_data=tr_data2, n_times_called=val_calls)


print("rmse:", rmse(val_data2.severity, val_data2.guess_2))

100%|██████████| 2559/2559 [00:09<00:00, 274.93it/s]

rmse: 0.7857663030948374





In [None]:
analyize_matches(val_data2.severity, val_data2.guess_2)

Exact matches:  0.5607659241891364
Missed by 1:  0.3911684251660805
Missed by 2:  0.04181320828448613
Missed by 3:  0.005861664712778429
Missed by 4:  0.00039077764751856197


In [None]:
sum(val_calls.values())/len(val_data2)

1.0

In [538]:
# new val data with different split

tr_data3, val_data3 = train_test_split(train_data, test_size=0.2, random_state=123456789, shuffle=True)

tr_data3['date_reg'] = tr_data3.date.astype(str) + "_" +  tr_data3.region
val_data3['date_reg'] = val_data3.date.astype(str) + "_" +  val_data3.region

In [541]:
# new way of making val data

datereg_to_remove = val_data3.date_reg.sample(frac=0.40, random_state=123456789)
tr_data3_ = tr_data3[~tr_data3.date_reg.isin(datereg_to_remove)]

print("% of datereg matching in train data", len(set(val_data2.date_reg).intersection(set(tr_data2_.date_reg)))/val_data2.date_reg.nunique())   # matching test set

print("% of dates matching in train-data", len(set(val_data2.date).intersection(set(tr_data2_.date)))/val_data2.date.nunique())


val_calls = {}

for row in tqdm(val_data3.itertuples(), total=val_data3.shape[0]):
    val_data3.loc[row.Index, 'guess_2_new'] = make_guess2(row, tr_data=tr_data3_, n_times_called=val_calls)

print(rmse(val_data3.severity, val_data3.guess_2_new))

analyize_matches(val_data3.severity, val_data3.guess_2_new)

% of datereg matching in train data 0.37320574162679426
% of dates matching in train-data 0.6593673965936739


  0%|          | 15/3412 [00:00<01:57, 28.85it/s]

No previous data for this date filling in 2s .. for pbfb


  2%|▏         | 73/3412 [00:01<00:44, 75.33it/s]

No previous data for this date filling in 2s .. for jalu


 27%|██▋       | 928/3412 [00:14<00:32, 75.82it/s] 

No previous data for this date filling in 2s .. for jubi


 42%|████▏     | 1427/3412 [00:21<00:35, 55.98it/s]

No previous data for this date filling in 2s .. for pfly


 88%|████████▊ | 2989/3412 [00:43<00:05, 70.80it/s] 

No previous data for this date filling in 2s .. for dvpi


 90%|█████████ | 3087/3412 [00:44<00:04, 76.83it/s]

No previous data for this date filling in 2s .. for wrxx


 92%|█████████▏| 3149/3412 [00:45<00:04, 63.30it/s]

No previous data for this date filling in 2s .. for seke


 98%|█████████▊| 3343/3412 [00:48<00:01, 59.89it/s]

No previous data for this date filling in 2s .. for evep


100%|██████████| 3412/3412 [00:50<00:00, 68.02it/s]

0.9905769394242474
Exact matches:  0.5199296600234466
Missed by 1:  0.358147713950762
Missed by 2:  0.09525205158264947
Missed by 3:  0.02637749120750293
Missed by 4:  0.00029308323563892143





In [551]:
0.5199296600234466 + 0.358147713950762

0.8780773739742087

In [548]:
avg_sev_by_reg = np.round(tr_data3_.groupby('region').severity.mean())

print(rmse(val_data3.severity, val_data3.region.map(avg_sev_by_reg)))

analyize_matches(val_data3.severity, val_data3.region.map(avg_sev_by_reg))

0.9005339044216468
Exact matches:  0.3698710433763189
Missed by 1:  0.5908558030480656
Missed by 2:  0.026670574443141852
Missed by 3:  0.012602579132473623
Missed by 4:  0.0


In [549]:
0.3698710433763189 + 0.5908558030480656 

0.9607268464243846

In [540]:
# past way of making val data
val_calls = {}
for row in tqdm(val_data3.itertuples(), total=val_data3.shape[0]):
    val_data3.loc[row.Index, 'guess_2'] = make_guess2(row, tr_data=tr_data3, n_times_called=val_calls)

print(rmse(val_data3.severity, val_data3.guess_2))

analyize_matches(val_data3.severity, val_data3.guess_2)

100%|██████████| 3412/3412 [00:10<00:00, 312.04it/s]

0.8031663598789893
Exact matches:  0.5565650644783119
Missed by 1:  0.3886283704572098
Missed by 2:  0.0477725674091442
Missed by 3:  0.006740914419695193
Missed by 4:  0.00029308323563892143
None





In [550]:
0.5565650644783119 + 0.3886283704572098

0.9451934349355218

In [336]:
train_data.date.nunique()/len(train_data), test_data.date.nunique()/len(test_data), val_data.date.nunique()/len(val_data)

(0.07356389214536929, 0.19631336405529953, 0.3212192262602579)

# Submission

In [43]:
# #  Making submission with guess2 mean

# sub_format['severity'] = 0

# for row in tqdm(te_data.itertuples(), total=te_data.shape[0]):
#     uid_series = te_data[te_data.uid == row.uid]
#     severity = make_guess2(uid_series.iloc[0], date=row.date, tr_data=train_data)   # use all train data for making test submission
#     sub_format.loc[sub_format.uid == row.uid, 'severity'] = severity

# sub_format

  1%|▏         | 92/6510 [00:02<01:17, 83.06it/s]

No previous data for this date filling in 2s .. for igpa
No previous data for this date filling in 2s .. for lkpf


  2%|▏         | 101/6510 [00:02<03:28, 30.81it/s]

No previous data for this date filling in 2s .. for paez


100%|██████████| 6510/6510 [01:40<00:00, 64.90it/s] 


Unnamed: 0,uid,region,severity
0,aabn,west,2
1,aair,west,4
2,aajw,northeast,2
3,aalr,midwest,3
4,aalw,west,4
...,...,...,...
6505,zzpn,northeast,5
6506,zzrv,west,4
6507,zzsx,south,1
6508,zzvv,west,4


In [124]:
# print(sub_format.severity.value_counts(normalize=True))

# sub_format.to_csv('../submissions/to submit/guess2_mean_preds.csv', index=False)

4    0.326575
1    0.284178
2    0.214132
3    0.172197
5    0.002919
Name: severity, dtype: float64


# Sooo....

- The discrepancy is due to high percentage of dates and region combinations (as in myguess2) is missing in the test set, where as the val set is so sweet and nice and has all the    combinations
- finally figured out something, but not sure if it's right.
- 

# Todos :

- check this theory with a submission.
- analyise date missses vs date_reg misses.
- Figure out better hypothesis guessing on new val set.