``Mission : Error Analysis & Model metadata``

- Is this also the problem with past submissions?
- Develop a better guess.
- Start from averages and expanding means.


# Data and dependencies

In [1]:
import warnings
import sys
import os
import time
import joblib
import random
from tqdm import tqdm
from pprint import pprint

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
from geopy.distance import geodesic

from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split, StratifiedKFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder

from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

warnings.filterwarnings('ignore')

In [2]:
# Load the three input tables; paths are relative to this notebook's directory.
metadata = pd.read_csv('../data/metadata.csv')
sub_format = pd.read_csv('../data/submission_format.csv')
train_labels = pd.read_csv('../data/train_labels.csv')

In [3]:
def rmse(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``.

    The square root is taken explicitly instead of passing ``squared=False``
    to ``mean_squared_error``: that keyword is deprecated in newer
    scikit-learn releases, while this form yields the same value on every
    version.
    """
    return np.sqrt(mse(y_true, y_pred))

In [4]:
def dens_to_sev(x: float)-> int:
    """Map a density value in cells/ml to its severity category (1-5).

    Bins are closed on the left and open on the right:
    below 20,000 -> 1, [20,000, 100,000) -> 2, [100,000, 1,000,000) -> 3,
    [1,000,000, 10,000,000) -> 4, and 10,000,000 or more -> 5.

    The top bin uses ``>=`` so that a density of exactly 10,000,000 maps to
    5; with a strict ``>`` that value matched no branch and the function
    returned ``None``. A NaN input still matches no branch and yields
    ``None``.
    """
    # Each branch is only reached when the previous upper bound failed,
    # so the lower-bound checks are implied.
    if x < 20_000:
        return 1
    elif x < 100_000:
        return 2
    elif x < 1_000_000:
        return 3
    elif x < 10_000_000:
        return 4
    elif x >= 10_000_000:
        return 5

## Add date fts

In [5]:
# Parse the date column and derive calendar features from it.
metadata.date = pd.to_datetime(metadata.date)
metadata['year'] = metadata.date.dt.year
metadata['month'] = metadata.date.dt.month
metadata['week'] = metadata.date.dt.isocalendar().week  # ISO week number


# month -> season code: Dec-Feb = 1, Mar-May = 2, Jun-Aug = 3, Sep-Nov = 4
# (presumably winter/spring/summer/autumn - confirm the intended labels)
seasons = {
    1: 1,
    2: 1,
    3: 2,
    4: 2,
    5: 2,
    6: 3,
    7: 3,
    8: 3,
    9: 4,
    10: 4,
    11: 4,
    12: 1
}

metadata['season'] = metadata.month.map(seasons)


# Stack the train labels on top of the (region, uid) pairs of the submission format;
# the submission rows bring no severity/density, so those end up NaN after the concat.
region = pd.concat((train_labels, sub_format[['region', 'uid']]), axis=0)

# Left join keeps every metadata row and attaches whatever `region` has for its uid.
data = pd.merge(metadata, region, on='uid', how='left')
data

Unnamed: 0,uid,latitude,longitude,date,split,year,month,week,season,region,severity,density
0,aabm,39.080319,-86.430867,2018-05-14,train,2018,5,20,2,midwest,1.0,585.0
1,aabn,36.559700,-121.510000,2016-08-31,test,2016,8,35,3,west,,
2,aacd,35.875083,-78.878434,2020-11-19,train,2020,11,47,4,south,1.0,290.0
3,aaee,35.487000,-79.062133,2016-08-24,train,2016,8,34,3,south,1.0,1614.0
4,aaff,38.049471,-99.827001,2019-07-23,train,2019,7,30,3,midwest,3.0,111825.0
...,...,...,...,...,...,...,...,...,...,...,...,...
23565,zzvv,36.708500,-121.749000,2014-12-02,test,2014,12,49,1,west,,
23566,zzwo,39.792190,-99.971050,2017-06-19,train,2017,6,25,3,midwest,2.0,48510.0
23567,zzwq,35.794000,-79.012551,2015-03-24,train,2015,3,13,2,south,1.0,1271.0
23568,zzyb,35.742000,-79.238600,2016-11-21,train,2016,11,47,4,south,1.0,9682.0


In [6]:
# Rows whose `split` column is 'test'.
test_data = data[data.split == 'test']
test_data.shape, data.shape

((6510, 12), (23570, 12))

In [7]:
# Rows whose `split` column is 'train'.
train_data = data[data.split == 'train']
train_data.shape, data.shape

((17060, 12), (23570, 12))

# Utils

In [8]:
#  Utils
def get_data_by_date(date=None, data=train_data):
    """Return the rows of ``data`` whose ``date`` column equals ``date``."""
    on_requested_date = data.date == date
    return data[on_requested_date]


def get_distance(lat1, lon1, lat2, lon2):
    """Return the geodesic distance, in kilometres, between two (lat, lon) points."""
    origin = (lat1, lon1)
    target = (lat2, lon2)
    return geodesic(origin, target).km

def analyize_matches(y_true, y_pred):
    """Print the share of exact matches and of predictions that miss by 1, 2, 3 and 4 levels."""
    n_samples = len(y_true)
    print("Exact matches: ", sum(y_true == y_pred) / n_samples)

    # Absolute miss per sample, reused for every "missed by k" line.
    abs_miss = abs(y_true - y_pred)
    for off_by in range(1, 5):
        print(f"Missed by {off_by}: ", sum(abs_miss == off_by) / n_samples)

# My Guess

In [9]:
# Test rows in chronological order (sort_values returns a new frame; test_data is untouched).
te_data = test_data.sort_values(by='date')
te_data

Unnamed: 0,uid,latitude,longitude,date,split,year,month,week,season,region,severity,density
6865,howw,37.0062,-120.600,2013-01-08,test,2013,1,2,1,west,,
3661,eamn,36.9818,-120.221,2013-01-08,test,2013,1,2,1,west,,
7668,imsv,36.9836,-120.500,2013-01-08,test,2013,1,2,1,west,,
20182,wgxq,33.8011,-117.205,2013-01-25,test,2013,1,4,1,west,,
16095,rsos,33.8892,-117.562,2013-01-25,test,2013,1,4,1,west,,
...,...,...,...,...,...,...,...,...,...,...,...,...
12443,nsoi,36.7368,-121.734,2021-12-29,test,2021,12,52,1,west,,
14254,prfi,36.7518,-121.742,2021-12-29,test,2021,12,52,1,west,,
6864,howu,36.7085,-121.749,2021-12-29,test,2021,12,52,1,west,,
6540,hfvr,36.7962,-121.782,2021-12-29,test,2021,12,52,1,west,,


## Train test split

In [10]:
# Shuffled 85/15 hold-out split of the labelled rows; the seed is fixed so the split is reproducible.
tr_data, val_data = train_test_split(train_data, test_size=0.15, random_state=123456789, shuffle=True)
tr_data.shape, val_data.shape

((14501, 12), (2559, 12))

In [11]:
# Second split with exactly the same arguments as the first one, kept under separate names
# so that these frames can be modified without touching tr_data / val_data.
tr_data2, val_data2 = train_test_split(train_data, test_size=0.15, random_state=123456789, shuffle=True)
tr_data2.shape, val_data2.shape

((14501, 12), (2559, 12))

## Guess Funcs

In [13]:
# uids for which a guess function returned the fallback severity of 2
fill_2s = []
# per-uid bookkeeping touched by the guess functions when no rows match the requested date
g_from_past = {}

# module-level dict; make_guess2 has its own `n_times_called` parameter (default None) that shadows this name
n_times_called = {}

def make_guess1(row: pd.Series, date=None, tr_data=tr_data) -> float:
    """Guess a severity for ``row`` from the nearest labelled sample in its region.

    Starting at ``date`` (``row.date`` when omitted), look for rows of
    ``tr_data`` in the same region on that exact date. When there are none,
    step back one day at a time until some are found, then return the
    severity of the candidate geographically closest to ``row``.

    Parameters
    ----------
    row : object exposing ``uid``, ``region``, ``date``, ``latitude`` and
        ``longitude`` attributes (a ``pd.Series`` or an ``itertuples`` row).
    date : timestamp the search starts from; defaults to ``row.date``.
    tr_data : labelled frame with ``date``, ``region``, ``latitude``,
        ``longitude`` and ``severity`` columns.

    Returns
    -------
    The ``severity`` of the nearest match, or the constant ``2`` once the
    search reaches a date earlier than 2013-01-04.

    Raises
    ------
    ValueError
        If the starting date is missing (NaT/None); the look-back could
        never terminate in that case.

    Side effects
    ------------
    Appends ``uid`` to the module-level ``fill_2s`` list on fallback and
    counts every one-day step back in the module-level ``g_from_past`` dict.
    """
    uid = row.uid
    if date is None:
        date = row.date
    if pd.isna(date):
        raise ValueError(f'Cannot make a guess for {uid}: date is missing')

    cutoff = pd.to_datetime('2013-01-04')
    one_day = pd.Timedelta(days=1)
    # The region never changes during the search, so filter on it only once.
    region_data = tr_data[tr_data.region == row.region]

    # Iterative look-back: recursing once per day adds a stack frame per day
    # and can exceed the interpreter's recursion limit on long gaps in the data.
    while True:
        # check if cur date is past '2013-01-04'
        if date < cutoff:
            print(f'No previous data for this date filling in 2s .. for {row.uid}')
            fill_2s.append(uid)
            return 2

        rel_data = region_data[region_data.date == date]
        if rel_data.shape[0] > 0:
            break

        # Count the step back; `.get(uid, 0)` also initialises the counter,
        # which a plain `+= 1` guarded by "key exists" never did.
        g_from_past[uid] = g_from_past.get(uid, 0) + 1
        date = date - one_day

    dists = [
        get_distance(row.latitude, row.longitude, cand.latitude, cand.longitude)
        for cand in rel_data.itertuples()
    ]
    nearest = rel_data.iloc[np.argmin(dists)]
    return nearest.severity


def make_guess2(row: pd.Series, date=None, tr_data=tr_data, n_times_called=None) -> float:
    """Guess a severity for ``row`` from the mean severity of its region on the nearest earlier date.

    Variant of ``make_guess1``: instead of taking the geographically nearest
    sample, it returns the rounded mean severity of all ``tr_data`` rows in
    the same region on the first date (looking back one day at a time from
    ``date``) that has any rows.

    Parameters
    ----------
    row : object exposing ``uid``, ``region`` and ``date`` attributes
        (a ``pd.Series`` or an ``itertuples`` row).
    date : timestamp the search starts from; defaults to ``row.date``.
    tr_data : labelled frame with ``date``, ``region`` and ``severity`` columns.
    n_times_called : optional dict; when given, ``n_times_called[uid]`` is
        incremented once per date that is examined for this row. (Earlier the
        dict was not forwarded to the recursive call, so it only ever
        recorded the outermost call.)

    Returns
    -------
    ``np.round`` of the mean severity (note: ``np.round`` rounds halves to the
    nearest even value), or the constant ``2`` once the search reaches a date
    earlier than 2013-01-04.

    Raises
    ------
    ValueError
        If the starting date is missing (NaT/None); the look-back could
        never terminate in that case.

    Side effects
    ------------
    Appends ``uid`` to the module-level ``fill_2s`` list on fallback, counts
    every one-day step back in ``g_from_past`` and in the module-level
    ``count`` (created on first use if the notebook has not defined it yet).
    """
    global count

    uid = row.uid
    if date is None:
        date = row.date
    if pd.isna(date):
        raise ValueError(f'Cannot make a guess for {uid}: date is missing')

    cutoff = pd.to_datetime('2013-01-04')
    one_day = pd.Timedelta(days=1)
    # The region never changes during the search, so filter on it only once.
    region_data = tr_data[tr_data.region == row.region]

    # Iterative look-back: recursing once per day adds a stack frame per day
    # and can exceed the interpreter's recursion limit on long gaps in the data.
    while True:
        if n_times_called is not None:
            n_times_called[uid] = n_times_called.get(uid, 0) + 1

        # check if cur date is past '2013-01-04'
        if date < cutoff:
            print(f'No previous data for this date filling in 2s .. for {row.uid}')
            fill_2s.append(uid)
            return 2

        rel_data = region_data[region_data.date == date]
        if rel_data.shape[0] > 0:
            break

        g_from_past[uid] = g_from_past.get(uid, 0) + 1
        # `count` may not exist yet on a fresh kernel; start from 0 in that case
        # instead of failing with a NameError.
        count = globals().get('count', 0) + 1
        date = date - one_day

    return np.round(rel_data.severity.mean())



def cv_loop(rand, splits=10, guess_func=make_guess1):
    """Evaluate ``guess_func`` on ``train_data`` with forward-chaining time-series folds.

    Parameters
    ----------
    rand : kept for backward compatibility; ``TimeSeriesSplit`` is
        deterministic, so the value is not used.
    splits : number of ``TimeSeriesSplit`` folds.
    guess_func : callable invoked as ``guess_func(row, date=..., tr_data=...)``
        for every validation row, returning a severity guess.

    Returns
    -------
    (rmses, guess_train_preds)
        ``rmses`` is the list of per-fold RMSE values. ``guess_train_preds``
        is an array aligned with the rows of ``train_data``; rows that never
        fall into a validation fold (the earliest chunk) keep the value 0.
    """
    tscv = TimeSeriesSplit(n_splits=splits)

    # TimeSeriesSplit cuts folds by row position and assumes the rows are in
    # chronological order. Split a date-sorted view of train_data and keep
    # `order` so predictions can be written back to the original positions.
    order = np.argsort(train_data.date.values, kind='stable')
    ordered = train_data.iloc[order]

    rmses = []
    guess_train_preds = np.zeros((train_data.shape[0]))

    for fold, (train_idx, val_idx) in enumerate(tscv.split(ordered)):
        print(f"Fold: {fold}")
        tr_data = ordered.iloc[train_idx]
        # Work on a copy so adding the 'guess' column does not write into a slice.
        val_data = ordered.iloc[val_idx].copy()

        # Address rows by position instead of re-filtering the frame by uid
        # for every row (which made the loop quadratic).
        guesses = []
        for pos in tqdm(range(val_data.shape[0]), total=val_data.shape[0]):
            val_row = val_data.iloc[pos]
            guesses.append(guess_func(val_row, date=val_row.date, tr_data=tr_data))

        val_data['guess'] = guesses
        guess_train_preds[order[val_idx]] = guesses

        # The predictions live in the 'guess' column (there is no 'guess1' column).
        errror = rmse(val_data.severity, val_data.guess)
        rmses.append(errror)
        print("RMSE: ", errror)

        print('Train Distribution: ')
        print(tr_data.severity.value_counts(normalize=True))
        print('Val Distribution: ')
        print(val_data.severity.value_counts(normalize=True))
        print('Predicted Distribution: ')
        print(val_data.guess.value_counts(normalize=True))


    print('----------------------------------------------------')

    return rmses, guess_train_preds


In [14]:
# Sort chronologically in place. make_guess1 / make_guess2 use `tr_data` as a default argument,
# so mutating the existing object (rather than rebinding the name) keeps that default pointing at this frame.
tr_data.sort_values(by='date', inplace=True)
val_data.sort_values(by='date', inplace=True)

In [17]:
# Same chronological in-place sort for the second pair of frames.
tr_data2.sort_values(by='date', inplace=True)
val_data2.sort_values(by='date', inplace=True)

### Failed way of validation

In [None]:
# Score every hold-out row with make_guess1; no tr_data is passed, so the function's default frame is used.
for row in tqdm(val_data.itertuples(), total=val_data.shape[0]):
    val_data.loc[row.Index, 'guess'] = make_guess1(row)

In [16]:
rmse(val_data.severity, val_data.guess)

0.8516356696348804

In [17]:
analyize_matches(val_data.severity, val_data.guess)

Exact matches:  0.6053145760062525
Missed by 1:  0.3028526768268855
Missed by 2:  0.08245408362641657
Missed by 3:  0.008206330597889801
Missed by 4:  0.0011723329425556857


In [20]:
# 0.08284486127393513 + 0.008206330597889801 + 0.0011723329425556857
# ~91% of preds are off by <= 1, ~99% are off by <= 2; ~88% of the remaining misses (presumably those off by >= 2) are off by exactly 2

In [19]:
# Reset the module-level counter that make_guess2 increments whenever it steps back a day.
count = 0

# Overwrites the 'guess' column with make_guess2 predictions (default training frame).
for row in tqdm(val_data.itertuples(), total=val_data.shape[0]):
    val_data.loc[row.Index, 'guess'] = make_guess2(row)

analyize_matches(val_data.severity, val_data.guess)

rmse(val_data.severity, val_data.guess)

100%|██████████| 2559/2559 [00:06<00:00, 414.28it/s]

Exact matches:  0.5607659241891364
Missed by 1:  0.3911684251660805
Missed by 2:  0.04181320828448613
Missed by 3:  0.005861664712778429
Missed by 4:  0.00039077764751856197





0.7857663030948374

### New Era VAL SET

In [26]:
# Composite "<date>_<region>" key, used below to measure how many validation date/region
# combinations also occur in the training split.
val_data2['date_reg'] = val_data2.date.astype(str) + "_" +  val_data2.region
tr_data2['date_reg'] = tr_data2.date.astype(str) + "_" +  tr_data2.region

# Both frames must still have identical columns, in the same order.
assert (val_data2.columns == tr_data2.columns).all()

In [27]:
set(val_data2.uid).intersection(set(tr_data2.uid))

set()

In [28]:
#  Intersection percentage of date and regs before
len(set(val_data2.date_reg).intersection(set(tr_data2.date_reg)))/val_data2.date_reg.nunique()

0.9328793774319066

In [29]:
# Sample 40% of the validation rows' date_reg keys (rows are sampled, so a key can appear more than once)
# and drop every training row that carries one of those keys.
datereg_to_remove = val_data2.date_reg.sample(frac=0.40, random_state=123456789)
tr_data2_ = tr_data2[~tr_data2.date_reg.isin(datereg_to_remove)]

# Share of distinct validation date_reg keys that are still present in the reduced training frame.
len(set(val_data2.date_reg).intersection(set(tr_data2_.date_reg)))/val_data2.date_reg.nunique()

0.36867704280155644

In [30]:
len(set(val_data2.date).intersection(set(tr_data2_.date)))/val_data2.date.nunique()

0.6562107904642409

In [31]:
# make_guess2 on the validation rows, using the reduced training frame tr_data2_.
for row in tqdm(val_data2.itertuples(), total=val_data2.shape[0]):
    val_data2.loc[row.Index, 'guess_2_new'] = make_guess2(row, tr_data=tr_data2_)

analyize_matches(val_data2.severity, val_data2.guess_2_new)
rmse(val_data2.severity, val_data2.guess_2_new)

  3%|▎         | 74/2559 [00:01<00:42, 57.84it/s]

No previous data for this date filling in 2s .. for jalu


 36%|███▋      | 934/2559 [00:11<00:14, 114.10it/s]

No previous data for this date filling in 2s .. for jubi


 56%|█████▌    | 1434/2559 [00:16<00:14, 79.91it/s] 

No previous data for this date filling in 2s .. for pfly


 78%|███████▊  | 1997/2559 [00:23<00:05, 108.87it/s]

No previous data for this date filling in 2s .. for qgrf


100%|██████████| 2559/2559 [00:27<00:00, 92.22it/s] 

Exact matches:  0.5224697147323173
Missed by 1:  0.35248143806174287
Missed by 2:  0.09183274716686206
Missed by 3:  0.031652989449003514
Missed by 4:  0.0015631105900742479





1.0147409034878858

In [35]:
tr_data.shape

(14501, 12)

In [37]:
val_data2.shape, tr_data2.shape

((2559, 14), (14501, 13))

In [38]:
#  new data
val_data2.shape, tr_data2_.shape

((2559, 14), (5617, 13))

1024

### Analyze past submissions with the new validation strategy

In [21]:
# Prediction files from the submissions directory (presumably earlier expanding-mean submissions - confirm).
exp_sev_reg = pd.read_csv('../submissions/expndng_sev_by_reg_preds.csv')
exp_sev_regweek = pd.read_csv('../submissions/expndng_sev_by_rw_exreg_preds.csv')
exp_sev_rs = pd.read_csv('../submissions/expanding_sev_rs_preds.csv')

In [39]:
#  exp_sev_reg

#  past
#  past
# Expanding (running) mean of severity within each region, in the row order of `data`,
# rounded to a whole severity level.
# NOTE: the expanding window includes the current row, so on labelled rows the value
# already contains that row's own severity (the score below is in-sample).
grp_by_region = data.groupby('region').severity.expanding(1).mean()
grp_by_region = grp_by_region.map(np.round)

# Fill the gaps of the 'west' and 'northeast' groups with 2.
# Assigning through a boolean mask on the full series replaces the chained
# `grp_by_region['west'].fillna(2, inplace=True)`, which operates on a temporary
# sub-series and is not guaranteed to write back (it is a silent no-op under
# pandas Copy-on-Write).
fill_mask = grp_by_region.isna() & grp_by_region.index.get_level_values(0).isin(['west', 'northeast'])
grp_by_region[fill_mask] = 2
print(grp_by_region.isna().sum())   # 5 --> 0.89416

# Use the notebook's rmse helper instead of calling mse(..., squared=False) directly.
rmse(train_data.severity.sort_index(), grp_by_region.droplevel(0).loc[train_data.index].sort_index())

0


0.8939027540207913

In [28]:
# Re-assemble the labelled rows in date order and blank out the validation labels,
# so that expanding means computed on this frame never see a validation row's own severity.
train_data2 = pd.concat([tr_data2, val_data2], axis=0)
train_data2.sort_values(by='date', inplace=True)
train_data2.loc[val_data2.index, 'severity'] = np.nan

train_data2.isna().sum()

uid             0
latitude        0
longitude       0
date            0
split           0
year            0
month           0
week            0
season          0
region          0
severity     2559
density         0
dtype: int64

In [32]:
# Expanding (running) mean of severity within each region, in the row order of
# train_data2, rounded to a whole severity level.
grp_by_region = train_data2.groupby('region').severity.expanding(1).mean()
grp_by_region = grp_by_region.map(np.round)

# Fill the gaps of the 'west' and 'northeast' groups with 2.
# Assigning through a boolean mask on the full series replaces the chained
# `grp_by_region['west'].fillna(2, inplace=True)`, which operates on a temporary
# sub-series and is not guaranteed to write back (it is a silent no-op under
# pandas Copy-on-Write).
fill_mask = grp_by_region.isna() & grp_by_region.index.get_level_values(0).isin(['west', 'northeast'])
grp_by_region[fill_mask] = 2
print(grp_by_region.isna().sum())   # 5 --> 0.89416

rmse(val_data2.severity.sort_index(), grp_by_region.droplevel(0).loc[val_data2.index].sort_index())

0


0.9035123162781323

In [35]:
# Expanding mean of severity per ISO week, rounded; rows with no earlier label in their group are filled with 2.
grp_by_week = train_data2.groupby('week').severity.expanding(1).mean()
grp_by_week = grp_by_week.map(np.round)
print(grp_by_week.isna().sum())  # 105 --> 1.176284
grp_by_week.fillna(2, inplace=True)

# droplevel(0) removes the group key so the values can be looked up by the original row index.
rmse(val_data2.severity.sort_index(), grp_by_week.droplevel(0).loc[val_data2.index].sort_index())


17


1.200169325034355

In [37]:
# Expanding mean of severity per season code, rounded. No fill step here.
grp_by_season = train_data2.groupby('season').severity.expanding(1).mean()
grp_by_season = grp_by_season.map(np.round)
print(grp_by_season.isna().sum())  # 105 --> 1.176284

rmse(val_data2.severity.sort_index(), grp_by_season.droplevel(0).loc[val_data2.index].sort_index())

0


1.211673067179112

In [39]:
# Expanding mean of severity per calendar month, rounded; remaining gaps are filled with 2.
grp_by_month = train_data2.groupby('month').severity.expanding(1).mean()
grp_by_month = grp_by_month.map(np.round)
print(grp_by_month.isna().sum())  # 105 --> 1.176284

grp_by_month.fillna(2, inplace=True)

rmse(val_data2.severity.sort_index(), grp_by_month.droplevel(0).loc[val_data2.index].sort_index())

1


1.2060159219778517

In [53]:
# Expanding mean of severity per YEAR, rounded; remaining gaps are filled with 2.
# NOTE: despite its name, `grp_by_region` holds the per-year result from here on
# (it overwrites the per-region series computed earlier).
grp_by_region = train_data2.groupby('year').severity.expanding(1).mean()
grp_by_region = grp_by_region.map(np.round)

print(grp_by_region.isna().sum())   # 5 --> 0.89416
grp_by_region.fillna(2, inplace=True)

rmse(val_data2.severity.sort_index(), grp_by_region.droplevel(0).loc[val_data2.index].sort_index())

2


1.224824635947432

In [40]:
# Expanding mean of severity per (region, week) pair, rounded.
grp_by_rw = train_data2.groupby(['region', 'week']).severity.expanding(1).mean()
grp_by_rw = grp_by_rw.map(np.round)
print("Missing:", grp_by_rw.isna().sum())  # 566 missing vaulues  --> 0.82

# Stored BEFORE the fill below, so this column keeps its NaNs
# (the two group-key levels are dropped to align on the original row index).
train_data2['expndng_sev_by_rw_1'] = grp_by_rw.droplevel(0).droplevel(0).sort_index()


# 1 for imputing missing values with expanding_avg_by_reg   # current best option
# train_data2.expndng_sev_by_rw_1 = np.where(train_data2.expndng_sev_by_rw_1.isna(), train_data2.expndng_sev_by_reg, train_data2.expndng_sev_by_rw_1)
# Only the series used for scoring is filled with 2; the stored column is not.
grp_by_rw.fillna(2, inplace=True)

print(train_data2.isna().sum())
rmse(val_data2.severity.sort_index(), grp_by_rw.droplevel(0).droplevel(0).loc[val_data2.index].sort_index())

Missing: 56
uid                       0
latitude                  0
longitude                 0
date                      0
split                     0
year                      0
month                     0
week                      0
season                    0
region                    0
severity               2559
density                   0
expndng_sev_by_rw_1      56
dtype: int64


0.8856019075011943

In [67]:
# Expanding mean of severity per (region, month) pair, rounded; remaining gaps are filled with 2.
grp_by_rm = train_data2.groupby(['region', 'month']).severity.expanding(1).mean()
grp_by_rm = grp_by_rm.map(np.round)
print(grp_by_rm.isna().sum())  # 72 --> 0.8528

grp_by_rm.fillna(2, inplace=True)
# The two droplevel(0) calls strip both group keys so values align on the original row index.
analyize_matches(val_data2.severity.sort_index(), grp_by_rm.droplevel(0).droplevel(0).loc[val_data2.index].sort_index())
rmse(val_data2.severity.sort_index(), grp_by_rm.droplevel(0).droplevel(0).loc[val_data2.index].sort_index())

14
Exact matches:  0.49628761234857366
Missed by 1:  0.44001563110590075
Missed by 2:  0.0523642047674873
Missed by 3:  0.010550996483001172
Missed by 4:  0.0007815552950371239


0.8700208636828513

In [68]:
0.49628761234857366 + 0.44001563110590075

0.9363032434544745

In [65]:
# Expanding mean of severity per (region, YEAR) pair, rounded; remaining gaps are filled with 2.
# NOTE: the name `grp_by_rm` is reused here for a different grouping.
grp_by_rm = train_data2.groupby(['region', 'year']).severity.expanding(1).mean()
grp_by_rm = grp_by_rm.map(np.round)
print(grp_by_rm.isna().sum())  # 72 --> 0.8528

grp_by_rm.fillna(2, inplace=True)

# Arguments are passed as (pred, true) here; the printed match rates only use equality
# and absolute differences, so the order does not change them.
analyize_matches(grp_by_rm.droplevel(0).droplevel(0).loc[val_data2.index].sort_index(), val_data2.severity.sort_index())
rmse(val_data2.severity.sort_index(), grp_by_rm.droplevel(0).droplevel(0).loc[val_data2.index].sort_index())

14
Exact matches:  0.5439624853458382
Missed by 1:  0.365767878077374
Missed by 2:  0.07542008597108245
Missed by 3:  0.012895662368112544
Missed by 4:  0.0019538882375928096


0.9026468828263917

In [66]:
0.5439624853458382 + 0.365767878077374

0.9097303634232122

In [63]:
# Expanding mean of severity per (region, SEASON) pair, rounded; remaining gaps are filled with 2.
# NOTE: the name `grp_by_rm` is reused here for a different grouping.
grp_by_rm = train_data2.groupby(['region', 'season']).severity.expanding(1).mean()
grp_by_rm = grp_by_rm.map(np.round)
print(grp_by_rm.isna().sum())  # 72 --> 0.8528

grp_by_rm.fillna(2, inplace=True)

# Arguments are passed as (pred, true) here; the printed match rates only use equality
# and absolute differences, so the order does not change them.
analyize_matches(grp_by_rm.droplevel(0).droplevel(0).loc[val_data2.index].sort_index(), val_data2.severity.sort_index())

rmse(val_data2.severity.sort_index(), grp_by_rm.droplevel(0).droplevel(0).loc[val_data2.index].sort_index())

8
Exact matches:  0.5001953888237592
Missed by 1:  0.4361078546307151
Missed by 2:  0.051191871824931616
Missed by 3:  0.011332551778038297
Missed by 4:  0.0011723329425556857


0.8727116562838365

In [64]:
0.5001953888237592 + 0.4361078546307151

0.9363032434544744

In [61]:
# Expanding mean of severity per (region, DATE) pair, rounded; remaining gaps are filled with 2.
# NOTE: the name `grp_by_rm` is reused here for a different grouping.
grp_by_rm = train_data2.groupby(['region', 'date']).severity.expanding(1).mean()
grp_by_rm = grp_by_rm.map(np.round)
print(grp_by_rm.isna().sum())  # 72 --> 0.8528

grp_by_rm.fillna(2, inplace=True)

# Validation-row predictions, aligned on the original row index.
preds = grp_by_rm.droplevel(0).droplevel(0).loc[val_data2.index].sort_index()

analyize_matches(val_data2.severity.sort_index(), preds)

rmse(val_data2.severity.sort_index(), preds)

736
Exact matches:  0.4404064087534193
Missed by 1:  0.4497850722938648
Missed by 2:  0.10042985541227042
Missed by 3:  0.008987885892926924
Missed by 4:  0.00039077764751856197


0.9688384330421589

In [62]:
0.4404064087534193 + 0.4497850722938648

0.8901914810472841

In [60]:
# RMSE of predicting one constant severity level (1..5) for every validation row.
tuple(rmse(val_data2.severity, [level] * len(val_data2)) for level in range(1, 6))

(1.6575758322631906,
 1.2082819584556195,
 1.4738836258523553,
 2.2101404014184314,
 3.0979199869280616)

In [69]:
#  One more observation: the share of samples missed by <= 1 has to be greater than ~90% in order to get below an RMSE of 0.89

In [44]:
# make_guess1 on the validation rows, using the reduced training frame tr_data2_.
for row in tqdm(val_data2.itertuples(), total=val_data2.shape[0]):
    val_data2.loc[row.Index, 'guess_1_new'] = make_guess1(row, tr_data=tr_data2_)

  3%|▎         | 76/2559 [00:01<00:29, 83.25it/s]

No previous data for this date filling in 2s .. for jalu


 37%|███▋      | 936/2559 [00:12<00:17, 94.56it/s] 

No previous data for this date filling in 2s .. for jubi


 56%|█████▌    | 1430/2559 [00:18<00:11, 97.97it/s] 

No previous data for this date filling in 2s .. for pfly


 78%|███████▊  | 1992/2559 [00:24<00:06, 91.23it/s] 

No previous data for this date filling in 2s .. for qgrf


100%|██████████| 2559/2559 [00:29<00:00, 85.35it/s] 


In [45]:
# Match breakdown and RMSE of the make_guess1 predictions stored in 'guess_1_new'.
analyize_matches(val_data2.severity, val_data2.guess_1_new)

rmse(val_data2.severity, val_data2.guess_1_new)

Exact matches:  0.5447440406408753
Missed by 1:  0.29269245799140287
Missed by 2:  0.1250488472059398
Missed by 3:  0.03673309886674482
Missed by 4:  0.0007815552950371239


1.0658286078617234

In [None]:
#  So guess1 should also give 1.06 or something like that..

# Submission

In [46]:
#  Making submission with guess2 mean

# Fill the submission with make_guess1 predictions (the cell heading mentions guess2,
# but the function called below is make_guess1).
sub_format['severity'] = 0

for row in tqdm(te_data.itertuples(), total=te_data.shape[0]):
    # Look the test row up by uid to hand the guess function a Series rather than a namedtuple.
    uid_series = te_data[te_data.uid == row.uid]
    severity = make_guess1(uid_series.iloc[0], date=row.date, tr_data=train_data)   # use all train data for making test submission
    sub_format.loc[sub_format.uid == row.uid, 'severity'] = severity

sub_format

  1%|▏         | 88/6510 [00:01<01:09, 92.61it/s]

No previous data for this date filling in 2s .. for igpa
No previous data for this date filling in 2s .. for lkpf


  2%|▏         | 106/6510 [00:02<02:27, 43.53it/s]

No previous data for this date filling in 2s .. for paez


100%|██████████| 6510/6510 [01:17<00:00, 83.95it/s] 


Unnamed: 0,uid,region,severity
0,aabn,west,2
1,aair,west,4
2,aajw,northeast,2
3,aalr,midwest,4
4,aalw,west,4
...,...,...,...
6505,zzpn,northeast,5
6506,zzrv,west,4
6507,zzsx,south,2
6508,zzvv,west,4


In [47]:
# Distribution of predicted severity levels, as a sanity check before writing the file.
print(sub_format.severity.value_counts(normalize=True))

# NOTE: the target directory name contains a space ('to submit').
sub_format.to_csv('../submissions/to submit/guess1_newly_validated.csv', index=False)

4    0.339785
1    0.311982
3    0.191705
2    0.153303
5    0.003226
Name: severity, dtype: float64


# Sooo....

- The discrepancy arises because a high percentage of the date–region combinations (as used in make_guess2) are missing for the test set, whereas the validation set is "sweet and nice" and has almost all of the combinations (≈93% overlap with the training split).
- Finally figured something out, but not sure whether it is right.
- 

# Todos :

- Check this theory with a submission.
- Analyse date misses vs. date_reg misses.
- Figure out a better guessing hypothesis on the new validation set.