``Mission : Error Analysis & Model metadata``


# Data and dependencies

In [99]:
import warnings
import sys
import os
import time
import joblib
import random
from tqdm import tqdm
from pprint import pprint

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
from geopy.distance import geodesic
from joblib import Parallel, delayed

from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import train_test_split, StratifiedKFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder

from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

warnings.filterwarnings('ignore')

In [2]:
metadata = pd.read_csv('../data/metadata.csv')
sub_format = pd.read_csv('../data/submission_format.csv')
train_labels = pd.read_csv('../data/train_labels.csv')

In [3]:
def rmse(y_true, y_pred):
    return mse(y_true, y_pred, squared=False)

In [4]:
def dens_to_sev(x: float)-> int:
    """takes in density value in cells/ml and returns severity category"""
    if (x < 20_000) : return 1
    elif (x >= 20_000) and (x < 100_000) : return 2
    elif (x >= 100_000) and (x < 1_000_000) : return 3
    elif (x >= 1_000_000) and (x < 10_000_000) : return 4
    elif x > 10_000_000 : return 5

## Add date fts

In [5]:
metadata.date = pd.to_datetime(metadata.date)
metadata['year'] = metadata.date.dt.year
metadata['month'] = metadata.date.dt.month
metadata['week'] = metadata.date.dt.isocalendar().week


seasons = {
    1: 1,
    2: 1,
    3: 2,
    4: 2,
    5: 2,
    6: 3,
    7: 3,
    8: 3,
    9: 4,
    10: 4,
    11: 4,
    12: 1
}

metadata['season'] = metadata.month.map(seasons)


region = pd.concat((train_labels, sub_format[['region', 'uid']]), axis=0)

data = pd.merge(metadata, region, on='uid', how='left')
data

Unnamed: 0,uid,latitude,longitude,date,split,year,month,week,season,region,severity,density
0,aabm,39.080319,-86.430867,2018-05-14,train,2018,5,20,2,midwest,1.0,585.0
1,aabn,36.559700,-121.510000,2016-08-31,test,2016,8,35,3,west,,
2,aacd,35.875083,-78.878434,2020-11-19,train,2020,11,47,4,south,1.0,290.0
3,aaee,35.487000,-79.062133,2016-08-24,train,2016,8,34,3,south,1.0,1614.0
4,aaff,38.049471,-99.827001,2019-07-23,train,2019,7,30,3,midwest,3.0,111825.0
...,...,...,...,...,...,...,...,...,...,...,...,...
23565,zzvv,36.708500,-121.749000,2014-12-02,test,2014,12,49,1,west,,
23566,zzwo,39.792190,-99.971050,2017-06-19,train,2017,6,25,3,midwest,2.0,48510.0
23567,zzwq,35.794000,-79.012551,2015-03-24,train,2015,3,13,2,south,1.0,1271.0
23568,zzyb,35.742000,-79.238600,2016-11-21,train,2016,11,47,4,south,1.0,9682.0


In [6]:
test_data = data[data.split == 'test']
test_data.shape, data.shape

((6510, 12), (23570, 12))

In [7]:
train_data = data[data.split == 'train']
train_data.shape, data.shape

((17060, 12), (23570, 12))

# Utils

In [8]:
#  Utils
def get_data_by_date( date=None, data=train_data):
    return data[data.date == date]


def get_distance(lat1, lon1, lat2, lon2):
    return geodesic((lat1, lon1), (lat2, lon2)).km

def analyize_matches(y_true, y_pred):
    print("Exact matches: ", sum(y_true == y_pred) / len(y_true))
    
    print("Missed by 1: ", sum(abs(y_true - y_pred) == 1) / len(y_true))
    print("Missed by 2: ", sum(abs(y_true - y_pred) == 2) / len(y_true))
    print("Missed by 3: ", sum(abs(y_true - y_pred) == 3) / len(y_true))
    print("Missed by 4: ", sum(abs(y_true - y_pred) == 4) / len(y_true))

In [9]:
te_data = test_data.sort_values(by='date')
te_data

Unnamed: 0,uid,latitude,longitude,date,split,year,month,week,season,region,severity,density
6865,howw,37.0062,-120.600,2013-01-08,test,2013,1,2,1,west,,
3661,eamn,36.9818,-120.221,2013-01-08,test,2013,1,2,1,west,,
7668,imsv,36.9836,-120.500,2013-01-08,test,2013,1,2,1,west,,
20182,wgxq,33.8011,-117.205,2013-01-25,test,2013,1,4,1,west,,
16095,rsos,33.8892,-117.562,2013-01-25,test,2013,1,4,1,west,,
...,...,...,...,...,...,...,...,...,...,...,...,...
12443,nsoi,36.7368,-121.734,2021-12-29,test,2021,12,52,1,west,,
14254,prfi,36.7518,-121.742,2021-12-29,test,2021,12,52,1,west,,
6864,howu,36.7085,-121.749,2021-12-29,test,2021,12,52,1,west,,
6540,hfvr,36.7962,-121.782,2021-12-29,test,2021,12,52,1,west,,


## Train test split

In [10]:
tr_data, val_data = train_test_split(train_data, test_size=0.15, random_state=123456789, shuffle=True)
tr_data.shape, val_data.shape

((14501, 12), (2559, 12))

In [11]:
tr_data2, val_data2 = train_test_split(train_data, test_size=0.15, random_state=123456789, shuffle=True)
tr_data2.shape, val_data2.shape

((14501, 12), (2559, 12))

## Guess Funcs

In [12]:
fill_2s = []
g_from_past = {}

n_times_called = {}

def make_guess1(row: pd.Series, date=None, tr_data=tr_data) -> pd.Series:

    dists = []
    region = row.region
    date = date
    uid = row.uid
    tr_data = tr_data

    if date is None:
        date = row.date

    rel_data = tr_data[(tr_data.date == date) & (tr_data.region == region)]
    
    # check if cur date is past '2013-01-04'
    if date < pd.to_datetime('2013-01-04'):
        print(f'No previous data for this date filling in 2s .. for {row.uid}')
        fill_2s.append(uid)
        return 2

    if rel_data.shape[0] == 0:
        # print(f'No data for this date, trying previous day.. for {row.uid}')
        if g_from_past.get(uid) is not None:
            g_from_past[uid] += 1
        return make_guess1(row, date=date - pd.Timedelta(days=1), tr_data=tr_data)

    for some_row in rel_data.itertuples():
        dist = get_distance(row.latitude, row.longitude, some_row.latitude, some_row.longitude)
        dists.append(dist)
    
    nearest = rel_data.iloc[np.argmin(dists)]
    return nearest.severity


def make_guess2(row: pd.Series, date=None, tr_data=tr_data, n_times_called=None) -> pd.Series:
    """modified version of make_guess1, Uses mean/mode of severity for the region instead of nearest severity."""
    dists = []
    region = row.region
    date = date
    uid = row.uid
    tr_data = tr_data
    
    if n_times_called is not None:
        if n_times_called.get(uid) is None:
            n_times_called[uid] = 1
        else:
            n_times_called[uid] += 1
        

    if date is None:
        date = row.date

    rel_data = tr_data[(tr_data.date == date) & (tr_data.region == region)]
    
    # check if cur date is past '2013-01-04'
    if date < pd.to_datetime('2013-01-04'):
        print(f'No previous data for this date filling in 2s .. for {row.uid}')
        fill_2s.append(uid)
        return 2

    if rel_data.shape[0] == 0:
        # print(f'No data for this date, trying previous day.. for {row.uid}')
        if g_from_past.get(uid) is not None:
            g_from_past[uid] += 1
        global count 
        count += 1
        return make_guess2(row, date=date - pd.Timedelta(days=1), tr_data=tr_data)

    severty_mode = rel_data.severity.mode()[0]
    severty_mean = np.round(rel_data.severity.mean())

    return severty_mean



def cv_loop(rand, splits=10, guess_func=make_guess1):
    # print("Random Number: ", rand)
    skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=rand)
    tscv = TimeSeriesSplit(n_splits=splits)
    
    rmses = []
    guess_train_preds = np.zeros((train_data.shape[0]))

    for fold, (train_idx, val_idx) in enumerate(tscv.split(train_data, train_data.severity)):
        print(f"Fold: {fold}")
        tr_data, val_data = train_data.iloc[train_idx], train_data.iloc[val_idx]

        val_data['guess'] = 0

        temp = []
        for row in tqdm(val_data.itertuples(), total=val_data.shape[0]):
            uid_series = val_data[val_data.uid == row.uid]
            severity = guess_func(uid_series.iloc[0], date=row.date, tr_data=tr_data)
            val_data.loc[val_data.uid == row.uid, f'guess'] = severity
            temp.append(severity)
        
        guess_train_preds[val_idx] = temp
        

                
        errror = rmse(val_data.severity, val_data.guess1)
        rmses.append(errror)
        print("RMSE: ", errror)

        print('Train Distribution: ')
        print(tr_data.severity.value_counts(normalize=True))
        print('Val Distribution: ')
        print(val_data.severity.value_counts(normalize=True))
        print('Predicted Distribution: ')
        print(val_data.guess1.value_counts(normalize=True))

    
    print('----------------------------------------------------')

    return rmses, guess_train_preds


In [13]:
tr_data.sort_values(by='date', inplace=True)
val_data.sort_values(by='date', inplace=True)

In [14]:
tr_data2.sort_values(by='date', inplace=True)
val_data2.sort_values(by='date', inplace=True)

### Failed way of validation

In [15]:
# 0.08284486127393513 + 0.008206330597889801 + 0.0011723329425556857
# 91% preds < 1 offs, 99% preds < 2 offs, 88% preds == 2 offs

### New Era VAL SET

In [16]:
val_data2['date_reg'] = val_data2.date.astype(str) + "_" +  val_data2.region
tr_data2['date_reg'] = tr_data2.date.astype(str) + "_" +  tr_data2.region

assert (val_data2.columns == tr_data2.columns).all()

print(set(val_data2.uid).intersection(set(tr_data2.uid)))

#  Intersection percentage of date and regs before
print(len(set(val_data2.date_reg).intersection(set(tr_data2.date_reg)))/val_data2.date_reg.nunique())

set()
0.9328793774319066


In [17]:
datereg_to_remove = val_data2.date_reg.sample(frac=0.40, random_state=123456789)
tr_data2_ = tr_data2[~tr_data2.date_reg.isin(datereg_to_remove)]

len(set(val_data2.date_reg).intersection(set(tr_data2_.date_reg)))/val_data2.date_reg.nunique()

0.372568093385214

In [18]:
len(set(val_data2.date).intersection(set(tr_data2_.date)))/val_data2.date.nunique()

0.6537013801756587

In [19]:
len(set(test_data.date).intersection(set(train_data.date)))/test_data.date.nunique()

0.701095461658842

In [20]:
tr_data2_.shape, val_data2.shape, train_data.shape, val_data.shape

((5712, 13), (2559, 13), (17060, 12), (2559, 12))

In [21]:
tr_data3 = tr_data2.copy()
val_data3 = val_data2.copy()

datereg_to_remove = tr_data3.date_reg.sample(frac=0.60, random_state=123456789)
val_data3_ = val_data3[~val_data3.date_reg.isin(datereg_to_remove)]

len(set(val_data3_.date_reg).intersection(set(tr_data3.date_reg)))/val_data3_.date_reg.nunique()

0.43902439024390244

In [22]:
len(set(val_data3_.date).intersection(set(tr_data2_.date)))/val_data3_.date.nunique()

0.6186440677966102

In [23]:
val_data3.shape, val_data3_.shape, tr_data3.shape

#  I wonder if this is a good idea!

((2559, 13), (137, 13), (14501, 13))

In [24]:
# Don't foreget to clip values and trees are not good at extrapolation!!

# Guess based on nearest neighbors

In [25]:
train_data.groupby('region').severity.mean()

region
midwest      2.194091
northeast    1.805774
south        1.567652
west         3.747413
Name: severity, dtype: float64

In [26]:
reg_sev_map = {
    'midwest': 2,
    'northeast': 2,
    'south' : 2,
    'west' : 4
}

In [27]:
def knn(row=None, train_data=tr_data, k=1):
    """
    row : pd.Series (row from val_data)
    train_data : pd.DataFrame (all_data)
    k : int number of nearest neighbours to consider
    
    algo:
    1. Get past month data collected till the current row
    2. Get the k nearest neighbours (geodesic dist using lat, lng) from the above data
    3. Get the mean of the severity from the above rows
    4. Return the mean of the nearest neighbours severity

    """
    
    if row is None:
        print('Row None bruv!')
        return None
    
    uid = row.uid
    date = row.date
    region = row.region
    past_date = date - pd.Timedelta(days=30)
    
    past_month_data = train_data[(train_data.date < date) & (train_data.date >= past_date)]
    past_month_data.sort_values(by='date', inplace=True)
    
    # if no past data, return the mean of the region
    if len(past_month_data) == 0:
        return reg_sev_map[region]

    
    dist_matrix =pd.DataFrame(columns=['uid', 'dist'])       # 0th col for uid, 1st col for dist
    for i, past_row in enumerate(past_month_data.itertuples()):
        dist_matrix.loc[i, 'uid'] = past_row.uid
        dist_matrix.loc[i, 'dist'] = get_distance(row.latitude, row.longitude, past_row.latitude, past_row.longitude)  # returns geodesic dist in km

    # get top k nearest neighbours
    n_uids = dist_matrix.sort_values(by='dist').head(k).uid.values
    nn_severity = train_data[train_data.uid.isin(n_uids)].severity.mean()
    
    return np.round(nn_severity)


In [28]:
val_data.uid.isin(tr_data.uid).sum()

0

In [65]:
#  iterate over val_data and get the severity for each row

k = 10

for row in tqdm(val_data.itertuples(), total=val_data.shape[0]):
    severity = knn(row, train_data=tr_data, k=k)
    val_data.loc[val_data.uid == row.uid, f'nn{k}_guess'] = severity

100%|██████████| 2559/2559 [06:19<00:00,  6.74it/s]


In [67]:
rmse(val_data.severity, val_data.nn10_guess)

0.8965654582604005

In [68]:
rmse(val_data.severity, val_data.nn5_guess)

0.8738303744155784

In [48]:
# k=1 0.9515418063860925
# k=2 0.9511310393449847
# k=3 0.9076118558993864
# k=4 0.8820647913897578
# k=5 0.8738303744155784
# k=6 0.8884654455787772
# k=7 0.8873651783510002
# k=8 0.8853812517340963
# k=9 0.894165010958815
# k=10 0.8965654582604005

In [128]:
tr_data2.shape, val_data2.shape, tr_data2_.shape

((14501, 13), (2559, 13), (5712, 13))

In [54]:
val_data2.uid.isin(tr_data2_.uid).sum()

0

In [84]:
tr_data2_['latlng'] = tr_data2_.latitude.astype(str) + "_" + tr_data2_.longitude.astype(str)
val_data2['latlng'] = val_data2.latitude.astype(str) + "_" + val_data2.longitude.astype(str)

In [87]:
len(set(tr_data2_.latlng).intersection(set(val_data2.latlng)))/len(set(val_data2.latlng))
# 15% intersection

0.1574023115024766

In [69]:
for row in tqdm(val_data2.itertuples(), total=val_data2.shape[0]):
    severity = knn(row, train_data=tr_data2_, k=5)
    val_data2.loc[val_data2.uid == row.uid, f'nn5_guess'] = severity

100%|██████████| 2559/2559 [03:32<00:00, 12.02it/s]


In [77]:
rmse(val_data2.severity, val_data2.nn5_guess)

1.010882559084407

In [71]:
tr_data3.shape, val_data3.shape, val_data3_.shape

((14501, 13), (2559, 13), (137, 14))

In [78]:
val_data3_.uid.isin(tr_data3.uid).sum()

0

In [88]:
tr_data3['latlng'] = tr_data3.latitude.astype(str) + "_" + tr_data3.longitude.astype(str)
val_data3_['latlng'] = val_data3_.latitude.astype(str) + "_" + val_data3_.longitude.astype(str)

len(set(tr_data3.latlng).intersection(set(val_data3_.latlng)))/len(set(val_data3_.latlng))
# 60% intersection

0.5964912280701754

In [79]:
for row in tqdm(val_data3_.itertuples(), total=val_data3_.shape[0]):
    severity = knn(row, train_data=tr_data3, k=5)
    val_data3_.loc[val_data3_.uid == row.uid, f'nn5_guess'] = severity

100%|██████████| 137/137 [00:18<00:00,  7.59it/s]


In [80]:
rmse(val_data3_.severity, val_data3_.nn5_guess)

0.9778570343163892

2559

In [64]:
data.sort_values(by='date')

Unnamed: 0,uid,latitude,longitude,date,split,year,month,week,season,region,severity,density,latlng
4387,evep,44.847993,-93.476318,2013-01-04,train,2013,1,1,1,midwest,1.0,115.0,44.847993_-93.476318
13644,paev,44.822478,-93.367962,2013-01-04,train,2013,1,1,1,midwest,1.0,1884.0,44.822478_-93.367962
5566,gdxr,44.877646,-93.557842,2013-01-04,train,2013,1,1,1,midwest,1.0,1416.0,44.877646_-93.557842
6144,guny,44.878889,-93.490833,2013-01-04,train,2013,1,1,1,midwest,1.0,558.0,44.878889_-93.490833
5317,fwbt,44.850500,-93.515700,2013-01-04,train,2013,1,1,1,midwest,1.0,476.0,44.8505_-93.5157
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12443,nsoi,36.736800,-121.734000,2021-12-29,test,2021,12,52,1,west,,,36.7368_-121.734
17559,thki,36.725400,-121.730000,2021-12-29,test,2021,12,52,1,west,,,36.7254_-121.73
17452,teuu,36.772300,-121.788000,2021-12-29,test,2021,12,52,1,west,,,36.7723_-121.788
14254,prfi,36.751800,-121.742000,2021-12-29,test,2021,12,52,1,west,,,36.7518_-121.742


In [59]:
data['latlng'] = data.latitude.astype(str) + '_' + data.longitude.astype(str)

data['latlng'].nunique()

12681

In [61]:
data[data.split == 'test'].latlng.nunique()/len(data[data.split == 'test'])

0.3175115207373272

In [62]:
data[data.split == 'train'].latlng.nunique()/len(data[data.split == 'train'])

0.6221570926143024

In [82]:
set(data[data.split == 'test'].latlng).intersection(set(data[data.split == 'train'].latlng))
# no common latlngs between test and train_data

set()

In [89]:
test_data

Unnamed: 0,uid,latitude,longitude,date,split,year,month,week,season,region,severity,density
1,aabn,36.559700,-121.510000,2016-08-31,test,2016,8,35,3,west,,
12,aair,33.042600,-117.076000,2014-11-01,test,2014,11,44,4,west,,
14,aajw,40.703968,-80.293050,2015-08-26,test,2015,8,35,3,northeast,,
15,aalr,38.972500,-94.672930,2019-08-26,test,2019,8,35,3,midwest,,
16,aalw,34.279000,-118.905000,2018-01-08,test,2018,1,2,1,west,,
...,...,...,...,...,...,...,...,...,...,...,...,...
23556,zzpn,40.136410,-80.473740,2019-07-08,test,2019,7,28,3,northeast,,
23560,zzrv,36.875400,-121.561000,2019-09-17,test,2019,9,38,4,west,,
23563,zzsx,34.210000,-78.929389,2019-07-16,test,2019,7,29,3,south,,
23565,zzvv,36.708500,-121.749000,2014-12-02,test,2014,12,49,1,west,,


In [None]:
# k = 5

# for row in tqdm(test_data.itertuples(), total=test_data.shape[0]):
#     severity = knn(row, train_data=train_data, k=k)
#     test_data.loc[test_data.uid == row.uid, f'nn{k}_guess'] = severity

In [100]:
#  parrllelize the above code
k = 5
sev_list = Parallel(n_jobs=-1, backend='loky')([delayed(knn)(row, train_data=train_data, k=k) for row in test_data.itertuples()])
len(sev_list)

6510

In [102]:
#  hoping sev_list is in the same order as test_data
test_data[f'nn{k}_guess'] = sev_list
test_data.head()

Unnamed: 0,uid,latitude,longitude,date,split,year,month,week,season,region,severity,density,nn5_guess
1,aabn,36.5597,-121.51,2016-08-31,test,2016,8,35,3,west,,,4.0
12,aair,33.0426,-117.076,2014-11-01,test,2014,11,44,4,west,,,4.0
14,aajw,40.703968,-80.29305,2015-08-26,test,2015,8,35,3,northeast,,,1.0
15,aalr,38.9725,-94.67293,2019-08-26,test,2019,8,35,3,midwest,,,4.0
16,aalw,34.279,-118.905,2018-01-08,test,2018,1,2,1,west,,,4.0


# CV it

In [None]:
X_train = train_data.drop(['severity', 'uid', 'date'], axis=1)
y_train = train_data.severity

In [None]:
xgb = XGBRegressor(n_estimators=1000, random_state=123456789, tree_method='gpu_hist', gpu_id=0)
xgb

In [None]:

def train_eval(model, X_train=None, X_val=None, y_train=None, y_val=None, X_test=None, y_test=None):
    """
    train and eval util func,
    returns trained model, soft_preds, and rmse
    REMEMBER to round myself
    """
    
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    rmse = mse(y_val, np.round(preds), squared=False)
    test_rmse = mse(y_test, np.round(model.predict(X_test)), squared=False)
    print("Train RMSE: ", mse(y_train, np.round(model.predict(X_train)), squared=False))
    print("Val RMSE:", rmse)
    print('TEST RMSE: ', test_rmse)
    
    # print("TEST RMSE:", mse(y_val, np.round(model.predict(X_val)), squared=False))
    return model, preds, rmse, test_rmse


def cv_it(model, X_train=X_train, y_train=y_train, X_test=None, y_test=None, splits=10, cv_predict=False):
#     skf = StratifiedKFold(n_splits=splits, random_state=config.RANDOM_STATE, shuffle=True)
    
    tscv = TimeSeriesSplit(n_splits=splits, test_size=200)
    
    val_rmse = []
    test_rmses = []
    
    if cv_predict:
        cvpreds_test = np.zeros(shape=(6510, splits))
        cvpreds_train = np.zeros(shape=(len(X_train)))

    for fold, (train_idx, val_idx) in enumerate(tscv.split(X_train, y_train)):

        print(f'-----------------------Fold-{fold}-------------------------')
        X_train_subset, y_train_subset = X_train.iloc[train_idx], y_train.iloc[train_idx]
        X_val_subset, y_val_subset = X_train.iloc[val_idx], y_train.iloc[val_idx]
        
        print(f'Training on {X_train_subset.shape[0]} samples' )
        print(f'Validating on {X_val_subset.shape[0]} samples' )

        model, val_preds, rmse, test_rmse= train_eval(
                                    model=model, 
                                    X_train=X_train_subset, 
                                    y_train=y_train_subset, 
                                    X_val=X_val_subset, 
                                    y_val=y_val_subset,
                                    X_test=X_test,
                                    y_test=y_test)
        val_rmse.append(rmse)
        test_rmses.append(test_rmse)
        
        if cv_predict:
            # save predictions for ensembling
            cvpreds_test[:, fold] = model.predict(X_test)
            cvpreds_train[val_idx] = val_preds
                    
    print()
    print("Mean Val RMSE:", np.mean(val_rmse), "std:", np.std(val_rmse))
    print("Mean Test RMSE:", np.mean(test_rmses), "std:", np.std(test_rmses))

    
    if cv_predict:
        return cvpreds_test, cvpreds_train

In [None]:
cv_it(xgb, X_test=X_val, y_test=y_val)

-----------------------Fold-0-------------------------
Training on 3756 samples
Validating on 1000 samples
Train RMSE:  0.40497439406819935
Val RMSE: 0.9523654760647301
TEST RMSE:  0.9606356350066096
-----------------------Fold-1-------------------------
Training on 4756 samples
Validating on 1000 samples
Train RMSE:  0.4433889672540245
Val RMSE: 0.8012490249604052
TEST RMSE:  0.9506592589536919
-----------------------Fold-2-------------------------
Training on 5756 samples
Validating on 1000 samples
Train RMSE:  0.4450328126200894
Val RMSE: 0.8882567196480982
TEST RMSE:  0.9149683438337731
-----------------------Fold-3-------------------------
Training on 6756 samples
Validating on 1000 samples
Train RMSE:  0.47182298915642806
Val RMSE: 0.8276472678623424
TEST RMSE:  0.8819552348728436
-----------------------Fold-4-------------------------
Training on 7756 samples
Validating on 1000 samples
Train RMSE:  0.4760916177252889
Val RMSE: 0.9813256340277675
TEST RMSE:  0.8283344822762391
---

In [None]:
tr_data2_new = tr_data2_.copy()
val_data2_new = val_data2.copy()

X_train2 = tr_data2_new.drop(drop_cols, axis=1)
y_train2 = tr_data2_new['severity']
X_val2 = val_data2_new.drop(drop_cols, axis=1)
y_val2 = val_data2_new['severity']

X_train2['region'] = X_train2['region'].map({'midwest': 1, 'west': 2, 'south': 3, 'northeast': 4}) 
X_train2.week = X_train2.week.astype('int')
X_val2['region'] = X_val2['region'].map({'midwest': 1, 'west': 2, 'south': 3, 'northeast': 4})
X_val2.week = X_val2.week.astype('int')

X_val2.drop('date_reg', axis=1, inplace=True)
X_train2.drop('date_reg', axis=1, inplace=True)

X_train2.head()

Unnamed: 0,latitude,longitude,year,month,week,season,region
13644,44.822478,-93.367962,2013,1,1,1,1
4387,44.847993,-93.476318,2013,1,1,1,1
5566,44.877646,-93.557842,2013,1,1,1,1
6144,44.878889,-93.490833,2013,1,1,1,1
5317,44.8505,-93.5157,2013,1,1,1,1


In [None]:
assert (X_val2.columns == X_train2.columns).all()

In [None]:
cv_it(xgb, X_train=X_train2, y_train=y_train2, X_test=X_val2, y_test=y_val2)

-----------------------Fold-0-------------------------
Training on 3712 samples
Validating on 200 samples
Train RMSE:  0.3207950790748211
Val RMSE: 0.8306623862918074
TEST RMSE:  0.8257766010973492
-----------------------Fold-1-------------------------
Training on 3912 samples
Validating on 200 samples
Train RMSE:  0.3091977878052171
Val RMSE: 0.5196152422706632
TEST RMSE:  0.8219820877364625
-----------------------Fold-2-------------------------
Training on 4112 samples
Validating on 200 samples
Train RMSE:  0.3153806914583502
Val RMSE: 0.7905694150420949
TEST RMSE:  0.7963869172221666
-----------------------Fold-3-------------------------
Training on 4312 samples
Validating on 200 samples
Train RMSE:  0.3083561057294485
Val RMSE: 1.036822067666386
TEST RMSE:  0.8073519158712397
-----------------------Fold-4-------------------------
Training on 4512 samples
Validating on 200 samples
Train RMSE:  0.3122782901459112
Val RMSE: 0.6324555320336759
TEST RMSE:  0.7924516891030828
-----------

In [None]:
0.7826725928953457  + 0.14806624304842844

0.9307388359437742

In [None]:
xgb.fit(X_train2, y_train2)
preds = xgb.predict(X_val2)  # new validation set with lng and lats !
preds = np.clip(np.round(preds), 1, 5)
rmse(y_val2, preds)

0.806625553226778

In [None]:
pd.Series(preds).apply(np.round).clip(1, 5).value_counts(normalize=True)

1.0    0.364596
2.0    0.305588
4.0    0.199297
3.0    0.129347
5.0    0.001172
dtype: float64

In [None]:
tr_data3new = tr_data3.copy()
val_data3__new = val_data3_.copy()

X_train3 = tr_data3new.drop(drop_cols, axis=1)
y_train3 = tr_data3new['severity']
X_val3 = val_data3__new.drop(drop_cols, axis=1)
y_val3 = val_data3__new['severity']

X_train3['region'] = X_train3['region'].map({'midwest': 1, 'west': 2, 'south': 3, 'northeast': 4}) 
X_train3.week = X_train3.week.astype('int')
X_val3['region'] = X_val3['region'].map({'midwest': 1, 'west': 2, 'south': 3, 'northeast': 4})
X_val3.week = X_val3.week.astype('int')

X_val3.drop('date_reg', axis=1, inplace=True)
X_train3.drop('date_reg', axis=1, inplace=True)

X_train3.head()

Unnamed: 0,latitude,longitude,year,month,week,season,region
13644,44.822478,-93.367962,2013,1,1,1,1
4387,44.847993,-93.476318,2013,1,1,1,1
5566,44.877646,-93.557842,2013,1,1,1,1
6144,44.878889,-93.490833,2013,1,1,1,1
5317,44.8505,-93.5157,2013,1,1,1,1


In [148]:
xgb.fit(X_train3, y_train3)
preds = xgb.predict(X_val3)  # new validation set with lng and lats !
preds = np.clip(np.round(preds), 1, 5)
rmse(y_val3, preds)

0.8014585244561053

In [150]:
X_train3['latlng'] = X_train3['latitude'].astype('str') + X_train3['longitude'].astype('str')
X_val3['latlng'] = X_val3['latitude'].astype('str') + X_val3['longitude'].astype('str')

len(set(X_train3.latlng.unique()) & set(X_val3.latlng.unique()))/len(X_val3)

0.49635036496350365

# Submission

In [119]:
#  Making submission with knn-5 @ 1.00

assert sub_format.uid.equals(test_data.reset_index().uid) == True

sub_format.severity = test_data.nn5_guess.values

sub_format.severity = sub_format.severity.astype(int)
sub_format.severity.value_counts()

4    2198
1    1963
2    1631
3     718
Name: severity, dtype: int64

In [116]:
# save submission
sub_format.to_csv('../submissions/to submit/nn5_guess_preds.csv', index=False)

In [None]:
#  May be its better if start from 

# Sooo....

Why am I behind?

- They clearly saw something in metadata that I didn't
- I know for a fact that no one on the top is using images as features
- How the train test split was done :
    - No overlapping geo locations
    - only 51% of test date_regs are in train vs 92% in val
    

# Todos :

- idk..