``Mission : Error Analysis & Model metadata``


# Data and dependencies

In [1]:
import warnings
import sys
import os
import time
import joblib
import random
from tqdm import tqdm
from pprint import pprint

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
from geopy.distance import geodesic
from joblib import Parallel, delayed

from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split, StratifiedKFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder

from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

warnings.filterwarnings('ignore')

In [2]:
# Load the three input tables from the data directory.
# `sub_format` is used later both as the source of region/uid for test rows
# and as the template that the final predictions are written into.
metadata = pd.read_csv('../data/metadata.csv')
sub_format = pd.read_csv('../data/submission_format.csv')
train_labels = pd.read_csv('../data/train_labels.csv')

In [3]:
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    The square root is taken explicitly instead of passing ``squared=False``
    to ``mean_squared_error``: that keyword was deprecated in scikit-learn 1.4
    and removed in later releases, so relying on it fails on current versions.
    For the 1-D targets used in this notebook the result is identical.

    Parameters
    ----------
    y_true, y_pred : array-like of equal length
        Ground-truth and predicted values.

    Returns
    -------
    float
        sqrt(mean((y_true - y_pred) ** 2)).
    """
    return np.sqrt(mse(y_true, y_pred))

In [4]:
def dens_to_sev(x: float)-> int:
    """Map a density in cells/ml to its severity category.

    Buckets (lower bound inclusive, upper bound exclusive):
        1: x < 20_000
        2: 20_000 <= x < 100_000
        3: 100_000 <= x < 1_000_000
        4: 1_000_000 <= x < 10_000_000
        5: x >= 10_000_000

    The top bucket uses ``>=``; with the previous strict ``>`` a density of
    exactly 10_000_000 matched no branch and silently produced ``None``.

    NaN compares false against every bound, so it still yields ``None``.
    """
    if x < 20_000:
        return 1
    if x < 100_000:
        return 2
    if x < 1_000_000:
        return 3
    if x < 10_000_000:
        return 4
    if x >= 10_000_000:
        return 5
    # Only reachable for NaN (all comparisons above are False).
    return None

## Add date fts

In [5]:
# Parse the sampling date and derive calendar features from it.
metadata.date = pd.to_datetime(metadata.date)
metadata['year'] = metadata.date.dt.year
metadata['month'] = metadata.date.dt.month
metadata['week'] = metadata.date.dt.isocalendar().week  # ISO week number


# Month -> season code: 1 = Dec-Feb, 2 = Mar-May, 3 = Jun-Aug, 4 = Sep-Nov.
seasons = {
    1: 1,
    2: 1,
    3: 2,
    4: 2,
    5: 2,
    6: 3,
    7: 3,
    8: 3,
    9: 4,
    10: 4,
    11: 4,
    12: 1
}

metadata['season'] = metadata.month.map(seasons)


# Stack the labelled rows on top of the region/uid pairs of the submission
# template so that every uid gets a region; rows coming from `sub_format`
# have no severity/density and show up as NaN after the merge.
region = pd.concat((train_labels, sub_format[['region', 'uid']]), axis=0)

# Left join keeps one row per metadata entry.
data = pd.merge(metadata, region, on='uid', how='left')
data

Unnamed: 0,uid,latitude,longitude,date,split,year,month,week,season,region,severity,density
0,aabm,39.080319,-86.430867,2018-05-14,train,2018,5,20,2,midwest,1.0,585.0
1,aabn,36.559700,-121.510000,2016-08-31,test,2016,8,35,3,west,,
2,aacd,35.875083,-78.878434,2020-11-19,train,2020,11,47,4,south,1.0,290.0
3,aaee,35.487000,-79.062133,2016-08-24,train,2016,8,34,3,south,1.0,1614.0
4,aaff,38.049471,-99.827001,2019-07-23,train,2019,7,30,3,midwest,3.0,111825.0
...,...,...,...,...,...,...,...,...,...,...,...,...
23565,zzvv,36.708500,-121.749000,2014-12-02,test,2014,12,49,1,west,,
23566,zzwo,39.792190,-99.971050,2017-06-19,train,2017,6,25,3,midwest,2.0,48510.0
23567,zzwq,35.794000,-79.012551,2015-03-24,train,2015,3,13,2,south,1.0,1271.0
23568,zzyb,35.742000,-79.238600,2016-11-21,train,2016,11,47,4,south,1.0,9682.0


In [6]:
test_data = data[data.split == 'test']
test_data.shape, data.shape

((6510, 12), (23570, 12))

In [7]:
train_data = data[data.split == 'train']
train_data.shape, data.shape

((17060, 12), (23570, 12))

# Utils

In [8]:
#  Utils
def get_data_by_date( date=None, data=train_data):
    """Return the rows of ``data`` whose ``date`` column equals ``date``."""
    same_day = data.date == date
    return data[same_day]


def get_distance(lat1, lon1, lat2, lon2):
    """Geodesic distance in kilometres between two (latitude, longitude) points."""
    origin = (lat1, lon1)
    target = (lat2, lon2)
    return geodesic(origin, target).km

def analyize_matches(y_true, y_pred):
    """Print the share of exact matches and of predictions off by 1, 2, 3 and 4.

    Each share is the number of matching elements divided by ``len(y_true)``.
    """
    total = len(y_true)
    print("Exact matches: ", sum(y_true == y_pred) / total)

    abs_gap = abs(y_true - y_pred)
    for gap in range(1, 5):
        print(f"Missed by {gap}: ", sum(abs_gap == gap) / total)

In [9]:
te_data = test_data.sort_values(by='date')

# Train test split

In [10]:
# Random (shuffled) 85/15 split of the labelled rows; fixed seed for reproducibility.
# NOTE(review): later cells add columns to these frames in place.
tr_data, val_data = train_test_split(train_data, test_size=0.15, random_state=123456789, shuffle=True)
tr_data.shape, val_data.shape

((14501, 12), (2559, 12))

In [11]:
# Build a train/val pair whose date+region overlap is closer to the train/test overlap.
# `date_reg` is the string key "<date>_<region>".
val_data['date_reg'] = val_data.date.astype(str) + "_" +  val_data.region
tr_data['date_reg'] = tr_data.date.astype(str) + "_" +  tr_data.region

assert (val_data.columns == tr_data.columns).all()

# Sanity check: uids shared by train and val (expected to be the empty set).
print(set(val_data.uid).intersection(set(tr_data.uid)))

# Share of val date_reg keys that also occur in train, before any removal.
# NOTE: the labels say "%" but the printed values are fractions in [0, 1].
print(f" % of intersection between date and regions in val and train sets before correction: {len(set(val_data.date_reg).intersection(set(tr_data.date_reg)))/val_data.date_reg.nunique()}")

# Sample 40% of the val rows' date_reg keys and drop every train row carrying one of them.
datereg_to_remove = val_data.date_reg.sample(frac=0.40, random_state=123456789)
tr_data2_te_dist = tr_data[~tr_data.date_reg.isin(datereg_to_remove)]
# This is an alias, not a copy: columns added to val_data2_te_dist later also appear on val_data.
val_data2_te_dist = val_data

print(f" % of intersection between date and regions in val and train sets after correction: {len(set(val_data2_te_dist.date_reg).intersection(set(tr_data2_te_dist.date_reg)))/val_data2_te_dist.date_reg.nunique()}")


# Date-only overlap for the corrected val/train pair, and for the real test/train pair as reference.
print(f" % of intersection between dates in val and train sets: {len(set(val_data2_te_dist.date).intersection(set(tr_data2_te_dist.date)))/val_data2_te_dist.date.nunique() :<75f} ")
print(f" % of intersection between date in test and train sets: {len(set(test_data.date).intersection(set(train_data.date)))/test_data.date.nunique():<75f}" )

tr_data2_te_dist.shape, val_data2_te_dist.shape, train_data.shape, val_data.shape


set()
 % of intersection between date and regions in val and train sets before correction: 0.9328793774319066
 % of intersection between date and regions in val and train sets after correction: 0.36867704280155644
 % of intersection between dates in val and train sets: 0.656211                                                                    
 % of intersection between date in test and train sets: 0.701095                                                                   


((5617, 13), (2559, 13), (17060, 12), (2559, 13))

In [12]:
# Alternative correction: keep the full training set and shrink the validation set instead.
tr_data3 = tr_data.copy()
val_data3 = val_data.copy()

# Sample 60% of the train rows' date_reg keys and drop every val row carrying one of them.
datereg_to_remove = tr_data3.date_reg.sample(frac=0.60, random_state=123456789)
val_data3_te_dist = val_data3[~val_data3.date_reg.isin(datereg_to_remove)]
tr_data3_te_dist = tr_data3  # alias of tr_data3, not a copy

# Fraction of the remaining val keys (date_reg, then date alone) that also occur in train.
print("matching dateregs btw val and tr:", len(set(val_data3_te_dist.date_reg).intersection(set(tr_data3_te_dist.date_reg)))/val_data3_te_dist.date_reg.nunique())
print("matching dates btw val and tr:",len(set(val_data3_te_dist.date).intersection(set(tr_data3_te_dist.date)))/val_data3_te_dist.date.nunique())

tr_data3_te_dist.shape, val_data3_te_dist.shape

matching dateregs btw val and tr: 0.4
matching dates btw val and tr: 0.7589285714285714


((14501, 13), (131, 13))

In [13]:
# Split by time: everything before 2019-01-01 is train, the rest is validation.
# NOTE: this rebinds `train_data` to a date-sorted frame; cells below
# (e.g. cv_loop's TimeSeriesSplit) see the sorted version.
train_data = train_data.sort_values(by='date')
train_data_ts = train_data[train_data.date < '2019-01-01']
val_data_ts = train_data[train_data.date >= '2019-01-01']

train_data_ts.shape, val_data_ts.shape

((11968, 12), (5092, 12))

## Guess Funcs

In [15]:
# Module-level bookkeeping shared by the guess functions below.
fill_2s = []        # uids that fell back to the constant guess 2
g_from_past = {}    # uid -> counter updated when a guess has to look at earlier days

n_times_called = {}  # uid -> call counter; make_guess2 only updates a dict passed in explicitly

def make_guess1(row: pd.Series, date=None, tr_data=tr_data) -> float:
    """Guess a sample's severity from the nearest same-day, same-region training row.

    Starting at ``date`` (default: ``row.date``), look for rows of ``tr_data``
    with exactly that date and the same region as ``row``. If there are none,
    step back one day at a time until a day with data is found, then return
    the severity of the geographically closest row of that day. Once the
    search moves before 2013-01-04 the constant 2 is returned and the uid is
    appended to the module-level ``fill_2s`` list.

    The walk back in time is a loop, not recursion: one stack frame per day
    would exceed Python's default recursion limit on a gap of a few years.
    Every day stepped back is tallied per uid in ``g_from_past`` (the earlier
    ``get(uid) is not None`` guard could never fire on the initially empty
    dict, so nothing was ever counted).

    Parameters
    ----------
    row : record exposing ``uid``, ``region``, ``date``, ``latitude`` and
        ``longitude`` as attributes.
    date : timestamp to start the search from; ``None`` means ``row.date``.
    tr_data : DataFrame with ``date``, ``region``, ``latitude``, ``longitude``
        and ``severity`` columns.

    Returns
    -------
    The severity of the nearest matching training row, or 2 as the fallback.
    """
    uid = row.uid

    if date is None:
        date = row.date

    earliest = pd.to_datetime('2013-01-04')
    # The region filter does not change while walking back, so apply it once.
    region_data = tr_data[tr_data.region == row.region]

    while True:
        if date < earliest:
            print(f'No previous data for this date filling in 2s .. for {row.uid}')
            fill_2s.append(uid)
            return 2

        rel_data = region_data[region_data.date == date]
        if rel_data.shape[0] > 0:
            break

        # No data for this day: count the miss and try the previous day.
        g_from_past[uid] = g_from_past.get(uid, 0) + 1
        date = date - pd.Timedelta(days=1)

    dists = [
        get_distance(row.latitude, row.longitude, some_row.latitude, some_row.longitude)
        for some_row in rel_data.itertuples()
    ]

    nearest = rel_data.iloc[np.argmin(dists)]
    return nearest.severity


def make_guess2(row: pd.Series, date=None, tr_data=tr_data, n_times_called=None) -> float:
    """Variant of ``make_guess1`` that returns the rounded mean severity.

    Starting at ``date`` (default: ``row.date``), look for rows of ``tr_data``
    with exactly that date and the same region as ``row``; if there are none,
    step back one day at a time. The result is the mean severity of the first
    non-empty day, rounded with ``np.round``. Once the search moves before
    2013-01-04 the constant 2 is returned and the uid is appended to the
    module-level ``fill_2s`` list.

    Fixes relative to the previous version:
    * the day-by-day walk is a loop instead of unbounded recursion;
    * the notebook-level ``count`` tally is created on first use (it is not
      defined in any visible cell, so ``global count; count += 1`` raised
      ``NameError`` on the first day without data);
    * ``g_from_past`` is actually incremented (the old guard never fired on
      the initially empty dict);
    * the unused distance list and the unused mode computation are gone.

    Parameters
    ----------
    row : record exposing ``uid``, ``region`` and ``date`` as attributes.
    date : timestamp to start the search from; ``None`` means ``row.date``.
    tr_data : DataFrame with ``date``, ``region`` and ``severity`` columns.
    n_times_called : optional dict; when given, ``n_times_called[uid]`` is
        incremented once per call of this function.

    Returns
    -------
    Rounded mean severity of the matching rows, or 2 as the fallback.
    """
    uid = row.uid

    if n_times_called is not None:
        n_times_called[uid] = n_times_called.get(uid, 0) + 1

    if date is None:
        date = row.date

    earliest = pd.to_datetime('2013-01-04')
    # The region filter does not change while walking back, so apply it once.
    region_data = tr_data[tr_data.region == row.region]

    while True:
        if date < earliest:
            print(f'No previous data for this date filling in 2s .. for {row.uid}')
            fill_2s.append(uid)
            return 2

        rel_data = region_data[region_data.date == date]
        if rel_data.shape[0] > 0:
            break

        # No data for this day: update the tallies and try the previous day.
        g_from_past[uid] = g_from_past.get(uid, 0) + 1
        globals()['count'] = globals().get('count', 0) + 1
        date = date - pd.Timedelta(days=1)

    return np.round(rel_data.severity.mean())



def cv_loop(rand, splits=10, guess_func=make_guess1):
    """Evaluate a guess function with forward-chaining (time series) CV.

    The module-level ``train_data`` is split with ``TimeSeriesSplit``; for
    every fold each validation row is passed to ``guess_func`` together with
    that fold's training rows, and the fold RMSE plus the train / validation /
    predicted severity distributions are printed.

    Fixes relative to the previous version:
    * predictions are stored in and read from the same ``guess`` column
      (the RMSE and the distribution print used a non-existent ``guess1``
      column and raised ``AttributeError``);
    * the validation fold is copied before a column is added to it;
    * rows are iterated directly instead of being re-located by uid for
      every row, which made each fold quadratic.

    Parameters
    ----------
    rand : kept for backward compatibility; it only seeded a
        ``StratifiedKFold`` that was never used for splitting.
    splits : number of ``TimeSeriesSplit`` folds.
    guess_func : callable ``(row, date=..., tr_data=...) -> severity``.

    Returns
    -------
    (rmses, guess_train_preds) : list of per-fold RMSE values, and an array
        aligned with ``train_data`` rows holding the out-of-fold guesses.
        Rows that never land in a validation fold keep the initial 0.
    """
    # NOTE(review): TimeSeriesSplit splits by position, so this presumably
    # relies on `train_data` already being sorted by date - confirm.
    tscv = TimeSeriesSplit(n_splits=splits)

    rmses = []
    guess_train_preds = np.zeros(train_data.shape[0])

    for fold, (train_idx, val_idx) in enumerate(tscv.split(train_data, train_data.severity)):
        print(f"Fold: {fold}")
        tr_data = train_data.iloc[train_idx]
        val_data = train_data.iloc[val_idx].copy()

        guesses = [
            guess_func(row, date=row.date, tr_data=tr_data)
            for _, row in tqdm(val_data.iterrows(), total=val_data.shape[0])
        ]

        val_data['guess'] = guesses
        guess_train_preds[val_idx] = guesses

        errror = rmse(val_data.severity, val_data['guess'])
        rmses.append(errror)
        print("RMSE: ", errror)

        print('Train Distribution: ')
        print(tr_data.severity.value_counts(normalize=True))
        print('Val Distribution: ')
        print(val_data.severity.value_counts(normalize=True))
        print('Predicted Distribution: ')
        print(val_data['guess'].value_counts(normalize=True))

    print('----------------------------------------------------')

    return rmses, guess_train_preds


In [16]:
tr_data.sort_values(by='date', inplace=True)
val_data.sort_values(by='date', inplace=True)

In [18]:
# Don't forget to clip values and trees are not good at extrapolation!!

# Guess based on nearest neighbors

In [14]:
train_data.groupby('region').severity.mean()

region
midwest      2.194091
northeast    1.805774
south        1.567652
west         3.747413
Name: severity, dtype: float64

In [15]:
# Fallback severity per region, used by `knn` when a sample has no training
# rows in its look-back window. The values are the per-region mean severities
# of the training data rounded to the nearest integer.
reg_sev_map = {
    'midwest': 2,
    'northeast': 2,
    'south' : 2,
    'west' : 4
}

In [16]:
def knn(row=None, train_data=tr_data, k=1):
    """Predict severity as the rounded mean of the k geographically nearest recent samples.

    Parameters
    ----------
    row : record exposing ``uid``, ``date``, ``region``, ``latitude`` and
        ``longitude`` as attributes (e.g. an ``itertuples`` row).
    train_data : DataFrame with ``uid``, ``date``, ``latitude``,
        ``longitude`` and ``severity`` columns.
    k : number of nearest neighbours to average.

    Algorithm
    ---------
    1. Keep the training rows dated within the 30 days strictly before
       ``row.date`` (all regions).
    2. Compute the geodesic distance (km) from ``row`` to each of them.
    3. Take the ``k`` closest rows and average their severity.
    4. Return that mean rounded with ``np.round``.

    If the 30-day window is empty, the region default from ``reg_sev_map`` is
    returned. If ``row`` is ``None`` a message is printed and ``None`` is
    returned.

    Implementation notes: the window is sorted into a new frame instead of in
    place (the in-place sort mutated a filtered slice), and distances are
    collected in one array rather than appended row by row to a DataFrame.
    """
    if row is None:
        print('Row None bruv!')
        return None

    date = row.date
    past_date = date - pd.Timedelta(days=30)

    in_window = (train_data.date < date) & (train_data.date >= past_date)
    past_month_data = train_data[in_window].sort_values(by='date')

    # if no past data, return the default of the region
    if len(past_month_data) == 0:
        return reg_sev_map[row.region]

    dists = np.array([
        get_distance(row.latitude, row.longitude, past_row.latitude, past_row.longitude)  # geodesic dist in km
        for past_row in past_month_data.itertuples()
    ])

    # Stable sort: equidistant rows (e.g. repeated sampling sites) are taken in
    # window order; the tie order was unspecified before.
    nearest = np.argsort(dists, kind='stable')[:k]
    n_uids = past_month_data.uid.values[nearest]

    # mean severity of the k nearest neighbours, looked up by uid as before
    nn_severity = train_data[train_data.uid.isin(n_uids)].severity.mean()

    return np.round(nn_severity)


In [17]:
val_data.uid.isin(tr_data.uid).sum()

0

In [18]:
tr_data.shape, tr_data2_te_dist.shape, tr_data3_te_dist.shape

((14501, 13), (5617, 13), (14501, 13))

In [24]:
def knn_wrapper(k=1, train_data=train_data, val_data=val_data):
    """Run ``knn`` for every row of ``val_data`` in parallel.

    Returns the list of predictions in the row order of ``val_data``.
    """
    jobs = [
        delayed(knn)(row, train_data=train_data, k=k)
        for row in val_data.itertuples()
    ]
    runner = Parallel(n_jobs=-1, backend='loky')
    return runner(jobs)

In [184]:
%%time
#  iterate over val_data and get the severity for each row

k = 20
val_data_20 = knn_wrapper(k=k, train_data=tr_data, val_data=val_data)  # random split

CPU times: total: 1min 28s
Wall time: 2min 22s


In [185]:
rmse(val_data.severity, val_data_20)
# k=1  0.9511310393449847
# k=5  0.8807347105254485
# k=20 0.9428781194325851
# k=30 0.9951032905241727

0.9428781194325851

In [33]:
# k=1  0.9515418063860925
# k=2  0.9511310393449847
# k=3  0.9076118558993864
# k=4  0.8820647913897578
# k=5  0.8738303744155784
# k=6  0.8884654455787772
# k=7  0.8873651783510002
# k=8  0.8853812517340963
# k=9  0.894165010958815
# k=10 0.8965654582604005
# k=30 0.9951032905241727

In [34]:
val_data2_te_dist.uid.isin(tr_data2_te_dist.uid).sum()

0

In [35]:
tr_data2_te_dist['latlng'] = tr_data2_te_dist.latitude.astype(str) + "_" + tr_data2_te_dist.longitude.astype(str)
val_data2_te_dist['latlng'] = val_data2_te_dist.latitude.astype(str) + "_" + val_data2_te_dist.longitude.astype(str)
len(set(tr_data2_te_dist.latlng).intersection(set(val_data2_te_dist.latlng)))/len(set(val_data2_te_dist.latlng))
# 15% intersection

0.15685195376995048

In [36]:
k = 5
val_data2_te_5 = knn_wrapper(k=k, train_data=tr_data2_te_dist, val_data=val_data2_te_dist)
rmse(val_data2_te_dist.severity, val_data2_te_5)

1.0093350915731063

In [37]:
# k=1  1.0176250615645182
# k=5  1.01088
# k=10 1.075500737635704
# k=15 1.121915684290619
# k=30 1.2245055465487782

1.2245055465487782

In [37]:
analyize_matches(val_data2_te_dist.severity, val_data2_te_5)

Exact matches:  0.5033216100039077
Missed by 1:  0.365767878077374
Missed by 2:  0.10668229777256741
Missed by 3:  0.023055881203595155
Missed by 4:  0.0011723329425556857


In [40]:
sum([0.5033216100039077,  0.365767878077374, 0])

0.8690894880812817

In [35]:
analyize_matches(val_data2_te_dist.severity, val_data2_te_10)

Exact matches:  0.4650254005470887
Missed by 1:  0.39663931223134036
Missed by 2:  0.09808518952715904
Missed by 3:  0.03946854239937476
Missed by 4:  0.0007815552950371239


In [39]:
analyize_matches(val_data2_te_dist.severity, val_data2_te_dist['nn30_guess'])

Exact matches:  0.31652989449003516
Missed by 1:  0.5013677217663149
Missed by 2:  0.1281750683860883
Missed by 3:  0.053927315357561546
Missed by 4:  0.0


In [45]:
analyize_matches(val_data2_te_dist.severity, val_data2_te_dist['nn1_guess'])

Exact matches:  0.5513872606486909
Missed by 1:  0.30754200859710823
Missed by 2:  0.10941774130519734
Missed by 3:  0.03087143415396639
Missed by 4:  0.0007815552950371239


In [28]:
tr_data3_te_dist['latlng'] = tr_data3_te_dist.latitude.astype(str) + "_" + tr_data3_te_dist.longitude.astype(str)
val_data3_te_dist['latlng'] =val_data3_te_dist.latitude.astype(str) + "_" +val_data3_te_dist.longitude.astype(str)

len(set(tr_data3_te_dist.latlng).intersection(set(val_data3_te_dist.latlng)))/len(set(val_data3_te_dist.latlng))
# 60% intersection

0.5754716981132075

In [48]:
k = 30
for row in tqdm(val_data3_te_dist.itertuples(), total=val_data3_te_dist.shape[0]):
    severity = knn(row, train_data=tr_data3, k=k)
    val_data3_te_dist.loc[val_data3_te_dist.uid == row.uid, f'nn{k}_guess'] = severity

100%|██████████| 131/131 [00:14<00:00,  9.14it/s]


In [50]:
rmse(val_data3_te_dist.severity,val_data3_te_dist.nn30_guess)
# 30 1.066
# 15 1.04
# 5 0.977
# 1 1.059

1.066491715932511

# Expanding means again

In [54]:
# Date-sorted copies of the main frames (the expanding means below depend on row order).
data = data.sort_values(by='date')
test_data_sorted = test_data.sort_values(by='date')

all_train_data_sorted = train_data.sort_values(by='date')
train_data_sorted = tr_data.sort_values(by='date')
val_data_sorted = val_data.sort_values(by='date')

In [69]:
# Baseline: running (expanding) mean severity per region over all labelled rows,
# rounded to the nearest integer. The result is indexed by (region, original row index).
# NOTE(review): an expanding window includes the current row, so each value
# already contains that row's own severity - this looks like target leakage; confirm.
grp_by_region = train_data.groupby('region').severity.expanding(1).mean()
grp_by_region = grp_by_region.map(np.round)

# NOTE(review): these fill a selection of the series in place; whether that
# writes back to `grp_by_region` is not guaranteed. With min_periods=1 there
# are no NaNs to fill anyway (the count printed below is 0).
grp_by_region['west'].fillna(4, inplace=True)
grp_by_region['northeast'].fillna(4, inplace=True)
print(grp_by_region.isna().sum())

# RMSE on the random-split validation rows, aligned by original row index.
rmse(val_data[val_data.index.isin(train_data.index)].severity.sort_index(), grp_by_region.droplevel(0).loc[val_data.index].sort_index())

0


0.9015639227689898

In [86]:
data3 = pd.concat([tr_data3_te_dist, val_data3_te_dist])
data3 = data3.sort_values(by='date')
data3.head()

Unnamed: 0,uid,latitude,longitude,date,split,year,month,week,season,region,severity,density,date_reg
5566,gdxr,44.877646,-93.557842,2013-01-04,train,2013,1,1,1,midwest,1.0,1416.0,2013-01-04_midwest
13644,paev,44.822478,-93.367962,2013-01-04,train,2013,1,1,1,midwest,1.0,1884.0,2013-01-04_midwest
4387,evep,44.847993,-93.476318,2013-01-04,train,2013,1,1,1,midwest,1.0,115.0,2013-01-04_midwest
5317,fwbt,44.8505,-93.5157,2013-01-04,train,2013,1,1,1,midwest,1.0,476.0,2013-01-04_midwest
6144,guny,44.878889,-93.490833,2013-01-04,train,2013,1,1,1,midwest,1.0,558.0,2013-01-04_midwest


In [72]:
data2 = pd.concat([tr_data2_te_dist, val_data2_te_dist], axis=0)
data2 = data2.sort_values(by='date')
data2.head()

Unnamed: 0,uid,latitude,longitude,date,split,year,month,week,season,region,severity,density,date_reg,nn30_guess,nn20_guess
5566,gdxr,44.877646,-93.557842,2013-01-04,train,2013,1,1,1,midwest,1.0,1416.0,2013-01-04_midwest,,
4387,evep,44.847993,-93.476318,2013-01-04,train,2013,1,1,1,midwest,1.0,115.0,2013-01-04_midwest,,
6144,guny,44.878889,-93.490833,2013-01-04,train,2013,1,1,1,midwest,1.0,558.0,2013-01-04_midwest,,
13644,paev,44.822478,-93.367962,2013-01-04,train,2013,1,1,1,midwest,1.0,1884.0,2013-01-04_midwest,,
5317,fwbt,44.8505,-93.5157,2013-01-04,train,2013,1,1,1,midwest,1.0,476.0,2013-01-04_midwest,,


In [76]:
# Same per-region expanding-mean baseline, computed on data2
# (reduced train set + validation rows, sorted by date).
# NOTE(review): the expanding window includes the current row, so validation
# rows contribute their own label to their "prediction" - confirm / shift by one.
grp_by_region = data2.groupby('region').severity.expanding(1).mean()
grp_by_region = grp_by_region.map(np.round)

# NOTE(review): in-place fill on a selection; no NaNs exist with min_periods=1.
grp_by_region['west'].fillna(4, inplace=True)
grp_by_region['northeast'].fillna(4, inplace=True)
print(grp_by_region.isna().sum())


# Truth and prediction are both sorted by original row index so they line up positionally.
# The `.index.isin(data2.index)` filters are always true because data2 contains these rows.
analyize_matches(val_data2_te_dist[val_data2_te_dist.index.isin(data2.index)].severity.sort_index(), grp_by_region.droplevel(0).loc[val_data2_te_dist.index].sort_index())
# RMSE on the validation rows, then on all of data2 (train + val together)
print("val rmse", rmse(val_data2_te_dist[val_data2_te_dist.index.isin(data2.index)].severity.sort_index(), grp_by_region.droplevel(0).loc[val_data2_te_dist.index].sort_index()))
print("trian rmse", rmse(data2[data2.index.isin(data2.index)].severity.sort_index(), grp_by_region.droplevel(0).loc[data2.index].sort_index()))

0
Exact matches:  0.46658851113716293
Missed by 1:  0.4697147323173114
Missed by 2:  0.047674872997264556
Missed by 3:  0.016021883548261038
Missed by 4:  0.0
val rmse 0.8970012130653553
trian rmse 0.925423687808426


In [85]:
# Expanding mean severity per (region, season) on data2, min_periods=1, rounded.
# The result is indexed by (region, season, original row index).
# NOTE(review): the window includes the current row's own severity - possible leakage; confirm.
grp_by_rs = data2.groupby(['region', 'season']).severity.expanding(1).mean()
grp_by_rs = grp_by_rs.map(np.round)

# NOTE(review): a dict passed to fillna is matched against index labels, which
# here are (region, season, row) tuples, so the region-keyed map presumably
# matches nothing. With min_periods=1 there are no NaNs anyway.
grp_by_rs.fillna(reg_sev_map, inplace=True)
print(grp_by_rs.isna().sum())


# Drop the region and season levels so predictions are indexed by original row index.
analyize_matches(val_data2_te_dist[val_data2_te_dist.index.isin(data2.index)].severity.sort_index(), grp_by_rs.droplevel(0).droplevel(0).loc[val_data2_te_dist.index].sort_index())
# RMSE on the validation rows, then on all of data2 (train + val together)
print("val rmse", rmse(val_data2_te_dist[val_data2_te_dist.index.isin(data2.index)].severity.sort_index(), grp_by_rs.droplevel(0).droplevel(0).loc[val_data2_te_dist.index].sort_index()))
print("trian rmse", rmse(data2[data2.index.isin(data2.index)].severity.sort_index(), grp_by_rs.droplevel(0).droplevel(0).loc[data2.index].sort_index()))

0
Exact matches:  0.4618991793669402
Missed by 1:  0.4802657288003126
Missed by 2:  0.0492379835873388
Missed by 3:  0.007424775302852677
Missed by 4:  0.0011723329425556857
val rmse 0.8733830591190974
trian rmse 0.8923188201042993


In [157]:
# why the heck nan != nan ??

True

In [163]:
# (region, season) expanding mean on data2 with min_periods=3: the first two
# rows of every group are NaN until three observations are available.
# NOTE(review): the window includes the current row's own severity - possible leakage; confirm.
grp_by_rs = data2.groupby(['region', 'season']).severity.expanding(3).mean()
grp_by_rs = grp_by_rs.map(np.round)

print(grp_by_rs.isna().sum())
# grp_by_rs.fillna(reg_sev_map, inplace=True)
# Fill the warm-up NaNs with the per-region mean of the rounded expanding means.
# That fill value is not rounded itself, so filled rows can hold non-integer predictions.
grp_by_rs.fillna(grp_by_rs.groupby('region').transform('mean'), inplace=True)

analyize_matches(val_data2_te_dist[val_data2_te_dist.index.isin(data2.index)].severity.sort_index(), grp_by_rs.droplevel(0).droplevel(0).loc[val_data2_te_dist.index].sort_index())
# RMSE on the validation rows, then on all of data2 (train + val together)
print("val rmse", rmse(val_data2_te_dist[val_data2_te_dist.index.isin(data2.index)].severity.sort_index(), grp_by_rs.droplevel(0).droplevel(0).loc[val_data2_te_dist.index].sort_index()))
print("trian rmse", rmse(data2[data2.index.isin(data2.index)].severity.sort_index(), grp_by_rs.droplevel(0).droplevel(0).loc[data2.index].sort_index()))

32
Exact matches:  0.45720984759671746
Missed by 1:  0.4802657288003126
Missed by 2:  0.0492379835873388
Missed by 3:  0.007424775302852677
Missed by 4:  0.0011723329425556857
val rmse 0.8749706353000126
trian rmse 0.8939226593032625


In [164]:
# (region, season) expanding mean on data2 with min_periods=2: the first row
# of every group is NaN.
# NOTE(review): the window includes the current row's own severity - possible leakage; confirm.
grp_by_rs = data2.groupby(['region', 'season']).severity.expanding(2).mean()
grp_by_rs = grp_by_rs.map(np.round)

print(grp_by_rs.isna().sum())
# grp_by_rs.fillna(reg_sev_map, inplace=True)
# Fill the warm-up NaNs with the per-region mean of the rounded expanding means
# (the fill value itself is not rounded).
grp_by_rs.fillna(grp_by_rs.groupby('region').transform('mean'), inplace=True)

analyize_matches(val_data2_te_dist[val_data2_te_dist.index.isin(data2.index)].severity.sort_index(), grp_by_rs.droplevel(0).droplevel(0).loc[val_data2_te_dist.index].sort_index())
# RMSE on the validation rows, then on all of data2 (train + val together)
print("val rmse", rmse(val_data2_te_dist[val_data2_te_dist.index.isin(data2.index)].severity.sort_index(), grp_by_rs.droplevel(0).droplevel(0).loc[val_data2_te_dist.index].sort_index()))
print("trian rmse", rmse(data2[data2.index.isin(data2.index)].severity.sort_index(), grp_by_rs.droplevel(0).droplevel(0).loc[data2.index].sort_index()))

16
Exact matches:  0.45955451348182885
Missed by 1:  0.4802657288003126
Missed by 2:  0.0492379835873388
Missed by 3:  0.007424775302852677
Missed by 4:  0.0011723329425556857
val rmse 0.8738803395059542
trian rmse 0.8932731896924032


In [165]:
# (region, season) expanding mean on data2 with min_periods=5: the first four
# rows of every group are NaN.
# NOTE(review): the window includes the current row's own severity - possible leakage; confirm.
grp_by_rs = data2.groupby(['region', 'season']).severity.expanding(5).mean()
grp_by_rs = grp_by_rs.map(np.round)

print(grp_by_rs.isna().sum())
# grp_by_rs.fillna(reg_sev_map, inplace=True)
# Fill the warm-up NaNs with the per-region mean of the rounded expanding means
# (the fill value itself is not rounded).
grp_by_rs.fillna(grp_by_rs.groupby('region').transform('mean'), inplace=True)

analyize_matches(val_data2_te_dist[val_data2_te_dist.index.isin(data2.index)].severity.sort_index(), grp_by_rs.droplevel(0).droplevel(0).loc[val_data2_te_dist.index].sort_index())
# RMSE on the validation rows, then on all of data2 (train + val together)
print("val rmse", rmse(val_data2_te_dist[val_data2_te_dist.index.isin(data2.index)].severity.sort_index(), grp_by_rs.droplevel(0).droplevel(0).loc[val_data2_te_dist.index].sort_index()))
print("trian rmse", rmse(data2[data2.index.isin(data2.index)].severity.sort_index(), grp_by_rs.droplevel(0).droplevel(0).loc[data2.index].sort_index()))

64
Exact matches:  0.4544744040640875
Missed by 1:  0.4794841735052755
Missed by 2:  0.0492379835873388
Missed by 3:  0.007424775302852677
Missed by 4:  0.0011723329425556857
val rmse 0.8755030384007657
trian rmse 0.8943272674588092


In [169]:
# (region, season) expanding mean on data2 with min_periods=50: every group
# stays NaN until it has accumulated 50 observations.
# NOTE(review): the window includes the current row's own severity - possible leakage; confirm.
grp_by_rs = data2.groupby(['region', 'season']).severity.expanding(50).mean()
grp_by_rs = grp_by_rs.map(np.round)

print(grp_by_rs.isna().sum())
# grp_by_rs.fillna(reg_sev_map, inplace=True)
# Fill the warm-up NaNs with the per-region mean of the rounded expanding means.
# The fill value is not rounded, so filled rows match none of the integer
# buckets printed by analyize_matches (the printed shares sum to less than 1).
grp_by_rs.fillna(grp_by_rs.groupby('region').transform('mean'), inplace=True)

analyize_matches(val_data2_te_dist[val_data2_te_dist.index.isin(data2.index)].severity.sort_index(), grp_by_rs.droplevel(0).droplevel(0).loc[val_data2_te_dist.index].sort_index())
# RMSE on the validation rows, then on all of data2 (train + val together)
print("val rmse", rmse(val_data2_te_dist[val_data2_te_dist.index.isin(data2.index)].severity.sort_index(), grp_by_rs.droplevel(0).droplevel(0).loc[val_data2_te_dist.index].sort_index()))
print("trian rmse", rmse(data2[data2.index.isin(data2.index)].severity.sort_index(), grp_by_rs.droplevel(0).droplevel(0).loc[data2.index].sort_index()))

702
Exact matches:  0.4282923016803439
Missed by 1:  0.4618991793669402
Missed by 2:  0.04572098475967175
Missed by 3:  0.005861664712778429
Missed by 4:  0.0007815552950371239
val rmse 0.8714837174049794
trian rmse 0.8960564303740658


In [171]:
# Same (region, season) expanding-mean baseline on data3
# (full train set + the reduced validation set), min_periods=1.
# NOTE(review): the window includes the current row's own severity - possible leakage; confirm.
grp_by_rs = data3.groupby(['region', 'season']).severity.expanding(1).mean()
grp_by_rs = grp_by_rs.map(np.round)

print(grp_by_rs.isna().sum())
# grp_by_rs.fillna(reg_sev_map, inplace=True)
# Fill any NaNs with the per-region mean (none exist with min_periods=1).
grp_by_rs.fillna(grp_by_rs.groupby('region').transform('mean'), inplace=True)

analyize_matches(val_data3_te_dist[val_data3_te_dist.index.isin(data3.index)].severity.sort_index(), grp_by_rs.droplevel(0).droplevel(0).loc[val_data3_te_dist.index].sort_index())
# RMSE on the validation rows, then on all of data3 (train + val together)
print("val rmse", rmse(val_data3_te_dist[val_data3_te_dist.index.isin(data3.index)].severity.sort_index(), grp_by_rs.droplevel(0).droplevel(0).loc[val_data3_te_dist.index].sort_index()))
print("trian rmse", rmse(data3[data3.index.isin(data3.index)].severity.sort_index(), grp_by_rs.droplevel(0).droplevel(0).loc[data3.index].sort_index()))

0
Exact matches:  0.3816793893129771
Missed by 1:  0.4351145038167939
Missed by 2:  0.1297709923664122
Missed by 3:  0.05343511450381679
Missed by 4:  0.0
val rmse 1.1979626470874598
trian rmse 0.8608408797388944


In [172]:
# (region, season) expanding mean on data3 with min_periods=3: the first two
# rows of every group are NaN.
# NOTE(review): the window includes the current row's own severity - possible leakage; confirm.
grp_by_rs = data3.groupby(['region', 'season']).severity.expanding(3).mean()
grp_by_rs = grp_by_rs.map(np.round)

print(grp_by_rs.isna().sum())
# grp_by_rs.fillna(reg_sev_map, inplace=True)
# Fill the warm-up NaNs with the per-region mean of the rounded expanding means
# (the fill value itself is not rounded).
grp_by_rs.fillna(grp_by_rs.groupby('region').transform('mean'), inplace=True)

analyize_matches(val_data3_te_dist[val_data3_te_dist.index.isin(data3.index)].severity.sort_index(), grp_by_rs.droplevel(0).droplevel(0).loc[val_data3_te_dist.index].sort_index())
# RMSE on the validation rows, then on all of data3 (train + val together)
print("val rmse", rmse(val_data3_te_dist[val_data3_te_dist.index.isin(data3.index)].severity.sort_index(), grp_by_rs.droplevel(0).droplevel(0).loc[val_data3_te_dist.index].sort_index()))
print("trian rmse", rmse(data3[data3.index.isin(data3.index)].severity.sort_index(), grp_by_rs.droplevel(0).droplevel(0).loc[data3.index].sort_index()))

32
Exact matches:  0.37404580152671757
Missed by 1:  0.4351145038167939
Missed by 2:  0.1297709923664122
Missed by 3:  0.05343511450381679
Missed by 4:  0.0
val rmse 1.2097315482354263
trian rmse 0.8618048550199326


In [173]:
# (region, season) expanding mean on data3 with min_periods=5: the first four
# rows of every group are NaN.
# NOTE(review): the window includes the current row's own severity - possible leakage; confirm.
grp_by_rs = data3.groupby(['region', 'season']).severity.expanding(5).mean()
grp_by_rs = grp_by_rs.map(np.round)

print(grp_by_rs.isna().sum())
# grp_by_rs.fillna(reg_sev_map, inplace=True)
# Fill the warm-up NaNs with the per-region mean of the rounded expanding means
# (the fill value itself is not rounded).
grp_by_rs.fillna(grp_by_rs.groupby('region').transform('mean'), inplace=True)

analyize_matches(val_data3_te_dist[val_data3_te_dist.index.isin(data3.index)].severity.sort_index(), grp_by_rs.droplevel(0).droplevel(0).loc[val_data3_te_dist.index].sort_index())
# RMSE on the validation rows, then on all of data3 (train + val together)
print("val rmse", rmse(val_data3_te_dist[val_data3_te_dist.index.isin(data3.index)].severity.sort_index(), grp_by_rs.droplevel(0).droplevel(0).loc[val_data3_te_dist.index].sort_index()))
print("trian rmse", rmse(data3[data3.index.isin(data3.index)].severity.sort_index(), grp_by_rs.droplevel(0).droplevel(0).loc[data3.index].sort_index()))

64
Exact matches:  0.37404580152671757
Missed by 1:  0.4351145038167939
Missed by 2:  0.1297709923664122
Missed by 3:  0.05343511450381679
Missed by 4:  0.0
val rmse 1.209740613618887
trian rmse 0.8618578815585727


In [170]:
# (region, season) expanding mean on data3 with min_periods=50: every group
# stays NaN until it has accumulated 50 observations.
# NOTE(review): the window includes the current row's own severity - possible leakage; confirm.
grp_by_rs = data3.groupby(['region', 'season']).severity.expanding(50).mean()
grp_by_rs = grp_by_rs.map(np.round)

print(grp_by_rs.isna().sum())
# grp_by_rs.fillna(reg_sev_map, inplace=True)
# Fill the warm-up NaNs with the per-region mean of the rounded expanding means.
# The fill value is not rounded, so filled rows match none of the integer
# buckets printed by analyize_matches (the printed shares sum to less than 1).
grp_by_rs.fillna(grp_by_rs.groupby('region').transform('mean'), inplace=True)

analyize_matches(val_data3_te_dist[val_data3_te_dist.index.isin(data3.index)].severity.sort_index(), grp_by_rs.droplevel(0).droplevel(0).loc[val_data3_te_dist.index].sort_index())
# RMSE on the validation rows, then on all of data3 (train + val together)
print("val rmse", rmse(val_data3_te_dist[val_data3_te_dist.index.isin(data3.index)].severity.sort_index(), grp_by_rs.droplevel(0).droplevel(0).loc[val_data3_te_dist.index].sort_index()))
print("trian rmse", rmse(data3[data3.index.isin(data3.index)].severity.sort_index(), grp_by_rs.droplevel(0).droplevel(0).loc[data3.index].sort_index()))

704
Exact matches:  0.3435114503816794
Missed by 1:  0.3969465648854962
Missed by 2:  0.11450381679389313
Missed by 3:  0.04580152671755725
Missed by 4:  0.0
val rmse 1.2065639638028516
trian rmse 0.8622279194818819


In [None]:
# if iam just like everyone else then how can i win everyone?

## Predicting Test set

In [51]:
test_data

Unnamed: 0,uid,latitude,longitude,date,split,year,month,week,season,region,severity,density
1,aabn,36.559700,-121.510000,2016-08-31,test,2016,8,35,3,west,,
12,aair,33.042600,-117.076000,2014-11-01,test,2014,11,44,4,west,,
14,aajw,40.703968,-80.293050,2015-08-26,test,2015,8,35,3,northeast,,
15,aalr,38.972500,-94.672930,2019-08-26,test,2019,8,35,3,midwest,,
16,aalw,34.279000,-118.905000,2018-01-08,test,2018,1,2,1,west,,
...,...,...,...,...,...,...,...,...,...,...,...,...
23556,zzpn,40.136410,-80.473740,2019-07-08,test,2019,7,28,3,northeast,,
23560,zzrv,36.875400,-121.561000,2019-09-17,test,2019,9,38,4,west,,
23563,zzsx,34.210000,-78.929389,2019-07-16,test,2019,7,29,3,south,,
23565,zzvv,36.708500,-121.749000,2014-12-02,test,2014,12,49,1,west,,


In [37]:
# %%time
# k = 5

# for row in tqdm(test_data[:100].itertuples(), total=test_data[:100].shape[0]):
#     severity = knn(row, train_data=train_data, k=k)
#     test_data.loc[test_data.uid == row.uid, f'nn{k}_guess'] = severity

In [186]:
%%time

# Parallelized version of the commented-out loop above: one `knn` call per
# test row, using all labelled rows as the neighbour pool.
# NOTE(review): k=20 is used here although the validation runs recorded
# earlier in this notebook gave their lowest RMSE around k=5 - confirm the choice.
k = 20
sev_list = Parallel(n_jobs=-1, backend='loky')([delayed(knn)(row, train_data=train_data, k=k) for row in test_data.itertuples()])
len(sev_list)

CPU times: total: 10min 50s
Wall time: 13min 32s


6510

In [187]:
test_data[f'nn{k}_guess'] = sev_list
test_data

Unnamed: 0,uid,latitude,longitude,date,split,year,month,week,season,region,severity,density,latlng,nn10_guess,nn20_guess
1,aabn,36.559700,-121.510000,2016-08-31,test,2016,8,35,3,west,,,36.5597_-121.51,4.0,4.0
12,aair,33.042600,-117.076000,2014-11-01,test,2014,11,44,4,west,,,33.0426_-117.076,4.0,4.0
14,aajw,40.703968,-80.293050,2015-08-26,test,2015,8,35,3,northeast,,,40.703968_-80.29305,2.0,1.0
15,aalr,38.972500,-94.672930,2019-08-26,test,2019,8,35,3,midwest,,,38.9725_-94.67293,4.0,3.0
16,aalw,34.279000,-118.905000,2018-01-08,test,2018,1,2,1,west,,,34.279_-118.905,4.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23556,zzpn,40.136410,-80.473740,2019-07-08,test,2019,7,28,3,northeast,,,40.13641_-80.47374,1.0,2.0
23560,zzrv,36.875400,-121.561000,2019-09-17,test,2019,9,38,4,west,,,36.8754_-121.561,4.0,4.0
23563,zzsx,34.210000,-78.929389,2019-07-16,test,2019,7,29,3,south,,,34.21_-78.9293894625038,1.0,1.0
23565,zzvv,36.708500,-121.749000,2014-12-02,test,2014,12,49,1,west,,,36.7085_-121.749,4.0,4.0


In [188]:
train_data['latlng'] = train_data.latitude.astype(str) + "_" + train_data.longitude.astype(str)
test_data['latlng'] = test_data.latitude.astype(str) + "_" + test_data.longitude.astype(str)

len(set(test_data['latlng']).intersection(set(train_data['latlng'])))/len(set(test_data['latlng']))

0.0

# Submission

In [189]:
# Build the submission from the k=20 nearest-neighbour guesses.

# Predictions are copied by position below, so the uid order of the template
# and of test_data must be identical.
assert sub_format.uid.equals(test_data.reset_index().uid) == True

sub_format.severity = test_data.nn20_guess.values

# Cast to integer severity classes; NOTE(review): this presumably raises if any guess is NaN.
sub_format.severity = sub_format.severity.astype(int)
# Share of each predicted severity class.
sub_format.severity.value_counts(normalize=True)

2    0.358372
4    0.301690
1    0.230415
3    0.109524
Name: severity, dtype: float64

In [190]:
# Write the submission without the index column.
# NOTE: the target directory name contains a space ('to submit').
sub_format.to_csv('../submissions/to submit/nn20_guess_preds.csv', index=False)

In [None]:
#  May be its better if start from 

# Sooo....

Why am I behind?

- I know for a fact that no one on the top is using images as features
- How the train test split was done :
    - No overlapping geo locations ( 0 % )
    - only 51% of test date_regs are in train vs 92% in val
    

# Todos :

- idk..
