# Dependencies and data

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from tqdm import tqdm
from geopy.distance import geodesic
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import mean_squared_error as mse
from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor, RandomForestClassifier, RandomForestRegressor, ExtraTreesClassifier, ExtraTreesRegressor

from joblib import dump, load, Parallel, delayed

In [3]:
metadata = pd.read_csv('../data/metadata.csv')
sub_format = pd.read_csv('../data/submission_format.csv')
train_labels = pd.read_csv('../data/train_labels.csv')

In [4]:
metadata.date = pd.to_datetime(metadata.date)
metadata['year'] = metadata.date.dt.year
metadata['month'] = metadata.date.dt.month
metadata['week'] = metadata.date.dt.isocalendar().week

seasons = {
    1: 1,
    2: 1,
    3: 2,
    4: 2,
    5: 2,
    6: 3,
    7: 3,
    8: 3,
    9: 4,
    10: 4,
    11: 4,
    12: 1
}

reg_sev_map = {
    'midwest': 2,
    'northeast': 2,
    'south' : 2,
    'west' : 4
}

reg_map = {
    'south' : 0,
    'northeast' : 1,
    'west' : 2,
    'midwest' : 3
}

metadata['season'] = metadata.month.map(seasons)

region = pd.concat((train_labels, sub_format[['region', 'uid']]), axis=0)

all_data = pd.merge(metadata, region, on='uid', how='left')
data = all_data.copy(deep=True)
data

Unnamed: 0,uid,latitude,longitude,date,split,year,month,week,season,region,severity,density
0,aabm,39.080319,-86.430867,2018-05-14,train,2018,5,20,2,midwest,1.0,585.0
1,aabn,36.559700,-121.510000,2016-08-31,test,2016,8,35,3,west,,
2,aacd,35.875083,-78.878434,2020-11-19,train,2020,11,47,4,south,1.0,290.0
3,aaee,35.487000,-79.062133,2016-08-24,train,2016,8,34,3,south,1.0,1614.0
4,aaff,38.049471,-99.827001,2019-07-23,train,2019,7,30,3,midwest,3.0,111825.0
...,...,...,...,...,...,...,...,...,...,...,...,...
23565,zzvv,36.708500,-121.749000,2014-12-02,test,2014,12,49,1,west,,
23566,zzwo,39.792190,-99.971050,2017-06-19,train,2017,6,25,3,midwest,2.0,48510.0
23567,zzwq,35.794000,-79.012551,2015-03-24,train,2015,3,13,2,south,1.0,1271.0
23568,zzyb,35.742000,-79.238600,2016-11-21,train,2016,11,47,4,south,1.0,9682.0


# Utils

In [6]:

def analyize_matches(y_true, y_pred, plot=False):
    print("Exact matches: ", sum(y_true == y_pred) / len(y_true))
    print("Missed by 1: ", sum(abs(y_true - y_pred) == 1) / len(y_true))
    print("Missed by 2: ", sum(abs(y_true - y_pred) == 2) / len(y_true))
    print("Missed by 3: ", sum(abs(y_true - y_pred) == 3) / len(y_true))
    print("Missed by 4: ", sum(abs(y_true - y_pred) == 4) / len(y_true))
    
    stupid_vals = []
    for i in range(1, 6):
        stupid_vals.append(
            ((sum([1 for x, y in zip(y_true, y_pred) if x == i and y == i])/len(y_true))*100, (sum(y_true == i)/len(y_true))*100)
            )

    print()
    for i in range(5):
        print(f"Severity {i+1} : accuracy: {np.round(stupid_vals[i][0], 3)} % - prevalence: {np.round(stupid_vals[i][1], 3)} %")
    
    try:
        print()
        print("Classification report:")
        print(classification_report(y_true, y_pred))
    except Exception as e:
        print(e)
        print("Classification report failed")
        
    if plot:
        print()
        sns.heatmap(confusion_matrix(y_true, y_pred), annot=True, fmt='d', cmap='Reds')


def rmse(y_true, y_pred):
    return mse(y_true, y_pred, squared=False)

def get_data_by_date( date=None, data=None):
    return data[data.date == date]


def get_distance(lat1, lon1, lat2, lon2):
    return geodesic((lat1, lon1), (lat2, lon2)).km

# Knn fts

In [23]:
def knn(row=None, train_data=None, k=1):
    """
    row : pd.Series (row from val_data)
    train_data : pd.DataFrame (all_data)
    k : int number of nearest neighbours to consider
    
    algo:
    1. Get past month data collected till the current row
    2. Get the k nearest neighbours (geodesic dist using lat, lng) from the above data
    3. Get the mean of the severity from the above rows
    4. Return the mean of the nearest neighbours severity

    """
    
    if row is None:
        print('Row None bruv!')
        return None
    
    uid = row.uid
    date = row.date
    region = row.region
    past_date = date - pd.Timedelta(days=30)
    
    past_month_data = train_data[(train_data.date < date) & (train_data.date >= past_date)]
    past_month_data.sort_values(by='date', inplace=True)
    
    # if no past data, return the mean of the region
    if len(past_month_data) == 0:
        return reg_sev_map[region]
        
    dist_matrix =pd.DataFrame(columns=['uid', 'dist'])       # 0th col for uid, 1st col for dist
    for i, past_row in enumerate(past_month_data.itertuples()):
        dist_matrix.loc[i, 'uid'] = past_row.uid
        dist_matrix.loc[i, 'dist'] = get_distance(row.latitude, row.longitude, past_row.latitude, past_row.longitude)   # returns geodesic dist in km

    # get mean of top k nearest neighbours
    n_uids = dist_matrix.sort_values(by='dist').head(k).uid.values
    nn_severity = train_data[train_data.uid.isin(n_uids)].severity.mean()
    

    if nn_severity not in range(1, 6):
        return reg_sev_map[region]

    return np.round(nn_severity)

In [8]:
def knn_wrapper(data, k=5):
    preds = []
    for row in tqdm(data.itertuples(), total=len(data)):
        severity = knn(row, train_data=data, k=15)
        preds.append(severity)
    return preds

In [9]:
# def knn_wrapper(data, k=5):
#     sev_list = Parallel(n_jobs=-1, backend='threading')([delayed(knn)(row, train_data=data, k=k) for row in data.itertuples()])
#     return sev_list

In [10]:
data = data.sort_values(by='date')
data

Unnamed: 0,uid,latitude,longitude,date,split,year,month,week,season,region,severity,density
4387,evep,44.847993,-93.476318,2013-01-04,train,2013,1,1,1,midwest,1.0,115.0
13644,paev,44.822478,-93.367962,2013-01-04,train,2013,1,1,1,midwest,1.0,1884.0
5566,gdxr,44.877646,-93.557842,2013-01-04,train,2013,1,1,1,midwest,1.0,1416.0
6144,guny,44.878889,-93.490833,2013-01-04,train,2013,1,1,1,midwest,1.0,558.0
5317,fwbt,44.850500,-93.515700,2013-01-04,train,2013,1,1,1,midwest,1.0,476.0
...,...,...,...,...,...,...,...,...,...,...,...,...
12443,nsoi,36.736800,-121.734000,2021-12-29,test,2021,12,52,1,west,,
17559,thki,36.725400,-121.730000,2021-12-29,test,2021,12,52,1,west,,
17452,teuu,36.772300,-121.788000,2021-12-29,test,2021,12,52,1,west,,
14254,prfi,36.751800,-121.742000,2021-12-29,test,2021,12,52,1,west,,


In [17]:
cut = round(23570/4)
cut

5892

In [None]:
d = [data[:cut], data[cut:2*cut], data[2*cut:3*cut], data[3*cut:]]
all_preds = []
for data_subset in d:
    preds = knn_wrapper(data_subset, k=15)
    all_preds.extend(preds)

In [20]:
%%time
all_preds = Parallel(n_jobs=-1, backend='loky')([delayed(knn_wrapper)(data_subset, k=15) for data_subset in d])

In [24]:
all_preds = [item for sublist in all_preds for item in sublist]
len(all_preds)

23570

In [25]:
preds = pd.Series(all_preds)
preds.isna().sum()

1537

In [36]:
nan_preds = preds[preds.isna()]
new_nan_preds = data.loc[nan_preds.index].region.map(reg_sev_map)
new_nan_preds

80       4
112      2
114      2
118      2
122      2
        ..
23565    4
23566    2
23567    2
23568    2
23569    2
Name: region, Length: 1537, dtype: int64

In [37]:
preds = preds.fillna(new_nan_preds)
preds.isna().sum()

0

In [38]:
rmse(data[data.split == 'train']['severity'], preds[data[data.split == 'train'].index])

1.6195790633128162

In [108]:
preds.to_csv('all_knn15_preds.csv', index=False)

In [23]:
# %%time

# tar_data = data
# knn15_preds = []
# for row in tqdm(tar_data.itertuples(), total=len(tar_data)):
#     severity = knn(row, train_data=tar_data, k=15)
#     knn15_preds.append(severity)

 48%|████▊     | 11260/23570 [33:48<1:28:34,  2.32it/s]

# Add fts

In [39]:
data

Unnamed: 0,uid,latitude,longitude,date,split,year,month,week,season,region,severity,density
4387,evep,44.847993,-93.476318,2013-01-04,train,2013,1,1,1,midwest,1.0,115.0
13644,paev,44.822478,-93.367962,2013-01-04,train,2013,1,1,1,midwest,1.0,1884.0
5566,gdxr,44.877646,-93.557842,2013-01-04,train,2013,1,1,1,midwest,1.0,1416.0
6144,guny,44.878889,-93.490833,2013-01-04,train,2013,1,1,1,midwest,1.0,558.0
5317,fwbt,44.850500,-93.515700,2013-01-04,train,2013,1,1,1,midwest,1.0,476.0
...,...,...,...,...,...,...,...,...,...,...,...,...
12443,nsoi,36.736800,-121.734000,2021-12-29,test,2021,12,52,1,west,,
17559,thki,36.725400,-121.730000,2021-12-29,test,2021,12,52,1,west,,
17452,teuu,36.772300,-121.788000,2021-12-29,test,2021,12,52,1,west,,
14254,prfi,36.751800,-121.742000,2021-12-29,test,2021,12,52,1,west,,


In [54]:
data.groupby(['year', 'month', 'region']).severity.mean().reset_index()

Unnamed: 0,year,month,region,severity
0,2013,1,midwest,1.000000
1,2013,1,south,1.222222
2,2013,1,west,3.785714
3,2013,2,midwest,1.000000
4,2013,2,south,1.090909
...,...,...,...,...
364,2021,11,south,1.739130
365,2021,11,west,4.000000
366,2021,12,northeast,
367,2021,12,south,1.625000


In [57]:
train_data_ = data[data.split == 'train']
train_data_.severity.value_counts(normalize=True)

1.0    0.439449
4.0    0.207913
2.0    0.189859
3.0    0.159379
5.0    0.003400
Name: severity, dtype: float64

In [111]:
grp_by_r = data.groupby(['region']).severity.mean()
grp_by_r = grp_by_r.map(np.round)
print(grp_by_r.isna().sum())  
grp_by_r.fillna(grp_by_r.groupby('region').transform('mean'), inplace=True)
print(grp_by_r.isna().sum())

grp_by_r_preds = data[['region']].apply(lambda x: grp_by_r[x.region], axis=1)
analyize_matches(train_data_.severity.sort_index(), grp_by_r_preds.loc[train_data_.index].sort_index())
rmse(train_data_.severity.sort_index(), grp_by_r_preds.loc[train_data_.index].sort_index())

0
0
Exact matches:  0.37116060961313013
Missed by 1:  0.5907385697538101
Missed by 2:  0.02661195779601407
Missed by 3:  0.011488862837045722
Missed by 4:  0.0

Severity 1 : accuracy: 0.0 % - prevalence: 43.945 %
Severity 2 : accuracy: 18.388 % - prevalence: 18.986 %
Severity 3 : accuracy: 0.0 % - prevalence: 15.938 %
Severity 4 : accuracy: 18.728 % - prevalence: 20.791 %
Severity 5 : accuracy: 0.0 % - prevalence: 0.34 %

Classification report:
              precision    recall  f1-score   support

         1.0       0.00      0.00      0.00      7497
         2.0       0.24      0.97      0.38      3239
         3.0       0.00      0.00      0.00      2719
         4.0       0.85      0.90      0.87      3547
         5.0       0.00      0.00      0.00        58

    accuracy                           0.37     17060
   macro avg       0.22      0.37      0.25     17060
weighted avg       0.22      0.37      0.25     17060



0.894754808017972

In [112]:
grp_by_r = data.groupby(['month']).severity.mean()
grp_by_r = grp_by_r.map(np.round)
print(grp_by_r.isna().sum())  
grp_by_r.fillna(grp_by_r.groupby('month').transform('mean'), inplace=True)
print(grp_by_r.isna().sum())

grp_by_r_preds = data[['month']].apply(lambda x: grp_by_r[x.month], axis=1)
analyize_matches(train_data_.severity.sort_index(), grp_by_r_preds.loc[train_data_.index].sort_index())
rmse(train_data_.severity.sort_index(), grp_by_r_preds.loc[train_data_.index].sort_index())

0
0
Exact matches:  0.18856975381008206
Missed by 1:  0.6035169988276671
Missed by 2:  0.20463071512309497
Missed by 3:  0.0032825322391559202
Missed by 4:  0.0

Severity 1 : accuracy: 0.0 % - prevalence: 43.945 %
Severity 2 : accuracy: 18.312 % - prevalence: 18.986 %
Severity 3 : accuracy: 0.545 % - prevalence: 15.938 %
Severity 4 : accuracy: 0.0 % - prevalence: 20.791 %
Severity 5 : accuracy: 0.0 % - prevalence: 0.34 %

Classification report:
              precision    recall  f1-score   support

         1.0       0.00      0.00      0.00      7497
         2.0       0.20      0.96      0.33      3239
         3.0       0.07      0.03      0.05      2719
         4.0       0.00      0.00      0.00      3547
         5.0       0.00      0.00      0.00        58

    accuracy                           0.19     17060
   macro avg       0.05      0.20      0.08     17060
weighted avg       0.05      0.19      0.07     17060



1.2048164380819388

In [113]:
grp_by_r = data.groupby(['season']).severity.mean()
grp_by_r = grp_by_r.map(np.round)
print(grp_by_r.isna().sum())  
grp_by_r.fillna(grp_by_r.groupby('season').transform('mean'), inplace=True)
print(grp_by_r.isna().sum())

grp_by_r_preds = data[['season']].apply(lambda x: grp_by_r[x.season], axis=1)
analyize_matches(train_data_.severity.sort_index(), grp_by_r_preds.loc[train_data_.index].sort_index())
rmse(train_data_.severity.sort_index(), grp_by_r_preds.loc[train_data_.index].sort_index())

0
0
Exact matches:  0.18985932004689332
Missed by 1:  0.5988276670574443
Missed by 2:  0.2079132473622509
Missed by 3:  0.003399765533411489
Missed by 4:  0.0

Severity 1 : accuracy: 0.0 % - prevalence: 43.945 %
Severity 2 : accuracy: 18.986 % - prevalence: 18.986 %
Severity 3 : accuracy: 0.0 % - prevalence: 15.938 %
Severity 4 : accuracy: 0.0 % - prevalence: 20.791 %
Severity 5 : accuracy: 0.0 % - prevalence: 0.34 %

Classification report:
              precision    recall  f1-score   support

         1.0       0.00      0.00      0.00      7497
         2.0       0.19      1.00      0.32      3239
         3.0       0.00      0.00      0.00      2719
         4.0       0.00      0.00      0.00      3547
         5.0       0.00      0.00      0.00        58

    accuracy                           0.19     17060
   macro avg       0.04      0.20      0.06     17060
weighted avg       0.04      0.19      0.06     17060



1.2087508206024726

In [114]:
grp_by_r = data.groupby(['year']).severity.mean()
grp_by_r = grp_by_r.map(np.round)
print(grp_by_r.isna().sum())  
grp_by_r.fillna(grp_by_r.groupby('year').transform('mean'), inplace=True)
print(grp_by_r.isna().sum())

grp_by_r_preds = data[['year']].apply(lambda x: grp_by_r[x.year], axis=1)
analyize_matches(train_data_.severity.sort_index(), grp_by_r_preds.loc[train_data_.index].sort_index())
rmse(train_data_.severity.sort_index(), grp_by_r_preds.loc[train_data_.index].sort_index())

0
0
Exact matches:  0.18985932004689332
Missed by 1:  0.5988276670574443
Missed by 2:  0.2079132473622509
Missed by 3:  0.003399765533411489
Missed by 4:  0.0

Severity 1 : accuracy: 0.0 % - prevalence: 43.945 %
Severity 2 : accuracy: 18.986 % - prevalence: 18.986 %
Severity 3 : accuracy: 0.0 % - prevalence: 15.938 %
Severity 4 : accuracy: 0.0 % - prevalence: 20.791 %
Severity 5 : accuracy: 0.0 % - prevalence: 0.34 %

Classification report:
              precision    recall  f1-score   support

         1.0       0.00      0.00      0.00      7497
         2.0       0.19      1.00      0.32      3239
         3.0       0.00      0.00      0.00      2719
         4.0       0.00      0.00      0.00      3547
         5.0       0.00      0.00      0.00        58

    accuracy                           0.19     17060
   macro avg       0.04      0.20      0.06     17060
weighted avg       0.04      0.19      0.06     17060



1.2087508206024726

In [115]:
grp_by_r = data.groupby(['week']).severity.mean()
grp_by_r = grp_by_r.map(np.round)
print(grp_by_r.isna().sum())  
grp_by_r.fillna(grp_by_r.groupby('week').transform('mean'), inplace=True)
print(grp_by_r.isna().sum())

grp_by_r_preds = data[['week']].apply(lambda x: grp_by_r[x.week], axis=1)
analyize_matches(train_data_.severity.sort_index(), grp_by_r_preds.loc[train_data_.index].sort_index())
rmse(train_data_.severity.sort_index(), grp_by_r_preds.loc[train_data_.index].sort_index())

1
1
Exact matches:  0.2026377491207503
Missed by 1:  0.6036928487690504
Missed by 2:  0.1888042203985932
Missed by 3:  0.004865181711606096
Missed by 4:  0.0

Severity 1 : accuracy: 1.002 % - prevalence: 43.945 %
Severity 2 : accuracy: 17.761 % - prevalence: 18.986 %
Severity 3 : accuracy: 0.891 % - prevalence: 15.938 %
Severity 4 : accuracy: 0.61 % - prevalence: 20.791 %
Severity 5 : accuracy: 0.0 % - prevalence: 0.34 %

Classification report:
              precision    recall  f1-score   support

         1.0       0.83      0.02      0.04      7497
         2.0       0.21      0.94      0.34      3239
         3.0       0.08      0.06      0.06      2719
         4.0       0.81      0.03      0.06      3547
         5.0       0.00      0.00      0.00        58

    accuracy                           0.20     17060
   macro avg       0.39      0.21      0.10     17060
weighted avg       0.59      0.20      0.11     17060



1.1843548310231518

In [None]:
grp_by_r = data.groupby(['week']).severity.mean()
grp_by_r = grp_by_r.map(np.round)
print(grp_by_r.isna().sum())  
grp_by_r.fillna(grp_by_r.groupby('week').transform('mean'), inplace=True)
print(grp_by_r.isna().sum())

grp_by_r_preds = data[['week']].apply(lambda x: grp_by_r[x.week], axis=1)
analyize_matches(train_data_.severity.sort_index(), grp_by_r_preds.loc[train_data_.index].sort_index())
rmse(train_data_.severity.sort_index(), grp_by_r_preds.loc[train_data_.index].sort_index())

In [75]:
exp_grp_by_rm = data.groupby(['region', 'month']).severity.expanding(1).mean()
exp_grp_by_rm = exp_grp_by_rm.map(np.round)
print(exp_grp_by_rm.isna().sum())  
exp_grp_by_rm.droplevel(0).droplevel(0)
exp_grp_by_rm.fillna(exp_grp_by_rm.groupby('region').transform('mean'), inplace=True)
print(exp_grp_by_rm.isna().sum())  
analyize_matches(train_data_.severity.sort_index(), exp_grp_by_rm.droplevel(0).droplevel(0).loc[train_data_.index].sort_index())
rmse(train_data_.severity.sort_index(), np.round(exp_grp_by_rm.droplevel(0).droplevel(0).loc[train_data_.index].sort_index()))

72
0
Exact matches:  0.5014067995310668
Missed by 1:  0.44021101992966
Missed by 2:  0.04824150058616647
Missed by 3:  0.009730363423212192
Missed by 4:  0.00041031652989449003

Severity 1 : accuracy: 19.965 % - prevalence: 43.945 %
Severity 2 : accuracy: 12.767 % - prevalence: 18.986 %
Severity 3 : accuracy: 0.545 % - prevalence: 15.938 %
Severity 4 : accuracy: 16.864 % - prevalence: 20.791 %
Severity 5 : accuracy: 0.0 % - prevalence: 0.34 %

Classification report:
              precision    recall  f1-score   support

         1.0       0.72      0.45      0.56      7497
         2.0       0.26      0.67      0.37      3239
         3.0       0.18      0.03      0.06      2719
         4.0       0.86      0.81      0.84      3547
         5.0       0.00      0.00      0.00        58

    accuracy                           0.50     17060
   macro avg       0.40      0.39      0.36     17060
weighted avg       0.57      0.50      0.50     17060



0.8528278592784991

In [106]:
grp_by_rm = data.groupby(['region', 'month']).severity.mean()
grp_by_rm = grp_by_rm.map(np.round)
print(grp_by_rm.isna().sum())  
grp_by_rm.fillna(grp_by_rm.groupby('region').transform('mean'), inplace=True)
print(grp_by_rm.isna().sum())

grp_by_rm_preds = data[['region', 'month']].apply(lambda x: grp_by_rm[x.region, x.month], axis=1)
analyize_matches(train_data_.severity.sort_index(), grp_by_rm_preds.loc[train_data_.index].sort_index())
rmse(train_data_.severity.sort_index(), grp_by_rm_preds.loc[train_data_.index].sort_index())

1
0
Exact matches:  0.4742672919109027
Missed by 1:  0.4705158264947245
Missed by 2:  0.04660023446658851
Missed by 3:  0.008382180539273153
Missed by 4:  0.00023446658851113716

Severity 1 : accuracy: 15.768 % - prevalence: 43.945 %
Severity 2 : accuracy: 13.271 % - prevalence: 18.986 %
Severity 3 : accuracy: 2.069 % - prevalence: 15.938 %
Severity 4 : accuracy: 16.319 % - prevalence: 20.791 %
Severity 5 : accuracy: 0.0 % - prevalence: 0.34 %

Classification report:
              precision    recall  f1-score   support

         1.0       0.73      0.36      0.48      7497
         2.0       0.25      0.70      0.37      3239
         3.0       0.29      0.13      0.18      2719
         4.0       0.86      0.78      0.82      3547
         5.0       0.00      0.00      0.00        58

    accuracy                           0.47     17060
   macro avg       0.43      0.39      0.37     17060
weighted avg       0.59      0.47      0.48     17060



0.8579672806294627

In [101]:
exp_grp_by_rmy = data.groupby(['region', 'month', 'year']).severity.expanding(1).mean()
exp_grp_by_rmy = exp_grp_by_rmy.map(np.round)
print(exp_grp_by_rmy.isna().sum())  
exp_grp_by_rmy.droplevel(0).droplevel(0)
exp_grp_by_rmy.fillna(exp_grp_by_rmy.groupby('region').transform('mean'), inplace=True)
print(exp_grp_by_rmy.isna().sum())  
analyize_matches(train_data_.severity.sort_index(), exp_grp_by_rmy.droplevel(0).droplevel(0).droplevel(0).loc[train_data_.index].sort_index())
rmse(train_data_.severity.sort_index(), np.round(exp_grp_by_rmy.droplevel(0).droplevel(0).droplevel(0).loc[train_data_.index].sort_index()))

611
0
Exact matches:  0.5368112543962485
Missed by 1:  0.41770222743259083
Missed by 2:  0.04044548651817116
Missed by 3:  0.004865181711606096
Missed by 4:  0.00017584994138335287

Severity 1 : accuracy: 20.991 % - prevalence: 43.945 %
Severity 2 : accuracy: 13.118 % - prevalence: 18.986 %
Severity 3 : accuracy: 2.778 % - prevalence: 15.938 %
Severity 4 : accuracy: 16.758 % - prevalence: 20.791 %
Severity 5 : accuracy: 0.035 % - prevalence: 0.34 %

Classification report:
              precision    recall  f1-score   support

         1.0       0.77      0.48      0.59      7497
         2.0       0.28      0.69      0.40      3239
         3.0       0.39      0.17      0.24      2719
         4.0       0.90      0.81      0.85      3547
         5.0       1.00      0.10      0.19        58

    accuracy                           0.54     17060
   macro avg       0.67      0.45      0.45     17060
weighted avg       0.64      0.54      0.55     17060



0.7912549576286168

In [105]:
grp_by_rmy = data.groupby(['region', 'month', 'year']).severity.mean()
grp_by_rmy = grp_by_rmy.map(np.round)
print(grp_by_rmy.isna().sum())  
grp_by_rmy.fillna(grp_by_rmy.groupby('region').transform('mean'), inplace=True)
print(grp_by_rmy.isna().sum())

grp_by_rmy_preds = data[['region', 'month', 'year']].apply(lambda x: grp_by_rmy[x.region, x.month, x.year], axis=1)
analyize_matches(train_data_.severity.sort_index(), grp_by_rmy_preds.loc[train_data_.index].sort_index())
rmse(train_data_.severity.sort_index(), grp_by_rmy_preds.loc[train_data_.index].sort_index())

20
0
Exact matches:  0.5093200468933177
Missed by 1:  0.4429073856975381
Missed by 2:  0.04220398593200469
Missed by 3:  0.005392731535756155
Missed by 4:  0.00017584994138335287

Severity 1 : accuracy: 18.576 % - prevalence: 43.945 %
Severity 2 : accuracy: 13.212 % - prevalence: 18.986 %
Severity 3 : accuracy: 2.532 % - prevalence: 15.938 %
Severity 4 : accuracy: 16.612 % - prevalence: 20.791 %
Severity 5 : accuracy: 0.0 % - prevalence: 0.34 %

Classification report:
              precision    recall  f1-score   support

         1.0       0.76      0.42      0.54      7497
         2.0       0.27      0.70      0.39      3239
         3.0       0.33      0.16      0.22      2719
         4.0       0.89      0.80      0.84      3547
         5.0       0.00      0.00      0.00        58

    accuracy                           0.51     17060
   macro avg       0.45      0.42      0.40     17060
weighted avg       0.62      0.51      0.52     17060



0.814292031343483

In [110]:
grp_by_rms = data.groupby(['region', 'month', 'season']).severity.mean()
grp_by_rms = grp_by_rms.map(np.round)
print(grp_by_rms.isna().sum())  
grp_by_rms.fillna(grp_by_rms.groupby('region').transform('mean'), inplace=True)
print(grp_by_rms.isna().sum())

grp_by_rmy_preds = data[['region', 'month', 'season']].apply(lambda x: grp_by_rms[x.region, x.month, x.season], axis=1)
analyize_matches(train_data_.severity.sort_index(), grp_by_rmy_preds.loc[train_data_.index].sort_index())
rmse(train_data_.severity.sort_index(), grp_by_rmy_preds.loc[train_data_.index].sort_index())

1
0
Exact matches:  0.4742672919109027
Missed by 1:  0.4705158264947245
Missed by 2:  0.04660023446658851
Missed by 3:  0.008382180539273153
Missed by 4:  0.00023446658851113716

Severity 1 : accuracy: 15.768 % - prevalence: 43.945 %
Severity 2 : accuracy: 13.271 % - prevalence: 18.986 %
Severity 3 : accuracy: 2.069 % - prevalence: 15.938 %
Severity 4 : accuracy: 16.319 % - prevalence: 20.791 %
Severity 5 : accuracy: 0.0 % - prevalence: 0.34 %

Classification report:
              precision    recall  f1-score   support

         1.0       0.73      0.36      0.48      7497
         2.0       0.25      0.70      0.37      3239
         3.0       0.29      0.13      0.18      2719
         4.0       0.86      0.78      0.82      3547
         5.0       0.00      0.00      0.00        58

    accuracy                           0.47     17060
   macro avg       0.43      0.39      0.37     17060
weighted avg       0.59      0.47      0.48     17060



0.8579672806294627

# Xgboost

In [9]:
data = data.sort_values(by='date')
data

Unnamed: 0,uid,latitude,longitude,date,split,year,month,week,season,region,severity,density
4387,evep,44.847993,-93.476318,2013-01-04,train,2013,1,1,1,midwest,1.0,115.0
13644,paev,44.822478,-93.367962,2013-01-04,train,2013,1,1,1,midwest,1.0,1884.0
5566,gdxr,44.877646,-93.557842,2013-01-04,train,2013,1,1,1,midwest,1.0,1416.0
6144,guny,44.878889,-93.490833,2013-01-04,train,2013,1,1,1,midwest,1.0,558.0
5317,fwbt,44.850500,-93.515700,2013-01-04,train,2013,1,1,1,midwest,1.0,476.0
...,...,...,...,...,...,...,...,...,...,...,...,...
12443,nsoi,36.736800,-121.734000,2021-12-29,test,2021,12,52,1,west,,
17559,thki,36.725400,-121.730000,2021-12-29,test,2021,12,52,1,west,,
17452,teuu,36.772300,-121.788000,2021-12-29,test,2021,12,52,1,west,,
14254,prfi,36.751800,-121.742000,2021-12-29,test,2021,12,52,1,west,,


In [121]:
data.drop('latlng', axis=1, inplace=True)
data

Unnamed: 0,uid,latitude,longitude,date,split,year,month,week,season,region,severity,density
4387,evep,44.847993,-93.476318,2013-01-04,train,2013,1,1,1,midwest,1.0,115.0
13644,paev,44.822478,-93.367962,2013-01-04,train,2013,1,1,1,midwest,1.0,1884.0
5566,gdxr,44.877646,-93.557842,2013-01-04,train,2013,1,1,1,midwest,1.0,1416.0
6144,guny,44.878889,-93.490833,2013-01-04,train,2013,1,1,1,midwest,1.0,558.0
5317,fwbt,44.850500,-93.515700,2013-01-04,train,2013,1,1,1,midwest,1.0,476.0
...,...,...,...,...,...,...,...,...,...,...,...,...
12443,nsoi,36.736800,-121.734000,2021-12-29,test,2021,12,52,1,west,,
17559,thki,36.725400,-121.730000,2021-12-29,test,2021,12,52,1,west,,
17452,teuu,36.772300,-121.788000,2021-12-29,test,2021,12,52,1,west,,
14254,prfi,36.751800,-121.742000,2021-12-29,test,2021,12,52,1,west,,


In [10]:
train, test = train_test_split(data[data.split == 'train'], test_size=0.05, random_state=42, shuffle=True)
train.shape, test.shape

((16207, 12), (853, 12))

In [11]:
train.region = train.region.map(reg_map)
test.region = test.region.map(reg_map)

train.week = train.week.astype('int')
test.week = test.week.astype('int')

In [12]:
drop_cols = ['uid', 'split', 'date', 'severity', 'density']

X_train = train.drop(drop_cols, axis=1)
y_train = train['severity']

X_val = test.drop(drop_cols, axis=1)
y_val = test['severity']

X_train.shape, y_train.shape, X_val.shape, y_val.shape

((16207, 7), (16207,), (853, 7), (853,))

In [13]:
y_train.value_counts(normalize=True), y_val.value_counts(normalize=True)

(1.0    0.438699
 4.0    0.207194
 2.0    0.190967
 3.0    0.159807
 5.0    0.003332
 Name: severity, dtype: float64,
 1.0    0.453693
 4.0    0.221571
 2.0    0.168816
 3.0    0.151231
 5.0    0.004689
 Name: severity, dtype: float64)

In [14]:
X_val

Unnamed: 0,latitude,longitude,year,month,week,season,region
3823,35.643750,-79.279197,2019,12,49,1,0
21145,35.628766,-79.307028,2015,3,13,2,0
2979,40.108330,-75.864280,2019,4,14,2,1
16401,35.980000,-78.814305,2019,11,47,4,0
1476,37.967400,-121.464000,2013,7,29,3,2
...,...,...,...,...,...,...,...
22078,37.441900,-121.003000,2021,7,28,3,2
14008,35.636671,-79.292347,2015,12,50,1,0
15383,35.690086,-79.193133,2019,7,27,3,0
17739,38.060900,-121.209000,2017,5,20,2,2


In [29]:
%%time

xgb_reg = XGBRegressor(
    n_estimators=4000, 
    max_depth=5, 
    learning_rate=0.02, 
    tree_method='gpu_hist',
    gpu_id=0, 
    n_jobs=-1, 
    verbose=1, 
    random_state=42,
    )

xgb_reg.fit(X_train, y_train)
preds = xgb_reg.predict(X_val)
preds = pd.Series(np.round(preds)).clip(1, 5).values
print("train rmse", rmse(y_train, xgb_reg.predict(X_train)))
print("test rmse:", rmse(y_val, preds))

Parameters: { "verbose" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


train rmse 0.5272982208913567
test rmse: 0.679631649768417
CPU times: total: 12.1 s
Wall time: 20.3 s


In [21]:
analyize_matches(y_val, preds)

Exact matches:  0.652989449003517
Missed by 1:  0.3071512309495897
Missed by 2:  0.03985932004689332
Missed by 3:  0.0
Missed by 4:  0.0

Severity 1 : accuracy: 29.074 % - prevalence: 45.369 %
Severity 2 : accuracy: 10.199 % - prevalence: 16.882 %
Severity 3 : accuracy: 5.744 % - prevalence: 15.123 %
Severity 4 : accuracy: 20.281 % - prevalence: 22.157 %
Severity 5 : accuracy: 0.0 % - prevalence: 0.469 %

Classification report:
              precision    recall  f1-score   support

         1.0       0.82      0.64      0.72       387
         2.0       0.32      0.60      0.41       144
         3.0       0.54      0.38      0.45       129
         4.0       0.95      0.92      0.93       189
         5.0       0.00      0.00      0.00         4

    accuracy                           0.65       853
   macro avg       0.52      0.51      0.50       853
weighted avg       0.72      0.65      0.67       853



In [69]:
X_val.isin(X_train).all()

latitude     False
longitude    False
year         False
month        False
week         False
season       False
region       False
dtype: bool

# Test predictions

In [204]:
# what the fucking fuck?!!!!

all_train = all_data[all_data.split == 'train']
all_test = all_data[all_data.split == 'test']

all_train.region = all_train.region.map(reg_map)
all_test.region = all_test.region.map(reg_map)

all_train.week = all_train.week.astype('int')
all_test.week = all_test.week.astype('int')

drop_cols = ['uid', 'split', 'date', 'severity', 'density']
all_train.shape, all_test.shape

((17060, 12), (6510, 12))

In [205]:
X = all_train.drop(drop_cols, axis=1)
y = all_train['severity']
X_test = all_test.drop(drop_cols, axis=1)

X.shape, y.shape, X_test.shape

((17060, 7), (17060,), (6510, 7))

In [206]:
# hgb_reg = HistGradientBoostingRegressor(random_state=42, learning_rate=0.05, max_iter=300, early_stopping=True, scoring='loss')
# hgb_reg.fit(X, y)
# test_preds = hgb_reg.predict(X_test)
# test_preds = pd.Series(np.round(test_preds)).clip(1, 5).values
# print("train rmse", rmse(y, hgb_reg.predict(X)))

train rmse 0.5917295419527717


In [207]:
y.value_counts(normalize=True)

1.0    0.439449
4.0    0.207913
2.0    0.189859
3.0    0.159379
5.0    0.003400
Name: severity, dtype: float64

In [208]:
pd.Series(test_preds).value_counts(normalize=True)

2.0    0.390015
1.0    0.277573
4.0    0.208141
3.0    0.124270
dtype: float64

# Submission

In [209]:
sub_format.severity = test_preds.astype(int)
display(sub_format.sample(5))
sub_format.to_csv('../submissions/to submit/idkanything_histgrad.csv', index=False)

Unnamed: 0,uid,region,severity
6329,ziou,midwest,3
5274,vblu,northeast,2
4418,rsos,west,2
213,axlb,northeast,2
5977,xxxb,west,4
