# Dependencies and data

In [161]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
warnings.filterwarnings('ignore')

from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import mean_squared_error as mse

In [87]:
metadata = pd.read_csv('../data/metadata.csv')
sub_format = pd.read_csv('../data/submission_format.csv')
train_labels = pd.read_csv('../data/train_labels.csv')

In [89]:
metadata.date = pd.to_datetime(metadata.date)
metadata['year'] = metadata.date.dt.year
metadata['month'] = metadata.date.dt.month
metadata['week'] = metadata.date.dt.isocalendar().week

seasons = {
    1: 1,
    2: 1,
    3: 2,
    4: 2,
    5: 2,
    6: 3,
    7: 3,
    8: 3,
    9: 4,
    10: 4,
    11: 4,
    12: 1
}

# reg_sev_map = {
#     'midwest': 2,
#     'northeast': 2,
#     'south' : 2,
#     'west' : 4
# }

reg_map = {
    'south' : 0,
    'northeast' : 1,
    'west' : 2,
    'midwest' : 3
}

metadata['season'] = metadata.month.map(seasons)

region = pd.concat((train_labels, sub_format[['region', 'uid']]), axis=0)

all_data = pd.merge(metadata, region, on='uid', how='left')
data = all_data.copy(deep=True)
data

Unnamed: 0,uid,latitude,longitude,date,split,year,month,week,season,region,severity,density
0,aabm,39.080319,-86.430867,2018-05-14,train,2018,5,20,2,midwest,1.0,585.0
1,aabn,36.559700,-121.510000,2016-08-31,test,2016,8,35,3,west,,
2,aacd,35.875083,-78.878434,2020-11-19,train,2020,11,47,4,south,1.0,290.0
3,aaee,35.487000,-79.062133,2016-08-24,train,2016,8,34,3,south,1.0,1614.0
4,aaff,38.049471,-99.827001,2019-07-23,train,2019,7,30,3,midwest,3.0,111825.0
...,...,...,...,...,...,...,...,...,...,...,...,...
23565,zzvv,36.708500,-121.749000,2014-12-02,test,2014,12,49,1,west,,
23566,zzwo,39.792190,-99.971050,2017-06-19,train,2017,6,25,3,midwest,2.0,48510.0
23567,zzwq,35.794000,-79.012551,2015-03-24,train,2015,3,13,2,south,1.0,1271.0
23568,zzyb,35.742000,-79.238600,2016-11-21,train,2016,11,47,4,south,1.0,9682.0


# Utils

In [133]:

def analyize_matches(y_true, y_pred, plot=False):
    print("Exact matches: ", sum(y_true == y_pred) / len(y_true))
    print("Missed by 1: ", sum(abs(y_true - y_pred) == 1) / len(y_true))
    print("Missed by 2: ", sum(abs(y_true - y_pred) == 2) / len(y_true))
    print("Missed by 3: ", sum(abs(y_true - y_pred) == 3) / len(y_true))
    print("Missed by 4: ", sum(abs(y_true - y_pred) == 4) / len(y_true))
    
    stupid_vals = []
    for i in range(1, 6):
        stupid_vals.append(
            ((sum([1 for x, y in zip(y_true, y_pred) if x == i and y == i])/len(y_true))*100, (sum(y_true == i)/len(y_true))*100)
            )

    print()
    for i in range(5):
        print(f"Severity {i+1} : accuracy: {np.round(stupid_vals[i][0], 3)} % - prevalence: {np.round(stupid_vals[i][1], 3)} %")
    
    try:
        print()
        print("Classification report:")
        print(classification_report(y_true, y_pred))
    except Exception as e:
        print(e)
        print("Classification report failed")
        
    if plot:
        print()
        sns.heatmap(confusion_matrix(y_true, y_pred), annot=True, fmt='d', cmap='Reds')


def rmse(y_true, y_pred):
    return mse(y_true, y_pred, squared=False)

# Xgboost

In [224]:
data = data.sort_values(by='date')
data

Unnamed: 0,uid,latitude,longitude,date,split,year,month,week,season,region,severity,density
4387,evep,44.847993,-93.476318,2013-01-04,train,2013,1,1,1,midwest,1.0,115.0
13644,paev,44.822478,-93.367962,2013-01-04,train,2013,1,1,1,midwest,1.0,1884.0
5566,gdxr,44.877646,-93.557842,2013-01-04,train,2013,1,1,1,midwest,1.0,1416.0
6144,guny,44.878889,-93.490833,2013-01-04,train,2013,1,1,1,midwest,1.0,558.0
5317,fwbt,44.850500,-93.515700,2013-01-04,train,2013,1,1,1,midwest,1.0,476.0
...,...,...,...,...,...,...,...,...,...,...,...,...
12443,nsoi,36.736800,-121.734000,2021-12-29,test,2021,12,52,1,west,,
17559,thki,36.725400,-121.730000,2021-12-29,test,2021,12,52,1,west,,
17452,teuu,36.772300,-121.788000,2021-12-29,test,2021,12,52,1,west,,
14254,prfi,36.751800,-121.742000,2021-12-29,test,2021,12,52,1,west,,


In [225]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(data[data.split == 'train'], test_size=0.02, random_state=42, shuffle=True)
train.shape, test.shape

((16718, 12), (342, 12))

In [226]:
train.region = train.region.map(reg_map)
test.region = test.region.map(reg_map)

train.week = train.week.astype('int')
test.week = test.week.astype('int')

In [227]:
drop_cols = ['uid', 'split', 'date', 'severity', 'density']

X_train = train.drop(drop_cols, axis=1)
y_train = train['severity']

X_val = test.drop(drop_cols, axis=1)
y_val = test['severity']

X_train.shape, y_train.shape, X_val.shape, y_val.shape

((16718, 7), (16718,), (342, 7), (342,))

In [228]:
y_train.value_counts(normalize=True), y_val.value_counts(normalize=True)

(1.0    0.438868
 4.0    0.207920
 2.0    0.190274
 3.0    0.159469
 5.0    0.003469
 Name: severity, dtype: float64,
 1.0    0.467836
 4.0    0.207602
 2.0    0.169591
 3.0    0.154971
 Name: severity, dtype: float64)

In [229]:
y_train = y_train - 1
y_val = y_val - 1

In [230]:
X_val

Unnamed: 0,latitude,longitude,year,month,week,season,region
3823,35.643750,-79.279197,2019,12,49,1,0
21145,35.628766,-79.307028,2015,3,13,2,0
2979,40.108330,-75.864280,2019,4,14,2,1
16401,35.980000,-78.814305,2019,11,47,4,0
1476,37.967400,-121.464000,2013,7,29,3,2
...,...,...,...,...,...,...,...
12919,35.692934,-79.187843,2017,7,29,3,0
11401,35.715264,-79.146368,2016,6,25,3,0
1521,35.856449,-78.729297,2014,8,33,3,0
2885,35.742000,-79.213247,2015,8,32,3,0


In [231]:
xgb_clf = XGBClassifier(n_estimators=2000, max_depth=5, learning_rate=0.02, objective='reg:squarederror', tree_method='gpu_hist', gpu_id=0, n_jobs=-1, verbose=1)
xgb_clf.fit(X_train, y_train)
preds = xgb_clf.predict(X_val)
print("train rmse", rmse(y_train, xgb_clf.predict(X_train)))
print("test rmse:", rmse(y_val, preds))

Parameters: { "verbose" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


train rmse 0.6932674024866884
test rmse: 0.7492686492653552


In [232]:
analyize_matches(y_val, preds)

Exact matches:  0.672514619883041
Missed by 1:  0.2543859649122807
Missed by 2:  0.07017543859649122
Missed by 3:  0.0029239766081871343
Missed by 4:  0.0

Severity 1 : accuracy: 1.754 % - prevalence: 16.959 %
Severity 2 : accuracy: 6.725 % - prevalence: 15.497 %
Severity 3 : accuracy: 19.298 % - prevalence: 20.76 %
Severity 4 : accuracy: 0.0 % - prevalence: 0.0 %
Severity 5 : accuracy: 0.0 % - prevalence: 0.0 %

Classification report:
              precision    recall  f1-score   support

         0.0       0.71      0.84      0.77       160
         1.0       0.20      0.10      0.14        58
         2.0       0.50      0.43      0.46        53
         3.0       0.88      0.93      0.90        71

    accuracy                           0.67       342
   macro avg       0.57      0.58      0.57       342
weighted avg       0.62      0.67      0.64       342



In [233]:
y_train = y_train + 1
y_val = y_val + 1

In [234]:
xgb_reg = XGBRegressor(n_estimators=2000, max_depth=5, learning_rate=0.02, objective='reg:squarederror', tree_method='gpu_hist', gpu_id=0, n_jobs=-1, verbose=1)
xgb_reg.fit(X_train, y_train)
preds = xgb_reg.predict(X_val)
preds = pd.Series(np.round(preds)).clip(1, 5).values
print("train rmse", rmse(y_train, xgb_reg.predict(X_train)))
print("test rmse:", rmse(y_val, preds))

Parameters: { "verbose" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


train rmse 0.560854655951999
test rmse: 0.6688560540599386


In [236]:
analyize_matches(y_val, preds)

Exact matches:  0.6491228070175439
Missed by 1:  0.31871345029239767
Missed by 2:  0.03216374269005848
Missed by 3:  0.0
Missed by 4:  0.0

Severity 1 : accuracy: 29.532 % - prevalence: 46.784 %
Severity 2 : accuracy: 9.649 % - prevalence: 16.959 %
Severity 3 : accuracy: 6.14 % - prevalence: 15.497 %
Severity 4 : accuracy: 19.591 % - prevalence: 20.76 %
Severity 5 : accuracy: 0.0 % - prevalence: 0.0 %

Classification report:
              precision    recall  f1-score   support

         1.0       0.83      0.63      0.72       160
         2.0       0.30      0.57      0.39        58
         3.0       0.55      0.40      0.46        53
         4.0       0.93      0.94      0.94        71

    accuracy                           0.65       342
   macro avg       0.65      0.64      0.63       342
weighted avg       0.72      0.65      0.67       342



In [237]:
X_val.isin(X_train).all()

latitude     False
longitude    False
year         False
month        False
week         False
season       False
region       False
dtype: bool

In [238]:
# what the fucking fuck?!!!!

all_train = all_data[all_data.split == 'train']
all_test = all_data[all_data.split == 'test']

all_train.region = all_train.region.map(reg_map)
all_test.region = all_test.region.map(reg_map)

all_train.week = all_train.week.astype('int')
all_test.week = all_test.week.astype('int')

drop_cols = ['uid', 'split', 'date', 'severity', 'density']
all_train.shape, all_test.shape

((17060, 12), (6510, 12))

In [247]:
X = all_train.drop(drop_cols, axis=1)
y = all_train['severity']
X_test = all_test.drop(drop_cols, axis=1)

X.shape, y.shape, X_test.shape

((17060, 7), (17060,), (6510, 7))

In [248]:
xgb_reg = XGBRegressor(n_estimators=2000, max_depth=5, learning_rate=0.02, objective='reg:squarederror', tree_method='gpu_hist', gpu_id=0, n_jobs=-1, verbose=1)
xgb_reg.fit(X, y)
test_preds = xgb_reg.predict(X_test)
test_preds = pd.Series(np.round(test_preds)).clip(1, 5).values
print("train rmse", rmse(y, xgb_reg.predict(X)))

Parameters: { "verbose" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


train rmse 0.5620251273062682


In [249]:
y.value_counts(normalize=True)

1.0    0.439449
4.0    0.207913
2.0    0.189859
3.0    0.159379
5.0    0.003400
Name: severity, dtype: float64

In [250]:
pd.Series(test_preds).value_counts(normalize=True)

4.0    0.300461
2.0    0.279109
1.0    0.249770
3.0    0.169278
5.0    0.001382
dtype: float64

# submission

In [252]:
sub_format.severity = test_preds.astype(int)
display(sub_format.sample(5))
sub_format.to_csv('../submissions/to submit/idkanything_submission.csv', index=False)

Unnamed: 0,uid,region,severity
3034,mego,south,1
4400,rqqi,west,4
1105,ekwk,west,4
715,cxmp,northeast,1
2243,jape,west,4
