In [1]:
import os
import pandas as pd
import numpy as np
import xgboost as xgb
from lightgbm.callback import early_stopping
from sklearn.model_selection import KFold, GroupKFold
from tqdm import tqdm
from sklearn.metrics import mean_squared_error
import warnings

import math

In [2]:
pd.set_option('display.width', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
warnings.filterwarnings('ignore')

path = os.getcwd()

In [3]:
train = pd.read_csv('../input/kddbr2022apk/train_sequence_fold.csv')
test = pd.read_csv('../input/kddbr2022apk/test_sequence.csv')
df = pd.concat([train, test])

features_1 = pd.read_csv('../input/kddbr2022apk/complete_04.csv')
features_1 = features_1[["Filename","w_std","w_mean","w_median","w_min","w_max","w_var","n_std","n_mean","n_median","n_min","n_max","n_var","d_std","d_mean","d_median","d_min","d_max","d_var"]]
features_1.columns = ["Filename","_w_std","_w_mean","_w_median","_w_min","_w_max","_w_var","_n_std","_n_mean","_n_median","_n_min","_n_max","_n_var","_d_std","_d_mean","_d_median","_d_min","_d_max","_d_var"]
features_2 = pd.read_csv('../input/kddbr2022apk/complete_05.csv')
features_3 = pd.read_csv('../input/kddbr2022apk/complete_06.csv')

features = pd.merge(features_1, features_2, on='Filename', how='left')
features = pd.merge(features, features_3, on='Filename', how='left')

mario_features_tr = pd.read_csv('../input/kddbr2022apk/mario_train_vec_v1.csv')
mario_features_ts = pd.read_csv('../input/kddbr2022apk/mario_test_vec_v1.csv')

mario_features = pd.concat([mario_features_tr, mario_features_ts])
mario_features2 = pd.read_csv('../input/kddbr2022apk/vecs_v3_fe.csv')


In [4]:
df = pd.merge(df, features, on='Filename', how='left')
df = pd.merge(df, mario_features, on='Filename', how='left')
df = pd.merge(df, mario_features2, on='Filename', how='left')

df = df.sort_values(by=['run_id', 'run_seq_id']).reset_index(drop=True)

In [5]:
df['sqrt_altitude'] = np.sqrt(df['Altitude'])
df['sqrt_delta'] = np.sqrt(df['Delta'])
df['sqrt_altitude_delta'] = np.sqrt(df['Altitude'] + df['Delta'])

df['pow2_altitude'] = (df['Altitude'] ** 2)
df['pow2_delta'] = (df['Delta'] ** 2)
df['pow2_altitude_delta'] = (df['Altitude'] + df['Delta']) ** 2

tgt_columns = [
    'Delta', 'Altitude', 
    'base_05_n_mean', '_n_mean', 'base_05_f05_H_1', 'base_05_n_median', 'base_05_n_max', 'base_05_n_min', 'base_05_f05_H_4', 'x', 'base_05_f05_H_25', 'base_05_f05_H_7', 'x_mean', 'x1_mean', 'x2_mean',
    'base_05_w_mean', '_w_mean', 'base_05_f05_W_21', 'base_05_w_median', 'base_05_w_max', 'base_05_w_min', 'base_05_f05_W_18', 'y', 'base_05_f05_W_6', 'base_05_f05_W_0', 'y_mean', 'y1_mean', 'y2_mean',
    'base_06_n_mean', 'base_06_f06_H_1', 'base_06_n_median', 'base_06_n_max',
    'base_06_w_mean', 'base_06_f06_W_21', 'base_06_w_median', 'base_06_w_max'
]

for col in tqdm(tgt_columns):
    df['shift_'+col] = df.groupby('run_id')[col].shift().reset_index(0,drop=True)
    df['next_'+col] = df.groupby('run_id')[col].shift(-1).reset_index(0,drop=True)
    df['shift2_'+col] = df.groupby('run_id')[col].shift(2).reset_index(0,drop=True)
    df['next2_'+col] = df.groupby('run_id')[col].shift(-2).reset_index(0,drop=True)
    df['shift3_'+col] = df.groupby('run_id')[col].shift(3).reset_index(0,drop=True)
    df['next3_'+col] = df.groupby('run_id')[col].shift(-3).reset_index(0,drop=True)
    df['shift4_'+col] = df.groupby('run_id')[col].shift(4).reset_index(0,drop=True)
    df['next4_'+col] = df.groupby('run_id')[col].shift(-4).reset_index(0,drop=True)
    df['shift5_'+col] = df.groupby('run_id')[col].shift(5).reset_index(0,drop=True)
    df['next5_'+col] = df.groupby('run_id')[col].shift(-5).reset_index(0,drop=True)
    df['seq_avg_w3_'+col] = df.groupby('run_id')[col].rolling(3).mean().reset_index(0,drop=True)
    df['seq_avg_w2_'+col] = df.groupby('run_id')[col].rolling(2).mean().reset_index(0,drop=True)
    df['seq_avg_w5_'+col] = df.groupby('run_id')[col].rolling(5).mean().reset_index(0,drop=True)
    df['seq_avg_w10_'+col] = df.groupby('run_id')[col].rolling(10).mean().reset_index(0,drop=True)
    df['seq_avg_w25_'+col] = df.groupby('run_id')[col].rolling(25).mean().reset_index(0,drop=True)
    df['seq_avg_w50_'+col] = df.groupby('run_id')[col].rolling(50).mean().reset_index(0,drop=True)
    df['seq_std_w3_'+col] = df.groupby('run_id')[col].rolling(3).std().reset_index(0,drop=True)
    df['seq_std_w2_'+col] = df.groupby('run_id')[col].rolling(2).std().reset_index(0,drop=True)
    df['seq_std_w5_'+col] = df.groupby('run_id')[col].rolling(5).std().reset_index(0,drop=True)
    df['seq_std_w10_'+col] = df.groupby('run_id')[col].rolling(10).std().reset_index(0,drop=True)
    df['seq_std_w25_'+col] = df.groupby('run_id')[col].rolling(25).std().reset_index(0,drop=True)
    df['seq_std_w50_'+col] = df.groupby('run_id')[col].rolling(50).std().reset_index(0,drop=True)
    df['seq_max_w3_'+col] = df.groupby('run_id')[col].rolling(3).max().reset_index(0,drop=True)
    df['seq_max_w2_'+col] = df.groupby('run_id')[col].rolling(2).max().reset_index(0,drop=True)
    df['seq_max_w5_'+col] = df.groupby('run_id')[col].rolling(5).max().reset_index(0,drop=True)
    df['seq_max_w10_'+col] = df.groupby('run_id')[col].rolling(10).max().reset_index(0,drop=True)
    df['seq_max_w25_'+col] = df.groupby('run_id')[col].rolling(25).max().reset_index(0,drop=True)
    df['seq_max_w50_'+col] = df.groupby('run_id')[col].rolling(50).max().reset_index(0,drop=True)
    df['seq_min_w3_'+col] = df.groupby('run_id')[col].rolling(3).min().reset_index(0,drop=True)
    df['seq_min_w2_'+col] = df.groupby('run_id')[col].rolling(2).min().reset_index(0,drop=True)
    df['seq_min_w5_'+col] = df.groupby('run_id')[col].rolling(5).min().reset_index(0,drop=True)
    df['seq_min_w10_'+col] = df.groupby('run_id')[col].rolling(10).min().reset_index(0,drop=True)
    df['seq_min_w25_'+col] = df.groupby('run_id')[col].rolling(25).min().reset_index(0,drop=True)
    df['seq_min_w50_'+col] = df.groupby('run_id')[col].rolling(50).min().reset_index(0,drop=True)
    df['seq_diff_last_'+col] = df.groupby('run_id')[col].diff(1).reset_index(0,drop=True)
    df['seq_diff_next_'+col] = df.groupby('run_id')[col].diff(-1).reset_index(0,drop=True)
    df['seq_diff_last2_'+col] = df.groupby('run_id')[col].diff(2).reset_index(0,drop=True)
    df['seq_diff_next2_'+col] = df.groupby('run_id')[col].diff(-2).reset_index(0,drop=True)
    df['seq_diff_last3_'+col] = df.groupby('run_id')[col].diff(3).reset_index(0,drop=True)
    df['seq_diff_next3_'+col] = df.groupby('run_id')[col].diff(-3).reset_index(0,drop=True)
    df['seq_diff_last4_'+col] = df.groupby('run_id')[col].diff(4).reset_index(0,drop=True)
    df['seq_diff_next4_'+col] = df.groupby('run_id')[col].diff(-4).reset_index(0,drop=True)
    df['seq_diff_last5_'+col] = df.groupby('run_id')[col].diff(5).reset_index(0,drop=True)
    df['seq_diff_next5_'+col] = df.groupby('run_id')[col].diff(-5).reset_index(0,drop=True)
    df['seq_div_last_'+col] = df[col] / df['shift_'+col]
    df['seq_div_next_'+col] = df[col] / df['next_'+col]
    df['seq_div_last2_'+col] = df[col] / df['shift2_'+col]
    df['seq_div_next2_'+col] = df[col] / df['next2_'+col]
    df['seq_div_last3_'+col] = df[col] / df['shift3_'+col]
    df['seq_div_next3_'+col] = df[col] / df['next3_'+col]
    
    df['seq_min_'+col] = df.groupby('run_id')[col].transform('min')
    df['seq_max_'+col] = df.groupby('run_id')[col].transform('max')
    df['seq_std_'+col] = df.groupby('run_id')[col].transform('std')
    df['seq_avg_'+col] = df.groupby('run_id')[col].transform('mean')
    

#     df[col+'_inverse'] = df.groupby('run_id')[col].transform(lambda x: x[::-1])
#     df['seq_avg_next_w3_'+col] = df.groupby('run_id')[col+'_inverse'].rolling(3).mean().reset_index(0,drop=True)
#     df['seq_avg_next_w5_'+col] = df.groupby('run_id')[col+'_inverse'].rolling(5).mean().reset_index(0,drop=True)
#     df['seq_std_next_w3_'+col] = df.groupby('run_id')[col+'_inverse'].rolling(3).std().reset_index(0,drop=True)    
#     df['seq_std_next_w5_'+col] = df.groupby('run_id')[col+'_inverse'].rolling(5).std().reset_index(0,drop=True)
#     df['seq_max_next_w3_'+col] = df.groupby('run_id')[col+'_inverse'].rolling(3).max().reset_index(0,drop=True)
#     df['seq_max_next_w5_'+col] = df.groupby('run_id')[col+'_inverse'].rolling(5).max().reset_index(0,drop=True) 
#     df['seq_min_next_w3_'+col] = df.groupby('run_id')[col+'_inverse'].rolling(3).min().reset_index(0,drop=True)
#     df['seq_min_next_w5_'+col] = df.groupby('run_id')[col+'_inverse'].rolling(5).min().reset_index(0,drop=True)
#     del df[col+'_inverse']
    

df['r_vetorial_now'] = (df['base_05_n_mean'] ** 2) + (df['base_05_w_mean'] ** 2)
df['shift_r_vetorial'] = (df['shift_base_05_n_mean'] ** 2) + (df['shift_base_05_w_mean'] ** 2)
df['shift2_r_vetorial'] = (df['shift2_base_05_n_mean'] ** 2) + (df['shift2_base_05_w_mean'] ** 2)
df['next_r_vetorial'] = (df['next_base_05_n_mean'] ** 2) + (df['next_base_05_w_mean'] ** 2)
df['next2_r_vetorial'] = (df['next2_base_05_n_mean'] ** 2) + (df['next2_base_05_w_mean'] ** 2)
df['avg_shift_r_vet'] = (df['r_vetorial_now'] + df['shift_r_vetorial'] + df['shift2_r_vetorial']) / 3
df['avg_next_r_vet'] = (df['r_vetorial_now'] + df['next_r_vetorial'] + df['next2_r_vetorial']) / 3
df['avg_all_r_vet'] = (df['r_vetorial_now'] + df['next_r_vetorial'] + df['next2_r_vetorial'] + df['shift_r_vetorial'] + df['shift2_r_vetorial']) / 5

df['div_r_next2_next'] = df['next2_r_vetorial'] / df['next_r_vetorial']
df['div_r_next_now'] = df['next_r_vetorial'] / df['r_vetorial_now']
df['div_r_now_shift'] = df['r_vetorial_now'] / df['shift_r_vetorial']
df['div_r_shift1_shift2'] = df['shift_r_vetorial'] / df['shift2_r_vetorial']
df['avg_div_r'] = (df['div_r_next2_next'] + df['div_r_next_now'] + df['div_r_now_shift'] + df['div_r_shift1_shift2']) / 4




100%|██████████| 36/36 [03:29<00:00,  5.82s/it]


In [6]:
print(df.shape)

(146262, 2131)


In [7]:
train = df.loc[~df.North.isnull(), :].sort_values(by=['run_id', 'run_seq_id']).reset_index(drop=True)
test = df.loc[df.North.isnull(), :].sort_values(by=['run_id', 'run_seq_id']).reset_index(drop=True)

In [8]:
import gc
del df
gc.collect()

23

In [9]:
train_columns = [col for col in train.columns if col not in ['Filename', 'North', 'East', 'fold', 'run_id', 'run_seq_id']]

In [10]:
n_folds = 5

north_oof_preds = np.zeros(len(train))
east_oof_preds = np.zeros(len(train))

north_test_preds = np.zeros(len(test))
east_test_preds = np.zeros(len(test))

targets = ['North', 'East']

In [11]:
train = train.replace([np.inf, -np.inf], np.nan)
#trn_x = trn_x.fillna(-999)
test = test.replace([np.inf, -np.inf], np.nan)

In [12]:
%%time
for target in targets:
    y = train[target]
    oof_preds = np.zeros(len(train))
    test_preds_fold = np.zeros(len(test))
    score_folds  = []

    # folds = GroupKFold(n_splits=n_folds)
    # for fold_, (trn_, val_) in enumerate(folds.split(y, y, train.run_id)):
    for fold_ in train.fold.sort_values().unique():
        print("Fold: {}".format(fold_))

        trn_ = train.loc[train.fold != fold_].index
        val_ = train.loc[train.fold == fold_].index

        trn_x, trn_y = train.loc[trn_, train_columns], y.loc[trn_]
        val_x, val_y = train.loc[val_, train_columns], y.loc[val_]
        
        dtrain = xgb.DMatrix(trn_x, trn_y)
        dvalid = xgb.DMatrix(val_x, val_y)
        
        watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
        
        params = {
          'objective': "reg:squarederror", #"reg:logistic",
          'booster': "gbtree",
          'eval_metric': 'rmse', 
          'colsample_bytree': 0.85,
          'subsample': 0.85,
          'eta': 0.1,
          'max_depth': 5,
          'min_child_weight': 64, 
          'max_bin': 32,
          'nthread':-1,
          'seed': 99,
          'tree_method': 'gpu_hist'
        }
        
        execs = 1
        preds = np.zeros(len(val_x))
        test_preds_exec = np.zeros(len(test))
        
        for p in range(0,execs):
            print("Iteration: {}".format(p+1))
            params['seed'] += p
            model = xgb.train(params, dtrain, 
                               10000, watchlist, 
                               maximize=False, 
                               early_stopping_rounds = 200, 
                               verbose_eval=200)
                            #early_stopping_rounds = 1000)
            
            preds += ((model.predict(xgb.DMatrix(val_x))) / execs)
            test_preds_exec += ((model.predict(xgb.DMatrix(test[train_columns]))) / execs)

        test_preds_fold += (test_preds_exec / n_folds)
        oof_preds[val_] = preds
        score_folds.append(np.sqrt(mean_squared_error(val_y, preds)))
        print("FOLD RMSE = {}".format(np.sqrt(mean_squared_error(val_y, preds))))
        del dtrain, dvalid, trn_x, val_x
        gc.collect()

    print("############################################################")
    print("{} - MEAN RMSE = {}".format(target, np.mean(score_folds)))
    print("{} - OOF RMSE = {}".format(target, np.sqrt(mean_squared_error(y, oof_preds))))
    print(score_folds)

    if target == 'North':
        north_oof_preds = oof_preds
        north_test_preds = test_preds_fold
    else:
        east_oof_preds = oof_preds
        east_test_preds = test_preds_fold

Fold: 1.0
Iteration: 1
[0]	train-rmse:1.17978	valid-rmse:1.17070
[200]	train-rmse:0.18565	valid-rmse:0.26983
[400]	train-rmse:0.14697	valid-rmse:0.26329
[600]	train-rmse:0.12489	valid-rmse:0.26131
[800]	train-rmse:0.10880	valid-rmse:0.25997
[1000]	train-rmse:0.09630	valid-rmse:0.25925
[1200]	train-rmse:0.08588	valid-rmse:0.25836
[1400]	train-rmse:0.07789	valid-rmse:0.25829
[1600]	train-rmse:0.07075	valid-rmse:0.25806
[1800]	train-rmse:0.06475	valid-rmse:0.25750
[2000]	train-rmse:0.05910	valid-rmse:0.25745
[2062]	train-rmse:0.05765	valid-rmse:0.25748
FOLD RMSE = 0.2574873937765054
Fold: 2.0
Iteration: 1
[0]	train-rmse:1.16526	valid-rmse:1.22752
[200]	train-rmse:0.17348	valid-rmse:0.31679
[400]	train-rmse:0.13267	valid-rmse:0.31372
[600]	train-rmse:0.10816	valid-rmse:0.31029
[800]	train-rmse:0.09117	valid-rmse:0.30852
[1000]	train-rmse:0.07884	valid-rmse:0.30644
[1200]	train-rmse:0.06880	valid-rmse:0.30516
[1400]	train-rmse:0.06129	valid-rmse:0.30442
[1600]	train-rmse:0.05485	valid-rmse:

In [13]:
all_preds = np.concatenate([north_oof_preds, east_oof_preds])
all_target = np.concatenate([train.North, train.East])

In [14]:
print("ALL - OOF RMSE = {}".format(np.sqrt(mean_squared_error(all_target, all_preds))))

ALL - OOF RMSE = 0.2927876076573253


In [15]:
sub_oof = pd.DataFrame({
    'Filename': train.Filename,
    'oof_north': north_oof_preds,
    'oof_east': east_oof_preds
})

sub_oof.to_csv('oof_xgb_seq_002.csv', index=False)

In [16]:
sub_north_test = pd.DataFrame({
    'Id': test.Filename,
    'Predicted': north_test_preds
})

sub_east_test = pd.DataFrame({
    'Id': test.Filename,
    'Predicted': east_test_preds
})

In [17]:
sub_north_test.Id = sub_north_test['Id'].astype(str) + ':North'
sub_east_test.Id  = sub_east_test['Id'].astype(str) + ':East'

sub_all_test = pd.concat([sub_north_test, sub_east_test])

In [18]:
sub_all_test[['Id', 'Predicted']].to_csv('sub_xgb_seq_002.csv', index=False)

In [19]:
sub_all_test

Unnamed: 0,Id,Predicted
0,0f57a863af3b7e5bf59a94319a408ff7.jpg:North,-0.172370
1,b7d4e3a6acb3b956d8b09416958e8892.jpg:North,-0.347464
2,9a1bd630a47dabafa6f4cab7338d01df.jpg:North,-0.489103
3,3f6ebf8aef0f352720bc0cb534d878b6.jpg:North,-0.509274
4,7e810f05bd88ff0f86dce7116b9fa527.jpg:North,-0.553121
...,...,...
55026,fc65c23ae61ce2c6fdbc637a8296d1d7.jpg:East,1.499336
55027,fc6c0d99ec42bad5e2d5fbb1e71c5a50.jpg:East,1.430359
55028,fdbb732633c6a3d32a67fc56edc7cde4.jpg:East,-0.488253
55029,feac1b902c7487dcbdf298bd4be75690.jpg:East,-0.200935
