In [1]:
import os
import pandas as pd
import numpy as np
import xgboost as xgb
from lightgbm.callback import early_stopping
from sklearn.model_selection import KFold, GroupKFold
from tqdm import tqdm
from sklearn.metrics import mean_squared_error
import warnings

import math

In [2]:
# Widen pandas' console display limits (line width, rows and columns shown).
for option_name, option_value in (
    ('display.width', 1000),
    ('display.max_rows', 500),
    ('display.max_columns', 500),
):
    pd.set_option(option_name, option_value)

# Suppress every warning for the rest of the session.
warnings.filterwarnings(action='ignore')

# Directory the notebook was started from.
path = os.getcwd()

In [3]:
# Sequence tables: train and test rows are stacked into a single frame so
# that later feature engineering is applied to both at once.
train = pd.read_csv('../input/kddbr2022apk/train_sequence_fold.csv')
test = pd.read_csv('../input/kddbr2022apk/test_sequence.csv')
df = pd.concat([train, test])

# complete_04: keep only the w/n/d summary statistics and expose them under
# "_"-prefixed names.
stat_columns = ["w_std","w_mean","w_median","w_min","w_max","w_var","n_std","n_mean","n_median","n_min","n_max","n_var","d_std","d_mean","d_median","d_min","d_max","d_var"]
features_1 = pd.read_csv('../input/kddbr2022apk/complete_04.csv')
features_1 = features_1[["Filename"] + stat_columns]
features_1.columns = ["Filename"] + ["_" + name for name in stat_columns]

features_2 = pd.read_csv('../input/kddbr2022apk/complete_05.csv')
features_3 = pd.read_csv('../input/kddbr2022apk/complete_06.csv')
features_4 = pd.read_csv('../input/kddbr2022apk/complete_12.csv')

# One per-file feature table: complete_04 left-joined with complete_05 and complete_06.
features = (
    features_1
    .merge(features_2, on='Filename', how='left')
    .merge(features_3, on='Filename', how='left')
)

# Vector features: the train and test parts are stacked; the v3 table is read as-is.
mario_features_tr = pd.read_csv('../input/kddbr2022apk/mario_train_vec_v1.csv')
mario_features_ts = pd.read_csv('../input/kddbr2022apk/mario_test_vec_v1.csv')
mario_features = pd.concat([mario_features_tr, mario_features_ts])

mario_features2 = pd.read_csv('../input/kddbr2022apk/vecs_v3_fe.csv')


In [4]:
# Left-join every per-file feature table onto the sequence rows, then order
# the rows by run and by position inside the run with a fresh 0..n-1 index.
#df = pd.merge(df, features_4, on='Filename', how='left')
df = (
    df
    .merge(features, on='Filename', how='left')
    .merge(mario_features, on='Filename', how='left')
    .merge(mario_features2, on='Filename', how='left')
    .sort_values(by=['run_id','run_seq_id'])
    .reset_index(drop=True)
)

In [5]:
import gc

# Release the intermediate feature tables that are not referenced again below.
del features_1, features_2, features_3, features_4
del features, mario_features, mario_features2
gc.collect()

23

In [6]:
# ---------------------------------------------------------------------------
# Element-wise transforms of Altitude, Delta and base_05_n_mean.
# NOTE(review): sqrt/log/cube-root produce NaN or -inf for non-positive
# inputs; infinities are replaced by NaN in a later cell before training.
# ---------------------------------------------------------------------------
df['sqrt_altitude'] = np.sqrt(df['Altitude'])
df['sqrt_delta'] = np.sqrt(df['Delta'])
df['sqrt_altitude_delta'] = np.sqrt(df['Altitude'] + df['Delta'])

df['pow2_altitude'] = (df['Altitude'] ** 2)
df['pow2_delta'] = (df['Delta'] ** 2)
df['pow2_altitude_delta'] = (df['Altitude'] + df['Delta']) ** 2

df['log_Altitude'] = np.log(df['Altitude'])
df['log_Delta'] = np.log(df['Delta'])
df['log_base_05_n_mean'] = np.log(df['base_05_n_mean'])

df['pow13_Altitude'] = df['Altitude'] ** (1/3)
df['pow13_Delta'] = df['Delta'] ** (1/3)
df['pow13_base_05_n_mean'] = df['base_05_n_mean'] ** (1/3)

# Source columns for which per-run sequence features are generated.
tgt_columns = [
    'Delta', 'Altitude', 'log_Altitude', 'log_Delta', 'pow13_Altitude', 'pow13_Delta', 'log_base_05_n_mean', 'pow13_base_05_n_mean',
    'base_05_n_mean', '_n_mean', 'base_05_f05_H_1', 'base_05_n_median', 'base_05_n_max', 'base_05_n_min', 'base_05_f05_H_4', 'x', 'base_05_f05_H_25', 'base_05_f05_H_7', 'x_mean', 'x1_mean', 'x2_mean',
    'base_05_w_mean', '_w_mean', 'base_05_f05_W_21', 'base_05_w_median', 'base_05_w_max', 'base_05_w_min', 'base_05_f05_W_18', 'y', 'base_05_f05_W_6', 'base_05_f05_W_0', 'y_mean', 'y1_mean', 'y2_mean',
    'base_06_n_mean', 'base_06_f06_H_1', 'base_06_n_median', 'base_06_n_max',
    'base_06_w_mean', 'base_06_f06_W_21', 'base_06_w_median', 'base_06_w_max'
]

seq_win = [2,3,5,10,25,30,50]   # rolling-window lengths (rows within a run)
nxt_sft_win = [1,2,3,4,5]       # lags used for shift / diff features

for col in tqdm(tgt_columns):
    # Build the per-run grouping once per source column and reuse it for every
    # feature below, instead of re-grouping the frame for each new column.
    by_run = df.groupby('run_id')[col]

    # Lagged value and difference to the lagged value, per run. The results
    # already carry df's row index, so they are assigned directly.
    for win in nxt_sft_win:
        df['shift'+str(win)+'_'+col] = by_run.shift(win)
        df['seq_diff_last'+str(win)+'_'+col] = by_run.diff(win)

    # Rolling statistics per run. The grouped rolling result is indexed by
    # (run_id, original row); dropping level 0 restores the row index so the
    # assignment aligns with df.
    for win in seq_win:
        rolling = by_run.rolling(win)
        df['seq_avg_w'+str(win)+'_'+col] = rolling.mean().reset_index(level=0, drop=True)
        df['seq_std_w'+str(win)+'_'+col] = rolling.std().reset_index(level=0, drop=True)
        df['seq_max_w'+str(win)+'_'+col] = rolling.max().reset_index(level=0, drop=True)
        df['seq_min_w'+str(win)+'_'+col] = rolling.min().reset_index(level=0, drop=True)
        df['seq_skw_w'+str(win)+'_'+col] = rolling.skew().reset_index(level=0, drop=True)

    # Ratios between the current value and its lags, and between the lags.
    # Division by zero yields +/-inf here.
    df['seq_div_last1_'+col] = df[col] / df['shift1_'+col]
    df['seq_div_last2_'+col] = df[col] / df['shift2_'+col]
    df['seq_div_last3_'+col] = df[col] / df['shift3_'+col]
    df['seq_div_last1_last2_'+col] = df['shift1_'+col] / df['shift2_'+col]
    df['seq_div_last2_last3_'+col] = df['shift2_'+col] / df['shift3_'+col]

    # Ratios of consecutive ratios.
    df['seq_divdiv_last1-last1_last2_'+col] = df['seq_div_last1_'+col] / df['seq_div_last1_last2_'+col]
    df['seq_divdiv_last1_last2-last2_last3_'+col] = df['seq_div_last1_last2_'+col] / df['seq_div_last2_last3_'+col]

    # Whole-run aggregates broadcast back to every row of the run.
    df['seq_min_'+col] = by_run.transform('min')
    df['seq_max_'+col] = by_run.transform('max')
    df['seq_std_'+col] = by_run.transform('std')
    df['seq_avg_'+col] = by_run.transform('mean')
    df['seq_skw_'+col] = by_run.transform('skew')

# NOTE(review): ~2.4k columns are inserted one at a time, which fragments the
# frame; collecting them and concatenating once would be faster if the extra
# peak memory is acceptable.

# Sum of squares of base_05_n_mean and base_05_w_mean, now and at lags 1 and 2,
# plus their average and successive ratios.
df['r_vetorial_now'] = (df['base_05_n_mean'] ** 2) + (df['base_05_w_mean'] ** 2)
df['shift_r_vetorial'] = (df['shift1_base_05_n_mean'] ** 2) + (df['shift1_base_05_w_mean'] ** 2)
df['shift2_r_vetorial'] = (df['shift2_base_05_n_mean'] ** 2) + (df['shift2_base_05_w_mean'] ** 2)
df['avg_shift_r_vet'] = (df['r_vetorial_now'] + df['shift_r_vetorial'] + df['shift2_r_vetorial']) / 3
df['div_r_now_shift'] = df['r_vetorial_now'] / df['shift_r_vetorial']
df['div_r_shift1_shift2'] = df['shift_r_vetorial'] / df['shift2_r_vetorial']
df['avg_div_r'] = (df['div_r_now_shift'] + df['div_r_shift1_shift2']) / 2

100%|██████████| 42/42 [03:43<00:00,  5.33s/it]


In [7]:
# Report (rows, columns) of the combined frame after feature engineering.
print(df.shape)

(146262, 2581)


In [8]:
# Rows with a North value form the training set; rows without one are the test set.
# Both parts are ordered by run and position in the run and re-indexed from 0.
is_labelled = df['North'].notnull()
order_keys = ['run_id', 'run_seq_id']

train = df.loc[is_labelled].sort_values(by=order_keys).reset_index(drop=True)
test = df.loc[~is_labelled].sort_values(by=order_keys).reset_index(drop=True)

In [9]:
import gc
# The combined frame has been split into train/test above; drop it and force
# a garbage collection to release its memory.
del df
gc.collect()

23

In [10]:
# Every column is a model input except identifiers, targets and fold bookkeeping.
non_feature_columns = ('Filename', 'North', 'East', 'fold', 'run_id', 'run_seq_id')
train_columns = [name for name in train.columns if name not in non_feature_columns]

In [11]:
# Number of cross-validation folds and the two regression targets.
n_folds = 5
targets = ['North', 'East']

# Zero-filled placeholders for the out-of-fold and test predictions of each target.
north_oof_preds = np.zeros(train.shape[0])
east_oof_preds = np.zeros(train.shape[0])

north_test_preds = np.zeros(test.shape[0])
east_test_preds = np.zeros(test.shape[0])

In [12]:
# Turn +/-inf into NaN in both frames; NaNs themselves are left in place
# (no fill value is applied).
infinities = [np.inf, -np.inf]
train = train.replace(infinities, np.nan)
test = test.replace(infinities, np.nan)

In [13]:
%%time
for target in targets:
    y = train[target]
    oof_preds = np.zeros(len(train))
    test_preds_fold = np.zeros(len(test))
    score_folds  = []

    # folds = GroupKFold(n_splits=n_folds)
    # for fold_, (trn_, val_) in enumerate(folds.split(y, y, train.run_id)):
    for fold_ in train.fold.sort_values().unique():
        print("Fold: {}".format(fold_))

        trn_ = train.loc[train.fold != fold_].index
        val_ = train.loc[train.fold == fold_].index

        trn_x, trn_y = train.loc[trn_, train_columns], y.loc[trn_]
        val_x, val_y = train.loc[val_, train_columns], y.loc[val_]
        
        dtrain = xgb.DMatrix(trn_x, trn_y)
        dvalid = xgb.DMatrix(val_x, val_y)
        
        watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
        
        params = {
          'objective': "reg:squarederror", #"reg:logistic",
          'booster': "gbtree",
          'eval_metric': 'rmse', 
          'colsample_bytree': 0.5,
          'subsample': 0.5,
          'eta': 0.1,
          'max_depth': 8,
          'min_child_weight': 20, 
          'max_bin': 20,
          'nthread':-1,
          'seed': 1234,
          'tree_method': 'gpu_hist'
        }
        
        execs = 1
        preds = np.zeros(len(val_x))
        test_preds_exec = np.zeros(len(test))
        
        for p in range(0,execs):
            print("Iteration: {}".format(p+1))
            params['seed'] += p
            model = xgb.train(params, dtrain, 
                               10000, watchlist, 
                               maximize=False, 
                               early_stopping_rounds = 200, 
                               verbose_eval=200)
                            #early_stopping_rounds = 1000)
            del dtrain, dvalid, trn_x
            gc.collect()
            preds += ((model.predict(xgb.DMatrix(val_x))) / execs)
            test_preds_exec += ((model.predict(xgb.DMatrix(test[train_columns]))) / execs)

        test_preds_fold += (test_preds_exec / n_folds)
        oof_preds[val_] = preds
        score_folds.append(np.sqrt(mean_squared_error(val_y, preds)))
        print("FOLD RMSE = {}".format(np.sqrt(mean_squared_error(val_y, preds))))
        del model, val_x
        gc.collect()

    print("############################################################")
    print("{} - MEAN RMSE = {}".format(target, np.mean(score_folds)))
    print("{} - OOF RMSE = {}".format(target, np.sqrt(mean_squared_error(y, oof_preds))))
    print(score_folds)

    if target == 'North':
        north_oof_preds = oof_preds
        north_test_preds = test_preds_fold
    else:
        east_oof_preds = oof_preds
        east_test_preds = test_preds_fold

Fold: 1.0
Iteration: 1
[0]	train-rmse:1.17591	valid-rmse:1.17542
[200]	train-rmse:0.14321	valid-rmse:0.30863
[400]	train-rmse:0.10409	valid-rmse:0.30612
[600]	train-rmse:0.08306	valid-rmse:0.30629
[800]	train-rmse:0.06595	valid-rmse:0.30618
[941]	train-rmse:0.05724	valid-rmse:0.30609
FOLD RMSE = 0.30608667909000753
Fold: 2.0
Iteration: 1
[0]	train-rmse:1.16112	valid-rmse:1.22811
[200]	train-rmse:0.13922	valid-rmse:0.33457
[400]	train-rmse:0.09671	valid-rmse:0.33319
[600]	train-rmse:0.07250	valid-rmse:0.33244
[761]	train-rmse:0.05883	valid-rmse:0.33253
FOLD RMSE = 0.3325292439940732
Fold: 3.0
Iteration: 1
[0]	train-rmse:1.18460	valid-rmse:1.13061
[200]	train-rmse:0.14813	valid-rmse:0.30395
[400]	train-rmse:0.10817	valid-rmse:0.30257
[600]	train-rmse:0.08484	valid-rmse:0.30305
[656]	train-rmse:0.07899	valid-rmse:0.30336
FOLD RMSE = 0.30335659908004997
Fold: 4.0
Iteration: 1
[0]	train-rmse:1.17329	valid-rmse:1.18231
[200]	train-rmse:0.07501	valid-rmse:0.47216
[400]	train-rmse:0.04524	vali

In [14]:
# Stack North then East so both targets are scored together;
# predictions and ground truth use the same order.
all_preds = np.concatenate((north_oof_preds, east_oof_preds))
all_target = np.concatenate((train['North'], train['East']))

In [15]:
# Root-mean-squared error over the stacked North + East out-of-fold predictions.
overall_rmse = np.sqrt(mean_squared_error(all_target, all_preds))
print("ALL - OOF RMSE = {}".format(overall_rmse))

ALL - OOF RMSE = 0.32850341906801034


In [21]:
# Out-of-fold predictions keyed by file name, written for later use.
sub_oof = pd.DataFrame({'Filename': train['Filename']})
sub_oof['oof_north'] = north_oof_preds
sub_oof['oof_east'] = east_oof_preds

sub_oof.to_csv('oof_xgb_seq_016.csv', index=False)

In [17]:
# One (Id, Predicted) frame per target, in test-row order.
sub_north_test = pd.DataFrame(dict(Id=test['Filename'], Predicted=north_test_preds))
sub_east_test = pd.DataFrame(dict(Id=test['Filename'], Predicted=east_test_preds))

In [18]:
# Submission ids are "<Filename>:<target>". They are rebuilt from test.Filename
# (which shares the frames' row index) rather than from each frame's own Id
# column, so re-running this cell cannot append the suffix a second time.
sub_north_test['Id'] = test['Filename'].astype(str) + ':North'
sub_east_test['Id'] = test['Filename'].astype(str) + ':East'

# North rows first, then East rows; the original row labels are kept.
sub_all_test = pd.concat([sub_north_test, sub_east_test])

In [22]:
# Write the submission file with exactly the Id and Predicted columns, no index.
sub_all_test.loc[:, ['Id', 'Predicted']].to_csv('sub_xgb_seq_016.csv', index=False)

In [20]:
# Display the combined submission frame (North rows followed by East rows).
sub_all_test

Unnamed: 0,Id,Predicted
0,0f57a863af3b7e5bf59a94319a408ff7.jpg:North,-0.200769
1,b7d4e3a6acb3b956d8b09416958e8892.jpg:North,-0.251817
2,9a1bd630a47dabafa6f4cab7338d01df.jpg:North,-0.407167
3,3f6ebf8aef0f352720bc0cb534d878b6.jpg:North,-0.449348
4,7e810f05bd88ff0f86dce7116b9fa527.jpg:North,-0.586769
...,...,...
55026,fc65c23ae61ce2c6fdbc637a8296d1d7.jpg:East,1.582865
55027,fc6c0d99ec42bad5e2d5fbb1e71c5a50.jpg:East,1.236618
55028,fdbb732633c6a3d32a67fc56edc7cde4.jpg:East,-0.539893
55029,feac1b902c7487dcbdf298bd4be75690.jpg:East,-0.379317
