In [None]:
import os
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.linear_model import LinearRegression
from lightgbm.callback import early_stopping
from tqdm import tqdm
from sklearn.metrics import mean_squared_error
import warnings
import math

In [None]:
# Widen pandas' console/notebook rendering so wide frames are shown in full.
for option_name, option_value in (
    ('display.width', 1000),
    ('display.max_rows', 500),
    ('display.max_columns', 500),
):
    pd.set_option(option_name, option_value)

# Silence every warning category for the remainder of the session.
warnings.filterwarnings('ignore')

# Working directory at the moment this cell is executed.
path = os.getcwd()

In [None]:
# Load the training frame (carries the `fold` column used for cross-validation
# further down) and the test frame from the shared input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../input/train_sequence_fold.csv',
        '../../input/test_sequence.csv',
    )
)


In [None]:
# Base models whose out-of-fold and submission files feed the stacker.
# Each name maps to `oof_<name>.csv` and `sub_<name>.csv` in the output folder.
models = [
    # XGBoost sequence models
    'xgb_seq_001',
    'xgb_seq_002',
    'xgb_seq_003',
    'xgb_seq_004',
    'xgb_seq_005',
    'xgb_seq_006',
    'xgb_seq_008',
    'xgb_seq_009',
    'xgb_seq_011',
    # LightGBM sequence model
    'lgb_seq_003',
    # CatBoost sequence models
    'cat_seq_001',
    'cat_seq_003',
    # Lag-only variants
    'cat_onlylag',
    'xgb_onlylag',
]

In [None]:
# Attach every base model's predictions as stacking features:
#   * out-of-fold predictions are merged onto `train`
#   * submission predictions are merged onto `test`
# Both merges are left joins on `Filename`, so rows a base model did not
# predict end up with NaN in that model's columns.
for model_name in models:
    north_col = model_name + '_north'
    east_col = model_name + '_east'

    # Drop columns left behind by a previous run of this cell; otherwise a
    # re-run would merge them a second time and create `_x` / `_y` duplicates.
    train = train.drop(columns=[north_col, east_col], errors='ignore')
    test = test.drop(columns=[north_col, east_col], errors='ignore')

    # NOTE(review): assumes the OOF file has exactly three columns ordered
    # (filename, north prediction, east prediction) — confirm with the producer.
    oof_df = pd.read_csv('../../output/oof_' + model_name + '.csv')
    oof_df.columns = ['Filename', north_col, east_col]
    train = pd.merge(train, oof_df, on='Filename', how='left')

    # Submission files are long-format with ids shaped "<Filename>:<North|East>".
    # NOTE(review): assumes exactly two columns (Id, prediction) — confirm.
    sub_df = pd.read_csv('../../output/sub_' + model_name + '.csv')

    # Split on the LAST ':' only, so a filename that itself contains ':' (or the
    # words "North"/"East") is neither truncated nor routed to the wrong target.
    id_parts = sub_df['Id'].str.rsplit(':', n=1)
    direction = id_parts.str[-1]
    sub_df = sub_df.assign(Id=id_parts.str[0])

    # `.copy()` gives each half its own frame, so renaming its columns never
    # touches (or warns about) the parent frame.
    sub_df_north = sub_df.loc[direction == 'North'].copy()
    sub_df_north.columns = ['Filename', north_col]
    sub_df_east = sub_df.loc[direction == 'East'].copy()
    sub_df_east.columns = ['Filename', east_col]

    # Back to wide format: one row per file with a north and an east column.
    sub_df = pd.merge(sub_df_north, sub_df_east, on='Filename', how='left')
    test = pd.merge(test, sub_df, on='Filename', how='left')


In [None]:
# Preview the first rows of the test frame after the prediction merges.
test.head(n=5)

In [None]:
# Stacking features = every column of `train` except identifiers, targets and
# fold/run bookkeeping columns. Column order of `train` is preserved.
train_columns = train.columns[
    ~train.columns.isin(
        ['Filename', 'North', 'East', 'fold', 'run_id', 'run_seq_id', 'Altitude', 'Delta']
    )
].tolist()

In [None]:
# Replace missing values with the -999 sentinel in BOTH frames. The original
# only filled `train`; any NaN left in a `test` feature would make
# `LinearRegression.predict` raise, and train/test preprocessing would differ.
# NOTE(review): a linear model treats -999 as a real magnitude, so a proper
# imputation (e.g. mean of the other base models) may be preferable — confirm.
train = train.fillna(-999)
test = test.fillna(-999)

In [None]:
# Number of cross-validation folds and the two regression targets.
n_folds = 5
targets = ['North', 'East']

# Zero-initialised buffers, one per target, for the out-of-fold predictions
# (aligned with `train`) and for the test predictions (aligned with `test`).
north_oof_preds, east_oof_preds = np.zeros(len(train)), np.zeros(len(train))
north_test_preds, east_test_preds = np.zeros(len(test)), np.zeros(len(test))

In [None]:
%%time
# Fit one linear stacker per target with fold-wise cross-validation.
# For each fold the model is trained on all other folds, predicts the held-out
# rows (out-of-fold predictions) and predicts the full test set; test
# predictions are averaged over the folds.

# Folds actually present in the data. The test average divides by this count
# rather than a hard-coded constant, so it stays correct for any fold layout.
fold_ids = train.fold.sort_values().unique()
fold_labels = train['fold'].to_numpy()

# Loop-invariant: the test feature matrix is the same for every fold/target.
test_x = test[train_columns]

# Number of repeated fits averaged per fold. An ordinary least-squares fit has
# no randomness, so values > 1 only matter if a stochastic model is swapped in.
execs = 1

for target in targets:
    y = train[target]
    oof_preds = np.zeros(len(train))
    test_preds_fold = np.zeros(len(test))
    score_folds = []

    for fold_ in fold_ids:
        print("Fold: {}".format(fold_))

        # Positional boolean masks: unlike index labels they remain valid even
        # if `train` does not carry a default 0..n-1 index.
        val_mask = fold_labels == fold_
        trn_mask = ~val_mask

        trn_x, trn_y = train.loc[trn_mask, train_columns], y.loc[trn_mask]
        val_x, val_y = train.loc[val_mask, train_columns], y.loc[val_mask]

        preds = np.zeros(len(val_x))
        test_preds_exec = np.zeros(len(test))

        for p in range(execs):
            print("Iteration: {}".format(p + 1))
            model = LinearRegression().fit(trn_x, trn_y)
            preds += model.predict(val_x) / execs
            test_preds_exec += model.predict(test_x) / execs

        test_preds_fold += test_preds_exec / len(fold_ids)
        oof_preds[val_mask] = preds

        fold_rmse = np.sqrt(mean_squared_error(val_y, preds))
        score_folds.append(fold_rmse)
        print("FOLD RMSE = {}".format(fold_rmse))

    print("############################################################")
    print("{} - MEAN RMSE = {}".format(target, np.mean(score_folds)))
    print("{} - OOF RMSE = {}".format(target, np.sqrt(mean_squared_error(y, oof_preds))))
    print(score_folds)

    # Publish the per-target results under the names later cells read.
    if target == 'North':
        north_oof_preds = oof_preds
        north_test_preds = test_preds_fold
    else:
        east_oof_preds = oof_preds
        east_test_preds = test_preds_fold

In [None]:
# Stack North then East into single 1-D arrays so both targets can be scored
# together; predictions and ground truth use the same ordering.
all_preds = np.concatenate((north_oof_preds, east_oof_preds))
all_target = np.concatenate((train['North'].to_numpy(), train['East'].to_numpy()))


In [None]:
# Pooled out-of-fold RMSE across both targets.
print(
    "ALL - OOF RMSE = {}".format(
        np.sqrt(mean_squared_error(y_true=all_target, y_pred=all_preds))
    )
)

In [None]:
# One two-column frame (Id, Predicted) per target, keyed by the test filename.
sub_north_test = (
    test[['Filename']]
    .rename(columns={'Filename': 'Id'})
    .assign(Predicted=north_test_preds)
)

sub_east_test = (
    test[['Filename']]
    .rename(columns={'Filename': 'Id'})
    .assign(Predicted=east_test_preds)
)

In [None]:
# Build the "<Filename>:<target>" ids expected in the submission file.
# The ids are derived from `test.Filename` instead of the frames' current `Id`
# values so that re-running this cell does not append the suffix a second time
# (e.g. "file:North:North"). Both frames share `test`'s index, so the
# assignment aligns row by row.
sub_north_test['Id'] = test['Filename'].astype(str) + ':North'
sub_east_test['Id'] = test['Filename'].astype(str) + ':East'

# Long-format submission: all North rows followed by all East rows.
sub_all_test = pd.concat([sub_north_test, sub_east_test])

In [None]:
# Persist the stacked predictions; only the two submission columns are
# written and the frame index is left out of the file.
sub_all_test.loc[:, ['Id', 'Predicted']].to_csv(
    '../../output/stacking_lr_final.csv',
    index=False,
)

In [None]:
# Show the combined North + East submission frame as the cell's output.
sub_all_test