In [1]:
## https://www.kaggle.com/tezdhar/breaking-lb-fresh-start

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import lightgbm as lgb
from sklearn.model_selection import *
from sklearn.metrics import mean_squared_error, make_scorer
from scipy.stats import mode, skew, kurtosis, entropy
from sklearn.ensemble import ExtraTreesRegressor

import matplotlib.pyplot as plt
import seaborn as sns

import dask.dataframe as dd
from dask.multiprocessing import get

from tqdm import tqdm, tqdm_notebook
# Calls tqdm.pandas with the tqdm_notebook object as a positional argument.
# NOTE(review): presumably intended to enable pandas `progress_apply` with a
# notebook-style bar; newer tqdm releases appear to accept only keyword
# arguments here — confirm against the installed tqdm version.
tqdm.pandas(tqdm_notebook)
from math import sqrt


In [3]:
# Directory holding the competition CSV files. Kept in one place so the
# notebook can be pointed at another location by editing a single line.
DATA_DIR = 'E:/Projects/Santander Value Prediction'

train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')

### Get features and target

In [4]:
# Ordered list of 40 column names used by get_pred.
# The order is significant: get_pred slices this list by position
# (FEATURES[:-lag], FEATURES[lag:], FEATURES[lag - 2]), so reordering or
# dropping an entry changes which rows match and which value is returned.
# NOTE(review): presumably the columns form a time-ordered sequence — confirm
# against the Kaggle kernel referenced at the top of the notebook.
FEATURES = ['f190486d6', '58e2e02e6', 'eeb9cd3aa', '9fd594eec', '6eef030c1', 
            '15ace8c9f', 'fb0f5dbfe', '58e056e12', '20aa07010', '024c577b9', 
            'd6bb78916', 'b43a7cfd5', '58232a6fb', '1702b5bf0', '324921c7b', 
            '62e59a501', '2ec5b290f', '241f0f867', 'fb49e4212', '66ace2992', 
            'f74e8f13d', '5c6487af1', '963a49cdc', '26fc93eb7', '1931ccfdd', 
            '703885424', '70feb1494', '491b9ee45', '23310aa6f', 'e176a204a', 
            '6619d81fc', '1db387535', 'fc99f9426', '91f701ba2', '0572565c2', 
            '190db8488', 'adb64ff71', 'c47340d97', 'c5a231d81', '0ff32eb98']

In [5]:
def get_pred(data, lag=2, features=None):
    """Look up a candidate target for each row by matching lag-shifted rows.

    For every row, the tuple of its values in ``features[:-lag]`` is the
    lookup key. It is matched against the tuple of values in
    ``features[lag:]`` of every row of ``data``. When exactly one row carries
    a matching tuple, that row's value in column ``features[lag - 2]`` is
    returned as the prediction for the looked-up row.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column named in ``features``.
    lag : int, default 2
        Offset between the two column windows. Must be at least 2: the
        prediction is read from ``features[lag - 2]`` and a negative index
        would silently wrap around to the end of the list.
    features : sequence of str, optional
        Ordered column names to slice. Defaults to the module-level
        ``FEATURES`` list.

    Returns
    -------
    pandas.Series
        One value per row of ``data``, indexed like ``data``; 0 where no
        unambiguous match exists.

    Raises
    ------
    ValueError
        If ``lag`` is smaller than 2.
    """
    if features is None:
        features = FEATURES
    if lag < 2:
        raise ValueError(f'lag must be >= 2, got {lag}')
    # A list (not a tuple) is required so the slices select several columns.
    features = list(features)

    lookup = data[features[:-lag]].apply(tuple, axis=1).to_frame().rename(columns={0: 'key'})
    shifted = data[features[lag:]].apply(tuple, axis=1).to_frame().rename(columns={0: 'key'})
    shifted['pred'] = data[features[lag - 2]]
    # Keys that occur more than once are ambiguous, so every copy is dropped.
    unambiguous = shifted[~shifted.duplicated(['key'], keep=False)]

    pred = lookup.merge(unambiguous, how='left', on='key').pred.fillna(0)
    # The merge returns a fresh 0..n-1 index. Right-hand keys are unique, so
    # the left join keeps one output row per input row in the original order
    # and the caller's index can be restored positionally.
    pred.index = data.index
    return pred

In [6]:
def get_all_pred(data, max_lag):
    """Merge the per-lag results of ``get_pred`` for lags 2 through ``max_lag``.

    Lags are visited in increasing order. A row keeps the first non-zero
    prediction it receives; later lags only fill rows that are still 0.

    Returns a Series built on ``data.index`` in which 0 means that no lag
    produced a value for that row.
    """
    combined = pd.Series(np.zeros(data.shape[0]), index=data.index)
    for lag in range(2, max_lag + 1):
        lag_pred = get_pred(data, lag)
        still_empty = combined == 0
        has_value = lag_pred != 0
        fillable = still_empty & has_value
        combined[fillable] = lag_pred[fillable]
    return combined

### Training Set Analysis 

In [7]:
# Score the lookup on the training rows it covers, for each maximum lag.
# Calling get_all_pred(train_df, max_lag) inside the loop would recompute
# lags 2..max_lag on every pass. Folding in one new lag per iteration gives
# the same pred_train at each step with a single get_pred call per lag.
pred_train = pd.Series(index=train_df.index, data=np.zeros(train_df.shape[0]))
for max_lag in range(2, 33):
    lag_pred = get_pred(train_df, max_lag)
    # Same precedence as get_all_pred: smaller lags win, larger lags only
    # fill rows that are still 0.
    fill_mask = (pred_train == 0) & (lag_pred != 0)
    pred_train[fill_mask] = lag_pred[fill_mask]
    have_data = pred_train != 0
    # Root-mean-squared error in log1p space, over the covered rows only.
    score = sqrt(mean_squared_error(np.log1p(train_df.target[have_data]),
                                    np.log1p(pred_train[have_data])))
    print(f'Max lag {max_lag}: Score = {score} on {have_data.sum()} out of {train_df.shape[0]} training samples')

Max lag 2: Score = 0.14119979229541013 on 1351 out of 4459 training samples
Max lag 3: Score = 0.11772801045868977 on 1947 out of 4459 training samples
Max lag 4: Score = 0.1700015225894321 on 2340 out of 4459 training samples
Max lag 5: Score = 0.16694209979909475 on 2586 out of 4459 training samples
Max lag 6: Score = 0.1617700665181974 on 2754 out of 4459 training samples
Max lag 7: Score = 0.1578320556062351 on 2899 out of 4459 training samples
Max lag 8: Score = 0.17261095273421487 on 3014 out of 4459 training samples
Max lag 9: Score = 0.19187680371761628 on 3110 out of 4459 training samples
Max lag 10: Score = 0.19145381804966508 on 3188 out of 4459 training samples
Max lag 11: Score = 0.1927802474230348 on 3237 out of 4459 training samples
Max lag 12: Score = 0.19219457876972026 on 3296 out of 4459 training samples
Max lag 13: Score = 0.19103885825602165 on 3336 out of 4459 training samples
Max lag 14: Score = 0.23096553241898027 on 3382 out of 4459 training samples
Max lag 15:

### Test Set Predictions

In [8]:
# Combine lags 2 through 29 on the test set; 0 marks rows with no prediction.
pred_test = get_all_pred(test_df, max_lag=29)
have_data = pred_test.ne(0)
print(f'Have predictions for {int(have_data.sum())} out of {len(test_df)} test_df samples')

Have predictions for 7803 out of 49342 test_df samples


In [9]:
# Template read as the starting point, and the file the result is written to.
SAMPLE_SUBMISSION_PATH = 'E:/Projects/Santander Value Prediction/sample_submission.csv'
SUBMISSION_PATH = 'new_submission.csv'

sub = pd.read_csv(SAMPLE_SUBMISSION_PATH)
# The assignment below aligns on the row index, and a row-count mismatch
# between the template and the predictions may not raise on its own.
assert len(sub) == len(pred_test), (
    f'sample submission has {len(sub)} rows but there are {len(pred_test)} test predictions')
# Overwrite the target only where a prediction was found; every other row
# keeps the value from the template.
sub.loc[have_data, 'target'] = pred_test[have_data]
sub.to_csv(SUBMISSION_PATH, index=False)