In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
print(os.listdir("../data"))

import lightgbm as lgb
from sklearn.model_selection import *
from sklearn.metrics import mean_squared_error, make_scorer
from scipy.stats import mode, skew, kurtosis, entropy
from sklearn.ensemble import ExtraTreesRegressor

import matplotlib.pyplot as plt
import seaborn as sns

import dask.dataframe as dd
from dask.multiprocessing import get

from tqdm import tqdm, tqdm_notebook
tqdm.pandas(tqdm_notebook)

# Any results you write to the current directory are saved as output.

['train.csv', 'test.csv', 'sample_submission.csv']


In [2]:
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")

transact_cols = [f for f in train.columns if f not in ["ID", "target"]]
y = np.log1p(train["target"]).values

In [3]:
cols = ['f190486d6', '58e2e02e6', 'eeb9cd3aa', '9fd594eec', '6eef030c1', '15ace8c9f', 
        'fb0f5dbfe', '58e056e12', '20aa07010', '024c577b9', 'd6bb78916', 'b43a7cfd5',
        '58232a6fb', '1702b5bf0', '324921c7b', '62e59a501', '2ec5b290f', '241f0f867',
        'fb49e4212', '66ace2992', 'f74e8f13d', '5c6487af1', '963a49cdc', '26fc93eb7',
        '1931ccfdd', '703885424', '70feb1494', '491b9ee45', '23310aa6f', 'e176a204a',
        '6619d81fc', '1db387535']

In [4]:
from multiprocessing import Pool
CPU_CORES = 1
def _get_leak(df, cols, lag=0):
    """ To get leak value, we do following:
       1. Get string of all values after removing first two time steps
       2. For all rows we shift the row by two steps and again make a string
       3. Just find rows where string from 2 matches string from 1
       4. Get 1st time step of row in 3 (Currently, there is additional condition to only fetch value if we got exactly one match in step 3)"""
    series_str = df[cols[lag+2:]].apply(lambda x: "_".join(x.round(2).astype(str)), axis=1)
    series_shifted_str = df[cols].shift(lag+2, axis=1)[cols[lag+2:]].apply(lambda x: "_".join(x.round(2).astype(str)), axis=1)
    target_rows = series_shifted_str.progress_apply(lambda x: np.where(x == series_str)[0])
    target_vals = target_rows.apply(lambda x: df.loc[x[0], cols[lag]] if len(x)==1 else 0)
    return target_vals

def get_all_leak(df, cols=None, nlags=15):
    """
    We just recursively fetch target value for different lags
    """
    df =  df.copy()
    #with Pool(processes=CPU_CORES) as p:
    #    res = [p.apply_async(_get_leak, args=(df, cols, i)) for i in range(nlags)]
    #    res = [r.get() for r in res]
    
    for i in range(nlags):
        print("Processing lag {}".format(i))
        df["leaked_target_"+str(i)] = _get_leak(df, cols, i)
    return df

In [5]:
test["target"] = train["target"].mean()

all_df = pd.concat([train[["ID", "target"] + cols], test[["ID", "target"]+ cols]]).reset_index(drop=True)
all_df.head()

Unnamed: 0,ID,target,f190486d6,58e2e02e6,eeb9cd3aa,9fd594eec,6eef030c1,15ace8c9f,fb0f5dbfe,58e056e12,...,963a49cdc,26fc93eb7,1931ccfdd,703885424,70feb1494,491b9ee45,23310aa6f,e176a204a,6619d81fc,1db387535
0,000d6aaf2,38000000.0,1866666.66,12066666.66,700000.0,600000.0,900000.0,4100000.0,0.0,0.0,...,13200000.0,3205000.0,2000000.0,0.0,1200000.0,0.0,0.0,0.0,400000.0,0.0
1,000fbd867,600000.0,0.0,2850000.0,2225000.0,1800000.0,800000.0,0.0,0.0,3300000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0027d6b71,10000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6000000.0,...,12000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0028cbf45,2000000.0,2000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,002a68644,14400000.0,0.0,0.0,0.0,0.0,37662000.0,0.0,4000000.0,6700000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
NLAGS = 40 #Increasing this might help push score a bit
all_df = get_all_leak(all_df, cols=cols, nlags=NLAGS)

Processing lag 0


100%|██████████| 53801/53801 [03:40<00:00, 243.75it/s]


Processing lag 1


100%|██████████| 53801/53801 [03:54<00:00, 229.46it/s]


Processing lag 2


100%|██████████| 53801/53801 [03:35<00:00, 249.59it/s]


Processing lag 3


100%|██████████| 53801/53801 [03:36<00:00, 253.53it/s]


Processing lag 4


100%|██████████| 53801/53801 [03:32<00:00, 253.67it/s]


Processing lag 5


100%|██████████| 53801/53801 [03:18<00:00, 270.46it/s]


Processing lag 6


100%|██████████| 53801/53801 [03:19<00:00, 269.95it/s]


Processing lag 7


100%|██████████| 53801/53801 [03:27<00:00, 259.31it/s]


Processing lag 8


100%|██████████| 53801/53801 [03:24<00:00, 263.14it/s]


Processing lag 9


100%|██████████| 53801/53801 [03:19<00:00, 269.04it/s]


Processing lag 10


100%|██████████| 53801/53801 [03:21<00:00, 267.37it/s]


Processing lag 11


100%|██████████| 53801/53801 [03:25<00:00, 261.18it/s]


Processing lag 12


100%|██████████| 53801/53801 [03:20<00:00, 268.63it/s]


Processing lag 13


100%|██████████| 53801/53801 [03:19<00:00, 269.29it/s]


Processing lag 14


100%|██████████| 53801/53801 [03:15<00:00, 274.87it/s]


Processing lag 15


100%|██████████| 53801/53801 [03:09<00:00, 283.80it/s]


Processing lag 16


100%|██████████| 53801/53801 [03:04<00:00, 291.45it/s]


Processing lag 17


100%|██████████| 53801/53801 [03:06<00:00, 289.21it/s]


Processing lag 18


100%|██████████| 53801/53801 [03:03<00:00, 293.03it/s]


Processing lag 19


100%|██████████| 53801/53801 [03:02<00:00, 295.24it/s]


Processing lag 20


100%|██████████| 53801/53801 [03:02<00:00, 294.04it/s]


Processing lag 21


100%|██████████| 53801/53801 [03:07<00:00, 286.73it/s]


Processing lag 22


100%|██████████| 53801/53801 [03:03<00:00, 293.32it/s]


Processing lag 23


100%|██████████| 53801/53801 [03:10<00:00, 281.95it/s]


Processing lag 24


100%|██████████| 53801/53801 [03:04<00:00, 290.97it/s]


Processing lag 25


100%|██████████| 53801/53801 [03:06<00:00, 288.75it/s]


Processing lag 26


100%|██████████| 53801/53801 [03:03<00:00, 293.93it/s]


Processing lag 27


100%|██████████| 53801/53801 [03:05<00:00, 290.51it/s]


Processing lag 28


100%|██████████| 53801/53801 [02:58<00:00, 302.01it/s]


Processing lag 29


100%|██████████| 53801/53801 [03:10<00:00, 281.81it/s]
  2%|▏         | 1097/53801 [00:00<00:04, 10961.41it/s]

Processing lag 30


100%|██████████| 53801/53801 [00:04<00:00, 11059.91it/s]
  2%|▏         | 1024/53801 [00:00<00:05, 10235.37it/s]

Processing lag 31


100%|██████████| 53801/53801 [00:04<00:00, 10960.71it/s]
  2%|▏         | 1080/53801 [00:00<00:04, 10798.80it/s]

Processing lag 32


100%|██████████| 53801/53801 [00:04<00:00, 11066.61it/s]
  2%|▏         | 1084/53801 [00:00<00:04, 10833.42it/s]

Processing lag 33


100%|██████████| 53801/53801 [00:04<00:00, 11073.58it/s]
  2%|▏         | 1104/53801 [00:00<00:04, 11037.30it/s]

Processing lag 34


100%|██████████| 53801/53801 [00:04<00:00, 10871.13it/s]
  2%|▏         | 1104/53801 [00:00<00:04, 11037.25it/s]

Processing lag 35


100%|██████████| 53801/53801 [00:04<00:00, 10821.23it/s]
  2%|▏         | 1054/53801 [00:00<00:05, 10531.12it/s]

Processing lag 36


100%|██████████| 53801/53801 [00:05<00:00, 10698.26it/s]
  2%|▏         | 1065/53801 [00:00<00:04, 10645.85it/s]

Processing lag 37


100%|██████████| 53801/53801 [00:05<00:00, 10648.24it/s]
  2%|▏         | 1101/53801 [00:00<00:04, 11001.30it/s]

Processing lag 38


100%|██████████| 53801/53801 [00:04<00:00, 10873.68it/s]
  2%|▏         | 1089/53801 [00:00<00:04, 10880.96it/s]

Processing lag 39


100%|██████████| 53801/53801 [00:04<00:00, 10997.95it/s]


In [7]:
leaky_cols = ["leaked_target_"+str(i) for i in range(NLAGS)]
train = train.join(all_df.set_index("ID")[leaky_cols], on="ID", how="left")
test = test.join(all_df.set_index("ID")[leaky_cols], on="ID", how="left")

In [8]:
train[["target"]+leaky_cols].head(10)

Unnamed: 0,target,leaked_target_0,leaked_target_1,leaked_target_2,leaked_target_3,leaked_target_4,leaked_target_5,leaked_target_6,leaked_target_7,leaked_target_8,...,leaked_target_30,leaked_target_31,leaked_target_32,leaked_target_33,leaked_target_34,leaked_target_35,leaked_target_36,leaked_target_37,leaked_target_38,leaked_target_39
0,38000000.0,38000000.0,38000000.0,38000000.0,0.0,38000000.0,0.0,38000000.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,600000.0,600000.0,0.0,0.0,0.0,0.0,0.0,600000.0,0.0,600000.0,...,0,0,0,0,0,0,0,0,0,0
2,10000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,2000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,14400000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
5,2800000.0,0.0,0.0,0.0,2800000.0,0.0,0.0,0.0,2800000.0,0.0,...,0,0,0,0,0,0,0,0,0,0
6,164000.0,0.0,0.0,0.0,164000.0,0.0,164000.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
7,600000.0,600000.0,0.0,0.0,0.0,0.0,0.0,600000.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
8,979000.0,979000.0,979000.0,979000.0,979000.0,979000.0,979000.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
9,460000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,460000.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
train["nonzero_mean"] = train[transact_cols].apply(lambda x: np.expm1(np.log1p(x[x!=0]).mean()), axis=1)
test["nonzero_mean"] = test[transact_cols].apply(lambda x: np.expm1(np.log1p(x[x!=0]).mean()), axis=1)

In [10]:
#We start with 1st lag target and recusrsively fill zero's
train["compiled_leak"] = 0
test["compiled_leak"] = 0
for i in range(NLAGS):
    train.loc[train["compiled_leak"] == 0, "compiled_leak"] = train.loc[train["compiled_leak"] == 0, "leaked_target_"+str(i)]
    test.loc[test["compiled_leak"] == 0, "compiled_leak"] = test.loc[test["compiled_leak"] == 0, "leaked_target_"+str(i)]
    
print("Leak values found in train and test ", sum(train["compiled_leak"] > 0), sum(test["compiled_leak"] > 0))
print("% of correct leaks values in train ", sum(train["compiled_leak"] == train["target"])/sum(train["compiled_leak"]))

train.loc[train["compiled_leak"] == 0, "compiled_leak"] = train.loc[train["compiled_leak"] == 0, "nonzero_mean"]
test.loc[test["compiled_leak"] == 0, "compiled_leak"] = test.loc[test["compiled_leak"] == 0, "nonzero_mean"]

Leak values found in train and test  3492 9128
% of correct leaks values in train  1.6393107212267198e-07


In [11]:
from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(y, np.log1p(train["compiled_leak"]).fillna(14.49)))

0.8098037456774047

In [12]:
#submission
sub = test[["ID"]]
sub["target"] = test["compiled_leak"]
sub.to_csv("/tmp/baseline_submission_with_leaks.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
