In [9]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from tqdm import tqdm

In [10]:
train = pd.read_feather("../input/train.ftr")
test = pd.read_feather("../input/test.ftr")

transact_cols = [f for f in train.columns if f not in ["ID", "target"]]
y = np.log1p(train["target"]).values

In [11]:
cols = ['f190486d6', '58e2e02e6', 'eeb9cd3aa', '9fd594eec', '6eef030c1', '15ace8c9f', 
        'fb0f5dbfe', '58e056e12', '20aa07010', '024c577b9', 'd6bb78916', 'b43a7cfd5', 
        '58232a6fb', '1702b5bf0', '324921c7b', '62e59a501', '2ec5b290f', '241f0f867', 
        'fb49e4212', '66ace2992', 'f74e8f13d', '5c6487af1', '963a49cdc', '26fc93eb7', 
        '1931ccfdd', '703885424', '70feb1494', '491b9ee45', '23310aa6f', 'e176a204a', 
        '6619d81fc', '1db387535', 
        'fc99f9426', '91f701ba2', '0572565c2', '190db8488', 'adb64ff71', 'c47340d97', 'c5a231d81', '0ff32eb98'
       ]

In [12]:
colset_filename = os.listdir('../input/continue_col/')
colset_filename_40 = [fn for fn in colset_filename if fn.split('.')[0].split('_')[1] == '40']

In [13]:
colset = []
for c in colset_filename_40:
    colset += list(np.load('../input/continue_col/' + c))[::-1]

colset_lis = []
for fn in colset_filename_40:
    colset_lis.append(list(np.load('../input/continue_col/' + fn)[::-1]))

In [14]:
len(colset_lis)

90

# find leak train

In [15]:
def fast_get_leak(df, cols, colset_lis, lag=0):
    f1 = []  
    f2 = []
    for col_set in colset_lis:
        f1 += col_set[:-lag-2]
        f2 += col_set[lag+2:]
    
    d1 = df[f1].apply(tuple, axis=1).to_frame().rename(columns={0: 'key'})
    d2 = df[f2].apply(tuple, axis=1).to_frame().rename(columns={0: 'key'})
    
    d2['pred'] = df[cols[lag]]
    d3 = d1[~d1.duplicated(['key'], keep=False)]
    d4 = d2[~d2.duplicated(['key'], keep=False)]
    d5 = d4.merge(d3, how='inner', on='key')
    return d1.merge(d5, how='left', on='key').pred.fillna(0)

In [16]:
def compiled_leak_result():
    
    max_nlags = 38 
    train_leak = train[["ID", "target"] + colset]
    train_leak["compiled_leak"] = 0
    
    mask = train.target.notnull()
    
    scores = []
    leaky_value_counts = []
    leaky_value_corrects = []
    leaky_cols = []
    
    for i in tqdm(range(max_nlags)):
        c = "leaked_target_"+str(i)
        
        print('Processing lag', i)
        train_leak[c] = fast_get_leak(train_leak, cols, colset_lis, lag=i)
        
        leaky_cols.append(c)
        train_leak = train.join(
            train_leak.set_index("ID")[leaky_cols+["compiled_leak"]], 
            on="ID", how="left"
        )
        
        zeroleak = train_leak["compiled_leak"]==0
        train_leak.loc[zeroleak, "compiled_leak"] = train_leak.loc[zeroleak, c]

        leaky_value_counts.append(sum(train_leak[mask]["compiled_leak"] > 0))
        _correct_counts = sum(train_leak[mask]["compiled_leak"]==train_leak[mask]["target"])
        leaky_value_corrects.append(_correct_counts/leaky_value_counts[-1])
        print("Leak values found in train", leaky_value_counts[-1])
        print(
            "% of correct leaks values in train ", 
            leaky_value_corrects[-1]
        )
    
    result = dict(
        leaky_count=leaky_value_counts,
        leaky_correct=leaky_value_corrects,
    )
    return train_leak, result

In [17]:
train_leak, result = compiled_leak_result()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
  0%|                                                                                           | 0/38 [00:00<?, ?it/s]

Processing lag 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Leak values found in train 1390
% of correct leaks values in train  1.0


  3%|██▏                                                                                | 1/38 [00:06<04:18,  6.98s/it]

Processing lag 1
Leak values found in train 2016
% of correct leaks values in train  1.0


  5%|████▎                                                                              | 2/38 [00:13<04:05,  6.82s/it]

Processing lag 2
Leak values found in train 2413
% of correct leaks values in train  1.0


  8%|██████▌                                                                            | 3/38 [00:20<03:55,  6.74s/it]

Processing lag 3
Leak values found in train 2674
% of correct leaks values in train  1.0


 11%|████████▋                                                                          | 4/38 [00:26<03:46,  6.65s/it]

Processing lag 4
Leak values found in train 2848
% of correct leaks values in train  1.0


 13%|██████████▉                                                                        | 5/38 [00:32<03:36,  6.55s/it]

Processing lag 5
Leak values found in train 3002
% of correct leaks values in train  1.0


 16%|█████████████                                                                      | 6/38 [00:38<03:26,  6.46s/it]

Processing lag 6
Leak values found in train 3124
% of correct leaks values in train  1.0


 18%|███████████████▎                                                                   | 7/38 [00:44<03:17,  6.38s/it]

Processing lag 7
Leak values found in train 3226
% of correct leaks values in train  1.0


 21%|█████████████████▍                                                                 | 8/38 [00:50<03:08,  6.29s/it]

Processing lag 8
Leak values found in train 3316
% of correct leaks values in train  1.0


 24%|███████████████████▋                                                               | 9/38 [00:55<02:59,  6.20s/it]

Processing lag 9
Leak values found in train 3370
% of correct leaks values in train  1.0


 26%|█████████████████████▌                                                            | 10/38 [01:01<02:51,  6.11s/it]

Processing lag 10
Leak values found in train 3433
% of correct leaks values in train  1.0


 29%|███████████████████████▋                                                          | 11/38 [01:06<02:42,  6.03s/it]

Processing lag 11
Leak values found in train 3479
% of correct leaks values in train  1.0


 32%|█████████████████████████▉                                                        | 12/38 [01:11<02:34,  5.95s/it]

Processing lag 12
Leak values found in train 3525
% of correct leaks values in train  1.0


 34%|████████████████████████████                                                      | 13/38 [01:16<02:26,  5.87s/it]

Processing lag 13
Leak values found in train 3572
% of correct leaks values in train  1.0


 37%|██████████████████████████████▏                                                   | 14/38 [01:21<02:19,  5.79s/it]

Processing lag 14
Leak values found in train 3612
% of correct leaks values in train  1.0


 39%|████████████████████████████████▎                                                 | 15/38 [01:25<02:11,  5.71s/it]

Processing lag 15
Leak values found in train 3641
% of correct leaks values in train  1.0


 42%|██████████████████████████████████▌                                               | 16/38 [01:30<02:03,  5.64s/it]

Processing lag 16
Leak values found in train 3661
% of correct leaks values in train  1.0


 45%|████████████████████████████████████▋                                             | 17/38 [01:34<01:56,  5.55s/it]

Processing lag 17
Leak values found in train 3687
% of correct leaks values in train  1.0


 47%|██████████████████████████████████████▊                                           | 18/38 [01:38<01:49,  5.47s/it]

Processing lag 18
Leak values found in train 3710
% of correct leaks values in train  1.0


 50%|█████████████████████████████████████████                                         | 19/38 [01:42<01:42,  5.39s/it]

Processing lag 19
Leak values found in train 3730
% of correct leaks values in train  1.0


 53%|███████████████████████████████████████████▏                                      | 20/38 [01:46<01:35,  5.31s/it]

Processing lag 20
Leak values found in train 3746
% of correct leaks values in train  1.0


 55%|█████████████████████████████████████████████▎                                    | 21/38 [01:49<01:28,  5.22s/it]

Processing lag 21
Leak values found in train 3765
% of correct leaks values in train  1.0


 58%|███████████████████████████████████████████████▍                                  | 22/38 [01:53<01:22,  5.14s/it]

Processing lag 22
Leak values found in train 3782
% of correct leaks values in train  1.0


 61%|█████████████████████████████████████████████████▋                                | 23/38 [01:56<01:15,  5.06s/it]

Processing lag 23
Leak values found in train 3799
% of correct leaks values in train  1.0


 63%|███████████████████████████████████████████████████▊                              | 24/38 [01:59<01:09,  4.98s/it]

Processing lag 24
Leak values found in train 3812
% of correct leaks values in train  1.0


 66%|█████████████████████████████████████████████████████▉                            | 25/38 [02:02<01:03,  4.90s/it]

Processing lag 25
Leak values found in train 3820
% of correct leaks values in train  1.0


 68%|████████████████████████████████████████████████████████                          | 26/38 [02:05<00:57,  4.82s/it]

Processing lag 26
Leak values found in train 3832
% of correct leaks values in train  1.0


 71%|██████████████████████████████████████████████████████████▎                       | 27/38 [02:08<00:52,  4.74s/it]

Processing lag 27
Leak values found in train 3839
% of correct leaks values in train  1.0


 74%|████████████████████████████████████████████████████████████▍                     | 28/38 [02:10<00:46,  4.66s/it]

Processing lag 28
Leak values found in train 3846
% of correct leaks values in train  1.0


 76%|██████████████████████████████████████████████████████████████▌                   | 29/38 [02:13<00:41,  4.59s/it]

Processing lag 29
Leak values found in train 3853
% of correct leaks values in train  1.0


 79%|████████████████████████████████████████████████████████████████▋                 | 30/38 [02:15<00:36,  4.51s/it]

Processing lag 30
Leak values found in train 3860
% of correct leaks values in train  1.0


 82%|██████████████████████████████████████████████████████████████████▉               | 31/38 [02:17<00:30,  4.43s/it]

Processing lag 31
Leak values found in train 3863
% of correct leaks values in train  1.0


 84%|█████████████████████████████████████████████████████████████████████             | 32/38 [02:19<00:26,  4.35s/it]

Processing lag 32
Leak values found in train 3868
% of correct leaks values in train  1.0


 87%|███████████████████████████████████████████████████████████████████████▏          | 33/38 [02:20<00:21,  4.27s/it]

Processing lag 33
Leak values found in train 3877
% of correct leaks values in train  1.0


 89%|█████████████████████████████████████████████████████████████████████████▎        | 34/38 [02:22<00:16,  4.19s/it]

Processing lag 34
Leak values found in train 3883
% of correct leaks values in train  1.0


 92%|███████████████████████████████████████████████████████████████████████████▌      | 35/38 [02:24<00:12,  4.12s/it]

Processing lag 35
Leak values found in train 3886
% of correct leaks values in train  1.0


 95%|█████████████████████████████████████████████████████████████████████████████▋    | 36/38 [02:25<00:08,  4.04s/it]

Processing lag 36
Leak values found in train 3886
% of correct leaks values in train  1.0


 97%|███████████████████████████████████████████████████████████████████████████████▊  | 37/38 [02:26<00:03,  3.97s/it]

Processing lag 37
Leak values found in train 3887
% of correct leaks values in train  1.0


100%|██████████████████████████████████████████████████████████████████████████████████| 38/38 [02:27<00:00,  3.89s/it]


In [18]:
result = pd.DataFrame.from_dict(result, orient='columns')

In [20]:
result.tail()

Unnamed: 0,leaky_count,leaky_correct
33,3877,1.0
34,3883,1.0
35,3886,1.0
36,3886,1.0
37,3887,1.0


# find leak test

In [26]:
non_ugly_indexes = np.load('../input/test_non_ugly_indexes.npy')
test = test.iloc[non_ugly_indexes].reset_index(drop=True)

In [27]:
test['target'] = np.nan

In [28]:
test_leak = test[["ID", "target"] + colset]
test_leak["compiled_leak"] = 0
leaky_cols = []
max_nlags = 38
for i in tqdm(range(max_nlags)):
    c = "leaked_target_"+str(i)

    print('Processing lag', i)
    test_leak[c] = fast_get_leak(test_leak, cols, colset_lis, lag=i)

    leaky_cols.append(c)
    test_leak = test.join(
        test_leak.set_index("ID")[leaky_cols+["compiled_leak"]], 
        on="ID", how="left"
    )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
  0%|                                                                                           | 0/38 [00:00<?, ?it/s]

Processing lag 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
  3%|██▏                                                                                | 1/38 [00:23<14:43, 23.87s/it]

Processing lag 1


  5%|████▎                                                                              | 2/38 [00:47<14:23, 24.00s/it]

Processing lag 2


  8%|██████▌                                                                            | 3/38 [01:11<13:53, 23.82s/it]

Processing lag 3


 11%|████████▋                                                                          | 4/38 [01:34<13:22, 23.62s/it]

Processing lag 4


 13%|██████████▉                                                                        | 5/38 [01:57<12:52, 23.41s/it]

Processing lag 5


 16%|█████████████                                                                      | 6/38 [02:18<12:20, 23.15s/it]

Processing lag 6


 18%|███████████████▎                                                                   | 7/38 [02:40<11:49, 22.89s/it]

Processing lag 7


 21%|█████████████████▍                                                                 | 8/38 [03:00<11:18, 22.62s/it]

Processing lag 8


 24%|███████████████████▋                                                               | 9/38 [03:21<10:48, 22.36s/it]

Processing lag 9


 26%|█████████████████████▌                                                            | 10/38 [03:40<10:18, 22.09s/it]

Processing lag 10


 29%|███████████████████████▋                                                          | 11/38 [04:00<09:49, 21.83s/it]

Processing lag 11


 32%|█████████████████████████▉                                                        | 12/38 [04:18<09:20, 21.54s/it]

Processing lag 12


 34%|████████████████████████████                                                      | 13/38 [04:36<08:51, 21.28s/it]

Processing lag 13


 37%|██████████████████████████████▏                                                   | 14/38 [04:53<08:23, 21.00s/it]

Processing lag 14


 39%|████████████████████████████████▎                                                 | 15/38 [05:10<07:56, 20.71s/it]

Processing lag 15


 42%|██████████████████████████████████▌                                               | 16/38 [05:26<07:29, 20.43s/it]

Processing lag 16


 45%|████████████████████████████████████▋                                             | 17/38 [05:41<07:01, 20.09s/it]

Processing lag 17


 47%|██████████████████████████████████████▊                                           | 18/38 [05:55<06:35, 19.76s/it]

Processing lag 18


 50%|█████████████████████████████████████████                                         | 19/38 [06:09<06:09, 19.42s/it]

Processing lag 19


 53%|███████████████████████████████████████████▏                                      | 20/38 [06:21<05:43, 19.10s/it]

Processing lag 20


 55%|█████████████████████████████████████████████▎                                    | 21/38 [06:34<05:19, 18.78s/it]

Processing lag 21


 58%|███████████████████████████████████████████████▍                                  | 22/38 [06:45<04:55, 18.45s/it]

Processing lag 22


 61%|█████████████████████████████████████████████████▋                                | 23/38 [06:57<04:32, 18.14s/it]

Processing lag 23


 63%|███████████████████████████████████████████████████▊                              | 24/38 [07:07<04:09, 17.82s/it]

Processing lag 24


 66%|█████████████████████████████████████████████████████▉                            | 25/38 [07:17<03:47, 17.51s/it]

Processing lag 25


 68%|████████████████████████████████████████████████████████                          | 26/38 [07:27<03:26, 17.20s/it]

Processing lag 26


 71%|██████████████████████████████████████████████████████████▎                       | 27/38 [07:35<03:05, 16.89s/it]

Processing lag 27


 74%|████████████████████████████████████████████████████████████▍                     | 28/38 [07:44<02:45, 16.58s/it]

Processing lag 28


 76%|██████████████████████████████████████████████████████████████▌                   | 29/38 [07:51<02:26, 16.27s/it]

Processing lag 29


 79%|████████████████████████████████████████████████████████████████▋                 | 30/38 [07:58<02:07, 15.97s/it]

Processing lag 30


 82%|██████████████████████████████████████████████████████████████████▉               | 31/38 [08:05<01:49, 15.66s/it]

Processing lag 31


 84%|█████████████████████████████████████████████████████████████████████             | 32/38 [08:11<01:32, 15.36s/it]

Processing lag 32


 87%|███████████████████████████████████████████████████████████████████████▏          | 33/38 [08:17<01:15, 15.06s/it]

Processing lag 33


 89%|█████████████████████████████████████████████████████████████████████████▎        | 34/38 [08:21<00:59, 14.76s/it]

Processing lag 34


 92%|███████████████████████████████████████████████████████████████████████████▌      | 35/38 [08:26<00:43, 14.47s/it]

Processing lag 35


 95%|█████████████████████████████████████████████████████████████████████████████▋    | 36/38 [08:30<00:28, 14.17s/it]

Processing lag 36


 97%|███████████████████████████████████████████████████████████████████████████████▊  | 37/38 [08:33<00:13, 13.88s/it]

Processing lag 37


100%|██████████████████████████████████████████████████████████████████████████████████| 38/38 [08:36<00:00, 13.59s/it]


In [29]:
test_leak["compiled_leak"] = 0
NLAGS = 38
for i in range(NLAGS):
    test_leak.loc[test_leak["compiled_leak"] == 0, "compiled_leak"] = test_leak.loc[test_leak["compiled_leak"] == 0, "leaked_target_"+str(i)]

In [30]:
sum(test_leak.compiled_leak != 0)

7851

In [32]:
test_leak.reset_index(drop=True).to_feather('../input/leak_test.ftr')