In [1]:
import os
import gc
from joblib import dump, load
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
from tqdm import tqdm
from scipy.stats import pearsonr
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import minmax_scale
from sklearn.neighbors import NearestNeighbors
import xgboost as xgb

# load data

In [2]:
train = pd.read_parquet("../input/train_base_shifted.parquet").reset_index(drop=True)
test = pd.read_parquet("../input/test_base_shifted.parquet").reset_index()

df = pd.concat([train, test], axis=0).reset_index(drop=True)
save_df = df.copy()

In [3]:
test.head()

Unnamed: 0,customer_ID,P_2_nanstd,P_2_nanmin,P_2_nanmax,P_2_last,D_39_nanstd,D_39_nanmin,D_39_nanmax,D_39_last,B_1_nanstd,...,D_63_nunique,D_64_count,D_64_last,D_64_nunique,D_66_count,D_66_last,D_66_nunique,D_68_count,D_68_last,D_68_nunique
0,-9223277493928322471,0.01266,0.720849,0.752822,0.743628,0.27735,0,1,0,0.010191,...,1,13,2,1,0,1,0,13,1,2
1,-9223220269070810982,0.048047,0.798196,0.902296,0.802795,0.0,0,0,0,0.013031,...,1,13,2,1,0,1,0,13,5,1
2,-9223219380479694318,0.016592,0.750023,0.810885,0.783128,0.599145,0,2,0,0.007154,...,1,13,0,1,0,1,0,13,5,1
3,-9223202973368451495,0.028179,0.681882,0.771254,0.771254,2.213015,0,8,0,0.003199,...,1,13,0,1,0,1,0,13,5,1
4,-9223190037945288673,0.025339,0.939252,1.007827,0.962337,7.434328,0,20,20,0.038536,...,1,13,0,2,0,1,0,13,2,2


In [4]:
train_size = len(train)

In [5]:
df["P_2_last"] = df["P_2_last"].fillna(np.nanmean(df["P_2_last"]))

In [6]:
importance = pd.read_csv("xgb_shifted_feature_importance.csv")

In [7]:
importance.head(20)

Unnamed: 0,feature,importance_0,importance_1,importance_2,importance_3,importance_4,importance
0,P_2_last,242.0,237.0,255.0,274.0,228.0,247.2
1,D_39_last,170.0,160.0,168.0,178.0,184.0,172.0
2,B_4_last,137.0,140.0,143.0,153.0,139.0,142.4
3,B_3_last,108.0,110.0,112.0,128.0,127.0,117.0
4,D_46_last,115.0,122.0,108.0,122.0,112.0,115.8
5,B_5_last,105.0,98.0,105.0,118.0,122.0,109.6
6,P_2_nanmean_0,101.0,109.0,95.0,103.0,119.0,105.4
7,P_2_nanmin,98.0,92.0,97.0,112.0,105.0,100.8
8,B_4_nanstd,105.0,111.0,89.0,105.0,93.0,100.6
9,R_1_last,93.0,98.0,84.0,106.0,99.0,96.0


# knn based on P_2_last

In [8]:
n_neighbors = 1000
nn = NearestNeighbors(n_neighbors=n_neighbors, p=2)
nn.fit(np.expand_dims(df["P_2_last"], axis=1))
neighbors = nn.kneighbors(np.expand_dims(df["P_2_last"], axis=1))[1]

In [9]:
neighbors.shape

(1383534, 1000)

# knn mean features

In [10]:
# important_features = importance["feature"].tolist()[:20] + ["target"]
important_features = [
    "P_2_last",
    "D_39_last",
    "B_4_last",
    "B_3_last",
    "D_46_last",
    "B_5_last",
    "R_1_last",
    "D_48_last",
    "B_1_last",
    "S_3_last",
    "D_43_last",
    "B_11_last",
    "D_44_last",
    "B_2_last",
]

target = [
    "target"
]

In [11]:
target_n_neighbors = [50, 100, 250, 500]

In [12]:
for n_neighbor in target_n_neighbors:
    
    feature_values = df[important_features].values
    feature_n_neighbor_mean_values = []
    
    target_values = df[target].values
    feature_n_neighbor_mean_target = []
    
    for sample_idx in tqdm(range(neighbors.shape[0])):
        
        sample_fitted_neighbor = []
        for neighbor_idx in neighbors[sample_idx, 1:]:
            
            if neighbor_idx < train_size:
                sample_fitted_neighbor.append(neighbor_idx)
                
            if len(sample_fitted_neighbor) == n_neighbor:
                break
                
        sample_fitted_neighbor = np.array(sample_fitted_neighbor)
#         feature_n_neighbor_mean_values.append(
#             feature_values[[sample_idx], :] / (np.nanmean(feature_values[sample_fitted_neighbor, :], axis=0, keepdims=True) + 1e-8)
#         )
        feature_n_neighbor_mean_values.append(np.nanmean(feature_values[sample_fitted_neighbor, :], axis=0, keepdims=True))
        feature_n_neighbor_mean_target.append(np.nanmean(target_values[sample_fitted_neighbor, :], axis=0, keepdims=True))
    
    feature_n_neighbor_mean_values = np.concatenate(feature_n_neighbor_mean_values, axis=0)
    feature_n_neighbor_mean_values = pd.DataFrame(feature_n_neighbor_mean_values)
    feature_n_neighbor_mean_values.columns = [(feature + "_knn_{}_mean".format(n_neighbor)) for feature in important_features]
    
    feature_n_neighbor_mean_target = np.concatenate(feature_n_neighbor_mean_target, axis=0)
    feature_n_neighbor_mean_target = pd.DataFrame(feature_n_neighbor_mean_target)
    feature_n_neighbor_mean_target.columns = ["target_knn_{}_mean".format(n_neighbor)]
    
    save_df = pd.concat([save_df, feature_n_neighbor_mean_values, feature_n_neighbor_mean_target], axis=1)

100%|██████████| 1383534/1383534 [03:26<00:00, 6684.59it/s]
100%|██████████| 1383534/1383534 [04:56<00:00, 4665.60it/s]
100%|██████████| 1383534/1383534 [09:03<00:00, 2545.06it/s]
100%|██████████| 1383534/1383534 [11:30<00:00, 2005.00it/s]


In [13]:
save_df.head()

Unnamed: 0,customer_ID,P_2_nanstd,P_2_nanmin,P_2_nanmax,P_2_last,D_39_nanstd,D_39_nanmin,D_39_nanmax,D_39_last,B_1_nanstd,...,B_5_last_knn_500_mean,R_1_last_knn_500_mean,D_48_last_knn_500_mean,B_1_last_knn_500_mean,S_3_last_knn_500_mean,D_43_last_knn_500_mean,B_11_last_knn_500_mean,D_44_last_knn_500_mean,B_2_last_knn_500_mean,target_knn_500_mean
0,-9223358381327749917,0.057145,0.340178,0.498727,0.387708,4.628507,0,16,0,0.048472,...,0.042347,0.169306,0.700322,0.260108,0.314619,0.254675,0.2376,2.63662,0.306458,0.586022
1,-9223193039457028513,0.013094,0.964483,1.002478,1.001372,0.0,0,0,0,0.001941,...,0.086096,0.005792,0.058166,0.026486,0.147974,0.060262,0.020217,0.002994,0.877075,0.002985
2,-9223189665817919541,0.038025,0.694073,0.828761,0.694073,0.0,0,0,0,0.002724,...,0.127609,0.027184,0.301515,0.104834,0.200762,0.126901,0.096392,0.355705,0.685637,0.07619
3,-9223188534444851899,0.002688,0.786647,0.794826,0.787945,0.0,0,0,0,0.00257,...,0.081756,0.015115,0.198332,0.050194,0.171331,0.099084,0.043029,0.100946,0.775764,0.024768
4,-9223173911659837606,0.078554,0.038207,0.252421,0.040486,6.144625,0,17,13,0.005226,...,0.016631,0.721172,0.894551,0.265244,0.332161,0.263078,0.236491,3.512456,0.15161,0.922819


In [14]:
save_df.tail()

Unnamed: 0,customer_ID,P_2_nanstd,P_2_nanmin,P_2_nanmax,P_2_last,D_39_nanstd,D_39_nanmin,D_39_nanmax,D_39_last,B_1_nanstd,...,B_5_last_knn_500_mean,R_1_last_knn_500_mean,D_48_last_knn_500_mean,B_1_last_knn_500_mean,S_3_last_knn_500_mean,D_43_last_knn_500_mean,B_11_last_knn_500_mean,D_44_last_knn_500_mean,B_2_last_knn_500_mean,target_knn_500_mean
1383529,9223311419908670169,0.028624,0.862442,0.953213,0.900472,3.404371,0,12,0,0.005543,...,0.102723,0.005615,0.104707,0.035499,0.151891,0.078683,0.028014,0.018127,0.845403,0.002959
1383530,9223316227884056852,0.076948,0.67448,0.920596,0.788452,5.48658,0,20,3,0.247958,...,0.122128,0.00817,0.201905,0.046835,0.172853,0.099028,0.039307,0.157895,0.808877,0.024922
1383531,9223317482642190638,0.01484,0.910732,0.950521,0.949891,0.0,0,0,0,0.003158,...,0.096389,0.007018,0.090063,0.02755,0.152756,0.069203,0.020791,0.002994,0.881296,0.0
1383532,9223341949877516615,0.053336,0.842613,0.975475,0.967907,0.0,0,0,0,0.003414,...,0.080601,0.00621,0.076322,0.025262,0.144505,0.082408,0.018707,0.0,0.878224,0.0
1383533,9223363807913010481,0.036598,0.435905,0.556915,0.556912,6.041523,0,14,0,0.032079,...,0.077126,0.075321,0.506052,0.207519,0.244738,0.169116,0.19593,1.028986,0.461901,0.267956


In [15]:
save_df.shape

(1383534, 1511)

In [16]:
train_with_knn = save_df.iloc[:train.shape[0]]
test_with_knn = save_df.iloc[train.shape[0]:]

In [17]:
corr, _ = pearsonr(train_with_knn["target"], train_with_knn["target_knn_500_mean"].fillna(0))
corr

0.687718916345599

In [18]:
corr, _ = pearsonr(train_with_knn["target"], train_with_knn["D_39_last"].fillna(0))
corr

0.3394948755924919

In [19]:
corr, _ = pearsonr(train_with_knn["target"], train_with_knn["P_2_last"].fillna(0))
corr

-0.653306551511406

In [20]:
corr, _ = pearsonr(train_with_knn["target"], train_with_knn["P_2_last_knn_500_mean"].fillna(0))
corr

-0.6656585236862348

In [21]:
train_with_knn.to_parquet("../input/train_base_shifted_knn.parquet")
test_with_knn.to_parquet("../input/test_base_shifted_knn.parquet")