In [22]:
import numpy as np
import pandas as pd
import gc, os

import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

from multiprocessing import cpu_count
from tqdm import tqdm

In [23]:
# ===== fake samples =====
# For every test row, flag which of its feature values occur exactly once
# within their column of the test set.
te_ = (
    pd.read_csv('../../datasets/santander_customer_transaction_prediction/test.csv.zip')
    .drop(columns=['ID_code'])
    .values
)

unique_samples = []
unique_count = np.zeros_like(te_)
for col in tqdm(range(te_.shape[1])):
    # np.unique returns (values, first-occurrence row, occurrence count)
    _, first_pos, occurrences = np.unique(te_[:, col], return_index=True, return_counts=True)
    singleton_rows = first_pos[occurrences == 1]
    unique_count[singleton_rows, col] += 1

# Rows holding at least one column-unique value are treated as real;
# rows with none are treated as synthetic (fake).
n_unique_per_row = np.sum(unique_count, axis=1)
real_samples_indexes = np.flatnonzero(n_unique_per_row > 0)
synthetic_samples_indexes = np.flatnonzero(n_unique_per_row == 0)


100%|██████████| 200/200 [00:03<00:00, 54.96it/s]


In [24]:
# Fixed seed so that "Restart Kernel -> Run All" reproduces the same results.
# (Drawing the seed itself from the unseeded global RNG, without recording it,
# makes a run impossible to repeat.) Change this constant to vary runs.
SEED = 42
np.random.seed(SEED)

# =============================================================================
# load
# =============================================================================
DATA_DIR = "../../datasets/santander_customer_transaction_prediction"

train = pd.read_csv(f"{DATA_DIR}/train.csv.zip")
# Remove the test rows flagged as synthetic. The values passed to drop() are
# row positions, which coincide with the labels of the default RangeIndex of
# the freshly read frame.
test = pd.read_csv(f"{DATA_DIR}/test.csv.zip").drop(synthetic_samples_indexes)

# Feature columns are selected by position: everything after the first two
# train columns (presumably ID_code and target -- verify column order) ...
X_train = train.iloc[:, 2:].values
y_train = train.target.values

# ... and everything after the first test column (presumably ID_code).
X_test = test.iloc[:, 1:].values

# Number of raw variables, derived from the data instead of being hard-coded
# so it cannot silently disagree with the loaded files.
var_len = X_train.shape[1]

# Train rows first, then the remaining test rows.
X = np.concatenate([X_train, X_test], axis=0)
# X_train / X_test are left in memory; the cleanup below is disabled.
# del X_train, X_test; gc.collect()

In [25]:
# reverse_list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 11, 15, 16, 18, 19, 22, 24, 25, 26,
#                 27, 29, 32, 35, 37, 40, 41, 47, 48, 49, 51, 52, 53, 55, 60, 61,
#                 62, 65, 66, 67, 69, 70, 71, 74, 78, 79, 82, 84, 89, 90, 91, 94,
#                 95, 96, 97, 99, 103, 105, 106, 110, 111, 112, 118, 119, 125, 128,
#                 130, 133, 134, 135, 137, 138, 140, 144, 145, 147, 151, 155, 157,
#                 159, 161, 162, 163, 164, 167, 168, 170, 171, 173, 175, 176, 179,
#                 180, 181, 184, 185, 187, 189, 190, 191, 195, 196, 199,
                
#                 ]

# for j in reverse_list:
#     X[:, j] *= -1


# scaling
# Standardise every column of X (zero mean, unit variance per column).
scaler = StandardScaler()
X = scaler.fit_transform(X)


def _value_frequency(values):
    """Return, for each element of `values`, how often that value occurs in `values`.

    Uses ``Series.value_counts`` (the top-level ``pd.value_counts`` is
    deprecated in recent pandas releases).
    """
    series = pd.Series(values)
    return series.map(series.value_counts())


# count encoding
# Four frequency columns per variable j:
#   X_cnt[:, 4*j]     -> occurrences of the exact scaled value
#   X_cnt[:, 4*j + i] -> occurrences of the value rounded to (i + 1) decimals,
#                        for i = 1, 2, 3 (i.e. 2, 3 and 4 decimals)
X_cnt = np.zeros((len(X), var_len * 4))

for j in tqdm(range(var_len)):
    X_cnt[:, 4*j] = _value_frequency(X[:, j])
    for i in range(1, 4):
        X_cnt[:, 4*j + i] = _value_frequency(np.round(X[:, j], i + 1))

# raw + count feature
# Keep the scaled matrix under a clearer name. Rebinding is enough: X is
# reassigned to a brand-new array right below, so no copy of the (large)
# matrix is needed.
X_raw = X

# Interleave so that each variable j owns 5 consecutive columns:
#   X[:, 5*j]             -> scaled raw value
#   X[:, 5*j+1 : 5*j+5]   -> its 4 count-encoding columns from X_cnt
X = np.zeros((len(X_raw), var_len * 5))
for j in tqdm(range(var_len)):
    X[:, 5*j] = X_raw[:, j]
    X[:, 5*j+1:5*j+5] = X_cnt[:, 4*j:4*j+4]

# Treat every variable as the same feature: stack the 5-column block of each
# variable vertically (train rows only) and append the variable id as a 6th
# column, giving var_len * n_train rows in total.
n_train = len(y_train)  # train rows come first in X; replaces the hard-coded 200000

# Fill a preallocated array instead of concatenating a list of var_len
# temporaries, which would hold a second full copy of the result in memory.
X_train_concat = np.empty((n_train * var_len, 6))
for cnum in range(var_len):
    rows = slice(cnum * n_train, (cnum + 1) * n_train)
    X_train_concat[rows, :5] = X[:n_train, 5*cnum:5*cnum+5]
    X_train_concat[rows, 5] = cnum

# Labels repeated once per variable, in the same block order as above.
y_train_concat = np.tile(y_train, var_len)

100%|██████████| 200/200 [00:39<00:00,  5.05it/s]
100%|██████████| 200/200 [00:03<00:00, 54.40it/s]


In [27]:
# Sanity check: expect (var_len * len(y_train), 6) -- 5 feature columns plus the variable-id column.
X_train_concat.shape

(40000000, 6)