In [1]:
%load_ext autoreload
%autoreload 2
%config InlineBackend.figure_format='retina'
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm, trange

In [3]:
datapath = Path('data')

In [4]:
df_train = pd.read_csv(datapath / 'train.csv')
df_train.head()
y_train = df_train['target']

In [5]:
df_test_with_synthetic_samples = pd.read_csv(datapath / 'test.csv')
df_test_with_synthetic_samples.head()

Unnamed: 0,ID_code,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,test_0,11.0656,7.7798,12.9536,9.4292,11.4327,-2.3805,5.8493,18.2675,2.1337,...,-2.1556,11.8495,-1.43,2.4508,13.7112,2.4669,4.3654,10.72,15.4722,-8.7197
1,test_1,8.5304,1.2543,11.3047,5.1858,9.1974,-4.0117,6.0196,18.6316,-4.4131,...,10.6165,8.8349,0.9403,10.1282,15.5765,0.4773,-1.4852,9.8714,19.1293,-20.976
2,test_2,5.4827,-10.3581,10.1407,7.0479,10.2628,9.8052,4.895,20.2537,1.5233,...,-0.7484,10.9935,1.9803,2.18,12.9813,2.1281,-7.1086,7.0618,19.8956,-23.1794
3,test_3,8.5374,-1.3222,12.022,6.5749,8.8458,3.1744,4.9397,20.566,3.3755,...,9.5702,9.0766,1.658,3.5813,15.1874,3.1656,3.9567,9.2295,13.0168,-4.2108
4,test_4,11.7058,-0.1327,14.1295,7.7506,9.1035,-8.5848,6.8595,10.6048,2.989,...,4.2259,9.1723,1.2835,3.3778,19.5542,-0.286,-5.1612,7.2882,13.926,-9.1846


## Remove synthetic rows from test

In [6]:
def dedup_test(df_test):
    unique_samples = []
    unique_count = np.zeros_like(df_test)
    for feature in trange(df_test.shape[1]):
        _, index_, count_ = np.unique(df_test[:, feature], return_counts=True, return_index=True)
        unique_count[index_[count_ == 1], feature] += 1

    # Samples which have unique values are real the others are fake
    real_samples_indexes = np.argwhere(np.sum(unique_count, axis=1) > 0)[:, 0]
#     synthetic_samples_indexes = np.argwhere(np.sum(unique_count, axis=1) == 0)[:, 0]
    return df_test[real_samples_indexes]

In [7]:
deduped_test = dedup_test(df_test_with_synthetic_samples.drop(columns=['ID_code']).values)

100%|██████████| 200/200 [00:04<00:00, 44.30it/s]


In [8]:
df_test = pd.DataFrame(deduped_test, columns=df_test_with_synthetic_samples.drop(columns=['ID_code']).columns)
df_test.head()

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,8.5374,-1.3222,12.022,6.5749,8.8458,3.1744,4.9397,20.566,3.3755,7.4578,...,9.5702,9.0766,1.658,3.5813,15.1874,3.1656,3.9567,9.2295,13.0168,-4.2108
1,17.3035,-2.4212,13.3989,8.3998,11.0777,9.6449,5.9596,17.8477,-4.8068,7.4643,...,4.4676,4.4214,0.9303,1.4994,15.2648,-1.7931,6.5316,10.4855,23.4631,0.7283
2,10.6137,-2.1898,8.909,3.8014,13.8602,-5.9802,5.5515,15.4716,-0.1714,7.6178,...,13.1683,4.0625,-0.1537,7.9787,18.4518,0.1,-7.8212,9.2355,15.0721,-7.3475
3,14.8595,-4.5378,13.6483,5.648,9.9144,1.519,5.0358,13.4524,-2.5419,9.445,...,2.6735,5.8526,4.8517,2.502,22.8224,-0.9325,8.6849,10.2848,17.4932,6.08
4,14.1732,-5.149,9.7591,3.7316,10.37,-21.9202,7.713,18.8749,0.468,7.8453,...,0.864,5.9058,1.314,4.8961,20.1087,1.1051,7.7184,9.3406,21.1746,-2.0098


## Create categorical features

In [9]:
def categorize_feature(var, df_train_0, df_train_1):
    var_s = f'var_{var}'
    
    # Category 1 (This value appears at least another time in data with target==1 and no 0;)
    _, unq_inv_1, unq_cnt_1 = np.unique(df_train_1[var_s], return_inverse=True, return_counts=True)
    categories1_mat_1 = np.zeros(len(df_train_1))
    categories1_mat_1[ unq_cnt_1[unq_inv_1] > 1 ] = 1
    
    # Category 2 (This value appears at least another time in data with target==0 and no 1;)
    _, unq_inv_0, unq_cnt_0 = np.unique(df_train_0[var_s], return_inverse=True, return_counts=True)
    categories2_mat_0 = np.zeros(len(df_train_0))
    categories2_mat_0[ unq_cnt_0[unq_inv_0] > 1 ] = 2
    
    # Category 3 (This value appears at least two more time in data with target==0 & 1;)
    idx_0 = np.in1d(df_train_0[var_s].values, df_train_1[var_s].values)
    idx_1 = np.in1d(df_train_1[var_s].values, df_train_0[var_s].values)
    categories3_mat_0 = np.zeros(len(df_train_0))
    categories3_mat_1 = np.zeros(len(df_train_1))
    categories3_mat_0[ idx_0 ] = 3
    categories3_mat_1[ idx_1 ] = 3
    categories3_mat = np.append(categories3_mat_0, categories3_mat_1)
    
    # Category 4: (This value is unique in data;)
    df_total = np.append(df_train_0[var_s], df_train_1[var_s])
    _, unq_inv, unq_cnt = np.unique(df_total, return_inverse=True, return_counts=True)
    categories4_mat = np.zeros(len(df_total))
    categories4_mat[ unq_cnt[unq_inv] == 1 ] = 4
    
    # Category 5: (This value is unique in data + test (only including real test samples);)
    df_train_test = np.append(df_total, df_test[var_s])
    _, unq_inv_tt, unq_cnt_tt = np.unique(df_train_test, return_inverse=True, return_counts=True)
    categories5_mat = np.zeros(len(df_train_test))
    categories5_mat[ unq_cnt_tt[unq_inv_tt] == 1 ] = 5
    categories5_mat = categories5_mat[0:len(df_total)]
    
    # Merge categories
    category_mat = np.append(categories2_mat_0, categories1_mat_1)
    category_mat[ categories3_mat == 3 ] = 3
    category_mat[ categories4_mat == 4 ] = 4
    category_mat[ categories5_mat == 5 ] = 5
    return category_mat

In [10]:
df_train_0 = df_train[df_train['target'] == 0]
df_train_1 = df_train[df_train['target'] == 1]

In [11]:
categorical_columns = np.zeros((200_000, 200), dtype=int)
for i in trange(200):
    categorical_columns[:,i] = categorize_feature(i, df_train_0, df_train_1)

100%|██████████| 200/200 [00:26<00:00,  7.39it/s]


In [12]:
len(np.where( categorical_columns[:,0] == 1)[0] ), \
len(np.where( categorical_columns[:,0] == 2)[0] ), \
len(np.where( categorical_columns[:,0] == 3)[0] ), \
len(np.where( categorical_columns[:,0] == 4)[0] ), \
len(np.where( categorical_columns[:,0] == 5)[0] )

(630, 113844, 45171, 17291, 23064)

In [13]:
len(np.where( categorical_columns[:,0] == 1)[0] ) + \
len(np.where( categorical_columns[:,0] == 2)[0] ) + \
len(np.where( categorical_columns[:,0] == 3)[0] ) + \
len(np.where( categorical_columns[:,0] == 4)[0] ) + \
len(np.where( categorical_columns[:,0] == 5)[0] )

200000

## Create extra "not unique feat"

In [14]:
def create_not_unique_feat(var, df_train_0, df_train_1):
    var_s = f'var_{var}'
    
    df_total = np.append(df_train_0[var_s], df_train_1[var_s])
    df_train_test = np.append(df_total, df_test[var_s])
    mean = df_train_test.mean()
    _, unq_inv_tt, unq_cnt_tt = np.unique(df_train_test, return_inverse=True, return_counts=True)
    df_train_test[ unq_cnt_tt[unq_inv_tt] == 1 ] = mean
    
    return df_train_test[0:200_000]

In [15]:
not_unique_columns = np.zeros((200_000, 200))
for i in trange(200):
    not_unique_columns[:,i] = create_not_unique_feat(i, df_train_0, df_train_1)

100%|██████████| 200/200 [00:07<00:00, 25.39it/s]


## Rebuild df_train with extra features

In [16]:
df_train_cat = pd.DataFrame(categorical_columns, columns=[f'cat_{i}' for i in range(200)])

In [17]:
df_train_fe = df_train_0.append(df_train_1)

In [19]:
df_train_fe = df_train_fe.join(df_train_cat)

In [20]:
df_train_not_uniq = pd.DataFrame(not_unique_columns, columns=[f'nu_{i}' for i in range(200)])

In [21]:
df_train_fe = df_train_fe.join(df_train_not_uniq)

In [22]:
df_train_fe.shape

(200000, 602)

In [23]:
df_train_fe.head()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,nu_190,nu_191,nu_192,nu_193,nu_194,nu_195,nu_196,nu_197,nu_198,nu_199
0,train_0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,train_1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,train_3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,train_4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,...,-1.4905,9.5214,1.927634,3.328807,17.992364,-1.5121,3.9267,9.5031,17.9974,-8.8104


In [24]:
df_train_fe.tail()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,nu_190,nu_191,nu_192,nu_193,nu_194,nu_195,nu_196,nu_197,nu_198,nu_199
199966,train_199966,1,13.5797,2.5526,6.0512,5.273,12.2182,-3.4048,7.3623,17.8372,...,3.221251,3.9147,0.5027,1.9833,20.9601,-2.4768,6.1954,9.3146,19.9228,-3.29878
199976,train_199976,1,7.9663,-2.8485,9.0919,7.3298,9.669,-16.7872,4.5094,12.4351,...,1.7653,8.6697,4.8305,3.328807,16.3678,-0.101,6.0606,9.523,17.6373,-3.29878
199981,train_199981,1,12.814,0.6386,14.1657,7.1044,8.9365,-0.3274,6.5949,14.6078,...,12.327,7.445552,4.6307,7.2337,15.4533,2.8233,5.9289,8.292,12.8102,-3.29878
199986,train_199986,1,12.0298,-8.78,7.7071,7.4015,9.2305,-16.2174,5.9064,17.9268,...,9.7605,5.8678,0.852,0.5064,20.6244,0.6213,-3.923,8.6071,18.9748,3.933
199990,train_199990,1,14.1475,1.8568,11.0066,3.6779,12.1944,-16.5936,5.3217,14.8508,...,0.455,6.5912,2.2296,5.8103,23.0054,-1.0048,6.8868,9.2086,16.3833,9.6348


In [26]:
df_train_fe.reset_index(drop=True).to_feather(datapath / 'train_with_fe.fth')