In [None]:
import os
import gc
import sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from glob import glob
from tqdm import tqdm
from datetime import datetime, date
from collections import defaultdict, Counter
from multiprocessing import cpu_count, Pool

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error

import lightgbm as lgb

import warnings
warnings.simplefilter('ignore')

In [None]:
# Directory that holds train.csv, relative to the notebook's working directory.
PATH = os.path.join('..', 'data')

# Join key used when feature tables are merged onto the train frame.
KEY = 'card_id'

# Fixed seed; the commented-out line below would draw a random one instead.
# Not referenced by the visible cells.
SEED = 18
# SEED = np.random.randint(9999)

# CPU count reported by multiprocessing; not referenced by the visible cells.
NTHREAD = cpu_count()

# Presumably the number of cross-validation folds — not referenced by the
# visible cells, TODO confirm.
NFOLD = 4

In [None]:
# Names of the feature files to merge onto train.
# os.listdir returns entries in arbitrary order; sorting keeps the merge order
# (and with it the column order, the _x/_y suffix assignment and which member
# of a duplicated pair is reported later) reproducible across runs and machines.
features = sorted(os.listdir('../feature'))

In [None]:
# Start from the raw train table, then left-join every feature table on KEY.
train = pd.read_csv(os.path.join(PATH, 'train.csv'))

feature_dir = os.path.join('..', 'feature')
for fname in features:
    print(f'Merge: {fname}', end=' ')
    feature_table = pd.read_pickle(os.path.join(feature_dir, fname))
    train = train.merge(feature_table, on=KEY, how='left')
    print('Done!!')

# Purchase-date columns that are cast to int64 and scaled by 1e-9.
# Presumably these are datetime64[ns] values, which would make the result
# epoch seconds — TODO confirm against the feature pickles.
# NOTE(review): the left join can leave missing values in these columns;
# confirm how the int64 cast treats them.
date_cols = [
    'hist_purchase_date_max', 'hist_purchase_date_min',
    'N_hist_auth_purchase_date_max', 'N_hist_auth_purchase_date_min',
    'Y_hist_auth_purchase_date_max', 'Y_hist_auth_purchase_date_min',
#     'new_purchase_date_max', 'new_purchase_date_min',
    'N_new_auth_purchase_date_max', 'N_new_auth_purchase_date_min',
    'Y_new_auth_purchase_date_max', 'Y_new_auth_purchase_date_min',
#     'union_purchase_date_max', 'union_purchase_date_min',
    'N_union_auth_purchase_date_max', 'N_union_auth_purchase_date_min',
    'Y_union_auth_purchase_date_max', 'Y_union_auth_purchase_date_min',
]
for col in date_cols:
    train[col] = train[col].astype(np.int64) * 1e-9

In [None]:
# Show every column label present after the merges.
train.columns

In [None]:
# Distinct column labels; a shorter result than the listing above means some
# labels occur more than once.
train.columns.unique()

In [None]:
# Number of occurrences of each column label.
cnt = Counter(train.columns)

In [None]:
# Print the 14 most frequent column labels, each quoted and followed by a
# comma, one per line.
for label, _count in cnt.most_common(14):
    print(f"'{label}',")

In [None]:
# Suffixed (_x / _y) column labels that are removed from train before the
# duplicate scan.
drop_cols = [
    # authorized-flag columns
    'N_authorized_flag_x', 'Y_authorized_flag_x',
    'N_authorized_flag_y', 'Y_authorized_flag_y',
    # union transaction counts
    'union_transactions_count_x', 'union_transactions_count_y',
    # hist month lag / diff statistics, _x variants
    'hist_month_lag_mean_x', 'hist_month_lag_std_x',
    'hist_month_diff_mean_x', 'hist_month_diff_std_x',
    # hist month lag / diff statistics, _y variants
    'hist_month_lag_mean_y', 'hist_month_lag_std_y',
    'hist_month_diff_mean_y', 'hist_month_diff_std_y',
]

In [None]:
train.drop(drop_cols, axis=1, inplace=True)

In [None]:
train.columns.unique()

In [None]:
train = train.loc[:, ~(train.dtypes == object)]

In [None]:
train.head()

In [None]:
del train['target']

In [None]:
gc.collect()

In [None]:
train = train.fillna(0)

In [None]:
def drop_duplicated_columns(train):
    """Find columns whose values exactly duplicate an earlier column.

    Every column is compared with each column to its right. When two columns
    hold identical values in every row, the pair is printed and the label of
    the right-hand column is collected. The frame itself is not modified.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame to scan. The callers in this notebook pass a single-dtype slice.

    Returns
    -------
    set
        Labels of the columns that duplicate a column positioned before them.
        Empty when the frame has no rows (there is nothing to compare).
    """
    drop_cols = set()
    if len(train) == 0:
        # np.min / np.max are undefined on empty arrays.
        return drop_cols

    columns = list(train.columns)

    # Compute each column's minimum and maximum once instead of once per pair.
    lows, highs = [], []
    for col in columns:
        col_values = train[col].values
        lows.append(np.min(col_values))
        highs.append(np.max(col_values))

    for ei, ci in enumerate(tqdm(columns)):
        ti = train[ci].values
        for ej in range(ei + 1, len(columns)):
            # Cheap pre-filter: identical columns must share both extrema.
            # (The maximum has to be compared with the other column's maximum;
            # comparing it with the minimum only ever lets constant columns
            # through.) NaN extrema never compare equal, matching the
            # element-wise comparison below.
            if lows[ei] != lows[ej] or highs[ei] != highs[ej]:
                continue
            cj = columns[ej]
            if np.array_equal(ti, train[cj].values):
                print(ci, cj)
                drop_cols.add(cj)

    return drop_cols

In [None]:
train.dtypes.unique()

In [None]:
train_int64 = train.loc[:, (train.dtypes == 'int64')]
int64 = drop_duplicated_columns(train_int64)
del train_int64

In [None]:
train_float64 = train.loc[:, (train.dtypes == 'float64')]
float64 = drop_duplicated_columns(train_float64)
del train_float64

In [None]:
train_float16 = train.loc[:, (train.dtypes == 'float16')]
float16 = drop_duplicated_columns(train_float16)
del train_float16

In [None]:
train_float32 = train.loc[:, (train.dtypes == 'float32')]
float32 = drop_duplicated_columns(train_float32)
del train_float32

In [None]:
train_int8 = train.loc[:, (train.dtypes == 'int8')]
int8 = drop_duplicated_columns(train_int8)
del train_int8

In [None]:
train_int16 = train.loc[:, (train.dtypes == 'int16')]
int16 = drop_duplicated_columns(train_int16)
del train_int16

In [None]:
train_uint8 = train.loc[:, (train.dtypes == 'uint8')]
uint8 = drop_duplicated_columns(train_uint8)
del train_uint8

In [None]:
# Labels flagged by the float64 scan.
set(float64)

In [None]:
# Labels flagged by the float16 scan.
set(float16)

In [None]:
# Labels flagged by the float32 scan.
set(float32)

In [None]:
# Preview the float64 duplicates as a one-column frame. The column label is
# passed as a list: a set is unordered and is not a valid axis specification
# for the DataFrame constructor in newer pandas releases.
pd.DataFrame(list(set(float64)), columns=['duplicated_columns']).head()

In [None]:
# Labels flagged by the int64 scan.
int64

In [None]:
# Labels flagged by the float16 scan.
float16

In [None]:
# Labels flagged by the float32 scan.
float32

In [None]:
# Labels flagged by the float64 scan.
float64

In [None]:
# Combine the labels flagged by every per-dtype scan with the 14 suffixed
# labels that were removed from train earlier (same names as drop_cols).
duplicated_columns = set().union(
    float16, float32, float64,
    int8, int16, int64,
    uint8,
    [
        'N_authorized_flag_x',
        'Y_authorized_flag_x',
        'N_authorized_flag_y',
        'Y_authorized_flag_y',
        'union_transactions_count_x',
        'union_transactions_count_y',
        'hist_month_lag_mean_x',
        'hist_month_lag_std_x',
        'hist_month_diff_mean_x',
        'hist_month_diff_std_x',
        'hist_month_lag_mean_y',
        'hist_month_lag_std_y',
        'hist_month_diff_mean_y',
        'hist_month_diff_std_y',
    ],
)

In [None]:
len(duplicated_columns)

In [None]:
duplicated = pd.DataFrame(list(duplicated_columns), columns={'duplicated_columns'})

In [None]:
duplicated.head()

In [None]:
duplicated.to_csv('../py/duplicated_columns.py')

In [None]:
# Column count of the cleaned frame (the flagged duplicates are not dropped
# from train in the visible cells).
len(train.columns)

In [None]:
# Columns whose dtype is '<M8[ns]', i.e. NumPy datetime64 with nanosecond unit.
train_M = train.loc[:, (train.dtypes == '<M8[ns]')]

In [None]:
train_M

In [None]:
# The collected labels as a NumPy array.
duplicated.duplicated_columns.values