This code is modified from https://www.kaggle.com/kyakovlev/ieee-lgbm-with-groupkfold-cv as a benchmark.  
If convenient, please upvote this kernel.

In [1]:
# General imports
import numpy as np
import pandas as pd
import os, sys, gc, warnings, random, datetime

from sklearn import metrics
from sklearn.model_selection import train_test_split, KFold, GroupKFold, TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder

from tqdm import tqdm


import math
warnings.filterwarnings('ignore')

In [2]:
########################### Helpers
#################################################################################
## Seeder
# :seed to make all processes deterministic     # type: int
def seed_everything(seed=0):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

In [3]:
# ########################### Model
# import lightgbm as lgb

# def make_predictions(tr_df, tt_df, features_columns, target, lgb_params, NFOLDS=2):
#     #folds = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)
#     folds = GroupKFold(n_splits=NFOLDS)
#     X,y = tr_df[features_columns], tr_df[target]    
#     P,P_y = tt_df[features_columns], tt_df[target]  

#     tt_df = tt_df[['TransactionID',target]]    
#     predictions = np.zeros(len(tt_df))
    
#     for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
#         print('Fold:',fold_)
#         tr_x, tr_y = X.iloc[trn_idx,:], y[trn_idx]
#         vl_x, vl_y = X.iloc[val_idx,:], y[val_idx]
            
#         print(len(tr_x),len(vl_x))
#         tr_data = lgb.Dataset(tr_x, label=tr_y)

#         if LOCAL_TEST:
#             vl_data = lgb.Dataset(P, label=P_y) 
#         else:
#             vl_data = lgb.Dataset(vl_x, label=vl_y)  

#         estimator = lgb.train(
#             lgb_params,
#             tr_data,
#             valid_sets = [tr_data, vl_data],
#             verbose_eval = 200,
#         )   
        
#         pp_p = estimator.predict(P)
#         predictions += pp_p/NFOLDS

#         if LOCAL_TEST:
#             feature_imp = pd.DataFrame(sorted(zip(estimator.feature_importance(),X.columns)), columns=['Value','Feature'])
#             print(feature_imp)
# #         feature_imp = pd.DataFrame(sorted(zip(estimator.feature_importance(),X.columns)), columns=['Value','Feature'])
# #         print(feature_imp)
        
#         del tr_x, tr_y, vl_x, vl_y, tr_data, vl_data
#         gc.collect()
        
#         feature_imp = pd.DataFrame(sorted(zip(estimator.feature_importance(),X.columns)), columns=['Value','Feature'])
#         feature_imp.to_csv("feature.csv")
#     tt_df['prediction'] = predictions
    
#     return tt_df
# ## -------------------
########################### Model
import lightgbm as lgb

def make_predictions(tr_df, tt_df, features_columns, target, lgb_params, NFOLDS=2):
    
    #folds = TimeSeriesSplit(n_splits=NFOLDS)
    folds = GroupKFold(n_splits=NFOLDS)

    X,y = tr_df[features_columns], tr_df[target]    
    P,P_y = tt_df[features_columns], tt_df[target]  
    split_groups = tr_df['DT_M']

    tt_df = tt_df[['TransactionID',target]]    
    predictions = np.zeros(len(tt_df))
    oof = np.zeros(len(tr_df))
    
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y, groups=split_groups)):
        print('Fold:',fold_)
        tr_x, tr_y = X.iloc[trn_idx,:], y[trn_idx]
        vl_x, vl_y = X.iloc[val_idx,:], y[val_idx]
            
        print(len(tr_x),len(vl_x))
        tr_data = lgb.Dataset(tr_x, label=tr_y)
        vl_data = lgb.Dataset(vl_x, label=vl_y)  

        estimator = lgb.train(
            lgb_params,
            tr_data,
            valid_sets = [tr_data, vl_data],
            verbose_eval = 200,
        )   
        
        pp_p = estimator.predict(P)
        predictions += pp_p/NFOLDS
        
        oof_preds = estimator.predict(vl_x)
        oof[val_idx] = (oof_preds - oof_preds.min())/(oof_preds.max() - oof_preds.min())

        if LOCAL_TEST:
            feature_imp = pd.DataFrame(sorted(zip(estimator.feature_importance(),X.columns)), columns=['Value','Feature'])
            print(feature_imp)

        feature_imp = pd.DataFrame(sorted(zip(estimator.feature_importance(),X.columns)), columns=['Value','Feature'])
        feature_imp.to_csv("feature.csv",index = False)         
            
        del tr_x, tr_y, vl_x, vl_y, tr_data, vl_data
        gc.collect()
        
    tt_df['prediction'] = predictions
    print('OOF AUC:', metrics.roc_auc_score(y, oof))
    if LOCAL_TEST:
        print('Holdout AUC:', metrics.roc_auc_score(tt_df[TARGET], tt_df['prediction']))
    
    return tt_df

In [4]:
########################### Vars
#################################################################################
SEED = 42
seed_everything(SEED)
LOCAL_TEST = False
TARGET = 'isFraud'
START_DATE = datetime.datetime.strptime('2017-12-01', '%Y-%m-%d')

In [5]:
########################### DATA LOAD
#################################################################################
print('Load Data')
train_df = pd.read_pickle('data\train_transaction.pkl')

if LOCAL_TEST:
    
    # Convert TransactionDT to "Month" time-period. 
    # We will also drop penultimate block 
    # to "simulate" test set values difference
    train_df['DT_M'] = train_df['TransactionDT'].apply(lambda x: (START_DATE + datetime.timedelta(seconds = x)))
    train_df['DT_M'] = (train_df['DT_M'].dt.year-2017)*12 + train_df['DT_M'].dt.month 
    test_df = train_df[train_df['DT_M']==train_df['DT_M'].max()].reset_index(drop=True)
    train_df = train_df[train_df['DT_M']<(train_df['DT_M'].max()-1)].reset_index(drop=True)
    
    train_identity = pd.read_pickle('data\train_identity.pkl')
    test_identity  = train_identity[train_identity['TransactionID'].isin(
                                    test_df['TransactionID'])].reset_index(drop=True)
    train_identity = train_identity[train_identity['TransactionID'].isin(
                                    train_df['TransactionID'])].reset_index(drop=True)
    del train_df['DT_M'], test_df['DT_M']
    
else:
    test_df = pd.read_pickle('data\test_transaction.pkl')
    train_identity = pd.read_pickle('data\train_identity.pkl')
    test_identity = pd.read_pickle('data\test_identity.pkl')
    
base_columns = list(train_df) + list(train_identity)
print('Shape control:', train_df.shape, test_df.shape)

Load Data
Shape control: (590540, 394) (506691, 394)


In [6]:
########################### All features columns
#################################################################################
## Main Data
# 'TransactionID',                     -> This is pure noise, we cannot use this column as feature
# 'isFraud',                           -> Our Target
# 'TransactionDT',                     -> Time from reference time point. VERY valuable column
# 'TransactionAmt',                    -> Many unique values and has to be combined with other columns
#                                         The best score boost should come from 
#                                         TransactionDT->TransactionAmt combination
# 'ProductCD',                         -> 100% categorical feature options to use:
#                                         Frequency encoding/Target encoding/
#                                         Combinations with other columns/Model categorical feature
# 'card1' - 'card6',                   -> Categorical features with information about Client
# 'addr1' - 'addr2',                   -> add2 - Country / addr1 - subzone
# 'dist1' - 'dist2',                   -> dist2 - Country distance / dist1 - local distance from merchant
# 'P_emaildomain' - 'R_emaildomain',   -> Categorical feature. It's possible to make 
#                                         subgroup feature from it or general group
# 'C1' - 'C14'                         -> Counts. Should be numerical features (all ints?)
# 'D1' - 'D15'                         
# 'M1' - 'M9'
# 'V1' - 'V339'

## Identity Data
# 'TransactionID'
# 'id_01' - 'id_38'
# 'DeviceType',
# 'DeviceInfo'

In [7]:
for df in [train_df, test_df]:
    df['is_na_sum1'] = df.isna().sum(axis = 1)
    df['not_na_sum1'] = df.notna().sum(axis = 1)
for df in [train_identity,test_identity]:
    df['is_na_sum2'] = df.isna().sum(axis = 1)
    df['not_na_sum2'] = df.notna().sum(axis = 1)

In [8]:
# for df in [train_df, test_df]:
#     df['P_is_protonmail'] = 0
#     df['P_is_protonmail'][(df['P_emaildomain'] == 'protonmail.com')|(df['P_emaildomain'] == 'mail.com')|(df['P_emaildomain'] == 'outlook.es')|(df['P_emaildomain'] == 'aim.com')|(df['P_emaildomain'] == 'outlook.com')] = 1
#     df['R_is_protonmail'] = 0
#     df['R_is_protonmail'][(df['R_emaildomain'] == 'protonmail.com')|(df['R_emaildomain'] == 'mail.com')|(df['R_emaildomain'] == 'netzero.net')|(df['R_emaildomain'] == 'outlook.com')|(df['R_emaildomain'] == 'outlook.es')|(df['R_emaildomain'] == 'icloud.com')|(df['R_emaildomain'] == 'gmail.com')] = 1   

In [9]:
#train_df['isFraud'].mean()

In [10]:
# emain_domain = set(list(train_df['P_emaildomain'].value_counts().keys()) + list(train_df['R_emaildomain'].value_counts().keys()))
# l = []
# for emain in emain_domain:
#     all_ = (train_df['R_emaildomain'] == emain).sum()
#     sample = train_df['isFraud'][train_df['R_emaildomain'] == emain].mean()
#     l.append([emain,all_,sample])

# l.sort(key = lambda x:x[2],reverse = True)
# [i[0] for i in l if i[2] > 0.1]    
# l

In [11]:
########################### D9 and TransactionDT
# Let's add temporary "time variables" for aggregations
# and add normal "time variables"

# Also, seems that D9 column is an hour
# and it is the same as df['DT'].dt.hour
for df in [train_df, test_df]:
    # Temporary
    df['DT'] = df['TransactionDT'].apply(lambda x: (START_DATE + datetime.timedelta(seconds = x)))
    df['DT_M'] = (df['DT'].dt.year-2017)*12 + df['DT'].dt.month
    df['DT_W'] = (df['DT'].dt.year-2017)*52 + df['DT'].dt.weekofyear
    df['DT_D'] = (df['DT'].dt.year-2017)*365 + df['DT'].dt.dayofyear
    
    df['DT_hour'] = df['DT'].dt.hour
    df['DT_day_week'] = df['DT'].dt.dayofweek
    df['DT_day'] = df['DT'].dt.day
    
    # D9 column
    df['D9'] = np.where(df['D9'].isna(),0,1)

In [12]:
for df in [train_df,test_df]:
    df['D_re'] = (df['TransactionDT'] / 86400).apply(np.round)
    df['first_date'] = df['D_re'] - df['D1']
    df['first_date2'] = df['D_re'] - df['D4']
    df.drop('D_re',axis = 1,inplace = True)

In [13]:
# temp_df = pd.concat([train_df[['DT_hour','isFraud']],test_df[['DT_hour','isFraud']]])
# dic = dict()
# for i in range(24):
#     dic[i] = (temp_df['DT_hour'] == i).sum()
# for df in [train_df, test_df]:
#     # Temporary
#     df['TransactionPerHour'] = df['DT_hour'].map(dic)

In [14]:
temp_df = pd.concat([train_df[['TransactionDT','isFraud']],test_df[['TransactionDT','isFraud']]])
temp_df['hours'] = temp_df['TransactionDT'] // 3600
groups = temp_df.groupby('hours')
dic = dict()
for name,group in groups:
    dic[name] = group.count()[0]

In [15]:
for df in [train_df, test_df]:
    df['hours'] = df['TransactionDT'] // 3600
    df['TransactionPerHour'] = df['hours'].map(dic)
    df.drop(['hours'],axis = 1,inplace = True)

In [16]:
########################### Reset values for "noise" card1
i_cols = ['card1']

for col in i_cols: 
    print(col)
    valid_card = pd.concat([train_df[[col]], test_df[[col]]])
    valid_card = valid_card[col].value_counts()
    valid_card = valid_card[valid_card>2]
    valid_card = list(valid_card.index)

    train_df[col] = np.where(train_df[col].isin(test_df[col]), train_df[col], np.nan)
    test_df[col]  = np.where(test_df[col].isin(train_df[col]), test_df[col], np.nan)

    train_df[col] = np.where(train_df[col].isin(valid_card), train_df[col], np.nan)
    test_df[col]  = np.where(test_df[col].isin(valid_card), test_df[col], np.nan)
    
i_cols = ['DeviceInfo']    
for col in i_cols: 
    print(col)
    valid_card = pd.concat([train_identity[[col]], test_identity[[col]]])
    valid_card = valid_card[col].value_counts()
    valid_card = valid_card[valid_card>2]
    valid_card = list(valid_card.index)

    train_identity[col] = np.where(train_identity[col].isin(test_identity[col]), train_identity[col], np.nan)
    test_identity[col]  = np.where(test_identity[col].isin(train_identity[col]), test_identity[col], np.nan)

    train_identity[col] = np.where(train_identity[col].isin(valid_card), train_identity[col], np.nan)
    test_identity[col]  = np.where(test_identity[col].isin(valid_card), test_identity[col], np.nan)
    
for col in ['card2','card3','card4','card5','card6',]: 
    train_df[col] = np.where(train_df[col].isin(test_df[col]), train_df[col], np.nan)
    test_df[col]  = np.where(test_df[col].isin(train_df[col]), test_df[col], np.nan)

card1
DeviceInfo


In [17]:
########################### M columns (except M4)
# All these columns are binary encoded 1/0
# We can have some features from it
i_cols = ['M1','M2','M3','M5','M6','M7','M8','M9']

for df in [train_df, test_df]:
    df['M_sum'] = df[i_cols].sum(axis=1).astype(np.int8)
    df['M_na'] = df[i_cols].isna().sum(axis=1).astype(np.int8)

In [18]:
# ########################### ProductCD and M4 Target mean
# #for col in ['ProductCD','M4','P_emaildomain','R_emaildomain']:
# for col in ['ProductCD','M4']:
#     temp_dict = train_df.groupby([col])[TARGET].agg(['mean']).reset_index().rename(
#                                                         columns={'mean': col+'_target_mean'})
#     temp_dict.index = temp_dict[col].values
#     temp_dict = temp_dict[col+'_target_mean'].to_dict()

#     train_df[col+'_target_mean'] = train_df[col].map(temp_dict)
#     test_df[col+'_target_mean']  = test_df[col].map(temp_dict)

In [19]:
#经cv和lb都检验无效，暂时删去
# for df in [train_df,test_df]:
#     df['TransactionAmt_decimal'] = ((df['TransactionAmt'] - df['TransactionAmt'].astype(int)) * 1000).astype(int)

In [20]:
# def values_normalization(dt_df, periods, columns):
#     for period in periods:
#         for col in columns:
#             new_col = col +'_'+ period
#             dt_df[col] = dt_df[col].astype(float)  

#             temp_min = dt_df.groupby([period])[col].agg(['min']).reset_index()
#             temp_min.index = temp_min[period].values
#             temp_min = temp_min['min'].to_dict()

#             temp_max = dt_df.groupby([period])[col].agg(['max']).reset_index()
#             temp_max.index = temp_max[period].values
#             temp_max = temp_max['max'].to_dict()

#             temp_mean = dt_df.groupby([period])[col].agg(['mean']).reset_index()
#             temp_mean.index = temp_mean[period].values
#             temp_mean = temp_mean['mean'].to_dict()

#             temp_std = dt_df.groupby([period])[col].agg(['std']).reset_index()
#             temp_std.index = temp_std[period].values
#             temp_std = temp_std['std'].to_dict()

#             dt_df['temp_min'] = dt_df[period].map(temp_min)
#             dt_df['temp_max'] = dt_df[period].map(temp_max)
#             dt_df['temp_mean'] = dt_df[period].map(temp_mean)
#             dt_df['temp_std'] = dt_df[period].map(temp_std)

#             dt_df[new_col+'_min_max'] = (dt_df[col]-dt_df['temp_min'])/(dt_df['temp_max']-dt_df['temp_min'])
#             dt_df[new_col+'_std_score'] = (dt_df[col]-dt_df['temp_mean'])/(dt_df['temp_std'])
#             del dt_df['temp_min'],dt_df['temp_max'],dt_df['temp_mean'],dt_df['temp_std']
#     return dt_df
# periods = ['DT_D','DT_W','DT_M']
# for df in [train_df, test_df]:
#     df = values_normalization(df, periods, ['TransactionAmt'])

In [21]:
########################### Let's play "sudoku" and fill nans in cards columns
#以card1作为基准，来补全其他card
i_cols = ['TransactionID','card1','card2','card3','card4','card5','card6']

full_df = pd.concat([train_df[i_cols], test_df[i_cols]])

## I've used frequency encoding before so we have ints here
## we will drop very rare cards
full_df['card6'] = np.where(full_df['card6']==30, np.nan, full_df['card6'])
full_df['card6'] = np.where(full_df['card6']==16, np.nan, full_df['card6'])

i_cols = ['card2','card3','card4','card5','card6']

## We will find best match for nan values and fill with it
for col in i_cols:
    temp_df = full_df.groupby(['card1',col])[col].agg(['count']).reset_index()
    temp_df = temp_df.sort_values(by=['card1','count'], ascending=False).reset_index(drop=True)
    del temp_df['count']
    temp_df = temp_df.drop_duplicates(keep='first').reset_index(drop=True)
    temp_df.index = temp_df['card1'].values
    temp_df = temp_df[col].to_dict()
    full_df[col] = np.where(full_df[col].isna(), full_df['card1'].map(temp_df), full_df[col])
    
    
i_cols = ['card1','card2','card3','card4','card5','card6']
for col in i_cols:
    train_df[col] = full_df[full_df['TransactionID'].isin(train_df['TransactionID'])][col].values
    test_df[col] = full_df[full_df['TransactionID'].isin(test_df['TransactionID'])][col].values

In [22]:
########################### TransactionAmt

# Let's add some kind of client uID based on cardID ad addr columns
# The value will be very specific for each client so we need to remove it
# from final feature. But we can use it for aggregations.
train_df['uid'] = train_df['card1'].astype(str)+'_'+train_df['card2'].astype(str)
test_df['uid'] = test_df['card1'].astype(str)+'_'+test_df['card2'].astype(str)

train_df['uid2'] = train_df['uid'].astype(str)+'_'+train_df['card3'].astype(str)+'_'+train_df['card5'].astype(str)
test_df['uid2'] = test_df['uid'].astype(str)+'_'+test_df['card3'].astype(str)+'_'+test_df['card5'].astype(str)

train_df['uid3'] = train_df['uid2'].astype(str)+'_'+train_df['addr1'].astype(str)+'_'+train_df['addr2'].astype(str)
test_df['uid3'] = test_df['uid2'].astype(str)+'_'+test_df['addr1'].astype(str)+'_'+test_df['addr2'].astype(str)

##
train_df['uid4'] = train_df['uid3'].astype(str)+'_'+train_df['P_emaildomain'].astype(str)
test_df['uid4'] = test_df['uid3'].astype(str)+'_'+test_df['P_emaildomain'].astype(str)

train_df['uid5'] = train_df['uid3'].astype(str)+'_'+train_df['R_emaildomain'].astype(str)
test_df['uid5'] = test_df['uid3'].astype(str)+'_'+test_df['R_emaildomain'].astype(str)

train_df['uid7'] = train_df['card1'].astype(str)+'_'+train_df['addr1'].astype(str)+'_'+train_df['addr2'].astype(str)+'_'+train_df['P_emaildomain'].astype(str)+'_'+train_df['first_date'].astype(str)
test_df['uid7'] = test_df['card1'].astype(str)+'_'+test_df['addr1'].astype(str)+'_'+test_df['addr2'].astype(str)+'_'+test_df['P_emaildomain'].astype(str)+'_'+test_df['first_date'].astype(str)

train_df['uid8'] = train_df['card1'].astype(str)+'_'+train_df['addr1'].astype(str)+'_'+train_df['addr2'].astype(str)+'_'+train_df['P_emaildomain'].astype(str)+'_'+train_df['first_date2'].astype(str)
test_df['uid8'] = test_df['card1'].astype(str)+'_'+test_df['addr1'].astype(str)+'_'+test_df['addr2'].astype(str)+'_'+test_df['P_emaildomain'].astype(str)+'_'+test_df['first_date2'].astype(str)

##

# Check if the Transaction Amount is common or not (we can use freq encoding here)
# In our dialog with a model we are telling to trust or not to these values   
train_df['TransactionAmt_check'] = np.where(train_df['TransactionAmt'].isin(test_df['TransactionAmt']), 1, 0)
test_df['TransactionAmt_check']  = np.where(test_df['TransactionAmt'].isin(train_df['TransactionAmt']), 1, 0)

# For our model current TransactionAmt is a noise
# https://www.kaggle.com/kyakovlev/ieee-check-noise
# (even if features importances are telling contrariwise)
# There are many unique values and model doesn't generalize well
# Lets do some aggregations
i_cols = ['card1','card2','card3','card5','uid','uid2','uid3','uid4','uid5','addr1','uid7','uid8']
data_cols = ['V307','TransactionAmt','D15','dist1']
#data_cols = ['TransactionAmt','D15','dist1','D4']
#data_cols = ['TransactionAmt']

for col_data in data_cols:
    for col in i_cols:
        print([col_data,col])
        for agg_type in ['mean', 'std','max','min','median','nunique','skew']:
            new_col_name = col+'_' + col_data + '_' +agg_type
            temp_df = pd.concat([train_df[[col, col_data]], test_df[[col,col_data]]])
            temp_df = temp_df.groupby([col])[col_data].agg([agg_type]).reset_index().rename(
                                                    columns={agg_type: new_col_name})

            temp_df.index = list(temp_df[col])
            temp_df = temp_df[new_col_name].to_dict()   

            train_df[new_col_name] = train_df[col].map(temp_df)
            test_df[new_col_name]  = test_df[col].map(temp_df)

    
  

['V307', 'card1']
['V307', 'card2']
['V307', 'card3']
['V307', 'card5']
['V307', 'uid']
['V307', 'uid2']
['V307', 'uid3']
['V307', 'uid4']
['V307', 'uid5']
['V307', 'addr1']
['V307', 'uid7']
['V307', 'uid8']
['TransactionAmt', 'card1']
['TransactionAmt', 'card2']
['TransactionAmt', 'card3']
['TransactionAmt', 'card5']
['TransactionAmt', 'uid']
['TransactionAmt', 'uid2']
['TransactionAmt', 'uid3']
['TransactionAmt', 'uid4']
['TransactionAmt', 'uid5']
['TransactionAmt', 'addr1']
['TransactionAmt', 'uid7']
['TransactionAmt', 'uid8']
['D15', 'card1']
['D15', 'card2']
['D15', 'card3']
['D15', 'card5']
['D15', 'uid']
['D15', 'uid2']
['D15', 'uid3']
['D15', 'uid4']
['D15', 'uid5']
['D15', 'addr1']
['D15', 'uid7']
['D15', 'uid8']
['dist1', 'card1']
['dist1', 'card2']
['dist1', 'card3']
['dist1', 'card5']
['dist1', 'uid']
['dist1', 'uid2']
['dist1', 'uid3']
['dist1', 'uid4']
['dist1', 'uid5']
['dist1', 'addr1']
['dist1', 'uid7']
['dist1', 'uid8']


In [23]:
# for col in i_cols:
#     new_col_name = col+'_TransactionAmt_'+ "cumsum"
#     temp_df = pd.concat([train_df[[col, 'TransactionAmt']], test_df[[col,'TransactionAmt']]])
#     #temp_df['TransactionAmt'] = temp_df['TransactionAmt'].astype(int)
#     temp = temp_df.groupby(col)['TransactionAmt'].cumsum()

#     train_df[new_col_name] = temp.iloc[:train_df.shape[0]]
#     test_df[new_col_name]  = temp.iloc[train_df.shape[0]:] 

#     new_col_name = col+'_TransactionAmt_'+ "cumcount"
#     temp_df = pd.concat([train_df[[col, 'TransactionAmt']], test_df[[col,'TransactionAmt']]])
#     #temp_df['TransactionAmt'] = temp_df['TransactionAmt'].astype(int)
#     temp = temp_df.groupby(col)['TransactionAmt'].cumcount()

#     train_df[new_col_name] = temp.iloc[:train_df.shape[0]]
#     test_df[new_col_name]  = temp.iloc[train_df.shape[0]:]   

In [24]:
# Small "hack" to transform distribution 
# (doesn't affect auc much, but I like it more)
# please see how distribution transformation can boost your score 
# (not our case but related)
# https://scikit-learn.org/stable/auto_examples/compose/plot_transformed_target.html
train_df['TransactionAmt'] = np.log1p(train_df['TransactionAmt'])
test_df['TransactionAmt'] = np.log1p(test_df['TransactionAmt']) 

In [25]:
########################### 'P_emaildomain' - 'R_emaildomain'
p = 'P_emaildomain'
r = 'R_emaildomain'
uknown = 'email_not_provided'

for df in [train_df, test_df]:
    df[p] = df[p].fillna(uknown)
    df[r] = df[r].fillna(uknown)
    
    # Check if P_emaildomain matches R_emaildomain
    df['email_check'] = np.where((df[p]==df[r])&(df[p]!=uknown),1,0)

    df[p+'_prefix'] = df[p].apply(lambda x: x.split('.')[0])
    df[r+'_prefix'] = df[r].apply(lambda x: x.split('.')[0])

## Local test doesn't show any boost here, 
## but I think it's a good option for model stability 

## Also, we will do frequency encoding later

In [26]:
#试一下对Email的清理有没有用
#
email_dic = {
'yahoo'  : 'Yahoo',
'ymail'  : 'Yahoo',
'frontier'  : 'Yahoo',
'rocketmail' : 'Yahoo',
'hotmail': 'Microsoft',
'outlook' : 'Microsoft',
'live' : 'Microsoft',
'msn': 'Microsoft',
'icloud'  : 'Appe',
'mac'  : 'Appe',
'me' : 'Appe',
'prodigy' : 'AT&T',
'att' : 'AT&T',
'sbcglobal': 'AT&T',
'centurylink' :'Centurylink',
'embarqmail':'Centurylink', 
'q' :'Centurylink',
'aim' : 'AOL',
'aol': 'AOL',
'twc'  : 'Spectrum',
'charter' : 'Spectrum',
'email_not_provided':'other'
}
train_df['R_emaildomain_prefix2'] = train_df['R_emaildomain_prefix'].map(email_dic)
train_df['P_emaildomain_prefix2'] = train_df['P_emaildomain_prefix'].map(email_dic)
test_df['R_emaildomain_prefix2'] = test_df['R_emaildomain_prefix'].map(email_dic)
test_df['P_emaildomain_prefix2'] = test_df['P_emaildomain_prefix'].map(email_dic)

In [27]:
for df in [train_df,test_df]:
    df['R_emaildomain_prefix2'][df['R_emaildomain_prefix2'].isnull() == True] = df['R_emaildomain_prefix']
    df['P_emaildomain_prefix2'][df['P_emaildomain_prefix2'].isnull() == True] = df['P_emaildomain_prefix']
    df.drop('R_emaildomain_prefix',axis = 1,inplace = True)
    df.drop('P_emaildomain_prefix',axis = 1,inplace = True)

In [28]:
########################### Device info
for df in [train_identity, test_identity]:
    ########################### Device info
    df['DeviceInfo'] = df['DeviceInfo'].fillna('unknown_device').str.lower()
    df['DeviceInfo_device'] = df['DeviceInfo'].apply(lambda x: ''.join([i for i in x if i.isalpha()]))
    df['DeviceInfo_version'] = df['DeviceInfo'].apply(lambda x: ''.join([i for i in x if i.isnumeric()]))
    
    ########################### Device info 2
    df['id_30'] = df['id_30'].fillna('unknown_device').str.lower()
    df['id_30_device'] = df['id_30'].apply(lambda x: ''.join([i for i in x if i.isalpha()]))
    df['id_30_version'] = df['id_30'].apply(lambda x: ''.join([i for i in x if i.isnumeric()]))
    
    ########################### Browser
    df['id_31'] = df['id_31'].fillna('unknown_device').str.lower()
    df['id_31_device'] = df['id_31'].apply(lambda x: ''.join([i for i in x if i.isalpha()]))

In [29]:
for df in [train_identity, test_identity]:
    df['UserAgent'] = np.where(df['DeviceInfo'].str.contains("/"),df['DeviceInfo'],np.nan)
    df['ParsingError'] = np.where(df['DeviceInfo'].str.contains("/"),0,1)

In [30]:
########################### Merge Identity columns
temp_df = train_df[['TransactionID']]
temp_df = temp_df.merge(train_identity, on=['TransactionID'], how='left')
del temp_df['TransactionID']
train_df = pd.concat([train_df,temp_df], axis=1)
    
temp_df = test_df[['TransactionID']]
temp_df = temp_df.merge(test_identity, on=['TransactionID'], how='left')
del temp_df['TransactionID']
test_df = pd.concat([test_df,temp_df], axis=1)

In [31]:
for df in [train_df,test_df]:
    df['is_na_sum'] = df['is_na_sum1'] + df['is_na_sum2']
    df['not_na_sum'] = df['not_na_sum1'] + df['not_na_sum2']

In [32]:
# for col in ['DeviceInfo']:
#     temp_dict = train_df.groupby([col])[TARGET].agg(['mean']).reset_index().rename(
#                                                         columns={'mean': col+'_target_mean'})
#     temp_dict.index = temp_dict[col].values
#     temp_dict = temp_dict[col+'_target_mean'].to_dict()

#     train_df[col+'_target_mean'] = train_df[col].map(temp_dict)
#     test_df[col+'_target_mean']  = test_df[col].map(temp_dict)

In [34]:
# browser = pd.read_csv(r'D:\学习\data_mining\IEEE-CIS_Fraud_Detection\kernels\9468kernel\brower_version.csv',engine = 'python')
# dic = pd.Series(browser[browser.columns[2]].values, index=browser[browser.columns[0]])

In [35]:
# train_df['BrowserUpToDate'] = train_df['id_31'].map(dic)
# train_df['BrowserUpToDate'] = pd.to_datetime(train_df['BrowserUpToDate'])
# train_df['WhetherBrowserUpToDate'] = train_df['DT'] <= train_df['BrowserUpToDate']
# train_df['WhetherBrowserUpToDate'][train_df['BrowserUpToDate'].isnull()] = np.nan
# train_df['WhetherBrowserUpToDate'][(train_df['id_31'].str.contains("ie") == True) & (train_df['id_31'] != 'android webview 4.0')] = 0
# train_df.drop(['BrowserUpToDate'],axis = 1,inplace = True)

# test_df['BrowserUpToDate'] = test_df['id_31'].map(dic)
# test_df['BrowserUpToDate'] = pd.to_datetime(test_df['BrowserUpToDate'])
# test_df['WhetherBrowserUpToDate'] = test_df['DT'] <= test_df['BrowserUpToDate']
# test_df['WhetherBrowserUpToDate'][test_df['BrowserUpToDate'].isnull()] = np.nan
# test_df['WhetherBrowserUpToDate'][(test_df['id_31'].str.contains("ie") == True) & (test_df['id_31'] != 'android webview 4.0')] = 0
# test_df.drop(['BrowserUpToDate'],axis = 1,inplace = True)

In [36]:
########################### Freq encoding
i_cols = ['card1','card2','card3','card5',
          'C1','C2','C3','C4','C5','C6','C7','C8','C9','C10','C11','C12','C13','C14',
          'D1','D2','D3','D4','D5','D6','D7','D8',
          'addr1','addr2',
          'dist1','dist2',
#          'P_emaildomain', 'R_emaildomain','R_emaildomain_prefix2','P_emaildomain_prefix2','R_emaildomain_prefix','P_emaildomain_prefix',
          'P_emaildomain', 'R_emaildomain','R_emaildomain_prefix2','P_emaildomain_prefix2',
          'DeviceInfo','DeviceInfo_device','DeviceInfo_version',
          'id_30','id_30_device','id_30_version',
          'id_31_device',
          'id_33',
          'uid','uid2','uid3','uid4','uid5','uid7','uid8'
         ]

for col in i_cols:
    temp_df = pd.concat([train_df[[col]], test_df[[col]]])
    fq_encode = temp_df[col].value_counts(dropna=False).to_dict()   
    train_df[col+'_fq_enc'] = train_df[col].map(fq_encode)
    test_df[col+'_fq_enc']  = test_df[col].map(fq_encode)


for col in ['DT_M','DT_W','DT_D']:
    temp_df = pd.concat([train_df[[col]], test_df[[col]]])
    fq_encode = temp_df[col].value_counts().to_dict()
            
    train_df[col+'_total'] = train_df[col].map(fq_encode)
    test_df[col+'_total']  = test_df[col].map(fq_encode)
        

periods = ['DT_M','DT_W','DT_D']
i_cols = ['uid']
for period in periods:
    for col in i_cols:
        new_column = col + '_' + period
            
        temp_df = pd.concat([train_df[[col,period]], test_df[[col,period]]])
        temp_df[new_column] = temp_df[col].astype(str) + '_' + (temp_df[period]).astype(str)
        fq_encode = temp_df[new_column].value_counts().to_dict()
            
        train_df[new_column] = (train_df[col].astype(str) + '_' + train_df[period].astype(str)).map(fq_encode)
        test_df[new_column]  = (test_df[col].astype(str) + '_' + test_df[period].astype(str)).map(fq_encode)
        
        train_df[new_column] /= train_df[period+'_total']
        test_df[new_column]  /= test_df[period+'_total']

下面特征参考https://www.kaggle.com/gunesevitan/lightgbm-some-new-features/comments

In [38]:
DATE_COLS = ['D{}'.format(i) for i in range(1, 16) if i != 9]
train_df['UniqueDates'] = train_df[DATE_COLS].nunique(axis=1)
test_df['UniqueDates'] = test_df[DATE_COLS].nunique(axis=1)

In [39]:
train_df.shape

(590540, 864)

In [40]:
test_df.shape

(506691, 864)

In [41]:
##加上特征转换的部分

In [42]:
# columns = [i for i in train_df if 'V' in i]
# train_df.drop(columns,axis = 1,inplace = True)
# test_df.drop(columns,axis = 1,inplace = True)
# v_train = pd.read_pickle(r'D:\学习\data_mining\IEEE-CIS_Fraud_Detection\kernels\feature_selection\feature_V_train.pkl')
# v_test = pd.read_pickle(r'D:\学习\data_mining\IEEE-CIS_Fraud_Detection\kernels\feature_selection\feature_V_test.pkl')
# train_df = pd.concat([train_df,v_train],axis = 1)
# test_df = pd.concat([test_df,v_test.reset_index(drop=True)],axis = 1)

In [43]:
#columns2 = ['TransactionAmt', 'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'dist1', 'dist2', 'P_emaildomain', 'R_emaildomain', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'ProductCD_target_mean', 'M4_target_mean', 'TransactionAmt_check',  'email_check', 'R_emaildomain_prefix2', 'P_emaildomain_prefix2', 'id_01', 'id_02', 'id_03', 'id_04', 'id_05', 'id_06', 'id_07', 'id_08', 'id_09', 'id_10', 'id_11', 'id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_29', 'id_32', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType', 'DeviceInfo', 'id_33_0', 'id_33_1', 'is_na_sum2', 'not_na_sum2', 'DeviceInfo_device', 'id_30_device', 'UserAgent',  'ParsingError', 'card1_fq_enc', 'card2_fq_enc', 'card3_fq_enc', 'card5_fq_enc', 'C1_fq_enc', 'C2_fq_enc', 'C3_fq_enc', 'C4_fq_enc', 'C5_fq_enc', 'C6_fq_enc', 'C7_fq_enc', 'C8_fq_enc', 'C9_fq_enc', 'C10_fq_enc', 'C11_fq_enc', 'C13_fq_enc', 'C14_fq_enc', 'D1_fq_enc', 'D2_fq_enc', 'D8_fq_enc', 'addr1_fq_enc', 'addr2_fq_enc', 'dist1_fq_enc', 'dist2_fq_enc', 'P_emaildomain_fq_enc', 'R_emaildomain_fq_enc', 'R_emaildomain_prefix2_fq_enc', 'P_emaildomain_prefix2_fq_enc', 'DeviceInfo_fq_enc', 'DeviceInfo_device_fq_enc', 'DeviceInfo_version_fq_enc', 'id_30_fq_enc', 'id_30_device_fq_enc', 'id_30_version_fq_enc', 'id_31_device_fq_enc', 'id_33_fq_enc', 'uid_fq_enc', 'uid2_fq_enc', 'uid3_fq_enc', 'uid4_fq_enc', 'uid5_fq_enc', 'uid_DT_M', 'uid_DT_W']


In [44]:
#len(drop_cols) / len(columns2)

去掉一些用处不大的列

In [45]:
drop_cols = ['id_27','id_22','id_23','id_08','TransactionAmt_check','id_10','D9','dist2_fq_enc','id_24','id_11','id_21','id_15','R_emaildomain_fq_enc','DeviceInfo_fq_enc','ParsingError','id_18','id_25','DeviceInfo_device','M2','id_03','id_09','D13']
for df in [train_df, test_df]:
    df.drop(columns=drop_cols, inplace=True)

In [46]:
########################### Encode Str columns
# For all such columns (probably not)
# we already did frequency encoding (numeric feature)
# so we will use astype('category') here
for col in list(train_df):
    if train_df[col].dtype=='O':
        print(col)
        train_df[col] = train_df[col].fillna('unseen_before_label')
        test_df[col]  = test_df[col].fillna('unseen_before_label')
        
        train_df[col] = train_df[col].astype(str)
        test_df[col] = test_df[col].astype(str)
        
# #######################对这些类别特征全部加上低频过滤#########################
# 没有必要。要么删了，要么本来类别就很少
#         valid_card = pd.concat([train_df[[col]], test_df[[col]]])
#         valid_card = valid_card[col].value_counts()
#         valid_card = valid_card[valid_card>2]
#         valid_card = list(valid_card.index)

#         train_df[col] = np.where(train_df[col].isin(test_df[col]), train_df[col], np.nan)
#         test_df[col]  = np.where(test_df[col].isin(train_df[col]), test_df[col], np.nan)

#         train_df[col] = np.where(train_df[col].isin(valid_card), train_df[col], np.nan)
#         test_df[col]  = np.where(test_df[col].isin(valid_card), test_df[col], np.nan)
        
# ###############################################################################        
        
        le = LabelEncoder()
        le.fit(list(train_df[col])+list(test_df[col]))
        train_df[col] = le.transform(train_df[col])
        test_df[col]  = le.transform(test_df[col])
        
        train_df[col] = train_df[col].astype('int')
        test_df[col] = test_df[col].astype('int')

P_emaildomain
R_emaildomain
uid
uid2
uid3
uid4
uid5
uid7
uid8
R_emaildomain_prefix2
P_emaildomain_prefix2
id_30
id_31
DeviceType
DeviceInfo
DeviceInfo_version
id_30_device
id_30_version
id_31_device
UserAgent


In [47]:
########################### Model Features 
## We can use set().difference() but the order matters
## Matters only for deterministic results
## In case of remove() we will not change order
## even if variable will be renamed
## please see this link to see how set is ordered
## https://stackoverflow.com/questions/12165200/order-of-unordered-python-sets
rm_cols = [
    'TransactionID','TransactionDT', # These columns are pure noise right now
    TARGET,                          # Not target in features))
    'uid','uid2','uid3', 'uid4','uid5', 'uid7',           # Our new client uID -> very noisy data
    'bank_type',                     # Victims bank could differ by time
    'DT','DT_M','DT_W','DT_D',       # Temporary Variables
    'DT_hour','DT_day_week','DT_day',
    'DT_D_total','DT_W_total','DT_M_total',
    'id_30','id_31','id_33','uid8'
]

In [48]:
########################### Features elimination 
from scipy.stats import ks_2samp
features_check = []
columns_to_check = set(list(train_df)).difference(base_columns+rm_cols)
for i in columns_to_check:
    features_check.append(ks_2samp(test_df[i], train_df[i])[1])

features_check = pd.Series(features_check, index=columns_to_check).sort_values() 
features_discard = list(features_check[features_check==0].index)
# if 'is_na_sum' in features_discard:
#     features_discard.remove('is_na_sum')
# if 'TransactionPerHour' in features_discard:
#     features_discard.remove('TransactionPerHour') 
features_discard2 = []
for i in features_discard:
    features_discard2.append(i)
features_discard = features_discard2
print(features_discard)



['M_sum', 'uid8_TransactionAmt_nunique', 'D7_fq_enc', 'first_date', 'uid8_D15_mean', 'uid8_dist1_std', 'M_na', 'uid7_D15_max', 'uid_DT_D', 'uid_DT_W', 'D5_fq_enc', 'uid7_dist1_max', 'uid8_D15_nunique', 'first_date2', 'uid7_dist1_skew', 'is_na_sum', 'uid8_dist1_skew', 'uid7_dist1_mean', 'uid8_dist1_min', 'D6_fq_enc', 'uid8_D15_max', 'uid8_dist1_nunique', 'not_na_sum', 'uid7_D15_std', 'UniqueDates', 'uid7_D15_mean', 'uid8_D15_median', 'uid7_D15_nunique', 'uid5_D15_max', 'TransactionPerHour', 'uid7_D15_median', 'uid7_D15_min', 'uid5_D15_min', 'is_na_sum1', 'uid8_dist1_mean', 'uid7_dist1_min', 'D4_fq_enc', 'uid8_TransactionAmt_max', 'not_na_sum1', 'uid8_dist1_median', 'C12_fq_enc', 'uid7_dist1_median', 'uid8_dist1_max', 'uid5_D15_mean', 'id_31_device', 'uid4_D15_mean', 'uid5_D15_median', 'uid8_D15_std', 'uid7_dist1_nunique', 'uid5_D15_nunique', 'DeviceInfo_version', 'uid8_D15_min', 'D3_fq_enc', 'id_30_version', 'uid4_D15_median']


In [49]:
# We will reset this list for now (use local test drop),
# Good droping will be in other kernels
# with better checking
#features_discard = [] 

# Final features list
features_columns = [col for col in list(train_df) if col not in rm_cols + features_discard]

In [50]:
# train_df.to_pickle(r"D:\学习\data_mining\IEEE-CIS_Fraud_Detection\super_stacking\data\lgb\train_df.pkl")
# test_df.to_pickle(r"D:\学习\data_mining\IEEE-CIS_Fraud_Detection\super_stacking\data\lgb\test_df.pkl")
# with open(r"D:\学习\data_mining\IEEE-CIS_Fraud_Detection\super_stacking\data\lgb\features_columns",'w') as f:
#     for element in features_columns:
#         f.write(element)

In [51]:
features_columns

['TransactionAmt',
 'ProductCD',
 'card1',
 'card2',
 'card3',
 'card4',
 'card5',
 'card6',
 'addr1',
 'addr2',
 'dist1',
 'dist2',
 'P_emaildomain',
 'R_emaildomain',
 'C1',
 'C2',
 'C3',
 'C4',
 'C5',
 'C6',
 'C7',
 'C8',
 'C9',
 'C10',
 'C11',
 'C12',
 'C13',
 'C14',
 'D1',
 'D2',
 'D3',
 'D4',
 'D5',
 'D6',
 'D7',
 'D8',
 'D10',
 'D11',
 'D12',
 'D14',
 'D15',
 'M1',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9',
 'V1',
 'V2',
 'V3',
 'V4',
 'V5',
 'V6',
 'V7',
 'V8',
 'V9',
 'V10',
 'V11',
 'V12',
 'V13',
 'V14',
 'V15',
 'V16',
 'V17',
 'V18',
 'V19',
 'V20',
 'V21',
 'V22',
 'V23',
 'V24',
 'V25',
 'V26',
 'V27',
 'V28',
 'V29',
 'V30',
 'V31',
 'V32',
 'V33',
 'V34',
 'V35',
 'V36',
 'V37',
 'V38',
 'V39',
 'V40',
 'V41',
 'V42',
 'V43',
 'V44',
 'V45',
 'V46',
 'V47',
 'V48',
 'V49',
 'V50',
 'V51',
 'V52',
 'V53',
 'V54',
 'V55',
 'V56',
 'V57',
 'V58',
 'V59',
 'V60',
 'V61',
 'V62',
 'V63',
 'V64',
 'V65',
 'V66',
 'V67',
 'V68',
 'V69',
 'V70',
 'V71',
 'V72',
 'V73',
 

In [52]:
# train_df[features_columns].to_pickle(r"D:\学习\data_mining\IEEE-CIS_Fraud_Detection\kernels\feature_selection\train.pkl")
# test_df[features_columns].to_pickle(r"D:\学习\data_mining\IEEE-CIS_Fraud_Detection\kernels\feature_selection\test.pkl")
# train_df[TARGET].to_pickle(r"D:\学习\data_mining\IEEE-CIS_Fraud_Detection\kernels\feature_selection\target.pkl")
# train_df[TARGET].sum()

In [53]:
########################### Model params
lgb_params = {
                    'objective':'binary',
                    'boosting_type':'gbdt',
                    'metric':'auc',
                    'n_jobs':-1,
                    'learning_rate':0.01,
                    'num_leaves': 2**8,
                    'max_depth':-1,
                    'tree_learner':'serial',
                    'colsample_bytree': 0.7,
                    'subsample_freq':1,
                    'subsample':0.7,
                    'n_estimators':800,
                    'max_bin':255,
                    'verbose':-1,
                    'seed': SEED,
                    'early_stopping_rounds':100, 
                } 

In [54]:
#scale_pos_weight说我觉得我还可以再拯救一下
#train_labels = train_df[TARGET]
#ratio= 0.2 * float(np.sum(train_labels == 0)) / np.sum(train_labels == 1)
#ratio

In [55]:
#lgb_params['scale_pos_weight'] = ratio

In [56]:
%%time

########################### Model Train
if LOCAL_TEST:
    lgb_params['learning_rate'] = 0.01
    lgb_params['n_estimators'] = 4000
    lgb_params['early_stopping_rounds'] = 100
    test_predictions = make_predictions(train_df, test_df, features_columns, TARGET, lgb_params)
    print(metrics.roc_auc_score(test_predictions[TARGET], test_predictions['prediction']))
else:
    lgb_params['learning_rate'] = 0.005
    lgb_params['n_estimators'] = 2200
    lgb_params['early_stopping_rounds'] = 100    
    test_predictions = make_predictions(train_df, test_df, features_columns, TARGET, lgb_params, NFOLDS=6)

Fold: 0
456201 134339
Training until validation scores don't improve for 100 rounds.
[200]	training's auc: 0.956852	valid_1's auc: 0.889499
[400]	training's auc: 0.977332	valid_1's auc: 0.902315
[600]	training's auc: 0.989096	valid_1's auc: 0.909743
[800]	training's auc: 0.9949	valid_1's auc: 0.916009
[1000]	training's auc: 0.997517	valid_1's auc: 0.919879
[1200]	training's auc: 0.998786	valid_1's auc: 0.922152
[1400]	training's auc: 0.999405	valid_1's auc: 0.924189
[1600]	training's auc: 0.9997	valid_1's auc: 0.925373
[1800]	training's auc: 0.999844	valid_1's auc: 0.926217
[2000]	training's auc: 0.999922	valid_1's auc: 0.926574
Early stopping, best iteration is:
[2081]	training's auc: 0.999941	valid_1's auc: 0.926757
Fold: 1
488572 101968
Training until validation scores don't improve for 100 rounds.
[200]	training's auc: 0.953476	valid_1's auc: 0.918497
[400]	training's auc: 0.976301	valid_1's auc: 0.9327
[600]	training's auc: 0.988981	valid_1's auc: 0.940994
[800]	training's auc: 0.

In [57]:
########################### Export
if not LOCAL_TEST:
    test_predictions['isFraud'] = test_predictions['prediction']
    test_predictions[['TransactionID','isFraud']].to_csv('middle_result\submission_lgmb_final.csv', index=False)

In [63]:
features = ['P_emaildomain','R_emaildomain','uid','uid2','uid3','uid4','uid5','uid7','uid8','R_emaildomain_prefix2','P_emaildomain_prefix2','id_30','id_31','DeviceType','DeviceInfo','DeviceInfo_version','id_30_device','id_30_version','id_31_device','UserAgent']

In [64]:
for col in features:
    train_df[col] = train_df[col].astype('category')
    test_df[col] = test_df[col].astype('category')

In [2]:
train_df.to_pickle('data\train_final.pkl')
test_df.to_pickle('data\test_final.pkl')