In [438]:
# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import featuretools as ft
import lightgbm as lgb
%matplotlib inline
import seaborn as sns
import math

# Seed constant — presumably for reproducible model training; it is not
# referenced by any cell visible in this part of the notebook (TODO confirm use).
RSEED = 50

## Load Data

In [381]:
# Read the training transaction and identity tables, then attach identity
# columns to every transaction (transactions without identity rows are kept).
df_train_transac = pd.read_csv('./data/train_transaction.csv')
df_train_identity = pd.read_csv('./data/train_identity.csv')
df_train = df_train_transac.merge(
    df_train_identity,
    how='left',
    on='TransactionID',
)

In [382]:
# Read the test transaction and identity tables, then attach identity
# columns to every transaction (transactions without identity rows are kept).
df_test_transac = pd.read_csv('./data/test_transaction.csv')
df_test_identity = pd.read_csv('./data/test_identity.csv')
df_test = df_test_transac.merge(
    df_test_identity,
    how='left',
    on='TransactionID',
)

In [383]:
# Stack train and test rows so group statistics are computed over both sets.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement. ignore_index=True gives the combined frame a unique RangeIndex
# instead of repeating each source frame's own 0..n-1 labels.
df_total = pd.concat([df_train, df_test], sort=False, ignore_index=True)

In [384]:
# Preview the first rows of the combined train + test frame.
df_total.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0.0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0.0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0.0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0.0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0.0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


# Feature Engineering

In [243]:
def clean_id31(df):
    """
    Collapse the browser strings in ``id_31`` into coarse browser families.

    Digits and dots are stripped first; any value that then contains one of
    the known family names is replaced by that family label; finally all
    remaining spaces are removed. Missing values stay missing.

    Parameters:
    -----------
    df : pd.DataFrame
        Frame with an ``id_31`` string column. It is modified in place.

    Returns:
    --------
    pd.DataFrame
        The same ``df`` object, with ``id_31`` rewritten.
    """
    # Raw string + explicit regex=True: the pattern is a character class and
    # must not be treated as a literal (the str.replace default in pandas >= 2).
    browser = df['id_31'].str.replace(r"[0-9.]", "", regex=True)

    # (substring to look for, label to write). Applied in order, each rule
    # seeing the labels written by the earlier ones.
    family_rules = [
        ('chrome', 'chrome'),
        ('Samsung', 'Samsung'),
        ('samsung', 'Samsung'),
        ('firefox', 'firefox'),
        ('safari', 'safari'),
        ('opera', 'opera'),
    ]
    for needle, label in family_rules:
        # na=False keeps missing rows out of the mask. Series.mask writes on
        # the Series itself, avoiding chained-index assignment on df.
        matches = browser.str.contains(needle, regex=False, na=False)
        browser = browser.mask(matches, label)

    df['id_31'] = browser.str.replace(" ", "", regex=False)
    return df

In [244]:
def label_encoder(df, categorical_columns=None):
    """
    Replace categorical columns with integer codes from ``pandas.factorize``.

    Parameters:
    -----------
    df : pd.DataFrame
        Frame to encode; the selected columns are overwritten in place.
    categorical_columns : list of str, optional
        Columns to encode. When omitted (or empty), every column whose dtype
        is ``object`` is encoded.

    Returns:
    --------
    (pd.DataFrame, list)
        The same ``df`` object and the list of columns that were encoded.
    """
    if not categorical_columns:
        # No explicit list: fall back to all object-dtype columns.
        categorical_columns = []
        for name in df.columns:
            if df[name].dtype == 'object':
                categorical_columns.append(name)

    for name in categorical_columns:
        codes, _ = pd.factorize(df[name])
        df[name] = codes
    return df, categorical_columns

In [245]:
def make_day_feature(df, offset=0, tname='TransactionDT'):
    """
    Build a day-of-week style feature with values 0-6.

    The time column is divided by 3600*24 to get elapsed days, shifted by
    ``offset - 1``, floored, and reduced modulo 7.

    Parameters:
    -----------
    df : pd.DataFrame
        Frame holding the time column.
    offset : float (default=0)
        Shift, in days, applied to where one day ends and the next begins.
    tname : str
        Name of the time column in df.

    Returns:
    --------
    pd.Series
        Float values in 0-6, indexed like ``df``.
    """
    # An offset of 0.58 was found to work well.
    elapsed_days = df[tname] / (3600*24)
    whole_days = np.floor(elapsed_days-1+offset)
    return whole_days % 7

In [246]:
def make_hour_feature(df, tname='TransactionDT'):
    """
    Build an hour-of-day feature with values 0-23.

    The time column is divided by 3600 to get elapsed hours, floored, and
    reduced modulo 24.

    Parameters:
    -----------
    df : pd.DataFrame
        Frame holding the time column.
    tname : str
        Name of the time column in df.

    Returns:
    --------
    pd.Series
        Float values in 0-23, indexed like ``df``.
    """
    elapsed_hours = df[tname] / (3600)
    whole_hours = np.floor(elapsed_hours)
    return whole_hours % 24

In [247]:
def make_pdc_amt_ratio(df):
    """
    Ratio of each transaction amount to the mean amount of its product code.

    Parameters:
    -----------
    df : pd.DataFrame
        Frame with ``ProductCD`` and ``TransactionAmt`` columns.

    Returns:
    --------
    pd.Series
        ``TransactionAmt / mean(TransactionAmt within ProductCD)``, carrying
        the same index as ``df`` so it can be assigned straight back as a
        column.
    """
    # groupby/transform keeps df's own index. A merge-based lookup would hand
    # back a fresh 0..n-1 index and misalign on assignment whenever df's index
    # is not the default one; it would also require unrelated columns.
    group_mean = df.groupby('ProductCD')['TransactionAmt'].transform('mean')
    ratio = df['TransactionAmt'] / group_mean
    ratio.name = None  # the result is a derived quantity, not TransactionAmt
    return ratio

In [248]:
def make_card_id(df):
    """
    Build a composite card identifier from card1, card2, card3 and card5.

    Each column is converted to text with ``str`` (so missing values appear
    as ``'nan'``) and the four pieces are joined with single spaces. The
    result is also stored on ``df`` as the ``card_id`` column.

    Returns:
    --------
    pd.Series
        The ``card_id`` column of ``df``.
    """
    first, *rest = [df[name].map(str) for name in ('card1', 'card2', 'card3', 'card5')]
    combined = first
    for piece in rest:
        combined = combined + (' ' + piece)
    df['card_id'] = combined
    return df['card_id']

In [249]:
def high_missing_cols(df,threshold):
    """
    List the columns whose fraction of missing values exceeds ``threshold``.

    Parameters:
    -----------
    df : pd.DataFrame
        Frame to inspect.
    threshold : float
        Missing-rate cut-off; a column is reported only when its rate is
        strictly greater than this value.

    Returns:
    --------
    list
        Column labels, in the frame's column order.
    """
    missing_rate = df.isnull().mean()
    flagged = missing_rate[missing_rate > threshold]
    return flagged.index.tolist()
    

In [385]:
# P_email: the part of P_emaildomain before the first '.'.
df_total['P_email'] = df_total['P_emaildomain'].str.split('.',expand=True)[0]

In [386]:
# Final domain tokens that the *_email_suffix cells collapse into 'us'.
# 'gmail' can be a final token when the domain contains no dot.
us_emails = ['gmail', 'net', 'edu']

In [387]:
# Last dot-separated token of P_emaildomain, with tokens listed in us_emails
# collapsed to 'us'. Every value goes through str() first, so a missing
# domain ends up as the string 'nan'.
df_total['P_email_suffix'] = df_total['P_emaildomain'].map(
    lambda domain: 'us'
    if str(domain).rsplit('.', 1)[-1] in us_emails
    else str(domain).rsplit('.', 1)[-1]
)

In [388]:
# The raw domain is no longer needed once P_email / P_email_suffix exist.
df_total.drop(columns=['P_emaildomain'], inplace=True)

In [389]:
# R_email: the part of R_emaildomain before the first '.'.
df_total['R_email'] = df_total['R_emaildomain'].str.split('.',expand=True)[0]

In [390]:
# Last dot-separated token of R_emaildomain, with tokens listed in us_emails
# collapsed to 'us'. Every value goes through str() first, so a missing
# domain ends up as the string 'nan'.
df_total['R_email_suffix'] = df_total['R_emaildomain'].map(
    lambda domain: 'us'
    if str(domain).rsplit('.', 1)[-1] in us_emails
    else str(domain).rsplit('.', 1)[-1]
)

In [391]:
# The raw domain is no longer needed once R_email / R_email_suffix exist.
df_total.drop(columns=['R_emaildomain'], inplace=True)

In [392]:
# id_30_OS: first space-separated token of id_30.
df_total['id_30_OS'] = df_total['id_30'].str.split(' ',expand=True)[0]

In [393]:
# id_30_version: second space-separated token of id_30 (missing when id_30
# has no second token).
df_total['id_30_version'] = df_total['id_30'].str.split(' ',expand=True)[1]

In [394]:
# id_30 has been split into id_30_OS / id_30_version; remove the original.
df_total.drop(columns=['id_30'], inplace=True)

In [395]:
# Day-of-week style feature (0-6) from TransactionDT, with the day boundary
# shifted by 0.58 days.
df_total['dow'] = make_day_feature(df_total, offset=0.58)

In [396]:
# Hour-of-day feature (0-23) from TransactionDT.
df_total['hour'] = make_hour_feature(df_total)

In [397]:
# TransactionAmt divided by the mean TransactionAmt of rows with the same ProductCD.
df_total['pdc_amt_ratio'] = df_total['TransactionAmt'].div(
    df_total.groupby('ProductCD')['TransactionAmt'].transform('mean')
)

In [398]:
# TransactionAmt divided by the standard deviation of TransactionAmt within the same ProductCD.
df_total['pdc_amt_std_ratio'] = df_total['TransactionAmt'].div(
    df_total.groupby('ProductCD')['TransactionAmt'].transform('std')
)

In [399]:
# D1 divided by the mean D1 of rows with the same ProductCD.
df_total['pdc_D1_ratio'] = df_total['D1'].div(
    df_total.groupby('ProductCD')['D1'].transform('mean')
)

In [400]:
# D2 divided by the mean D2 of rows with the same ProductCD.
df_total['pdc_D2_ratio'] = df_total['D2'].div(
    df_total.groupby('ProductCD')['D2'].transform('mean')
)

In [483]:
# D3 divided by the mean D3 of rows with the same ProductCD. The numerator
# must be D3 (not D2) so the feature matches its name and follows the same
# column/column-mean pattern as pdc_D1_ratio and pdc_D2_ratio.
df_total['pdc_D3_ratio'] = df_total['D3']/df_total.groupby('ProductCD')['D3'].transform('mean')

In [401]:
# D1 divided by the standard deviation of D1 within the same ProductCD.
df_total['pdc_D1_std_ratio'] = df_total['D1'].div(
    df_total.groupby('ProductCD')['D1'].transform('std')
)

In [402]:
# D2 divided by the standard deviation of D2 within the same ProductCD.
df_total['pdc_D2_std_ratio'] = df_total['D2'].div(
    df_total.groupby('ProductCD')['D2'].transform('std')
)

In [482]:
# D3 divided by the standard deviation of D3 within the same ProductCD.
df_total['pdc_D3_std_ratio'] = df_total['D3'].div(
    df_total.groupby('ProductCD')['D3'].transform('std')
)

In [403]:
# Collapse id_31 browser strings into browser families (see clean_id31).
df_total = clean_id31(df_total)

In [404]:
# Composite card key built from card1, card2, card3 and card5; used as the
# grouping key for the per-card features that follow.
df_total['card_id'] = make_card_id(df_total)

In [405]:
# TransactionAmt divided by the mean TransactionAmt of the same card_id.
df_total['card_TAmt_ratio'] = df_total['TransactionAmt'].div(
    df_total.groupby('card_id')['TransactionAmt'].transform('mean')
)

In [406]:
# TransactionAmt divided by the standard deviation of TransactionAmt within
# the same card_id. Note: the column name says card1, but the grouping key
# is the composite card_id.
df_total['TransactionAmt_to_std_card1'] = df_total['TransactionAmt'].div(
    df_total.groupby(['card_id'])['TransactionAmt'].transform('std')
)

In [409]:
# Most frequent ProductCD for each card_id (first entry of value_counts,
# which by default ignores missing values and sorts by descending count).
df_total['card_freq_pdc'] = df_total.groupby('card_id')['ProductCD'].transform(lambda x:x.value_counts().index[0])

In [412]:
# True when the row's ProductCD is the most frequent one for its card_id.
df_total['is_card_freq_pdc'] = df_total['ProductCD'].eq(df_total['card_freq_pdc'])

In [476]:
# Most frequent addr1 for each card_id; dropna=False lets a missing addr1
# count as a candidate, so the result can itself be missing.
df_total['card_freq_addr1'] = df_total.groupby('card_id')['addr1'].transform(lambda x: x.value_counts(dropna=False).index[0])

In [478]:
# True when the row's addr1 equals the most frequent addr1 for its card_id.
# A missing value never compares equal, so those rows are False.
df_total['is_card_freq_addr1'] = df_total['addr1'].eq(df_total['card_freq_addr1'])

In [494]:
# Number of rows sharing each card1 value (value_counts(dropna=False) also
# tallies missing card1). There must be no .head() on the right-hand side:
# truncating the mapped Series to five rows leaves the column NaN for every
# other row once it is aligned back onto df_total.
df_total['card1_count'] = df_total['card1'].map(df_total['card1'].value_counts(dropna=False))

In [496]:
# id_02 divided by the mean id_02 of the same card_id.
df_total['card_id_02_mean'] = df_total['id_02'].div(
    df_total.groupby(['card_id'])['id_02'].transform('mean')
)

In [497]:
# id_02 divided by the standard deviation of id_02 within the same card_id.
df_total['card_id_02_std'] = df_total['id_02'].div(
    df_total.groupby(['card_id'])['id_02'].transform('std')
)

In [498]:
# D15 divided by the mean D15 of the same card_id.
df_total['card_D15_mean'] = df_total['D15'].div(
    df_total.groupby(['card_id'])['D15'].transform('mean')
)

In [499]:
# D15 divided by the standard deviation of D15 within the same card_id.
df_total['card_D15_std'] = df_total['D15'].div(
    df_total.groupby(['card_id'])['D15'].transform('std')
)

In [500]:
# D1 divided by the mean D1 of the same card_id.
df_total['card_D1_mean'] = df_total['D1'].div(
    df_total.groupby(['card_id'])['D1'].transform('mean')
)

In [501]:
# D2 divided by the mean D2 of the same card_id.
df_total['card_D2_mean'] = df_total['D2'].div(
    df_total.groupby(['card_id'])['D2'].transform('mean')
)

In [502]:
# D3 divided by the mean D3 of the same card_id. The numerator must be D3
# (not D2) so the feature matches its name and follows the same
# column/column-mean pattern as card_D1_mean and card_D2_mean.
df_total['card_D3_mean'] = df_total['D3'] / df_total.groupby(['card_id'])['D3'].transform('mean')

In [503]:
# D1 divided by the standard deviation of D1 within the same card_id.
df_total['card_D1_std'] = df_total['D1'].div(
    df_total.groupby(['card_id'])['D1'].transform('std')
)

In [504]:
# D2 divided by the standard deviation of D2 within the same card_id.
df_total['card_D2_std'] = df_total['D2'].div(
    df_total.groupby(['card_id'])['D2'].transform('std')
)

In [505]:
# D3 divided by the standard deviation of D3 within the same card_id.
df_total['card_D3_std'] = df_total['D3'].div(
    df_total.groupby(['card_id'])['D3'].transform('std')
)

In [506]:
# D15 divided by the mean D15 of rows with the same addr1.
df_total['addr1_D15_mean'] = df_total['D15'].div(
    df_total.groupby(['addr1'])['D15'].transform('mean')
)

In [507]:
# D15 divided by the standard deviation of D15 among rows with the same addr1.
df_total['addr1_D15_std'] = df_total['D15'].div(
    df_total.groupby(['addr1'])['D15'].transform('std')
)

In [508]:
# Fractional part of TransactionAmt expressed in thousandths: subtract the
# integer part, scale by 1000, and truncate to an integer.
df_total['TransactionAmt_decimal'] = (
    df_total['TransactionAmt']
    .sub(df_total['TransactionAmt'].astype(int))
    .mul(1000)
    .astype(int)
)

In [509]:
# Split DeviceInfo on '/': the first piece becomes Device_name, the second
# (missing when there is no '/') becomes Device_version; then drop the raw column.
df_total['Device_name'] = df_total['DeviceInfo'].str.split('/', expand=True)[0]
df_total['Device_version'] = df_total['DeviceInfo'].str.split('/', expand=True)[1]
df_total.drop('DeviceInfo',axis=1,inplace = True)

In [510]:
# Collapse raw device strings into brand labels. Each rule is
# (substring pattern, label); rules run in this order and every rule sees the
# labels already written by the rules before it.
for _pattern, _brand in [
    ('SM', 'Samsung'),
    ('SAMSUNG', 'Samsung'),
    ('GT-', 'Samsung'),
    ('Moto G', 'Motorola'),
    ('Moto', 'Motorola'),
    ('moto', 'Motorola'),
    ('LG-', 'LG'),
    ('rv:', 'RV'),
    ('HUAWEI', 'Huawei'),
    ('ALE-', 'Huawei'),
    ('-L', 'Huawei'),
    ('Blade', 'ZTE'),
    ('BLADE', 'ZTE'),
    ('XT', 'Sony'),
]:
    # na=False keeps rows with a missing Device_name out of the mask.
    df_total.loc[df_total['Device_name'].str.contains(_pattern, na=False), 'Device_name'] = _brand

# Any label that occurs fewer than 200 times is pooled into "Others";
# missing names map to no count and are therefore left untouched.
df_total.loc[
    df_total['Device_name'].map(df_total['Device_name'].value_counts()) < 200,
    'Device_name',
] = "Others"

In [511]:
# Most frequent Device_name for each card_id; dropna=False lets a missing
# name count as a candidate, so the result can itself be missing.
df_total['card_freq_Device'] = df_total.groupby('card_id')['Device_name'].transform(lambda x: x.value_counts(dropna=False).index[0])

In [512]:
# True when the row's Device_name equals the most frequent one for its card_id.
# A missing value never compares equal, so those rows are False.
df_total['is_card_freq_Device'] = df_total['Device_name'].eq(df_total['card_freq_Device'])

In [514]:
# Split id_33 (e.g. '2220x1080') on 'x' into screen_width / screen_height,
# then drop the raw column. Both new columns hold strings, not numbers.
df_total['screen_width'] = df_total['id_33'].str.split('x', expand=True)[0]
df_total['screen_height'] = df_total['id_33'].str.split('x', expand=True)[1]
df_total.drop('id_33',axis=1,inplace = True)

In [515]:
# card_id led to overfitting, so it is removed; the card_freq_* helper
# columns (only needed to derive the is_card_freq_* flags) go with it.
df_total.drop(
    columns=['card_id', 'card_freq_Device', 'card_freq_pdc', 'card_freq_addr1'],
    inplace=True,
)

Reference: https://www.kaggle.com/davidcairuz/feature-engineering-lightgbm-corrected

# Remove Features

In [126]:
# Columns with more than 90% missing values.
# NOTE(review): the execution counts in this section (126-131) are lower than
# those of the feature-engineering cells above (385+), so it was last run
# before them — re-run top to bottom to confirm the final column count.
high_miss_cols = high_missing_cols(df_total,0.9)

In [127]:
# Columns holding at most one distinct non-missing value.
one_value_cols = (
    df_total.nunique()
    .loc[lambda distinct: distinct <= 1]
    .index.tolist()
)

In [128]:
# Columns where a single value (missing counts as a value) makes up more
# than 90% of the rows.
big_top_value_cols = [
    name
    for name in df_total.columns
    if df_total[name].value_counts(dropna=False, normalize=True).iloc[0] > 0.9
]

In [129]:
# Union of the three removal lists, de-duplicated through a set (so the
# resulting order is not meaningful).
cols_to_drop = list(set(high_miss_cols + one_value_cols + big_top_value_cols))

In [130]:
# How many columns are about to be removed.
len(cols_to_drop)

71

In [131]:
# Remove the low-information columns collected in cols_to_drop.
df_total.drop(columns=cols_to_drop, inplace=True)

# Encode Categorical Features

In [516]:
# Integer-encode every object-dtype column. label_encoder writes into the
# frame it is given, so df_total_final is the same object as df_total;
# colname lists the columns that were encoded.
df_total_final,colname = label_encoder(df_total, categorical_columns=None)

In [517]:
# (rows, columns) of the final feature table.
df_total_final.shape

(1097231, 469)

# Save Final Features

In [518]:
# Persist the combined train + test feature table; the index is not written.
df_total_final.to_csv('./data/features469.csv', index = False)