<a href="https://colab.research.google.com/github/kiru883/Kaggle-IEEE-CIS-Fraud-Detection/blob/master/feature_engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive, files
drive.mount('/content/gdrive')

import warnings
warnings.filterwarnings("ignore")

import itertools
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import lightgbm as lgb
import gc
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import TimeSeriesSplit, train_test_split, KFold
from sklearn.feature_selection import RFECV

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Load datasets

In [0]:
#####LOAD DATASETS
# train
data_trainTR = pd.read_csv("/content/gdrive/My Drive/frauds_datasets/train_transaction.csv")
data_trainID = pd.read_csv("/content/gdrive/My Drive/frauds_datasets/train_identity.csv")

# test
data_testTR = pd.read_csv("/content/gdrive/My Drive/frauds_datasets/test_transaction.csv")
data_testID = pd.read_csv("/content/gdrive/My Drive/frauds_datasets/test_identity.csv") 

# Memory usage reduction function

In [0]:
def reduce_mem_usage(df):
    mem_usage_before = np.around(df.memory_usage().sum() / 1028**2)
    print(f"Memory usage before: {mem_usage_before} MB")

    for column in df.columns:
        if df[column].dtype == 'float':
            mn, mx = df[column].min(), df[column].max()
            if mn > -2147483648 or mx < 2147483648:
                df[column] = df[column].astype('float32')

        elif df[column].dtype == 'int':
            mn, mx = df[column].min(), df[column].max()
            if mn > -128 or mx < 127:
                df[column] = df[column].astype('int8')
            elif mn > -32000 or mx < 32000:
                df[column] = df[column].astype('int16')
            elif mn > -2147483648 or mx < 2147483648:
                df[column] = df[column].astype('int32')

        elif df[column].dtype == 'object':
            df[column] = df[column].astype('category')

    mem_usage_after = np.around(df.memory_usage().sum() / 1028**2)
    print(f"Memory usage after: {mem_usage_after} MB")
    print(f"Optimization: {np.around(100*(1 - mem_usage_after/mem_usage_before))}%")

    return df

# Main pipeline

In [0]:
#with all my hypotises
def pipeline(data_TR, data_ID):
    #useful functions
    def decimal_places(x):
        decimal_str = str(x)[str(x).find(".") +1:]
        if decimal_str == "0":
            return 0
        decimal_len = len(str(int(decimal_str[::-1])))
        return decimal_len if decimal_len < 5 else 5
    
    def email_map(email):
        mapping= {'frontier.com':'frontier','frontiernet.net':'frontier','gmail':'gmail','gmail.com':'gmail','hotmail.co.uk':'hotmail','hotmail.com':'Microsoft','hotmail.de':'Microsoft',
            'hotmail.es':'Microsoft','hotmail.fr':'Microsoft','icloud.com':'Apple','live.com':'Microsoft','live.com.mx':'Microsoft','live.fr':'Microsoft','mac.com':'Apple',
            'netzero.com':'Netzero','netzero.net':'Netzero','outlook.com':'Microsoft','outlook.es':'Microsoft', 'yahoo.co.jp':'Yahoo','yahoo.co.uk':'Yahoo','yahoo.com':'Yahoo',
            'yahoo.com.mx':'Yahoo','yahoo.de':'Yahoo','yahoo.es':'Yahoo','yahoo.fr':'Yahoo','ymail.com':'Yahoo', 'scranton.edu':'Scranton'}
        if email in mapping.keys():
            return mapping[email]
        elif pd.isnull(email):
            return 'NAN'
        else:
            return 'other'

    def parse_id30(x):
        devices_30_list = ['windows', 'ios', 'mac', 'android', 'linux']
        if pd.isnull(x): 
            return 'NAN'
        elif x.split()[0].lower() in devices_30_list: 
            return x.split()[0].lower()
        else:
            return 'NAN'
        
    def parse_id31(x):
        devices_30_set = {'chrome', 'safari', 'ie', 'edge', 'firefox'}
        if pd.isnull(x): 
            return 'NAN'
        result = list(devices_30_set & set(x.split()))
        if len(result) == 0:
            return 'other'
        else:
            return result[0]
        
    def parse_id33(x):
        devices_33_list = ['1334x750', '2436x1125', '1366x768', '1920x1080', '2208x1242']
        if pd.isnull(x):
            return 'NAN'
        if x in devices_33_list:
            return x
        else:
            return 'other'
        
    def parse_deviceinfo(x):
        devices_info_list = ['windows', 'macos', 'ios', 'trident/7.0']
        if pd.isnull(x):
            return 'NAN'
        x = x.split()[0].lower()
        if x in devices_info_list:
            return x
        else:
            return 'other'

    df = pd.concat([data_TR.set_index('TransactionID'), data_ID.set_index('TransactionID')], axis=1).reset_index()
    del data_TR, data_ID

    #main pipeline
    df['month'] = df['TransactionDT'] // (86400 * 30)
    # transactionAmt features
    df['TransactionAmt'] = df['TransactionAmt'].fillna(-999)
    df['transaction_month'] = df.groupby(['month'])['TransactionAmt'].transform('mean') - df['TransactionAmt']
    df['trans_meanq'] = pd.qcut(df['TransactionAmt'].median() - df['TransactionAmt'], [0.1, 0.3, 0.5, 0.7, 0.9], labels=False).fillna(-1)
    df['trans_std_negative'] = np.where((df['transaction_month'] < 0), 1, 0)
    df['transaction_std'] = df['transaction_month'] / df.groupby(['month'])['TransactionAmt'].transform('std')
    df['transaction_stdq'] = (df['TransactionAmt'].median() - df['TransactionAmt']) / df['TransactionAmt'].std()
    df['transaction_stdq'] = pd.qcut(df['transaction_stdq'], [0.1, 0.3, 0.5, 0.7, 0.9], labels=False).fillna(-1)
    df['transaction_digits'] = df['TransactionAmt'].map(decimal_places)
    df['transaction_count'] = df['TransactionAmt'].map(df['TransactionAmt'].value_counts())
    df['transaction_count'] = pd.qcut(df['transaction_count'], [0.1, 0.3, 0.5, 0.7, 0.9], labels=False).fillna(-1)
    high_tr = df['TransactionAmt'].quantile([0.9]).to_list()[0]
    lower_tr = df['TransactionAmt'].quantile([0.1]).to_list()[0]
    df['outlier'] = np.where((df['TransactionAmt'] > high_tr) | (df['TransactionAmt'] < lower_tr), 1, 0)

    #ProductCd
    df['ProductCD'] = df['ProductCD'].fillna('NAN')
    df['prod_stdq'] = pd.qcut(df.groupby(["ProductCD"])['TransactionAmt'].transform('median') - df['TransactionAmt'], [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], labels=False).fillna(-1)
    df['ProductCD'] = LabelEncoder().fit_transform(df['ProductCD'])
    
    #card1
    df['card1'] = df['card1'].fillna(-999)
    df['card1_count'] = df['card1'].map(df['card1'].value_counts())
    df['card1_count_q'] = pd.qcut(df['card1_count'], [0.05, 0.2, 0.4, 0.6, 0.8, 0.95], labels=False).fillna(-1)
    df['card1_frequency'] = df['card1'].map(df['card1'].value_counts() / df['card1'].shape[0])
    df['card1_frequency'] = pd.qcut(df['card1_frequency'], [0.05, 0.2, 0.4, 0.6, 0.8, 0.95], labels=False).fillna(-1)
    df['trans_card1_mean'] = df.groupby(['card1'])['TransactionAmt'].transform('mean') - df['TransactionAmt']
    df['trans_card1_mean_rel'] = df.groupby(['card1'])['TransactionAmt'].transform('mean') / df['TransactionAmt']
    df['card1_mean'] = df.groupby(['month'])['card1'].transform('mean') - df['card1']
    df['card1_std'] = df.groupby(['month'])['card1'].transform('std') / df['card1_mean'] 
    df['card1_addr1_mean'] = df['card1'].astype('str') + '_' + df['addr1'].astype('str')
    df['card1_addr1_mean'] = df.groupby(['card1_addr1_mean'])['TransactionAmt'].transform('mean') - df['TransactionAmt']

    #card2
    df['card2'] = df['card2'].fillna(-999)
    df['card2_count'] = df['card2'].map(df['card2'].value_counts())
    df['card2_out'] = pd.qcut(df['card2_count'], [0.05, 0.15, 0.85, 0.95], labels=False).fillna(-1)
    df['card2_q'] = pd.qcut(df['card2_count'], [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], labels=False).fillna(-1)

    #card3
    df['card3'] = df['card3'].fillna(-999)
    df['card3_o'] = np.where((df['card3'] < df['card3'].quantile(0.1)) | (df['card3'] > df['card3'].quantile(0.9)), 1, 0)
    
    #card4
    df['card4'] = df['card4'].fillna('NAN')
    df['card4_count'] = df['card4'].map(df['card4'].value_counts())
    df['trans_card_mean'] = df.groupby(['card4'])['TransactionAmt'].transform('mean') - df['TransactionAmt']
    df['trans_card_mean_q'] = pd.qcut(df['trans_card_mean'], [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], labels=False).fillna(-1)
    df['trans_card_std'] = df.groupby(['card4'])['TransactionAmt'].transform('std') / df['trans_card_mean']
    df['trans_card_std_q'] = pd.qcut(df['trans_card_std'], [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], labels=False).fillna(-1)
    df['card4_trans_mean'] = df['TransactionAmt'].astype('str') + '_' + df['card4'].astype('str')
    df['card4_trans_mean'] = df.groupby(['card4_trans_mean'])['TransactionAmt'].transform('mean') - df['TransactionAmt']
    df['card4'] = LabelEncoder().fit_transform(df['card4'])

    #card5
    df['card5'] = df['card5'].fillna(-999)
    df['card5_o'] = np.where(((df['card5'] < df['card5'].quantile(0.05)) | (df['card5'] > df['card5'].quantile(0.95))), 1, 0)

    #card6
    df['card6'] = df['card6'].fillna('NAN')
    df['trans_card_med'] = df.groupby(['card6'])['TransactionAmt'].transform('median') - df['TransactionAmt']
    df['card6'] = LabelEncoder().fit_transform(df['card6'])

    #addr1
    df['addr1'] = df['addr1'].fillna(-999)
    df['addr1_isnull'] = df['addr1'].isnull().astype(int)
    df['addr1_frequency'] = df['addr1'].map(df['addr1'].value_counts())
    df['addr_trans'] = df.groupby(['addr1'])['TransactionAmt'].transform('median') - df['TransactionAmt']
    df['addr_trans_q'] = pd.qcut(df['addr_trans'], [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], labels=False).fillna(-1)
    df['addr_card_med'] = df.groupby(['addr1'])['card1'].transform('median') - df['card1']
    df['addr_card'] = df.groupby(['addr1'])['card1'].transform('median') - df['card1']
    df['addr_card_q'] = pd.qcut(df['addr_card'], [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], labels=False).fillna(-1)
    df['addrcard'] = df['addr1'].astype(str) + "_" + df['card1'].astype(str)
    df['addrcard'] = LabelEncoder().fit_transform(df['addrcard'])

    #addr2 & dist1 & dist2
    df['addr2'] = df['addr2'].fillna(-999)
    df['dist1'] = df['dist1'].fillna(-999)
    df['dist2'] = df['dist2'].fillna(-999)

    #p_emaildomain
    df['P_emaildomain'] = LabelEncoder().fit_transform(df['P_emaildomain'].map(email_map))

    #r_emaildomain
    df['R_emaildomain'] = LabelEncoder().fit_transform(df['R_emaildomain'].map(email_map))

    #Cs
    for Ci in range(1, 15):
        c = "C" + str(Ci)
        df[c] = df[c].fillna(-999)
        df[c + '_trans_div'] = df.groupby([c])['TransactionAmt'].transform('median') - df['TransactionAmt']
        df[c + '_trans_div_q'] = pd.qcut(df[c + '_trans_div'], [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], labels=False).fillna(-1)

    #Ds
    for Di in range(1, 16):
        d = "D" + str(Di)
        df[d] = df[d].fillna(-999)
    
    #Ms
    for Mi in range(1, 10):
        m = "M" + str(Mi)
        df[m] = df[m].fillna("NAN")
        df[m] = LabelEncoder().fit_transform(df[m])

    #Vs
    for Vi in range(1, 340):
        v = "V" + str(Vi)
        df[v] = df[v].fillna(-999)

    #ids
    for Ii in range(1, 39):
        i = "id_0" if Ii < 10 else "id_"
        i += str(Ii)
        if df[i].dtype == 'object':
            if i == "id_30":
                df[i] = df[i].map(parse_id30)
            elif i == "id_31":
                df[i] = df[i].map(parse_id31)
            elif i == 'id_33':
                df[i] = df[i].map(parse_id33)
            df[i] = df[i].fillna('NAN')
            df[i] = LabelEncoder().fit_transform(df[i])
        else:
            df[i] = df[i].fillna(-999)

    #test functions
    features = ['TransactionAmt', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'D15', 'C13', 'addr1', 'dist1', 'R_emaildomain', 'M2', 'M4']
    features_combinations = list(itertools.combinations(features, 2))
    for f1, f2 in features_combinations:
        feature_name = f1 + '_' + f2
        df[feature_name] = df[f1].astype('str') + '_' + df[f2].astype('str')
        df[feature_name] = df[feature_name].map(df[feature_name].value_counts())

    #deviceType & deviceInfo
    df['DeviceType'] = LabelEncoder().fit_transform(df['DeviceType'].fillna("NAN"))
    df['DeviceInfo'] = LabelEncoder().fit_transform(df['DeviceInfo'].map(parse_deviceinfo))

    df.sort_values(by=["TransactionDT"], ascending=True).reset_index()
    df = df.drop(["TransactionDT", "TransactionID", "month"], axis=1)

    gc.collect()
    return df

# RFE

In [0]:
test = pipeline(data_trainTR.drop(['isFraud'], axis=1), data_trainID)
test = reduce_mem_usage(test)
X = test.to_numpy()
y = data_trainTR['isFraud'].to_numpy().reshape(-1, 1)

#RFE estimator
clf = lgb.LGBMClassifier(
    num_leaves = 490,
    n_estimators = 300,
    metrics = 'auc',
    objective = 'binary'
)

#RFECV
rfecv = RFECV(
    estimator=clf,
    step=10,
    cv=TimeSeriesSplit(),
    verbose=2
)

rfecv.fit(X, y)

features = [x for x in test.columns[rfecv.ranking_ == 1]]
print(f"Number of features: {len(features)}")
print("\nFeatures:\n", features)

Memory usage before: 2624.0 MB
Memory usage after: 1087.0 MB
Optimization: 59.0%
Fitting estimator with 587 features.
Fitting estimator with 577 features.
Fitting estimator with 567 features.
Fitting estimator with 557 features.
Fitting estimator with 547 features.
Fitting estimator with 537 features.
Fitting estimator with 527 features.
Fitting estimator with 517 features.
Fitting estimator with 507 features.
Fitting estimator with 497 features.
Fitting estimator with 487 features.
Fitting estimator with 477 features.
Fitting estimator with 467 features.
Fitting estimator with 457 features.
Fitting estimator with 447 features.
Fitting estimator with 437 features.
Fitting estimator with 427 features.
Fitting estimator with 417 features.
Fitting estimator with 407 features.
Fitting estimator with 397 features.
Fitting estimator with 387 features.
Fitting estimator with 377 features.
Fitting estimator with 367 features.
Fitting estimator with 357 features.
Fitting estimator with 347 feat