# Data Preprocessing, Feature Engineering, & Modeling
IF4041 - Data Science & Data Mining assignment: Fraud Detection.  
By  
13516015 [Michelle Eliza Gananjaya](https://github.com/)  
13516030 [Yonas Adiel Wiguna](https://github.com/)  
13516101 [Kelvin Kristian](https://github.com/)  
13516140 [Ilham Firdausi Putra](https://github.com/ilhamfp)  


# Load Data

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/numpy/sklearn;
# consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

# Dataset locations, given as relative paths (resolved against the current
# working directory). DIR_DATA_INTERIM is not referenced in the cells visible
# here.
DIR_DATA_RAW = '../data/raw'
DIR_DATA_INTERIM = '../data/interim'

In [2]:
def get_filepaths(directory):
    """Collect the path of every file found below ``directory``.

    The tree rooted at ``directory`` is traversed with ``os.walk`` and each
    file name is joined onto the folder that contains it.

    **Args**:
    * directory (string): Root folder to search

    **Return**:
    * file_paths (list of string): One path per file, in ``os.walk`` order
    """
    return [
        os.path.join(folder, name)
        for folder, _subfolders, names in os.walk(directory)
        for name in names
    ]

def load_data(DIR_DATA_RAW, max_files=4):
    """Load the feature table and the labels stored under a raw-data directory.

    Every file below ``DIR_DATA_RAW`` whose path ends with ``Inputs`` or
    ``Targets`` is collected and sorted by path. Consecutive files in that
    order are read as (Inputs, Targets) pairs and stacked row-wise.

    **Args**:
    * DIR_DATA_RAW (string): Path to the raw directory
    * max_files (int or None): Maximum number of sorted files to read.
      Defaults to 4 (two Inputs/Targets pairs), which is the previous
      hard-coded behaviour; pass None to read every matching file.

    **Return**:
    * data (dataframe): Rows of all Inputs files, with a fresh 0..n-1 index
    * label (dataframe): Rows of all Targets files in a single column
      named ``label``, with a fresh 0..n-1 index

    **Raises**:
    * ValueError: If no matching file is found, or if the selected files
      cannot be grouped into matching Inputs/Targets pairs.
    """
    file_paths = sorted(
        path for path in get_filepaths(DIR_DATA_RAW)
        if path.endswith(("Inputs", "Targets"))
    )
    if max_files is not None:
        file_paths = file_paths[:max_files]

    print("Loading data from: ")
    for path in file_paths:
        print(path)

    if not file_paths:
        raise ValueError(
            "No 'Inputs'/'Targets' files found under {!r}".format(DIR_DATA_RAW)
        )
    if len(file_paths) % 2:
        raise ValueError(
            "Expected Inputs/Targets pairs but got an odd number of files: "
            "{}".format(file_paths)
        )

    data_parts = []
    label_parts = []
    for inputs_path, targets_path in zip(file_paths[0::2], file_paths[1::2]):
        # Pairing is positional, so make sure each Inputs file really sits
        # next to the Targets file of the same split before trusting it.
        stem = inputs_path[:-len("Inputs")]
        if not inputs_path.endswith("Inputs") or targets_path != stem + "Targets":
            raise ValueError(
                "Files do not form an Inputs/Targets pair: {!r}, {!r}".format(
                    inputs_path, targets_path
                )
            )
        data_parts.append(pd.read_csv(inputs_path))
        # Targets files carry no header row.
        label_parts.append(pd.read_csv(targets_path, header=None))

    # Concatenate once instead of growing the frames inside the loop.
    data = pd.concat(data_parts, ignore_index=True)
    label = pd.concat(label_parts, ignore_index=True)
    label.columns = ['label']
    return data, label

In [3]:
# Read the raw Inputs/Targets files into a feature frame and a label frame.
data, label = load_data(DIR_DATA_RAW)

Loading data from: 
../data/raw/DataminingContest2009.Task1.CV1.Test.Inputs
../data/raw/DataminingContest2009.Task1.CV1.Test.Targets
../data/raw/DataminingContest2009.Task1.CV1.Train.Inputs
../data/raw/DataminingContest2009.Task1.CV1.Train.Targets


In [4]:
# Sanity check: dimensions and first rows of the feature frame.
print(data.shape)
data.head()

(94682, 19)


Unnamed: 0,amount,hour1,state1,zip1,field1,domain1,field2,hour2,flag1,total,field3,field4,field5,indicator1,indicator2,flag2,flag3,flag4,flag5
0,25.9,0,FL,331,3,BELLSOUTH.NET,1,0,1,25.9,3878,8,0,0,0,0,1,0,1
1,38.85,0,TX,750,2,COMCAST.NET,1,0,0,38.85,-6330,21,1,0,0,1,1,0,1
2,38.85,1,VA,222,2,HOTMAIL.COM,0,1,0,38.85,5183,19,1,0,0,0,0,0,1
3,24.95,1,CA,946,0,GMAIL.COM,0,1,0,24.95,3822,16,0,0,0,0,0,0,1
4,20.72,1,CO,805,3,LEVEL3.COM,0,1,1,20.72,3536,8,1,0,0,1,1,0,1


In [5]:
# Sanity check: the label frame should have one row per row of `data`.
print(label.shape)
label.tail()

(94682, 1)


Unnamed: 0,label
94677,0
94678,0
94679,1
94680,1
94681,1


# Data Preprocessing & Feature Engineering

In [6]:
from sklearn import preprocessing

def label_encode(data):
    """Replace every distinct value in ``data`` with an integer code.

    A fresh ``LabelEncoder`` is fitted on ``data`` and applied to it in the
    same call, so the codes are only meaningful for this particular input.

    **Args**:
    * data (array-like): One-dimensional collection of values to encode

    **Return**:
    * codes (numpy array): Integer code for each element of ``data``
    """
    return preprocessing.LabelEncoder().fit_transform(data)


def preprocess(data):
    """Generate the preprocessed data.

    Missing ``domain1`` values are replaced with the most frequent domain,
    then ``state1`` and ``domain1`` are label encoded. The work is done on a
    copy, so the frame passed in is left untouched.

    **Args**:
    * data (dataframe): The data points

    **Return**:
    * new_data (dataframe): New data points dataframe with the preprocessed
      features
    """
    data = data.copy()

    # TODO: domain1 has one null value; it is filled with the mode for now,
    # revisit whether that is the best treatment.
    # The fill has to happen BEFORE encoding: astype(str) turns NaN into the
    # literal string "nan", which the encoder would then treat as a category
    # of its own, leaving nothing for fillna to fill afterwards.
    domain_mode = data['domain1'].mode()
    if not domain_mode.empty:
        data['domain1'] = data['domain1'].fillna(domain_mode[0])

    # We label encode state1 & domain1 because of their massive number of
    # unique values. One-hot encoding is infeasible in this case.
    for column in ('state1', 'domain1'):
        data[column] = label_encode(data[column].astype(str).values)

    # TODO: decide which other preprocessing steps are needed.

    return data

def generate_new_feature(data):
    """Feature-engineering hook; currently a pass-through.

    **Args**:
    * data (dataframe): The data points

    **Return**:
    * data (dataframe): The very object that was passed in, unchanged
    """
    # Ideas for features that could be added here:
    # 1. Split domain1 into the domain name and its top-level domain,
    #    e.g. BELLSOUTH & NET.
    # 2. More to come; idea 1 was inspired by features built for this Kaggle
    #    competition: https://www.kaggle.com/c/ieee-fraud-detection
    return data

In [7]:
# Rebind `data` to the encoded frame, then pass it through the
# feature-engineering step.
data = preprocess(data)
data = generate_new_feature(data)

In [8]:
# state1 and domain1 should now show integer codes instead of strings.
data.head()

Unnamed: 0,amount,hour1,state1,zip1,field1,domain1,field2,hour2,flag1,total,field3,field4,field5,indicator1,indicator2,flag2,flag3,flag4,flag5
0,25.9,0,11,331,3,494,1,0,1,25.9,3878,8,0,0,0,0,1,0,1
1,38.85,0,45,750,2,1102,1,0,0,38.85,-6330,21,1,0,0,1,1,0,1
2,38.85,1,47,222,2,3021,0,1,0,38.85,5183,19,1,0,0,0,0,0,1
3,24.95,1,6,946,0,2620,0,1,0,24.95,3822,16,0,0,0,0,0,0,1
4,20.72,1,7,805,3,4319,0,1,1,20.72,3536,8,1,0,0,1,1,0,1


# Model

In [9]:
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import f1_score

def get_skfold():
    """Build the cross-validation splitter used by the model functions.

    **Return**:
    * skfold (StratifiedKFold): 5 shuffled folds, seeded with random_state=1
      so every call produces the same split
    """
    skfold = StratifiedKFold(random_state=1, shuffle=True, n_splits=5)
    return skfold

## LightGBM

In [10]:
def get_lgbm_model(X, y):
    """Run LightGBM under cross-validation and return its out-of-fold
    predictions.

    For every fold produced by ``get_skfold()`` a fresh ``LGBMClassifier`` is
    fitted on the training rows, and column 1 of its ``predict_proba`` output
    is written into the positions of the held-out rows.

    **Args**:
    * X (dataframe): Dataframe containing features
    * y (numpy array): Numpy array containing label

    **Return**:
    * lightgbm_pred (numpy array): Out-of-fold predicted probability, one
      value per row of X
    """
    lightgbm_param = {'random_state': 1}

    y = np.asarray(y)
    lightgbm_pred = np.zeros(len(y))

    # split() yields positional indices, so rows are selected with .iloc;
    # label-based .loc would only be correct for a default 0..n-1 index.
    for train_index, test_index in get_skfold().split(X, y):
        clf = LGBMClassifier(**lightgbm_param)
        clf.fit(X.iloc[train_index], y[train_index])
        lightgbm_pred[test_index] = clf.predict_proba(X.iloc[test_index])[:, 1]

    return lightgbm_pred

In [11]:
# Out-of-fold LightGBM probabilities, one per row of `data`.
lightgbm_pred = get_lgbm_model(data, label.label.values)

In [12]:
# Binarise the out-of-fold probabilities at a fixed cut-off and report F1.
threshold = 0.5
lightgbm_pred_binary = [int(prob > threshold) for prob in lightgbm_pred]
print("F1 Score: {}".format(
    f1_score(label['label'].values, lightgbm_pred_binary)
))

F1 Score: 0.4138171667829728


## XGBoost

In [13]:
def get_xgb_model(X, y):
    """Run XGBoost under cross-validation and return its out-of-fold
    predictions.

    For every fold produced by ``get_skfold()`` a fresh ``XGBClassifier`` is
    fitted on the training rows, and column 1 of its ``predict_proba`` output
    is written into the positions of the held-out rows.

    **Args**:
    * X (dataframe): Dataframe containing features
    * y (numpy array): Numpy array containing label

    **Return**:
    * xgboost_pred (numpy array): Out-of-fold predicted probability, one
      value per row of X
    """
    xgboost_param = {'random_state': 1}

    y = np.asarray(y)
    xgboost_pred = np.zeros(len(y))

    # split() yields positional indices, so rows are selected with .iloc;
    # label-based .loc would only be correct for a default 0..n-1 index.
    for train_index, test_index in get_skfold().split(X, y):
        clf = XGBClassifier(**xgboost_param)
        clf.fit(X.iloc[train_index], y[train_index])
        xgboost_pred[test_index] = clf.predict_proba(X.iloc[test_index])[:, 1]

    return xgboost_pred

In [14]:
# Out-of-fold XGBoost probabilities, one per row of `data`.
xgboost_pred = get_xgb_model(data, label.label.values)

In [15]:
# Binarise the out-of-fold probabilities at a fixed cut-off and report F1.
threshold = 0.5
xgboost_pred_binary = [int(prob > threshold) for prob in xgboost_pred]
print("F1 Score: {}".format(
    f1_score(label['label'].values, xgboost_pred_binary)
))

F1 Score: 0.31452859350850076


## Random Forest

In [16]:
def get_rf_model(X, y):
    """Run Random Forest under cross-validation and return its out-of-fold
    predictions.

    For every fold produced by ``get_skfold()`` a fresh
    ``RandomForestClassifier`` is fitted on the training rows, and column 1 of
    its ``predict_proba`` output is written into the positions of the
    held-out rows.

    **Args**:
    * X (dataframe): Dataframe containing features
    * y (numpy array): Numpy array containing label

    **Return**:
    * rf_pred (numpy array): Out-of-fold predicted probability, one value per
      row of X
    """
    rf_param = {'random_state': 1}

    y = np.asarray(y)
    rf_pred = np.zeros(len(y))

    # split() yields positional indices, so rows are selected with .iloc;
    # label-based .loc would only be correct for a default 0..n-1 index.
    for train_index, test_index in get_skfold().split(X, y):
        clf = RandomForestClassifier(**rf_param)
        clf.fit(X.iloc[train_index], y[train_index])
        rf_pred[test_index] = clf.predict_proba(X.iloc[test_index])[:, 1]

    return rf_pred

In [17]:
# Out-of-fold Random Forest probabilities, one per row of `data`.
rf_pred = get_rf_model(data, label.label.values)

In [18]:
# Binarise the out-of-fold probabilities at a fixed cut-off and report F1.
threshold = 0.5
rf_pred_binary = [int(prob > threshold) for prob in rf_pred]
print("F1 Score: {}".format(
    f1_score(label['label'].values, rf_pred_binary)
))

F1 Score: 0.4732203389830508


## Ensemble

In [19]:
# Try a simple ensemble: blend the predicted probabilities of the three
# classifiers above into one weighted score.
# The blend weights are hard-coded for now.
ensemble_pred = 0.2 * lightgbm_pred + 0.1 * xgboost_pred + 0.7 * rf_pred

In [20]:
# Binarise the blended probabilities at a fixed cut-off and report F1.
threshold = 0.5
ensemble_pred_binary = [int(prob > threshold) for prob in ensemble_pred]
print("F1 Score: {}".format(
    f1_score(label['label'].values, ensemble_pred_binary)
))

F1 Score: 0.4733606557377049
