# Data Preprocessing, Feature Engineering, & Modeling
IF4041 - Data Science & Data Mining assignment: Fraud Detection.  
By  
13516015 [Michelle Eliza Gananjaya](github.com/)  
13516030 [Yonas Adiel Wiguna](github.com/)  
13516101 [Kelvin Kristian](github.com/)  
13516140 [Ilham Firdausi Putra](github.com/ilhamfp)  


# Load Data

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE

import warnings
warnings.filterwarnings('ignore')

DIR_DATA_RAW = '../data/raw'
DIR_DATA_INTERIM = '../data/interim'

Using TensorFlow backend.


In [2]:
def get_filepaths(directory):
    """
    This function will generate the file names in a directory 
    tree by walking the tree either top-down or bottom-up. For each 
    directory in the tree rooted at directory top (including top itself), 
    it yields a 3-tuple (dirpath, dirnames, filenames).
    """
    file_paths = []  # List which will store all of the full filepaths.

    # Walk the tree.
    for root, directories, files in os.walk(directory):
        for filename in files:
            # Join the two strings in order to form the full filepath.
            filepath = os.path.join(root, filename)
            file_paths.append(filepath)  # Add it to the list.

    return file_paths  # Self-explanatory.

def load_data(DIR_DATA_RAW):
    """Load data and label from the given directory path
    
    **Args**:
    * DIR_DATA_RAW (string): Path to the raw directory
    
    **Return**:
    * data (dataframe) 
    * label (dataframe)
    """
    file_paths = get_filepaths(DIR_DATA_RAW)
    file_paths = [path for path in file_paths if (path.endswith("Inputs") or path.endswith("Targets"))]
    file_paths.sort()
    
    print("Loading data from: ")
    for x in file_paths[:4]:
        print(x)
        
    data = pd.DataFrame()
    label = pd.DataFrame()

    for i in range(0, len(file_paths[:4]), 2):
        cur_data = pd.read_csv(file_paths[i])
        cur_label = pd.read_csv(file_paths[i+1], header=None)

        data = pd.concat([data, cur_data])
        label = pd.concat([label, cur_label])
        
    label.columns = ['label']
    data = data.reset_index(drop=True)
    label = label.reset_index(drop=True)
    return data, label

In [22]:
data, label = load_data(DIR_DATA_RAW)

Loading data from: 
../data/raw\DataminingContest2009.Task1.CV1.Test.Inputs
../data/raw\DataminingContest2009.Task1.CV1.Test.Targets
../data/raw\DataminingContest2009.Task1.CV1.Train.Inputs
../data/raw\DataminingContest2009.Task1.CV1.Train.Targets


In [4]:
print(data.shape)
data.head()

(94682, 19)


Unnamed: 0,amount,hour1,state1,zip1,field1,domain1,field2,hour2,flag1,total,field3,field4,field5,indicator1,indicator2,flag2,flag3,flag4,flag5
0,25.9,0,FL,331,3,BELLSOUTH.NET,1,0,1,25.9,3878,8,0,0,0,0,1,0,1
1,38.85,0,TX,750,2,COMCAST.NET,1,0,0,38.85,-6330,21,1,0,0,1,1,0,1
2,38.85,1,VA,222,2,HOTMAIL.COM,0,1,0,38.85,5183,19,1,0,0,0,0,0,1
3,24.95,1,CA,946,0,GMAIL.COM,0,1,0,24.95,3822,16,0,0,0,0,0,0,1
4,20.72,1,CO,805,3,LEVEL3.COM,0,1,1,20.72,3536,8,1,0,0,1,1,0,1


In [5]:
print(label.shape)
label.tail()

(94682, 1)


Unnamed: 0,label
94677,0
94678,0
94679,1
94680,1
94681,1


# Data Preprocessing & Feature Engineering

In [29]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

def label_encode(data):
    """Hehuhehue Given array of data, label encode.
    """
    
    le = preprocessing.LabelEncoder()
    return le.fit_transform(data)


def preprocess_fill_nan(data):
    """Generating preprocess data.
    
    **Args**:
    * data (dataframe): The data points
    
    **Return**:
    * new_data (dataframe): New data points dataframe with new preprocessed feature
    """
    
    # TODO: Data domain1 ada yang null 1, enaknya diapain ya? sekarang gw isi modus doang
    data['domain1'] = data['domain1'].fillna(data['domain1'].mode()[0])
    
    return data

def preprocess_encode_categorical_value(data):
    # We label encode state1 & domain1 because its massive amount of unique values.
    # One-hot encoding is infeasible in this case.
    data['state1'] = label_encode(data['state1'].astype(str).values)
    data['domain_name'] = label_encode(data['domain_name'].astype(str).values)
    data['domain_top_level'] = label_encode(data['domain_top_level'].astype(str).values)
    
    return data

def preprocess_remove_high_correlation_feature(data):
    # Remove high correlation feature
    data = data.drop(columns=['hour2','total'])
    return data

def preprocess_oversample_minor_class(data, label):
    # Oversample minority class
    data_train, data_test, label_train, label_test = train_test_split(data, label, test_size=0.25, random_state=27)

    sm = SMOTE(random_state=27, ratio=1.0)
    data, label = sm.fit_sample(data, label)
    
    return data, label

def preprocess_dimension_reduction(data, components):
    data = StandardScaler().fit_transform(data)
    pca = PCA(n_components=components)
    principalComponents = pca.fit_transform(data)
    principalDf = pd.DataFrame(data = principalComponents)
    
    return principalDf

def preprocess_generate_new_feature(data):
    """Generating new feature.
    
    **Args**:
    * data (dataframe): The data points
    
    **Return**:
    * new_data (dataframe): New data points dataframe with new engineered feature
    """

    # Bisa coba tambahin fitur:
    # 1. pisahin domain1 jadi nama domainnya dan top level domainnnya? Contoh: BELLSOUTH & NET
    # 2. apa lagi ya, yang nomor 1 terinspirasi dari kompetisi kaggle ini sih https://www.kaggle.com/c/ieee-fraud-detection ada yang bikin feature kaya gitu
    #
   
    domain_name = []
    domain_top_level = []
    
    for domain in data['domain1'].iteritems():
        domainSplitted = domain[1].split('.')
        domain_name.append(domainSplitted[0])
        domain_top_level.append(domainSplitted[1])
    
    data['domain_name'] = domain_name
    data['domain_top_level'] = domain_top_level
    
    data = data.drop(columns=['domain1'])

    return data

## Fill NaN Data

In [7]:
data_without_nan = preprocess_fill_nan(data)

## Generate New Feature

In [8]:
data_new_domain_features = preprocess_generate_new_feature(data_without_nan)
data_new_domain_features.head()

Unnamed: 0,amount,hour1,state1,zip1,field1,field2,hour2,flag1,total,field3,field4,field5,indicator1,indicator2,flag2,flag3,flag4,flag5,domain_name,domain_top_level
0,25.9,0,FL,331,3,1,0,1,25.9,3878,8,0,0,0,0,1,0,1,BELLSOUTH,NET
1,38.85,0,TX,750,2,1,0,0,38.85,-6330,21,1,0,0,1,1,0,1,COMCAST,NET
2,38.85,1,VA,222,2,0,1,0,38.85,5183,19,1,0,0,0,0,0,1,HOTMAIL,COM
3,24.95,1,CA,946,0,0,1,0,24.95,3822,16,0,0,0,0,0,0,1,GMAIL,COM
4,20.72,1,CO,805,3,0,1,1,20.72,3536,8,1,0,0,1,1,0,1,LEVEL3,COM


## Encode Categorical Values

In [9]:
data_encoded = preprocess_encode_categorical_value(data_new_domain_features)
data_encoded.head()

Unnamed: 0,amount,hour1,state1,zip1,field1,field2,hour2,flag1,total,field3,field4,field5,indicator1,indicator2,flag2,flag3,flag4,flag5,domain_name,domain_top_level
0,25.9,0,11,331,3,1,0,1,25.9,3878,8,0,0,0,0,1,0,1,482,248
1,38.85,0,45,750,2,1,0,0,38.85,-6330,21,1,0,0,1,1,0,1,1086,248
2,38.85,1,47,222,2,0,1,0,38.85,5183,19,1,0,0,0,0,0,1,2995,77
3,24.95,1,6,946,0,0,1,0,24.95,3822,16,0,0,0,0,0,0,1,2595,77
4,20.72,1,7,805,3,0,1,1,20.72,3536,8,1,0,0,1,1,0,1,4291,77


## Remove High Correlation Feature

In [20]:
data_without_highcorr_feature = preprocess_remove_high_correlation_feature(data_encoded)
data_without_highcorr_feature.head()

Unnamed: 0,amount,hour1,state1,zip1,field1,field2,flag1,field3,field4,field5,indicator1,indicator2,flag2,flag3,flag4,flag5,domain_name,domain_top_level
0,25.9,0,11,331,3,1,1,3878,8,0,0,0,0,1,0,1,482,248
1,38.85,0,45,750,2,1,0,-6330,21,1,0,0,1,1,0,1,1086,248
2,38.85,1,47,222,2,0,0,5183,19,1,0,0,0,0,0,1,2995,77
3,24.95,1,6,946,0,0,0,3822,16,0,0,0,0,0,0,1,2595,77
4,20.72,1,7,805,3,0,1,3536,8,1,0,0,1,1,0,1,4291,77


## Dimension Reduction

In [31]:
# Using PCA
components = 3
data_reduced_dimension = preprocess_dimension_reduction(data_without_highcorr_feature, components)
data_reduced_dimension.head()

Unnamed: 0,0,1,2
0,0.847794,0.324632,-0.554799
1,-2.092895,1.460692,-2.082147
2,-0.925512,-1.171369,-0.160564
3,-0.479558,-1.729791,-0.416347
4,1.205099,1.488074,0.852186


## Oversample Minor Class

In [11]:
# Should be done later while cross validation

# print("Total data before oversampling", data_without_highcorr_feature.shape)

# data_oversampled, label = preprocess_oversample_minor_class(data_without_highcorr_feature, label)

# print("Total data after oversampling", data_oversampled.shape)

# Model

In [12]:
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import f1_score

def get_skfold():
    return StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

## LightGBM

In [13]:
def get_lgbm_model(X, y):
    """Run LightGBM model to produce its out-of-fold prediction
    or test prediction if X_test is provided.
    
    **Args**:
    * X (dataframe): Dataframe containing features
    * y (numpy array): Numpy array containing label
    
    **Return**:
    * lightgbm_oof (numpy array) : Numpy array containing out-of-fold prediction / test prediction
    """
    
    lightgbm_param = {'random_state': 1}
    
    lightgbm_pred = np.zeros(len(y))
    count = 0

    for train_index, test_index in get_skfold().split(X, y):
        count += 1
        clf = LGBMClassifier(**lightgbm_param)
        clf.fit(X.loc[train_index,:], y[train_index])
        lightgbm_pred[[test_index]] = clf.predict_proba(X.loc[test_index,:])[:,1]

    return lightgbm_pred

In [14]:
lightgbm_pred = get_lgbm_model(data, label.label.values)

ValueError: DataFrame.dtypes for data must be int, float or bool.
Did not expect the data types in the following fields: state1, domain1, domain_name, domain_top_level

In [None]:
threshold = 0.5
lightgbm_pred_binary = [1 if x > threshold else 0 for x in lightgbm_pred]
print("F1 Score: {}".format(f1_score(label.label.values, lightgbm_pred_binary)))

## XGBoost

In [None]:
def get_xgb_model(X, y):
    """Run XGBoost model to produce its out-of-fold prediction
    or test prediction if X_test is provided.
    
    **Args**:
    * X (dataframe): Dataframe containing features
    * y (numpy array): Numpy array containing label
    
    **Return**:
    * xgboost_oof (numpy array) : Numpy array containing out-of-fold prediction / test prediction
    """
    
    xgboost_param = {'random_state': 1}
   
    
    xgboost_pred = np.zeros(len(y))
    count = 0

    for train_index, test_index in get_skfold().split(X, y):
        count += 1
        clf = XGBClassifier(**xgboost_param)
        clf.fit(X.loc[train_index,:], y[train_index])
        xgboost_pred[[test_index]] = clf.predict_proba(X.loc[test_index,:])[:,1]

    return xgboost_pred

In [None]:
xgboost_pred = get_xgb_model(data, label.label.values)

In [None]:
threshold = 0.5
xgboost_pred_binary = [1 if x > threshold else 0 for x in xgboost_pred]
print("F1 Score: {}".format(f1_score(label.label.values, xgboost_pred_binary)))

## Random Forest

In [None]:
def get_rf_model(X, y):
    """Run Random Forest model to produce its out-of-fold prediction
    or test prediction if X_test is provided.
    
    **Args**:
    * X (dataframe): Dataframe containing features
    * y (numpy array): Numpy array containing label
    
    **Return**:
    * rf_pred (numpy array) : Numpy array containing out-of-fold prediction / test prediction
    """
    
    rf_param = {'random_state': 1}
    
    rf_pred = np.zeros(len(y))
    count = 0

    for train_index, test_index in get_skfold().split(X, y):
        count += 1
        clf = RandomForestClassifier(**rf_param)
        clf.fit(X.loc[train_index,:], y[train_index])
        rf_pred[[test_index]] = clf.predict_proba(X.loc[test_index,:])[:,1]

    return rf_pred

In [None]:
rf_pred = get_rf_model(data, label.label.values)

In [None]:
threshold = 0.5
rf_pred_binary = [1 if x > threshold else 0 for x in rf_pred]
print("F1 Score: {}".format(f1_score(label.label.values, rf_pred_binary)))

## Ensemble

In [None]:
# Ini kita cobain teknik ensemble
# Ngegabungin hasil probability prediksi 3 classifier sebelumnya
# Ini untuk sekarang kita hard code aja nilai persentasenya
ensemble_pred = (lightgbm_pred*0.2) + (xgboost_pred*0.1) + (rf_pred*0.7)

In [None]:
threshold = 0.5
ensemble_pred_binary = [1 if x > threshold else 0 for x in ensemble_pred]
print("F1 Score: {}".format(f1_score(label.label.values, ensemble_pred_binary)))