# IEEE-CIS Fraud Detection

## Feature Engineering

In [29]:
# Import necessary modules

import pandas as pd
import numpy as np

from sklearn import preprocessing

In [56]:
train = pd.read_pickle('dataset/train_mem.pkl')
train.set_index('TransactionID', drop=True, inplace=True)

test = pd.read_pickle('dataset/test_mem.pkl')
test.set_index('TransactionID', drop=True, inplace=True)

In [3]:
# def train_test_sync (train_data, test_data, columns):
#     '''
#     Syncronizes test data with train data. Returns syncronized test data.
#     '''
#     for col in columns:
#         unique_list = train_data[col].unique().tolist()
#         test_data[col].loc[~test_data[col].isin(unique_list)] = np.nan
#     return test_data

In [4]:
# def mean_encoder(train_data, full_data, columns):
#     for col in columns:
#         full_data[col].fillna('missing', inplace=True)
#         col_dict = train_data.groupby(col).isFraud.mean().to_dict()
#         full_data[col+'_val'] = full_data[col].map(col_dict)
#     return full_data 

In [57]:
def feature_engineering(train, test):
    
    ## PREPROCESSING
    sync_list = (['P_emaildomain', 'R_emaildomain']
                 +['DeviceInfo']
                 +['id_12', 'id_15', 'id_16', 'id_23', 'id_27', 'id_28', 'id_29', 'id_30', 'id_31', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38']
                )
    
    # Preprocessing for unique values in the test set (sync_list columns)
    for col in sync_list:
        unique_list = train[col].unique().tolist()
        test[col].loc[~test[col].isin(unique_list)] = np.nan
    
    # Concatenate train and test data
    data = pd.concat((train.drop('isFraud', axis=1), test)).copy()

    ## FEATURE SELECTION
    # List of columns/features to be removed from the adtaaset
    remove_list = (#['TransactionAmt'] # new (log transformed) variable created
                   ['C2', 'C4', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14'] # correlation
                  +['D2', 'D6', 'D7', 'D12'] # correlation
                  +['V5', 'V9', 'V11', 'V13', 'V16', 'V17', 'V18', 'V20', 'V21', 'V22', 'V24', 
                    'V26', 'V28', 'V29', 'V30', 'V31', 'V32', 'V33', 'V34', 'V36', 'V38', 'V39', 
                    'V40', 'V41', 'V42', 'V43', 'V45', 'V48', 'V49', 'V50', 'V51', 'V52', 'V54', 
                    'V57', 'V58', 'V59', 'V60', 'V62', 'V63', 'V64', 'V65', 'V67', 'V68', 'V69', 
                    'V70', 'V71', 'V72', 'V73', 'V74', 'V76', 'V79', 'V80', 'V81', 'V83', 'V84', 
                    'V85', 'V87', 'V88', 'V89', 'V90', 'V91', 'V92', 'V93', 'V94', 'V96', 'V97', 
                    'V100', 'V101', 'V102', 'V103', 'V105', 'V106', 'V110', 'V113', 'V116', 'V119', 
                    'V125', 'V126', 'V127', 'V128', 'V132', 'V133', 'V134', 'V137', 'V140', 'V142', 
                    'V143', 'V145', 'V147', 'V149', 'V150', 'V151', 'V152', 'V153', 'V154', 'V155', 
                    'V156', 'V157', 'V158', 'V159', 'V160', 'V162', 'V163', 'V164', 'V165', 'V167', 
                    'V168', 'V177', 'V178', 'V179', 'V180', 'V181', 'V182', 'V183', 'V185', 'V186', 
                    'V189', 'V190', 'V191', 'V192', 'V193', 'V195', 'V196', 'V197', 'V198', 'V199', 
                    'V200', 'V201', 'V202', 'V203', 'V204', 'V206', 'V207', 'V211', 'V212', 'V213', 
                    'V216', 'V217', 'V218', 'V219', 'V222', 'V225', 'V228', 'V229', 'V230', 'V231', 
                    'V232', 'V233', 'V234', 'V235', 'V236', 'V237', 'V239', 'V242', 'V243', 'V244', 
                    'V245', 'V246', 'V247', 'V248', 'V249', 'V251', 'V252', 'V253', 'V254', 'V255', 
                    'V256', 'V257', 'V258', 'V259', 'V261', 'V262', 'V263', 'V264', 'V265', 'V266', 
                    'V267', 'V268', 'V269', 'V271', 'V272', 'V273', 'V274', 'V275', 'V276', 'V277', 
                    'V278', 'V279', 'V280', 'V285', 'V287', 'V289', 'V292', 'V293', 'V294', 'V295', 
                    'V296', 'V297', 'V298', 'V299', 'V301', 'V302', 'V303', 'V304', 'V306', 'V307', 
                    'V308', 'V309', 'V310', 'V311', 'V312', 'V315', 'V316', 'V317', 'V318', 'V319', 
                    'V320', 'V321', 'V322', 'V323', 'V324', 'V325', 'V326', 'V327', 'V328', 'V329', 
                    'V330', 'V331', 'V332', 'V333', 'V334', 'V335', 'V336', 'V337', 'V338', 'V339'] # correlation
                  ) 
    # Remove unnecessary columns
    data.drop(remove_list, axis=1, inplace=True)
    
    ## TRANSACTION 
    # Create a new column with the log of TransactionAmt
    data['TransactionAmt_log'] = np.log(data.TransactionAmt)
    
    # New feature - decimal part of the transaction amount
    data['TransactionAmt_decimal'] = ((data['TransactionAmt'] - data['TransactionAmt'].astype(int)) * 1000).astype(int)

    # Datetime
    import datetime
    START_DATE = '2019-03-01'
    startdate = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')
    data['TransactionDT_DT'] = data['TransactionDT'].apply(lambda x: (startdate + datetime.timedelta(seconds = x)))
    
    # Hour column
    hours = data['TransactionDT']/3600
    data['Transaction_hours'] = np.floor(hours) % 24
    
    ## DEVICE
    # Fill NaNs with "missing"
    data['DeviceType'].fillna('missing', inplace=True)

    ## M Columns
    M_dict = {'T':1, 'F':0, 'M0':0, 'M1':1, 'M2':2}
    data[['M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9']] = data[['M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9']].replace(M_dict)
    
    ## MEAN ENCODER
    # Create new numeric column
    for col in sync_list:
        data[col].fillna('missing', inplace=True)
        col_dict = train.groupby(col).isFraud.mean().to_dict()
        data[col+'_val'] = data[col].map(col_dict)
    
    lbl_enc_list = (['P_emaildomain', 'R_emaildomain']
             +['DeviceInfo']
            )
    
    
    ## New features from Kernels
    data['TransactionAmt_to_std_card1'] = data['TransactionAmt'] / data.groupby(['card1'])['TransactionAmt'].transform('std')
    data['TransactionAmt_to_mean_card1'] = data['TransactionAmt'] / data.groupby(['card1'])['TransactionAmt'].transform('mean')
    data['TransactionAmt_to_std_card4'] = data['TransactionAmt'] / data.groupby(['card4'])['TransactionAmt'].transform('std')
    data['TransactionAmt_to_mean_card4'] = data['TransactionAmt'] / data.groupby(['card4'])['TransactionAmt'].transform('mean')
    
    data['D15_to_mean_card1'] = data['D15'] / data.groupby(['card1'])['D15'].transform('mean')
    data['D15_to_std_card1'] = data['D15'] / data.groupby(['card1'])['D15'].transform('std')
    data['D15_to_mean_addr1'] = data['D15'] / data.groupby(['addr1'])['D15'].transform('mean')
    data['D15_to_std_addr1'] = data['D15'] / data.groupby(['addr1'])['D15'].transform('std')
    
    # Replace inf values with nan
    data.replace([np.inf, -np.inf], np.nan, inplace=True)   
    
    ## DROP UNNECESSARY COLUMNS
    # List of columns to be dropped from the dataset for modeling
    drop_list = (['TransactionDT'] # new (same) variable created
                +['TransactionDT_DT'] # datetime 
                + sync_list # new (encoded) variables created
                )
    
    # Create a list of columns with more than 20% missing 
    # remove_missing_cols = data.isna().mean()[data.isna().mean()>0.2].index.to_list()
    
    # Drop unnecessary columns 
    # data.drop(drop_list+remove_missing_cols, axis=1, inplace=True)
    data.drop(drop_list, axis=1, inplace=True)
    
    
    ## LABEL ENCODER
#     for col in data.columns:
#         if data[col].dtype=='object': 
#             lbl = preprocessing.LabelEncoder()
#             data[col] = lbl.fit_transform(data[col].values)
    
    ## PREPARE DATA FOR MODELING
    # Create dummy variables
    data = pd.get_dummies(data)
    
    # Split train and test data
    train_data = data[:train.shape[0]]
    test_data = data[train.shape[0]:]
    
    return train_data.join(train.isFraud), test_data

In [76]:
# def feature_engineering(train, test):
    
#     ## PREPROCESSING
#     sync_list = (['P_emaildomain', 'R_emaildomain']
#                  +['DeviceInfo']
#                 )
    
#     # Preprocessing for unique values in the test set (e-mail columns)
#     P_email_list = train.P_emaildomain.unique().tolist()
#     test.P_emaildomain.loc[~test.P_emaildomain.isin(P_email_list)] = np.nan
    
#     R_email_list = train.R_emaildomain.unique().tolist()
#     test.R_emaildomain.loc[~test.R_emaildomain.isin(R_email_list)] = np.nan
    
#     # Preprocessing for unique values in the test set (DeviceInfo columns)
#     device_list = train.DeviceInfo.unique().tolist()
#     test.DeviceInfo.loc[~test.DeviceInfo.isin(device_list)] = np.nan
    
    
#     # Concatenate train and test data
#     data = pd.concat((train.drop('isFraud', axis=1), test)).copy()

#     ## FEATURE SELECTION
#     # List of columns/features to be removed from the adtaaset
#     remove_list = (#['TransactionAmt'] # new (log transformed) variable created
#                    ['C2', 'C4', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14'] # correlation
#                   +['D2', 'D6', 'D7', 'D12'] # correlation
#                   +['V5', 'V9', 'V11', 'V13', 'V16', 'V17', 'V18', 'V20', 'V21', 'V22', 'V24', 
#                     'V26', 'V28', 'V29', 'V30', 'V31', 'V32', 'V33', 'V34', 'V36', 'V38', 'V39', 
#                     'V40', 'V41', 'V42', 'V43', 'V45', 'V48', 'V49', 'V50', 'V51', 'V52', 'V54', 
#                     'V57', 'V58', 'V59', 'V60', 'V62', 'V63', 'V64', 'V65', 'V67', 'V68', 'V69', 
#                     'V70', 'V71', 'V72', 'V73', 'V74', 'V76', 'V79', 'V80', 'V81', 'V83', 'V84', 
#                     'V85', 'V87', 'V88', 'V89', 'V90', 'V91', 'V92', 'V93', 'V94', 'V96', 'V97', 
#                     'V100', 'V101', 'V102', 'V103', 'V105', 'V106', 'V110', 'V113', 'V116', 'V119', 
#                     'V125', 'V126', 'V127', 'V128', 'V132', 'V133', 'V134', 'V137', 'V140', 'V142', 
#                     'V143', 'V145', 'V147', 'V149', 'V150', 'V151', 'V152', 'V153', 'V154', 'V155', 
#                     'V156', 'V157', 'V158', 'V159', 'V160', 'V162', 'V163', 'V164', 'V165', 'V167', 
#                     'V168', 'V177', 'V178', 'V179', 'V180', 'V181', 'V182', 'V183', 'V185', 'V186', 
#                     'V189', 'V190', 'V191', 'V192', 'V193', 'V195', 'V196', 'V197', 'V198', 'V199', 
#                     'V200', 'V201', 'V202', 'V203', 'V204', 'V206', 'V207', 'V211', 'V212', 'V213', 
#                     'V216', 'V217', 'V218', 'V219', 'V222', 'V225', 'V228', 'V229', 'V230', 'V231', 
#                     'V232', 'V233', 'V234', 'V235', 'V236', 'V237', 'V239', 'V242', 'V243', 'V244', 
#                     'V245', 'V246', 'V247', 'V248', 'V249', 'V251', 'V252', 'V253', 'V254', 'V255', 
#                     'V256', 'V257', 'V258', 'V259', 'V261', 'V262', 'V263', 'V264', 'V265', 'V266', 
#                     'V267', 'V268', 'V269', 'V271', 'V272', 'V273', 'V274', 'V275', 'V276', 'V277', 
#                     'V278', 'V279', 'V280', 'V285', 'V287', 'V289', 'V292', 'V293', 'V294', 'V295', 
#                     'V296', 'V297', 'V298', 'V299', 'V301', 'V302', 'V303', 'V304', 'V306', 'V307', 
#                     'V308', 'V309', 'V310', 'V311', 'V312', 'V315', 'V316', 'V317', 'V318', 'V319', 
#                     'V320', 'V321', 'V322', 'V323', 'V324', 'V325', 'V326', 'V327', 'V328', 'V329', 
#                     'V330', 'V331', 'V332', 'V333', 'V334', 'V335', 'V336', 'V337', 'V338', 'V339'] # correlation
#                   ## FOR NOW
#                   +['id_01', 'id_02', 'id_03', 'id_04', 'id_05', 'id_06', 'id_07', 'id_08', 'id_09', 
#                      'id_10', 'id_11', 'id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 
#                      'id_19', 'id_20', 'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 
#                      'id_28', 'id_29', 'id_30', 'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 
#                      'id_37', 'id_38'] # Needs feature engineering and missing val analysis
#                   ) 
#     # Remove unnecessary columns
#     data.drop(remove_list, axis=1, inplace=True)
    
    
#     ## TRANSACTION 
#     # Create a new column with the log of TransactionAmt
#     data['TransactionAmt_log'] = np.log(data.TransactionAmt)
    
#     # Datetime
#     import datetime
#     START_DATE = '2019-03-01'
#     startdate = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')
#     data['TransactionDT_DT'] = data['TransactionDT'].apply(lambda x: (startdate + datetime.timedelta(seconds = x)))
    
#     # Hour column
#     hours = data['TransactionDT']/3600
#     data['Transaction_hours'] = np.floor(hours) % 24

#     ## E-MAIL 
#     # Fill NaNs with "missing"
#     data.P_emaildomain.fillna('missing', inplace=True)
#     data.P_emaildomain.fillna('missing', inplace=True)

#     # Create dict for each column (using train data)
#     P_email_dict = train.groupby('P_emaildomain').isFraud.mean().to_dict()
#     R_email_dict = train.groupby('R_emaildomain').isFraud.mean().to_dict()

#     # Create new numeric column
#     data['P_emaildomain_val'] = data.P_emaildomain.map(P_email_dict)
#     data['R_emaildomain_val'] = data.R_emaildomain.map(R_email_dict)
    
#     ## DEVICE
#     # Fill NaNs with "missing"
#     data['DeviceType'].fillna('missing', inplace=True)

#     # Create new numeric column
#     data.DeviceInfo.fillna('missing', inplace=True)
#     device_dict = train.groupby('DeviceInfo').isFraud.mean().to_dict()
#     data['DeviceInfo_val'] = data.DeviceInfo.map(device_dict)
    
#     ## PREPARE DATA FOR MODEL
#     # List of columns to be dropped from the dataset for modeling
#     drop_list = (['TransactionDT'] # new (same) variable created
#                 +['TransactionDT_DT'] # datetime 
#                 +['P_emaildomain', 'R_emaildomain'] # new (encoded) variable created
#                 +['DeviceInfo'] # new (encoded) variable created
#                 )
    
#     # Create a list of columns with more than 20% missing 
#     remove_missing_cols = data.isna().mean()[data.isna().mean()>0.5].index.to_list()
    
#     # Drop unnecessary columns 
#     data.drop(drop_list+remove_missing_cols, axis=1, inplace=True)
    
#     # Create dummy variables
#     data = pd.get_dummies(data)
    
#     # Split train and test data
#     train_data = data[:train.shape[0]]
#     test_data = data[train.shape[0]:]
    
#     return train_data.join(train.isFraud), test_data

In [58]:
train, test = feature_engineering(train, test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [54]:
train.shape, test.shape

((590540, 204), (506691, 203))

In [49]:
print(train.columns.to_list())

['TransactionAmt', 'card1', 'card2', 'card3', 'card5', 'addr1', 'addr2', 'dist1', 'dist2', 'C1', 'C3', 'C5', 'D1', 'D3', 'D4', 'D5', 'D8', 'D9', 'D10', 'D11', 'D13', 'D14', 'D15', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'V1', 'V2', 'V3', 'V4', 'V6', 'V7', 'V8', 'V10', 'V12', 'V14', 'V15', 'V19', 'V23', 'V25', 'V27', 'V35', 'V37', 'V44', 'V46', 'V47', 'V53', 'V55', 'V56', 'V61', 'V66', 'V75', 'V77', 'V78', 'V82', 'V86', 'V95', 'V98', 'V99', 'V104', 'V107', 'V108', 'V109', 'V111', 'V112', 'V114', 'V115', 'V117', 'V118', 'V120', 'V121', 'V122', 'V123', 'V124', 'V129', 'V130', 'V131', 'V135', 'V136', 'V138', 'V139', 'V141', 'V144', 'V146', 'V148', 'V161', 'V166', 'V169', 'V170', 'V171', 'V172', 'V173', 'V174', 'V175', 'V176', 'V184', 'V187', 'V188', 'V194', 'V205', 'V208', 'V209', 'V210', 'V214', 'V215', 'V220', 'V221', 'V223', 'V224', 'V226', 'V227', 'V238', 'V240', 'V241', 'V250', 'V260', 'V270', 'V281', 'V282', 'V283', 'V284', 'V286', 'V288', 'V290', 'V291', 'V300', 'V305'

### Save Train and Test Data

In [59]:
pd.to_pickle(train, 'dataset/train_engineered_new2.pkl')
pd.to_pickle(test, 'dataset/test_engineered_new2.pkl')

### Feature Selection

### Transaction

In [6]:
# Create a new column with the log of TransactionAmt
train['TransactionAmt_log'] = np.log(train.TransactionAmt)

In [7]:
import datetime

START_DATE = '2019-03-01'
startdate = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')

train['TransactionDT_DT'] = train['TransactionDT'].apply(lambda x: (startdate + datetime.timedelta(seconds = x)))

In [8]:
def make_hour_feature(df, tname='TransactionDT'):
    """
    Creates an hour of the day feature, encoded as 0-23. 
    
    Parameters:
    -----------
    df : pd.DataFrame
        df to manipulate.
    tname : str
        Name of the time column in df.
    """
    hours = df[tname] / (3600)        
    encoded_hours = np.floor(hours) % 24
    return encoded_hours

train['Transaction_hours'] = make_hour_feature(train)

### E-mail

In [9]:
# Fill NaNs with "missing"
train.P_emaildomain.fillna('missing', inplace=True)
train.R_emaildomain = train.P_emaildomain.fillna('missing')

# Create dict for each columns (-> use these dicts in for converting TEST DATA)
P_email_dict = train.groupby('P_emaildomain').isFraud.mean().to_dict()
R_email_dict = train.groupby('R_emaildomain').isFraud.mean().to_dict()

# Create new numeric columns
train['P_emaildomain_val'] = train.P_emaildomain.map(P_email_dict)
train['R_emaildomain_val'] = train.R_emaildomain.map(R_email_dict)

## Encoding

In [11]:
from sklearn.preprocessing import OneHotEncoder

# Create the encoder.
encoder = OneHotEncoder(handle_unknown="ignore")
encoder.fit(train)

# Apply the encoder.
train_encoded = encoder.transform(X_train)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

## Imputation

In [None]:
from sklearn.impute import SimpleImputer

# imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
# imputer.fit_transform(df)

## Save Data

In [10]:
train.shape

(590540, 439)

In [11]:
len(remove_list + drop_list)

295

In [12]:
save_train = train.drop(remove_list + drop_list, axis=1)

In [13]:
save_train.shape

(590540, 144)

In [14]:
pd.to_pickle(save_train, 'dataset/train_engineered.pkl')