<br>

### <span style='color:red;'><b>Note :</b></span> The majority of the feature engineering in this section is inspired by <a href='https://www.kaggle.com/cdeotte/xgb-fraud-with-magic-0-9600'>this</a> competition-winning kernel.
<br><br>

In [3]:
import regex as re
import gc
import datetime
import numpy as np
import pandas as pd
import pickle
import seaborn as sns
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold
import warnings
warnings.filterwarnings("ignore")

In [4]:
# Reading the feature engineered train and test datasets back from their pickle files

train_data, test_data = (
    pd.read_pickle(path) for path in ('basic_fe_train.pkl', 'basic_fe_test.pkl')
)

In [5]:
# One print call; sep="\n" reproduces the line breaks of four separate prints
print(
    "*"*35,
    "\n Train Data Shape : {} \n".format(train_data.shape),
    "\n Test Data Shape : {} \n".format(test_data.shape),
    "*"*35,
    sep="\n",
)

***********************************

 Train Data Shape : (590538, 208) 


 Test Data Shape : (506691, 207) 

***********************************


## Utility Functions
<br>

In [6]:
def cat_num_features(df):
    
    '''
        Utility Function to get the names of Categorical Features and 
        Numerical Features of the given Dataset.

        Parameters
        ----------
        df : pandas DataFrame whose columns are to be split.

        Returns
        -------
        (catf, numf) : tuple of two lists
            catf : the known categorical feature names that are present in
                   df.columns, in the order of the reference list below.
            numf : every other column of df except 'isFraud', in column order.
    '''
    
    catf = [
            'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 
            'card6', 'addr1', 'addr2', 'P_emaildomain', 'R_emaildomain', 'M1', 'M2', 
            'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'DeviceType', 'DeviceInfo',
            'TransactionWD', 'card1_div_1000', 'card2_div_10', 'P_parent_domain', 
            'P_domain_name', 'P_top_level_domain', 'R_parent_domain', 'R_domain_name', 
            'R_top_level_domain', 'device_name', 'device_version', 'os_name', 'os_version', 
            'screen_width', 'screen_height', 'card_intr1', 'card_intr2', 'card1_addr1', 
            'card1_addr2', 'card2_addr1', 'card2_addr2', 'card3_addr1', 'card3_addr2', 
            'card5_addr1', 'card5_addr2', 'card6_addr1', 'card6_addr2', 'ProductCD_addr1', 
            'ProductCD_addr2', 'card1_ProductCD', 'card2_ProductCD', 'card5_ProductCD', 
            'card6_ProductCD', 'addr1_P_emaildomain',
            # 'card1_P_emaildoman' (sic) is kept so frames that carry the misspelled
            # column keep working; the correctly spelled name is accepted as well.
            'card1_P_emaildoman', 'card1_P_emaildomain', 'card1_addr1_P_emaildomain',
            'uid1', 'uid2'
            ]

    catf+=['id_'+str(i) for i in range(12,39)]


    # Updating the Categorical Feature Names List based on the columns present in the dataframe
    # (set lookups avoid rescanning a list for every candidate name)
    present_columns = set(df.columns)
    catf = [feature for feature in catf if feature in present_columns]

    categorical_lookup = set(catf)
    numf = [
        feature for feature in df.columns
        if feature not in categorical_lookup and feature != 'isFraud'
    ]
    
    return (catf, numf)  

In [7]:
def label_encode(X_train, X_test, catf):
  
  '''
    Utility Function to label-encode the given categorical columns.

    Every column named in catf is cast to str in both dataframes. A
    LabelEncoder is fitted on the train column only; the train column is
    replaced by its integer codes and the test column is encoded with the
    same class -> code mapping, using -1 for any category that does not
    occur in the train column.

    Both dataframes are modified in place and returned as (X_train, X_test).
  '''

  for col in catf:

    train_as_str = X_train[col].astype(str)
    test_as_str = X_test[col].astype(str)

    encoder = LabelEncoder().fit(train_as_str)

    # classes_ is ordered by code, so the position of a class is its code
    code_of = {category: code for code, category in enumerate(encoder.classes_)}

    X_train[col] = encoder.transform(train_as_str)

    # The test column is encoded by hand so that a category which is absent
    # from the train set maps to -1 instead of raising an error
    X_test[col] = [code_of.get(category, -1) for category in test_as_str.values]

  return (X_train, X_test)

### Encoding Functions
<br>

In [8]:
def frequency_encode(train_df, test_df, features):

  '''
    Utility Function to frequency-encode the given features.

    For every feature a new column '<feature>_FE' is added in place to both
    dataframes. It holds the relative frequency of each value, computed on
    the train dataframe only (null values are not counted). The name of
    every column created is printed. Nothing is returned.
  '''

  for col in features:

      encoded_col = col+'_FE'

      # Relative frequency of each non-null value in the train column
      freq_by_value = train_df[col].value_counts(normalize=True, dropna=True).to_dict()

      train_df[encoded_col] = train_df[col].map(freq_by_value)

      # The test column is looked up value by value so that anything which
      # has no frequency from the train set falls back to -1
      test_freqs = []
      for value in test_df[col].values:
          test_freqs.append(freq_by_value.get(value, -1))
      test_df[encoded_col] = test_freqs

      print(encoded_col)
      



# https://www.kaggle.com/kyakovlev/ieee-fe-with-some-eda

def feature_aggregation1(features, uids, train_df, test_df, aggregations=['mean']):
    
    '''
      Utility Function to aggregate features per group for the given statistics.

      For every combination of feature, grouping column (uid) and statistic,
      a column '<feature>_<uid>_<statistic>' is added in place to both
      dataframes. The statistic is computed on the train dataframe only and
      then mapped onto the grouping column of each dataframe. The name of
      every column created is printed. Nothing is returned.
    '''

    for f in features:
        for group_col in uids:
            for stat in aggregations:

                new_col = '_'.join([f, group_col, stat])

                # group value -> statistic of the feature within that group (train only)
                stat_by_group = train_df.groupby([group_col])[f].agg([stat])[stat].to_dict()

                train_df[new_col] = train_df[group_col].map(stat_by_group)

                # The test column is looked up value by value so that a group
                # which does not occur in the train set falls back to -1
                test_df[new_col] = [
                    stat_by_group.get(group_value, -1)
                    for group_value in test_df[group_col].values
                ]

                print(new_col)

        


def feature_aggregation2(features, uids, train_df, test_df):
  '''
    Utility Function to count the distinct values of a feature within each group.

    For every combination of feature and grouping column (uid), a column
    '<uid>_<feature>_ct' is added in place to both dataframes. The counts
    are computed on the train dataframe only and then mapped onto the
    grouping column of each dataframe. The name of every column created is
    printed. Nothing is returned.
  '''

  for f in features:
      for group_col in uids:

          new_col = group_col+'_'+f+'_ct'

          # group value -> number of distinct feature values in that group (train only)
          pair_df = train_df[[group_col, f]]
          distinct_by_group = pair_df.groupby(group_col)[f].nunique().to_dict()

          train_df[new_col] = train_df[group_col].map(distinct_by_group)

          # The test column is looked up value by value so that a group
          # which does not occur in the train set falls back to -1
          test_df[new_col] = [
              distinct_by_group.get(group_value, -1)
              for group_value in test_df[group_col].values
          ]

          print(new_col)

## Data Preparation
<br>

In [9]:
# Splitting the train frame into the target and the remaining feature columns
y_train = train_data['isFraud']
X_train = train_data.drop(columns=['isFraud'])

X_test = test_data

# The original names are no longer needed once X_train / y_train / X_test exist
del train_data, test_data

In [10]:
# Storing the Categorical (catf) and Numerical (numf) Feature Names found among the columns of X_train

catf, numf = cat_num_features(X_train)

In [11]:
# Encoding the Categorical Features

# Nulls in the categorical columns become the explicit category 'missing' before encoding
for frame in (X_train, X_test):
    frame[catf] = frame[catf].fillna('missing')

X_train, X_test = label_encode(X_train, X_test, catf)

## Feature Engineering
<br>

In [12]:
# Frequency Encoding

frequency_encode(
    X_train,
    X_test,
    [
        'addr1', 'card1', 'card2', 'card3', 'P_emaildomain',
        'card1_addr1', 'card1_addr1_P_emaildomain',
    ],
)


# Feature Aggregation

feature_aggregation1(
    features=['TransactionAmt', 'D9', 'D11'],
    uids=['card1', 'card1_addr1', 'card1_addr1_P_emaildomain'],
    train_df=X_train,
    test_df=X_test,
    aggregations=['mean', 'std'],
)

addr1_FE
card1_FE
card2_FE
card3_FE
P_emaildomain_FE
card1_addr1_FE
card1_addr1_P_emaildomain_FE
TransactionAmt_card1_mean
TransactionAmt_card1_std
TransactionAmt_card1_addr1_mean
TransactionAmt_card1_addr1_std
TransactionAmt_card1_addr1_P_emaildomain_mean
TransactionAmt_card1_addr1_P_emaildomain_std
D9_card1_mean
D9_card1_std
D9_card1_addr1_mean
D9_card1_addr1_std
D9_card1_addr1_P_emaildomain_mean
D9_card1_addr1_P_emaildomain_std
D11_card1_mean
D11_card1_std
D11_card1_addr1_mean
D11_card1_addr1_std
D11_card1_addr1_P_emaildomain_mean
D11_card1_addr1_P_emaildomain_std


In [13]:
# Adding Month Feature, this will also be used while making final predictions

START_DATE = datetime.datetime.strptime('2017-11-30', '%Y-%m-%d')

for frame in (X_train, X_test):
    # TransactionDT is treated as an offset in seconds from START_DATE; the
    # conversion is vectorised instead of building one Python datetime per row
    transaction_time = pd.Timestamp(START_DATE) + pd.to_timedelta(frame['TransactionDT'], unit='s')

    # Month index that keeps increasing across the year boundary
    frame['DT_M'] = (transaction_time.dt.year-2017)*12 + transaction_time.dt.month

In [14]:
# FREQUENCY ENCODE UID

uid_cols = ['uid1', 'uid2']

frequency_encode(X_train, X_test, uid_cols)


# AGGREGATE UID

feature_aggregation1(['TransactionAmt', 'D4', 'D9', 'D10', 'D15'], uid_cols, X_train, X_test, ['mean', 'std'])

# Only the C columns that are present in X_train are aggregated
c_cols = [col for col in ('C'+str(i) for i in range(1,15)) if col in X_train.columns]
feature_aggregation1(c_cols, uid_cols, X_train, X_test, ['mean'])

# Only the M columns that are present in X_train are aggregated
m_cols = [col for col in ('M'+str(i) for i in range(1,10)) if col in X_train.columns]
feature_aggregation1(m_cols, uid_cols, X_train, X_test, ['mean'])

feature_aggregation1(['C14'], uid_cols, X_train, X_test, ['std'])


feature_aggregation2(['P_emaildomain', 'dist1', 'DT_M', 'id_02', 'cents'], uid_cols, X_train, X_test)
feature_aggregation2(['V127', 'V307'], uid_cols, X_train, X_test)

uid1_FE
uid2_FE
TransactionAmt_uid1_mean
TransactionAmt_uid1_std
TransactionAmt_uid2_mean
TransactionAmt_uid2_std
D4_uid1_mean
D4_uid1_std
D4_uid2_mean
D4_uid2_std
D9_uid1_mean
D9_uid1_std
D9_uid2_mean
D9_uid2_std
D10_uid1_mean
D10_uid1_std
D10_uid2_mean
D10_uid2_std
D15_uid1_mean
D15_uid1_std
D15_uid2_mean
D15_uid2_std
C1_uid1_mean
C1_uid2_mean
C2_uid1_mean
C2_uid2_mean
C4_uid1_mean
C4_uid2_mean
C5_uid1_mean
C5_uid2_mean
C6_uid1_mean
C6_uid2_mean
C7_uid1_mean
C7_uid2_mean
C9_uid1_mean
C9_uid2_mean
C10_uid1_mean
C10_uid2_mean
C11_uid1_mean
C11_uid2_mean
C12_uid1_mean
C12_uid2_mean
C13_uid1_mean
C13_uid2_mean
C14_uid1_mean
C14_uid2_mean
M1_uid1_mean
M1_uid2_mean
M2_uid1_mean
M2_uid2_mean
M3_uid1_mean
M3_uid2_mean
M4_uid1_mean
M4_uid2_mean
M5_uid1_mean
M5_uid2_mean
M6_uid1_mean
M6_uid2_mean
M7_uid1_mean
M7_uid2_mean
M8_uid1_mean
M8_uid2_mean
M9_uid1_mean
M9_uid2_mean
C14_uid1_std
C14_uid2_std
uid1_P_emaildomain_ct
uid2_P_emaildomain_ct
uid1_dist1_ct
uid2_dist1_ct
uid1_DT_M_ct
uid2_DT_M_c

In [15]:
# New Feature : 1 where D1 and D15 differ by more than 3 in absolute value, else 0

for frame in (X_train, X_test):
    frame['outsider15'] = (frame['D1'] - frame['D15']).abs().gt(3).astype('int8')

print('outsider15')

outsider15


In [16]:
# Removing the uid columns from both frames in place
for frame in (X_train, X_test):
    frame.drop(columns=['uid1', 'uid2'], inplace=True)

In [17]:
# One print call; sep="\n" reproduces the line breaks of four separate prints
print(
    "*"*35,
    "\n Train Data Shape : {} \n".format(X_train.shape),
    "\n Test Data Shape : {} \n".format(X_test.shape),
    "*"*35,
    sep="\n",
)

***********************************

 Train Data Shape : (590538, 312) 


 Test Data Shape : (506691, 312) 

***********************************


In [18]:
# Re-attaching the target column so that it is stored together with the train features
X_train['isFraud'] = y_train

In [19]:
# Writing the Feature Engineered Datasets to pickle files

for frame, path in ((X_train, 'advanced_fe_train.pkl'), (X_test, 'advanced_fe_test.pkl')):
    frame.to_pickle(path)