In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from scipy.stats import vonmises
from tqdm import tqdm
import itertools
import gc

In [4]:
# load pickled data from EDA file
df_train = pd.read_pickle('df_train.pkl')
df_test  = pd.read_pickle('df_test.pkl')

### Categorical Features and Numerical Features

In [5]:
# As per Competition host data description https://www.kaggle.com/competitions/ieee-fraud-detection/discussion/101203
cat_features = ['ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 
               'addr1', 'addr2', 'P_emaildomain', 'R_emaildomain', 'M1',
               'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'DeviceType', 'DeviceInfo',
               'id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20',
               'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_29', 'id_30',
               'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38']


In [6]:
exclude = ['TransactionID', 'isFraud']
num_features = [col_name for col_name in df_train.columns if (col_name not in cat_features) & (col_name not in exclude)]

## Feature Engineering
1) When Constructuin a credit card fraud detection model, it is important how to extract the right features from the transactional data. This is usually done by aggregating the transactions inoreder to observe the spending behavioral patterns of the customers.
2) We aggregated transactions to capture consumer buying behavior prior to each transaction and used these aggregations for model estimation to identify fraudulent transactions.
3) One difficulty with analysis of credit card fraud is that perpetrators do not
usually carry on a single fraudulent transaction. 
4) Analyzing fraud from the perspective of a ‘‘one by one’’ transaction omits the idea
of clustering that is inherent of credit card fraud actions.
5) Perpetrators usually produce a group of fraudulent transactions. We argue that analyzing the aggregated behavior is essential to improve credit card fraud detection rates.
6) From Top 20 features of below journal we adopted 13 features which are relevent for our dataset 
https://www.researchgate.net/publication/335139727_Predicting_Credit_Card_Transaction_Fraud_Using_Machine_Learning_Algorithms 

In [7]:
#https://www.kaggle.com/code/brodzik/ieee-cis-fraud-detection/notebook?scriptVersionId=21213096
for df in [df_train, df_test]:
    df["day_of_week"] = np.floor((df["TransactionDT"] / (3600 * 24) - 1) % 7).astype(int)
    
    df["TransactionAmt_int"] = df["TransactionAmt"].astype(int)
    df["TransactionAmt_dec"] = (1000 * (df["TransactionAmt"] - df["TransactionAmt_int"])).astype(int)
    
        
    for a, b in itertools.combinations(["card1", "card2", "card3", "card4", "card5", "addr1", "addr2", "dist1", "dist2"], 2):
        df["{}__{}".format(a, b)] = df[a].astype(str) + "__" + df[b].astype(str)
    
    df[["P_emaildomain_0", "P_emaildomain_1", "P_emaildomain_2"]] = df["P_emaildomain"].str.split(".", expand=True)
    df[["R_emaildomain_0", "R_emaildomain_1", "R_emaildomain_2"]] = df["R_emaildomain"].str.split(".", expand=True)
    df[["operating_system_0", "operating_system_1", "operating_system_2", "operating_system_3"]] = df["id_30"].str.split(" ", expand=True)
    df[["browser_0", "browser_1", "browser_2", "browser_3"]] = df["id_31"].str.split(" ", expand=True)
    df[["resolution_width", "resolution_height"]] = df["id_33"].str.split("x", expand=True)
    df[["DeviceInfo_0", "DeviceInfo_1", "DeviceInfo_2", "DeviceInfo_3", "DeviceInfo_4"]] = df["DeviceInfo"].str.split(r"[ -/_]", expand=True)[[0, 1, 2, 3, 4]]

    df["TransactionAmt_to_mean_card1"] = df["TransactionAmt"] / df.groupby(["card1"])["TransactionAmt"].transform("mean")
    df["TransactionAmt_to_mean_card4"] = df["TransactionAmt"] / df.groupby(["card4"])["TransactionAmt"].transform("mean")
    df["TransactionAmt_to_std_card1"] = df["TransactionAmt"] / df.groupby(["card1"])["TransactionAmt"].transform("std")
    df["TransactionAmt_to_std_card4"] = df["TransactionAmt"] / df.groupby(["card4"])["TransactionAmt"].transform("std")

    df["id_02_to_mean_card1"] = df["id_02"] / df.groupby(["card1"])["id_02"].transform("mean")
    df["id_02_to_mean_card4"] = df["id_02"] / df.groupby(["card4"])["id_02"].transform("mean")
    df["id_02_to_std_card1"] = df["id_02"] / df.groupby(["card1"])["id_02"].transform("std")
    df["id_02_to_std_card4"] = df["id_02"] / df.groupby(["card4"])["id_02"].transform("std")

    df["D15_to_mean_card1"] = df["D15"] / df.groupby(["card1"])["D15"].transform("mean")
    df["D15_to_mean_card4"] = df["D15"] / df.groupby(["card4"])["D15"].transform("mean")
    df["D15_to_std_card1"] = df["D15"] / df.groupby(["card1"])["D15"].transform("std")
    df["D15_to_std_card4"] = df["D15"] / df.groupby(["card4"])["D15"].transform("std")

    df["D15_to_mean_addr1"] = df["D15"] / df.groupby(["addr1"])["D15"].transform("mean")
    df["D15_to_mean_addr2"] = df["D15"] / df.groupby(["addr2"])["D15"].transform("mean")
    df["D15_to_std_addr1"] = df["D15"] / df.groupby(["addr1"])["D15"].transform("std")
    df["D15_to_std_addr2"] = df["D15"] / df.groupby(["addr2"])["D15"].transform("std")

In [8]:
for i in df_train.columns[212:]:
    if i[:3]=='Tra' or i[:3]=='id_' or i[:3]=='D15' or i[:3]=='day':
        num_features.append(i)
    else:
        cat_features.append(i)

In [9]:
# https://www.kaggle.com/code/cdeotte/xgb-fraud-with-magic-0-9600#The-Magic-Feature---UID
# FREQUENCY ENCODE TOGETHER
def encode_FE(df1, df2, cols):
    for col in cols:
        vc = df1[col].value_counts(dropna=True, normalize=True).to_dict()
        vc1 = df2[col].value_counts(dropna=True, normalize=True).to_dict()
        vc[-1] = -1
        nm = col+'_FE'
        df1[nm] = df1[col].map(vc)
        df1[nm] = df1[nm].astype('float32')
        df2[nm] = df2[col].map(vc1)
        df2[nm] = df2[nm].astype('float32')

# LABEL ENCODE
def encode_LE(col,train=df_train,test=df_test):
    df_comb,_ = train[col].factorize(sort=True)
    df_comb1,_ = test[col].factorize(sort=True)
    nm = col
    if df_comb.max()>32000: 
        train[nm] = df_comb[:len(train)].astype('int32')
        test[nm] = df_comb1[len(train):].astype('int32')
    else:
        train[nm] = df_comb[:len(train)].astype('int16')
        test[nm] = df_comb1[len(train):].astype('int16')
    del df_comb; x=gc.collect()
# COMBINE FEATURES
def encode_CB(col1,col2):
    nm = col1+'_'+col2
    df_train[nm] = df_train[col1].astype(str)+'_'+df_train[col2].astype(str)
    df_test[nm]= df_test[col1].astype(str)+'_'+df_test[col2].astype(str) 
    encode_LE(nm)


In [10]:
def encode_AG(main_columns, uids, aggregations=['mean'], train_df=df_train, test_df=df_test, 
              fillna=True, usena=False):
    # AGGREGATION OF MAIN WITH UID FOR GIVEN STATISTICS
    for main_column in main_columns:  
        for col in uids:
            for agg_type in aggregations:
                new_col_name = main_column+'_'+col+'_'+agg_type
                temp_df =train_df[[col, main_column]]
                temp_df1=test_df[[col,main_column]]
                if usena: temp_df.loc[temp_df[main_column]==-1,main_column] = np.nan
                temp_df = temp_df.groupby([col])[main_column].agg([agg_type]).reset_index().rename(
                                                        columns={agg_type: new_col_name})

                temp_df.index = list(temp_df[col])
                temp_df = temp_df[new_col_name].to_dict()   
                if usena: temp_df1.loc[temp_df[main_column]==-1,main_column] = np.nan
                temp_df1 = temp_df1.groupby([col])[main_column].agg([agg_type]).reset_index().rename(
                                                        columns={agg_type: new_col_name})

                temp_df1.index = list(temp_df1[col])
                temp_df1 = temp_df1[new_col_name].to_dict()

                train_df[new_col_name] = train_df[col].map(temp_df).astype('float32')
                test_df[new_col_name]  = test_df[col].map(temp_df1).astype('float32')
                
                if fillna:
                    train_df[new_col_name].fillna(-1,inplace=True)
                    test_df[new_col_name].fillna(-1,inplace=True)
# GROUP AGGREGATION NUNIQUE
def encode_AG2(main_columns, uids, train_df=df_train, test_df=df_test):
    for main_column in main_columns:  
        for col in uids:
            comb = pd.concat([train_df[[col]+[main_column]],test_df[[col]+[main_column]]],axis=0)
            mp = comb.groupby(col)[main_column].agg(['nunique'])['nunique'].to_dict()
            train_df[col+'_'+main_column+'_ct'] = train_df[col].map(mp).astype('float32')
            test_df[col+'_'+main_column+'_ct'] = test_df[col].map(mp).astype('float32')

                

In [11]:
for i in range(1,16):
    if i in [1,2,3,5,9]: continue
    df_train['D'+str(i)] =  df_train['D'+str(i)] - df_train.TransactionDT/np.float32(24*60*60)
    df_test['D'+str(i)]  = df_test['D'+str(i)] - df_test.TransactionDT/np.float32(24*60*60)

In [12]:
# FREQUENCY ENCODE: ADDR1, CARD1, CARD2, CARD3, P_EMAILDOMAIN
encode_FE(df_train,df_test,['addr1','card1','card2','card3','P_emaildomain'])
# COMBINE COLUMNS CARD1+ADDR1, CARD1+ADDR1+P_EMAILDOMAIN
encode_CB('card1','addr1')
encode_CB('card1_addr1','P_emaildomain')
# FREQUENCY ENOCDE
encode_FE(df_train,df_test,['card1_addr1','card1_addr1_P_emaildomain'])
# GROUP AGGREGATE
encode_AG(['TransactionAmt','D9','D11'],['card1','card1_addr1','card1_addr1_P_emaildomain'],['mean','std'],usena=True)

In [13]:
df_train['uid'] = df_train.card1_addr1.astype(str)+'_'+np.floor(df_train.day-df_train.D1).astype(str)
df_test['uid'] = df_test.card1_addr1.astype(str)+'_'+np.floor(df_test.day-df_test.D1).astype(str)

In [19]:
list(df_train.columns)

['TransactionID',
 'isFraud',
 'TransactionDT',
 'TransactionAmt',
 'ProductCD',
 'card1',
 'card2',
 'card3',
 'card4',
 'card5',
 'card6',
 'addr1',
 'addr2',
 'dist1',
 'dist2',
 'P_emaildomain',
 'R_emaildomain',
 'C1',
 'C2',
 'C3',
 'C4',
 'C5',
 'C6',
 'C7',
 'C8',
 'C9',
 'C10',
 'C11',
 'C12',
 'C13',
 'C14',
 'D1',
 'D2',
 'D3',
 'D4',
 'D5',
 'D6',
 'D7',
 'D8',
 'D9',
 'D10',
 'D11',
 'D12',
 'D13',
 'D14',
 'D15',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9',
 'V1',
 'V2',
 'V3',
 'V4',
 'V6',
 'V7',
 'V8',
 'V9',
 'V10',
 'V12',
 'V14',
 'V15',
 'V17',
 'V19',
 'V23',
 'V24',
 'V25',
 'V26',
 'V27',
 'V35',
 'V37',
 'V44',
 'V46',
 'V47',
 'V53',
 'V55',
 'V56',
 'V61',
 'V66',
 'V67',
 'V75',
 'V77',
 'V78',
 'V82',
 'V86',
 'V87',
 'V95',
 'V98',
 'V99',
 'V100',
 'V104',
 'V107',
 'V108',
 'V109',
 'V110',
 'V111',
 'V112',
 'V114',
 'V115',
 'V116',
 'V117',
 'V118',
 'V120',
 'V121',
 'V122',
 'V123',
 'V124',
 'V125',
 'V129',
 'V130',
 'V131',
 'V1

In [14]:
df_train['uid']

0          13832_-13.0
1            32191_1.0
2            37942_1.0
3         28240_-111.0
4            37442_1.0
              ...     
590535     44024_153.0
590536      1578_182.0
590537      7079_182.0
590538     48254_160.0
590539     17800_182.0
Name: uid, Length: 590540, dtype: object

In [15]:
# NEW FEATURE
df_train['outsider15'] = (np.abs(df_train.D1-df_train.D15)>3).astype('int8')
df_test['outsider15'] = (np.abs(df_test.D1-df_test.D15)>3).astype('int8')

In [17]:
df_train['outsider15'].unique()

array([1, 0], dtype=int8)

In [93]:
# FREQUENCY ENCODE UID
encode_FE(df_train,df_test,['uid'])
# AGGREGATE 
encode_AG(['TransactionAmt','D4','D9','D10','D15'],['uid'],['mean','std'],fillna=True,usena=True)
# AGGREGATE
encode_AG(['C'+str(x) for x in range(1,15) if x!=3],['uid'],['mean'],df_train,df_test,fillna=True,usena=True)
# AGGREGATE
# encode_AG(['M'+str(x) for x in range(1,10)],['uid'],['mean'],fillna=True,usena=True)
# AGGREGATE
encode_AG2(['P_emaildomain','dist1','id_02','cents'], ['uid'], train_df=df_train, test_df=df_test)
# AGGREGATE
encode_AG(['C14'],['uid'],['std'],df_train,df_test,fillna=True,usena=True)
# AGGREGATE 
encode_AG2(['C13','V314'], ['uid'], train_df=df_train, test_df=df_test)
# AGGREATE 
encode_AG2(['V136'], ['uid'], train_df=df_train, test_df=df_test)
# NEW FEATURE
df_train['outsider15'] = (np.abs(df_train.D1-df_train.D15)>3).astype('int8')
df_test['outsider15'] = (np.abs(df_test.D1-df_test.D15)>3).astype('int8')


uid_P_emaildomain_ct, uid_dist1_ct, uid_id_02_ct, uid_cents_ct, uid_C13_ct, uid_V314_ct, uid_V136_ct, 

### Drop columns with majority nans 

In [94]:
# replacing inf values to nans
df_train.replace([np.inf, -np.inf], np.nan, inplace=True)
df_test.replace([np.inf, -np.inf], np.nan, inplace=True)
# removing columns that have more than 90 percent nans
col_names_num = df_train[num_features].isna().sum()
col_names_cat = df_train[cat_features].isna().sum()

drop_num=col_names_num[(col_names_num/df_train.shape[0])>0.9].index
drop_cat=col_names_cat[(col_names_cat/df_train.shape[0])>0.9].index

for column in df_train.columns:
    if column in drop_cat or column in drop_num :
        df_train.drop(column, axis=1, inplace=True)
        df_test.drop(column, axis=1, inplace=True)


### Filling Nan Values for remaining columns

In [95]:
# updating categorical features and numerical features list 
# which we manually created at start of the notebook
num_features_new=[]
for i in df_train.columns:
    if df_train[i].dtype != 'object':
        if i not in drop_num and i!='isFraud':
            num_features_new.append(i)
cat_features_new=[]
for j in df_train.columns:
    if df_train[j].dtype == 'object':
        if j not in drop_cat and j!='isFraud':
            cat_features_new.append(j)

### Fill numerical features Nan with median

In [96]:
# train data
median_values = df_train[num_features_new].median()
df_train[num_features_new] = df_train[num_features_new].fillna(median_values)
# test data
median_values = df_test[num_features_new].median()
df_test[num_features_new] = df_test[num_features_new].fillna(median_values)

### Fill Categorical features with % Nan < 1 with interpolation

In [97]:
'''for columns that have very low nan values we use interpolation to fill nans
   based on values above and below of particular nan value
   if nans are greater than 1 percent we create a new label for nans as 'missing'.
'''
for col in cat_features_new:
    if df_train[col].isna().sum()==0:
        continue
    elif df_train[col].isna().sum()<(len(df_train)*0.01):
        df_train[col]=df_train[col].astype('category')
        df_train[col]= df_train[col].cat.codes.replace(-1,np.nan).interpolate().astype(int).astype('category').cat.rename_categories(df_train[col].cat.categories)
    else:
        try:
            df_train[col]=df_train[col].fillna('missing')
        except:
            ValueError
            df_train[col]=df_train[col].astype('category')
            df_train[col]= df_train[col].cat.codes.replace(-1,np.nan).interpolate().astype(int).astype('category').cat.rename_categories(df_train[col].cat.categories)

In [98]:
# for test data
for col in cat_features_new:
    if df_test[col].isna().sum()==0:
        continue
    elif df_test[col].isna().sum()<(len(df_test)*0.01):
        df_test[col]=df_test[col].astype('category')
        df_test[col]= df_test[col].cat.codes.replace(-1,np.nan).interpolate().astype(int).astype('category').cat.rename_categories(df_test[col].cat.categories)
    else:
        try:
            df_test[col]=df_test[col].fillna('missing')
        except:
            ValueError
            df_test[col]=df_test[col].astype('category')
            df_test[col]= df_test[col].cat.codes.replace(-1,np.nan).interpolate().astype(int).astype('category').cat.rename_categories(df_test[col].cat.categories)

In [99]:
# we keep only common columns that are present in both train and test data
list1=df_test.columns
list2=df_train.columns
common_cols=list(set(list1).intersection(list2))

In [100]:
count=0
for i in list2:
    if i not in common_cols and i!='isFraud':
        df_train.drop(i,axis=1,inplace=True)
        count+=1
# remove columns in test data that are not in train data
for cols in df_test.columns:
    if cols not in df_train.columns:
        df_test.drop(cols,axis=1,inplace=True)
        count+=1
print(df_train.shape)
print(df_test.shape)
print('No of Dropped columns which are not common to both train and test',count)

(590540, 341)
(506691, 340)
No of Dropped columns which are not common to both train and test 0


In [101]:
### OHE and Label encoding for categorical features 
import bisect
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [102]:
'''Categories with low cardinality are one hot encoded
   High cardinality features are label encoded'''
for cols in cat_features_new:   
    df_train[cols]=df_train[cols].astype(str)
    le.fit(df_train[cols])
    df_train[cols]=le.transform(df_train[cols])
    df_test[cols]=df_test[cols].astype(str)
    # to handle unknown
    le_classes=le.classes_.tolist()
    bisect.insort_left(le_classes, 'other')
    le.classes_ = le_classes
    df_test[cols] = df_test[cols].map(lambda x: 'other' if x not in le.classes_ else x)
    df_test[cols]=le.transform(df_test[cols])      

### Feature 1:Max amount of Transcation by a particular cardnum in 30 days

In [103]:
# Train data
df_temp_train=df_train[['uid','month','TransactionAmt']].groupby(['uid','month']).max('TrancsactionAmt').reset_index()
df_temp_train.rename({'TransactionAmt':'MAX_amt'},axis=1,inplace=True)
df_train=df_train.merge(df_temp_train,on=['uid','month'])

# Test data
df_temp_test=df_test[['uid','month','TransactionAmt']].groupby(['uid','month']).max('TrancsactionAmt').reset_index()
df_temp_test.rename({'TransactionAmt':'MAX_amt'},axis=1,inplace=True)
df_test=df_test.merge(df_temp_test,on=['uid','month'])

### Feature 2:Total amount by a particular cardnum in this addr1 in 14 days 

In [104]:
# Train data
df_temp_train=df_train[['uid','week2','TransactionAmt','addr1']].groupby(['uid','addr1','week2']).sum('TrancsactionAmt').reset_index()
df_temp_train.rename({'TransactionAmt':'Total_amt'},axis=1,inplace=True)
df_train=df_train.merge(df_temp_train,on=['uid','addr1','week2'])

# Test data
df_temp_test=df_test[['uid','week2','TransactionAmt','addr1']].groupby(['uid','addr1','week2']).sum('TrancsactionAmt').reset_index()
df_temp_test.rename({'TransactionAmt':'Total_amt'},axis=1,inplace=True)
df_test=df_test.merge(df_temp_test,on=['uid','addr1','week2'])

### Feature 3:Median amount by a particular cardnum in this addr1 in 30 days 

In [105]:
# Train data
df_temp_train=df_train[['uid','month','TransactionAmt','addr1']].groupby(['uid','addr1','month']).median('TrancsactionAmt').reset_index()
df_temp_train.rename({'TransactionAmt':'Median_amt'},axis=1,inplace=True)
df_train=df_train.merge(df_temp_train,on=['uid','addr1','month'])

# Test data
df_temp_test=df_test[['uid','month','TransactionAmt','addr1']].groupby(['uid','addr1','month']).median('TrancsactionAmt').reset_index()
df_temp_test.rename({'TransactionAmt':'Median_amt'},axis=1,inplace=True)
df_test=df_test.merge(df_temp_test,on=['uid','addr1','month'])

### Feature 4:Total Number of transactions for a cardnum in a given day

In [106]:
# Train
df_temp_train=df_train[['uid','day','TransactionID']].groupby(['uid','day']).count().reset_index()
df_temp_train.rename({'TransactionID':'#tran_1day'},axis=1,inplace=True)
df_train=df_train.merge(df_temp_train,on=['uid','day'])

# Test data
df_temp_test=df_test[['uid','day','TransactionID']].groupby(['uid','day']).count().reset_index()
df_temp_test.rename({'TransactionID':'#tran_1day'},axis=1,inplace=True)
df_test=df_test.merge(df_temp_test,on=['uid','day'])

### Feature 5:Actual/Average amount by this cardnum in 30 days

In [107]:
'''Frist we are grouping based on card num and month of transaction we obtain 
   No of transaction done by the card in a given month and Actual Amount of transaction
   Then we obtain Average amount of transaction by dividing actual transactionamt by no of transactions
''' 
# Train data
df_temp_train=df_train[['uid','month','TransactionID','TransactionAmt']].groupby(['uid','month']).agg({'TransactionID':'count','TransactionAmt':'sum'}).reset_index()
df_temp_train.rename({'TransactionID':'count','TransactionAmt':'Act_amt'},axis=1,inplace=True)
df_temp_train['Average_amt']=df_temp_train['Act_amt'].values/df_temp_train['count'].values
df_temp_train['Act_amt/Avg_amt']=df_temp_train['Act_amt'].values/df_temp_train['Average_amt'].values
df_train=df_train.merge(df_temp_train.drop(['count','Act_amt','Average_amt'],axis=1),on=['uid','month'])

# Test data
df_temp_test=df_test[['uid','month','TransactionID','TransactionAmt']].groupby(['uid','month']).agg({'TransactionID':'count','TransactionAmt':'sum'}).reset_index()
df_temp_test.rename({'TransactionID':'count','TransactionAmt':'Act_amt'},axis=1,inplace=True)
df_temp_test['Average_amt']=df_temp_test['Act_amt'].values/df_temp_test['count'].values
df_temp_test['Act_amt/Avg_amt']=df_temp_test['Act_amt'].values/df_temp_test['Average_amt'].values
df_test=df_test.merge(df_temp_test.drop(['count','Act_amt','Average_amt'],axis=1),on=['uid','month'])

### Feature 6:No of transactions by this cardnum in this addr2 in 1 day

In [108]:
# Train data
df_temp_train=df_train[['uid','day','TransactionID','addr2']].groupby(['uid','day','addr2']).agg({'TransactionID':'count'}).reset_index()
df_temp_train.rename({'TransactionID':'#tran_1day_addr2'},axis=1,inplace=True)
df_train=df_train.merge(df_temp_train,on=['uid','day','addr2'])

# Test data
df_temp_test=df_test[['uid','day','TransactionID','addr2']].groupby(['uid','day','addr2']).agg({'TransactionID':'count'}).reset_index()
df_temp_test.rename({'TransactionID':'#tran_1day_addr2'},axis=1,inplace=True)
df_test=df_test.merge(df_temp_test,on=['uid','day','addr2'])

### Feature 7:Max amount by this cardnum in this addr2 in 14 day

In [109]:
# Train data
df_temp_train=df_train[['uid','week2','TransactionAmt','addr2']].groupby(['uid','week2','addr2']).agg({'TransactionAmt':'max'}).reset_index()
df_temp_train.rename({'TransactionAmt':'Maxamt_14day_addr2'},axis=1,inplace=True)
df_train=df_train.merge(df_temp_train,on=['uid','week2','addr2'])

# Test data
df_temp_test=df_test[['uid','week2','TransactionAmt','addr2']].groupby(['uid','week2','addr2']).agg({'TransactionAmt':'max'}).reset_index()
df_temp_test.rename({'TransactionAmt':'Maxamt_14day_addr2'},axis=1,inplace=True)
df_test=df_test.merge(df_temp_test,on=['uid','week2','addr2'])

### Feature 8:Median amount by this cardnum in this addr1 in 30 day

In [110]:
# Train data
df_temp_train=df_train[['uid','month','TransactionAmt','addr1']].groupby(['uid','month','addr1']).agg({'TransactionAmt':'median'}).reset_index()
df_temp_train.rename({'TransactionAmt':'Medianamt_30day_addr1'},axis=1,inplace=True)
df_train=df_train.merge(df_temp_train,on=['uid','month','addr1'])

# Test data
df_temp_test=df_test[['uid','month','TransactionAmt','addr1']].groupby(['uid','month','addr1']).agg({'TransactionAmt':'median'}).reset_index()
df_temp_test.rename({'TransactionAmt':'Medianamt_30day_addr1'},axis=1,inplace=True)
df_test=df_test.merge(df_temp_test,on=['uid','month','addr1'])

### Feature 9:Median amount by this cardnum in 30 days

In [111]:
# Train data
df_temp_train=df_train[['uid','month','TransactionAmt']].groupby(['uid','month']).agg({'TransactionAmt':'median'}).reset_index()
df_temp_train.rename({'TransactionAmt':'Medianamt_30day'},axis=1,inplace=True)
df_tran=df_train.merge(df_temp_train,on=['uid','month'])

# Test data
df_temp_test=df_test[['uid','month','TransactionAmt']].groupby(['uid','month']).agg({'TransactionAmt':'median'}).reset_index()
df_temp_test.rename({'TransactionAmt':'Medianamt_30day'},axis=1,inplace=True)
df_test=df_test.merge(df_temp_test,on=['uid','month'])

### Feature 10:Total amount by this cardnum in this addr1 in 30 days

In [112]:
# Train data
df_temp_train=df_train[['uid','month','TransactionAmt','addr1']].groupby(['uid','month','addr1']).agg({'TransactionAmt':'sum'}).reset_index()
df_temp_train.rename({'TransactionAmt':'Totalamt_30day_addr1'},axis=1,inplace=True)
df_train=df_train.merge(df_temp_train,on=['uid','month','addr1'])

# Test data
df_temp_test=df_test[['uid','month','TransactionAmt','addr1']].groupby(['uid','month','addr1']).agg({'TransactionAmt':'sum'}).reset_index()
df_temp_test.rename({'TransactionAmt':'Totalamt_30day_addr1'},axis=1,inplace=True)
df_test=df_test.merge(df_temp_test,on=['uid','month','addr1'])

### Feature 11:Log TransactionAmt

In [113]:
df_train['log_amt']=np.log(df_train['TransactionAmt'].values)
df_test['log_amt'] =np.log(df_test['TransactionAmt'].values)

## Creating a Periodic feature from Vonmises Distribution
1) When using the aggregated features there is still some information that is not completely captured by those features.In particular when we are interested in analyzing the time of the transaction.
2) we use a new method for extracting periodic features in order to estimate if the time of a new transaction is with in the confidence interval of the previous transaction times.
3) The motivation is that a customer is expected to make transactions at similar hours. The proposed methodology is based on analyzing the periodic behavior of a transactiontime,using the vonMises distribution.
4) The vonMises distribution,also known as the periodic normal distribution,is a distribution of a wrappe dnormal distributed variable across a circle.
5) In particular we are interested in calculating a confidence interval(CI) for the time of a transaction.For doing that initially we select a set of transactions made by the same client in the last 7 days.
6) Afterwards,the probability distribution function of the time of the set of transactions is calculated with the help of vonmises module in scipy.stats
7) Vonmises.interval gives us the required confidence interval for present transaction.
8) Expected time of a transaction Using the confidence interval,a transaction can be flag normal or suspicious,depending whether or not the time of the transaction is with in the confidence interval.
9) Then, using the estimated distribution,a new set of features can be extracted,ie.,a binaryfeature if a new transaction time is with in the confidence interval range with probability α(confidence score).
**Reference**<br>
https://albahnsen.github.io/files/Feature%20Engineering%20Strategies%20for%20Credit%20Card%20Fraud%20Detection_published.pdf

In [122]:
df_time=df_train[['TransactionID','hour','uid','day','week']]

In [6]:
'''for index(start from zero to len of dataframe),i is a list [hour,week]
        if index >1:
            if past_week(we obtained from else condition)==present_week(i[index])
                we fit all transactions occured till now(not including present transaction) in this week 
                and get confidence interval for the present transaction.
                we append at what hour transaction occured in temp list
                we store the present transaction week information in past_week 

            else:(i.e new transaction doesn't belong to pastweek)
                we empty all the details of previous transactions 
                fit with the new data get confidence interval
                we append at what hour transaction occured in temp list
                we store the present transaction week information in past_week 
            
        else:
            frist transaction for a given card doesn't have any previous transaction
            so we directly fit the data and get kappa,loc (parameters of vonmises distribution)
            use them to get confidence interval for index=0 transaction.
            we append at what hour transaction occured in temp list
            we store the present transaction week information in past_week 
'''
def get_CI_and_binary_feature(df3):
    confidence=[]
    temp=[]
    for index,i in enumerate(df3[['hour','week']].values):
        if index>1:
            if past_week==i[1]:
                kappa,loc,_=vonmises.fit(temp,fscale=1)
                confidence.append(np.round(vonmises.interval(0.9,kappa,loc=loc, scale=1)))
                temp.append(i[0])
                past_week=i[1]
            else:
                temp=[]
                kappa,loc,_=vonmises.fit(i[0],fscale=1)
                confidence.append(np.round(vonmises.interval(0.9,kappa,loc=loc, scale=1)))
                temp.append(i[0])
                past_week=i[1]
        else:
            kappa,loc,_=vonmises.fit(i[0],fscale=1)
            confidence.append(np.round(vonmises.interval(0.9,kappa,loc=loc, scale=1)))
            past_week=i[1]
            temp.append(i[0])
    # create a new column with name ConfidenceInterval
    df3['ConfidenceInterval']=confidence
    # create a new binary feature
    ''' for item [hour,confidenceinterval]
            if present transaction hour lies with in confidence interval 
            we return true else false '''  
    binary_feature=[]
    for item in df3[['hour','ConfidenceInterval']].values:
        if item[0]>=item[1][0] and item[0]<=item[1][1]:
            binary_feature.append(True)
        else:
            binary_feature.append(False) 
    df3['binary_feature']=binary_feature   
    
    return df3

In [101]:
dict_feature={}
for index,i in enumerate(df_time['card_id'].unique()):
    df3=df_time.loc[df_time['card_id']==i]
    df3=get_CI_and_binary_feature(df3)
    if index==0:
        dict_feature['TransactionID']=df3['TransactionID'].values
        dict_feature['ConfidenceInterval']=df3['ConfidenceInterval'].values
        dict_feature['binary_feature']=df3['binary_feature'].values
    else:
        dict_feature['TransactionID']=np.append(dict_feature['TransactionID'],df3['TransactionID'].values)
        dict_feature['ConfidenceInterval']=np.append(dict_feature['ConfidenceInterval'],df3['ConfidenceInterval'].values)
        dict_feature['binary_feature']=np.append(dict_feature['binary_feature'],df3['binary_feature'].values)
       

In [123]:
df_binary=pd.read_csv('vonmisses_train.csv')
df_binary.head()

Unnamed: 0,TransactionID,ConfidenceInterval,binary_feature
0,2987000,[-0. 0.],True
1,3023492,[12. 12.],True
2,3026562,[6. 6.],False
3,3033460,[4. 9.],False
4,3042544,[22. 22.],True


In [124]:
df_train=df_train.merge(df_binary,on='TransactionID')

In [123]:
df_time=df_test[['TransactionID','hour','card_id','day','week']]

In [145]:
dict_feature={}
for index,i in enumerate(df_time['card_id'].unique()):
    df3=df_time.loc[df_time['card_id']==i]
    df3=get_CI_and_binary_feature(df3)
    if index==0:
        dict_feature['TransactionID']=df3['TransactionID'].values
        dict_feature['ConfidenceInterval']=df3['ConfidenceInterval'].values
        dict_feature['binary_feature']=df3['binary_feature'].values
    else:
        dict_feature['TransactionID']=np.append(dict_feature['TransactionID'],df3['TransactionID'].values)
        dict_feature['ConfidenceInterval']=np.append(dict_feature['ConfidenceInterval'],df3['ConfidenceInterval'].values)
        dict_feature['binary_feature']=np.append(dict_feature['binary_feature'],df3['binary_feature'].values)

In [125]:
df_binary=pd.read_csv('vonmisses_test.csv')
df_binary.head()

Unnamed: 0,TransactionID,ConfidenceInterval,binary_feature
0,3663549,[-0. 0.],True
1,3665539,[22. 22.],True
2,3671801,[ 8. 14.],False
3,3673579,[12. 17.],True
4,3687976,[15. 15.],True


In [126]:
df_test=df_test.merge(df_binary,on='TransactionID')

In [127]:
print(df_train.shape)
print(df_test.shape)

(590540, 353)
(506691, 353)


In [97]:
df_test.isna().sum().sum()

0

In [98]:
df_train.isna().sum().sum()

0

In [19]:
# Save data to pickle files for faster loading
df_train.to_pickle('df_train_fe.pkl')
df_test.to_pickle('df_test_fe.pkl')

## Vonmises distribution
1) Vonmises distribution help to understand periodic nature of time features.
2) Here using Binary feature we are predicting present transaction of a given creditcard follows previous transactions distribution or not.
3) Time span of 7 days <br>
**The above process is taking more than 30 hrs to complete for entire dataset**