In [41]:
import math
import os

import matplotlib
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import xgboost as xgb

%matplotlib inline
pd.set_option('display.max_columns', 500)

import pandas_profiling



In [3]:
# Load the raw training transactions, then report their size and the share of
# rows flagged as fraud before showing a small preview.
df = pd.read_csv(
    '/kaggle/input/fraud-detection/fraudTrain.csv',
    delimiter=',',
)
nRow, nCol = df.shape
print(f'There are {nRow} rows and {nCol} columns')
print('Event Rate:', np.mean(df['is_fraud']))
display(df.head(3))

In [42]:
# Render an exploratory profile of the full raw frame. `profile_report` is
# presumably the DataFrame accessor registered by the `pandas_profiling`
# import in the first cell - it will not exist without that import.
df.profile_report()

In [4]:
from sklearn.utils import resample

# Balance the classes by shrinking the non-fraud majority to the size of the
# fraud minority.
df_majority = df[df.is_fraud == 0]
df_minority = df[df.is_fraud == 1]

# replace=False: `resample` bootstraps (samples WITH replacement) by default,
# which can pick the same transaction more than once. Duplicated rows share an
# index label, and the tripartite graph builder below keys transaction nodes on
# the index label, so duplicates would collapse into a single node while still
# counting as separate rows in the modelling table.
df_maj_dowsampled = resample(df_majority,
                             replace=False,
                             n_samples=len(df_minority),
                             random_state=42)
df_downsampled = pd.concat([df_minority, df_maj_dowsampled])
print(df_downsampled.is_fraud.value_counts())


In [5]:
# Preview the first rows of the balanced frame (fraud rows come first because
# the minority frame is concatenated first).
df_downsampled.head(3)

In [6]:
def build_graph_bipartite(df_input, graph_type=None):
    """Build a card/merchant bipartite graph with one edge per (card, merchant) pair.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Must contain the columns ``cc_num``, ``merchant``, ``amt`` and
        ``is_fraud``. The frame is copied and never modified.
    graph_type : networkx graph instance, optional
        Graph to populate (passed to ``create_using``). When omitted, a fresh
        ``nx.Graph()`` is created on every call.

    Returns
    -------
    (G, df) : tuple
        ``G`` has integer node ids with a ``bipartite`` node attribute
        (1 = card, 2 = merchant) and edge attributes ``label`` (1 if any
        transaction on that pair is fraudulent, else 0) and ``weight``
        (summed ``amt``). ``df`` is the aggregated edge table with columns
        ``from``, ``to``, ``is_fraud``, ``amt``.
    """
    # A graph instance used as a default argument is created once and shared by
    # every call; networkx clears and reuses an instance passed as
    # `create_using`, so a second call would wipe the graph returned by the first.
    if graph_type is None:
        graph_type = nx.Graph()

    df = df_input.copy()

    # dict.fromkeys de-duplicates while keeping first-seen order, so node ids
    # are reproducible between runs (set ordering of strings is not).
    entities = dict.fromkeys(df['cc_num'].tolist() + df['merchant'].tolist())
    mapping = {entity: node_id for node_id, entity in enumerate(entities)}

    df['from'] = [mapping[card] for card in df['cc_num']]
    df['to'] = [mapping[merchant] for merchant in df['merchant']]

    # Collapse repeated transactions on the same pair into a single edge.
    df = (df[['from', 'to', 'amt', 'is_fraud']]
          .groupby(['from', 'to'])
          .agg({'is_fraud': 'sum', 'amt': 'sum'})
          .reset_index())
    df['is_fraud'] = (df['is_fraud'] > 0).astype('int64')

    # Debug preview; capped so tiny inputs (fewer than 3 edges) do not raise.
    print(df.sample(min(3, len(df))))

    # Attach the attributes while creating the edges instead of running a
    # separate iterrows() pass per attribute.
    edges = [
        (source, target, {'label': label, 'weight': amount})
        for source, target, label, amount in zip(df['from'].tolist(),
                                                 df['to'].tolist(),
                                                 df['is_fraud'].tolist(),
                                                 df['amt'].tolist())
    ]
    G = nx.from_edgelist(edges, create_using=graph_type)

    nx.set_node_attributes(G, dict.fromkeys(df['from'].tolist(), 1), 'bipartite')
    nx.set_node_attributes(G, dict.fromkeys(df['to'].tolist(), 2), 'bipartite')

    return (G, df)

In [7]:
def build_graph_tripartite(df_input, graph_type=None):
    """Build a card / transaction / merchant tripartite graph.

    Every row contributes two edges, ``card -- transaction`` and
    ``merchant -- transaction``. A transaction is identified by its index
    label, so rows sharing an index label share one transaction node.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Must contain the columns ``cc_num``, ``merchant``, ``amt`` and
        ``is_fraud``. The frame is copied and never modified.
    graph_type : networkx graph instance, optional
        Graph to populate (passed to ``create_using``). When omitted, a fresh
        ``nx.Graph()`` is created on every call.

    Returns
    -------
    (G, df) : tuple
        ``G`` has integer node ids with a ``bipartite`` node attribute
        (1 = card, 2 = merchant, 3 = transaction) and edge attributes
        ``label`` (the row's ``is_fraud``) and ``weight`` (the row's ``amt``).
        ``df`` is the input copy plus ``in_node`` (card node id) and
        ``out_node`` (merchant node id).
    """
    # A graph instance used as a default argument is created once and shared by
    # every call; networkx clears and reuses an instance passed as
    # `create_using`, so a second call would wipe the graph returned by the first.
    if graph_type is None:
        graph_type = nx.Graph()

    df = df_input.copy()
    transactions = df.index.tolist()
    cards = df['cc_num'].tolist()
    merchants = df['merchant'].tolist()

    # Keys are tagged with their entity kind: index labels and card numbers are
    # both integers, and an untagged shared mapping would silently merge a
    # transaction with a card whenever the two values happened to be equal.
    # dict.fromkeys keeps first-seen order, so ids are reproducible.
    keys = dict.fromkeys(
        [('transaction', txn) for txn in transactions] +
        [('card', card) for card in cards] +
        [('merchant', merchant) for merchant in merchants]
    )
    mapping = {key: node_id for node_id, key in enumerate(keys)}

    card_nodes = [mapping[('card', card)] for card in cards]
    merchant_nodes = [mapping[('merchant', merchant)] for merchant in merchants]
    txn_nodes = [mapping[('transaction', txn)] for txn in transactions]

    df['in_node'] = card_nodes
    df['out_node'] = merchant_nodes

    labels = df['is_fraud'].tolist()
    amounts = df['amt'].tolist()

    # Edges carry their attributes from the start (one pass instead of a
    # separate iterrows() sweep per attribute). Card edges are added before
    # merchant edges, matching the original insertion order.
    card_edges = [
        (card_node, txn_node, {'label': label, 'weight': amount})
        for card_node, txn_node, label, amount
        in zip(card_nodes, txn_nodes, labels, amounts)
    ]
    merchant_edges = [
        (merchant_node, txn_node, {'label': label, 'weight': amount})
        for merchant_node, txn_node, label, amount
        in zip(merchant_nodes, txn_nodes, labels, amounts)
    ]
    G = nx.from_edgelist(card_edges + merchant_edges, create_using=graph_type)

    nx.set_node_attributes(G, dict.fromkeys(card_nodes, 1), 'bipartite')
    nx.set_node_attributes(G, dict.fromkeys(merchant_nodes, 2), 'bipartite')
    nx.set_node_attributes(G, dict.fromkeys(txn_nodes, 3), 'bipartite')

    return (G, df)

In [8]:
# Build the tripartite graph on the balanced sample. `ti_g_data` is the copy of
# the sample returned by the builder, extended with `in_node` / `out_node`.
G_down, ti_g_data = build_graph_tripartite(df_downsampled)

In [9]:
# Materialise the (node, degree) pairs once and show the first few as a sanity check.
graph_degree = [*G_down.degree()]

print('graph degree sample:', graph_degree[:3])

In [16]:
# Peek at the first three node ids in the graph's node order.
list(G_down.nodes)[0:3]

## Visualize Graph

In [14]:
def plot_graph(dataset):
    """Draw the card/merchant graph of ``dataset`` with nodes coloured by fraud status.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the columns ``cc_num``, ``merchant`` and ``is_fraud``.

    Merchant nodes are coloured by whether any of their transactions in
    ``dataset`` is flagged (0 or 1); every other node (the cards) gets the
    separate class 2. The figure is drawn as a side effect; nothing is returned.
    """
    edges = dataset[['cc_num', 'merchant', 'is_fraud']]

    # Per-merchant fraud flag. Using the groupby's own max() avoids the pandas
    # FutureWarning raised when the builtin `max` is passed to agg().
    merchant_flag = edges.groupby('merchant', sort=False)['is_fraud'].max().to_dict()

    # Create graph object
    G = nx.from_pandas_edgelist(edges, 'cc_num', 'merchant', create_using=nx.Graph())

    # Nodes without a merchant flag are cards -> class 2. A dict default
    # replaces the object-dtype fillna round trip, which also warns on
    # recent pandas versions.
    target = [merchant_flag.get(node, 2) for node in G.nodes]

    fig, ax = plt.subplots(figsize=(15, 8))

    # Draw on the axes created above explicitly rather than relying on the
    # implicit "current axes".
    nx.draw(G, ax=ax, with_labels=True, node_color=pd.Categorical(target).codes)

In [19]:
# Restrict the balanced sample to a single card and draw its merchant neighbourhood.
# Other card numbers that were tried: 4477156602511939689, 501818133297
sample_data = df_downsampled.loc[df_downsampled['cc_num'] == 340187018810220]

plot_graph(sample_data)

## Other Variables
### Graph-Related Variables

In [10]:
# One row per node: the node id (column `index`) and its degree.
nodes_info = (
    pd.Series(dict(nx.degree(G_down)), name='degree')
    .rename_axis('index')
    .reset_index()
)
nodes_info

In [11]:
# Node-level graph metrics to compute, keyed by the output column name.
nodes_info_dict = {
#   'closeness_centrality': nx.closeness_centrality,
    'eigenvector_centrality': nx.eigenvector_centrality_numpy,
    'pagerank': nx.pagerank,
    'degree_centrality': nx.degree_centrality,
}

columns_with_node_infos = ['degree', *nodes_info_dict]

# Start from the degree table (node id in column `index`) ...
nodes_info = (
    pd.Series(dict(nx.degree(G_down)), name='degree')
    .rename_axis('index')
    .reset_index()
)

# ... and join each additional metric onto it by node id.
for info, fun in nodes_info_dict.items():
    temp = (
        pd.Series(fun(G_down), name=info)
        .rename_axis('index')
        .reset_index()
    )
    nodes_info = nodes_info.merge(temp, on='index')

# Name the key after the card-node column of the transaction table so the two
# can be joined directly.
nodes_info = nodes_info.rename(columns={'index': 'in_node'})
nodes_info.head(3)


In [28]:
# Attach the graph features of each transaction's card node, then drop the join key.
# The cell overwrites its own input and removes `in_node`, so an unguarded
# re-run would fail with a KeyError on the merge. The guard makes a repeated
# run a no-op; rebuild `ti_g_data` first if the features need refreshing.
if 'in_node' in ti_g_data.columns:
    ti_g_data = (
        ti_g_data
        .merge(nodes_info, on='in_node', how='left')
        .drop(columns='in_node')
    )
ti_g_data.head(3)

### Non-Graph Variables (Distance, Category)

In [27]:
from math import sin, cos, sqrt, atan2, radians

def calculate_distance(lat1, lat2, lon1, lon2):
    """Return the great-circle (haversine) distance between two points.

    Note the argument order: both latitudes come first, then both longitudes.

    Parameters
    ----------
    lat1, lat2 : float
        Latitudes of the first and second point, in degrees.
    lon1, lon2 : float
        Longitudes of the first and second point, in degrees.

    Returns
    -------
    float
        Distance on a sphere of radius 6373.0 (approximate Earth radius in
        kilometres), i.e. the result is in kilometres.
    """
    R = 6373.0
    lat1 = radians(lat1)
    lon1 = radians(lon1)
    lat2 = radians(lat2)
    lon2 = radians(lon2)
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    # Mathematically 0 <= a <= 1, but rounding can push it a hair above 1 for
    # (near-)antipodal points, which would make sqrt(1 - a) raise a domain error.
    a = min(1.0, max(0.0, a))
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    distance = R * c
    return(distance)

# Quick sanity check on one pair of coordinates.
calculate_distance(35.9946,36.430124,-81.7266,-81.179483)

In [33]:
# Distance between the cardholder's location and the merchant's location,
# computed row by row with calculate_distance (argument order: lat, lat, lon, lon).
ti_g_data['distance'] = [
    calculate_distance(lat, merch_lat, lon, merch_lon)
    for lat, merch_lat, lon, merch_lon in zip(ti_g_data['lat'],
                                              ti_g_data['merch_lat'],
                                              ti_g_data['long'],
                                              ti_g_data['merch_long'])
]
ti_g_data['distance'].hist()

In [36]:
# Frequency of each value in the `category` column (the column is not used in
# the model below; see the commented-out entry in `selected_variables`).
ti_g_data.category.value_counts()

## Data Split and Model Development

In [63]:
# Columns kept for modelling: the raw amount, the card-node graph features, the
# distance feature and the target. `category` is deliberately left out for now.
selected_variables = [
    'amt',
    'degree',
    'eigenvector_centrality',
    'pagerank',
    'degree_centrality',
    'distance',
    'is_fraud',
]  # 'category',
final_dataset = ti_g_data[selected_variables]

In [78]:
from sklearn.model_selection import train_test_split # train-test split
from sklearn.metrics import confusion_matrix, classification_report # classification metrics
from sklearn.model_selection import GridSearchCV # grid search cross validation
from sklearn.model_selection import RandomizedSearchCV # randomized search cross validation
from sklearn.ensemble import AdaBoostClassifier # Adaptive Boosting Classifier
from sklearn.ensemble import BaggingClassifier # Bootstrap Aggregating Classifier
from sklearn.tree import DecisionTreeClassifier # Decision Tree

from sklearn.ensemble import RandomForestClassifier # Random Forest
from sklearn.neighbors import KNeighborsClassifier # K-Nearest Neighbors

# Separate the target from the predictors.
# (`category` is not among the selected columns, so no categorical cast is needed.)
y = final_dataset['is_fraud']
X = final_dataset.drop(columns=['is_fraud'])

# Hold out 30% of the rows for evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
)

### XGBoost

In [75]:
from xgboost import XGBClassifier

# Use the class imported above directly: the original `xgb.XGBClassifier`
# raised a NameError because no `xgb` alias was defined in this cell.
# The evaluation metric is set on the estimator rather than passed to fit();
# recent xgboost releases no longer accept `eval_metric` as a fit() argument.
xg_cl = XGBClassifier(objective='binary:logistic', n_estimators=100, seed=123,
                      eval_metric=["error"])

# Track the metric on both splits while boosting (monitoring only - no early stopping).
eval_set = [(X_train, y_train), (X_test, y_test)]

# Fit the classifier to the training set
xg_cl.fit(X_train, y_train, eval_set=eval_set, verbose=True)
results = xg_cl.evals_result()

# Predict the labels of the test set: preds
predictions = xg_cl.predict(X_test)

# Compute the accuracy: accuracy
accuracy = float(np.sum(predictions == y_test))/y_test.shape[0]
print("accuracy: %f" % (accuracy*100))

print(classification_report(y_test,predictions))



In [81]:
# Feature importances of the fitted booster. Requires the xgboost module to be
# importable under the alias `xgb` (import cell at the top of the notebook).
xgb.plot_importance(xg_cl)

### KNN

In [69]:
# Randomized search over the number of neighbours.
# random_state pins which candidates are drawn, so the chosen model - and
# every number reported from it - is the same on each run, consistent with the
# seeds used for resampling and the train/test split.
# NOTE(review): this duplicates the search in the next cell and `clf_pred` is
# not used by any later cell shown here.
param_grid = {'n_neighbors': range(1,20)}
clf = RandomizedSearchCV(KNeighborsClassifier(), param_grid, random_state=42)
clf.fit(X_train,y_train)
clf_pred = clf.predict(X_test)

In [70]:
### Determining the number of neighbors using RandomizedSearchCV
# random_state pins which candidates are drawn, so `best_params_` and the
# report below are reproducible across runs.
param_grid = {'n_neighbors': range(1,20)}
knn = RandomizedSearchCV(KNeighborsClassifier(), param_grid, verbose=3, random_state=42)
knn.fit(X_train,y_train)

In [71]:
# Parameter setting selected by the randomized search above.
knn.best_params_ # best parameter

In [72]:
# Score the tuned KNN on the held-out rows and print the confusion matrix
# followed by the per-class report, separated by blank lines.
knn_pred = knn.predict(X_test)

print(
    confusion_matrix(y_test, knn_pred),
    '\n',
    classification_report(y_test, knn_pred),
    sep='\n',
)

### Random Forest

In [73]:
# Random forest baseline. random_state makes the bootstrap samples and feature
# draws - and therefore the reported metrics - reproducible, matching the
# seeds used for resampling and the train/test split.
rfc = RandomForestClassifier(n_estimators=200, random_state=42)
rfc.fit(X_train,y_train)
rfc_pred = rfc.predict(X_test)

print(confusion_matrix(y_test,rfc_pred))
print('\n')
print(classification_report(y_test,rfc_pred))

### AdaBoost

In [79]:
# AdaBoost over depth-1 decision trees (stumps). random_state keeps the fitted
# ensemble and the reported metrics reproducible across runs.
adabc = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),n_estimators=200, random_state=42)
adabc.fit(X_train,y_train)
adabc_pred = adabc.predict(X_test)

print(confusion_matrix(y_test,adabc_pred))
print('\n')
print(classification_report(y_test,adabc_pred))

### Bagging

In [80]:
# Bagged, fully grown decision trees. random_state fixes the bootstrap draws so
# the reported metrics are reproducible across runs.
bgc = BaggingClassifier(DecisionTreeClassifier(),n_estimators=200, random_state=42)
bgc.fit(X_train,y_train)
bgc_pred = bgc.predict(X_test)

print(confusion_matrix(y_test,bgc_pred))
print('\n')
print(classification_report(y_test,bgc_pred))