In [1]:
import os
import math
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline



## Data Exploration

In [2]:
# Load the raw training transactions from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/fraud-detection/fraudTrain.csv', sep=',')

# Report the table size and the share of rows flagged as fraud.
nRow, nCol = df.shape
print(f'There are {nRow} rows and {nCol} columns')
print('Event Rate:', df['is_fraud'].mean())

# Preview the first three rows.
display(df.head(3))

**The event rate is about 0.5%, so the dataset is highly imbalanced. To undersample, we randomly pick 20% of the non-event observations and keep all event observations.**

In [3]:
# Undersample the majority class: keep a reproducible 20% sample of the
# non-fraud rows together with every fraud row.
# pd.concat is used instead of DataFrame.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0. The original row index is preserved.
train_data = pd.concat([
    df[df.is_fraud == 0].sample(frac=0.2, random_state=2),
    df[df.is_fraud == 1],
])
print('Event Rate:', np.mean(train_data.is_fraud))
print('Event Rate Distribution:\n', train_data.is_fraud.value_counts())


### Prepare Graph Data

In [4]:
def build_graph_bipartite(df_input, graph_type=None):
    """Build a card <-> merchant graph from a transactions frame.

    Every distinct ``cc_num`` and every distinct ``merchant`` becomes one
    integer-labelled node. All transactions between the same card and merchant
    are collapsed into a single edge.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Must provide the columns ``cc_num``, ``merchant``, ``amt`` and
        ``is_fraud``. The frame is not modified.
    graph_type : networkx graph instance, optional
        Graph object to populate (e.g. ``nx.Graph()`` or ``nx.DiGraph()``).
        It is handed to ``nx.from_edgelist`` as ``create_using``, so any
        previous content of the instance is discarded. When omitted, a fresh
        ``nx.Graph()`` is created for each call.

    Returns
    -------
    networkx graph
        Node attribute ``bipartite`` is 1 for card nodes and 2 for merchant
        nodes. Edge attribute ``label`` is 1 if any collapsed transaction has
        ``is_fraud > 0`` and 0 otherwise; edge attribute ``weight`` is the sum
        of ``amt`` over the collapsed transactions. Edges are inserted as
        (card, merchant), which fixes their direction for directed graphs.
    """
    # A default of `nx.Graph()` in the signature would be evaluated once and
    # the same instance cleared and reused by every default call.
    if graph_type is None:
        graph_type = nx.Graph()

    card_values = df_input['cc_num'].tolist()
    merchant_values = df_input['merchant'].tolist()

    # dict.fromkeys de-duplicates while keeping first-seen order, so node ids
    # are the same on every run (set iteration order is not for string keys).
    mapping = {value: node_id
               for node_id, value in enumerate(dict.fromkeys(card_values + merchant_values))}

    edges = pd.DataFrame({
        'from': [mapping[value] for value in card_values],
        'to': [mapping[value] for value in merchant_values],
        'amt': df_input['amt'].to_numpy(),
        'is_fraud': df_input['is_fraud'].to_numpy(),
    })
    edges = edges.groupby(['from', 'to']).agg({'is_fraud': 'sum', 'amt': 'sum'}).reset_index()
    # An edge is fraudulent as soon as one of its transactions is.
    edges['is_fraud'] = (edges['is_fraud'] > 0).astype(int)

    edge_pairs = list(zip(edges['from'].tolist(), edges['to'].tolist()))
    G = nx.from_edgelist(edge_pairs, create_using=graph_type)

    nx.set_node_attributes(G, dict.fromkeys(edges['from'].unique().tolist(), 1), 'bipartite')
    nx.set_node_attributes(G, dict.fromkeys(edges['to'].unique().tolist(), 2), 'bipartite')

    nx.set_edge_attributes(G, dict(zip(edge_pairs, edges['is_fraud'].tolist())), 'label')
    nx.set_edge_attributes(G, dict(zip(edge_pairs, edges['amt'].tolist())), 'weight')
    return G

In [5]:
def build_graph_tripartite(df_input, graph_type=None):
    """Build a card -- transaction -- merchant graph from a transactions frame.

    Every row of ``df_input`` becomes a transaction node (identified by its
    index label) that is linked to the node of its ``cc_num`` and to the node
    of its ``merchant``.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Must provide the columns ``cc_num``, ``merchant``, ``amt`` and
        ``is_fraud``. The frame is not modified.
    graph_type : networkx graph instance, optional
        Graph object to populate (e.g. ``nx.Graph()`` or ``nx.DiGraph()``).
        It is handed to ``nx.from_edgelist`` as ``create_using``, so any
        previous content of the instance is discarded. When omitted, a fresh
        ``nx.Graph()`` is created for each call.

    Returns
    -------
    networkx graph
        Node attribute ``bipartite`` is 1 for card nodes, 2 for merchant nodes
        and 3 for transaction nodes. Both edges of a transaction carry its
        ``is_fraud`` value as ``label`` and its ``amt`` as ``weight``. Edges
        are inserted as (card, transaction) and (merchant, transaction), which
        fixes their direction for directed graphs.
    """
    # A default of `nx.Graph()` in the signature would be evaluated once and
    # the same instance cleared and reused by every default call.
    if graph_type is None:
        graph_type = nx.Graph()

    # Tag every key with its role so that an index label which happens to
    # equal a card number (both can be integers) does not collapse a
    # transaction node and a card node into one.
    txn_keys = [('txn', label) for label in df_input.index.tolist()]
    card_keys = [('cc_num', value) for value in df_input['cc_num'].tolist()]
    merchant_keys = [('merchant', value) for value in df_input['merchant'].tolist()]

    # dict.fromkeys de-duplicates while keeping first-seen order, so node ids
    # are the same on every run.
    mapping = {key: node_id
               for node_id, key in enumerate(dict.fromkeys(txn_keys + card_keys + merchant_keys))}

    txn_nodes = [mapping[key] for key in txn_keys]
    card_nodes = [mapping[key] for key in card_keys]
    merchant_nodes = [mapping[key] for key in merchant_keys]

    card_edges = list(zip(card_nodes, txn_nodes))
    merchant_edges = list(zip(merchant_nodes, txn_nodes))
    all_edges = card_edges + merchant_edges

    G = nx.from_edgelist(all_edges, create_using=graph_type)

    nx.set_node_attributes(G, dict.fromkeys(card_nodes, 1), 'bipartite')
    nx.set_node_attributes(G, dict.fromkeys(merchant_nodes, 2), 'bipartite')
    nx.set_node_attributes(G, dict.fromkeys(txn_nodes, 3), 'bipartite')

    # The per-row values are repeated once for the card-side edge and once for
    # the merchant-side edge, matching the order of `all_edges`.
    labels = df_input['is_fraud'].tolist()
    amounts = df_input['amt'].tolist()
    nx.set_edge_attributes(G, dict(zip(all_edges, labels + labels)), 'label')
    nx.set_edge_attributes(G, dict(zip(all_edges, amounts + amounts)), 'weight')

    return G

In [6]:
# Build the four graph variants from the undersampled transactions.
G_bu = build_graph_bipartite(train_data, nx.Graph())
G_bd = build_graph_bipartite(train_data, nx.DiGraph())
G_tu = build_graph_tripartite(train_data, nx.Graph())
G_td = build_graph_tripartite(train_data, nx.DiGraph())

# Name the graphs after construction, as plain strings. The builders pass the
# instance to nx.from_edgelist(create_using=...), which clears it first, so a
# name given to the constructor does not survive.
G_bu.name = 'Bipartite Undirected'
G_bd.name = 'Bipartite Directed'
G_tu.name = 'Tripartite Undirected'
G_td.name = 'Tripartite Directed'

In [7]:
from networkx.algorithms import bipartite

# Sanity check: confirm that the undirected card/merchant graph is bipartite.
bu_is_bipartite = bipartite.is_bipartite(G_bu)
bu_is_bipartite

In [8]:
# nx.info was deprecated in NetworkX 2.7 and removed in 3.0, so the summary is
# built explicitly and the cell runs on any NetworkX version.
print(f'{type(G_bu).__name__} with {G_bu.number_of_nodes()} nodes and {G_bu.number_of_edges()} edges')
print('==========================')
print(f'{type(G_tu).__name__} with {G_tu.number_of_nodes()} nodes and {G_tu.number_of_edges()} edges')

**Node Degree Distribution**

From the plots below we can see that the bipartite graph has a more varied degree distribution, with a peak around 300, while the tripartite graph peaks at degree 2: every transaction node has exactly two edges, one to its card and one to its merchant.

In [10]:
# One histogram of node degrees per graph. Each figure is titled so the
# bipartite and tripartite plots can be told apart.
for graph_label, G in [('Bipartite', G_bu), ('Tripartite', G_tu)]:
    fig, ax = plt.subplots(figsize=(10, 10))
    # The degree view iterates as (node, degree) pairs, so dict() is enough.
    degrees = pd.Series(dict(nx.degree(G)))
    degrees.plot.hist(ax=ax)
    ax.set_yscale("log")
    ax.set(title=f'{graph_label} graph: node degree distribution',
           xlabel='Node degree',
           ylabel='Number of nodes (log scale)')

**Edge Weight Distribution**

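In the plots below, the bipartite edge-weight distribution is slightly shifted to the right compared with the tripartite one: a bipartite edge carries the summed amount of all transactions between a card and a merchant, whereas in the tripartite graph, where the transaction nodes are explicit, each edge carries the amount of a single transaction.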

In [11]:
# One histogram of edge weights per graph, restricted to weights below the
# 90th percentile so the long tail does not flatten the plot.
for graph_label, G in [('Bipartite', G_bu), ('Tripartite', G_tu)]:
    allEdgesWeights = pd.Series({
        (u, v): attrs['weight'] for u, v, attrs in G.edges(data=True)
    })
    quant_dist = np.quantile(allEdgesWeights.values, [0.1, 0.5, 0.7, 0.9])
    # quant_dist[-1] is the 0.9 quantile; filter the existing series instead
    # of walking the edge list a second time.
    allEdgesWeightsFiltered = allEdgesWeights[allEdgesWeights < quant_dist[-1]]
    fig, ax = plt.subplots(figsize=(10, 10))
    allEdgesWeightsFiltered.plot.hist(bins=40, ax=ax)
    ax.set_yscale('log')
    ax.set(title=f'{graph_label} graph: edge weight distribution (below 90th percentile)',
           xlabel='Edge weight',
           ylabel='Number of edges (log scale)')

**Degree Centrality**

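Degree Centrality is defined as the number of links incident upon a node (i.e., the number of ties that a node has). NetworkX reports it normalized by the maximum possible degree, $n - 1$, where $n$ is the number of nodes in the graph.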


In [23]:
# One histogram of degree centrality per graph. Each figure is titled so the
# bipartite and tripartite plots can be told apart.
for graph_label, G in [('Bipartite', G_bu), ('Tripartite', G_tu)]:
    fig, ax = plt.subplots(figsize=(10, 10))
    # nx.degree_centrality already returns a {node: value} dict.
    degree_centrality = pd.Series(nx.degree_centrality(G))
    degree_centrality.plot.hist(ax=ax)
    ax.set_yscale("log")
    ax.set(title=f'{graph_label} graph: degree centrality distribution',
           xlabel='Degree centrality',
           ylabel='Number of nodes (log scale)')

**Betweenness Centrality**

It measures the number of shortest paths that pass through a given node, providing an intuition about how central that node is for message passing within the network.

In [None]:
# Exact betweenness runs one shortest-path search per node, which is
# impractical on the tripartite graph (one node per transaction). Estimate it
# from a fixed number of randomly chosen pivot nodes instead; raise the value
# for a tighter estimate at a proportionally higher cost.
BETWEENNESS_PIVOTS = 100

for graph_label, G in [('Bipartite', G_bu), ('Tripartite', G_tu)]:
    fig, ax = plt.subplots(figsize=(10, 10))
    # k may not exceed the number of nodes; the seed keeps the pivot choice
    # (and therefore the plot) reproducible.
    pivot_count = min(BETWEENNESS_PIVOTS, G.number_of_nodes())
    betweenness_centrality = pd.Series(
        nx.betweenness_centrality(G, k=pivot_count, seed=2)
    )
    betweenness_centrality.plot.hist(ax=ax)
    ax.set_yscale("log")
    ax.set(title=f'{graph_label} graph: betweenness centrality distribution '
                 f'(estimated from {pivot_count} pivots)',
           xlabel='Betweenness centrality',
           ylabel='Number of nodes (log scale)')

**Closeness Centrality**

It is a way of detecting nodes that are able to spread information very efficiently through a graph. The closeness centrality of a node is the inverse of its average shortest-path distance (its farness) to all other nodes. Nodes with a high closeness score have the shortest distances to all other nodes. 

In [None]:
# Closeness for all nodes runs one shortest-path search per node, which is
# impractical on the tripartite graph (one node per transaction). The
# histogram is therefore drawn from a reproducible random sample of nodes;
# raise the sample size for a smoother picture at a proportionally higher cost.
CLOSENESS_SAMPLE_SIZE = 200
closeness_rng = np.random.default_rng(2)

for graph_label, G in [('Bipartite', G_bu), ('Tripartite', G_tu)]:
    fig, ax = plt.subplots(figsize=(10, 10))
    node_list = list(G.nodes)
    sample_size = min(CLOSENESS_SAMPLE_SIZE, len(node_list))
    # Sample positions rather than node labels so the labels keep their type.
    sampled_positions = closeness_rng.choice(len(node_list), size=sample_size, replace=False)
    closeness_centrality = pd.Series({
        node_list[position]: nx.closeness_centrality(G, u=node_list[position])
        for position in sampled_positions
    })
    closeness_centrality.plot.hist(ax=ax)
    ax.set_yscale("log")
    ax.set(title=f'{graph_label} graph: closeness centrality distribution '
                 f'(sample of {sample_size} nodes)',
           xlabel='Closeness centrality',
           ylabel='Number of sampled nodes (log scale)')