In [1]:
import networkx as nx
import plotly.graph_objects as go
from google.colab import drive
import pandas as pd
import numpy as np

In [2]:
# Mount Google Drive at /content/drive; the CSV paths read in the next cell live under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [30]:
# Buyer-seller pairs of the fraud transactions, produced in the SQL stage.
fraud_transaction = pd.read_csv(
    '/content/drive/MyDrive/MSIB Bitlabs Data Analytics for Business/PBL/Task 6/fraud_transaction.csv'
)

# Cleaned transaction table; both datetime columns are parsed while reading.
transaction = pd.read_csv(
    '/content/drive/MyDrive/MSIB Bitlabs Data Analytics for Business/PBL/data_cleaning/transaction.csv',
    parse_dates=['transaction_created_datetime', 'transaction_updated_datetime'],
)

# Cleaned user table, later matched against graph nodes through `company_id`.
user = pd.read_csv(
    '/content/drive/MyDrive/MSIB Bitlabs Data Analytics for Business/PBL/data_cleaning/user.csv'
)

### Membuat Graf

In [79]:
# Edge list: one (buyer_id, seller_id) row per fraud transaction.
edge_list = fraud_transaction.loc[:, ['buyer_id', 'seller_id']]

# Undirected graph in which every buyer-seller pair becomes an edge.
G = nx.from_pandas_edgelist(edge_list, 'buyer_id', 'seller_id')

In [80]:
# 1. Collect every connected component of G (each one is a set of nodes).
connected_components = list(nx.connected_components(G))

# 2. Pair the induced subgraph of each component with its node count.
subgraphs_info = [
    (G.subgraph(component), len(component))
    for component in connected_components
]

# 3. Report the size of every component, numbered from 1.
for idx, (subgraph, node_count) in enumerate(subgraphs_info):
    print(f"Subgraf {idx + 1}: {node_count} node(s)")

Subgraf 1: 552 node(s)
Subgraf 2: 2 node(s)
Subgraf 3: 3 node(s)
Subgraf 4: 2 node(s)
Subgraf 5: 2 node(s)
Subgraf 6: 2 node(s)
Subgraf 7: 2 node(s)
Subgraf 8: 2 node(s)
Subgraf 9: 5 node(s)
Subgraf 10: 2 node(s)
Subgraf 11: 2 node(s)
Subgraf 12: 4 node(s)
Subgraf 13: 2 node(s)
Subgraf 14: 2 node(s)
Subgraf 15: 4 node(s)
Subgraf 16: 2 node(s)
Subgraf 17: 2 node(s)
Subgraf 18: 2 node(s)
Subgraf 19: 2 node(s)
Subgraf 20: 4 node(s)
Subgraf 21: 2 node(s)
Subgraf 22: 2 node(s)
Subgraf 23: 2 node(s)
Subgraf 24: 12 node(s)
Subgraf 25: 4 node(s)
Subgraf 26: 3 node(s)
Subgraf 27: 2 node(s)
Subgraf 28: 2 node(s)
Subgraf 29: 2 node(s)
Subgraf 30: 2 node(s)
Subgraf 31: 2 node(s)
Subgraf 32: 6 node(s)
Subgraf 33: 2 node(s)
Subgraf 34: 2 node(s)
Subgraf 35: 2 node(s)
Subgraf 36: 2 node(s)
Subgraf 37: 3 node(s)
Subgraf 38: 5 node(s)
Subgraf 39: 2 node(s)
Subgraf 40: 2 node(s)
Subgraf 41: 2 node(s)
Subgraf 42: 2 node(s)
Subgraf 43: 3 node(s)
Subgraf 44: 2 node(s)
Subgraf 45: 3 node(s)
Subgraf 46: 2 no

### Membagi graf sebelumnya menjadi dua: graf utama (komponen terbesar) dan kumpulan graf-graf kecil

In [81]:
# Largest connected component of G, as a set of nodes.
largest_component = max(nx.connected_components(G), key=len)
# Induced subgraph on that component.
G_largest = G.subgraph(largest_component)
# Induced subgraph on every node that is NOT part of the largest component.
G_outside_largest = G.subgraph(set(G.nodes).difference(largest_component))


In [82]:
# Total number of nodes in the full graph.
len(G)

1089

In [83]:
# Number of nodes in the largest connected component.
len(G_largest)

552

In [84]:
# Number of nodes that fall outside the largest connected component.
len(G_outside_largest)

537

### Cek nodes mencurigakan secara keseluruhan (graf G)

In [63]:
# One-column frame holding every node of G, so it can be joined on `company_id`.
graph_nodes_df = pd.DataFrame({'company_id': list(G.nodes())})
# Inner join keeps only the user rows whose company_id appears as a graph node.
nodes_in_user = pd.merge(user, graph_nodes_df, on='company_id', how='inner')
len(nodes_in_user)

771

In [64]:
# Graph nodes that found no matching row in the user table.
len(G) - len(nodes_in_user)

318

318 nodes atau users tidak terdaftar di data user

### Cek nodes mencurigakan di G_outside_largest

In [85]:
# Nodes of G_outside_largest that are registered in the user table.
graph_G_outside_largest_df = pd.DataFrame({'company_id': list(G_outside_largest.nodes())})
G_outside_largest_nodes_in_user = pd.merge(
    user,
    graph_G_outside_largest_df,
    on='company_id',
    how='inner',
)
len(G_outside_largest_nodes_in_user)

244

In [86]:
# Nodes of G_outside_largest that found no matching row in the user table.
len(G_outside_largest) - len(G_outside_largest_nodes_in_user)

293

In [88]:
# Share of all unregistered nodes (graph nodes with no matching row in `user`)
# that sit outside the largest component. Both counts are derived from the
# graphs and merge results instead of being typed in by hand, so the ratio
# cannot go stale when the input data changes.
unregistered_outside_largest = (
    G_outside_largest.number_of_nodes() - G_outside_largest_nodes_in_user.shape[0]
)
unregistered_total = G.number_of_nodes() - nodes_in_user.shape[0]
unregistered_outside_largest / unregistered_total

0.9213836477987422

Dari total 318 nodes yang tidak terdaftar di data user, 293 di antaranya berada di G_outside_largest (lebih dari 90% akun ilegal tidak terhubung ke jaringan G_largest)

In [87]:
# Fraction of G_outside_largest nodes that have no matching row in the user table.
1 - len(G_outside_largest_nodes_in_user) / len(G_outside_largest)

0.5456238361266295

Sekitar 54,6% nodes (293 dari 537) di dalam G_outside_largest merupakan akun ilegal

### Menemukan central node pada graf utama

In [69]:
# Degree centrality of every node in G_largest, then the five highest scores.
degree_centrality = nx.degree_centrality(G_largest)
sorted_degree_centrality = sorted(
    degree_centrality.items(),
    key=lambda node_score: node_score[1],
    reverse=True,
)[:5]
sorted_degree_centrality

[('5d2233f5a1a6435891142442fac09a77809d0c16496f07b2575c3a451fbd7f60',
  0.9274047186932849),
 ('053819be0c911b698bbd253728ba9b9a85779f2d9a5a39503b29f9f01cd0aa0e',
  0.009074410163339382),
 ('ed654f3687ca1ef5da46e0d2988c906c02a740fa1fb1bf68f0d3725ac35e2201',
  0.009074410163339382),
 ('8764fbf59745f1235c38abb5c858dcd2a4b4b511f725c251932baedbf63d0ef2',
  0.007259528130671506),
 ('b13aa5ba4ea9bb2443cac2caa92fb8d6b3a86f3d3b35b5da314b7538934f1f4d',
  0.0054446460980036296)]

In [70]:
# Node of G_largest with the highest degree centrality (first one wins ties).
central_node_dc = max(degree_centrality.items(), key=lambda node_score: node_score[1])[0]
central_node_dc

'5d2233f5a1a6435891142442fac09a77809d0c16496f07b2575c3a451fbd7f60'

In [71]:
# Eigenvector centrality of every node in G_largest, then the five highest scores.
eigenvector_centrality = nx.eigenvector_centrality(G_largest, max_iter=1000, tol=1e-6)
sorted_eigenvector_centrality = sorted(
    eigenvector_centrality.items(),
    key=lambda node_score: node_score[1],
    reverse=True,
)[:5]
sorted_eigenvector_centrality

[('5d2233f5a1a6435891142442fac09a77809d0c16496f07b2575c3a451fbd7f60',
  0.7068836526329391),
 ('053819be0c911b698bbd253728ba9b9a85779f2d9a5a39503b29f9f01cd0aa0e',
  0.032891687939180916),
 ('b13aa5ba4ea9bb2443cac2caa92fb8d6b3a86f3d3b35b5da314b7538934f1f4d',
  0.03283272059167899),
 ('6c3f436adfc4cfefb8865abc4902497c6f45d4104a9771a55b4ad81955984c62',
  0.03283272059167899),
 ('ef958867c89944a43d26df0ada8ff693ff1b7142cbafe0bfb3e53cb27ecbe217',
  0.03270696421960359)]

In [72]:
# Node of G_largest with the highest eigenvector centrality.
# The key must come from `eigenvector_centrality` itself; ranking its keys with
# `degree_centrality.get` would silently return the degree-central node instead.
central_node_ec = max(eigenvector_centrality, key=eigenvector_centrality.get)
central_node_ec

'5d2233f5a1a6435891142442fac09a77809d0c16496f07b2575c3a451fbd7f60'

In [73]:
# Wrap the central node id in a one-row frame and look it up in the user table.
central_node_G_largest = pd.DataFrame([central_node_ec], columns=['company_id'])
pd.merge(user, central_node_G_largest, on='company_id', how='inner')

Unnamed: 0,company_id,company_kyc_status_name,company_kyb_status_name,company_type_group,company_phone_verified_flag,company_email_verified_flag,user_fraud_flag,testing_account_flag,blacklist_account_flag,package_active_name,company_registered_datetime
0,5d2233f5a1a6435891142442fac09a77809d0c16496f07...,BELUM_VALIDASI,BELUM_VALIDASI,PT,0.0,0.0,0.0,0.0,0.0,FREE,2021-05-11 18:54:05


central node pada subgraph utama G_largest tidak ditandai fraud atau blacklist

### Visualisasi Jaringan Sosial User yang terlibat transaksi fraud

In [74]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Build the Plotly traces (edges + nodes) used to draw one graph
def create_graph_traces(G, title, layout_seed=42):
    """Return the Plotly traces that draw ``G`` with a spring layout.

    Args:
        G: NetworkX graph to draw.
        title: Not used by this function; kept so existing callers that pass
            it keep working. Figure titles are set by the caller.
        layout_seed: Seed forwarded to ``nx.spring_layout`` so node positions
            are reproducible between runs.

    Returns:
        ``[edge_trace, node_trace]``: a line trace holding every edge and a
        marker trace holding every node. For an empty graph both traces are
        empty instead of the function raising ``ValueError``.
    """
    pos = nx.spring_layout(G, seed=layout_seed)  # node -> (x, y)

    # Edge coordinates: each edge contributes (start, end, None); the None
    # breaks the line so consecutive edges are not joined together.
    edge_x = []
    edge_y = []
    for source, target in G.edges():
        x0, y0 = pos[source]
        x1, y1 = pos[target]
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])

    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=0.5, color='#888'),
        hoverinfo='none',
        mode='lines'
    )

    # Number of distinct neighbours per node, computed once and reused for the
    # central-node search, the hover text and the colour.
    neighbor_counts = {node: len(G.adj[node]) for node in G.nodes()}

    # Node with the most neighbours (the first one wins ties). `default=None`
    # keeps an empty graph from raising ValueError.
    central_node = max(neighbor_counts, key=neighbor_counts.get, default=None)

    node_x = []
    node_y = []
    node_text = []
    node_sizes = []
    node_colors = []

    for node, degree in neighbor_counts.items():
        x, y = pos[node]
        node_x.append(x)
        node_y.append(y)
        node_text.append(f"Node: {node}<br>Connections: {degree}")

        # Every marker has the same size; the central node is highlighted by
        # colour only ('red'), all other nodes are coloured by their degree.
        node_sizes.append(10)
        node_colors.append('red' if node == central_node else degree)

    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode='markers',
        hoverinfo='text',
        marker=dict(
            showscale=True,
            colorscale='Viridis',
            size=node_sizes,
            color=node_colors,
            colorbar=dict(
                thickness=15,
                # `title=dict(text=..., side=...)` replaces the deprecated
                # `titleside` attribute, which newer Plotly releases reject.
                title=dict(text='Node Degree', side='right'),
                xanchor='left'
            ),
            line_width=2
        ),
        text=node_text
    )

    return [edge_trace, node_trace]

In [75]:
# Build the edge/node traces once per graph; the figures below reuse them.
traces_largest = create_graph_traces(G_largest, title="Largest Subgraph")
traces_outside = create_graph_traces(G_outside_largest, title="Outside Largest Subgraph")

In [76]:
# Create the figure for the largest subgraph
fig_largest = go.Figure()

# Add the edge and node traces of the largest subgraph
for trace in traces_largest:
    fig_largest.add_trace(trace)

# Update layout for the largest subgraph.
# `title_font_size` (layout.title.font.size) replaces the deprecated
# `titlefont_size`, which newer Plotly releases no longer accept.
fig_largest.update_layout(
    title_text="Visualization of Largest Subgraph",
    title_font_size=16,
    showlegend=False,
    hovermode="closest",
    margin=dict(b=0, l=0, r=0, t=40),
    xaxis=dict(showgrid=False, zeroline=False),
    yaxis=dict(showgrid=False, zeroline=False),
    height=800  # Taller canvas for better visibility
)

# Show the figure for the largest subgraph
fig_largest.show()


In [26]:
# Create the figure for the nodes outside the largest subgraph
fig_outside = go.Figure()

# Add the edge and node traces of the outside-largest subgraph
for trace in traces_outside:
    fig_outside.add_trace(trace)

# Update layout for the outside-largest subgraph.
# `title_font_size` (layout.title.font.size) replaces the deprecated
# `titlefont_size`, which newer Plotly releases no longer accept.
fig_outside.update_layout(
    title_text="Visualization of Outside Largest Subgraph",
    title_font_size=16,
    showlegend=False,
    hovermode="closest",
    margin=dict(b=0, l=0, r=0, t=40),
    xaxis=dict(showgrid=False, zeroline=False),
    yaxis=dict(showgrid=False, zeroline=False),
    height=800  # Taller canvas for better visibility
)

# Show the figure for the outside-largest subgraph
fig_outside.show()


### Analisis subgraf terbesar di G_outside_largest

In [36]:
# Largest connected component among the nodes outside the main component.
G_outside_largest_largest_component = max(
    nx.connected_components(G_outside_largest),
    key=len,
)
# Induced subgraph on that component, taken from the full graph G.
G_outside_largest_largest = G.subgraph(G_outside_largest_largest_component)
len(G_outside_largest_largest)

12

In [44]:
# Degree centrality of every node in G_outside_largest_largest, then the five
# highest scores. Note: this rebinds `degree_centrality` from the G_largest cell.
degree_centrality = nx.degree_centrality(G_outside_largest_largest)
sorted_degree_centrality = sorted(
    degree_centrality.items(),
    key=lambda node_score: node_score[1],
    reverse=True,
)[:5]
sorted_degree_centrality

[('b4c5286fbf6443dd4df37457f3ef23a2b71253b1c86452f824e8856befba3483',
  0.9090909090909092),
 ('f0c9669d6f2a1f6512c1b3e4f31db09d50247a213ebf32f021251051a0d47d04',
  0.18181818181818182),
 ('02611e2fdd7d730bddbd654baf24f03a739704bcb34c010c3cb54d3069087eea',
  0.09090909090909091),
 ('1f85162b10b0f71d2407163f6ba7ba1a62582620fb4221dbf5637c8d5a5acfb4',
  0.09090909090909091),
 ('f5df21b362aad9c806eadf1bca9ff4e9026c72c3471f1d6255554c3f9c5fadb6',
  0.09090909090909091)]

In [45]:
# Node of G_outside_largest_largest with the highest degree centrality
# (first one wins ties).
central_node_dc = max(degree_centrality.items(), key=lambda node_score: node_score[1])[0]
central_node_dc

'b4c5286fbf6443dd4df37457f3ef23a2b71253b1c86452f824e8856befba3483'

In [46]:
# Eigenvector centrality of every node in G_outside_largest_largest, then the
# five highest scores. Note: this rebinds `eigenvector_centrality` from the
# G_largest cell.
eigenvector_centrality = nx.eigenvector_centrality(
    G_outside_largest_largest,
    max_iter=1000,
    tol=1e-6,
)
sorted_eigenvector_centrality = sorted(
    eigenvector_centrality.items(),
    key=lambda node_score: node_score[1],
    reverse=True,
)[:5]
sorted_eigenvector_centrality

[('b4c5286fbf6443dd4df37457f3ef23a2b71253b1c86452f824e8856befba3483',
  0.7028838494674509),
 ('f0c9669d6f2a1f6512c1b3e4f31db09d50247a213ebf32f021251051a0d47d04',
  0.24532833353348035),
 ('1f85162b10b0f71d2407163f6ba7ba1a62582620fb4221dbf5637c8d5a5acfb4',
  0.22106185062986744),
 ('f5df21b362aad9c806eadf1bca9ff4e9026c72c3471f1d6255554c3f9c5fadb6',
  0.22106185062986744),
 ('08468244a08a0b83bb64fe767ea422a11bbbd1b37878f92b1d979d7149019961',
  0.22106185062986744)]

In [47]:
# Node of G_outside_largest_largest with the highest eigenvector centrality.
# The key must come from `eigenvector_centrality` itself; ranking its keys with
# `degree_centrality.get` would silently return the degree-central node instead.
central_node_ec = max(eigenvector_centrality, key=eigenvector_centrality.get)
central_node_ec

'b4c5286fbf6443dd4df37457f3ef23a2b71253b1c86452f824e8856befba3483'

In [48]:
# Wrap the central node id in a one-row frame and look it up in the user table.
central_node_df = pd.DataFrame([central_node_ec], columns=['company_id'])
pd.merge(user, central_node_df, on='company_id', how='inner')

Unnamed: 0,company_id,company_kyc_status_name,company_kyb_status_name,company_type_group,company_phone_verified_flag,company_email_verified_flag,user_fraud_flag,testing_account_flag,blacklist_account_flag,package_active_name,company_registered_datetime
0,b4c5286fbf6443dd4df37457f3ef23a2b71253b1c86452...,VALIDASI_BERHASIL,VALIDASI_BERHASIL,PT,1.0,1.0,0.0,1.0,0.0,PAPER+ ONE YEAR,2023-01-15 21:28:04


Central node terdaftar di data user, ternyata akun testing

In [42]:
# Nodes of the largest small component, as a one-column frame keyed by `company_id`.
nodes_in_graph = list(G_outside_largest_largest.nodes())
graph_nodes_df = pd.DataFrame({'company_id': nodes_in_graph})

# Inner join keeps only the user rows whose company_id is one of those nodes.
merged_df = pd.merge(user, graph_nodes_df, on='company_id', how='inner')

In [43]:
# Display the user rows that matched a node of G_outside_largest_largest.
merged_df

Unnamed: 0,company_id,company_kyc_status_name,company_kyb_status_name,company_type_group,company_phone_verified_flag,company_email_verified_flag,user_fraud_flag,testing_account_flag,blacklist_account_flag,package_active_name,company_registered_datetime
0,b4c5286fbf6443dd4df37457f3ef23a2b71253b1c86452...,VALIDASI_BERHASIL,VALIDASI_BERHASIL,PT,1.0,1.0,0.0,1.0,0.0,PAPER+ ONE YEAR,2023-01-15 21:28:04
1,02611e2fdd7d730bddbd654baf24f03a739704bcb34c01...,VALIDASI_BERHASIL,BELUM_VALIDASI,PT,1.0,1.0,0.0,1.0,0.0,FREE,2023-01-15 21:38:22


Dari 12 node hanya dua yang terdaftar, keduanya akun testing, sementara yang lainnya (10) tidak terdaftar di data user