# Community Detection with networkx

## 1. Load the required packages

In [None]:
import itertools
import operator

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import networkx as nx
import networkx.algorithms.community as nxcom
from node2vec import Node2Vec

## 2. Import the dataset and obtain the preprocessed data

### 2.1 dataset for 2020

In [None]:
# Load every submission, then keep only the rows whose reporting period ended in 2020
df = pd.read_excel('../data/all_submission_files.xlsx')
data = df.loc[df["report_end_date"].dt.year.eq(2020)]
# Preview the first six rows of the 2020 subset
data.head(6)

Graph of the data: nodes are CIKs, and an edge connects two CIKs that invest in overlapping CUSIPs.

In [None]:
# Enumerate every unordered pair of distinct CIKs.
# `unique()` keeps the order of first appearance (it does not sort), so the
# pairs follow that order.
# NOTE(review): the pairs are drawn from the full `df`, not from the 2020-only
# `data` subset — confirm that this is intended.
all_cik_pairs = [*itertools.combinations(df['cik'].unique(), 2)]

In [None]:
# Inspect all rows that belong to the first CIK of the first pair
df.loc[df['cik'] == all_cik_pairs[0][0]]

In [None]:
# For each CIK pair, count the rows of the second CIK whose cusip is also
# present among the rows of the first CIK.
# Creates a list [[cik1, cik2], nr_of_overlapping_cusips]
#
# The cusips of each CIK are extracted once up front; the previous version
# re-filtered the whole frame twice for every pair, repeating the same work
# for each CIK once per partner.
# NOTE(review): this counts rows, so a cusip that occurs in several rows of the
# second CIK is counted several times — confirm that this is intended.
cusips_by_cik = {cik: cusips for cik, cusips in df.groupby('cik', sort=False)['cusip']}
# Fallback for a CIK that has no group (groupby drops NaN keys): an empty
# selection, which yields an overlap of 0 just like the boolean filter did.
no_cusips = df['cusip'].iloc[:0]

nodes_and_edges = []
for cik_pair in all_cik_pairs:
    cusips1 = cusips_by_cik.get(cik_pair[0], no_cusips)
    cusips2 = cusips_by_cik.get(cik_pair[1], no_cusips)
    nr_of_shared_cusips = int(cusips2.isin(cusips1).sum())
    nodes_and_edges.append([cik_pair, nr_of_shared_cusips])

In [None]:
# Total number of CIK pairs that were evaluated (one entry per pair)
len(nodes_and_edges)

In [None]:
# The number of CIK pairs (not nodes) that share at least one cusip
sum(1 for _pair, shared in nodes_and_edges if shared > 0)

In [None]:
# DEBUG ONLY: take only the first 200 edges
# nodes_and_edges = nodes_and_edges[:200]

In [None]:
# Build a network of CIKs.
# Two CIKs are linked when they have more than 50 overlapping cusips; the edge
# weight is that overlap count. The minimum keeps the plot readable: pairs at
# or below it are skipped, and CIKs without any such link are never added as
# nodes because they would make the graph impossible to understand.
G = nx.Graph()
for (cik1, cik2), nr_of_shared_cusips in nodes_and_edges:
    if nr_of_shared_cusips <= 50:
        continue
    G.add_edge(cik1, cik2, weight=nr_of_shared_cusips)

In [None]:
# Lay out the graph and draw its edges and nodes on one large figure.
# NOTE(review): `edge_width` holds one value per CIK pair in `nodes_and_edges`
# (not per edge of G) and is not passed to the draw calls, which receive
# width=None — confirm whether weighted edge widths were intended.
pos = nx.kamada_kawai_layout(G)
node_size = None
edge_width = [weight for _pair, weight in nodes_and_edges]

plt.figure(figsize=(20, 20))
# Visualize graph components
nx.draw_networkx_edges(G, pos, width=None, edge_color="m", alpha=0.3)
nx.draw_networkx_nodes(G, pos, node_size=node_size, node_color="#210070", alpha=0.9)
# Box style for the optional node labels (label drawing is currently disabled)
label_options = dict(ec="k", fc="white", alpha=0.7)
# _ = nx.draw_networkx_labels(G, pos, font_size=14, bbox=label_options)

In [None]:
# Detect communities by greedy modularity maximisation, largest community first
communities = list(nxcom.greedy_modularity_communities(G))
communities.sort(key=len, reverse=True)
# Report how many communities were found
print(f"The group of investment funds has {len(communities)} communities.")

In [None]:
# Used this example:
# https://graphsandnetworks.com/community-detection-using-networkx/

In [None]:
# Helper functions to plot the graph with community labels
def set_node_community(G, communities):
    '''Store on each node the 1-based index of the community that contains it.

    `communities` is an iterable of node collections. The label is written to
    the node attribute 'community'. Numbering starts at 1 so that 0 stays
    free to mark external edges.
    '''
    for label, members in enumerate(communities, start=1):
        for node in members:
            G.nodes[node]['community'] = label

def set_edge_community(G):
    '''Label every edge with the community it lies in, or 0 if it crosses communities.

    Reads the 'community' attribute of both end nodes and writes the result
    to the edge attribute 'community': the shared community label for an
    internal edge, 0 for an external edge.
    '''
    for source, target in G.edges:
        source_label = G.nodes[source]['community']
        target_label = G.nodes[target]['community']
        G.edges[source, target]['community'] = (
            source_label if source_label == target_label else 0
        )

def get_color(i, r_off=1, g_off=1, b_off=1):
    '''Return a deterministic RGB color for the integer index `i`.

    Each channel takes one of 16 evenly spaced levels between 0.1 and 0.9.
    The level is chosen by multiplying the shifted index by a per-channel
    stride (3 for red, 5 for green, 7 for blue) modulo 16, so the colors
    repeat every 16 indices.

    Args:
        i: Index to colorize (e.g. a community label).
        r_off: Shift added to `i` for the red channel.
        g_off: Shift added to `i` for the green channel.
        b_off: Shift added to `i` for the blue channel.

    Returns:
        A tuple `(r, g, b)` of floats.
    '''
    n = 16
    low, high = 0.1, 0.9
    span = high - low

    def channel(offset, stride):
        # Map the shifted index onto one of the n levels in [low, high].
        return low + span * (((i + offset) * stride) % n) / (n - 1)

    return (channel(r_off, 3), channel(g_off, 5), channel(b_off, 7))

In [None]:
# Annotate nodes and edges with their community labels
set_node_community(G, communities)
set_edge_community(G)

# One color per node, derived from the node's community label
node_color = [get_color(attrs['community']) for _node, attrs in G.nodes(data=True)]

# Split the edges into inter-community edges (external, label 0) and edges
# between members of the same community (internal, label > 0); internal edges
# are all drawn in black
external = [(v, w) for v, w, label in G.edges(data='community') if label == 0]
internal = [(v, w) for v, w, label in G.edges(data='community') if label > 0]
internal_color = ['black'] * len(internal)

In [None]:
# Draw the community view in two passes that share one spring layout
plt.rcParams['figure.figsize'] = (15, 10)
comm_pos = nx.spring_layout(G)

# Pass 1: external edges in silver, with zero-size node markers
nx.draw_networkx(G, pos=comm_pos, edgelist=external, edge_color="silver", node_size=0)
# Pass 2: nodes in their community color together with the internal edges
nx.draw_networkx(
    G,
    pos=comm_pos,
    edgelist=internal,
    edge_color=internal_color,
    node_color=node_color,
)