In [1]:
import pandas as pd

In [2]:
data_followings = pd.read_pickle("community_detection_data.pkl")
df_followings = data_followings['followings']

# Leiden community detection

In [3]:
import igraph as ig
import leidenalg as la

In [4]:
edges = list(zip(df_followings['Follower_Id'], df_followings['Followed_Id']))

In [5]:
grph = ig.Graph.TupleList(edges, directed=True)

In [6]:
objective_function = la.RBConfigurationVertexPartition

# Best possible solution based on resolution

In [7]:
resolutions = [0.5, 1, 1.5, 2, 2.5, 3]

for resl in resolutions:

    test_partition_leiden = la.find_partition(
        grph, 
        objective_function, 
        resolution_parameter=resl,
        n_iterations=10,
        seed=42
    )

    test_modularity_leiden = grph.modularity(test_partition_leiden)

    print(f"For resolution: {resl}, Leiden found {len(test_partition_leiden)} communities with modularity {test_modularity_leiden:.4f}")

For resolution: 0.5, Leiden found 2493 communities with modularity 0.2919
For resolution: 1, Leiden found 1741 communities with modularity 0.4337
For resolution: 1.5, Leiden found 1562 communities with modularity 0.4098
For resolution: 2, Leiden found 1543 communities with modularity 0.3780
For resolution: 2.5, Leiden found 1547 communities with modularity 0.3573
For resolution: 3, Leiden found 1565 communities with modularity 0.3406


In [8]:
resolution = 1.0 

# Execute Leiden

In [9]:
partition = la.find_partition(
    grph, 
    objective_function, 
    resolution_parameter=resolution,
    n_iterations=10,
    seed=42
)

modularity = grph.modularity(partition)

In [10]:
print(f"Leiden found {len(partition)} communities with modularity {modularity:.4f}")

Leiden found 1741 communities with modularity 0.4337


In [11]:
df_communities = []
adj_outgoing_edges_leiden = {v.index: set(grph.successors(v.index)) for v in grph.vs}

for i, community_nodes in enumerate(partition):
    subgraph = grph.subgraph(community_nodes)

    nodes = subgraph.vs.indices
    nodes_set = set(nodes)
    
    internal_density = subgraph.density()
    
    edges_in_len = len(subgraph.es)
    edges_out_len = sum(len(adj_outgoing_edges_leiden[node] - nodes_set) for node in nodes)
    
    if (edges_in_len + edges_out_len) > 0:
        conductance = edges_out_len / (edges_in_len + edges_out_len)
    else:
        conductance = 0
        
    df_communities.append({
        'Community': i,
        'Size': len(community_nodes),
        'Density': internal_density,
        'Conductance': conductance 
    })

df_community_summary = pd.DataFrame(df_communities).sort_values(by='Size', ascending=False)

In [12]:
df_community_summary

Unnamed: 0,Community,Size,Density,Conductance
0,0,147071,0.000150,0.588191
1,1,46218,0.000138,0.963146
2,2,45581,0.000498,0.881181
3,3,44992,0.002131,0.639614
4,4,43015,0.000311,0.930063
...,...,...,...,...
1736,1736,2,0.500000,0.999299
1737,1737,2,0.500000,0.999299
1738,1738,2,0.500000,0.999299
1739,1739,2,0.500000,0.999299


In [13]:
df_community_summary.describe()

Unnamed: 0,Community,Size,Density,Conductance
count,1741.0,1741.0,1741.0,1741.0
mean,870.0,278.145893,0.44539,0.998679
std,502.727726,4302.89821,0.156627,0.013789
min,0.0,2.0,0.000138,0.588191
25%,435.0,2.0,0.333333,0.999299
50%,870.0,2.0,0.5,0.999299
75%,1305.0,3.0,0.5,0.99981
max,1740.0,147071.0,1.0,0.999939


# Users in communities

In [14]:
membership_leiden = partition.membership

df_community_users = pd.DataFrame({
    "Community": membership_leiden,
    "User_Id": grph.vs['name']
})

In [15]:
df_community_users

Unnamed: 0,Community,User_Id
0,3,brendafranzo
1,3,njsinc4el
2,3,praguebob
3,3,hungheroic
4,4,zachhwilliams
...,...,...
484247,6,macochan
484248,7,lacivetta
484249,3,rabravoc
484250,4,t3chlover


In [16]:
df_communities = df_community_users.groupby('Community')['User_Id'].size().reset_index(name='Size')

In [17]:
df_communities

Unnamed: 0,Community,Size
0,0,147071
1,1,46218
2,2,45581
3,3,44992
4,4,43015
...,...,...
1736,1736,2
1737,1737,2
1738,1738,2
1739,1739,2


In [22]:
df_communities.sort_values('Size', ascending=False)

Unnamed: 0,Community,Size
0,0,147071
1,1,46218
2,2,45581
3,3,44992
4,4,43015
...,...,...
1736,1736,2
1737,1737,2
1738,1738,2
1739,1739,2


In [18]:
community_size_min_threshold = 4 # As 75% communities are <= 3
df_communities_filtered = df_communities[df_communities['Size'] >= community_size_min_threshold]

In [19]:
df_communities_filtered


Unnamed: 0,Community,Size
0,0,147071
1,1,46218
2,2,45581
3,3,44992
4,4,43015
...,...,...
259,259,4
260,260,4
261,261,4
262,262,4


# Export community data

In [20]:
community_data = { 'Communities': df_communities, 'Communities_filtered': df_communities_filtered }

pd.to_pickle(community_data, 'community_data.pkl')

In [21]:
community_user_data = { 'Users': df_community_users }

pd.to_pickle(community_user_data, 'community_users_data.pkl')