In [1]:
# Import libraries

import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import networkx as nx
import community as community_louvain



In [2]:
# Read the five filtered CSV exports (comments, users, following, likes,
# entries/posts) into DataFrames, in that order.
comments, users, following, likes, entries = map(
    pd.read_csv,
    (
        "data/filtered_comments.csv",
        "data/filtered_users.csv",
        "data/filtered_following.csv",
        "data/filtered_likes.csv",
        "data/filtered_entries.csv",
    ),
)

In [None]:
# show a graph of people following other people

# Directed graph with one edge FollowerID -> FollowedID per row of `following`.
G = nx.from_pandas_edgelist(following, source='FollowerID', target='FollowedID', create_using=nx.DiGraph())
print("-> Graph created successfully.")
print(f"-> Nodes (Users): {G.number_of_nodes()}")
print(f"-> Edges (Connections): {G.number_of_edges()}")
print("-" * 30)

# The recorded run of this cell reports ~484k nodes and ~18.5M edges. A
# force-directed layout of a graph that size does not finish in practical time,
# so the figure is restricted to the most connected users when the graph is
# larger than MAX_DRAWN_NODES. The full graph `G` is left untouched for the
# analysis steps that follow.
MAX_DRAWN_NODES = 1000
if G.number_of_nodes() > MAX_DRAWN_NODES:
    print(f"-> Graph is too large to draw in full. The {MAX_DRAWN_NODES} highest-degree nodes will be visualized.")
    nodes_by_degree = sorted(G.degree(), key=lambda node_deg: node_deg[1], reverse=True)
    G_drawn = G.subgraph([node for node, _ in nodes_by_degree[:MAX_DRAWN_NODES]])
else:
    G_drawn = G

plt.figure(figsize=(10,10))
# Fixed seed so the layout (and the saved image) is reproducible across runs.
pos = nx.spring_layout(G_drawn, k=0.1, seed=42)
nx.draw(G_drawn, pos, node_size=10, alpha=0.3, arrows=False)
plt.title('Follower Network')
plt.savefig('results/follower_network.png', dpi=300, bbox_inches='tight')
plt.show()


# --- 2. Detect Communities with Louvain Algorithm (from your existing code) ---
print("Step 2: Running the Louvain algorithm...")
# python-louvain only accepts undirected graphs and raises TypeError for a
# DiGraph, so follower edges are collapsed into undirected ties first
# (an edge is kept when it exists in either direction).
G_undirected = G.to_undirected()
# Louvain is randomized; a fixed random_state makes the community ids stable
# between runs.
partition = community_louvain.best_partition(G_undirected, random_state=42)
# The undirected copy is only needed for the partitioning itself; release it.
del G_undirected
print("-> Community detection complete.")
print("-" * 30)

# --- 3. Analyze and Process the Results (from your existing code) ---
print("Step 3: Analyzing community results...")

# One row per node: the node id and the community label Louvain assigned to it.
node_labels = list(partition.items())
partition_df = pd.DataFrame(node_labels, columns=['ID', 'CommunityID'])

num_communities = partition_df['CommunityID'].nunique()
print(f"-> Discovered {num_communities} distinct communities.")

# Member count per community, largest first (value_counts sorts descending).
community_sizes = (
    partition_df['CommunityID']
    .value_counts()
    .rename_axis('CommunityID')
    .reset_index(name='NumberOfMembers')
)

print("\nTop 10 Largest Social Circles:")
top_ten = community_sizes.head(10)
print(top_ten.to_string(index=False))
print("-" * 30)

# --- 4. Merge and Save Results (from your existing code) ---
print("Step 4: Merging and saving results...")
output_filepath = "results/users_with_communities.csv"

# Inner join on 'ID': only users that received a community label are kept.
users_with_communities = users.merge(partition_df, how='inner', on='ID')
users_with_communities.to_csv(output_filepath, encoding='utf-8', index=False)

print(f"-> Successfully saved results to '{output_filepath}'")
print("-" * 30)

# --- 5. Display a Sample from the Largest Community (from your existing code) ---
# Bound up front: the visualization step further down formats this name
# unconditionally, so it must exist even when no community was found.
largest_community_id = None
if community_sizes.empty:
    print("No communities were found to display a sample.")
else:
    # community_sizes is ordered by member count (descending), so the first
    # row is the largest community.
    largest_community_id = community_sizes['CommunityID'].iloc[0]
    print(f"Step 5: Sample of users from the largest community (ID: {largest_community_id}):")
    largest_community_members = users_with_communities[
        users_with_communities['CommunityID'] == largest_community_id
    ]
    print(largest_community_members[['ID', 'Name', 'CommunityID']].head(15).to_string(index=False))
print("-" * 30)


# --- 6. NEW: Plot Degree Distribution ---
print("Step 6: Generating Degree Distribution Plot...")
# Degree of every node, in node-iteration order.
degrees = [node_degree for _, node_degree in G.degree()]

# Histogram with a logarithmic count axis.
fig, ax = plt.subplots(figsize=(12, 7))
ax.hist(degrees, bins=100, log=True, color='skyblue', ec='black')
ax.set_title('Degree Distribution of the Social Network (Log Scale)', fontsize=16)
ax.set_xlabel('Degree (Number of Connections)', fontsize=12)
ax.set_ylabel('Number of Users (Log Scale)', fontsize=12)
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Write the figure to disk before showing it inline.
degree_plot_path = 'results/degree_distribution.png'
fig.savefig(degree_plot_path)
print(f"-> Degree distribution plot saved to '{degree_plot_path}'")
plt.show() # Display the plot in the notebook
print("-" * 30)


# --- 7. NEW: Visualize the Largest Community ---
print(f"Step 7: Visualizing the largest community (ID: {largest_community_id})...")

# Collect every node whose Louvain label equals the largest community's id.
nodes_in_community = []
for node, c_id in partition.items():
    if c_id == largest_community_id:
        nodes_in_community.append(node)

# Induced subgraph: those nodes plus the edges of G that run between them.
subgraph = G.subgraph(nodes_in_community)
member_count = subgraph.number_of_nodes()
print(f"-> Largest community has {member_count} members.")

# Draw everything by default; fall back to a fixed-seed random sample of 200
# nodes when the community is too big to render legibly.
vis_subgraph = subgraph
if member_count > 200:
    print("-> Community is too large to draw clearly. A random sample of 200 nodes will be visualized.")
    community_nodes = pd.Series(list(subgraph.nodes()))
    sampled_nodes = community_nodes.sample(200, random_state=42).tolist()
    vis_subgraph = subgraph.subgraph(sampled_nodes)

# Seeded spring layout, then nodes and edges drawn as separate layers.
fig, ax = plt.subplots(figsize=(15, 15))
pos = nx.spring_layout(vis_subgraph, seed=42, k=0.15)
nx.draw_networkx_nodes(vis_subgraph, pos, ax=ax, node_size=50, node_color='lightgreen', alpha=0.9)
nx.draw_networkx_edges(vis_subgraph, pos, ax=ax, width=0.5, alpha=0.3, edge_color='gray')

ax.set_title(f'Visualization of Largest Social Circle (Community ID: {largest_community_id})', fontsize=20)
ax.set_axis_off() # Hide the axes

# Persist the figure, then show it inline.
community_plot_path = 'results/largest_community_graph.png'
fig.savefig(community_plot_path)
print(f"-> Community graph saved to '{community_plot_path}'")
plt.show() # Display the plot in the notebook
print("-" * 30)


-> Graph created successfully.
-> Nodes (Users): 484251
-> Edges (Connections): 18477145
------------------------------


In [None]:
# How many of the largest communities the per-community analysis covers.
TOP_N_COMMUNITIES = 10

def print_stats(data_series, description):
    """Print a titled summary of a pandas Series.

    The output is a ``--- description ---`` header, then either the mean,
    median, standard deviation, minimum and maximum of ``data_series`` or a
    "no data" notice when the Series is empty, and finally a dashed rule as
    wide as the header followed by a blank line.

    Args:
        data_series: pandas Series of values to summarise.
        description: Title shown in the header line.
    """
    closing_rule = "-" * (len(description) + 8) + "\n"

    print(f"--- {description} ---")
    if data_series.empty:
        print("No data available to calculate statistics.")
        print(closing_rule)
        return

    summary_lines = (
        f"Average: {data_series.mean():.2f}",
        f"Median: {data_series.median():.2f}",
        f"Standard Deviation: {data_series.std():.2f}",
        f"Min: {data_series.min()}",
        f"Max: {data_series.max()}",
    )
    for line in summary_lines:
        print(line)
    print(closing_rule)

# Ids of the TOP_N_COMMUNITIES communities with the most members, largest first.
top_communities = users_with_communities['CommunityID'].value_counts().nlargest(TOP_N_COMMUNITIES).index

# Likes / comments per post do not depend on the community being analysed, so
# they are computed once here instead of once per loop iteration.
likes_per_post = likes.groupby('PostID').size().rename('LikesCount')
comments_per_post = comments.groupby('PostID').size().rename('CommentsCount')


def _avg_received_per_post(posts, counts_per_post):
    """Return, per author, the mean of ``counts_per_post`` over that author's posts.

    Args:
        posts: DataFrame with at least 'PostID' and 'PostedBy' columns.
        counts_per_post: named Series indexed by 'PostID' holding a count per post.

    Returns:
        Series indexed by 'PostedBy'. Posts missing from ``counts_per_post``
        count as 0. An empty float Series is returned when ``posts`` is empty.
    """
    if posts.empty:
        # Explicit dtype: a dtype-less empty Series triggers a pandas warning.
        return pd.Series(dtype=float)
    count_col = counts_per_post.name
    merged = posts[['PostID', 'PostedBy']].merge(counts_per_post, on='PostID', how='left')
    # Only the count column is filled; the other columns are left untouched.
    merged = merged.fillna({count_col: 0})
    return merged.groupby('PostedBy')[count_col].mean()


# NOTE: every "count" statistic below is built with value_counts(), which only
# lists users that appear at least once. Members with zero followers, posts,
# likes or comments are therefore not part of the reported averages.
for community_id in top_communities:
    print("=============================================")
    print(f"      ANALYZING COMMUNITY ID: {community_id}")
    print("=============================================\n")

    # 1. Get all user IDs for the current community
    community_members = users_with_communities.loc[
        users_with_communities['CommunityID'] == community_id, 'ID'
    ]

    # 2. Filter the main dataframes for members of this community
    is_follower = following['FollowerID'].isin(community_members)
    is_followed = following['FollowedID'].isin(community_members)
    entries_c = entries[entries['PostedBy'].isin(community_members)]
    likes_c = likes[likes['userID'].isin(community_members)]
    comments_c = comments[comments['PostedBy'].isin(community_members)]

    # --- Follower Count ---
    # Rows where a member is being followed (the follower may be anyone).
    follower_count = following.loc[is_followed, 'FollowedID'].value_counts()
    print_stats(follower_count, f"Follower Count (Community {community_id})")

    # --- Following Count ---
    # Rows where a member follows someone (the followed user may be anyone).
    following_count = following.loc[is_follower, 'FollowerID'].value_counts()
    print_stats(following_count, f"Following Count (Community {community_id})")

    # --- Posts Created ---
    posts_created = entries_c['PostedBy'].value_counts()
    print_stats(posts_created, f"Posts Created (Community {community_id})")

    # --- Likes Received on Average per Post ---
    avg_likes_received = _avg_received_per_post(entries_c, likes_per_post)
    print_stats(avg_likes_received, f"Avg. Likes Received per Post (Community {community_id})")

    # --- Likes Given ---
    likes_given = likes_c['userID'].value_counts()
    print_stats(likes_given, f"Likes Given (Community {community_id})")

    # --- Comments Received on Average per Post ---
    avg_comments_received = _avg_received_per_post(entries_c, comments_per_post)
    print_stats(avg_comments_received, f"Comments Received per Post (Community {community_id})")

    # --- Comments Given ---
    comments_given = comments_c['PostedBy'].value_counts()
    print_stats(comments_given, f"Comments Given (Community {community_id})")


