# Part 2: Email Behaviour Data Analysis

---

### Install Python packages (pip only)

In [1]:
pip install networkx

Note: you may need to restart the kernel to use updated packages.


### Import Python packages

In [10]:
import networkx as nx
import numpy as np
import json

---

### Task 1 of 1 

Examine the file "emails_cmt224.edgelist" which represents email behaviour at an organisation. Each line contains two numbers, 𝑢 and 𝑣, separated by a blank space. Consider each number as an identifier for an individual in an organisation, with the space on each line representing that the individual, 𝑢, sent at least one email to the another individual, 𝑣, at some point. Model the data using an appropriate, directed network representation and answer the following questions:

##### Q1. Do the majority of individuals have a higher or lower ratio of mutual connections than average in the network?

In [3]:
graph = nx.read_edgelist("emails_cmt224.edgelist", create_using=nx.DiGraph())

# Function to calculate mutual connections ratio for an individual
def calculate_mutual_connections_ratio(graph, node):
    # Get successors (outgoing connections) of the node
    successors = set(graph.successors(node))
    
    # Count mutual connections (nodes who also have an edge to `node`)
    mutual_count = 0
    for neighbor in successors:
        if node in graph.successors(neighbor):
            mutual_count += 1
    
    # Calculate the ratio of mutual connections
    total_connections = len(successors)
    if total_connections > 0:
        ratio = mutual_count / total_connections
    else:
        ratio = 0.0
    
    return ratio

# Calculate mutual connections ratio for each individual and collect ratios
mutual_ratios = {}
for node in graph.nodes():
    ratio = calculate_mutual_connections_ratio(graph, node)
    mutual_ratios[node] = ratio

# Calculate average ratio of mutual connections across all individuals
average_ratio = sum(mutual_ratios.values()) / len(mutual_ratios)

# Compare each individual's ratio with the average ratio
higher_than_average = 0
lower_than_average = 0

for node, ratio in mutual_ratios.items():
    if ratio > average_ratio:
        higher_than_average += 1
    elif ratio < average_ratio:
        lower_than_average += 1

# Determine majority based on comparison with average ratio
if higher_than_average > lower_than_average:
    majority_relationship = "higher"
elif lower_than_average > higher_than_average:
    majority_relationship = "lower"
else:
    majority_relationship = "equal"

print(f"The majority of individuals have a {majority_relationship} ratio of mutual connections than average.")


The majority of individuals have a higher ratio of mutual connections than average.


##### Q2. Using the largest, strongly connected component (where at least one path exists between each individual and all others). Could the connectivity of the component be suggested to be reflective of a small world phenomenon in comparison to the typical connectivity of 10 comparative random networks?

In [23]:
graph = nx.read_edgelist("emails_cmt224.edgelist", create_using=nx.DiGraph())

# Find all strongly connected components
sccs = list(nx.strongly_connected_components(graph))

# Get the largest strongly connected component (SCC)
largest_scc = max(sccs, key=len)

# Create a subgraph of the largest SCC
scc_graph = graph.subgraph(largest_scc)

# Calculate connectivity metrics for the SCC
average_shortest_path_length = nx.average_shortest_path_length(scc_graph)
clustering_coefficient = nx.average_clustering(scc_graph)

print("Metrics for the largest strongly connected component:")
print(f"Average Shortest Path Length: {average_shortest_path_length:.2f}")
print(f"Clustering Coefficient: {clustering_coefficient:.2f}")

# Generate 10 random networks with similar characteristics
num_nodes = len(graph.nodes())
num_edges = len(graph.edges())

random_networks_metrics = []
for _ in range(10):
    random_graph = nx.gnm_random_graph(num_nodes, num_edges, directed=True)
    random_scc = max(nx.strongly_connected_components(random_graph), key=len)
    random_scc_graph = random_graph.subgraph(random_scc)
    random_average_shortest_path_length = nx.average_shortest_path_length(random_scc_graph)
    random_clustering_coefficient = nx.average_clustering(random_scc_graph)
    random_networks_metrics.append((random_average_shortest_path_length, random_clustering_coefficient))

# Calculate mean metrics for the random networks
mean_random_avg_shortest_path_length = np.mean([m[0] for m in random_networks_metrics])
mean_random_clustering_coefficient = np.mean([m[1] for m in random_networks_metrics])

print("\nMetrics for random networks (mean of 10 networks):")
print(f"Mean Average Shortest Path Length: {mean_random_avg_shortest_path_length:.2f}")
print(f"Mean Clustering Coefficient: {mean_random_clustering_coefficient:.2f}")

# Compare SCC metrics with random networks
if (average_shortest_path_length < mean_random_avg_shortest_path_length) and \
   (clustering_coefficient > mean_random_clustering_coefficient):
    print("\nThe largest strongly connected component exhibits properties of a small world network.")
else:
    print("\nThe largest strongly connected component does not exhibit typical small world properties compared to random networks.")


Metrics for the largest strongly connected component:
Average Shortest Path Length: 2.56
Clustering Coefficient: 0.39

Metrics for random networks (mean of 10 networks):
Mean Average Shortest Path Length: 2.54
Mean Clustering Coefficient: 0.03

The largest strongly connected component does not exhibit typical small world properties compared to random networks.


##### Q3. Are occurrences of induced, connected subgraphs of 3 individuals (triads) with only mutual connections more abundant in the largest, strongly connected component than those with a mixture of asymmetric and mutual connections? What does this suggest about how mutual connections are distributed in the component?

In [32]:
def load_graph_from_edgelist(edgelist_file):
    """Load a directed graph from an edgelist file."""
    graph = nx.read_edgelist(edgelist_file, create_using=nx.DiGraph())
    return graph

# Load the graph from the edgelist file
edgelist_file_path = "emails_cmt224.edgelist"
graph = load_graph_from_edgelist(edgelist_file_path)

# Find the largest strongly connected component (SCC)
largest_scc = max(nx.strongly_connected_components(graph), key=len)
largest_scc_graph = graph.subgraph(largest_scc)

# Function to classify triads
def classify_triads(graph):
    triads_mutual = 0
    triads_mixed = 0

    # Iterate over all nodes in the SCC and check triads
    for node in graph:
        neighbors = set(graph.successors(node))  # Outgoing neighbors
        mutual_triads = 0

        # Check each pair of neighbors for mutual connections
        for n1 in neighbors:
            for n2 in neighbors:
                if n1 != n2 and graph.has_edge(n1, n2) and graph.has_edge(n2, n1):
                    mutual_triads += 1

        # Determine triad type based on mutual connections
        if mutual_triads == len(neighbors) * (len(neighbors) - 1):
            triads_mutual += 1  # All connections are mutual
        else:
            triads_mixed += 1    # Some connections are asymmetric

    return triads_mutual, triads_mixed

# Classify triads within the largest SCC
triads_mutual, triads_mixed = classify_triads(largest_scc_graph)

# Compare counts of mutual-only and mixed triads
print("Number of triads with only mutual connections:", triads_mutual)
print("Number of triads with mixed asymmetric and mutual connections:", triads_mixed)

# Interpretation
if triads_mutual > triads_mixed:
    print("Triads with only mutual connections are more abundant.")
else:
    print("Triads with mixed connections are more abundant.")


Number of triads with only mutual connections: 57
Number of triads with mixed asymmetric and mutual connections: 507
Triads with mixed connections are more abundant.


---
### Task 2 of 2

Examine the JSON file "emails_cmt224_departments.json" (departments file). Keys in the departments file represent individuals using the same ids as in the "emails_cmt224.edgelist" file in Part 2, Task 1 and the values represent a department id that the individual can be attributed to. Using the contents of the departments file in combination with the network in Part 2, Task 1, answer the following questions:

##### Q1. Using the connections that individuals have in the network, are they more likely to mix with others in their department or those with a similar number of outward connections?

In [30]:
graph = nx.read_edgelist("emails_cmt224.edgelist", create_using=nx.DiGraph())

# Load the departments data from the JSON file
with open("emails_cmt224_departments.json", "r") as f:
    departments_data = json.load(f)

# Map individuals to their respective departments
individual_department = {}
for individual, department_id in departments_data.items():
    individual_department[individual] = department_id

# Calculate out-degree (number of outward connections) for each individual
outward_connections = {}
for node in graph.nodes():
    outward_connections[node] = graph.out_degree(node)

# Determine mixing patterns
same_department_mix_count = 0
similar_outward_connections_mix_count = 0

for edge in graph.edges():
    sender, receiver = edge
    sender_department = individual_department.get(sender)
    receiver_department = individual_department.get(receiver)
    sender_outward_connections = outward_connections.get(sender)
    receiver_outward_connections = outward_connections.get(receiver)
    
    # Check if sender and receiver are in the same department
    if sender_department == receiver_department:
        same_department_mix_count += 1
    
    # Check if sender and receiver have a similar number of outward connections
    if sender_outward_connections == receiver_outward_connections:
        similar_outward_connections_mix_count += 1

total_connections = graph.number_of_edges()

# Calculate probabilities
prob_same_department_mix = same_department_mix_count / total_connections
prob_similar_outward_connections_mix = similar_outward_connections_mix_count / total_connections

print(f"Probability of mixing with same department individuals: {prob_same_department_mix:.2f}")
print(f"Probability of mixing with individuals with similar outward connections: {prob_similar_outward_connections_mix:.2f}")



Probability of mixing with same department individuals: 0.33
Probability of mixing with individuals with similar outward connections: 0.01


##### Q2. Are all departments with 15 or more members more tightly connected amongst themselves in comparison to all individuals across the overall network irrespective of their department?  Where in this context, 'more tightly connected' is defined as having more mutual AND clustered connections. In addition to answering the overall question as yes or no, provide a list of departments this is true for (if any) and not true for (if any).

In [31]:
graph = nx.read_edgelist("emails_cmt224.edgelist", create_using=nx.DiGraph())

# Load the departments data from the JSON file
with open("emails_cmt224_departments.json", "r") as f:
    departments_data = json.load(f)

# Map individuals to their respective departments
individual_department = {}
for individual, department_id in departments_data.items():
    individual_department[individual] = department_id

# Count members in each department
department_size = {}
for individual, department_id in individual_department.items():
    if department_id in department_size:
        department_size[department_id] += 1
    else:
        department_size[department_id] = 1

# Identify departments with 15 or more members
large_departments = [dept_id for dept_id, size in department_size.items() if size >= 15]

# Function to calculate mutual connections and clustering coefficient for a subgraph
def calculate_connectivity_metrics(subgraph):
    mutual_connections = sum(1 for u, v in subgraph.edges() if subgraph.has_edge(v, u))
    clustering_coefficient = nx.average_clustering(subgraph)
    return mutual_connections, clustering_coefficient

# Calculate metrics for the overall network
overall_mutual_connections, overall_clustering_coefficient = calculate_connectivity_metrics(graph)

# Compare departments with 15 or more members to the overall network
tightly_connected_departments = []
not_tightly_connected_departments = []

for dept_id in large_departments:
    # Create a subgraph for the current department
    department_members = [indiv for indiv, dept in individual_department.items() if dept == dept_id]
    department_subgraph = graph.subgraph(department_members)
    
    # Calculate metrics for the department's subgraph
    dept_mutual_connections, dept_clustering_coefficient = calculate_connectivity_metrics(department_subgraph)
    
    # Compare department metrics with overall network metrics
    if dept_mutual_connections > overall_mutual_connections and dept_clustering_coefficient > overall_clustering_coefficient:
        tightly_connected_departments.append(dept_id)
    else:
        not_tightly_connected_departments.append(dept_id)

# Determine the overall answer (yes or no)
if tightly_connected_departments:
    print("Yes, departments with 15 or more members are more tightly connected amongst themselves compared to the overall network.")
    print("List of departments where this is true:", tightly_connected_departments)
else:
    print("No, departments with 15 or more members are not more tightly connected amongst themselves compared to the overall network.")
    print("List of departments where this is not true:", not_tightly_connected_departments)



No, departments with 15 or more members are not more tightly connected amongst themselves compared to the overall network.
List of departments where this is not true: ['1', '21', '14', '9', '17', '11', '10', '36', '7', '4', '22', '8', '15', '16', '13', '6', '0', '23', '19']
