# 1. Setup

In [None]:
# %pip install pandas spacy networkx matplotlib beautifulsoup4
# !python -m spacy download en_core_web_sm

import pandas as pd
import spacy
import networkx as nx
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import re

# 2. Data Loading and Preprocessing

In [None]:
# Load the raw teacher records; the bare filename is resolved relative to the current working directory.
df = pd.read_csv('teachers_db_practice.csv')
def clean_html(text):
    """Return ``text`` with its HTML markup removed.

    String input is parsed with BeautifulSoup's ``html.parser`` and reduced to
    its text content. Any value that is not a ``str`` is handed back unchanged.
    """
    if not isinstance(text, str):
        return text
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()

# Store an HTML-free copy of `full_info`; non-string cells pass through clean_html unchanged.
df['cleaned_info'] = df['full_info'].apply(clean_html)

# 3. Named Entity Recognition (NER)

In [None]:
# Load the spaCy English pipeline used for NER below. `en_core_web_sm` matches the
# download command noted in the Setup cell; the `en_core_web_lg` alternative is kept
# commented out for reference.
# nlp = spacy.load('en_core_web_lg')
nlp = spacy.load('en_core_web_sm')

### Define a function to extract the entities.

In [None]:
import re

def clean_entity_text(text):
    """Clean and normalize a single entity string.

    Steps, in order:
      1. Trim the string and collapse every run of whitespace (newlines, tabs,
         non-breaking spaces, ...) into a single space, so that the
         literal-space patterns below also match text broken across lines.
      2. Remove bullet characters together with the whitespace after them.
      3. Remove the section headings "Academic Background" and
         "Corporate Experience" (case-insensitive).
      4. Remove the country suffixes ", USA", ", Spain", ", U.K." and
         ", Belgium".
      5. Collapse whitespace again (the removals can leave double spaces) and
         strip leftover bullets, commas and spaces from both ends.

    Args:
        text (str): Raw entity text.

    Returns:
        str: The normalized entity text; may be empty if nothing remains.
    """
    # Normalize whitespace first so variants of one name map to the same string
    text = re.sub(r'\s+', ' ', text.strip())
    text = re.sub(r'•\s*', '', text)  # Remove bullet points
    text = re.sub(r'Academic Background', '', text, flags=re.IGNORECASE)
    text = re.sub(r'Corporate Experience', '', text, flags=re.IGNORECASE)
    text = re.sub(r', (USA|Spain|U\.K\.|Belgium)', '', text)  # Remove country suffixes
    # Removing a phrase from the middle of the text can leave a double space behind
    text = re.sub(r'\s+', ' ', text)
    return text.strip('•, ')

# Degree mentions such as "Master in X", "Ph.D. in Y", "Bachelor of Z". The field
# (group 3) is matched lazily and must be followed by a comma, a 4-digit year or a bullet.
_STUDY_PATTERN = re.compile(r'\b(Master|Bachelor|Ph\.D\.|M\.A\.|B\.S\.|M\.B\.A\.|E\.M\.B\.A\.)\s*(in|of|degree in)?\s*([\w\s,&]+?)(?=\s*,|\s*\d{4}|\s*•)')

# An ORG entity containing any of these words is classified as a university.
_UNIVERSITY_MARKERS = ('University', 'College', 'School', 'Institute', 'Universidad')

# Cue phrases; the text after the phrase (up to the first comma) is taken as a course name.
_COURSE_KEYWORDS = ('course on', 'program in', 'lectures on', 'teaches')


def extract_and_clean_entities(text):
    """Extract universities, companies, studies and courses from one text.

    Uses the module-level spaCy pipeline ``nlp`` and ``clean_entity_text``.

    * UNIVERSITY / COMPANY: spaCy entities labelled ``ORG``. An entity whose
      cleaned text contains a university marker word is a university; otherwise
      it is a company when it is longer than 3 characters and does not contain
      "experience".
    * STUDY: regex matches of degree mentions, always rendered as
      "<degree> in <field>" (also when the source text says "of").
    * COURSE: for each sentence containing a cue phrase, the text after the
      phrase up to the first comma.

    Args:
        text: Plain text to analyse. Non-string values (e.g. NaN from a missing
            cell) and blank strings yield an empty result instead of being
            passed to spaCy.

    Returns:
        dict: Keys ``'UNIVERSITY'``, ``'STUDY'``, ``'COMPANY'``, ``'COURSE'``,
        each mapping to a sorted list of unique strings.
    """
    entities = {'UNIVERSITY': set(), 'STUDY': set(), 'COMPANY': set(), 'COURSE': set()}

    # Missing or blank text: nothing to extract, and nlp() does not accept non-strings
    if not isinstance(text, str) or not text.strip():
        return {k: [] for k in entities}

    doc = nlp(text)

    # --- 1. Extract Universities and Companies using spaCy's ORG label ---
    for ent in doc.ents:
        if ent.label_ != 'ORG':
            continue
        cleaned_text = clean_entity_text(ent.text)
        if any(marker in cleaned_text for marker in _UNIVERSITY_MARKERS):
            entities['UNIVERSITY'].add(cleaned_text)
        # Length check avoids fragments like "EU"
        elif len(cleaned_text) > 3 and "experience" not in cleaned_text.lower():
            entities['COMPANY'].add(cleaned_text)

    # --- 2. Extract Studies using the degree regex ---
    for match in _STUDY_PATTERN.finditer(text):
        # Combine the degree type (e.g., "Master") with the field (e.g., "Graphic Design")
        full_study = f"{match.group(1)} in {match.group(3).strip()}"
        entities['STUDY'].add(clean_entity_text(full_study))

    # --- 3. Extract Courses ---
    for sent in doc.sents:
        sentence = sent.text
        for keyword in _COURSE_KEYWORDS:
            if keyword not in sentence:
                continue
            # Split the sentence at the keyword and keep what follows, up to the first comma
            course_name = clean_entity_text(sentence.split(keyword, 1)[1].strip().split(',')[0])
            # A cue phrase at the very end of a sentence leaves nothing; skip it
            # rather than record an empty-string entity
            if course_name:
                entities['COURSE'].add(course_name)

    # Sorted lists: set iteration order for strings varies between interpreter
    # runs, which would make everything built from this output non-reproducible
    return {k: sorted(v) for k, v in entities.items()}

In [None]:
# Run entity extraction on every row's cleaned text; each cell of `entities` holds the
# dict returned by extract_and_clean_entities.
df['entities'] = df['cleaned_info'].apply(extract_and_clean_entities)

In [None]:
# Preview the extraction result for the rows returned by df.head(): alias header,
# the entities dict, then a 30-dash separator, each on its own line.
for index, row in df.head().iterrows():
    print(f"Entities for {row['alias']}:", row['entities'], "-" * 30, sep="\n")

# 4. Social Network Analysis (SNA) with NetworkX

In [None]:
# Build the knowledge graph from the cleaned entities: one node per professor,
# one node per extracted entity, and a typed edge from the professor to each entity.
G_cleaned = nx.Graph()

# Key in row['entities'] -> (node 'type' attribute, edge 'relation' attribute).
# The dict order fixes the order in which nodes and edges are added for each professor.
entity_links = {
    'UNIVERSITY': ('university', 'affiliated_with'),
    'COMPANY': ('company', 'worked_at'),
    'STUDY': ('study', 'studied'),
    'COURSE': ('course', 'teaches'),
}

for index, row in df.iterrows():
    professor_alias = row['alias']
    G_cleaned.add_node(professor_alias, type='professor')

    entities = row['entities']
    for entity_key, (node_type, relation) in entity_links.items():
        for entity_name in entities[entity_key]:
            # Re-adding an existing name updates its attributes, so an entity
            # shared by several professors stays a single node
            G_cleaned.add_node(entity_name, type=node_type)
            G_cleaned.add_edge(professor_alias, entity_name, relation=relation)

# Layout for the full graph; seeded so the positions are the same on every run
pos = nx.spring_layout(G_cleaned, k=0.6, iterations=50, seed=42)

# Define colors for the node types
node_colors_map = {
    'professor': 'red',
    'university': 'blue',
    'company': 'green',
    'study': 'purple',
    'course': 'orange'
}
node_colors = [node_colors_map.get(G_cleaned.nodes[node]['type'], 'grey') for node in G_cleaned.nodes()]

# Full-graph drawing is disabled. The figure creation is kept with it so that no
# empty 20x20 figure is emitted while the draw call is commented out.
# plt.figure(figsize=(20, 20))
# nx.draw(G_cleaned, pos, with_labels=True, node_color=node_colors, node_size=1500, font_size=8, alpha=0.7)
# plt.title('Cleaned and Normalized Knowledge Graph')
# plt.show()

In [None]:
import pandas as pd

# Flatten every edge of G_cleaned into one record.
# data=True fetches the edge attributes that were added (here 'relation').
edge_list = [
    {"Source": source, "Relationship": data['relation'], "Target": target}
    for source, target, data in G_cleaned.edges(data=True)
]

# Explicit columns keep the table schema stable even when the graph has no edges
graph_table = pd.DataFrame(edge_list, columns=["Source", "Relationship", "Target"])

# Show every row and the full cell text for this one display only; option_context
# restores the previous display settings afterwards instead of changing them globally.
print("Graph 'G_cleaned' represented as a table:")
with pd.option_context('display.max_rows', None, 'display.max_colwidth', None):
    display(graph_table)

In [None]:
# Basic graph information
num_nodes = G_cleaned.number_of_nodes()
num_edges = G_cleaned.number_of_edges()
print(f"Number of nodes: {num_nodes}")
print(f"Number of edges: {num_edges}")

# Check if the graph is connected. nx.is_connected raises on a graph without
# nodes, so an empty graph is reported as not connected instead of crashing.
is_connected = num_nodes > 0 and nx.is_connected(G_cleaned)
print(f"Is the graph connected? {is_connected}")

# If not connected, report how many connected components there are
if not is_connected:
    num_components = nx.number_connected_components(G_cleaned)
    print(f"Number of connected components: {num_components}")

In [None]:
# Centrality measures on the full graph G_cleaned. Each section computes a
# {node: score} dict, sorts it by score in descending order into a list of
# (node, score) pairs, and prints the first five entries.
# sorted_degree and sorted_betweenness are reused further down to pick the
# nodes whose ego networks are plotted.

# Degree Centrality: Number of connections a node has
# (as returned by nx.degree_centrality)
degree_centrality = nx.degree_centrality(G_cleaned)
sorted_degree = sorted(degree_centrality.items(), key=lambda item: item[1], reverse=True)
print("Top 5 nodes by Degree Centrality:")
for node, centrality in sorted_degree[:5]:
    print(f"- {node}: {centrality:.4f}")

print("-" * 30)

# Betweenness Centrality: How often a node lies on the shortest path between other nodes
betweenness_centrality = nx.betweenness_centrality(G_cleaned)
sorted_betweenness = sorted(betweenness_centrality.items(), key=lambda item: item[1], reverse=True)
print("Top 5 nodes by Betweenness Centrality:")
for node, centrality in sorted_betweenness[:5]:
    print(f"- {node}: {centrality:.4f}")

print("-" * 30)

# Eigenvector Centrality: Influence of a node in the network
# max_iter is raised to 1000 to give the iteration more room to converge.
# NOTE(review): a higher cap presumably makes convergence more likely but may not
# guarantee it — confirm the behavior when the limit is reached.
eigenvector_centrality = nx.eigenvector_centrality(G_cleaned, max_iter=1000)
sorted_eigenvector = sorted(eigenvector_centrality.items(), key=lambda item: item[1], reverse=True)
print("Top 5 nodes by Eigenvector Centrality:")
for node, centrality in sorted_eigenvector[:5]:
    print(f"- {node}: {centrality:.4f}")

# 4.1 Subgraphs

In [None]:
def plot_subgraph(sub_graph, pos, node_colors_map, title):
    """Draw ``sub_graph`` on a new 15x10 figure, coloring nodes by type.

    Args:
        sub_graph: NetworkX graph to draw; each node's ``type`` attribute
            selects its color.
        pos: Node positions passed straight to ``nx.draw``.
        node_colors_map: Mapping of node type -> color. Nodes whose type is
            missing or not in this mapping are drawn grey.
        title: Figure title.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    plt.figure(figsize=(15, 10))

    # Look the type up with .get() so a node without a 'type' attribute reaches
    # the grey fallback instead of raising KeyError
    sub_node_colors = [
        node_colors_map.get(attrs.get('type'), 'grey')
        for _, attrs in sub_graph.nodes(data=True)
    ]

    nx.draw(sub_graph, pos, with_labels=True, node_color=sub_node_colors, node_size=2000, font_size=10, alpha=0.8)
    plt.title(title, fontsize=20)
    plt.show()

In [None]:
# Node with the highest degree centrality (first entry of the descending-sorted list)
top_degree_node = sorted_degree[0][0]

# Create an "ego graph" for this node. This includes the node and all its direct neighbors.
ego_graph = nx.ego_graph(G_cleaned, top_degree_node)

# Spring layout for this subgraph; seeded so the figure is identical on every run
pos = nx.spring_layout(ego_graph, k=0.8, seed=42)

plot_subgraph(ego_graph, pos, node_colors_map, f"Ego Network for Most Connected Node: {top_degree_node}")

In [None]:
# Node with the highest betweenness centrality (first entry of the descending-sorted list)
top_broker_node = sorted_betweenness[0][0]

# Create an "ego graph" for this node: the node plus all its direct neighbors
broker_ego_graph = nx.ego_graph(G_cleaned, top_broker_node)

# Spring layout for this subgraph; seeded so the figure is identical on every run
pos = nx.spring_layout(broker_ego_graph, k=0.8, seed=42)

plot_subgraph(broker_ego_graph, pos, node_colors_map, f"Ego Network for Top 'Broker' Node: {top_broker_node}")

## But IE University and IE Business School are obviously the most connected nodes, so let's omit them

# 5. Analysis of Filtered Graph

In [None]:
# Work on a copy so that G_cleaned itself keeps every node
G_filtered = G_cleaned.copy()

# Hub nodes to drop from the copy
nodes_to_remove = ['IE University', 'IE Business School']

# Keep only the names that are actually present, so the message below reports
# exactly what gets removed
existing_nodes_to_remove = list(filter(G_filtered.has_node, nodes_to_remove))

if not existing_nodes_to_remove:
    print("Nodes to remove were not found in the graph.")
else:
    print(f"Removing the following nodes: {existing_nodes_to_remove}")
    G_filtered.remove_nodes_from(existing_nodes_to_remove)

# Node counts before and after filtering
print(f"\nOriginal graph node count: {G_cleaned.number_of_nodes()}")
print(f"Filtered graph node count: {G_filtered.number_of_nodes()}")

In [None]:
# --- Re-run Numerical Analysis on the FILTERED Graph ---
# Same three measures as for G_cleaned, computed on G_filtered. Results go into
# separate `_f`-suffixed names, so the full-graph results stay available.
# NOTE(review): this repeats the full-graph centrality cell almost line for line;
# a shared helper taking the graph and a label would remove the duplication.

print("--- Analysis of Filtered Graph ---")

# Degree Centrality
degree_centrality_f = nx.degree_centrality(G_filtered)
sorted_degree_f = sorted(degree_centrality_f.items(), key=lambda item: item[1], reverse=True)
print("Top 5 nodes by Degree Centrality (Filtered):")
for node, centrality in sorted_degree_f[:5]:
    print(f"- {node}: {centrality:.4f}")

print("-" * 30)

# Betweenness Centrality
betweenness_centrality_f = nx.betweenness_centrality(G_filtered)
sorted_betweenness_f = sorted(betweenness_centrality_f.items(), key=lambda item: item[1], reverse=True)
print("Top 5 nodes by Betweenness Centrality (Filtered):")
for node, centrality in sorted_betweenness_f[:5]:
    print(f"- {node}: {centrality:.4f}")

print("-" * 30)

# Eigenvector Centrality (iteration cap raised to 1000, as for the full graph)
eigenvector_centrality_f = nx.eigenvector_centrality(G_filtered, max_iter=1000)
sorted_eigenvector_f = sorted(eigenvector_centrality_f.items(), key=lambda item: item[1], reverse=True)
print("Top 5 nodes by Eigenvector Centrality (Filtered):")
for node, centrality in sorted_eigenvector_f[:5]:
    print(f"- {node}: {centrality:.4f}")

# 6. Subgraph Visualizations (Filtered)

In [None]:
# Helper function for plotting
# NOTE(review): section 4.1 defines a function with this same name; this later
# definition replaces it from here on. Consider keeping a single definition.
def plot_subgraph(sub_graph, pos, node_colors_map, title):
    """Draw ``sub_graph`` on a new 15x10 figure, coloring nodes by type.

    Args:
        sub_graph: NetworkX graph to draw; each node's ``type`` attribute
            selects its color.
        pos: Node positions passed straight to ``nx.draw``.
        node_colors_map: Mapping of node type -> color. Nodes whose type is
            missing or not in this mapping are drawn grey.
        title: Figure title.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    plt.figure(figsize=(15, 10))

    # Look the type up with .get() so a node without a 'type' attribute reaches
    # the grey fallback instead of raising KeyError
    sub_node_colors = [
        node_colors_map.get(attrs.get('type'), 'grey')
        for _, attrs in sub_graph.nodes(data=True)
    ]

    nx.draw(sub_graph, pos, with_labels=True, node_color=sub_node_colors, node_size=2000, font_size=10, alpha=0.8)
    plt.title(title, fontsize=20)
    plt.show()

In [None]:
# Hard-coded list of nodes whose ego networks are plotted, grouped by the
# centrality measure they were taken from.
# NOTE(review): the names are presumably copied from an earlier run's top-5
# output (groups with fewer than five entries presumably omit names already
# listed); they will not follow changes in the data or the NER model. Consider
# deriving the list from sorted_degree_f / sorted_betweenness_f / sorted_eigenvector_f.
nodes_to_visualize = [
    # From the degree ranking
    'Business Administration',
    'Appius Licinius Cicero',
    'Horatia Pulchra',
    'Universidad Autónoma de Madrid',
    'Marcus Fabius Crassus',

    # From the betweenness ranking
    'Harvard University',
    'IE Law School',
    'Universidad Complutense de Madrid',

    # From the eigenvector ranking
    'Bachelor in Business Administration',
    'IESE',
    'Gaius Julius Scipio'
]

# Loop through each node and plot its ego network
for node_name in nodes_to_visualize:
    # Names that are not in the filtered graph are reported and skipped
    if not G_filtered.has_node(node_name):
        print(f"Node '{node_name}' not found in the filtered graph. Skipping.")
        print("-" * 30)
        continue

    # Ego graph: the node plus all its direct neighbors
    ego_graph = nx.ego_graph(G_filtered, node_name)

    # Spring layout; seeded so each figure is identical on every run
    pos = nx.spring_layout(ego_graph, k=0.8, iterations=50, seed=42)

    # Plot the subgraph using the plot_subgraph helper
    plot_subgraph(ego_graph, pos, node_colors_map, f"Ego Network for: {node_name}")

In [None]:
# --- Final Cell: Export Interactive Graphs to HTML ---

# 1. Install pyvis (uncomment the line below if you haven't installed it)
# !pip install pyvis

from pyvis.network import Network
import re

# 2. Define the color map for pyvis (using HTML hex codes)
# Keys are node 'type' values; 'other' is the grey used by the export loop for
# nodes that have no 'type' attribute.
pyvis_color_map = {
    'professor': '#FF0000', 
    'university': '#0000FF', 
    'company': '#008000', 
    'study': '#800080', 
    'course': '#FFA500',
    'other': '#808080'
}

# 3. List of nodes to export
# NOTE(review): this reassigns nodes_to_visualize with the same names as the
# list in the ego-network plotting cell; the two copies must be kept in sync by
# hand. Groups with fewer than five entries presumably omit names already listed.
nodes_to_visualize = [
    # From the degree ranking
    'Business Administration',
    'Appius Licinius Cicero',
    'Horatia Pulchra',
    'Universidad Autónoma de Madrid',
    'Marcus Fabius Crassus',
    
    # From the betweenness ranking
    'Harvard University',
    'IE Law School',
    'Universidad Complutense de Madrid',
    
    # From the eigenvector ranking
    'Bachelor in Business Administration',
    'IESE',
    'Gaius Julius Scipio'
]

print("Starting export of interactive HTML graphs...")

# 4. For every listed node that exists in the filtered graph, build its ego
#    network as a pyvis graph and write it to an HTML file
for node_name in nodes_to_visualize:
    if G_filtered.has_node(node_name):
        # Ego graph from the filtered network
        ego_graph = nx.ego_graph(G_filtered, node_name)

        # pyvis network, populated from the networkx ego graph
        plot_title = f"Interactive Ego Network for: {node_name}"
        nt = Network(height='800px', width='100%', notebook=True, heading=plot_title)
        nt.from_nx(ego_graph)

        # Color each node by its type and expose the type as hover text
        for node in nt.nodes:
            node_type = G_filtered.nodes[node['id']].get('type', 'other')
            node.update(
                color=pyvis_color_map.get(node_type, '#808080'),
                title=f"Type: {node_type}",  # Tooltip on hover
            )

        # File name: drop everything except word characters, whitespace and
        # hyphens, then turn spaces into underscores
        safe_filename = re.sub(r'[^\w\s-]', '', node_name).strip().replace(' ', '_')
        filename = f"interactive_ego_network_{safe_filename}.html"

        # Save the interactive HTML file
        nt.save_graph(filename)
        print(f"Successfully saved: {filename}")
    else:
        print(f"Skipping '{node_name}': Node not found in the filtered graph.")

print("--- Export Complete ---")