Data Pre-Processing after Web Scraping for Chord Diagram

In [None]:
import pandas as pd
import json
from google.colab import files

In [None]:
# Read the professor/papers CSV into the working DataFrame
dataset_path = '/content/professor_papers_data.csv'
data = pd.read_csv(dataset_path)

In [None]:
# Fill missing research interests and professor links with empty strings so the
# string operations below always receive str values.
data['Research Interests'] = data['Research Interests'].fillna('')
data['Professor Link'] = data['Professor Link'].fillna('')

# Trim surrounding whitespace and title-case professor names. Link sources are
# stripped when the links are built, so the names stored here (and used as node
# ids) must be stripped as well or a padded name would produce a node id that no
# link refers to.
data['Professor Name'] = data['Professor Name'].str.strip().str.title()


def _normalize_interests(raw):
    """Return ``raw`` as a canonical ', '-separated interest string.

    Splits on every comma (with or without a following space), trims each
    entry, and drops blank entries, so that later code splitting on ', '
    gets clean tokens.
    """
    tokens = (token.strip() for token in raw.split(','))
    return ', '.join(token for token in tokens if token)


# Title-case the interests, then canonicalize the separators in the column itself
data['Research Interests'] = (
    data['Research Interests'].str.title().map(_normalize_interests)
)

# Prepare the nodes data
professors = sorted(data['Professor Name'].unique())
# Unique, non-blank research interests across all rows
interests = sorted({
    interest
    for cell in data['Research Interests']
    for interest in cell.split(', ')
    if interest
})

In [None]:
def _unique_interests(cells):
    """Return the sorted, de-duplicated interests found in ``cells``.

    ``cells`` is an iterable of comma-separated interest strings. Each string
    is split on every comma, entries are stripped, and blank entries are
    dropped before de-duplication.
    """
    return sorted({
        token.strip()
        for cell in cells
        for token in cell.split(',')
        if token.strip()
    })


# Stripped professor names: links use the stripped name as their 'source',
# so node ids and group keys are built from the same stripped value.
professor_keys = data['Professor Name'].str.strip()

# Single pass over the rows: professor -> first non-blank profile link
# (stays '' when every row for that professor has a blank link).
link_by_professor = {}
for name, link in zip(professor_keys, data['Professor Link']):
    if not link_by_professor.get(name):
        link_by_professor[name] = link.strip()

# Create nodes with 'faculty' type (one per distinct stripped name)
faculty_nodes = [
    {
        'id': prof_id,
        'type': 'faculty',
        'googleScholarUrl': link_by_professor.get(prof_id, '')
    }
    for prof_id in sorted({prof.strip() for prof in professors})
]

# Create nodes with 'interest' type. Entries are re-split on ',' and
# de-duplicated after stripping so every link target has exactly one node.
interest_nodes = [
    {'id': interest_id, 'type': 'interest'}
    for interest_id in _unique_interests(interests)
]

# Combine nodes
nodes = faculty_nodes + interest_nodes

# Create a mapping of professors to their interests: a Series indexed by the
# stripped professor name whose values are sorted lists of unique interests.
professor_interest_map = data.groupby(professor_keys)['Research Interests'].apply(
    _unique_interests
)


In [None]:
# Prepare the links data: one professor -> interest edge per distinct pair
links = []
seen_pairs = set()  # (source, target) pairs already emitted
for professor, interests_list in professor_interest_map.items():
    source = professor.strip()
    for interest in interests_list:
        target = interest.strip()
        # Skip blank interests, and pairs that collapse onto an edge already
        # emitted once surrounding whitespace is stripped.
        if not target or (source, target) in seen_pairs:
            continue
        seen_pairs.add((source, target))
        links.append({
            'source': source,
            'target': target,
            'weight': 1  # Default weight
        })

In [None]:
def _link_sort_key(link):
    """Order links by source first, then by target."""
    return link['source'], link['target']


# Rebind `links` to a new list ordered by (source, target)
links = sorted(links, key=_link_sort_key)

In [None]:
# Output locations for the two JSON files
nodes_path = 'nodes_updated.json'
links_path = 'links_updated.json'

# Serialize nodes first, then links, each pretty-printed with a 4-space indent
for out_path, payload in ((nodes_path, nodes), (links_path, links)):
    with open(out_path, 'w') as out_file:
        json.dump(payload, out_file, indent=4)

In [None]:

# Trigger a browser download for each generated file (nodes first, then links)
for generated_path in (nodes_path, links_path):
    files.download(generated_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>