In [2]:
import numpy as np
import pandas as pd
import networkx as nx
import seaborn as sns
import matplotlib.pyplot as plt
from itertools import combinations
import scipy
from pyvis.network import Network


In [3]:
# The edge list on disk is already narrowed down to pairs with count > 100.
# "Unnamed: 0" is dropped right after reading (presumably a saved index column — confirm).
fandoms_small = pd.read_csv("fandoms_small.csv")
edgelist_df = pd.read_csv("edgelist_df_small_fandoms.csv").drop("Unnamed: 0", axis="columns")
edgelist_df.head()

Unnamed: 0,count,integer_1,integer_2,name_1,name_2
0,83981,414093,1001939,Marvel Cinematic Universe,The Avengers (Marvel Movies)
1,17742,11987966,33035890,魔道祖师 - 墨香铜臭 | Módào Zǔshī - Mòxiāng Tóngxiù,陈情令 | The Untamed (TV)
2,4084,1633246,14988696,Dangan Ronpa - All Media Types,Dangan Ronpa: Trigger Happy Havoc
3,16448,218280,254648,Video Blogging RPF,Minecraft (Video Game)
4,140,51823,251062,Forgotten Realms,Dungeons & Dragons (Roleplaying Game)


In [4]:
# Number of edges (rows) loaded.
edgelist_df.shape[0]

3444

In [5]:
# Keep only edges whose count is strictly above the threshold, then report how many remain.
min_count_exclusive = 100
edgelist_df = edgelist_df.loc[edgelist_df['count'].gt(min_count_exclusive)]
len(edgelist_df)

3444

In [6]:
# Build an undirected graph from the edge list: nodes are the fandom names in
# `name_1` / `name_2`, and each row's `count` is stored as an edge attribute.
# A simple nx.Graph is used instead of nx.MultiGraph because
# nx.eigenvector_centrality (used in the Centrality section) is not implemented
# for multigraphs and would raise on a fresh Restart & Run All.
G = nx.from_pandas_edgelist(
    edgelist_df,
    source='name_1',
    target='name_2',
    edge_attr='count',
    create_using=nx.Graph,
)
G.nodes(data=True)

NodeDataView({'Marvel Cinematic Universe': {}, 'The Avengers (Marvel Movies)': {}, '魔道祖师 - 墨香铜臭 | Módào Zǔshī - Mòxiāng Tóngxiù': {}, '陈情令 | The Untamed (TV)': {}, 'Dangan Ronpa - All Media Types': {}, 'Dangan Ronpa: Trigger Happy Havoc': {}, 'Video Blogging RPF': {}, 'Minecraft (Video Game)': {}, 'Forgotten Realms': {}, 'Dungeons & Dragons (Roleplaying Game)': {}, 'Star Trek: The Original Series': {}, 'Star Trek': {}, 'Star Trek: Alternate Original Series (Movies)': {}, 'Tokyo Babylon': {}, 'X -エックス- | X/1999': {}, 'Marvel': {}, '鬼滅の刃 | Kimetsu no Yaiba (Manga)': {}, '鬼滅の刃 | Demon Slayer: Kimetsu no Yaiba (Anime)': {}, 'Captain America (Movies)': {}, 'His Dark Materials - Philip Pullman': {}, 'His Dark Materials (TV)': {}, 'Star Wars - All Media Types': {}, 'Star Wars: The Clone Wars (2008) - All Media Types': {}, 'Katekyou Hitman Reborn!': {}, '僕のヒーローアカデミア | Boku no Hero Academia | My Hero Academia': {}, 'Arrow (TV 2012)': {}, 'The Flash (TV 2014)': {}, 'Supergirl (TV 2015)': {}, "DC

In [7]:
# fandom id -> fandom name
id_fandom_mapping = dict(fandoms_small[['id', 'name']].to_numpy())

# fandom name -> cached_count (despite the "id_" prefix, the keys here are names)
id_cached_count_mapping = dict(fandoms_small[['name', 'cached_count']].to_numpy())

# fandom name -> log1p(cached_count); the log compresses the very wide spread of raw counts
adjusted_cached_count_mapping = {
    fandom: np.log1p(raw_count)
    for fandom, raw_count in id_cached_count_mapping.items()
}

In [8]:
def convert_int(d):
    """Replace NumPy integer values in ``d`` with built-in Python ints.

    The dict is modified in place and also returned, so both
    ``convert_int(d)`` and ``d = convert_int(d)`` work.

    Any NumPy integer type (``np.int64``, ``np.int32``, ``np.uint8``, ...) is
    converted; all other values (strings, floats, Python ints) are left as-is.

    Parameters
    ----------
    d : dict
        Mapping whose values may contain NumPy integer scalars.

    Returns
    -------
    dict
        The same dict object that was passed in.
    """
    # Snapshot the items so we never iterate a dict while assigning into it.
    for key, value in list(d.items()):
        if isinstance(value, np.integer):
            d[key] = int(value)
    return d

# Normalise NumPy integer values in all three mappings to plain Python ints.
id_fandom_mapping, id_cached_count_mapping, adjusted_cached_count_mapping = (
    convert_int(mapping)
    for mapping in (
        id_fandom_mapping,
        id_cached_count_mapping,
        adjusted_cached_count_mapping,
    )
)

In [9]:
# Node attribute: fandom name.
# G's nodes are keyed by fandom *name*, while id_fandom_mapping is keyed by fandom
# *id*. Passing it directly matches no node, and nx.set_node_attributes silently
# skips keys that are not nodes, so the attribute was never set. Key by name instead.
fandom_name_mapping = {fandom: fandom for fandom in id_fandom_mapping.values()}
nx.set_node_attributes(G, name='fandom', values=fandom_name_mapping)

# Node attribute: cached_count of each fandom (intended for colouring nodes).
nx.set_node_attributes(G, name='cached_count', values=id_cached_count_mapping)

# Node attribute: log-scaled cached_count, because the raw counts are too far apart.
nx.set_node_attributes(G, name='adjusted_cached_count', values=adjusted_cached_count_mapping)

# Node attribute: degree.
degrees_all = dict(nx.degree(G))
nx.set_node_attributes(G, name='degree', values=degrees_all)

# Node attribute: degree scaled down by a constant factor, for use as a node size.
number_to_adjust_by = 10
adjusted_node_size = {
    node: degree / number_to_adjust_by for node, degree in degrees_all.items()
}
nx.set_node_attributes(G, name='adjusted_node_size', values=adjusted_node_size)

# Shortest Path

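Most well-connected fandom by eigenvector centrality: Harry Potter - J. K. Rowling. (Marvel Cinematic Universe has the highest degree centrality and is the default target used below.)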

In [10]:
def shortest_hp(user_input, target="Marvel Cinematic Universe", no_path_value=0):
    """Return the number of edges on the shortest path from ``user_input`` to ``target``.

    The search runs on the notebook-level graph ``G`` and ignores edge weights.

    Parameters
    ----------
    user_input : hashable
        Node in ``G`` to start from (a fandom name).
    target : hashable, optional
        Node in ``G`` to reach. Defaults to ``"Marvel Cinematic Universe"``.
    no_path_value : object, optional
        Value returned when no path connects the two nodes. The default ``0``
        keeps the original behaviour, but note that ``0`` is also the distance
        from ``target`` to itself; pass e.g. ``None`` or ``float("inf")`` to
        tell the two cases apart.

    Returns
    -------
    int or type(no_path_value)
        The shortest path length, or ``no_path_value`` if the nodes are not connected.

    Raises
    ------
    networkx.NodeNotFound
        Propagated from networkx when a requested node is not in ``G``.
    """
    try:
        return nx.shortest_path_length(G, source=user_input, target=target)
    except nx.NetworkXNoPath:
        # The two nodes lie in different connected components.
        return no_path_value

In [11]:
# Example lookup against the default target.
# The Avengers (Marvel Movies)
shortest_hp(user_input="Steve Rogers - Fandom")

1

In [13]:
# Distance (in edges) from every node to the MCU node; 0 means "no path"
# (and is also the MCU node's own distance), matching shortest_hp's convention.
# G is undirected, so one breadth-first search from the MCU node gives the same
# distances as a separate shortest-path query per node, in a single pass.
mcu_node = "Marvel Cinematic Universe"  # must match shortest_hp's default target
mcu_distances = nx.single_source_shortest_path_length(G, mcu_node)
mcu_num = {node: mcu_distances.get(node, 0) for node in G.nodes}


In [16]:
# Nodes ordered from farthest to nearest by their recorded distance.
farthest_first = sorted(mcu_num.items(), key=lambda pair: pair[1], reverse=True)
dict(farthest_first)

{'FF14': 8,
 'FFXIV': 8,
 'Final Fantasy XIII-2': 8,
 'Final Fantasy XIII Series': 8,
 'Lightning Returns: Final Fantasy XIII': 8,
 'Final Fantasy X-2': 8,
 'Mo Dao Zu Shi': 7,
 '魔道祖师 | Módào Zǔshī (Cartoon)': 7,
 'Final Fantasy VII Remake (Video Game 2020)': 7,
 'Crisis Core: Final Fantasy VII': 7,
 'Final Fantasy VII (Video Game 1997)': 7,
 '天官赐福 - 墨香铜臭 | Tiān Guān Cì Fú - Mòxiāng Tóngxiù': 7,
 'Final Fantasy VII: Advent Children': 7,
 'Final Fantasy XIV': 7,
 'Grandmaster of Demonic Cultivation': 7,
 'Before Crisis: Final Fantasy VII': 7,
 'Dirge of Cerberus: Final Fantasy VII': 7,
 "人渣反派自救系统 - 墨香铜臭 | The Scum Villain's Self-Saving System - Mòxiāng Tóngxiù": 7,
 'The Grandmaster of Demonic Cultivation': 7,
 'The Founder of Diabolism': 7,
 'Final Fantasy XIII': 7,
 'FFXV - Fandom': 7,
 'Final Fantasy 15': 7,
 'Final Fantasy X': 7,
 '博肖 - Fandom': 7,
 'Kingsglaive': 7,
 '魔道祖师 - 墨香铜臭 | Módào Zǔshī - Mòxiāng Tóngxiù': 6,
 '陈情令 | The Untamed (TV)': 6,
 'Compilation of Final Fantasy VII':

In [19]:
# How many nodes have a recorded distance of 0.
list(mcu_num.values()).count(0)

806

In [166]:
# Peek at one randomly chosen edge.
edgelist_df.sample(n=1)

Unnamed: 0,count,integer_1,integer_2,name_1,name_2
1964,179,71059,449545,Taylor Swift (Musician),One Direction (Band)


In [137]:
# Edges whose first endpoint's name contains "Star Wars".
is_star_wars = edgelist_df['name_1'].str.contains("Star Wars")
edgelist_df.loc[is_star_wars]

Unnamed: 0,count,integer_1,integer_2,name_1,name_2
14,12260,101375,747342,Star Wars - All Media Types,Star Wars: The Clone Wars (2008) - All Media T...
42,2989,101375,31516237,Star Wars - All Media Types,The Mandalorian (TV)
53,23519,101375,6048501,Star Wars - All Media Types,Star Wars Sequel Trilogy
78,12969,29577,101375,Star Wars Prequel Trilogy,Star Wars - All Media Types
79,6450,29577,747342,Star Wars Prequel Trilogy,Star Wars: The Clone Wars (2008) - All Media T...
...,...,...,...,...,...
3267,106,29767,29773,Star Wars Episode IV: A New Hope,Star Wars Episode V: Empire Strikes Back
3269,102,19796,29767,Star Wars Episode VI: Return of the Jedi,Star Wars Episode IV: A New Hope
3270,117,19796,29773,Star Wars Episode VI: Return of the Jedi,Star Wars Episode V: Empire Strikes Back
3286,187,101375,15818487,Star Wars - All Media Types,Star Wars: Thrawn - Timothy Zahn


In [120]:
# Confirm which graph class G is.
print(G.__class__)

<class 'networkx.classes.graph.Graph'>


# Centrality

In [149]:
# Node with the highest degree centrality. Compute the centrality dict once and
# reuse it for both the candidates and the key function.
degree_centrality = nx.degree_centrality(G)
max(degree_centrality, key=degree_centrality.get)


'Marvel Cinematic Universe'

In [148]:
# Degree centrality of every node, highest first.
ranked_degree_centrality = sorted(
    nx.degree_centrality(G).items(),
    key=lambda pair: pair[1],
    reverse=True,
)
dict(ranked_degree_centrality)


{'Marvel Cinematic Universe': 0.07203151378728194,
 'Harry Potter - J. K. Rowling': 0.05458638154192459,
 'The Avengers (Marvel Movies)': 0.051772650534608895,
 'Marvel': 0.04783342712436691,
 'Batman - All Media Types': 0.045019696117051214,
 'DCU': 0.04164321890827237,
 'The Avengers (Marvel) - All Media Types': 0.03657850309510411,
 'Supernatural': 0.03601575689364097,
 'DCU (Comics)': 0.03545301069217783,
 'Captain America (Movies)': 0.03151378728193585,
 'Batman (Comics)': 0.02982554867754643,
 'Thor (Movies)': 0.028137310073157007,
 'Iron Man (Movies)': 0.028137310073157007,
 'Star Wars - All Media Types': 0.027011817670230726,
 'Marvel (Comics)': 0.023635340461451885,
 'Spider-Man - All Media Types': 0.022509848058525607,
 'Sherlock (TV)': 0.020821609454136185,
 'Video Blogging RPF': 0.018570624648283626,
 'Original Work': 0.016882386043894203,
 '僕のヒーローアカデミア | Boku no Hero Academia | My Hero Academia': 0.016319639842431063,
 'X-Men - All Media Types': 0.016319639842431063,
 'Mar

In [54]:
# Node with the highest eigenvector centrality. The centrality is computed once
# and reused, instead of running the iterative solver twice.
eigenvector_centrality = nx.eigenvector_centrality(G)
max(eigenvector_centrality, key=eigenvector_centrality.get)

'Harry Potter - J. K. Rowling'

In [56]:
# Same check with the NumPy-based solver. The centrality is computed once and
# reused, instead of solving the eigenproblem twice.
eigenvector_centrality_np = nx.eigenvector_centrality_numpy(G)
max(eigenvector_centrality_np, key=eigenvector_centrality_np.get)

'Harry Potter - J. K. Rowling'