# Network of Cross-overs between Fandoms

In [2]:
import numpy as np
import pandas as pd
import networkx as nx
import seaborn as sns
import matplotlib.pyplot as plt
from itertools import combinations
import scipy
from bokeh.io import show, output_notebook
from bokeh.plotting import figure, from_networkx, output_file, save
from bokeh.models import HoverTool, ColorBar, LinearColorMapper
from bokeh.transform import linear_cmap
from bokeh.layouts import column
import matplotlib.colors as mcolors
from bokeh.embed import file_html
from bokeh.resources import CDN

np.random.seed(seed=42)

## Setting up

In [3]:
tags = pd.read_csv("tags-20210226.csv")
works = pd.read_csv("works-20210226.csv")

In [4]:
# process tags
works['tags'] = works['tags'].astype(str).apply(lambda tags: tags.split('+'))

fandoms = tags[tags['type'] == "Fandom"]

In [5]:
# Filter to only columns that are needed - merge will need a lot of compute
works_small = works[["tags", "word_count"]]
fandoms_small = fandoms[['id', 'name', 'cached_count']]
works_small

Unnamed: 0,tags,word_count
0,"[10, 414093, 1001939, 4577144, 1499536, 110, 4...",388.0
1,"[10, 20350917, 34816907, 23666027, 23269305, 2...",1638.0
2,"[10, 10613413, 9780526, 3763877, 3741104, 7657...",1502.0
3,"[10, 15322, 54862755, 20595867, 32994286, 663,...",100.0
4,"[11, 721553, 54604, 1439500, 3938423, 53483274...",994.0
...,...,...
7269688,"[78, 77, 84, 101, 104, 105, 106, 23, 13, 16, 7...",705.0
7269689,"[78, 77, 84, 107, 23, 10, 16, 70, 933, 616]",1392.0
7269690,"[77, 78, 69, 108, 109, 62, 110, 23, 9, 111, 16...",1755.0
7269691,"[112, 113, 13, 114, 16, 115, 101, 117, 118, 11...",1338.0


In [6]:
works_small['tags'][0]

['10', '414093', '1001939', '4577144', '1499536', '110', '4682892', '21', '16']

In [7]:
fandoms_small

Unnamed: 0,id,name,cached_count
25,27,Supernatural,310300
27,31,Redacted,5
33,37,Boondock Saints (1999),47
38,46,Lord of the Rings RPF,3538
44,56,Gravitation (Anime),6
...,...,...,...
14466583,55393953,Карнавальная ночь | Carnival Night (1956),0
14466639,55394121,Brain Dump (Web Series),0
14466697,55394295,Redacted,3
14466959,55395081,In the Reign of Terror - G. A. Henty,0


## Cleaning & reorg

### 1. check if the numbers in each list of each row are in the "id" column of another df "fandoms_small"

In [8]:
# Convert strings to integers in the "tags" column, handling NaN values
works_small['tags'] = works_small['tags'].apply(
    lambda x: [int(tag) for tag in x if tag != 'nan']
)

works_small['tags'][0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  works_small['tags'] = works_small['tags'].apply(


[10, 414093, 1001939, 4577144, 1499536, 110, 4682892, 21, 16]

In [None]:
# works_small.to_csv('works_small.csv', index=False)

In [9]:
# Convert the "id" column in fandoms_small to a set for faster lookup
fandom_ids = set(fandoms_small['id'].astype(int))

# Filter tags in works_small to only include those present in fandoms_small
works_small['filtered_tags'] = works_small['tags'].apply(lambda tags: [tag for tag in tags if tag in fandom_ids])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  works_small['filtered_tags'] = works_small['tags'].apply(lambda tags: [tag for tag in tags if tag in fandom_ids])


### make edgelist
2. of the numbers in that column, count how many times a pair of them appear, and return a new df with integer 1, integer 2, and the count of each pair's appearance

In [10]:
from itertools import combinations
from collections import Counter

# Generate all possible unique pairs from each list in the filtered_tags column
pairs = []

for tags in works_small['filtered_tags']:
    # Generate combinations (unique pairs) for each list
    if len(tags) > 1:
        pairs.extend(combinations(sorted(tags), 2))

# Count the occurrences of each pair
pair_counts = Counter(pairs)


In [11]:
# Convert the pair counts to a DataFrame
pair_counts_df = pd.DataFrame(
    pair_counts.items(), columns=['ids', 'count']
)

pair_counts_df[['integer_1', 'integer_2']] = pd.DataFrame(pair_counts_df['ids'].tolist(), index=pair_counts_df.index) 
pair_counts_df = pair_counts_df.drop(columns='ids')

pair_counts_df.head()

Unnamed: 0,count,integer_1,integer_2
0,83981,414093,1001939
1,128,3741104,3763877
2,249,3741104,9780526
3,109,3741104,10613413
4,177,3763877,9780526


### Get edgelist with fandom names

In [12]:
# Merge to get the first name
merged_df_1 = pair_counts_df.merge(fandoms_small[['id', 'name']], how='left', left_on='integer_1', right_on='id')

# Rename the columns for clarity
merged_df_1 = merged_df_1.rename(columns={'name': 'name_1'})
merged_df_1 = merged_df_1.drop(columns=['id'])

merged_df_1.head()


Unnamed: 0,count,integer_1,integer_2,name_1
0,83981,414093,1001939,Marvel Cinematic Universe
1,128,3741104,3763877,Boku no Hero Academia
2,249,3741104,9780526,Boku no Hero Academia
3,109,3741104,10613413,Boku no Hero Academia
4,177,3763877,9780526,BnHA


In [13]:
# Merge to get the first name
merged_df_2 = merged_df_1.merge(fandoms_small[['id', 'name']], how='left', left_on='integer_2', right_on='id')

# Rename the columns for clarity
merged_df_2 = merged_df_2.rename(columns={'name': 'name_2'})
merged_df_2 = merged_df_2.drop(columns=['id'])

merged_df_2.head()


Unnamed: 0,count,integer_1,integer_2,name_1,name_2
0,83981,414093,1001939,Marvel Cinematic Universe,The Avengers (Marvel Movies)
1,128,3741104,3763877,Boku no Hero Academia,BnHA
2,249,3741104,9780526,Boku no Hero Academia,My Hero Academia
3,109,3741104,10613413,Boku no Hero Academia,mha
4,177,3763877,9780526,BnHA,My Hero Academia


In [30]:
edgelist_df = merged_df_2

In [31]:
# Select and order the relevant columns
# edgelist_df = merged_df_2[['name_1', 'name_2', 'count']]

edgelist_df.head(20)

Unnamed: 0,count,integer_1,integer_2,name_1,name_2
0,83981,414093,1001939,Marvel Cinematic Universe,The Avengers (Marvel Movies)
1,128,3741104,3763877,Boku no Hero Academia,BnHA
2,249,3741104,9780526,Boku no Hero Academia,My Hero Academia
3,109,3741104,10613413,Boku no Hero Academia,mha
4,177,3763877,9780526,BnHA,My Hero Academia
5,280,3763877,10613413,BnHA,mha
6,138,9780526,10613413,My Hero Academia,mha
7,17742,11987966,33035890,魔道祖师 - 墨香铜臭 | Módào Zǔshī - Mòxiāng Tóngxiù,陈情令 | The Untamed (TV)
8,4,34431205,34782485,Redacted,Sands of Arawiya - Hafsah Faizal
9,4084,1633246,14988696,Dangan Ronpa - All Media Types,Dangan Ronpa: Trigger Happy Havoc


### Clean up edgelist

In [32]:
# edgelist_df = edgelist_df.sort_values(by="count", ascending=False)

In [33]:
len(edgelist_df)

1054762

In [34]:
# remove redacted
edgelist_df = edgelist_df.drop(edgelist_df[(edgelist_df.name_1 == "Redacted") | (edgelist_df.name_2 == "Redacted")].index)
len(edgelist_df)

757734

In [35]:
# remove edges between fandoms and "anime - fandom" and other possibilities
edgelist_df = edgelist_df.drop(edgelist_df[(edgelist_df.name_1 == "anime - Fandom") | (edgelist_df.name_2 == "anime - Fandom")].index)
edgelist_df = edgelist_df.drop(edgelist_df[(edgelist_df.name_1 == "animation - Fandom") | (edgelist_df.name_2 == "animation - Fandom")].index)
len(edgelist_df)

757239

In [36]:
# drop all combinations of bnha, boku no hero academia, mha, my hero academia etc
edgelist_df = edgelist_df.drop(index=[1, 2, 3, 4, 5, 6])
len(edgelist_df)

# probably more cases like this - mha, bnha etc, but none with high counts as far as i can tell. also some are valid, like "Star Wars - All Media Types" connected to the Clone Wars etc

757233

In [37]:
# check
edgelist_df.head(20)

Unnamed: 0,count,integer_1,integer_2,name_1,name_2
0,83981,414093,1001939,Marvel Cinematic Universe,The Avengers (Marvel Movies)
7,17742,11987966,33035890,魔道祖师 - 墨香铜臭 | Módào Zǔshī - Mòxiāng Tóngxiù,陈情令 | The Untamed (TV)
9,4084,1633246,14988696,Dangan Ronpa - All Media Types,Dangan Ronpa: Trigger Happy Havoc
10,16448,218280,254648,Video Blogging RPF,Minecraft (Video Game)
11,140,51823,251062,Forgotten Realms,Dungeons & Dragons (Roleplaying Game)
12,4437,1464,1801,Star Trek: The Original Series,Star Trek
13,3071,1464,601802,Star Trek: The Original Series,Star Trek: Alternate Original Series (Movies)
14,6753,1801,601802,Star Trek,Star Trek: Alternate Original Series (Movies)
15,242,3658,1281791,Tokyo Babylon,X -エックス- | X/1999
16,83,34606586,34674836,ケンガンアシュラ | Kengan Ashura (Manga),ケンガンアシュラ | Kengan Ashura (Anime)


In [38]:
# edgelist_df.to_csv('fandom_network_edgelist.csv', index=False)

## NetworkX

In [39]:
# create dict mapping of ids to fandom names to secure node attributes
id_fandom_mapping = dict(fandoms_small[['id', 'name']].values)

### Filter dataset

In [40]:
# cant run below code, takes too long -- need to find way to reduce dataset. maybe 1 graph w random sample, another with just the top 100?

# sns.histplot(data=edgelist_df, x="count", kde=True)

1. Random sample

In [41]:
sample_size = 100

sample_edgelist = edgelist_df.sample(sample_size)

2. Take top 100

In [42]:
top_num = 100

top_edgelist = edgelist_df.sort_values(by="count", ascending=False).head(100)

### Graph sample edgelist

In [43]:
sample_edgelist.head()

Unnamed: 0,count,integer_1,integer_2,name_1,name_2
753298,1,3132749,3235337,Star Wars: Rebels,Star Wars Legends: Legacy Era - All Media Types
957322,1,9151,226658,Band of Brothers,X-Men (Movies)
1035139,2,934,114828,Angel: the Series,Dresden Files - Jim Butcher
241876,1,106193,39566590,Super Sentai Series,淫妖蟲 ～凌触学園退魔録～ | Inyouchuu ~Ryoujoku Gakuen Tai...
727037,1,114961,3040670,The Hobbit - J. R. R. Tolkien,NCIS: New Orleans
...,...,...,...,...,...
328684,1,29577,131264,Star Wars Prequel Trilogy,Adventures of Huckleberry Finn - Mark Twain
690051,1,484757,3235649,Hawkeye (Comics),Star Wars Legends - All Media Types
187170,2,273721,26110751,Fate/Zero,Yu-Gi-Oh! Duel Monsters (Anime & Manga)
372733,14,13253,96689,Bandom,All-American Rejects


In [44]:
# needs to be indices so that bokeh can work

G_sample = nx.from_pandas_edgelist(sample_edgelist, source='integer_1', target='integer_2', edge_attr='count')
G_sample.nodes(data=True)

NodeDataView({3132749: {}, 3235337: {}, 9151: {}, 226658: {}, 934: {}, 114828: {}, 106193: {}, 39566590: {}, 114961: {}, 3040670: {}, 44235: {}, 8943664: {}, 685: {}, 5148298: {}, 13806: {}, 45591: {}, 3376487: {}, 3687869: {}, 25600176: {}, 25647039: {}, 524391: {}, 3101060: {}, 5876600: {}, 20965191: {}, 406: {}, 50170690: {}, 27306116: {}, 27306128: {}, 48402: {}, 44002918: {}, 15106: {}, 24067: {}, 739186: {}, 4192583: {}, 9630: {}, 28199: {}, 21252: {}, 13157464: {}, 49281637: {}, 52382346: {}, 15731: {}, 643028: {}, 119943: {}, 140887: {}, 858052: {}, 18536088: {}, 921793: {}, 1604734: {}, 218280: {}, 7500476: {}, 885: {}, 238454: {}, 448285: {}, 442710: {}, 865923: {}, 10767: {}, 725923: {}, 35751836: {}, 35857274: {}, 4039: {}, 305222: {}, 9830: {}, 114608: {}, 29577: {}, 20332791: {}, 30417: {}, 126504: {}, 18912129: {}, 29839922: {}, 722: {}, 2664768: {}, 39750406: {}, 42547171: {}, 19411: {}, 105412: {}, 105119: {}, 4130342: {}, 43537: {}, 254648: {}, 136512: {}, 730108: {},

In [45]:
nx.set_node_attributes(G_sample, name='fandom', values=id_fandom_mapping)

# node attributes for degree 
degrees = dict(nx.degree(G_sample))
nx.set_node_attributes(G_sample, name='degree', values=degrees)

### Graph top 100

In [46]:
top_edgelist.head()

Unnamed: 0,count,integer_1,integer_2,name_1,name_2
0,83981,414093,1001939,Marvel Cinematic Universe,The Avengers (Marvel Movies)
46,51455,414093,586439,Marvel Cinematic Universe,Captain America (Movies)
47,35409,586439,1001939,Captain America (Movies),The Avengers (Marvel Movies)
156,31923,116304,242462,A Song of Ice and Fire - George R. R. Martin,Game of Thrones (TV)
366,27523,727114,1001939,The Avengers (Marvel) - All Media Types,The Avengers (Marvel Movies)


In [48]:
# needs to be indices so that bokeh can work

G_top = nx.from_pandas_edgelist(top_edgelist, source='integer_1', target='integer_2', edge_attr='count')
G_top.nodes(data=True)

NodeDataView({414093: {}, 1001939: {}, 586439: {}, 116304: {}, 242462: {}, 727114: {}, 226657: {}, 105692: {}, 133185: {}, 7266: {}, 114591: {}, 27251507: {}, 101375: {}, 6048501: {}, 245368: {}, 858574: {}, 1080663: {}, 406: {}, 27785: {}, 11987966: {}, 33035890: {}, 218280: {}, 254648: {}, 827055: {}, 29577: {}, 32833447: {}, 747342: {}, 2007008: {}, 578887: {}, 541478: {}, 873394: {}, 82576: {}, 236208: {}, 5407051: {}, 115613: {}, 224545: {}, 10801369: {}, 105412: {}, 114961: {}, 299357: {}, 29890172: {}, 229522: {}, 130638: {}, 390: {}, 658827: {}, 134900: {}, 34555820: {}, 1633246: {}, 11902568: {}, 969647: {}, 29576: {}, 1801: {}, 601802: {}, 28642: {}, 711035: {}, 879346: {}, 235690: {}, 23985107: {}, 116314: {}, 452309: {}, 219012: {}, 726906: {}, 21872622: {}, 34135873: {}, 662604: {}, 25760640: {}, 169: {}, 13253: {}, 102330: {}, 133386: {}, 21927: {}, 96137: {}, 448284: {}, 452874: {}, 10223: {}, 22034: {}, 81269: {}, 4231202: {}, 145124: {}, 741433: {}, 289604: {}, 870188:

In [49]:
nx.set_node_attributes(G_top, name='fandom', values=id_fandom_mapping)

# node attributes for degree 
degrees = dict(nx.degree(G_top))
nx.set_node_attributes(G_top, name='degree', values=degrees)