# Network of Cross-overs between Fandoms

In [1]:
import numpy as np
import pandas as pd
import networkx as nx
import seaborn as sns
import matplotlib.pyplot as plt
from itertools import combinations
import scipy
from bokeh.io import show, output_notebook
from bokeh.plotting import figure, from_networkx, output_file, save
from bokeh.models import HoverTool, ColorBar, LinearColorMapper
from bokeh.transform import linear_cmap
from bokeh.layouts import column
import matplotlib.colors as mcolors
from bokeh.embed import file_html
from bokeh.resources import CDN

np.random.seed(seed=42)

## Setting up

In [2]:
tags = pd.read_csv("tags-20210226.csv")
works = pd.read_csv("works-20210226.csv")

In [4]:
# process tags
works['tags'] = works['tags'].astype(str).apply(lambda tags: tags.split('+'))

fandoms = tags[tags['type'] == "Fandom"]

In [5]:
# Filter to only columns that are needed - merge will need a lot of compute
works_small = works[["tags", "word_count"]]
fandoms_small = fandoms[['id', 'name', 'cached_count']]
works_small

Unnamed: 0,tags,word_count
0,"[10, 414093, 1001939, 4577144, 1499536, 110, 4...",388.0
1,"[10, 20350917, 34816907, 23666027, 23269305, 2...",1638.0
2,"[10, 10613413, 9780526, 3763877, 3741104, 7657...",1502.0
3,"[10, 15322, 54862755, 20595867, 32994286, 663,...",100.0
4,"[11, 721553, 54604, 1439500, 3938423, 53483274...",994.0
...,...,...
7269688,"[78, 77, 84, 101, 104, 105, 106, 23, 13, 16, 7...",705.0
7269689,"[78, 77, 84, 107, 23, 10, 16, 70, 933, 616]",1392.0
7269690,"[77, 78, 69, 108, 109, 62, 110, 23, 9, 111, 16...",1755.0
7269691,"[112, 113, 13, 114, 16, 115, 101, 117, 118, 11...",1338.0


In [7]:
works_small['tags'][0]

['10', '414093', '1001939', '4577144', '1499536', '110', '4682892', '21', '16']

In [8]:
fandoms_small

Unnamed: 0,id,name,cached_count
25,27,Supernatural,310300
27,31,Redacted,5
33,37,Boondock Saints (1999),47
38,46,Lord of the Rings RPF,3538
44,56,Gravitation (Anime),6
...,...,...,...
14466583,55393953,Карнавальная ночь | Carnival Night (1956),0
14466639,55394121,Brain Dump (Web Series),0
14466697,55394295,Redacted,3
14466959,55395081,In the Reign of Terror - G. A. Henty,0


## Cleaning & reorg

### 1. check if the numbers in each list of each row are in the "id" column of another df "fandoms_small"

In [11]:
# Convert strings to integers in the "tags" column, handling NaN values
works_small['tags'] = works_small['tags'].apply(
    lambda x: [int(tag) for tag in x if tag != 'nan']
)

works_small['tags'][0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  works_small['tags'] = works_small['tags'].apply(


[10, 414093, 1001939, 4577144, 1499536, 110, 4682892, 21, 16]

In [12]:
# Convert the "id" column in fandoms_small to a set for faster lookup
fandom_ids = set(fandoms_small['id'].astype(int))

# Filter tags in works_small to only include those present in fandoms_small
works_small['filtered_tags'] = works_small['tags'].apply(lambda tags: [tag for tag in tags if tag in fandom_ids])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  works_small['filtered_tags'] = works_small['tags'].apply(lambda tags: [tag for tag in tags if tag in fandom_ids])


In [13]:
works_small.head()

Unnamed: 0,tags,word_count,filtered_tags
0,"[10, 414093, 1001939, 4577144, 1499536, 110, 4...",388.0,"[414093, 1001939]"
1,"[10, 20350917, 34816907, 23666027, 23269305, 2...",1638.0,[20350917]
2,"[10, 10613413, 9780526, 3763877, 3741104, 7657...",1502.0,"[10613413, 9780526, 3763877, 3741104]"
3,"[10, 15322, 54862755, 20595867, 32994286, 663,...",100.0,[15322]
4,"[11, 721553, 54604, 1439500, 3938423, 53483274...",994.0,[721553]
...,...,...,...
7269688,"[78, 77, 84, 101, 104, 105, 106, 23, 13, 16, 7...",705.0,[70]
7269689,"[78, 77, 84, 107, 23, 10, 16, 70, 933, 616]",1392.0,[70]
7269690,"[77, 78, 69, 108, 109, 62, 110, 23, 9, 111, 16...",1755.0,[70]
7269691,"[112, 113, 13, 114, 16, 115, 101, 117, 118, 11...",1338.0,[114]


### make edgelist
2. of the numbers in that column, count how many times a pair of them appear, and return a new df with integer 1, integer 2, and the count of each pair's appearance

In [14]:
from itertools import combinations
from collections import Counter

# Generate all possible unique pairs from each list in the filtered_tags column
pairs = []

for tags in works_small['filtered_tags']:
    # Generate combinations (unique pairs) for each list
    if len(tags) > 1:
        pairs.extend(combinations(sorted(tags), 2))

# Count the occurrences of each pair
pair_counts = Counter(pairs)


In [20]:
# Convert the pair counts to a DataFrame
pair_counts_df = pd.DataFrame(
    pair_counts.items(), columns=['ids', 'count']
)

pair_counts_df[['integer_1', 'integer_2']] = pd.DataFrame(pair_counts_df['ids'].tolist(), index=pair_counts_df.index) 
pair_counts_df = pair_counts_df.drop(columns='ids')

pair_counts_df.head()

Unnamed: 0,count,integer_1,integer_2
0,83981,414093,1001939
1,128,3741104,3763877
2,249,3741104,9780526
3,109,3741104,10613413
4,177,3763877,9780526
...,...,...,...
1054757,1,431,885
1054758,1,431,30204
1054759,1,266,2095
1054760,1,346,2449


### Get edgelist with fandom names

In [21]:
# Merge to get the first name
merged_df_1 = pair_counts_df.merge(fandoms_small[['id', 'name']], how='left', left_on='integer_1', right_on='id')

# Rename the columns for clarity
merged_df_1 = merged_df_1.rename(columns={'name': 'name_1'})
merged_df_1 = merged_df_1.drop(columns=['id'])

merged_df_1.head()


Unnamed: 0,count,integer_1,integer_2,name_1
0,83981,414093,1001939,Marvel Cinematic Universe
1,128,3741104,3763877,Boku no Hero Academia
2,249,3741104,9780526,Boku no Hero Academia
3,109,3741104,10613413,Boku no Hero Academia
4,177,3763877,9780526,BnHA


In [22]:
# Merge to get the first name
merged_df_2 = merged_df_1.merge(fandoms_small[['id', 'name']], how='left', left_on='integer_2', right_on='id')

# Rename the columns for clarity
merged_df_2 = merged_df_2.rename(columns={'name': 'name_2'})
merged_df_2 = merged_df_2.drop(columns=['id'])

merged_df_2.head()


Unnamed: 0,count,integer_1,integer_2,name_1,name_2
0,83981,414093,1001939,Marvel Cinematic Universe,The Avengers (Marvel Movies)
1,128,3741104,3763877,Boku no Hero Academia,BnHA
2,249,3741104,9780526,Boku no Hero Academia,My Hero Academia
3,109,3741104,10613413,Boku no Hero Academia,mha
4,177,3763877,9780526,BnHA,My Hero Academia


In [63]:
edgelist_df = merged_df_2

In [55]:
# Select and order the relevant columns
edgelist_df = merged_df_2[['name_1', 'name_2', 'count']]

edgelist_df.head(20)

Unnamed: 0,name_1,name_2,count
0,Marvel Cinematic Universe,The Avengers (Marvel Movies),83981
1,Boku no Hero Academia,BnHA,128
2,Boku no Hero Academia,My Hero Academia,249
3,Boku no Hero Academia,mha,109
4,BnHA,My Hero Academia,177
5,BnHA,mha,280
6,My Hero Academia,mha,138
7,魔道祖师 - 墨香铜臭 | Módào Zǔshī - Mòxiāng Tóngxiù,陈情令 | The Untamed (TV),17742
8,Redacted,Sands of Arawiya - Hafsah Faizal,4
9,Dangan Ronpa - All Media Types,Dangan Ronpa: Trigger Happy Havoc,4084


### Clean up edgelist

In [50]:
# edgelist_df = edgelist_df.sort_values(by="count", ascending=False)

In [64]:
len(edgelist_df)

1054762

In [65]:
# remove redacted
edgelist_df = edgelist_df.drop(edgelist_df[(edgelist_df.name_1 == "Redacted") | (edgelist_df.name_2 == "Redacted")].index)
len(edgelist_df)

757734

In [66]:
# remove edges between fandoms and "anime - fandom" and other possibilities
edgelist_df = edgelist_df.drop(edgelist_df[(edgelist_df.name_1 == "anime - Fandom") | (edgelist_df.name_2 == "anime - Fandom")].index)
edgelist_df = edgelist_df.drop(edgelist_df[(edgelist_df.name_1 == "animation - Fandom") | (edgelist_df.name_2 == "animation - Fandom")].index)
len(edgelist_df)

757239

In [67]:
# drop all combinations of bnha, boku no hero academia, mha, my hero academia etc
edgelist_df = edgelist_df.drop(index=[1, 2, 3, 4, 5, 6])
len(edgelist_df)

# probably more cases like this - mha, bnha etc, but none with high counts as far as i can tell. also some are valid, like "Star Wars - All Media Types" connected to the Clone Wars etc

757233

In [68]:
# check
edgelist_df.head(20)

Unnamed: 0,count,integer_1,integer_2,name_1,name_2
0,83981,414093,1001939,Marvel Cinematic Universe,The Avengers (Marvel Movies)
7,17742,11987966,33035890,魔道祖师 - 墨香铜臭 | Módào Zǔshī - Mòxiāng Tóngxiù,陈情令 | The Untamed (TV)
9,4084,1633246,14988696,Dangan Ronpa - All Media Types,Dangan Ronpa: Trigger Happy Havoc
10,16448,218280,254648,Video Blogging RPF,Minecraft (Video Game)
11,140,51823,251062,Forgotten Realms,Dungeons & Dragons (Roleplaying Game)
12,4437,1464,1801,Star Trek: The Original Series,Star Trek
13,3071,1464,601802,Star Trek: The Original Series,Star Trek: Alternate Original Series (Movies)
14,6753,1801,601802,Star Trek,Star Trek: Alternate Original Series (Movies)
15,242,3658,1281791,Tokyo Babylon,X -エックス- | X/1999
16,83,34606586,34674836,ケンガンアシュラ | Kengan Ashura (Manga),ケンガンアシュラ | Kengan Ashura (Anime)


In [None]:
# edgelist_df.to_csv('fandom_network_edgelist.csv', index=False)

## NetworkX

In [None]:
# create dict mapping of ids to fandom names to secure node attributes
id_fandom_mapping = dict(fandoms_small[['id', 'name']].values)

### Filter dataset

In [79]:
# cant run below code, takes too long -- need to find way to reduce dataset. maybe 1 graph w random sample, another with just the top 100?

# sns.histplot(data=edgelist_df, x="count", kde=True)

1. Random sample

In [None]:
sample_size = 100

sample_edgelist = edgelist_df.sample(sample_size)

2. Take top 100

In [1]:
top_num = 100

top_edgelist = edgelist_df.sort_values(by="count", ascending=False).head(100)

NameError: name 'edgelist_df' is not defined

### Graph sample edgelist

In [69]:
# needs to be indices so that bokeh can work

G_sample = nx.from_pandas_edgelist(sample_edgelist, source='integer_1', target='integer_2', edge_attr='count')
G_sample.nodes(data=True)

NodeDataView({414093: {}, 1001939: {}, 11987966: {}, 33035890: {}, 1633246: {}, 14988696: {}, 218280: {}, 254648: {}, 51823: {}, 251062: {}, 1464: {}, 1801: {}, 601802: {}, 3658: {}, 1281791: {}, 34606586: {}, 34674836: {}, 12845: {}, 250093: {}, 7266: {}, 21872622: {}, 34135873: {}, 43220: {}, 586439: {}, 18100257: {}, 18197556: {}, 114548: {}, 35398640: {}, 101375: {}, 747342: {}, 244259: {}, 3828398: {}, 196510: {}, 463728: {}, 587792: {}, 1635478: {}, 2282814: {}, 5148298: {}, 5171863: {}, 46023568: {}, 87508: {}, 21364713: {}, 47822806: {}, 50934729: {}, 236870: {}, 1118749: {}, 20169792: {}, 579568: {}, 303506: {}, 26238188: {}, 136512: {}, 24757434: {}, 26740493: {}, 1113845: {}, 21679434: {}, 23167083: {}, 2692: {}, 19361860: {}, 35341370: {}, 40750387: {}, 40750390: {}, 41082133: {}, 44392423: {}, 45140518: {}, 47402860: {}, 299357: {}, 29890172: {}, 275900: {}, 23354435: {}, 767851: {}, 11476789: {}, 115633: {}, 31516237: {}, 47555740: {}, 38231911: {}, 38231914: {}, 930966: 

In [78]:
nx.set_node_attributes(G_sample, name='fandom', values=id_fandom_mapping)

# node attributes for degree
degrees = dict(nx.degree(G_sample))
nx.set_node_attributes(G_sample, name='degree', values=degrees)

### Graph top 100

{27: 'Supernatural',
 31: 'Redacted',
 37: 'Boondock Saints (1999)',
 46: 'Lord of the Rings RPF',
 56: 'Gravitation (Anime)',
 65: 'Avatar: The Last Airbender',
 70: 'Stargate Atlantis',
 83: 'Chronicles of Narnia',
 86: 'Redacted',
 90: 'American Idol RPF',
 91: 'HP',
 114: "Grey's Anatomy",
 121: 'American Idol',
 129: 'Farscape',
 162: 'Stargate: Atlantis',
 164: 'The Dead Zone',
 169: 'My Chemical Romance',
 183: 'Harry Potter - Rowling',
 187: 'Smallville',
 196: 'CSI: Las Vegas',
 203: 'Torchwood',
 206: 'House',
 250: 'Firefly',
 259: 'Pirates of the Caribbean',
 266: 'Hellsing',
 272: 'House M.D.',
 273: 'The Sentinel',
 287: 'Final Fantasy XII',
 288: 'Song Of The Lioness',
 298: 'Redacted',
 328: 'Redacted',
 337: 'due South',
 340: 'Manic Street Preachers',
 346: 'Die Hard',
 353: 'Friday Night Lights',
 365: 'NSYNC',
 370: 'Good Omens',
 379: 'Donald Strachey Mysteries',
 383: 'Buffy',
 386: 'The X-Files',
 390: 'DCU',
 403: 'Redacted',
 406: 'Doctor Who',
 413: 'Savage Ga