# Network of Cross-overs between Fandoms

In [1]:
import numpy as np
import pandas as pd
import networkx as nx
import seaborn as sns
import matplotlib.pyplot as plt
from itertools import combinations
import scipy
from bokeh.io import show, output_notebook
from bokeh.plotting import figure, from_networkx, output_file, save
from bokeh.models import HoverTool, ColorBar, LinearColorMapper
from bokeh.transform import linear_cmap
from bokeh.layouts import column
import matplotlib.colors as mcolors
from bokeh.embed import file_html
from bokeh.resources import CDN

## Setting up

In [2]:
tags = pd.read_csv("tags-20210226.csv")
works = pd.read_csv("works-20210226.csv")

In [3]:
# process tags
works['tags'] = works['tags'].astype(str).apply(lambda tags: tags.split('+'))

fandoms = tags[tags['type'] == "Fandom"]

In [4]:
# Filter to only columns that are needed - merge will need a lot of compute
works_small = works[["tags", "word_count"]]
fandoms_small = fandoms[['id', 'name', 'cached_count']]
works_small

Unnamed: 0,tags,word_count
0,"[10, 414093, 1001939, 4577144, 1499536, 110, 4...",388.0
1,"[10, 20350917, 34816907, 23666027, 23269305, 2...",1638.0
2,"[10, 10613413, 9780526, 3763877, 3741104, 7657...",1502.0
3,"[10, 15322, 54862755, 20595867, 32994286, 663,...",100.0
4,"[11, 721553, 54604, 1439500, 3938423, 53483274...",994.0
...,...,...
7269688,"[78, 77, 84, 101, 104, 105, 106, 23, 13, 16, 7...",705.0
7269689,"[78, 77, 84, 107, 23, 10, 16, 70, 933, 616]",1392.0
7269690,"[77, 78, 69, 108, 109, 62, 110, 23, 9, 111, 16...",1755.0
7269691,"[112, 113, 13, 114, 16, 115, 101, 117, 118, 11...",1338.0


In [5]:
works_small['tags'][0]

['10', '414093', '1001939', '4577144', '1499536', '110', '4682892', '21', '16']

## Cleaning & reorg

### 1. check if the numbers in each list of each row are in the "id" column of another df "fandoms_small"

In [6]:
# Convert strings to integers in the "tags" column, handling NaN values
works_small['tags'] = works_small['tags'].apply(
    lambda x: [int(tag) for tag in x if tag != 'nan']
)

works_small['tags'][0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  works_small['tags'] = works_small['tags'].apply(


[10, 414093, 1001939, 4577144, 1499536, 110, 4682892, 21, 16]

In [7]:
# works_small.to_csv('works_small.csv', index=False)

In [8]:
# Convert the "id" column in fandoms_small to a set for faster lookup
fandom_ids = set(fandoms_small['id'].astype(int))

# Filter tags in works_small to only include those present in fandoms_small
works_small['filtered_tags'] = works_small['tags'].apply(lambda tags: [tag for tag in tags if tag in fandom_ids])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  works_small['filtered_tags'] = works_small['tags'].apply(lambda tags: [tag for tag in tags if tag in fandom_ids])


### make edgelist
2. of the numbers in that column, count how many times a pair of them appear, and return a new df with integer 1, integer 2, and the count of each pair's appearance

In [9]:
from itertools import combinations
from collections import Counter

# Generate all possible unique pairs from each list in the filtered_tags column
pairs = []

for tags in works_small['filtered_tags']:
    # Generate combinations (unique pairs) for each list
    if len(tags) > 1:
        pairs.extend(combinations(sorted(tags), 2))

# Count the occurrences of each pair
pair_counts = Counter(pairs)


In [10]:
# Convert the pair counts to a DataFrame
pair_counts_df = pd.DataFrame(
    pair_counts.items(), columns=['ids', 'count']
)

pair_counts_df[['integer_1', 'integer_2']] = pd.DataFrame(pair_counts_df['ids'].tolist(), index=pair_counts_df.index) 
pair_counts_df = pair_counts_df.drop(columns='ids')

pair_counts_df.head()

Unnamed: 0,count,integer_1,integer_2
0,83981,414093,1001939
1,128,3741104,3763877
2,249,3741104,9780526
3,109,3741104,10613413
4,177,3763877,9780526


### Get edgelist with fandom names

In [11]:
# Merge to get the first name
merged_df_1 = pair_counts_df.merge(fandoms_small[['id', 'name']], how='left', left_on='integer_1', right_on='id')

# Rename the columns for clarity
merged_df_1 = merged_df_1.rename(columns={'name': 'name_1'})
merged_df_1 = merged_df_1.drop(columns=['id'])

merged_df_1.head()


Unnamed: 0,count,integer_1,integer_2,name_1
0,83981,414093,1001939,Marvel Cinematic Universe
1,128,3741104,3763877,Boku no Hero Academia
2,249,3741104,9780526,Boku no Hero Academia
3,109,3741104,10613413,Boku no Hero Academia
4,177,3763877,9780526,BnHA


In [12]:
# Merge to get the first name
merged_df_2 = merged_df_1.merge(fandoms_small[['id', 'name']], how='left', left_on='integer_2', right_on='id')

# Rename the columns for clarity
merged_df_2 = merged_df_2.rename(columns={'name': 'name_2'})
merged_df_2 = merged_df_2.drop(columns=['id'])

merged_df_2.head()


Unnamed: 0,count,integer_1,integer_2,name_1,name_2
0,83981,414093,1001939,Marvel Cinematic Universe,The Avengers (Marvel Movies)
1,128,3741104,3763877,Boku no Hero Academia,BnHA
2,249,3741104,9780526,Boku no Hero Academia,My Hero Academia
3,109,3741104,10613413,Boku no Hero Academia,mha
4,177,3763877,9780526,BnHA,My Hero Academia


In [13]:
edgelist_df = merged_df_2

In [14]:
# Select and order the relevant columns
# edgelist_df = merged_df_2[['name_1', 'name_2', 'count']]

edgelist_df.head(20)

Unnamed: 0,count,integer_1,integer_2,name_1,name_2
0,83981,414093,1001939,Marvel Cinematic Universe,The Avengers (Marvel Movies)
1,128,3741104,3763877,Boku no Hero Academia,BnHA
2,249,3741104,9780526,Boku no Hero Academia,My Hero Academia
3,109,3741104,10613413,Boku no Hero Academia,mha
4,177,3763877,9780526,BnHA,My Hero Academia
5,280,3763877,10613413,BnHA,mha
6,138,9780526,10613413,My Hero Academia,mha
7,17742,11987966,33035890,魔道祖师 - 墨香铜臭 | Módào Zǔshī - Mòxiāng Tóngxiù,陈情令 | The Untamed (TV)
8,4,34431205,34782485,Redacted,Sands of Arawiya - Hafsah Faizal
9,4084,1633246,14988696,Dangan Ronpa - All Media Types,Dangan Ronpa: Trigger Happy Havoc


### Clean up edgelist

In [15]:
len(edgelist_df)

1054762

In [16]:
# remove redacted
edgelist_df = edgelist_df.drop(edgelist_df[(edgelist_df.name_1 == "Redacted") | (edgelist_df.name_2 == "Redacted")].index)
len(edgelist_df)

757734

In [17]:
# remove edges between fandoms and "anime - fandom" and other possibilities
edgelist_df = edgelist_df.drop(edgelist_df[(edgelist_df.name_1 == "anime - Fandom") | (edgelist_df.name_2 == "anime - Fandom")].index)
edgelist_df = edgelist_df.drop(edgelist_df[(edgelist_df.name_1 == "animation - Fandom") | (edgelist_df.name_2 == "animation - Fandom")].index)
len(edgelist_df)

757239

In [18]:
# drop all combinations of bnha, boku no hero academia, mha, my hero academia etc
edgelist_df = edgelist_df.drop(index=[1, 2, 3, 4, 5, 6])
len(edgelist_df)

# probably more cases like this - mha, bnha etc, but none with high counts as far as i can tell. also some are valid, like "Star Wars - All Media Types" connected to the Clone Wars etc

757233

In [19]:
# drop "& Related Fandoms"
# Create a boolean mask to identify rows that contain "& Related Fandoms" in either 'name_1' or 'name_2'
mask_related_fandoms = edgelist_df['name_1'].str.contains("& Related Fandoms") | edgelist_df['name_2'].str.contains("& Related Fandoms")

# Use the mask to filter out the rows
edgelist_df = edgelist_df[~mask_related_fandoms]

len(edgelist_df)

741606

In [20]:
# check
edgelist_df.head(20)

Unnamed: 0,count,integer_1,integer_2,name_1,name_2
0,83981,414093,1001939,Marvel Cinematic Universe,The Avengers (Marvel Movies)
7,17742,11987966,33035890,魔道祖师 - 墨香铜臭 | Módào Zǔshī - Mòxiāng Tóngxiù,陈情令 | The Untamed (TV)
9,4084,1633246,14988696,Dangan Ronpa - All Media Types,Dangan Ronpa: Trigger Happy Havoc
10,16448,218280,254648,Video Blogging RPF,Minecraft (Video Game)
11,140,51823,251062,Forgotten Realms,Dungeons & Dragons (Roleplaying Game)
12,4437,1464,1801,Star Trek: The Original Series,Star Trek
13,3071,1464,601802,Star Trek: The Original Series,Star Trek: Alternate Original Series (Movies)
14,6753,1801,601802,Star Trek,Star Trek: Alternate Original Series (Movies)
15,242,3658,1281791,Tokyo Babylon,X -エックス- | X/1999
16,83,34606586,34674836,ケンガンアシュラ | Kengan Ashura (Manga),ケンガンアシュラ | Kengan Ashura (Anime)


## NetworkX

In [21]:
from bokeh.io import output_notebook, show, save
from bokeh.models import Range1d, Circle, ColumnDataSource, MultiLine
from bokeh.plotting import figure
from bokeh.plotting import from_networkx
from bokeh.palettes import Blues8, Reds8, Purples8, Oranges8, Viridis8, Spectral8
from bokeh.transform import linear_cmap

# np.random.seed(seed=42)

output_notebook()

In [22]:
# create dict mapping of ids to fandom names to secure node attributes
id_fandom_mapping = dict(fandoms_small[['id', 'name']].values)

In [23]:
# create dict mapping of id to fandom cached_count for nodes
id_cached_count_mapping = dict(fandoms_small[['id', 'cached_count']].values)
# id_cached_count

In [24]:
# DOWNLOAD THINGS
# edgelist_df.to_csv('crossover_network_edgelist.csv', index=False)
# fandoms_small.to_csv('fandoms_small.csv', index=False)


### Filter dataset

In [25]:
# cant run below code, takes too long -- need to find way to reduce dataset. maybe 1 graph w random sample, another with just the top 100?

# sns.histplot(data=edgelist_df, x="count", kde=True)

1. Random sample

In [26]:
sample_size = 50
min_count = 100

sample_edgelist = edgelist_df[edgelist_df['count'] > min_count].sample(sample_size)

2. Take top 50

In [27]:
top_num = 50

top_edgelist = edgelist_df.sort_values(by="count", ascending=False).head(top_num)

### Graph sample edgelist

In [28]:
sample_edgelist.head()

Unnamed: 0,count,integer_1,integer_2,name_1,name_2
44875,1455,152669,3810482,Emmerdale,robron
1645,105,851797,969647,Injustice: Gods Among Us,DCU (Comics)
12242,309,11987966,21693609,魔道祖师 - 墨香铜臭 | Módào Zǔshī - Mòxiāng Tóngxiù,The Grandmaster of Demonic Cultivation
700713,127,11146,739806,Iron Man (Comic),Iron Man - All Media Types
38778,116,522032,1967132,Tom Hiddleston - Fandom,Marvel Cinematic Universe RPF


In [29]:
# needs to be indices so that bokeh can work

G_sample = nx.from_pandas_edgelist(sample_edgelist, source='integer_1', target='integer_2', edge_attr='count')
G_sample.nodes(data=True)

NodeDataView({152669: {}, 3810482: {}, 851797: {}, 969647: {}, 11987966: {}, 21693609: {}, 11146: {}, 739806: {}, 522032: {}, 1967132: {}, 10596: {}, 13855: {}, 414093: {}, 586437: {}, 114706: {}, 43249945: {}, 12310528: {}, 12312739: {}, 190901: {}, 728807: {}, 289604: {}, 32833447: {}, 13999: {}, 777705: {}, 2814336: {}, 15270224: {}, 34555820: {}, 245368: {}, 625992: {}, 13907: {}, 31108: {}, 27: {}, 379999: {}, 82576: {}, 2346771: {}, 2128865: {}, 25307253: {}, 226657: {}, 236208: {}, 17856255: {}, 37357117: {}, 1320148: {}, 16105794: {}, 9607162: {}, 18081549: {}, 117953: {}, 133185: {}, 1801: {}, 9630: {}, 730108: {}, 10223: {}, 145749: {}, 11923942: {}, 15303162: {}, 5946839: {}, 9505516: {}, 115613: {}, 5392312: {}, 70: {}, 146772: {}, 20027: {}, 727114: {}, 239605: {}, 13867325: {}, 4997791: {}, 19017606: {}, 235690: {}, 15133407: {}, 1001939: {}, 1928582: {}, 5466945: {}, 141299: {}, 41332: {}, 218342: {}, 735927: {}, 7857352: {}, 250093: {}, 2019008: {}, 105122: {}, 997960: 

**Node attributes**

In [30]:
# set node attribudes for fandom names
nx.set_node_attributes(G_sample, name='fandom', values=id_fandom_mapping)

# i want to color by cached_count of each fandom
nx.set_node_attributes(G_sample, name='cached_count', values=id_cached_count_mapping)

# need to adjust cached_count because too big distance between numbers
adjusted_cached_count_mapping = {k: np.log1p(v) for k, v in id_cached_count_mapping.items()}
nx.set_node_attributes(G_sample, name='adjusted_cached_count', values=adjusted_cached_count_mapping)

In [31]:
# node attributes for degree 
degrees = dict(nx.degree(G_sample))
nx.set_node_attributes(G_sample, name='degree', values=degrees)

In [32]:
# Slightly adjust degree so that the nodes with very small degrees are still visible
number_to_adjust_by = 10
adjusted_node_size = dict([(node, degree/number_to_adjust_by) for node, degree in nx.degree(G_sample)])
nx.set_node_attributes(G_sample, name='adjusted_node_size', values=adjusted_node_size)

In [33]:
#Choose attributes from G network to size and color by — setting manual size (e.g. 10) or color (e.g. 'skyblue') also allowed
size_by_this_attribute = 'adjusted_node_size'
color_by_this_attribute = 'adjusted_cached_count' #'#860101' 

#Pick a color palette — Blues8, Reds8, Purples8, Oranges8, Viridis8
color_palette = Reds8

#Choose a title!
title_sample = 'Fandom Network by Cross-overs, sample of ' + str(sample_size) + ' more than ' + str(min_count) + ' fics'

#Establish which categories will appear when hovering over each node
HOVER_TOOLTIPS = [
        ("Fandom", "@fandom"),
        ("Degree", "@degree"),
        ("Count", "@cached_count")
]

#Create a plot — set dimensions, toolbar, and title
plot_sample = figure(tooltips = HOVER_TOOLTIPS,
              tools="pan,wheel_zoom,save,reset", active_scroll='wheel_zoom',
            x_range=Range1d(-1.5, 1.5), y_range=Range1d(-1.5, 1.5), title=title_sample)

# pos = nx.spring_layout(G_sample, k=0.2, iterations=1000)

pos_sample = nx.fruchterman_reingold_layout(G_sample, k=0.2, iterations=1000)

graph_sample = from_networkx(G_sample, pos_sample, scale=1, center=(0, 0))

#Set node sizes and colors according to node degree (color as spectrum of color palette)
minimum_value_color_sample = min(graph_sample.node_renderer.data_source.data[color_by_this_attribute])
maximum_value_color_sample = max(graph_sample.node_renderer.data_source.data[color_by_this_attribute])
graph_sample.node_renderer.glyph = Circle(radius=size_by_this_attribute, fill_color=linear_cmap(color_by_this_attribute, color_palette, maximum_value_color_sample, minimum_value_color_sample), fill_alpha=0.7)

# graph_sample.node_renderer.glyph = Circle(radius=size_by_this_attribute, fill_color=color_by_this_attribute, fill_alpha=0.5)
# graph_sample.edge_renderer.glyph = MultiLine(line_alpha=1, line_width=4, line_color="black")

plot_sample.renderers.append(graph_sample)

show(plot_sample)



Interpretation: the bigger the node, the greater the degree. the darker the color, the more fics in the fandom.

### Graph top 100

In [34]:
top_edgelist.head()

Unnamed: 0,count,integer_1,integer_2,name_1,name_2
0,83981,414093,1001939,Marvel Cinematic Universe,The Avengers (Marvel Movies)
46,51455,414093,586439,Marvel Cinematic Universe,Captain America (Movies)
47,35409,586439,1001939,Captain America (Movies),The Avengers (Marvel Movies)
156,31923,116304,242462,A Song of Ice and Fire - George R. R. Martin,Game of Thrones (TV)
366,27523,727114,1001939,The Avengers (Marvel) - All Media Types,The Avengers (Marvel Movies)


Noticing that a lot of the above 'crossovers' are just within the same fandom e.g. The Avengers (Marvel) - All Media Types and The Avengers (Marvel Movies). A lot of these are authors just tagging multiple times for the same fandom, which isn't the same as a cross-over. But! This doesn't mean that 2 tags with "All Media Types" could come together to be a crossover from 2 different fandoms. It's just more common for the bigger fandoms to have the possibility of tagging "All Media Types".

Going to remove "All Media Types" for this edgelist of top 50 crossovers, so there's more information. Going to leave the random sample as it is, because it doesn't have this high incidence of "All Media Types" invalidating 'crossovers'.

In [35]:
len(edgelist_df)

741606

In [36]:
# Create a boolean mask to identify rows that contain "& Related Fandoms" in either 'name_1' or 'name_2'
mask_all_media_types = edgelist_df['name_1'].str.contains("All Media Types") | edgelist_df['name_2'].str.contains("All Media Types")

# Use the mask to filter out the rows
top_edgelist_df = edgelist_df[~mask_all_media_types]
len(top_edgelist_df)

666344

In [37]:
top_edgelist = top_edgelist_df.sort_values(by="count", ascending=False).head(top_num)

In [38]:
# needs to be indices so that bokeh can work

G_top = nx.from_pandas_edgelist(top_edgelist, source='integer_1', target='integer_2', edge_attr='count')
G_top.nodes(data=True)

NodeDataView({414093: {}, 1001939: {}, 586439: {}, 116304: {}, 242462: {}, 226657: {}, 7266: {}, 114591: {}, 27251507: {}, 245368: {}, 858574: {}, 1080663: {}, 406: {}, 27785: {}, 11987966: {}, 33035890: {}, 218280: {}, 254648: {}, 32833447: {}, 115613: {}, 224545: {}, 10801369: {}, 229522: {}, 134900: {}, 34555820: {}, 114961: {}, 873394: {}, 82576: {}, 969647: {}, 1801: {}, 601802: {}, 28642: {}, 390: {}, 879346: {}, 235690: {}, 23985107: {}, 21872622: {}, 34135873: {}, 25760640: {}, 169: {}, 13253: {}, 96137: {}, 10223: {}, 22034: {}, 81269: {}, 4231202: {}, 289604: {}, 870188: {}, 5437936: {}, 1464: {}, 1072216: {}, 6856609: {}, 15270224: {}, 587792: {}, 2282814: {}, 5316694: {}, 2428: {}})

In [39]:
# set node attribudes for fandom names
nx.set_node_attributes(G_top, name='fandom', values=id_fandom_mapping)

# i want to color by cached_count of each fandom
nx.set_node_attributes(G_top, name='cached_count', values=id_cached_count_mapping)

# need to adjust cached_count because too big distance between numbers
nx.set_node_attributes(G_top, name='adjusted_cached_count', values=adjusted_cached_count_mapping)

In [40]:
# node attributes for degree 
degrees = dict(nx.degree(G_top))
nx.set_node_attributes(G_top, name='degree', values=degrees)

In [41]:
# Slightly adjust degree so that the nodes with very small degrees are still visible
number_to_adjust_by = 10
adjusted_node_size = dict([(node, degree/number_to_adjust_by) for node, degree in nx.degree(G_top)])
nx.set_node_attributes(G_top, name='adjusted_node_size', values=adjusted_node_size)

In [42]:
#Choose a title!
title = 'Fandom Network by Cross-overs, ' + str(top_num) + ' most cross-overs'

#Create a plot — set dimensions, toolbar, and title
plot_top = figure(tooltips = HOVER_TOOLTIPS,
              tools="pan,wheel_zoom,save,reset", active_scroll='wheel_zoom',
            x_range=Range1d(-1.5, 1.5), y_range=Range1d(-1.5, 1.5), title=title)

# pos_top = nx.spring_layout(G_top, k=0.2, iterations=1000)

pos_top = nx.fruchterman_reingold_layout(G_top, k=0.3, iterations=1000)

graph_top = from_networkx(G_top, pos_top, scale=1, center=(0, 0))

#Set node sizes and colors according to node degree (color as spectrum of color palette)
minimum_value_color_top = min(graph_top.node_renderer.data_source.data[color_by_this_attribute])
maximum_value_color_top = max(graph_top.node_renderer.data_source.data[color_by_this_attribute])
graph_top.node_renderer.glyph = Circle(radius=size_by_this_attribute, fill_color=linear_cmap(color_by_this_attribute, color_palette, maximum_value_color_top, minimum_value_color_top), fill_alpha=0.7)

plot_top.renderers.append(graph_top)

show(plot_top)
