# Network of Cross-overs between Fandoms

In [2]:
import numpy as np
import pandas as pd
import networkx as nx
import seaborn as sns
import matplotlib.pyplot as plt
from itertools import combinations
import scipy
from bokeh.io import show, output_notebook
from bokeh.plotting import figure, from_networkx, output_file, save
from bokeh.models import HoverTool, ColorBar, LinearColorMapper
from bokeh.transform import linear_cmap
from bokeh.layouts import column
import matplotlib.colors as mcolors
from bokeh.embed import file_html
from bokeh.resources import CDN

## Setting up

In [3]:
tags = pd.read_csv("tags-20210226.csv")
works = pd.read_csv("works-20210226.csv")

In [4]:
# process tags
works['tags'] = works['tags'].astype(str).apply(lambda tags: tags.split('+'))

fandoms = tags[tags['type'] == "Fandom"]

In [5]:
# Filter to only columns that are needed - merge will need a lot of compute
works_small = works[["tags", "word_count"]]
fandoms_small = fandoms[['id', 'name', 'cached_count']]
works_small

Unnamed: 0,tags,word_count
0,"[10, 414093, 1001939, 4577144, 1499536, 110, 4...",388.0
1,"[10, 20350917, 34816907, 23666027, 23269305, 2...",1638.0
2,"[10, 10613413, 9780526, 3763877, 3741104, 7657...",1502.0
3,"[10, 15322, 54862755, 20595867, 32994286, 663,...",100.0
4,"[11, 721553, 54604, 1439500, 3938423, 53483274...",994.0
...,...,...
7269688,"[78, 77, 84, 101, 104, 105, 106, 23, 13, 16, 7...",705.0
7269689,"[78, 77, 84, 107, 23, 10, 16, 70, 933, 616]",1392.0
7269690,"[77, 78, 69, 108, 109, 62, 110, 23, 9, 111, 16...",1755.0
7269691,"[112, 113, 13, 114, 16, 115, 101, 117, 118, 11...",1338.0


In [6]:
works_small['tags'][0]

['10', '414093', '1001939', '4577144', '1499536', '110', '4682892', '21', '16']

In [7]:
fandoms_small

Unnamed: 0,id,name,cached_count
25,27,Supernatural,310300
27,31,Redacted,5
33,37,Boondock Saints (1999),47
38,46,Lord of the Rings RPF,3538
44,56,Gravitation (Anime),6
...,...,...,...
14466583,55393953,Карнавальная ночь | Carnival Night (1956),0
14466639,55394121,Brain Dump (Web Series),0
14466697,55394295,Redacted,3
14466959,55395081,In the Reign of Terror - G. A. Henty,0


## Cleaning & reorg

### 1. check if the numbers in each list of each row are in the "id" column of another df "fandoms_small"

In [8]:
# Convert strings to integers in the "tags" column, handling NaN values
works_small['tags'] = works_small['tags'].apply(
    lambda x: [int(tag) for tag in x if tag != 'nan']
)

works_small['tags'][0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  works_small['tags'] = works_small['tags'].apply(


[10, 414093, 1001939, 4577144, 1499536, 110, 4682892, 21, 16]

In [None]:
# works_small.to_csv('works_small.csv', index=False)

In [9]:
# Convert the "id" column in fandoms_small to a set for faster lookup
fandom_ids = set(fandoms_small['id'].astype(int))

# Filter tags in works_small to only include those present in fandoms_small
works_small['filtered_tags'] = works_small['tags'].apply(lambda tags: [tag for tag in tags if tag in fandom_ids])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  works_small['filtered_tags'] = works_small['tags'].apply(lambda tags: [tag for tag in tags if tag in fandom_ids])


### make edgelist
2. of the numbers in that column, count how many times a pair of them appear, and return a new df with integer 1, integer 2, and the count of each pair's appearance

In [10]:
from itertools import combinations
from collections import Counter

# Generate all possible unique pairs from each list in the filtered_tags column
pairs = []

for tags in works_small['filtered_tags']:
    # Generate combinations (unique pairs) for each list
    if len(tags) > 1:
        pairs.extend(combinations(sorted(tags), 2))

# Count the occurrences of each pair
pair_counts = Counter(pairs)


In [11]:
# Convert the pair counts to a DataFrame
pair_counts_df = pd.DataFrame(
    pair_counts.items(), columns=['ids', 'count']
)

pair_counts_df[['integer_1', 'integer_2']] = pd.DataFrame(pair_counts_df['ids'].tolist(), index=pair_counts_df.index) 
pair_counts_df = pair_counts_df.drop(columns='ids')

pair_counts_df.head()

Unnamed: 0,count,integer_1,integer_2
0,83981,414093,1001939
1,128,3741104,3763877
2,249,3741104,9780526
3,109,3741104,10613413
4,177,3763877,9780526


### Get edgelist with fandom names

In [107]:
# Merge to get the first name
merged_df_1 = pair_counts_df.merge(fandoms_small[['id', 'name']], how='left', left_on='integer_1', right_on='id')

# Rename the columns for clarity
merged_df_1 = merged_df_1.rename(columns={'name': 'name_1'})
merged_df_1 = merged_df_1.drop(columns=['id'])

merged_df_1.head()


Unnamed: 0,count,integer_1,integer_2,name_1,cached_count_1
0,83981,414093,1001939,Marvel Cinematic Universe,240536
1,128,3741104,3763877,Boku no Hero Academia,2472
2,249,3741104,9780526,Boku no Hero Academia,2472
3,109,3741104,10613413,Boku no Hero Academia,2472
4,177,3763877,9780526,BnHA,33805


In [109]:
# Merge to get the first name
merged_df_2 = merged_df_1.merge(fandoms_small[['id', 'name']], how='left', left_on='integer_2', right_on='id')

# Rename the columns for clarity
merged_df_2 = merged_df_2.rename(columns={'name': 'name_2'})
merged_df_2 = merged_df_2.drop(columns=['id'])

merged_df_2.head()


Unnamed: 0,count,integer_1,integer_2,name_1,cached_count_1,name_2,cached_count_2
0,83981,414093,1001939,Marvel Cinematic Universe,240536,The Avengers (Marvel Movies),157813
1,128,3741104,3763877,Boku no Hero Academia,2472,BnHA,33805
2,249,3741104,9780526,Boku no Hero Academia,2472,My Hero Academia,5078
3,109,3741104,10613413,Boku no Hero Academia,2472,mha,7393
4,177,3763877,9780526,BnHA,33805,My Hero Academia,5078


In [110]:
edgelist_df = merged_df_2

In [111]:
# Select and order the relevant columns
# edgelist_df = merged_df_2[['name_1', 'name_2', 'count']]

edgelist_df.head(20)

Unnamed: 0,count,integer_1,integer_2,name_1,cached_count_1,name_2,cached_count_2
0,83981,414093,1001939,Marvel Cinematic Universe,240536,The Avengers (Marvel Movies),157813
1,128,3741104,3763877,Boku no Hero Academia,2472,BnHA,33805
2,249,3741104,9780526,Boku no Hero Academia,2472,My Hero Academia,5078
3,109,3741104,10613413,Boku no Hero Academia,2472,mha,7393
4,177,3763877,9780526,BnHA,33805,My Hero Academia,5078
5,280,3763877,10613413,BnHA,33805,mha,7393
6,138,9780526,10613413,My Hero Academia,5078,mha,7393
7,17742,11987966,33035890,魔道祖师 - 墨香铜臭 | Módào Zǔshī - Mòxiāng Tóngxiù,47441,陈情令 | The Untamed (TV),30914
8,4,34431205,34782485,Redacted,5,Sands of Arawiya - Hafsah Faizal,4
9,4084,1633246,14988696,Dangan Ronpa - All Media Types,21958,Dangan Ronpa: Trigger Happy Havoc,6002


### Clean up edgelist

In [112]:
# edgelist_df = edgelist_df.sort_values(by="count", ascending=False)

In [113]:
len(edgelist_df)

1054762

In [114]:
# remove redacted
edgelist_df = edgelist_df.drop(edgelist_df[(edgelist_df.name_1 == "Redacted") | (edgelist_df.name_2 == "Redacted")].index)
len(edgelist_df)

757734

In [115]:
# remove edges between fandoms and "anime - fandom" and other possibilities
edgelist_df = edgelist_df.drop(edgelist_df[(edgelist_df.name_1 == "anime - Fandom") | (edgelist_df.name_2 == "anime - Fandom")].index)
edgelist_df = edgelist_df.drop(edgelist_df[(edgelist_df.name_1 == "animation - Fandom") | (edgelist_df.name_2 == "animation - Fandom")].index)
len(edgelist_df)

757239

In [116]:
# drop all combinations of bnha, boku no hero academia, mha, my hero academia etc
edgelist_df = edgelist_df.drop(index=[1, 2, 3, 4, 5, 6])
len(edgelist_df)

# probably more cases like this - mha, bnha etc, but none with high counts as far as i can tell. also some are valid, like "Star Wars - All Media Types" connected to the Clone Wars etc

757233

In [117]:
# check
edgelist_df.head(20)

Unnamed: 0,count,integer_1,integer_2,name_1,cached_count_1,name_2,cached_count_2
0,83981,414093,1001939,Marvel Cinematic Universe,240536,The Avengers (Marvel Movies),157813
7,17742,11987966,33035890,魔道祖师 - 墨香铜臭 | Módào Zǔshī - Mòxiāng Tóngxiù,47441,陈情令 | The Untamed (TV),30914
9,4084,1633246,14988696,Dangan Ronpa - All Media Types,21958,Dangan Ronpa: Trigger Happy Havoc,6002
10,16448,218280,254648,Video Blogging RPF,39078,Minecraft (Video Game),31185
11,140,51823,251062,Forgotten Realms,675,Dungeons & Dragons (Roleplaying Game),4841
12,4437,1464,1801,Star Trek: The Original Series,15212,Star Trek,36097
13,3071,1464,601802,Star Trek: The Original Series,15212,Star Trek: Alternate Original Series (Movies),33604
14,6753,1801,601802,Star Trek,36097,Star Trek: Alternate Original Series (Movies),33604
15,242,3658,1281791,Tokyo Babylon,641,X -エックス- | X/1999,724
16,83,34606586,34674836,ケンガンアシュラ | Kengan Ashura (Manga),156,ケンガンアシュラ | Kengan Ashura (Anime),103


In [38]:
# edgelist_df.to_csv('fandom_network_edgelist.csv', index=False)

## NetworkX

In [118]:
from bokeh.io import output_notebook, show, save
from bokeh.models import Range1d, Circle, ColumnDataSource, MultiLine
from bokeh.plotting import figure
from bokeh.plotting import from_networkx
from bokeh.palettes import Blues8, Reds8, Purples8, Oranges8, Viridis8, Spectral8
from bokeh.transform import linear_cmap

# np.random.seed(seed=42)

output_notebook()

In [119]:
# create dict mapping of ids to fandom names to secure node attributes
id_fandom_mapping = dict(fandoms_small[['id', 'name']].values)

In [130]:
# create dict mapping of id to fandom cached_count for nodes
id_cached_count_mapping = dict(fandoms_small[['id', 'cached_count']].values)
# id_cached_count

### Filter dataset

In [40]:
# cant run below code, takes too long -- need to find way to reduce dataset. maybe 1 graph w random sample, another with just the top 100?

# sns.histplot(data=edgelist_df, x="count", kde=True)

1. Random sample

In [120]:
sample_size = 50
min_count = 100

sample_edgelist = edgelist_df[edgelist_df['count'] > min_count].sample(sample_size)

2. Take top 100

In [121]:
top_num = 100

top_edgelist = edgelist_df.sort_values(by="count", ascending=False).head(100)

### Graph sample edgelist

In [122]:
sample_edgelist.head()

Unnamed: 0,count,integer_1,integer_2,name_1,cached_count_1,name_2,cached_count_2
126337,124,5148298,12442981,Supergirl (TV 2015),37968,Sanvers - Fandom,202
39564,109,9830,232768,Chronicles of Narnia - C. S. Lewis,4244,Merlin (TV),60314
9509,162,451725,451758,Sonic the Hedgehog - All Media Types,5143,Sonic X,242
9147,655,245368,491459,Thor (Movies),45033,Loki - Fandom,2120
2204,371,29577,3132749,Star Wars Prequel Trilogy,19166,Star Wars: Rebels,7292
141969,145,29707,251330,A Song of Ice and Fire,1205,game of thrones,3858
19600,129,937647,3352745,Punisher (Comics),395,Daredevil (TV),15598
43248,210,2415213,3801086,Nothing Much to Do,800,Lovely Little Losers,321
729170,135,127709,223664,Pirates of the Caribbean: Dead Man's Chest (2006),154,Pirates of the Caribbean (Movies),4391
472,1699,226657,578887,Iron Man (Movies),46331,Captain America - All Media Types,20454


In [123]:
# needs to be indices so that bokeh can work

G_sample = nx.from_pandas_edgelist(sample_edgelist, source='integer_1', target='integer_2', edge_attr='count')
G_sample.nodes(data=True)

NodeDataView({5148298: {}, 12442981: {}, 9830: {}, 232768: {}, 451725: {}, 451758: {}, 245368: {}, 491459: {}, 29577: {}, 3132749: {}, 29707: {}, 251330: {}, 937647: {}, 3352745: {}, 2415213: {}, 3801086: {}, 127709: {}, 223664: {}, 226657: {}, 578887: {}, 34098940: {}, 2427: {}, 2428: {}, 550814: {}, 4551485: {}, 431213: {}, 25646718: {}, 478190: {}, 610604: {}, 25486: {}, 2437707: {}, 82576: {}, 525714: {}, 26692334: {}, 27557713: {}, 273721: {}, 28588358: {}, 7380455: {}, 7266: {}, 2814336: {}, 114961: {}, 133297: {}, 136512: {}, 2207797: {}, 2692: {}, 51147222: {}, 601802: {}, 3763877: {}, 3828398: {}, 277627: {}, 4274675: {}, 573551: {}, 16652955: {}, 57398: {}, 827055: {}, 478191: {}, 390: {}, 8404315: {}, 236208: {}, 582724: {}, 1195851: {}, 6388046: {}, 662604: {}, 2346771: {}, 5060107: {}, 101375: {}, 414093: {}, 484757: {}, 193591: {}, 586439: {}, 19427: {}, 19428: {}, 735670: {}, 735672: {}, 1928582: {}, 259514: {}, 969647: {}, 23966: {}, 135825: {}, 461634: {}, 230931: {}, 

**Node attributes**

In [132]:
# set node attribudes for fandom names
nx.set_node_attributes(G_sample, name='fandom', values=id_fandom_mapping)

# i want to color by cached_count of each fandom
nx.set_node_attributes(G_sample, name='cached_count', values=id_cached_count_mapping)

In [133]:
# node attributes for degree 
degrees = dict(nx.degree(G_sample))
nx.set_node_attributes(G_sample, name='degree', values=degrees)

In [134]:
# Slightly adjust degree so that the nodes with very small degrees are still visible
number_to_adjust_by = 10
adjusted_node_size = dict([(node, degree/number_to_adjust_by) for node, degree in nx.degree(G_sample)])
nx.set_node_attributes(G_sample, name='adjusted_node_size', values=adjusted_node_size)

In [141]:
#Choose attributes from G network to size and color by — setting manual size (e.g. 10) or color (e.g. 'skyblue') also allowed
size_by_this_attribute = 'adjusted_node_size'
color_by_this_attribute = 'cached_count' #'#860101' 

#Pick a color palette — Blues8, Reds8, Purples8, Oranges8, Viridis8
color_palette = Reds8

#Choose a title!
title = 'Fandom Network by Cross-overs, sample of ' + str(sample_size) + ' more than ' + str(min_count) + ' fics'

#Establish which categories will appear when hovering over each node
HOVER_TOOLTIPS = [
       ("Fandom", "@fandom"),
        ("Degree", "@degree")
]

#Create a plot — set dimensions, toolbar, and title
plot = figure(tooltips = HOVER_TOOLTIPS,
              tools="pan,wheel_zoom,save,reset", active_scroll='wheel_zoom',
            x_range=Range1d(-1.5, 1.5), y_range=Range1d(-1.5, 1.5), title=title)

# pos = nx.spring_layout(G_sample, k=0.3, iterations=1000)

pos = nx.fruchterman_reingold_layout(G_sample, k=0.2, iterations=1000)

graph_sample = from_networkx(G_sample, pos, scale=1, center=(0, 0))

#Set node sizes and colors according to node degree (color as spectrum of color palette)
minimum_value_color = min(graph_sample.node_renderer.data_source.data[color_by_this_attribute])
maximum_value_color = max(graph_sample.node_renderer.data_source.data[color_by_this_attribute])
graph_sample.node_renderer.glyph = Circle(radius=size_by_this_attribute, fill_color=linear_cmap(color_by_this_attribute, color_palette, maximum_value_color, minimum_value_color), fill_alpha=0.7)

# graph_sample.node_renderer.glyph = Circle(radius=size_by_this_attribute, fill_color=color_by_this_attribute, fill_alpha=0.5)
# graph_sample.edge_renderer.glyph = MultiLine(line_alpha=1, line_width=4, line_color="black")

plot.renderers.append(graph_sample)

show(plot)



In [None]:
#Create a network graph object
# https://networkx.github.io/documentation/networkx-1.9/reference/generated/networkx.drawing.layout.spring_layout.html\
network_graph = from_networkx(G, networkx.spring_layout, scale=10, center=(0, 0))

#Set node sizes and colors according to node degree (color as spectrum of color palette)
minimum_value_color = min(network_graph.node_renderer.data_source.data[color_by_this_attribute])
maximum_value_color = max(network_graph.node_renderer.data_source.data[color_by_this_attribute])
network_graph.node_renderer.glyph = Circle(size=size_by_this_attribute, fill_color=linear_cmap(color_by_this_attribute, color_palette, minimum_value_color, maximum_value_color))

#Set edge opacity and width
network_graph.edge_renderer.glyph = MultiLine(line_alpha=0.5, line_width=1)

plot.renderers.append(network_graph)

show(plot)
#save(plot, filename=f"{title}.html")

### Graph top 100

In [46]:
top_edgelist.head()

Unnamed: 0,count,integer_1,integer_2,name_1,name_2
0,83981,414093,1001939,Marvel Cinematic Universe,The Avengers (Marvel Movies)
46,51455,414093,586439,Marvel Cinematic Universe,Captain America (Movies)
47,35409,586439,1001939,Captain America (Movies),The Avengers (Marvel Movies)
156,31923,116304,242462,A Song of Ice and Fire - George R. R. Martin,Game of Thrones (TV)
366,27523,727114,1001939,The Avengers (Marvel) - All Media Types,The Avengers (Marvel Movies)


In [48]:
# needs to be indices so that bokeh can work

G_top = nx.from_pandas_edgelist(top_edgelist, source='integer_1', target='integer_2', edge_attr='count')
G_top.nodes(data=True)

NodeDataView({414093: {}, 1001939: {}, 586439: {}, 116304: {}, 242462: {}, 727114: {}, 226657: {}, 105692: {}, 133185: {}, 7266: {}, 114591: {}, 27251507: {}, 101375: {}, 6048501: {}, 245368: {}, 858574: {}, 1080663: {}, 406: {}, 27785: {}, 11987966: {}, 33035890: {}, 218280: {}, 254648: {}, 827055: {}, 29577: {}, 32833447: {}, 747342: {}, 2007008: {}, 578887: {}, 541478: {}, 873394: {}, 82576: {}, 236208: {}, 5407051: {}, 115613: {}, 224545: {}, 10801369: {}, 105412: {}, 114961: {}, 299357: {}, 29890172: {}, 229522: {}, 130638: {}, 390: {}, 658827: {}, 134900: {}, 34555820: {}, 1633246: {}, 11902568: {}, 969647: {}, 29576: {}, 1801: {}, 601802: {}, 28642: {}, 711035: {}, 879346: {}, 235690: {}, 23985107: {}, 116314: {}, 452309: {}, 219012: {}, 726906: {}, 21872622: {}, 34135873: {}, 662604: {}, 25760640: {}, 169: {}, 13253: {}, 102330: {}, 133386: {}, 21927: {}, 96137: {}, 448284: {}, 452874: {}, 10223: {}, 22034: {}, 81269: {}, 4231202: {}, 145124: {}, 741433: {}, 289604: {}, 870188:

In [49]:
nx.set_node_attributes(G_top, name='fandom', values=id_fandom_mapping)

# node attributes for degree 
degrees = dict(nx.degree(G_top))
nx.set_node_attributes(G_top, name='degree', values=degrees)