# Scatter plot of average fic length for various ships/fandoms

The public Deepnote version of this notebook is here: https://deepnote.com/workspace/ao3-cdb24469-2834-4827-96fb-17793ac7554f/project/AO3-x-NetworkX-ebe988dc-072f-4e99-a43e-95707726522c/notebook/AO3%3A%20fic%20lengths-d038f08f2cd04613a0e30b8df5daeeea

## Set up

In [127]:
import numpy as np
import pandas as pd
import networkx as nx
import seaborn as sns
import matplotlib.pyplot as plt
from itertools import combinations
import scipy
from bokeh.io import show, output_notebook
from bokeh.plotting import figure, from_networkx, output_file, save
from bokeh.models import HoverTool, ColorBar, LinearColorMapper, ColumnDataSource
from bokeh.transform import linear_cmap
from bokeh.layouts import column
import matplotlib.colors as mcolors
from bokeh.embed import file_html
from bokeh.resources import CDN
import matplotlib.dates as mdates

# FOR FIGURES
# seaborn theme: "paper" context, "deep" palette, "white" style
sns.set_context("paper")
sns.set_palette("deep")
sns.set_style("white")

# matplotlib overrides, applied after the seaborn calls above:
# sans-serif text and the same background colour for axes and figure
plt.rcParams.update({
    'font.family': 'sans-serif',
    'axes.facecolor': '#EBE9E1',
    'figure.facecolor': '#EBE9E1',
})

# plt.rcParams['font.sans-serif'] = ['Courier']

# set random seed
# np.random.seed(seed=42)

In [3]:
# Read the tag table and the works table from the two CSV files in the working directory.
tags = pd.read_csv(filepath_or_buffer="tags-20210226.csv")
works = pd.read_csv(filepath_or_buffer="works-20210226.csv")

In [4]:
# process tags: turn each work's '+'-separated tag string into a list of tag-id strings.
# The value is cast to str first, so a missing entry becomes the one-element list ['nan'].
works['tags'] = works['tags'].astype(str).map(lambda joined_ids: joined_ids.split('+'))

In [5]:
# Keep only the tag rows whose type is "Fandom".
is_fandom_tag_type = tags['type'].eq("Fandom")
fandoms = tags[is_fandom_tag_type]

## Prep

In [14]:
# Keep only the columns the analysis uses, so the later explode/merge steps
# work on narrower frames (the merge is compute-heavy).
work_columns = ["tags", "word_count"]
fandom_columns = ['id', 'name', 'cached_count']

works_small = works[work_columns]
fandoms_small = fandoms[fandom_columns]
works_small

Unnamed: 0,tags,word_count
0,"[10, 414093, 1001939, 4577144, 1499536, 110, 4...",388.0
1,"[10, 20350917, 34816907, 23666027, 23269305, 2...",1638.0
2,"[10, 10613413, 9780526, 3763877, 3741104, 7657...",1502.0
3,"[10, 15322, 54862755, 20595867, 32994286, 663,...",100.0
4,"[11, 721553, 54604, 1439500, 3938423, 53483274...",994.0
...,...,...
7269688,"[78, 77, 84, 101, 104, 105, 106, 23, 13, 16, 7...",705.0
7269689,"[78, 77, 84, 107, 23, 10, 16, 70, 933, 616]",1392.0
7269690,"[77, 78, 69, 108, 109, 62, 110, 23, 9, 111, 16...",1755.0
7269691,"[112, 113, 13, 114, 16, 115, 101, 117, 118, 11...",1338.0


In [15]:
# One row per (work, tag): explode the tag lists, renumber the rows from 0,
# then drop every row that still contains a missing value.
works_small = (
    works_small
    .explode('tags')
    .reset_index(drop=True)
    .dropna()
)
works_small

Unnamed: 0,index,tags,word_count
0,0,10,388.0
1,0,414093,388.0
2,0,1001939,388.0
3,0,4577144,388.0
4,0,1499536,388.0
...,...,...,...
119146247,7269692,2246,1836.0
119146248,7269692,2832,1836.0
119146249,7269692,2831,1836.0
119146250,7269692,968,1836.0


In [17]:
# Remove rows whose tag is the literal string 'nan'
# (what a missing tag field turns into once it has been cast to str).
is_nan_tag = works_small['tags'] == 'nan'
works_small = works_small[~is_nan_tag]

In [18]:
# Tag ids are still strings at this point; cast them to int so they can be
# matched against the integer fandom ids.
# astype() with a column mapping returns a new frame, which avoids assigning a
# column on the filtered frame (that pattern triggers SettingWithCopyWarning).
works_small = works_small.astype({'tags': int})

In [20]:
# Keep only the rows whose tag id is one of the fandom ids; every other tag
# is irrelevant here and dropping it shrinks the frame further.
is_fandom_tag = works_small['tags'].isin(fandoms_small['id'])
works_small = works_small[is_fandom_tag]

works_small.shape[0]

9732905

## Merge

In [63]:
# Attach the fandom columns (id, name, cached_count) to each (work, fandom tag) row.
all_works_merged = works_small.merge(fandoms_small, how='left', left_on='tags', right_on='id')

In [73]:
# Mean word count of the works in each fandom, as a two-column frame:
# the fandom id and its average_word_count.
fandoms_word_count = (
    all_works_merged
    .groupby('id')['word_count']
    .mean()
    .rename('average_word_count')
    .reset_index()
)

fandoms_word_count

Unnamed: 0,id,average_word_count
0,27,6702.969641
1,31,2478.800000
2,37,4431.021739
3,46,3671.977899
4,56,2570.000000
...,...,...
150063,54949974,5381.000000
150064,54954141,16290.000000
150065,54956880,2190.750000
150066,55037796,5976.000000


In [74]:
# Reduce the merged frame to one row per distinct (id, cached_count, name) combination,
# ready to be joined onto the per-fandom averages.
all_works_merged_for_merge = (
    all_works_merged[['id', 'cached_count', 'name']]
    .drop_duplicates()
)
all_works_merged_for_merge

Unnamed: 0,id,cached_count,name
0,414093,240536,Marvel Cinematic Universe
1,1001939,157813,The Avengers (Marvel Movies)
2,20350917,2338,Rusty Quill Gaming (Podcast)
3,10613413,7393,mha
4,9780526,5078,My Hero Academia
...,...,...,...
9732742,809,1,Redacted
9732745,821,1,Redacted
9732754,851,1,Redacted
9732849,328,1,Redacted


In [76]:
# Join each fandom's average word count with its cached_count and name.
word_cache_counts = fandoms_word_count.merge(all_works_merged_for_merge, on='id', how='left')
word_cache_counts

Unnamed: 0,id,average_word_count,cached_count,name
0,27,6702.969641,310300,Supernatural
1,31,2478.800000,5,Redacted
2,37,4431.021739,47,Boondock Saints (1999)
3,46,3671.977899,3538,Lord of the Rings RPF
4,56,2570.000000,6,Gravitation (Anime)
...,...,...,...,...
150063,54949974,5381.000000,1,Redacted
150064,54954141,16290.000000,1,Redacted
150065,54956880,2190.750000,5,Redacted
150066,55037796,5976.000000,1,Redacted


In [78]:
# check: number of missing values in each column
word_cache_counts.isnull().sum()

id                    0
average_word_count    0
cached_count          0
name                  0
dtype: int64

In [79]:
# Drop the fandoms whose name is "Redacted" (open question: why are they redacted?)
is_redacted = word_cache_counts['name'] == "Redacted"
word_cache_counts = word_cache_counts[~is_redacted]
len(word_cache_counts)

# the row count drops a lot once the redacted fandoms are gone

45716

In [80]:
# Display the table again now that the "Redacted" rows have been removed.
word_cache_counts

Unnamed: 0,id,average_word_count,cached_count,name
0,27,6702.969641,310300,Supernatural
2,37,4431.021739,47,Boondock Saints (1999)
3,46,3671.977899,3538,Lord of the Rings RPF
4,56,2570.000000,6,Gravitation (Anime)
5,65,9533.997624,33306,Avatar: The Last Airbender
...,...,...,...,...
149882,54806220,20.000000,1,Балканский рубеж | The Balkan Line (2019)
149935,54847029,11034.363636,11,Tomb Raider (Trinity)
149993,54881538,31.000000,1,Captives (1994)
150036,54899295,2473.000000,1,Chroniric XIX (Video Game)


## Download for Deepnote

In [125]:
# Write the per-fandom table to a CSV file, leaving out the DataFrame index.
word_cache_counts.to_csv(path_or_buf='word_cache_counts.csv', index=False)

## Plot with bokeh

Too many rows to plot at once! Instead, randomly sample `sample_size` rows (set below) using the DataFrame `sample` method.

In [92]:
# Tell Bokeh to render show() output inline in the notebook.
output_notebook()

In [107]:
# Number of fandoms drawn at random for each scatter plot below.
sample_size = 100

In [128]:
# Scatter plot of a random subset of fandoms: number of fics (x) against
# average word count (y).
# random_state pins the draw so re-running the cell shows the same fandoms, and
# the min() guard stops sample() from raising when fewer than sample_size rows exist.
n_sampled = min(sample_size, len(word_cache_counts))
sample_df = word_cache_counts.sample(n_sampled, random_state=42)
source = ColumnDataSource(sample_df)

p = figure(
    tools="pan,wheel_zoom,save,reset, tap", active_scroll='wheel_zoom',
    width=800, height=800
)
p.scatter(x="cached_count", y='average_word_count',
         source = source,
         size=10, fill_color='#18323d', line_color = '#18323d', fill_alpha=0.5)

# Report the number of points actually drawn, which can be below sample_size.
p.title.text = "Scatterplot of Average Word Count and Cached Count of "+str(n_sampled)+" Fandoms"
p.xaxis.axis_label = 'Number of Fics on AO3'
p.yaxis.axis_label = 'Average Word Count'

# Hover tooltip listing the fandom name, fic count and average word count of a point.
hover = HoverTool()
hover.tooltips=[
    ('Fandom', '@name'),
    ('#Fics', '@cached_count'),
    ('Avg Word Count', '@average_word_count')
]
p.add_tools(hover)

show(p)

### Try limiting to fandoms with more than 100 fics

In [120]:
# Keep only the fandoms whose cached_count is strictly greater than min_fics.
min_fics = 100
has_enough_fics = word_cache_counts['cached_count'].gt(min_fics)
word_cache_counts_min = word_cache_counts[has_enough_fics]
len(word_cache_counts_min)

5472

In [123]:
# Same scatter plot, restricted to the fandoms that passed the min_fics filter.
# random_state pins the draw so re-running the cell shows the same fandoms, and
# the min() guard stops sample() from raising when fewer than sample_size rows exist.
n_sampled_min = min(sample_size, len(word_cache_counts_min))
sample_df_min = word_cache_counts_min.sample(n_sampled_min, random_state=42)
source_min = ColumnDataSource(sample_df_min)

p = figure(
    tools="pan,wheel_zoom,save,reset, tap", active_scroll='wheel_zoom',
    width=800, height=800
)
p.scatter(x="cached_count", y='average_word_count',
         source = source_min,
         size=10, fill_color='#18323d', line_color = '#18323d', fill_alpha=0.5)

# The title reports the number of points actually drawn, and says "more than"
# because the filter keeps fandoms with cached_count strictly above min_fics.
p.title.text = "Scatterplot of Average Word Count and Cached Count of "+str(n_sampled_min)+" Fandoms (more than "+str(min_fics)+" fics)"
p.xaxis.axis_label = 'Number of Fics on AO3'
p.yaxis.axis_label = 'Average Word Count'

# Hover tooltip listing the fandom name, fic count and average word count of a point.
hover = HoverTool()
hover.tooltips=[
    ('Fandom', '@name'),
    ('#Fics', '@cached_count'),
    ('Avg Word Count', '@average_word_count')
]
p.add_tools(hover)

show(p)