In [12]:
import os
import pandas as pd
import collections
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
import re
import numpy as np

In [None]:
def load_data(path_to_folder):
    all_files = os.listdir(path_to_folder) # get all files
    valid_files = [file for file in all_files if 'errors' not in file] # remove error logs
    dfs = [] # empty list
    for file in valid_files:
        file_path = os.path.join(path_to_folder, file)
        filename = str(file.split('.')[0])
        fandom = str(filename.split('_')[0])
        au = str(filename.split('_')[1])
        try:
            print(f"Loading file: {file}")
            df = pd.read_csv(file_path)
            df['Main fandom'] = fandom
            df['Main AU'] = au
            dfs.append(df) # load files into list of dfs
            print(f"Finished loading file: {file}")
        except Exception as e:
            print(f"Could not read {file}: {e}") # in case sth is missed
    big_df = pd.concat(dfs, ignore_index=True) # pull into df
    return big_df

In [3]:
path = os.path.join('..', '..', 'texts')

In [19]:
df = load_data(path)

hp_coffeeshop
Loading file: hp_coffeeshop.csv
Finished loading file: hp_coffeeshop.csv
hp_vampire
Loading file: hp_vampire.csv
Finished loading file: hp_vampire.csv
mcu_vampire
Loading file: mcu_vampire.csv
Finished loading file: mcu_vampire.csv
bts_vampire
Loading file: bts_vampire.csv
Finished loading file: bts_vampire.csv
bnha_royalty
Loading file: bnha_royalty.csv
Finished loading file: bnha_royalty.csv
mcu_royalty
Loading file: mcu_royalty.csv
Finished loading file: mcu_royalty.csv
mcu_coffeeshop
Loading file: mcu_coffeeshop.csv
Finished loading file: mcu_coffeeshop.csv
bnha_coffeeshop
Loading file: bnha_coffeeshop.csv
Finished loading file: bnha_coffeeshop.csv
bts_coffeeshop
Loading file: bts_coffeeshop.csv
Finished loading file: bts_coffeeshop.csv
bnha_vampire
Loading file: bnha_vampire.csv
Finished loading file: bnha_vampire.csv
mcu_soulmates
Loading file: mcu_soulmates.csv
Finished loading file: mcu_soulmates.csv
bts_soulmates
Loading file: bts_soulmates.csv
Finished loading f

In [23]:
df.columns

Index(['work_id', 'title', 'author', 'rating', 'category', 'fandom',
       'relationship', 'character', 'additional tags', 'language', 'published',
       'status', 'status date', 'words', 'chapters', 'comments', 'kudos',
       'bookmarks', 'hits', 'all_kudos', 'all_bookmarks', 'body',
       'Main fandom', 'Main AU'],
      dtype='object')

In [11]:
df = df.drop_duplicates(subset='work_id')

In [33]:
unique_types = df['words'].apply(type).unique()
print(unique_types)

[<class 'str'> <class 'float'>]


In [20]:
df.dtypes

work_id             int64
title              object
author             object
rating             object
category           object
fandom             object
relationship       object
character          object
additional tags    object
language           object
published          object
status             object
status date        object
words              object
chapters           object
comments           object
kudos              object
bookmarks          object
hits               object
all_kudos          object
all_bookmarks      object
body               object
Main fandom        object
Main AU            object
dtype: object

In [5]:
def str_to_list(value):
    if pd.isna(value): 
        value = []
        return value
    elif isinstance(value, str):
        value = value.strip("[]").replace("'", "").split(", ")
        return value
    else:
        pass
    
def str_cols_to_list(data, cols):
    for col in cols:
        data[col] = data[col].apply(str_to_list)
    return data

In [21]:
str_columns = ['author', 'category', 'fandom', 'relationship', 'character', 'additional tags', 'all_kudos']
df = str_cols_to_list(df, str_columns)

In [7]:
def str_to_int(value):
    if pd.isna(value): 
        value = 0
        value = int(value)
        return value
    elif isinstance(value, int):
        pass
    elif isinstance(value, float):
        value = int(value)
        return(value)
    elif isinstance(value, str):
        value = value.replace(",", "").replace(".0", "")
        value = int(value)
        return value
    else:
        print(f"Look at this weird value: {value}")

def str_cols_to_int(data, cols):
    for col in cols:
        data[col] = data[col].apply(str_to_int)
    return data

In [22]:
int_columns = ['words', 'comments', 'kudos', 'bookmarks', 'hits']
df = str_cols_to_int(df, int_columns)

In [23]:
test=pd.DataFrame

In [25]:
def int_to_log(value):
    if value!=0:
        value = np.log10(value)
    return value
def int_cols_to_log(data, cols):
    for col in cols:
        data[f'log_{col}'] = data[col].apply(int_to_log)
    return data

In [28]:
int_columns = ['words', 'comments', 'kudos', 'bookmarks', 'hits']
df = int_cols_to_log(df, int_columns)

In [29]:
df.dtypes

work_id              int64
title               object
author              object
rating              object
category            object
fandom              object
relationship        object
character           object
additional tags     object
language            object
published           object
status              object
status date         object
words                int64
chapters            object
comments             int64
kudos                int64
bookmarks            int64
hits                 int64
all_kudos           object
all_bookmarks       object
body                object
Main fandom         object
Main AU             object
log_words          float64
log_comments       float64
log_kudos          float64
log_bookmarks      float64
log_hits           float64
dtype: object

In [9]:
df.iloc[0:20, 13]

0       4457
1     113031
2        574
3      37345
4       5653
5       3452
6       6182
7      25406
8       3184
9       1875
10     19856
11     12735
12     32581
13     16707
14      9077
15      4976
16       580
17       797
18      3302
19      2515
Name: words, dtype: int64

In [84]:
df.iloc[0:20, 15]

0      0
1     72
2      0
3     20
4     32
5     13
6      6
7     89
8      1
9      4
10     2
11    35
12    17
13     0
14     0
15     5
16     6
17     0
18     2
19     0
Name: comments, dtype: int64

In [85]:
df.iloc[0:20, 16]

0       1
1     151
2       2
3     292
4      15
5      11
6      64
7     169
8      46
9      13
10     29
11    276
12     97
13      3
14     14
15     55
16      9
17      1
18      5
19      7
Name: kudos, dtype: int64

In [86]:
df.iloc[0:20, 17]

0       0
1      56
2       0
3     183
4       3
5       5
6       8
7      38
8       4
9       0
10      6
11     53
12     53
13      0
14      3
15      8
16      1
17      0
18      1
19      1
Name: bookmarks, dtype: int64

In [87]:
df.iloc[0:20, 17]

0       0
1      56
2       0
3     183
4       3
5       5
6       8
7      38
8       4
9       0
10      6
11     53
12     53
13      0
14      3
15      8
16      1
17      0
18      1
19      1
Name: bookmarks, dtype: int64

In [None]:
df['hits'].plot(kind='hist', bins=1000000)
plt.ylim(0, 10)
plt.xlim(0, 1000000)

In [8]:
meta = df.drop(columns='body')
txt = df[['work_id', 'body', 'Main fandom', 'Main AU']]