In [69]:
# imports libraries
import pickle						# import/export lists
import re 							# regular expression
import math							# mathematical functions
import numpy as np
import pandas as pd					# dataframes

In [2]:
# Read the raw story records back from the local pickle file.
# NOTE: unpickling can execute arbitrary code, so 'data_stories' must be a
# file from a trusted source.
with open('data_stories', 'rb') as handle:
    data_stories = pickle.load(handle)

In [45]:
# Wrap the raw records in a dataframe and name its columns; the label
# assignment only succeeds when the frame has exactly six columns.
story_columns = ['storyid', 'userid', 'cat', 'title', 'summary', 'info']
df = pd.DataFrame(data_stories)
df.columns = story_columns

In [46]:
# Replace each link stored in 'userid' with the text that follows its first
# '=' (the capture is greedy and runs to the end of the string).
# Rows holding the 'NA' placeholder are left untouched.
uid = re.compile("=(.*)$")
has_user = df['userid'] != 'NA'
user_links = df.loc[has_user, 'userid']
df.loc[has_user, 'userid'] = [uid.search(link).group(1) for link in user_links]

In [47]:
# Split 'cat' into 'category' (element 0) and 'fandom' (element 1); both
# columns default to 'NA', and 'cat' is dropped afterwards.
df['category'] = 'NA'
df['fandom'] = 'NA'
has_cat = df['cat'] != 'NA'
df.loc[has_cat, 'category'] = [parts[0] for parts in df.loc[has_cat, 'cat']]
# a fandom is only taken when 'cat' is a list with at least two elements
hasfandom = [type(parts) is list and len(parts) > 1 for parts in df['cat']]
fandom_source = df.loc[hasfandom, 'cat']
df.loc[hasfandom, 'fandom'] = [parts[1] for parts in fandom_source]
del df['cat']

In [63]:
# Break each 'info' string into a list of fields on the ' - ' separator.
# Only rows whose userid is not the 'NA' placeholder are processed.
has_user = df['userid'] != 'NA'
info_text = df.loc[has_user, 'info']
df.loc[has_user, 'info'] = [text.split(' - ') for text in info_text]

In [72]:
# Move the first info field into the new 'rated' column ('NA' by default)
# and keep the remaining fields in 'info'.
# Each run consumes one field, so this cell must be run exactly once.
df['rated'] = 'NA'
has_user = df['userid'] != 'NA'
info_fields = df.loc[has_user, 'info']
df.loc[has_user, 'rated'] = [fields[0] for fields in info_fields]
df.loc[has_user, 'info'] = [fields[1:] for fields in info_fields]

In [83]:
# Move the first remaining info field into the new 'language' column
# ('NA' by default) and keep the rest in 'info'.
# Each run consumes one field, so this cell must be run exactly once.
df['language'] = 'NA'
has_user = df['userid'] != 'NA'
info_fields = df.loc[has_user, 'info']
df.loc[has_user, 'language'] = [fields[0] for fields in info_fields]
df.loc[has_user, 'info'] = [fields[1:] for fields in info_fields]

In [94]:
# retrieves genre
# A row carries a genre when its first remaining info field is non-empty,
# contains no ': ' (so it is not a 'key: value' field) and does not start
# with a space.  Rows whose info list is already empty, or whose first field
# is an empty string, are skipped instead of raising IndexError.
# NOTE(review): a field without ': ' that is really a character list would
# also pass this test -- confirm against the data.
isgenre = [type(row) is list
           and len(row) > 0
           and row[0] != ''
           and ': ' not in row[0]
           and row[0][0] != ' '
           for row in df['info']]
genre_rows = df.loc[isgenre, 'info']
df['genre'] = 'NA'
# the first field becomes the genre; the remaining fields stay in 'info'
df.loc[isgenre, 'genre'] = [row[0] for row in genre_rows]
df.loc[isgenre, 'info'] = [row[1:] for row in genre_rows]

In [98]:
# retrieves characters
# A row carries a character field when its first remaining info field
# contains no ': ' (so it is not a 'key: value' field).  Rows whose info list
# is already empty are skipped instead of raising IndexError on row[0].
ischaracter = [type(row) is list and len(row) > 0 and ': ' not in row[0]
               for row in df['info']]
character_rows = df.loc[ischaracter, 'info']
df['characters'] = 'NA'
# the first field becomes the characters; the remaining fields stay in 'info'
df.loc[ischaracter, 'characters'] = [row[0] for row in character_rows]
df.loc[ischaracter, 'info'] = [row[1:] for row in character_rows]

In [100]:
# retrieves chapters
# A row carries a chapter field when its first remaining info field contains
# the text 'Chapters'.  Rows whose info list is already empty are skipped
# instead of raising IndexError on row[0].
ischapter = [type(row) is list and len(row) > 0 and 'Chapters' in row[0]
             for row in df['info']]
chapter_rows = df.loc[ischapter, 'info']
df['chapters'] = 'NA'
# the whole field is stored as-is; the remaining fields stay in 'info'
df.loc[ischapter, 'chapters'] = [row[0] for row in chapter_rows]
df.loc[ischapter, 'info'] = [row[1:] for row in chapter_rows]

In [None]:
# the next item is either characters, chapters, or words
# genre does not contain ',' '.' or ':'
# characters contain space ' ' before and after

In [101]:
# preview the first rows only instead of rendering the whole frame
df.head(10)

Unnamed: 0,storyid,userid,title,summary,info,category,fandom,rated,language,genre,characters,chapters
0,36526,,,,,,,,,,,
1,52417,,,,,,,,,,,
2,54058,,,,,,,,,,,
3,70618,,,,,,,,,,,
4,87424,,,,,,,,,,,
5,95917,,,,,,,,,,,
6,106985,24686,Fractal Images (2 of 2),Sometimes you can't see the circles for the crops,"[Words: 2,242, Reviews: 2, Favs: 1, Published:...",TV Shows,X-Files,Rated: Fiction T,English,Romance,,
7,107154,,,,,,,,,,,
8,127927,,,,,,,,,,,
9,128600,,,,,,,,,,,


In [4]:
# Expand each entry of 'tabs' into its own columns, fill the gaps with '0'
# and convert every expanded column to numeric.
# NOTE(review): the story frame built earlier in this file has no 'tabs'
# column -- presumably df is a different (profile) frame here; confirm
# before running the notebook top to bottom.
tabs = df['tabs'].apply(pd.Series)
tabs = tabs.fillna('0').apply(pd.to_numeric)
# drop the first character of every expanded column label
tabs.columns = [label[1:] for label in tabs.columns]
df = df.join(tabs)
del df['tabs']

In [5]:
# Derive 'status' from 'desc': 'inactive' by default, 'reader' when the
# description contains 'reader', 'author' when it contains 'author'.
# 'author' is assigned last, so it wins when both words are present.
df['status'] = 'inactive'
has_reader = ['reader' in text for text in df['desc']]
has_author = ['author' in text for text in df['desc']]
df.loc[has_reader, 'status'] = 'reader'
df.loc[has_author, 'status'] = 'author'
del df['desc']

In [6]:
# Parse 'join_date' into 'join_month' and a two-character 'join_year'.
# Dates are split on runs of '-' or '/'; element 0 is taken as the month and
# element 2 as the year.  Rows whose first element is 'NA' keep 'NA' in both.
df['join'] = [re.split(r'[-/]+', text) for text in df['join_date']]
valid = [parts[0] != 'NA' for parts in df['join']]
valid_parts = df['join'][valid]

df['join_month'] = 'NA'
df['join_year'] = 'NA'
df.loc[valid, 'join_month'] = [parts[0] for parts in valid_parts]
df.loc[valid, 'join_year'] = [parts[2] for parts in valid_parts]

# four-character years are cut down to their last two characters
old_time = [len(year) == 4 for year in df['join_year']]
long_years = df.loc[old_time, 'join_year']
df.loc[old_time, 'join_year'] = [year[2:] for year in long_years]

# the helper column with the split parts is no longer needed
del df['join']

In [7]:
# Persist the finished dataframe as a pickle file named "df_profile".
output_path = "df_profile"
df.to_pickle(output_path)