In [1]:
# imports libraries
import pickle						# import/export lists
import re 							# regular expression
import pandas as pd					# dataframes
import datetime						# dates

In [2]:
# loads the pickled raw story data from disk
# NOTE(review): pickle.load can execute arbitrary code -- only unpickle a trusted file
with open('data_stories', 'rb') as raw_file:
    data_stories = pickle.load(raw_file)

In [3]:
# wraps the raw data in a dataframe and labels its seven columns
column_names = ['storyid', 'userid', 'cat', 'title', 'summary', 'info', 'error']
df = pd.DataFrame(data_stories)
df.columns = column_names

In [4]:
# replaces each userid link with the text that follows its first '='
# rows holding the 'NA' placeholder are left untouched
uid = re.compile("=(.*)$")
has_user = df.userid != 'NA'
df.loc[has_user, 'userid'] = [uid.search(link).group(1)
                              for link in df.loc[has_user, 'userid']]

In [5]:
# derives the media and fandom columns from the category list, then drops it
df['media'] = 'NA'
df['fandom'] = 'NA'

# length of each category list; None marks entries that are not lists
cat_sizes = [len(cat) if type(cat) is list else None for cat in df['cat']]
iscontained = [size == 2 for size in cat_sizes]
iscrossover = [size == 1 for size in cat_sizes]

# two entries: first is the media, second is the fandom
contained_cats = df.loc[iscontained, 'cat']
df.loc[iscontained, 'media'] = [cat[0] for cat in contained_cats]
df.loc[iscontained, 'fandom'] = [cat[1] for cat in contained_cats]

# single entry: kept as the fandom, with the fixed media label 'Crossover'
crossover_cats = df.loc[iscrossover, 'cat']
df.loc[iscrossover, 'media'] = 'Crossover'
df.loc[iscrossover, 'fandom'] = [cat[0] for cat in crossover_cats]

del df['cat']

In [6]:
# breaks each info string into its ' - ' separated fields (rows with a userid only)
has_user = df.userid != 'NA'
df.loc[has_user, 'info'] = [text.split(' - ') for text in df.loc[has_user, 'info']]

In [7]:
# takes the rating from the first info field, then removes that field
df['rated'] = 'NA'
has_user = df.userid != 'NA'
info_fields = df.loc[has_user, 'info']

# the rating is the first field with its 'Rated: Fiction  ' prefix removed
df.loc[has_user, 'rated'] = [re.sub('Rated: Fiction  ', '', fields[0])
                             for fields in info_fields]
df.loc[has_user, 'info'] = [fields[1:] for fields in info_fields]

In [8]:
# takes the language from the (new) first info field, then removes that field
df['language'] = 'NA'
has_user = df.userid != 'NA'
info_fields = df.loc[has_user, 'info']

df.loc[has_user, 'language'] = [fields[0] for fields in info_fields]
df.loc[has_user, 'info'] = [fields[1:] for fields in info_fields]

In [9]:
# retrieves genre
def _starts_with_genre(fields):
    """True when fields is a list whose first entry has no ': ' and no leading space."""
    if type(fields) is not list:
        return False
    first = fields[0]
    return ': ' not in first and first[0] != ' '


isgenre = [_starts_with_genre(fields) for fields in df['info']]
df['genre'] = 'NA'

# move the leading field into the genre column and drop it from info
genre_rows = df.loc[isgenre, 'info']
df.loc[isgenre, 'genre'] = [fields[0] for fields in genre_rows]
df.loc[isgenre, 'info'] = [fields[1:] for fields in genre_rows]

In [10]:
# retrieves characters: the leading info field, when it carries no ': ' label
ischaracter = [type(row) is list and ': ' not in row[0] for row in df['info']]
df['characters'] = 'NA'
character_rows = df.loc[ischaracter, 'info']
df.loc[ischaracter, 'characters'] = [row[0] for row in character_rows]
df.loc[ischaracter, 'info'] = [row[1:] for row in character_rows]

# if hyphen in character's name (rare): the earlier ' - ' split cut the name
# in two, so the next field is still unlabelled and holds the rest of the name
ischaracter = [type(row) is list and ': ' not in row[0] for row in df['info']]
# Pair each leftover piece with the first piece stored on the SAME row.
# Zipping against the full first-pass list would line pieces up by position
# in two lists of different membership and attach them to the wrong stories.
# The pieces are concatenated directly; the ' - ' separator is not restored.
split_heads = df.loc[ischaracter, 'characters']
split_rows = df.loc[ischaracter, 'info']
df.loc[ischaracter, 'characters'] = [head + row[0]
                                     for head, row in zip(split_heads, split_rows)]
df.loc[ischaracter, 'info'] = [row[1:] for row in split_rows]

In [11]:
# retrieves chapters: taken from the leading info field when it mentions 'Chapters'
ischapter = [type(row) is list and 'Chapters' in row[0] for row in df['info']]
df['chapters'] = 'NA'
# strip every non-digit character, leaving the count as a digit string
# (raw string: "\D" in a plain literal is an invalid escape sequence)
df.loc[ischapter, 'chapters'] = [re.sub(r"\D", "", row[0])
                                 for row in df.loc[ischapter, 'info']]
# drop the consumed field so the next one becomes the leading field
df.loc[ischapter, 'info'] = [row[1:]
                             for row in df.loc[ischapter, 'info']]

In [12]:
# retrieves words: taken from the leading info field when it mentions 'Words'
# (the label check matches the other count cells, so an unexpected leading
# field is no longer consumed and recorded as the word count)
iswords = [type(row) is list and 'Words' in row[0] for row in df['info']]
df['words'] = 'NA'
# strip every non-digit character, leaving the count as a digit string
# (raw string: "\D" in a plain literal is an invalid escape sequence)
df.loc[iswords, 'words'] = [re.sub(r"\D", "", row[0])
                            for row in df.loc[iswords, 'info']]
# drop the consumed field so the next one becomes the leading field
df.loc[iswords, 'info'] = [row[1:]
                           for row in df.loc[iswords, 'info']]

In [13]:
# retrieves reviews: taken from the leading info field when it mentions 'Reviews'
isreviews = [type(row) is list and 'Reviews' in row[0] for row in df['info']]
df['reviews'] = 'NA'
# strip every non-digit character, leaving the count as a digit string
# (raw string: "\D" in a plain literal is an invalid escape sequence)
df.loc[isreviews, 'reviews'] = [re.sub(r"\D", "", row[0])
                                for row in df.loc[isreviews, 'info']]
# drop the consumed field so the next one becomes the leading field
df.loc[isreviews, 'info'] = [row[1:]
                             for row in df.loc[isreviews, 'info']]

In [14]:
# retrieves favs: taken from the leading info field when it mentions 'Favs'
isfavs = [type(row) is list and 'Favs' in row[0] for row in df['info']]
df['favs'] = 'NA'
# strip every non-digit character, leaving the count as a digit string
# (raw string: "\D" in a plain literal is an invalid escape sequence)
df.loc[isfavs, 'favs'] = [re.sub(r"\D", "", row[0])
                          for row in df.loc[isfavs, 'info']]
# drop the consumed field so the next one becomes the leading field
df.loc[isfavs, 'info'] = [row[1:]
                          for row in df.loc[isfavs, 'info']]

In [15]:
# retrieves follows: taken from the leading info field when it mentions 'Follows'
isfollows = [type(row) is list and 'Follows' in row[0] for row in df['info']]
df['follows'] = 'NA'
# strip every non-digit character, leaving the count as a digit string
# (raw string: "\D" in a plain literal is an invalid escape sequence)
df.loc[isfollows, 'follows'] = [re.sub(r"\D", "", row[0])
                                for row in df.loc[isfollows, 'info']]
# drop the consumed field so the next one becomes the leading field
df.loc[isfollows, 'info'] = [row[1:]
                             for row in df.loc[isfollows, 'info']]

In [16]:
# takes the updated date from the leading info field when it mentions 'Updated'
updated_label = re.compile('Updated: ')
isupdated = [type(fields) is list and 'Updated' in fields[0] for fields in df['info']]
df['updated'] = 'NA'

# keep the field without its label, then drop the field from info
updated_rows = df.loc[isupdated, 'info']
df.loc[isupdated, 'updated'] = [updated_label.sub('', fields[0])
                                for fields in updated_rows]
df.loc[isupdated, 'info'] = [fields[1:] for fields in updated_rows]

In [17]:
# takes the published date from the leading info field when it mentions 'Published'
published_label = re.compile('Published: ')
ispublished = [type(fields) is list and 'Published' in fields[0] for fields in df['info']]
df['published'] = 'NA'

# keep the field without its label, then drop the field from info
published_rows = df.loc[ispublished, 'info']
df.loc[ispublished, 'published'] = [published_label.sub('', fields[0])
                                    for fields in published_rows]
df.loc[ispublished, 'info'] = [fields[1:] for fields in published_rows]

In [18]:
# fills in the status column
status_label = re.compile('Status: ')
isstatus = [type(fields) is list and 'Status' in fields[0] for fields in df['info']]

df['status'] = 'NA'
# every row with a userid starts out as 'Incomplete' ...
df.loc[df.userid != 'NA', 'status'] = 'Incomplete'
# ... and is overwritten when the leading info field carries a status
df.loc[isstatus, 'status'] = [status_label.sub('', fields[0])
                              for fields in df.loc[isstatus, 'info']]

In [19]:
# drops the info column; any fields still left in it are discarded
del df['info']

In [20]:
# converts the count columns to numbers; anything unparseable becomes NaN
intcols = ['chapters', 'words', 'reviews', 'favs', 'follows']
numeric_cols = {name: pd.to_numeric(df[name], errors='coerce') for name in intcols}
for name, values in numeric_cols.items():
    df[name] = values

In [21]:
# splits published dates on '/' into lists and completes two-part dates with a year
# NOTE(review): the year added is the one in which this cell runs -- confirm it
# matches the year the data was collected
cyear = str(datetime.datetime.now().year)
has_published = df.published != 'NA'
pub_date = [stamp.split('/') for stamp in df.loc[has_published, 'published']]
df.loc[has_published, 'published'] = pub_date

# the lists are read back from the frame and extended in place
for parts in df.loc[has_published, 'published']:
    if len(parts) == 2:
        parts.append(cyear)

In [22]:
# splits updated dates on '/' into lists and completes two-part dates with cyear
# (cyear is the year string defined in the published-dates cell)
has_updated = df.updated != 'NA'
upd_date = [stamp.split('/') for stamp in df.loc[has_updated, 'updated']]
df.loc[has_updated, 'updated'] = upd_date

# the lists are read back from the frame and extended in place
for parts in df.loc[has_updated, 'updated']:
    if len(parts) == 2:
        parts.append(cyear)

In [23]:
# labels each story's current state, then drops the error column
is_deleted = df['userid'] == 'NA'
has_error = df['error'] != 'NA'

df['state'] = 'online'
df.loc[is_deleted, 'state'] = 'deleted'
# applied last, so an error takes precedence over 'deleted'
df.loc[has_error, 'state'] = 'missing'

del df['error']

In [24]:
# writes the finished dataframe to disk as a pickle
output_path = "df_stories"
df.to_pickle(output_path)