In [134]:
# imports libraries
import pickle						# import/export lists
import re 							# regular expression
import math							# mathematical functions
import pandas as pd					# dataframes

In [135]:
# loads the raw story records from the local pickle file 'data_stories'
# NOTE: unpickling can run arbitrary code -- only load files you produced yourself
with open('data_stories', mode='rb') as raw_file:
    data_stories = pickle.load(raw_file)

In [136]:
# builds the dataframe from the raw records and labels its six columns
story_columns = ['storyid', 'userid', 'cat', 'title', 'summary', 'info']
df = pd.DataFrame(data_stories)
df.columns = story_columns

In [137]:
# replaces each non-'NA' userid link with the text captured after an '=' sign
uid = re.compile("=(.*)$")
has_user = df.userid != 'NA'
user_links = df.loc[has_user, 'userid']
df.loc[has_user, 'userid'] = [uid.search(link).group(1) for link in user_links]

In [138]:
# separates the 'cat' entries into a 'category' and a 'fandom' column
df['category'] = 'NA'
df['fandom'] = 'NA'

# the first element of every non-'NA' entry becomes the category
has_cat = df['cat'] != 'NA'
df.loc[has_cat, 'category'] = [entry[0] for entry in df.loc[has_cat, 'cat']]

# only list entries holding more than one element carry a fandom
hasfandom = [type(entry) is list and len(entry) > 1 for entry in df['cat']]
df.loc[hasfandom, 'fandom'] = [entry[1] for entry in df.loc[hasfandom, 'cat']]

# the combined column is no longer needed
del df['cat']

In [139]:
# breaks each info string into its ' - '-separated fields (rows with a userid only)
has_info = df.userid != 'NA'
df.loc[has_info, 'info'] = [text.split(' - ') for text in df.loc[has_info, 'info']]

In [140]:
# peels the rating (first info field) off the front of each info list
df['rated'] = 'NA'
has_info = df.userid != 'NA'
info_parts = df.loc[has_info, 'info']
df.loc[has_info, 'rated'] = [parts[0] for parts in info_parts]
df.loc[has_info, 'info'] = [parts[1:] for parts in info_parts]

In [141]:
# moves the language (now the first info field) into its own column
df['language'] = 'NA'
with_user = df.userid != 'NA'
remaining = df.loc[with_user, 'info']
languages = [fields[0] for fields in remaining]
leftovers = [fields[1:] for fields in remaining]
df.loc[with_user, 'language'] = languages
df.loc[with_user, 'info'] = leftovers

In [142]:
# retrieves genre: the leading info field counts as a genre when it is neither
# a 'label: value' pair nor starts with a space
def _leads_with_genre(parts):
    """Return True when parts is a list whose first field has no ': ' and no leading space."""
    if type(parts) is not list:
        return False
    head = parts[0]
    return not (': ' in head or head[0] == ' ')


isgenre = [_leads_with_genre(parts) for parts in df['info']]
df['genre'] = 'NA'
genre_rows = df.loc[isgenre, 'info']
df.loc[isgenre, 'genre'] = [parts[0] for parts in genre_rows]
df.loc[isgenre, 'info'] = [parts[1:] for parts in genre_rows]

In [143]:
# retrieves characters: any leading info field without a ': ' label is taken
# as the character list
ischaracter = [type(fields) is list and not (': ' in fields[0]) for fields in df['info']]
df['characters'] = 'NA'
character_rows = df.loc[ischaracter, 'info']
df.loc[ischaracter, 'characters'] = [fields[0] for fields in character_rows]
df.loc[ischaracter, 'info'] = [fields[1:] for fields in character_rows]

In [144]:
# retrieves chapters: when the leading info field mentions 'Chapters', keep only
# its digits in the 'chapters' column and drop the field from the info list
# (the length guard keeps an empty info list from raising IndexError)
ischapter = [type(row) is list and len(row) > 0 and 'Chapters' in row[0]
             for row in df['info']]
df['chapters'] = 'NA'
# raw string: "\D" in a plain literal is an invalid escape sequence
df.loc[ischapter, 'chapters'] = [re.sub(r"\D", "", row[0])
                                 for row in df.loc[ischapter, 'info']]
df.loc[ischapter, 'info'] = [row[1:]
                             for row in df.loc[ischapter, 'info']]

In [145]:
# retrieves words: the leading info field of every list row is treated as the
# word count; only its digits are kept and the field is dropped from the list
# NOTE(review): unlike the neighbouring cells this does not check the field's
# label -- it presumably relies on every story having a word count; confirm
# (the length guard keeps an empty info list from raising IndexError)
iswords = [type(row) is list and len(row) > 0 for row in df['info']]
df['words'] = 'NA'
# raw string: "\D" in a plain literal is an invalid escape sequence
df.loc[iswords, 'words'] = [re.sub(r"\D", "", row[0])
                            for row in df.loc[iswords, 'info']]
df.loc[iswords, 'info'] = [row[1:]
                           for row in df.loc[iswords, 'info']]

In [146]:
# retrieves reviews: when the leading info field mentions 'Reviews', keep only
# its digits in the 'reviews' column and drop the field from the info list
# (the length guard keeps an empty info list from raising IndexError)
isreviews = [type(row) is list and len(row) > 0 and 'Reviews' in row[0]
             for row in df['info']]
df['reviews'] = 'NA'
# raw string: "\D" in a plain literal is an invalid escape sequence
df.loc[isreviews, 'reviews'] = [re.sub(r"\D", "", row[0])
                                for row in df.loc[isreviews, 'info']]
df.loc[isreviews, 'info'] = [row[1:]
                             for row in df.loc[isreviews, 'info']]

In [147]:
# retrieves favs: when the leading info field mentions 'Favs', keep only its
# digits in the 'favs' column and drop the field from the info list
# (the length guard keeps an empty info list from raising IndexError)
isfavs = [type(row) is list and len(row) > 0 and 'Favs' in row[0]
          for row in df['info']]
df['favs'] = 'NA'
# raw string: "\D" in a plain literal is an invalid escape sequence
df.loc[isfavs, 'favs'] = [re.sub(r"\D", "", row[0])
                          for row in df.loc[isfavs, 'info']]
df.loc[isfavs, 'info'] = [row[1:]
                          for row in df.loc[isfavs, 'info']]

In [148]:
# retrieves follows: when the leading info field mentions 'Follows', keep only
# its digits in the 'follows' column and drop the field from the info list
# (the length guard keeps an empty info list from raising IndexError)
isfollows = [type(row) is list and len(row) > 0 and 'Follows' in row[0]
             for row in df['info']]
df['follows'] = 'NA'
# raw string: "\D" in a plain literal is an invalid escape sequence
df.loc[isfollows, 'follows'] = [re.sub(r"\D", "", row[0])
                                for row in df.loc[isfollows, 'info']]
df.loc[isfollows, 'info'] = [row[1:]
                             for row in df.loc[isfollows, 'info']]

In [149]:
# retrieves updated: a leading info field mentioning 'Updated' is moved,
# unchanged, into the 'updated' column
isupdated = ['Updated' in fields[0] if type(fields) is list else False
             for fields in df['info']]
df['updated'] = 'NA'
updated_rows = df.loc[isupdated, 'info']
df.loc[isupdated, 'updated'] = [fields[0] for fields in updated_rows]
df.loc[isupdated, 'info'] = [fields[1:] for fields in updated_rows]

In [150]:
# retrieves published: a leading info field mentioning 'Published' is moved,
# unchanged, into the 'published' column
def _leads_with_published(fields):
    """Return True when fields is a list whose first field contains 'Published'."""
    return type(fields) is list and 'Published' in fields[0]


ispublished = [_leads_with_published(fields) for fields in df['info']]
df['published'] = 'NA'
published_rows = df.loc[ispublished, 'info']
df.loc[ispublished, 'published'] = [fields[0] for fields in published_rows]
df.loc[ispublished, 'info'] = [fields[1:] for fields in published_rows]

In [151]:
# retrieves status: every story with a userid defaults to 'Incomplete'; when the
# leading info field is the status entry, that field replaces the default and is
# dropped from the info list
# The mask previously tested for 'Published' (copied from the published cell),
# which has already been stripped by this point, so no status was ever picked up.
# NOTE(review): assumes the field is labelled 'Status' (e.g. 'Status: Complete')
# -- confirm against the raw info strings
# (the length guard keeps an empty info list from raising IndexError)
isstatus = [type(row) is list and len(row) > 0 and 'Status' in row[0]
            for row in df['info']]
df['status'] = 'NA'
df.loc[df.userid != 'NA', 'status'] = 'Incomplete'
df.loc[isstatus, 'status'] = [row[0]
                              for row in df.loc[isstatus, 'info']]
df.loc[isstatus, 'info'] = [row[1:]
                            for row in df.loc[isstatus, 'info']]

In [152]:
# removes the leftover 'info' column from the dataframe in place
df.drop('info', axis=1, inplace=True)

In [153]:
# previews the first rows only, rather than rendering the whole dataframe
df.head(10)

Unnamed: 0,storyid,userid,title,summary,category,fandom,rated,language,genre,characters,chapters,words,reviews,favs,follows,updated,published,status
0,36526,,,,,,,,,,,,,,,,,
1,52417,,,,,,,,,,,,,,,,,
2,54058,,,,,,,,,,,,,,,,,
3,70618,,,,,,,,,,,,,,,,,
4,87424,,,,,,,,,,,,,,,,,
5,95917,,,,,,,,,,,,,,,,,
6,106985,24686,Fractal Images (2 of 2),Sometimes you can't see the circles for the crops,TV Shows,X-Files,Rated: Fiction T,English,Romance,,,2242,2,1,,,Published: 11/6/2000,Incomplete
7,107154,,,,,,,,,,,,,,,,,
8,127927,,,,,,,,,,,,,,,,,
9,128600,,,,,,,,,,,,,,,,,


In [154]:
# writes the cleaned dataframe to disk as a pickle
output_path = "df_stories"
df.to_pickle(output_path)