In [26]:
import pandas as pd
from uuid import uuid4

In [27]:
from book_tools.clean import (
    Cleaner, FillNA, Exclude, Substitute,
    FTFYFixText, RemoveNonAscii, FixMultipleSpaces,
    PadPunctuation, ParseHumanName, Lower, RemovePunctuation,
    FixedListLength, Split, ListFlatten, SplitCombinedNames
)

In [28]:
# Clean Books of Titans

In [29]:
# Load the Books of Titans export and keep one row per distinct
# (book, author) pair, re-indexed from 0.
bot = (
    pd.read_csv('04_books_deduped.csv')
    .loc[:, ['book', 'author']]
    .drop_duplicates()
    .reset_index(drop=True)
)
bot.head()

Unnamed: 0,book,author
0,The Gift of Fear,Gavin de Becker
1,Getting the Love You Want,Harville Hendrix Ph.D. and Helen LaKelly Hunt PhD
2,Codependent No More,Melody Beattie
3,Spark,John J. Ratey MD
4,The Surrender Experiment,Michael A. Singer


In [34]:
# Number of rows currently held in `bot`.
# NOTE(review): the saved output appears to come from an out-of-order run
# (this cell was executed after the cleaning cell below); on a fresh
# "Restart & Run All" it reports the row count *before* cleaning.
bot.shape[0]

1646

In [30]:
# Clean the Books of Titans frame, derive the matching columns
# (title / subtitle / author name parts), tag rows with an id and write
# the result to CSV.
#
# Steps are registered on a Cleaner with add() and applied with compute(),
# which returns the transformed frame.
# NOTE(review): this cell relies on compute() not re-running steps from an
# earlier batch (for example, the 'split_author' steps are registered before
# that column is dropped, and more compute() calls follow) -- confirm against
# book_tools.clean.Cleaner.
cleaner = Cleaner()

# Name-part columns, in the positional order of each flattened
# 'split_author' list: three parts for author 1, then three for author 2.
author_columns = [
    'author_1_given_name', 'author_1_middle_name', 'author_1_surname',
    'author_2_given_name', 'author_2_middle_name', 'author_2_surname'
]

cleaner.add(FillNA('author', value='Unknown'))
# Apply Clerical Fixes to 'author' column.
clf = pd.read_csv('clerical_fixes__author.csv')
# Map each original author string to its manual correction.
clf_map = dict(zip(clf['orig'], clf['fix']))
cleaner.add(Substitute('author', subs=clf_map))

# Apply general text fixes
for column in ['author', 'book']:
    cleaner.add(FTFYFixText(column))
    cleaner.add(RemoveNonAscii(column))
    cleaner.add(FixMultipleSpaces(column))

bot = cleaner.compute(bot)

# Split combined author names
cleaner.add(SplitCombinedNames('author'))
bot = cleaner.compute(bot)

# One row per individual author; the clerical fixes are applied a second
# time so they can also match the now-separated names.
bot = bot.explode('author')
cleaner.add(Substitute('author', subs=clf_map))
cleaner.add(PadPunctuation('author'))
bot = cleaner.compute(bot)

# Parse Human Names
cleaner.add(ParseHumanName('author', out_label='split_author'))
bot = cleaner.compute(bot)

# Combine, Flatten and Truncate author names
# Back to one row per book: every other column becomes a list per book.
bot = bot.groupby('book').agg(list).reset_index()
cleaner.add(ListFlatten('split_author'))
cleaner.add(FixedListLength('split_author', length=6, pad_value=''))
bot = cleaner.compute(bot)
# Expand the fixed-length lists into labelled columns. Building a frame
# (instead of tuple-unpacking zip(*...)) checks the list width against the
# column names and does not fail on an empty frame.
author_parts = pd.DataFrame(
    bot['split_author'].tolist(), index=bot.index, columns=author_columns
)
bot = bot.drop(columns=['split_author']).join(author_parts)

# Split book title and subtitle
cleaner.add(Split('book', out_label='split_book', separator=':', maxsplit=1))
cleaner.add(FixedListLength('split_book', length=2, pad_value=''))
bot = cleaner.compute(bot)
title_parts = pd.DataFrame(
    bot['split_book'].tolist(), index=bot.index, columns=['title', 'subtitle']
)
bot = bot.drop(columns=['split_book']).join(title_parts)

# Final Text Fixes
match_columns = ['title', 'subtitle'] + author_columns
for column in match_columns:
    cleaner.add(Lower(column))
    cleaner.add(RemovePunctuation(column))
    cleaner.add(FixMultipleSpaces(column))

bot = cleaner.compute(bot)

# Add uuid4 id
# One fresh random id per row; ids therefore change on every re-run.
bot['uuid'] = [f"bot:{uuid4()}" for _ in range(len(bot))]

# Reorder Columns
columns = ['uuid', 'book', 'author'] + match_columns
bot = bot[columns]

# Write to CSV
bot.to_csv('01_books_of_titans_clean.csv', index=False)

bot.head()

Unnamed: 0,uuid,book,author,title,subtitle,author_1_given_name,author_1_middle_name,author_1_surname,author_2_given_name,author_2_middle_name,author_2_surname
0,bot:d56c28d9-9fed-4a6e-960b-e4c2a75117dd,10% Happier,[Dan Harris],10 happier,,dan,,harris,,,
1,bot:63a1c3d6-663a-42d0-9eff-45d4cceb10eb,100 Secrets of the Art World,"[Thomas Girst, Magnus Resch]",100 secrets of the art world,,thomas,,girst,magnus,,resch
2,bot:7dca16f6-e960-4a84-a230-31219a9444bf,101 Knife Designs: Practical Knives for Daily Use,[Murray Carter],101 knife designs,practical knives for daily use,murray,,carter,,,
3,bot:0efc2f03-65c5-4654-8238-4dcc7bc1c96a,12 Rules for Life,[Jordan Peterson],12 rules for life,,jordan,,peterson,,,
4,bot:e4ebaa54-42f8-4173-a57f-7c2f00edebd8,13 Secrets for Speaking Fluent Japanese,[Giles Murray],13 secrets for speaking fluent japanese,,giles,,murray,,,


In [31]:
# Clean Audiobooks

In [32]:
# Load the audiobook export, align its column name with the Books of Titans
# frame ('title' -> 'book') and keep only the columns used for matching.
aud = pd.read_csv('audiobooks.csv')
aud = aud.rename(columns={'title': 'book'})
aud = aud[['book', 'author']]
# Rows without a book title cannot be matched.
aud = aud.dropna(subset=['book'])
# Drop repeated (book, author) rows, as is done for `bot`: the cleaning cell
# groups by book and keeps the first six flattened name parts, so a repeated
# row would otherwise surface the same author as both author 1 and author 2.
aud = aud.drop_duplicates().reset_index(drop=True)
aud.head()

Unnamed: 0,book,author
0,The Book of Joy: Lasting Happiness in a Changi...,Dalai Lama XIV
1,The Secret Lives of Introverts: Inside Our Hid...,Jenn Granneman
2,Find Your Why: A Practical Guide to Discoverin...,Simon Sinek
3,"Outrage, Inc.: How the Liberal Mob Ruined Scie...",Derek Hunter
4,Blink: The Power of Thinking Without Thinking,Malcolm Gladwell


In [33]:
# Clean the audiobooks frame, derive the matching columns
# (title / subtitle / author name parts), tag rows with an id and write
# the result to CSV.
#
# Steps are registered on a Cleaner with add() and applied with compute(),
# which returns the transformed frame.
# NOTE(review): this cell relies on compute() not re-running steps from an
# earlier batch (for example, the 'split_author' steps are registered before
# that column is dropped, and more compute() calls follow) -- confirm against
# book_tools.clean.Cleaner.
cleaner = Cleaner()

# Name-part columns, in the positional order of each flattened
# 'split_author' list: three parts for author 1, then three for author 2.
author_columns = [
    'author_1_given_name', 'author_1_middle_name', 'author_1_surname',
    'author_2_given_name', 'author_2_middle_name', 'author_2_surname'
]

# Apply general text fixes
for column in ['author', 'book']:
    cleaner.add(FillNA(column, value=''))
    cleaner.add(FTFYFixText(column))
    cleaner.add(RemoveNonAscii(column))
    cleaner.add(FixMultipleSpaces(column))

aud = cleaner.compute(aud)

# Parse Human Names
cleaner.add(ParseHumanName('author', out_label='split_author'))
aud = cleaner.compute(aud)

# Combine, Flatten and Truncate author names
# One row per book: every other column becomes a list per book.
aud = aud.groupby('book').agg(list).reset_index()
cleaner.add(ListFlatten('split_author'))
cleaner.add(FixedListLength('split_author', length=6, pad_value=''))
aud = cleaner.compute(aud)
# Expand the fixed-length lists into labelled columns. Building a frame
# (instead of tuple-unpacking zip(*...)) checks the list width against the
# column names and does not fail on an empty frame.
author_parts = pd.DataFrame(
    aud['split_author'].tolist(), index=aud.index, columns=author_columns
)
aud = aud.drop(columns=['split_author']).join(author_parts)

# Split book title and subtitle
cleaner.add(Split('book', out_label='split_book', separator=':', maxsplit=1))
cleaner.add(FixedListLength('split_book', length=2, pad_value=''))
aud = cleaner.compute(aud)
title_parts = pd.DataFrame(
    aud['split_book'].tolist(), index=aud.index, columns=['title', 'subtitle']
)
aud = aud.drop(columns=['split_book']).join(title_parts)

# Final Text Fixes
match_columns = ['title', 'subtitle'] + author_columns
for column in match_columns:
    cleaner.add(Lower(column))
    cleaner.add(RemovePunctuation(column))
    cleaner.add(FixMultipleSpaces(column))

aud = cleaner.compute(aud)

# Add uuid4 id
# One fresh random id per row; ids therefore change on every re-run.
aud['uuid'] = [f"aud:{uuid4()}" for _ in range(len(aud))]

# Reorder Columns
columns = ['uuid', 'book', 'author'] + match_columns
aud = aud[columns]

# Write to CSV
aud.to_csv('01_audiobooks_clean.csv', index=False)

aud.head()

Unnamed: 0,uuid,book,author,title,subtitle,author_1_given_name,author_1_middle_name,author_1_surname,author_2_given_name,author_2_middle_name,author_2_surname
0,aud:e82e5be9-de17-402d-948d-1d3dcfd03577,12 Rules for Life: An Antidote to Chaos,[Jordan B. Peterson],12 rules for life,an antidote to chaos,jordan,b,peterson,,,
1,aud:896a6acb-d722-448f-b441-7148436420dd,21 Lessons for the 21st Century,[Yuval Noah Harari],21 lessons for the 21st century,,yuval,noah,harari,,,
2,aud:4c3bb39d-2032-4dec-81bf-c118d7fbc17f,A Brief History of Time,[Stephen Hawking],a brief history of time,,stephen,,hawking,,,
3,aud:ea0da6d0-c7b9-489d-8b57-07c9046731e5,A Slip of the Keyboard,[Terry Pratchett],a slip of the keyboard,,terry,,pratchett,,,
4,aud:34eeb21b-cf54-4dbb-b0ea-70792a789358,"A Troublesome Inheritance: Genes, Race and Hum...",[Nicholas J. Wade],a troublesome inheritance,genes race and human history,nicholas,j,wade,,,
