In [51]:
import json
import re
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from pandasql import sqldf
def pysqldf(q):
    """Run SQL query ``q`` through ``sqldf`` against this notebook's global names."""
    return sqldf(q, globals())

In [52]:
url = "https://www.booksoftitans.com/list"

# Browser-like User-Agent sent with the request.
# NOTE(review): presumably the site rejects the default python-requests agent; confirm.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

# Make a GET request to fetch the raw HTML content.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on a 4xx/5xx reply instead of parsing an error page.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
html_content = response.text

# Parse the html content
soup = BeautifulSoup(html_content, "lxml")

In [53]:
# soup.text

In [54]:
# Locate the table element by its id.
table = soup.find('table', id="tablepress-3")
if table is None:
    # soup.find returns None when nothing matches; fail here with a clear message
    # rather than later, when the missing table would be stringified and parsed.
    raise ValueError('No <table id="tablepress-3"> found in the fetched page')

In [55]:
# str(table)

In [56]:
# Parse the table markup into DataFrames. The markup is wrapped in StringIO because
# recent pandas versions deprecate passing a literal HTML string to read_html.
dfs = pd.read_html(StringIO(str(table)))
len(dfs)

1

In [57]:
# Take the first parsed frame as the working table and show its row count.
df = dfs[0]
len(df)

2454

In [58]:
# Manual corrections applied while deriving base_title: each key is a normalised
# title (lower-case, punctuation removed, leading "the " dropped) and each value
# is the normalised title it should be replaced with.
fixes = {
    'body of science': 'body by science',
    'garden cities of to morrow': 'garden cities of tomorrow',
    'poor richards almanac': 'poor richards almanack',
    'school for good and evil series': 'school for good and evil',
    'secrets of power negotiating audio': 'secrets of power negotiating',
    'humans of new york stories': 'humans of new york',
    'i thought it was just me but it isnt': 'i thought it was just me',
    'i will teach you to be rich second edition': 'i will teach you to be rich',
}

In [9]:
# Rows without a title cannot be normalised. The .copy() makes the filtered frame
# independent, so adding a column below does not raise SettingWithCopyWarning.
df = df.dropna(subset=['Book']).copy()


def _base_title(book):
    """Reduce a full book title to a normalised key for grouping.

    Steps: keep the text before the first ':' or ';', lower-case it, turn
    hyphens into spaces, drop every character that is not a letter, digit or
    whitespace, remove a leading "the ", trim trailing whitespace, and finally
    apply the manual corrections in ``fixes``.

    Trailing whitespace is trimmed *before* the ``fixes`` lookup so that a
    title written as "Name : subtitle" (space before the colon) still matches
    its correction entry.
    """
    title = re.split("[:;]+", book)[0].lower().replace("-", " ")
    title = re.sub(r'[^A-Za-z0-9\s]+', '', title)
    if title.startswith('the '):
        title = title[len('the '):]
    title = title.rstrip()
    return fixes.get(title, title)


df['base_title'] = df['Book'].apply(_base_title)

In [10]:
# Preview the first ten rows, including the derived base_title column.
df.head(10)

Unnamed: 0,Book,Author,Recommended By,Podcast #,base_title
0,Presto!: How I Made Over 100 Pounds Disappear ...,Penn Jillette,Penn Jillette,405,presto
1,"God, No!: Signs You May Already Be an Atheist ...",Penn Jillette,Penn Jillette,405,god no
2,The Art of Thought Reading,Joseph Dunniger,Penn Jillette,405,art of thought reading
3,"Flim-Flam! Psychics, ESP, Unicorns, and Other ...",James Randi,Penn Jillette,405,flim flam psychics esp unicorns and other delu...
4,The Artist's Way,Julia Cameron,Penn Jillette,405,artists way
5,On the Road,Jack Kerouac,Penn Jillette,405,on the road
6,Eat for Life,Joel Fuhrman,Penn Jillette,405,eat for life
7,First Bite,Bee Wilson,Penn Jillette,405,first bite
8,"Surely You're Joking, Mr. Feynman!",Richard Feynman,Penn Jillette,405,surely youre joking mr feynman
9,The Scientist in the Crib,"Alison Gopnik, Andrew N. Meltzoff, and Patrici...",Steve Jurvetson,404,scientist in the crib


In [61]:
# Show any rows whose Book value is exactly the string '######'.
df[df['Book'] == '######']

Unnamed: 0,Book,Author,Recommended By,Podcast #


In [12]:
# One row per distinct (base_title, book, author) combination, each annotated
# with the number of rows in df that share its base_title. Sorted by that count
# (largest first) and then by base_title.
books = pysqldf(
"""
select distinct
    df.base_title,
    book,
    author,
    df_counts.count
from df
inner join (
    select
        base_title,
        count(*) as count
    from df
    group by 1
) as df_counts on df.base_title = df_counts.base_title
order by df_counts.count desc, df.base_title
"""
)
# Replace missing values in any column with the marker string '#####'.
books = books.fillna('#####')
books.head()

Unnamed: 0,base_title,Book,Author,count
0,4 hour workweek,The 4-Hour Workweek,Timothy Ferriss,42
1,4 hour workweek,The 4-Hour Workweek,Tim Ferriss,42
2,4 hour workweek,The 4-Hour Workweek,The 4-Hour Workweek,42
3,4 hour workweek,"The 4-Hour Workweek: Escape 9-5, Live Anywhere...",Timothy Ferriss,42
4,tools of titans,Tools of Titans,Timothy Ferriss,25


In [13]:
# Number of rows in df per base_title, most frequent first.
bl = pysqldf(
"""
select
    base_title,
    count(*) as count
from df
group by 1
order by 2 desc
"""
)
# Replace missing values in any column with the marker string '#####'.
bl = bl.fillna('#####')
bl.head(50)

Unnamed: 0,base_title,count
0,4 hour workweek,42
1,tools of titans,25
2,4 hour chef,21
3,4 hour body,19
4,tribe of mentors,17
5,surely youre joking mr feynman,14
6,bird by bird,12
7,black swan,12
8,mans search for meaning,12
9,radical acceptance,12


In [14]:
def cross_fuzz(df, column):
    """Build a square table of ``fuzz.ratio`` scores between the distinct values of a column.

    Args:
        df: DataFrame holding the values to compare.
        column: Name of the column whose unique values label both axes.

    Returns:
        DataFrame in which each cell holds ``fuzz.ratio(column_label, row_label)``.
    """
    distinct_values = df[column].unique()
    # crosstab is used only to obtain a square frame labelled by the distinct
    # values on both axes; its counts are overwritten by the scores below.
    grid = pd.crosstab(distinct_values, distinct_values)

    def score_against_rows(series):
        return [fuzz.ratio(series.name, row_label) for row_label in series.index]

    return grid.apply(score_against_rows)

In [15]:
# Score every distinct base_title in bl against every other one.
ct = cross_fuzz(bl, 'base_title')
ct.head()

col_0,Unnamed: 1_level_0,10 happier,100 secrets of the art world,100 year life,101 knife designs,12 rules for life,13 clocks,13 secrets for speaking fluent japanese,1984,22 immutable laws of branding,...,wtf,year of living biblically,year without pants,years of lyndon johnson set 4 books,you can be a stock market genius,your memory,your money or your life,zen and the art of motorcycle maintenance,zero to one,zorba the greek
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
,100,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10 happier,0,100,32,52,37,30,21,20,14,21,...,0,11,21,13,19,29,18,20,19,32
100 secrets of the art world,0,32,100,44,31,44,22,48,6,32,...,13,34,35,38,37,26,39,41,36,33
100 year life,0,52,44,100,40,60,27,31,12,24,...,12,37,39,29,27,33,44,30,33,29
101 knife designs,0,37,31,40,100,29,31,32,10,30,...,10,24,29,27,33,14,25,24,21,25


In [16]:
# For every title (row of ct) collect the other titles whose similarity score is
# strictly between 75 and 100: close enough to be a likely variant, but not the
# title itself (which scores 100). Titles with no such neighbour are omitted.
# Series.items() is used because Series.iteritems() no longer exists in pandas 2.x.
matches = {}
for row_name, row in ct.iterrows():
    matched = [
        (col_name, col_value)
        for col_name, col_value in row.items()
        if 75 < col_value < 100
    ]
    if matched:
        matches[row_name] = matched

In [17]:
# Display the collected near-match candidates.
matches

{'22 immutable laws of branding': [('22 immutable laws of marketing', 85)],
 '22 immutable laws of marketing': [('22 immutable laws of branding', 85)],
 'a guide to the good life': [('art of the good life', 77)],
 'a peoples history of the united states': [('oxford history of the united states',
   82)],
 'a walk in the woods': [('awake in the wild', 78)],
 'art of asking': [('art of learning', 79),
  ('art of living', 77),
  ('art of loving', 77)],
 'art of fielding': [('art of living', 79), ('art of loving', 79)],
 'art of learning': [('art of asking', 79),
  ('art of living', 79),
  ('art of loving', 79)],
 'art of living': [('art of asking', 77),
  ('art of fielding', 79),
  ('art of learning', 79),
  ('art of loving', 92)],
 'art of loving': [('art of asking', 77),
  ('art of fielding', 79),
  ('art of learning', 79),
  ('art of living', 92)],
 'art of the good life': [('a guide to the good life', 77)],
 'art of war': [('war of art', 80)],
 'autobiography of a yogi': [('autobiogra

In [18]:
# List every row filed under a single base_title to compare its Author spellings.
df[df['base_title'].isin(['tao te ching'])]

Unnamed: 0,Book,Author,Recommended By,Podcast #,base_title
842,Tao Te Ching,Lao Tzu,Soman Chainani / Susan Cain / Graham Duncan,292,tao te ching
898,Tao Te Ching,Lao Tzu,Gretchen Rubin,290,tao te ching
1156,Tao Te Ching,Lao Tsu,,250,tao te ching
1328,Tao Te Ching,Lao Tsu,Krista Tippett,223,tao te ching
1372,Tao Te Ching,Lao Tsu,Soman Chainani,220,tao te ching
1382,Tao Te Ching,Lao Tsu,Adam Robinson,219,tao te ching
1470,Tao Te Ching,Lao Tsu,"Josh Waitzkin, Ramit Sethi, and Adam Robinson",210,tao te ching
1647,Tao Te Ching,"Lao Tzu, Sam Torode and Ancient Renewal",Jason Nemer,182,tao te ching
1892,Tao Te Ching,"Lao Tzu, Sam Torode and Ancient Renewal",Naval Ravikant,136,tao te ching
2102,Tao Te Ching,Laozi,Laird Hamilton,89,tao te ching


In [33]:
# Output field name -> source column in df whose distinct values are collected.
_FIELD_SOURCES = {
    'title': 'Book',
    'author': 'Author',
    'recommended_by': 'Recommended By',
    'podcast_no': 'Podcast #',
}

# One record per base_title, holding the list of distinct values seen in each
# source column for that group.
flat = [
    {
        'base_title': key,
        **{field: list(rows[source].unique()) for field, source in _FIELD_SOURCES.items()},
    }
    for key, rows in df.groupby('base_title')
]
flt = pd.DataFrame(flat)
flt.head(50)

Unnamed: 0,author,base_title,podcast_no,recommended_by,title
0,,,281,Stewart Brand,######
1,Dan Harris,10 happier,50,Dr. Peter Attia,10% Happier
2,Thomas Girst & Magnus Resch,100 secrets of the art world,343,Seth Godin,100 Secrets of the Art World
3,"""Lynda Gratton, Andrew Scott""",100 year life,374,Chip Conley,The 100-Year Life
4,Murray Carter,101 knife designs,236,Murray Carter,101 Knife Designs: Practical Knives for Daily Use
5,Jordan Peterson,12 rules for life,332,Coach George Raveling,12 Rules for Life
6,James Thurber,13 clocks,366,Neil Gaiman,The 13 Clocks
7,Giles Murray,13 secrets for speaking fluent japanese,315236224,"Tim Ferriss,Murray Carter,Kevin Rose",13 Secrets for Speaking Fluent Japanese
8,George Orwell,1984,387305190,"Tristan Harris,Daniel Pink,Matt Mullenweg",1984
9,"Al Ries & Laura Ries,Al Ries and Laura Ries",22 immutable laws of branding,351210,"""Allen Walton, Elaine Pofeldt"",""Josh Waitzkin,...",The 22 Immutable Laws of Branding


In [62]:
# Row count of flt, which holds one row per base_title group.
len(flt)

1453

In [25]:
# Write flt to disk as a JSON array with one object per row (orient='records').
# NOTE(review): the file name spells "reccommendations" with a double "c"; it is
# left untouched because other tooling may already read this path. Confirm before renaming.
flt.to_json('tim_ferriss_book_reccommendations.json', orient='records')