In [150]:
import json
import requests
import itertools
import xmltodict

import pandas as pd
from pprint import pprint
from datetime import datetime

In [7]:
url="https://www.mostrecommendedbooks.com/sitemap.xml"

# Make a GET request to fetch the raw HTML content
xml_content = requests.get(
    url,
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
).text

In [8]:
sitemap = xmltodict.parse(
    xml_content,
    #process_namespaces=True
)

In [9]:
sitemap.keys()

odict_keys(['urlset'])

In [10]:
locs = [
    site['loc'].split('/')[-1]
    for site in sitemap['urlset']['url']
    if site['loc'].split('/')[-1]
]
locs

['elon-musk-twitter-books',
 'aj-jacobs-books',
 'adam-fisher-books',
 'adam-gazzaley-books',
 'adam-robinson-books',
 'adam-savage-books',
 'alain-de-botton-books',
 'alex-blumberg-books',
 'alex-honnold-books',
 'alexis-ohanian-books',
 'alice-little-books',
 'amanda-palmer-books',
 'amelia-boone-books',
 'andrew-zimmern-books',
 'aniela-gregorek-books',
 'ann-miura-ko-books',
 'annie-mist-orisdottir-books',
 'arianna-huffington-books',
 'arnold-schwarzenegger-books',
 'art-de-vany-books',
 'ashton-kutcher-books',
 'astro-teller-books',
 'aubrey-marcus-books',
 'ayaan-hirsi-ali-books',
 'bj-novak-books',
 'bj-miller-books',
 'barack-obama-books',
 'bear-grylls-books',
 'ben-horowitz-books',
 'ben-shapiro-books',
 'ben-silbermann-books',
 'ben-stiller-books',
 'bill-gates-books',
 'bill-nye-books',
 'bill-rasmussen-books',
 'blake-mycoskie-books',
 'bob-metcalfe-books',
 'bozoma-saint-john-books',
 'brandon-stanton-books',
 'brendan-moynihan-books',
 'brene-brown-books',
 'brian-armst

In [11]:
def get_recommender_json(url, f_base="https://www.mostrecommendedbooks.com/page-data/{stub}/page-data.json"):
    # Make a GET request to fetch the raw HTML content
    book_reccommendations = requests.get(
        f_base.format(stub=url),
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    ).json()
    return book_reccommendations

In [12]:
def extract_book_recommendations(json_response):
    try:
        return json_response['result']['data']['recommenderBooks']['recommenderBooks']
    except KeyError:
        return None

In [13]:
books, errors = [], []
for loc in locs:
    try:
        exp = get_recommender_json(loc)
    except:
        errors.append(loc)
        continue
    results = extract_book_recommendations(exp)
    if results:
        books.extend(results)
len(books)

3664

In [14]:
errors

[]

In [15]:
books[:5]

[{'author': 'Steven Pinker',
  'imageUrl': '/img/books/2EnlightenmentNow.png',
  'newLink': 'https://amzn.to/38dsTjH',
  'title': 'Enlightenment Now',
  'subtitle': 'The Case for Reason, Science, Humanism, and Progress',
  'recommenders': [{'name': 'A.J. Jacobs'},
   {'name': 'Bill Gates'},
   {'name': 'Jordan Peterson'},
   {'name': 'Sam Harris'}]},
 {'author': 'Sam Harris',
  'imageUrl': '/img/books/3FreeWill.png',
  'newLink': 'https://amzn.to/2v3bc8d',
  'title': 'Free Will',
  'subtitle': '',
  'recommenders': [{'name': 'A.J. Jacobs'}]},
 {'author': 'Adam Mansbach',
  'imageUrl': '/img/books/4GotheFktoSleep.png',
  'newLink': 'https://amzn.to/2RamwIv',
  'title': 'Go the F**k to Sleep',
  'subtitle': '',
  'recommenders': [{'name': 'A.J. Jacobs'}]},
 {'author': '',
  'imageUrl': '/img/books/5HolyBible.png',
  'newLink': 'https://amzn.to/3atfXbi',
  'title': 'Holy Bible',
  'subtitle': '',
  'recommenders': [{'name': 'A.J. Jacobs'},
   {'name': 'Caterina Fake'},
   {'name': 'Kevin 

In [125]:
df = pd.DataFrame(books)

In [126]:
len(df)

3664

In [127]:
len(df['title'].unique())

2654

In [128]:
len(df['author'].unique())

2039

In [129]:
df.head()

Unnamed: 0,author,imageUrl,newLink,recommenders,subtitle,title
0,Steven Pinker,/img/books/2EnlightenmentNow.png,https://amzn.to/38dsTjH,"[{'name': 'A.J. Jacobs'}, {'name': 'Bill Gates...","The Case for Reason, Science, Humanism, and Pr...",Enlightenment Now
1,Sam Harris,/img/books/3FreeWill.png,https://amzn.to/2v3bc8d,[{'name': 'A.J. Jacobs'}],,Free Will
2,Adam Mansbach,/img/books/4GotheFktoSleep.png,https://amzn.to/2RamwIv,[{'name': 'A.J. Jacobs'}],,Go the F**k to Sleep
3,,/img/books/5HolyBible.png,https://amzn.to/3atfXbi,"[{'name': 'A.J. Jacobs'}, {'name': 'Caterina F...",,Holy Bible
4,Sam Harris,/img/books/6Lying.png,https://amzn.to/2NJj3hV,"[{'name': 'A.J. Jacobs'}, {'name': 'Brian Arms...",,Lying


In [145]:
flattened = df.copy()
flattened['recommenders'] = flattened['recommenders'].apply(lambda x: [y['name'] for y in x])
flattened = flattened.groupby(['title', 'subtitle', 'author']).agg({'recommenders': 'sum'}).reset_index()
flattened['recommenders'] = flattened['recommenders'].apply(lambda x: list(set(x)))
flattened['recommendations'] = flattened['recommenders'].apply(lambda x: len(x))
flattened.sort_values('recommendations', ascending=False, inplace=True)

In [147]:
flattened

Unnamed: 0,title,subtitle,author,recommenders,recommendations
1017,Man's Search for Meaning,,Viktor E. Frankl,"[Tony Robbins, Michael Gervais, Turia Pitt, Ry...",21
1602,The 4-Hour Workweek,"Escape 9-5, Live Anywhere, and Join the New Rich",Timothy Ferriss,"[Daniel Pink, Eric Weinstein, Jason Silva, Tri...",16
1415,Sapiens,A Brief History of Humankind,Yuval Noah Harari,"[Whitney Cummings, Ashton Kutcher, Richard Bra...",15
1333,Principles,Life and Work,Ray Dalio,"[Kevin Systrom, Mark Cuban, Howard Marks, Brya...",12
203,Atlas Shrugged,,Ayn Rand,"[Ev Williams, Joe De Sena, Brian Armstrong, Ga...",11
1617,The Alchemist,,Paulo Coelho,"[Eric Ripert, LeBron James, Gabby Reece, Ryan ...",11
1870,The Fountainhead,,Ayn Rand,"[Emma Watson, Mark Cuban, Ev Williams, Vince V...",10
747,Holy Bible,,,"[Wim Hof, Kevin Kelly, A.J. Jacobs, Neil deGra...",10
735,High Output Management,,Andrew S. Grove,"[Ben Horowitz, Marc Andreessen, Keith Rabois, ...",10
1689,The Black Swan,The Impact of the Highly Improbable,Nassim Nicholas Taleb,"[James Altucher, Jeff Bezos, Edward Norton, Ar...",9


In [146]:
flattened.loc[1017]['recommenders']

['Tony Robbins',
 'Michael Gervais',
 'Turia Pitt',
 'Ryan Holiday',
 'Daniel Pink',
 'Dave Elitch',
 'Chelsea Handler',
 'Aniela Gregorek',
 'Emma Watson',
 'Ev Williams',
 'Fedor Holz',
 'Maria Popova',
 'Naval Ravikant',
 'David Blaine',
 'Jocko Willink',
 'Jimmy Fallon',
 'Bryan Johnson',
 'Terry Crews',
 'Chip Conley',
 'Esther Perel',
 'Jordan Peterson']

In [148]:
len(flattened)

2673

In [153]:
flattened['source'] = 'mostrecommendedbooks'
flattened['date_accessed'] = datetime.now()

In [154]:
flattened.head()

Unnamed: 0,title,subtitle,author,recommenders,recommendations,source,date_accessed
1017,Man's Search for Meaning,,Viktor E. Frankl,"[Tony Robbins, Michael Gervais, Turia Pitt, Ry...",21,mostrecommendedbooks,2020-02-03 21:19:44.906348
1602,The 4-Hour Workweek,"Escape 9-5, Live Anywhere, and Join the New Rich",Timothy Ferriss,"[Daniel Pink, Eric Weinstein, Jason Silva, Tri...",16,mostrecommendedbooks,2020-02-03 21:19:44.906348
1415,Sapiens,A Brief History of Humankind,Yuval Noah Harari,"[Whitney Cummings, Ashton Kutcher, Richard Bra...",15,mostrecommendedbooks,2020-02-03 21:19:44.906348
1333,Principles,Life and Work,Ray Dalio,"[Kevin Systrom, Mark Cuban, Howard Marks, Brya...",12,mostrecommendedbooks,2020-02-03 21:19:44.906348
203,Atlas Shrugged,,Ayn Rand,"[Ev Williams, Joe De Sena, Brian Armstrong, Ga...",11,mostrecommendedbooks,2020-02-03 21:19:44.906348
