In [56]:
from requests_html import HTMLSession, HTML
import numpy as np
import pandas as pd
from datetime import datetime
from threading import Thread
import time
import matplotlib.pyplot as plt
import progressbar
import warnings
warnings.filterwarnings("ignore")

In [2]:
base_url = 'https://www.audible.com/search?pf_rd_p=7fe4387b-4762-42a8-8d9a-a63254c74bb2&pf_rd_r=C7ENYKDADHMCH4KY12D4&ref=a_search_l1_feature_five_browse-bin_6&feature_six_browse-bin=9178177011&pageSize=50'

In [3]:
def build_dict(items, category, data):
    for item in items:
        text_fields = item.text.split('\n')
        link = [link for link in item.absolute_links if '/pd/' in link][0]
        dict_entry={
            'category' : category,
            'title' : text_fields[0],
            'link' : link
            }
        try:
            dict_entry['rating_count'] = int([s for s in text_fields if 'stars' in s][0].split(
                'stars ')[1].replace(',',''))
        except: pass
        try:
            dict_entry['narrator'] = [s for s in text_fields if 'Narrated by' in s][0].split(': ')[1]
        except: pass
        try:
            dict_entry['asin'] = [s for s in link.split('/') if 'B0' in s][0].split('?')[0]
        except: pass
        try:
            dict_entry['length'] = [s for s in text_fields if 'Length' in s][0].split(': ')[1]
        except: pass
        try:
            dict_entry['rating'] = float([s for s in text_fields if 'stars' in s][-1].split(' out')[0])
        except: pass
        try:
            dict_entry['author'] = [s for s in text_fields if 'By' in s][0].split(': ')[1]
        except: pass
        try:
            dict_entry['release_date'] = datetime.strptime([s for s in text_fields if 'Release date:' in s][0].split(
                ': ')[1], '%m-%d-%y')
        except : pass
        data.append(dict_entry)
    return data

def scrape_great_courses(mthreads, category, pages, url_list, data):
    sess = HTMLSession()
    
    for url in url_list:
        try:
            r = sess.get(url)
        except:
            try:
                time.sleep(0.5)
                r = sess.get(url)
            except:
                pass
            
        items = r.html.find('li.bc-list-item.productListItem', first=False)
        
        threads = []
        for j in range(mthreads):
            item_sublist = items[j::mthreads]
            t = Thread(target=build_dict, args=(item_sublist, category, data))
            threads.append(t)
            
        [t.start() for t in threads]
        [t.join() for t in threads]
    
    sess.close()
    return data

In [135]:
def scrape_threader(nthreads, mthreads, category, pages, base_url, data=None):
    if data == None:
        data = []
    
    # Create url list
    url_list = []
    for page in range(pages):
        pageurl = base_url + '&page=' + str(page+1)
        url_list.append(pageurl)
        
    # Create threads
    threads = []
    for i in range(nthreads):
        url_sublist = url_list[i::nthreads]
        t = Thread(target=scrape_great_courses, args=(mthreads, category, pages, url_sublist, data))
        threads.append(t)
    
    # Run threads
    [t.start() for t in threads]
    [t.join() for t in threads]
    
    return data

In [5]:
def loop_categories(nthreads, mthreads, cat_names, cat_page_nums, cat_links):
    data = []
    
#     widgets = [
#         progressbar.Percentage(), 
#         progressbar.Bar(), 
#         progressbar.ETA(),
#         progressbar.DynamicMessage('cat')]
#     bar = progressbar.ProgressBar(widgets=widgets, max_value=sum(cat_page_nums)).start()
    
    finished_pages = 0  
    for category, pages, link in zip(cat_names, cat_page_nums, cat_links):
        print('Scraping ', category, '...')
#         bar.update(finished_pages, cat=category)
        data.extend(scrape_threader(nthreads, mthreads, category, pages, link, data=data))
        finished_pages += pages
        
#     bar.finish()    
    return data

In [6]:
sess = HTMLSession()
r = sess.get(base_url)

cat_items = r.html.find('div.bc-col-responsive.bc-col-3')[0].find('ul.bc-list')[0].find('li.bc-list-item')
cat_names = [item.text for item in cat_items]
cat_page_nums = [1300 for item in cat_names]
cat_item_nums = [1 for item in cat_items]
cat_page_nums = [10 for item in cat_names]
cat_links = [item.absolute_links.pop() + '&pageSize=50' for item in cat_items]

sess.close()

In [8]:
all_links = []
def get_links(links):
    all_links.extend(links)
    for link in links:
        r = sess.get(link)
    
        cat_items = r.html.find('div.bc-col-responsive.bc-col-3')[0].find('ul.bc-list')[1].find('li.bc-list-item')
        cat_names = [item.text for item in cat_items]
        print(cat_names)
        cat_links = [item.absolute_links.pop() + '&pageSize=50' for item in cat_items if item.absolute_links]
        if cat_links:
            get_links(cat_links)

In [9]:
sess = HTMLSession()
r = sess.get(base_url)

all_links = []
main_cat_items = r.html.find('div.bc-col-responsive.bc-col-3')[0].find('ul.bc-list')[0].find('li.bc-list-item')
main_cat_links = [item.absolute_links.pop() + '&pageSize=50' for item in main_cat_items]
get_links(main_cat_links)


sess.close()

['Architecture', 'Art', 'Audio Performances & Dramatizations', 'Entertainment & Performing Arts', 'Music', 'Photography']
['Architecture']
['Art', 'History & Criticism']
['History & Criticism']
['Audio Performances & Dramatizations']
['Entertainment & Performing Arts', 'Business Aspects', 'Comedy', 'Dance', 'Film & TV', 'Opera']
['Business Aspects']
['Performing Arts']
['Dance']
['Film & TV', 'Direction & Production', 'History & Criticism', 'Screenwriting']
['Direction & Production']
['History & Criticism']
['Screenwriting']
['Opera']
['Music', 'Business Aspects', 'History & Criticism', 'Instruction & Technique', 'Opera']
['Business Aspects']
['History & Criticism']
['Instruction & Technique']
['Opera']
['Photography']
['Adventurers, Explorers & Survival', 'Art & Literature', 'Cultural & Regional', 'Diaries & Correspondence', 'Entertainment & Celebrities', 'Historical', 'LGBTQ+', 'Military & War', 'People with Disabilities', 'Politics & Activism', 'Professionals & Academics', 'Religiou

IndexError: list index out of range

In [61]:
df = pd.DataFrame(all_links)

In [63]:
cleaned_df = df[df.url.str.contains('feature_six_browse-bin')]

In [89]:
cleaned_df['id'] = cleaned_df.url.str[54:82]

In [93]:
cleaned_df = cleaned_df.drop_duplicates(subset=['id'])

In [149]:
cleaned_df = pd.read_csv('cleaned_df.csv', header=None)
cleaned_df.columns = ['url', 'id']

In [150]:
cleaned_df.url.values[400:]

array(['https://www.audible.com/search?feature_six_browse-bin=18685580011&node=18573359011&pageSize=50&pageSize=50&pageSize=50&ref_pageloadid=ZExLzbgFOsT18Fzc&ref=a_search_l1_subcatRefs_2&pf_rd_p=daf0f1c8-2865-4989-87fb-15115ba5a6d2&pf_rd_r=99XQK5E4CM84M6QJC6A5&pageLoadId=2zx1aFma1Ot2zuEu&creativeId=9648f6bf-4f29-4fb4-9489-33163c0bb63e&pageSize=50',
       'https://www.audible.com/search?feature_six_browse-bin=18685580011&node=18573767011&pageSize=50&pageSize=50&pageSize=50&ref_pageloadid=Xkj3J7556UhTd45v&ref=a_search_l1_subcatRefs_9&pf_rd_p=daf0f1c8-2865-4989-87fb-15115ba5a6d2&pf_rd_r=VB3PH6ADXNKK27GDERVY&pageLoadId=LfmjqFkql5uFQjqj&creativeId=9648f6bf-4f29-4fb4-9489-33163c0bb63e&pageSize=50',
       'https://www.audible.com/search?feature_six_browse-bin=18685580011&node=18571954011&pageSize=50&pageSize=50&pageSize=50&ref_pageloadid=U6iOKc6qc0QADv7I&ref=a_search_l1_subcatRefs_1&pf_rd_p=daf0f1c8-2865-4989-87fb-15115ba5a6d2&pf_rd_r=KAH3BG3E06SF76T2XHM9&pageLoadId=5DjDJxbnm4X1rz3v&creati

In [18]:
df.to_csv('all_links.csv', index=False, header=False)

In [118]:
df

Unnamed: 0,url,id
0,https://www.audible.com/search?feature_six_bro...,18685580011&node=18572544011
1,https://www.audible.com/search?feature_six_bro...,18685580011&node=18572527011
2,https://www.audible.com/search?feature_six_bro...,18685580011&node=18573493011
3,https://www.audible.com/search?feature_six_bro...,18685580011&node=18574909011
4,https://www.audible.com/search?feature_six_bro...,18685580011&node=18572517011
...,...,...
1083,https://www.audible.com/search?feature_six_bro...,18685580011&node=18580524011
1084,https://www.audible.com/search?feature_six_bro...,18685580011&node=18574488011
1085,https://www.audible.com/search?feature_six_bro...,18685580011&node=18580589011
1086,https://www.audible.com/search?feature_six_bro...,18685580011&node=18572134011


In [142]:
def loop_categories(nthreads, mthreads, links):
    data = []
    
    for link in links:
        print('Scraping ', link)
        data.extend(scrape_threader2(nthreads, mthreads, link, data=data))
        
    return data

In [143]:
def scrape_threader2(nthreads, mthreads, base_url, data=None):
    if data == None:
        data = []
    
    # Create url list
    url_list = []
    for page in range(10):
        pageurl = base_url + '&page=' + str(page+1)
        url_list.append(pageurl)
        
    # Create threads
    threads = []
    for i in range(nthreads):
        url_sublist = url_list[i::nthreads]
        t = Thread(target=scrape_great_courses, args=(mthreads, url_sublist, data))
        threads.append(t)
    
    # Run threads
    [t.start() for t in threads]
    [t.join() for t in threads]
    
    return data

In [144]:
def build_dict(items, data):
    for item in items:
        text_fields = item.text.split('\n')
        link = [link for link in item.absolute_links if '/pd/' in link][0]
        dict_entry={
            'title' : text_fields[0],
            'link' : link
            }
        try:
            dict_entry['rating_count'] = int([s for s in text_fields if 'stars' in s][0].split(
                'stars ')[1].replace(',',''))
        except: pass
        try:
            dict_entry['narrator'] = [s for s in text_fields if 'Narrated by' in s][0].split(': ')[1]
        except: pass
        try:
            dict_entry['asin'] = [s for s in link.split('/') if 'B0' in s][0].split('?')[0]
        except: pass
        try:
            dict_entry['length'] = [s for s in text_fields if 'Length' in s][0].split(': ')[1]
        except: pass
        try:
            dict_entry['rating'] = float([s for s in text_fields if 'stars' in s][-1].split(' out')[0])
        except: pass
        try:
            dict_entry['author'] = [s for s in text_fields if 'By' in s][0].split(': ')[1]
        except: pass
        try:
            dict_entry['release_date'] = datetime.strptime([s for s in text_fields if 'Release date:' in s][0].split(
                ': ')[1], '%m-%d-%y')
        except : pass
        data.append(dict_entry)
    return data

def scrape_great_courses(mthreads, url_list, data):
    sess = HTMLSession()
    
    for url in url_list:
        try:
            r = sess.get(url)
        except:
            try:
                time.sleep(0.5)
                r = sess.get(url)
            except:
                pass
            
        items = r.html.find('li.bc-list-item.productListItem', first=False)
        
        threads = []
        for j in range(mthreads):
            item_sublist = items[j::mthreads]
            t = Thread(target=build_dict, args=(item_sublist, data))
            threads.append(t)
            
        [t.start() for t in threads]
        [t.join() for t in threads]
    
    sess.close()
    return data

In [145]:
df.url[:2].values

array(['https://www.audible.com/search?feature_six_browse-bin=18685580011&node=18572544011&pageSize=50&pageSize=50&pageSize=50&pageSize=50&ref_pageloadid=WrvXANV9VGpxlgOU&ref=a_search_l1_subcatRefs_4&pf_rd_p=daf0f1c8-2865-4989-87fb-15115ba5a6d2&pf_rd_r=3W6EC2VK0DXN0WHQ9V79&pageLoadId=JgcnGkMosXsq3Zyk&creativeId=9648f6bf-4f29-4fb4-9489-33163c0bb63e&pageSize=50',
       'https://www.audible.com/search?feature_six_browse-bin=18685580011&node=18572527011&pageSize=50&pageSize=50&pageSize=50&ref_pageloadid=WmDBUoPJKrkRnB2v&ref=a_search_l1_subcatRefs_15&pf_rd_p=daf0f1c8-2865-4989-87fb-15115ba5a6d2&pf_rd_r=GSGK3MW51Q8ZW6ZP7FDK&pageLoadId=bKQ6fOHW3b3Twz43&creativeId=9648f6bf-4f29-4fb4-9489-33163c0bb63e&pageSize=50'],
      dtype=object)

In [158]:
cleaned_df.url.values[0]

'https://www.audible.com/search?feature_six_browse-bin=18685580011&node=18572544011&pageSize=50&pageSize=50&pageSize=50&pageSize=50&ref_pageloadid=WrvXANV9VGpxlgOU&ref=a_search_l1_subcatRefs_4&pf_rd_p=daf0f1c8-2865-4989-87fb-15115ba5a6d2&pf_rd_r=3W6EC2VK0DXN0WHQ9V79&pageLoadId=JgcnGkMosXsq3Zyk&creativeId=9648f6bf-4f29-4fb4-9489-33163c0bb63e&pageSize=50'

In [173]:
for i in range(0, len(cleaned_df)):
    df = pd.DataFrame(data=loop_categories(
        8, 2, [cleaned_df.url.values[i]]))
    df = df.drop_duplicates(subset=['title'])
    df.to_csv('{}.csv'.format(i))
    print('Done. ', i)

Scraping  https://www.audible.com/search?feature_six_browse-bin=18685580011&node=18572544011&pageSize=50&pageSize=50&pageSize=50&pageSize=50&ref_pageloadid=WrvXANV9VGpxlgOU&ref=a_search_l1_subcatRefs_4&pf_rd_p=daf0f1c8-2865-4989-87fb-15115ba5a6d2&pf_rd_r=3W6EC2VK0DXN0WHQ9V79&pageLoadId=JgcnGkMosXsq3Zyk&creativeId=9648f6bf-4f29-4fb4-9489-33163c0bb63e&pageSize=50
Done.  0
Scraping  https://www.audible.com/search?feature_six_browse-bin=18685580011&node=18572527011&pageSize=50&pageSize=50&pageSize=50&ref_pageloadid=WmDBUoPJKrkRnB2v&ref=a_search_l1_subcatRefs_15&pf_rd_p=daf0f1c8-2865-4989-87fb-15115ba5a6d2&pf_rd_r=GSGK3MW51Q8ZW6ZP7FDK&pageLoadId=bKQ6fOHW3b3Twz43&creativeId=9648f6bf-4f29-4fb4-9489-33163c0bb63e&pageSize=50
Done.  1
Scraping  https://www.audible.com/search?feature_six_browse-bin=18685580011&node=18573493011&pageSize=50&pageSize=50&pageSize=50&pageSize=50&ref_pageloadid=n0TbJlEihgr57XMM&ref=a_search_l1_subcatRefs_1&pf_rd_p=daf0f1c8-2865-4989-87fb-15115ba5a6d2&pf_rd_r=KFBRCXDFN4

In [174]:
df = pd.read_csv('3.csv')
for i in range(0, len(cleaned_df)):
    df = pd.concat([df, pd.read_csv('{}.csv'.format(i))], ignore_index=True)

In [176]:
df = df.drop_duplicates()
df = df.drop('Unnamed: 0', axis=1)

In [178]:
df.drop_duplicates(subset=['asin'], inplace=True)
df.sort_values(by=['rating_count'], ascending=False, inplace=True)
df.reset_index(drop=True, inplace=True)
df.to_csv('all_english_audible2.csv', index=False)

In [179]:
len(df)

167998

In [182]:
df[df.title == 'Greenlights']

Unnamed: 0,title,link,rating_count,narrator,asin,length,rating,author,release_date


In [183]:
df[10:20]

Unnamed: 0,title,link,rating_count,narrator,asin,length,rating,author,release_date
10,The Girl on the Train,https://www.audible.com/pd/The-Girl-on-the-Tra...,140547.0,"Clare Corbett, Louise Brealey, India Fisher",B00QXW5GYY,10 hrs and 58 mins,4.5,Paula Hawkins,2015-01-13
11,"Harry Potter and the Chamber of Secrets, Book 2",https://www.audible.com/pd/Harry-Potter-and-th...,132036.0,Jim Dale,B017V4IWVG,9 hrs and 2 mins,5.0,J.K. Rowling,2015-11-20
12,Educated,https://www.audible.com/pd/Educated-Audiobook/...,124102.0,Julia Whelan,B075F8MBMQ,12 hrs and 10 mins,5.0,Tara Westover,2018-02-20
13,"Harry Potter and the Prisoner of Azkaban, Book 3",https://www.audible.com/pd/Harry-Potter-and-th...,121789.0,Jim Dale,B017V4JA2Q,11 hrs and 49 mins,5.0,J.K. Rowling,2015-11-20
14,"Harry Potter and the Goblet of Fire, Book 4",https://www.audible.com/pd/Harry-Potter-and-th...,116878.0,Jim Dale,B017V4NUPO,20 hrs and 36 mins,5.0,J.K. Rowling,2015-11-20
15,Verity,https://www.audible.com/pd/Verity-Audiobook/B0...,116558.0,"Vanessa Johansson, Amy Landon",B07Q769RZS,8 hrs and 10 mins,4.5,Colleen Hoover,2019-05-07
16,Dune,https://www.audible.com/pd/Dune-Audiobook/B002...,115554.0,"Scott Brick, Orlagh Cassidy, Euan Morton, and ...",B002V1OF70,21 hrs and 2 mins,4.5,Frank Herbert,2006-12-31
17,"Harry Potter and the Order of the Phoenix, Book 5",https://www.audible.com/pd/Harry-Potter-and-th...,108796.0,Jim Dale,B017V4NMX4,26 hrs and 28 mins,5.0,J.K. Rowling,2015-11-20
18,The Name of the Wind,https://www.audible.com/pd/The-Name-of-the-Win...,104994.0,Nick Podehl,B002UZMLXM,27 hrs and 55 mins,4.5,Patrick Rothfuss,2009-05-15
19,I'm Glad My Mom Died,https://www.audible.com/pd/Im-Glad-My-Mom-Died...,103965.0,Jennette McCurdy,B09VHWHJS2,6 hrs and 26 mins,5.0,Jennette McCurdy,2022-08-09


In [34]:
df

Unnamed: 0,category,title,link,rating_count,asin,rating,author,narrator,length,release_date
0,Literature & Fiction,Ready Player One,https://www.audible.com/pd/Ready-Player-One-Au...,257287,B005FRGT44,5.0,Ernest Cline,Wil Wheaton,15 hrs and 40 mins,2011-08-16
1,Arts & Entertainment,Born a Crime,https://www.audible.com/pd/Born-a-Crime-Audiob...,211580,B01IW9TQPK,5.0,Trevor Noah,Trevor Noah,8 hrs and 44 mins,2016-11-15
2,"Relationships, Parenting & Personal Development",Can't Hurt Me,https://www.audible.com/pd/Cant-Hurt-Me-Audiob...,194658,B07KKMNZCH,5.0,David Goggins,"David Goggins, Adam Skolnick",13 hrs and 37 mins,2018-11-28
3,Science Fiction & Fantasy,"Harry Potter and the Sorcerer's Stone, Book 1",https://www.audible.com/pd/Harry-Potter-and-th...,188813,B017V4IM1G,5.0,J.K. Rowling,Jim Dale,8 hrs and 18 mins,2015-11-20
4,Politics & Social Sciences,Becoming,https://www.audible.com/pd/Becoming-Audiobook/...,180578,B07B3BCZ9S,5.0,Michelle Obama,Michelle Obama,19 hrs and 3 mins,2018-11-13
...,...,...,...,...,...,...,...,...,...,...
7969,Comedy & Humor,Humour and Madness: Early Restoration Comedies,https://www.audible.com/pd/Humour-and-Madness-...,0,B0CG6SF942,0.0,"William Wycherley, Aphra Behn, John Dryden, an...","Derek Jacobi, Prunella Scales, Maggie Smith, a...",16 hrs and 32 mins,2023-11-16
7970,"Mystery, Thriller & Suspense",Lone Wolf,https://www.audible.com/pd/Lone-Wolf-Audiobook...,0,B0C3NNGJR1,0.0,Gregg Hurwitz,Scott Brick,12 hrs and 30 mins,2024-02-13
7971,"Mystery, Thriller & Suspense",Arkangel,https://www.audible.com/pd/Arkangel-Audiobook/...,0,B0CN9RY2DQ,0.0,James Rollins,,12 hrs,2024-08-06
7972,Biographies & Memoirs,Chapters from My Autobiography,https://www.audible.com/pd/Chapters-from-My-Au...,0,B0CLPQ1QYQ,0.0,Mark Twain,John Greenman,10 hrs and 30 mins,2023-11-11


In [38]:
for i in range(20):
    print(f'{i+1}. [{df.title.loc[i]}]({df.link.loc[i]})')

1. [Ready Player One](https://www.audible.com/pd/Ready-Player-One-Audiobook/B005FRGT44?ref_pageloadid=EDbf6AN9TQ7kH2Cg&ref=a_search_c3_a_list_product-0&pf_rd_p=83218cca-c308-412f-bfcf-90198b687a2f&pf_rd_r=VGMXZX9HAKXGKYN3J074&pageLoadId=E1av7yk55CYocwRl&ref_plink=not_applicable&creativeId=0d6f6720-f41c-457e-a42b-8c8dceb62f2c)
2. [Born a Crime](https://www.audible.com/pd/Born-a-Crime-Audiobook/B01IW9TQPK?ref_pageloadid=EDbf6AN9TQ7kH2Cg&ref=a_search_c3_a_list_product-0&pf_rd_p=83218cca-c308-412f-bfcf-90198b687a2f&pf_rd_r=T8DHX8JYJ7HAPDRFWHZW&pageLoadId=ZbCX6s1A9jgp4jrR&ref_plink=not_applicable&creativeId=0d6f6720-f41c-457e-a42b-8c8dceb62f2c)
3. [Can't Hurt Me](https://www.audible.com/pd/Cant-Hurt-Me-Audiobook/B07KKMNZCH?ref_pageloadid=EDbf6AN9TQ7kH2Cg&ref=a_search_c3_a_list_product-0&pf_rd_p=83218cca-c308-412f-bfcf-90198b687a2f&pf_rd_r=AD2G114PGC7CV1A1PHFH&pageLoadId=Wq8QTp4axgK53Bk8&ref_plink=not_applicable&creativeId=0d6f6720-f41c-457e-a42b-8c8dceb62f2c)
4. [Harry Potter and the Sorcer

In [184]:
for i in range(20):
    print(f'{i+1}. [{df.title.loc[i]}]({df.link.loc[i]})')

1. [Where the Crawdads Sing](https://www.audible.com/pd/Where-the-Crawdads-Sing-Audiobook/B07FSNSLZ1?ref_pageloadid=aW52w7u8rzVovEJJ&ref=a_search_c3_a_list_product-0&pf_rd_p=83218cca-c308-412f-bfcf-90198b687a2f&pf_rd_r=ZAD5XT1NFFE7MP6MAYX1&pageLoadId=plESDPDPe7z3HCBz&creativeId=0d6f6720-f41c-457e-a42b-8c8dceb62f2c)
2. [Ready Player One](https://www.audible.com/pd/Ready-Player-One-Audiobook/B005FRGT44?ref_pageloadid=1E3B8qWPE0UdvA4S&ref=a_search_c3_a_list_product-0&pf_rd_p=83218cca-c308-412f-bfcf-90198b687a2f&pf_rd_r=VV56VBSB4CHSYWJFVYA6&pageLoadId=30pj6MAThHyCKvVs&creativeId=0d6f6720-f41c-457e-a42b-8c8dceb62f2c)
3. [Born a Crime](https://www.audible.com/pd/Born-a-Crime-Audiobook/B01IW9TQPK?qid=1705076913&sr=1-48&ref_pageloadid=aW52w7u8rzVovEJJ&ref=a_search_c3_lProduct_1_48&pf_rd_p=83218cca-c308-412f-bfcf-90198b687a2f&pf_rd_r=EHFCPFDYHB71M40165G7&pageLoadId=lFlzlsvdanM6fbX6&creativeId=0d6f6720-f41c-457e-a42b-8c8dceb62f2c)
4. [Can't Hurt Me](https://www.audible.com/pd/Cant-Hurt-Me-Audiob

In [185]:
for i in range(30):
    print(df.link.loc[i])

https://www.audible.com/pd/Where-the-Crawdads-Sing-Audiobook/B07FSNSLZ1?ref_pageloadid=aW52w7u8rzVovEJJ&ref=a_search_c3_a_list_product-0&pf_rd_p=83218cca-c308-412f-bfcf-90198b687a2f&pf_rd_r=ZAD5XT1NFFE7MP6MAYX1&pageLoadId=plESDPDPe7z3HCBz&creativeId=0d6f6720-f41c-457e-a42b-8c8dceb62f2c
https://www.audible.com/pd/Ready-Player-One-Audiobook/B005FRGT44?ref_pageloadid=1E3B8qWPE0UdvA4S&ref=a_search_c3_a_list_product-0&pf_rd_p=83218cca-c308-412f-bfcf-90198b687a2f&pf_rd_r=VV56VBSB4CHSYWJFVYA6&pageLoadId=30pj6MAThHyCKvVs&creativeId=0d6f6720-f41c-457e-a42b-8c8dceb62f2c
https://www.audible.com/pd/Born-a-Crime-Audiobook/B01IW9TQPK?qid=1705076913&sr=1-48&ref_pageloadid=aW52w7u8rzVovEJJ&ref=a_search_c3_lProduct_1_48&pf_rd_p=83218cca-c308-412f-bfcf-90198b687a2f&pf_rd_r=EHFCPFDYHB71M40165G7&pageLoadId=lFlzlsvdanM6fbX6&creativeId=0d6f6720-f41c-457e-a42b-8c8dceb62f2c
https://www.audible.com/pd/Cant-Hurt-Me-Audiobook/B07KKMNZCH?qid=1705079464&sr=1-39&ref_pageloadid=yUR0PH5Hv7SXC7ZW&ref=a_search_c3_lProd

https://www.audible.com/pd/Where-the-Crawdads-Sing-Audiobook/B07FSNSLZ1
https://www.audible.com/pd/Ready-Player-One-Audiobook/B005FRGT44
https://www.audible.com/pd/Born-a-Crime-Audiobook/B01IW9TQPK
https://www.audible.com/pd/Cant-Hurt-Me-Audiobook/B07KKMNZCH
https://www.audible.com/pd/Harry-Potter-and-the-Sorcerers-Stone-Book-1-Audiobook/B017V4IM1G
https://www.audible.com/pd/Becoming-Audiobook/B07B3BCZ9S
https://www.audible.com/pd/The-Subtle-Art-of-Not-Giving-a-F-ck-Audiobook/B01I28NFEE
https://www.audible.com/pd/Project-Hail-Mary-Audiobook/B08G9PRS1K
https://www.audible.com/pd/A-Game-of-Thrones-Audiobook/B002UZZ93G
https://www.audible.com/pd/Harry-Potter-and-the-Deathly-Hallows-Book-7-Audiobook/B017WJ5ZK6
https://www.audible.com/pd/The-Girl-on-the-Train-Audiobook/B00QXW5GYY
https://www.audible.com/pd/Harry-Potter-and-the-Chamber-of-Secrets-Book-2-Audiobook/B017V4IWVG
https://www.audible.com/pd/Educated-Audiobook/B075F8MBMQ
https://www.audible.com/pd/Harry-Potter-and-the-Prisoner-of-Az

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [200]:
df['url'] = df.apply(lambda x: x.link.split('?')[0], axis=1)

In [208]:
top = df[['title', 'author', 'rating', 'rating_count', 'release_date', 'narrator', 'length', 'asin', 'url']][:100000]

In [210]:
top.to_csv('top_100k.csv')

In [211]:
top[top.rating == 5]

Unnamed: 0,title,author,rating,rating_count,release_date,narrator,length,asin,url
0,Where the Crawdads Sing,Delia Owens,5.0,294029.0,2018-08-14,Cassandra Campbell,12 hrs and 12 mins,B07FSNSLZ1,https://www.audible.com/pd/Where-the-Crawdads-...
1,Ready Player One,Ernest Cline,5.0,257572.0,2011-08-16,Wil Wheaton,15 hrs and 40 mins,B005FRGT44,https://www.audible.com/pd/Ready-Player-One-Au...
2,Born a Crime,Trevor Noah,5.0,212143.0,2016-11-15,Trevor Noah,8 hrs and 44 mins,B01IW9TQPK,https://www.audible.com/pd/Born-a-Crime-Audiob...
3,Can't Hurt Me,David Goggins,5.0,195888.0,2018-11-28,"David Goggins, Adam Skolnick",13 hrs and 37 mins,B07KKMNZCH,https://www.audible.com/pd/Cant-Hurt-Me-Audiob...
4,"Harry Potter and the Sorcerer's Stone, Book 1",J.K. Rowling,5.0,189948.0,2015-11-20,Jim Dale,8 hrs and 18 mins,B017V4IM1G,https://www.audible.com/pd/Harry-Potter-and-th...
...,...,...,...,...,...,...,...,...,...
99984,Football Is a Numbers Game,Matthew Coller,5.0,5.0,2023-10-03,Matthew Coller,7 hrs and 14 mins,B0CCYTLZYC,https://www.audible.com/pd/Football-Is-a-Numbe...
99985,Notes for Healthy Kids,Rujuta Diwekar,5.0,5.0,2019-08-27,Shernaz Patel,6 hrs and 34 mins,B07W94TKJB,https://www.audible.com/pd/Notes-for-Healthy-K...
99988,Foundation Spanish (Michel Thomas Method) - Le...,Michel Thomas,5.0,5.0,2022-11-17,Michel Thomas,1 hr and 7 mins,B0BL84S4TC,https://www.audible.com/pd/Foundation-Spanish-...
99992,Miracle Ball,"Brian Biegel, Peter Fornatale",5.0,5.0,2013-07-17,Steve Coulter,6 hrs and 6 mins,B00D62OC8A,https://www.audible.com/pd/Miracle-Ball-Audiob...
