# Scrape data from: https://en.m.wikipedia.org/wiki/Lists_of_The_New_York_Times_Fiction_Best_Sellers

In [1]:
# Import Libraries
from bs4 import BeautifulSoup as bs
import requests
import os
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd

In [2]:
# Function to get the years best seller table into a list
def yearTable_toList():
    
    # Initialize the Beautiful Soup instance and read the html with pandas
    html = browser.html
    soup = bs(html, "html.parser")
    table = pd.read_html(html)
    
    # Get the First index, which is table on the year page.
    df = table[0]

    # Format the dataframe to only have unique book titles
    df = df.drop(columns = ["Date", "Author"])
    df = df.drop_duplicates()

    # Convert the dataframe to a list
    bookList = df["Book"].tolist()
    
    # Return the bookList
    return(bookList)

In [3]:
# Store the URL
nytimes_url = "https://en.m.wikipedia.org/wiki/Lists_of_The_New_York_Times_Fiction_Best_Sellers"

In [4]:
# Setup the Splinter Instance
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)

[WDM] - Current google-chrome version is 87.0.4280
[WDM] - Get LATEST driver version for 87.0.4280
[WDM] - Driver [C:\Users\ryana\.wdm\drivers\chromedriver\win32\87.0.4280.88\chromedriver.exe] found in cache


 


In [5]:
# Declare the dictionary variable
bookDict = {}

# Loop through the years in the best sellers list
for year in range(1931,2021):
    try:
        # Go back to the main list to continue navigation
        browser.visit(nytimes_url)
        
        # Navigate to each year's page
        browser.click_link_by_partial_href(year) 
        
        # Call the yearTable_toList and assign it to the bookList variable
        bookList = yearTable_toList()
        
        # Append the list to a dictionary with the year as the reference
        bookDict[year] = bookList
        
        # Output the current year, to keep track of where data has been scraped
        print(year)
        
        # Reset the bookList variable for next sequence
        bookList = []

    except:
        print("Error: Couldn't Scrape Data")



1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
Error: Couldn't Scrape Data
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020


In [6]:
# Close the browser
browser.quit()

In [7]:
print(bookDict)

{1931: ['The Ten Commandments', 'No List Published', 'Maid in Waiting'], 1932: ['Maid in Waiting', 'The Harbourmaster', 'Mr. and Mrs. Pennington', 'The End of Desire', "Mary's Neck", 'Magnolia Street', 'Bright Skin', 'A Modern Hero', 'The Good Earth', 'District Nurse', 'The Fountain', 'Faraway', 'Lark Ascending', 'A New York Tempest', 'The Sheltered Life', 'Sons', 'Invitation to the Waltz', 'Flowering Wilderness'], 1933: ['Flowering Wilderness', 'The Last Adam', 'Ann Vickers', 'The Werewolf of Paris', 'Rain in the Doorway', 'As the Earth Turns', 'The Store', 'Little Man, What Now?', 'Anthony Adverse', nan], 1934: ['Anthony Adverse', 'The Thin Man', 'Work of Art', 'The Oppermanns', 'Seven Gothic Tales', 'Tender is the Night', 'Lamb in His Bosom', 'Five Silver Daughters', 'I, Claudius', 'So Red the Rose', 'Lost Horizon', 'The Forty Days of Musa Dagh'], 1935: ['The Forty Days of Musa Dagh', "Heaven's My Destination", 'Come and Get It', 'Of Time and the River', 'Green Light', 'Now in Novem

In [8]:
finalBook_df = pd.DataFrame.from_dict(bookDict, orient = "index")
finalBook_df.transpose()

Unnamed: 0,1931,1932,1933,1934,1935,1936,1937,1938,1939,1940,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,The Ten Commandments,Maid in Waiting,Flowering Wilderness,Anthony Adverse,The Forty Days of Musa Dagh,It Can't Happen Here,Gone with the Wind,The Citadel,All This and Heaven Too,The Nazarene,...,Dead or Alive,11/22/63,The Racketeer,Sycamore Row,Gray Mountain,Rogue Lawyer,The Whistler,Origin,The Reckoning,Where the Crawdads Sing
1,No List Published,The Harbourmaster,The Last Adam,The Thin Man,Heaven's My Destination,If I Have Four Apples,Drums Along the Mohawk,The Prodigal Parents,Rebecca,Kitty Foyle,...,What the Night Knows,77 Shadow Street,Gone Girl,The Invention of Wings,Gone Girl,The Girl on the Train,The Wrong Side of Goodbye,The People vs. Alex Cross,Long Road to Mercy,American Dirt
2,Maid in Waiting,Mr. and Mrs. Pennington,Ann Vickers,Work of Art,Come and Get It,The Last Puritan,Theatre,Action at Aquila,Wickford Point,How Green Was My Valley,...,The Girl Who Kicked the Hornets' Nest,Private: #1 Suspect,A Memory of Light,The Goldfinch,All the Light We Cannot See,Scandalous Behavior,The Mistress,The Woman in the Window,Verses for the Dead,Golden in Death
3,,The End of Desire,The Werewolf of Paris,The Oppermanns,Of Time and the River,Sparkenbroke,The Years,The Yearling,The Grapes of Wrath,Native Son,...,The Inner Circle,Believing the Lie,Private Berlin,Private L.A.,The Girl on the Train,Blue,A Dog's Purpose,City of Endless Night,Turning Point,One Minute Out
4,,Mary's Neck,Rain in the Doorway,Seven Gothic Tales,Green Light,The Weather in the Streets,The Outward Room,"My Son, My Son!",Escape,Stars on the Sea,...,Shadowfever,Taken,Until the End of Time,Concealed in Death,The Liar,NYPD Red 4,Never Never,Judgment Road,Where the Crawdads Sing,Blindside
5,,Magnolia Street,As the Earth Turns,Tender is the Night,Now in November,The Doctor,Northwest Passage,Rebecca,Kitty Foyle,Mrs. Miniver,...,Tick Tock,Home Front,A Week in Winter,The Chance,Memory Man,Brotherhood In Death,Right Behind You,Dark in Death,Connections in Death,House of Earth and Blood
6,,Bright Skin,The Store,Lamb in His Bosom,Young Renny,Sanfelice,The Citadel,All This and Heaven Too,The Nazarene,The Beloved Returns,...,Alone,Kill Shot,"Alex Cross, Run",Words of Radiance,Gathering Prey,Morning Star,Echoes in Death,The Great Alone,Redemption,The Mirror and the Light
7,,A Modern Hero,"Little Man, What Now?",Five Silver Daughters,Paths of Glory,Gone with the Wind,,,Moment in Peking,You Can't Go Home Again,...,Treachery in Death,Private Games,Calculated in Death,Night Broken,14th Deadly Sin,Cometh the hour,Heartbreak Hotel,Burn Bright,Neon Prey,The Boy from the Woods
8,,The Good Earth,Anthony Adverse,"I, Claudius",Lucy Gayheart,,,,,For Whom the Bell Tolls,...,The Wise Man's Fear,Celebrity in Death,Reckless,Missing You,Radiant Angel,Me Before You,The Shack,The Rising Sea,18th Abduction,Little Fires Everywhere
9,,District Nurse,,So Red the Rose,Vein of Iron,,,,,Oliver Wiswell,...,Water for Elephants,Fifty Shades of Grey,Six Years,Shadow Spell,Finders Keepers,The Gangster,Mississippi Blood,Accidental Heroes,Summer of '69,Masked Prey
