## Code for fetching the following attributes from librarything.com
- LibraryThing page URL of the book
- Attributes such as 'Members', 'Reviews', 'Popularity', 'Average rating', 'Conversations / Mentions'
- Summary of the book
- Characters (if any)
- Awards (if any)

##### Note: We may not use all of the attributes listed above in the article, but we collect them all now so that the scraper does not have to be run again and again

In [15]:
import pandas as pd
import ast
import pickle
import requests
from selenium import webdriver
import time
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [11]:
# Filesystem location of the chromedriver binary used to launch Chrome.
driver_path = "/home/pybeast/chromedriver"
# Browser session shared by the scraping loop further down (driver.get / element lookups).
# NOTE(review): `executable_path` is reportedly deprecated in Selenium 4 in favour of a
# Service object — confirm against the installed Selenium version.
driver = webdriver.Chrome(executable_path=driver_path)

In [3]:
# Load the pickled books object whose 'isbn_13' column drives the scrape.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load "books_v6.pkl" if it comes from a trusted source.
# The `with` block guarantees the file handle is closed even if unpickling fails.
with open("books_v6.pkl", "rb") as books_file:
    books = pickle.load(books_file)

In [4]:
# Extract the 'isbn_13' column as a plain Python list; these values are the
# search terms fed to the scraping loop.
isbn_list = books['isbn_13'].tolist()
# Report how many ISBNs were loaded.
print(len(isbn_list))
# Preview the first five entries as the cell's output.
isbn_list[:5]

6500


['9780002005883',
 '9780002261982',
 '9780006163831',
 '9780006178736',
 '9780006280897']

In [5]:
# Search URL template; the `{}` placeholder is filled with an ISBN via str.format.
library_thing_url = "https://www.librarything.com/search.php?search={}"

In [6]:
# Accumulator for scraped results, keyed by ISBN string.
d = {}

In [16]:
def _collect_div_texts(driver, field_name, div_class):
    """Return the text of every ``div`` with class *div_class* inside the
    ``td`` whose ``fieldname`` attribute equals *field_name*.

    The Selenium lookup error is propagated when the page has no such cell,
    so the caller can record the field as failed.
    """
    cell_html = driver.find_element(
        By.XPATH, "//td[@fieldname='{}']".format(field_name)
    ).get_attribute("innerHTML")
    soup = BeautifulSoup(cell_html, "html.parser")
    return [div.get_text() for div in soup.find_all("div", class_=div_class)]


# For each ISBN: search librarything.com, follow the first result link, and
# store the work URL, stats table, summary, characters and awards in d[isbn].
# One progress line is printed per ISBN.
#
# `except Exception` is used instead of a bare `except:` so that Ctrl-C
# (KeyboardInterrupt) and SystemExit still stop a long scrape.
# Elements are located with `find_element(By..., ...)`, which works on both
# old and new Selenium releases, rather than the `find_element_by_*` helpers.
for cnt, isbn in enumerate(isbn_list[:5], start=1):
    isbn = str(isbn)
    steps = []  # outcome of each scraping step, joined into the progress line
    try:
        driver.get(library_thing_url.format(isbn))

        # Wait until the element gets located in the DOM
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "ajaxcontent"))
        )

        a_tag = driver.find_element(By.CLASS_NAME, "msg").find_element(By.TAG_NAME, "a")
        link = a_tag.get_attribute("href")
        record = d[isbn] = dict()

        # Adding URL of the book's page on librarything.com's website to the dictionary
        record['library_thing_url'] = link
        a_tag.click()

        # Do not change this as it's taking minimum 3 seconds for loading the page even if internet speed is high
        time.sleep(3)

        # Single row table: header cells become keys, content cells become values
        try:
            headers = driver.find_element(By.CLASS_NAME, "wslheader").find_elements(By.TAG_NAME, "td")
            contents = driver.find_element(By.CLASS_NAME, "wslcontent").find_elements(By.TAG_NAME, "td")
            for header, content in zip(headers, contents):
                record[header.text] = content.text
            steps.append("Done table")
        except Exception:
            steps.append("Failed table")

        # Summary
        try:
            summary_html = driver.find_element(By.CLASS_NAME, "wslsummary").get_attribute("innerHTML")
            record['summary'] = BeautifulSoup(summary_html, "html.parser").find("td").text
            steps.append("Done summary")
        except Exception:
            record['summary'] = ""
            steps.append("Failed summary")

        # People/Characters
        try:
            record['characters'] = _collect_div_texts(driver, 'characternames', 'divcharacternames')
            steps.append("Done characters")
        except Exception:
            record['characters'] = list()
            steps.append("Failed characters")

        # Awards
        try:
            record['awards_and_honors'] = _collect_div_texts(driver, 'awards', 'divawards')
            steps.append("Done awards")
        except Exception:
            record['awards_and_honors'] = list()
            steps.append("Failed awards")
    except Exception:
        # Search page never loaded, or it had no result link to follow
        steps.append(isbn + " not found")
    status = ", ".join(steps)
    print("At #" + str(cnt) + " " + status)

At #1 Done table, Done summary, Done characters, Done awards
At #2 Done table, Done summary, Done characters, Done awards
At #3 Done table, Done summary, Done characters, Done awards
At #4 Done table, Done summary, Done characters, Done awards
At #5 Done table, Done summary, Done characters, Done awards


In [17]:
# Display the scraped records collected so far as the cell's output.
d

{'9780002261982': {'library_thing_url': 'https://www.librarything.com/work/191677',
  'Members': '750',
  'Reviews': '11',
  'Popularity': '22,123',
  'Average rating': '(3.36)',
  'Mentions': '5',
  'summary': "A full-length novel by Charles Osborne adapted from Agatha Christie's stage play, in which a diplomat's wife finds a body that mustn't be discovered... Following BLACK COFFEE and THE UNEXPECTED GUEST comes the final Agatha Christie play novelisation, bringing her superb storytelling to a new legion of fans. Clarissa, the wife of a Foreign Office diplomat, is given to daydreaming. 'Supposing I were to come down one morning and find a dead body in the library, what should I do?' she muses. Clarissa has her chance to find out when she discovers a body in the drawing-room of her house in Kent. Desperate to dispose of the body before her husband comes home with an important foreign politician, Clarissa persuades her three house guests to become accessories and accomplices. It seems 

In [None]:
# Saving the dictionary into the file as its Python literal representation.
# The `with` block closes the file even if the write raises, so the handle
# is never leaked and buffered data is flushed.
with open("0001-1000.txt", "w") as f:
    f.write(str(d))