## Code for fetching the following listed attributes from librarything.com's website
- LibraryThing page URL of the book
- Some attributes such as 'Members' (e.g. '750'), 'Reviews', 'Popularity', 'Average rating', 'Conversations / Mentions'
- Summary of the book
- Characters(if any) 
- Awards (if any)
##### Note: We may not use all of the attributes listed above in the article, but let's collect them anyway so that we don't have to run the code again and again

In [1]:
import pandas as pd
import ast
import pickle
import requests
from selenium import webdriver
import time
from bs4 import BeautifulSoup

In [8]:
# Path to the local chromedriver binary (machine-specific; adjust for your environment).
driver_path = "/home/pybeast/chromedriver"
# Start a Chrome session driven by Selenium; `driver` is reused by the scraping loop.
driver = webdriver.Chrome(executable_path=driver_path)

In [3]:
# Load the previously pickled books table. It is presumably a pandas DataFrame
# (it is later indexed by column name and `.tolist()` is called) -- TODO confirm.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
# The `with` block guarantees the file handle is closed after loading.
with open("books_v6.pkl", "rb") as books_file:
    books = pickle.load(books_file)

In [4]:
# All values of the 'isbn_13' column as a plain Python list; these are the
# search keys used by the scraping loop.
isbn_list = books['isbn_13'].tolist()
print(len(isbn_list))
# Notebook display: preview the first five ISBNs.
isbn_list[:5]

6500


['9780002005883',
 '9780002261982',
 '9780006163831',
 '9780006178736',
 '9780006280897']

In [5]:
# LibraryThing search URL template; the `{}` placeholder is filled with an ISBN
# via `str.format` in the scraping loop.
library_thing_url = "https://www.librarything.com/search.php?search={}"

In [6]:
# Scraped results: maps each ISBN string to a dict of the attributes collected
# for that book by the scraping loop.
d = {}

In [9]:
# Scrape LibraryThing for the first 1000 ISBNs and store the results in `d`,
# keyed by ISBN string. For each book we record the work-page URL, the
# single-row statistics table, the summary, the characters and the awards.
# Each section is best-effort: a failure is noted in `status` and the loop
# moves on. Handlers catch `Exception` rather than using a bare `except:` so
# that KeyboardInterrupt / SystemExit still stop a long-running scrape.
cnt = 0
for isbn in isbn_list[:1000]:
    status = ""
    cnt += 1
    isbn = str(isbn)
    try:
        url = library_thing_url.format(isbn)
        driver.get(url)

        # Do not shorten this wait: the page takes at least 3 seconds to load
        # even on a fast connection.
        time.sleep(5)
        # The first link inside the 'msg' element leads to the book's work page.
        a_tag = driver.find_element_by_class_name('msg').find_element_by_tag_name("a")
        link = a_tag.get_attribute("href")
        d[isbn] = {}
        d[isbn]['library_thing_url'] = link
        a_tag.click()

        # Do not shorten this wait: the page takes at least 3 seconds to load
        # even on a fast connection.
        time.sleep(3)

        # Single row table: header cells become keys, content cells become values.
        try:
            wsl_header = driver.find_element_by_class_name("wslheader")
            headers = wsl_header.find_elements_by_tag_name("td")

            wsl_content = driver.find_element_by_class_name("wslcontent")
            contents = wsl_content.find_elements_by_tag_name('td')

            for i, j in zip(headers, contents):
                d[isbn][i.text] = j.text
            status += "Done table, "
        except Exception:
            status += "Failed table, "

        # Summary: text of the first <td> inside the 'wslsummary' element.
        try:
            wsl_summary = driver.find_element_by_class_name("wslsummary").get_attribute("innerHTML")
            soup = BeautifulSoup(wsl_summary, "html.parser")
            summary = soup.find("td").text
            d[isbn]['summary'] = summary
            status += "Done summary, "
        except Exception:
            # Keep the key present so every scraped book has a 'summary' entry.
            d[isbn]['summary'] = ""
            status += 'Failed summary, '

        # People/Characters: the list stays empty when the field is missing.
        try:
            d[isbn]['characters'] = []
            characters = driver.find_element_by_xpath("//td[@fieldname='characternames']").get_attribute("innerHTML")
            soup = BeautifulSoup(characters, "html.parser")
            for character in soup.find_all("div", class_="divcharacternames"):
                d[isbn]['characters'].append(character.get_text())
            status += "Done characters, "
        except Exception:
            status += 'Failed characters, '

        # Awards: the list stays empty when the field is missing.
        try:
            d[isbn]['awards'] = []
            awards = driver.find_element_by_xpath("//td[@fieldname='awards']").get_attribute("innerHTML")
            soup = BeautifulSoup(awards, "html.parser")
            for award in soup.find_all("div", class_="divawards"):
                d[isbn]['awards'].append(award.get_text())
            status += "Done awards"
        except Exception:
            status += "Failed awards"
    except Exception:
        # NOTE(review): any failure in the search / navigation step (not only a
        # missing search result) is reported as "not found".
        status += (isbn + " not found")
    print("At #" + str(cnt) + " " + status)

At #1 Done table, Done summary, Done characters, Done awards
At #2 Done table, Done summary, Done characters, Done awards
At #3 Done table, Done summary, Done characters, Done awards
At #4 Done table, Done summary, Done characters, Done awards
At #5 Done table, Done summary, Done characters, Done awards
At #6 Done table, Done summary, Done characters, Done awards
At #7 Done table, Done summary, Done characters, Done awards
At #8 Done table, Done summary, Done characters, Done awards
At #9 Done table, Done summary, Done characters, Done awards
At #10 Done table, Done summary, Done characters, Done awards
At #11 Done table, Done summary, Done characters, Done awards
At #12 Done table, Done summary, Done characters, Done awards
At #13 Done table, Done summary, Done characters, Done awards
At #14 Done table, Done summary, Done characters, Done awards
At #15 Done table, Done summary, Done characters, Done awards
At #16 Done table, Done summary, Done characters, Done awards
At #17 Done table

At #300 9780060898656 not found
At #301 9780060899226 not found
At #302 9780060906825 not found
At #303 9780060913076 not found
At #304 9780060915186 not found
At #305 9780060915414 not found
At #306 9780060915438 not found
At #307 9780060916091 not found
At #308 9780060916466 not found
At #309 9780060916497 not found
At #310 9780060916510 not found
At #311 9780060916817 not found
At #312 9780060919887 not found
At #313 9780060920081 not found
At #314 9780060920647 not found
At #315 9780060921088 not found
At #316 9780060921712 not found
At #317 9780060922559 not found
At #318 9780060924980 not found
At #319 9780060925758 not found
At #320 9780060925819 not found
At #321 9780060926960 not found
At #322 9780060927196 not found
At #323 9780060927516 not found
At #324 9780060927547 not found
At #325 9780060928414 not found
At #326 9780060929596 not found
At #327 9780060929602 not found
At #328 9780060929879 not found
At #329 9780060930059 not found
At #330 9780060930134 not found
At #331 

At #563 9780099468646 not found
At #564 9780099471370 not found
At #565 9780099471431 not found
At #566 9780099472636 not found
At #567 9780099474128 not found
At #568 9780099474395 not found
At #569 9780099474425 not found
At #570 9780099474463 not found
At #571 9780099474548 not found
At #572 9780099476337 not found
At #573 9780099476351 not found
At #574 9780099477310 not found
At #575 9780099478393 not found
At #576 9780099478423 not found
At #577 9780099478447 not found
At #578 9780099481560 not found
At #579 9780099481683 not found
At #580 9780099483472 not found
At #581 9780099489986 not found
At #582 9780099490685 not found
At #583 9780099498582 not found
At #584 9780099498599 not found
At #585 9780099498636 not found
At #586 9780099498667 not found
At #587 9780099556312 not found
At #588 9780099578512 not found
At #589 9780099582014 not found
At #590 9780099595816 not found
At #591 9780099730514 not found
At #592 9780099742005 not found
At #593 9780099748618 not found
At #594 

At #827 9780140714531 not found
At #828 9780140714548 not found
At #829 9780140714555 not found
At #830 9780140771978 not found
At #831 9780140817782 not found
At #832 9780141000183 not found
At #833 9780141000190 not found
At #834 9780141000589 not found
At #835 9780141001821 not found
At #836 9780141001876 not found
At #837 9780141002989 not found
At #838 9780141005348 not found
At #839 9780141007472 not found
At #840 9780141010311 not found
At #841 9780141010373 not found
At #842 9780141011110 not found
At #843 9780141013459 not found
At #844 9780141013954 not found
At #845 9780141014081 not found
At #846 9780141015088 not found
At #847 9780141016702 not found
At #848 9780141019994 not found
At #849 9780141026282 not found
At #850 9780141180366 not found
At #851 9780141180632 not found
At #852 9781101191149 not found
At #853 9780141180946 not found
At #854 9780141181226 not found
At #855 9781101177273 not found
At #856 9780141182490 not found
At #857 9780141182582 not found
At #858 

In [10]:
d

{'9780002261982': {'library_thing_url': 'https://www.librarything.com/work/191677',
  'Members': '750',
  'Reviews': '11',
  'Popularity': '22,123',
  'Average rating': '(3.36)',
  'Mentions': '5',
  'summary': "A full-length novel by Charles Osborne adapted from Agatha Christie's stage play, in which a diplomat's wife finds a body that mustn't be discovered... Following BLACK COFFEE and THE UNEXPECTED GUEST comes the final Agatha Christie play novelisation, bringing her superb storytelling to a new legion of fans. Clarissa, the wife of a Foreign Office diplomat, is given to daydreaming. 'Supposing I were to come down one morning and find a dead body in the library, what should I do?' she muses. Clarissa has her chance to find out when she discovers a body in the drawing-room of her house in Kent. Desperate to dispose of the body before her husband comes home with an important foreign politician, Clarissa persuades her three house guests to become accessories and accomplices. It seems 