## Code for fetching the following listed attributes from librarything.com's website
- LibraryThing page URL of the book
- Some attributes such as 'Members', 'Reviews', 'Popularity', 'Average rating', 'Conversations / Mentions'
- Summary of the book
- Characters (if any)
- Awards (if any)

##### Note: We may not use all of the attributes listed above in the article, but let's collect them all so that we don't have to run the code again and again

In [1]:
import pandas as pd
import ast
import pickle
import requests
from selenium import webdriver
import time
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [2]:
# Local filesystem path of the chromedriver binary used to control Chrome.
driver_path = "/home/pybeast/chromedriver"
# Start the Chrome session that every later cell reuses via `driver`.
# NOTE(review): `executable_path` is the older Selenium calling convention;
# confirm the installed Selenium version still accepts it.
driver = webdriver.Chrome(executable_path=driver_path)

In [3]:
# Load the previously pickled `books` object. The `with` block guarantees the
# file handle is closed even if unpickling fails.
# NOTE: pickle can execute arbitrary code while loading; only load files that
# were produced by this project.
with open("books_v6.pkl", "rb") as books_file:
    books = pickle.load(books_file)

In [4]:
# Pull the 'isbn_13' column out of `books` as a plain Python list
# (presumably `books` is a pandas DataFrame; verify against books_v6.pkl).
isbn_list = books['isbn_13'].tolist()
# Show how many ISBNs will have to be looked up.
print(len(isbn_list))
# Preview the first five ISBNs as the cell's displayed value.
isbn_list[:5]

6500


['9780002005883',
 '9780002261982',
 '9780006163831',
 '9780006178736',
 '9780006280897']

In [5]:
# Search URL template; the `{}` placeholder is filled with an ISBN via
# `library_thing_url.format(isbn)` in the scraping loops.
library_thing_url = "https://www.librarything.com/search.php?search={}"

In [6]:
# Accumulator for the scraped data: maps an ISBN string to a dict holding the
# attributes collected for that book.
d = {}

In [13]:
# Scrape librarything.com for the last slice of `isbn_list`.
#
# For every ISBN the loop searches the site, follows the first result link and
# stores into `d[isbn]`: the book page URL, the single-row stats table, the
# summary, the characters and the awards. One status line is printed per book.
#
# `start_index` drives both the slice and the 1-based progress counter so the
# two can never drift apart.
#
# Every handler catches `Exception` rather than using a bare `except:` so that
# a keyboard/kernel interrupt actually stops the run instead of being logged
# as a missing book.
start_index = 5000
for cnt, isbn in enumerate(isbn_list[start_index:], start=start_index + 1):
    status = ""
    isbn = str(isbn)
    try:
        url = library_thing_url.format(isbn)
        driver.get(url)

        # Wait until the element gets located in the DOM
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "ajaxcontent"))
        )
        time.sleep(3)

        # First link inside the search-result message points at the book page.
        a_tag = driver.find_element_by_class_name('msg').find_element_by_tag_name("a")
        link = a_tag.get_attribute("href")
        d[isbn] = dict()

        # Adding URL of the book's page on librarything.com's website to the dictionary
        d[isbn]['library_thing_url'] = link
        a_tag.click()

        # Do not change this as it's taking minimum 3 seconds for loading the page even if internet speed is high
        time.sleep(3)

        # Single row table: header cells become keys, content cells the values.
        try:
            wsl_header = driver.find_element_by_class_name("wslheader")
            headers = wsl_header.find_elements_by_tag_name("td")

            wsl_content = driver.find_element_by_class_name("wslcontent")
            contents = wsl_content.find_elements_by_tag_name('td')

            for header_cell, content_cell in zip(headers, contents):
                d[isbn][header_cell.text] = content_cell.text
            status += "Done table, "
        except Exception:
            status += "Failed table, "

        # Summary (falls back to an empty string when it cannot be read)
        try:
            wsl_summary = driver.find_element_by_class_name("wslsummary").get_attribute("innerHTML")
            soup = BeautifulSoup(wsl_summary, "html.parser")
            d[isbn]['summary'] = soup.find("td").text
            status += "Done summary, "
        except Exception:
            d[isbn]['summary'] = ""
            status += 'Failed summary, '

        # People/Characters (falls back to an empty list)
        try:
            characters = driver.find_element_by_xpath(
                "//td[@fieldname='characternames']"
            ).get_attribute("innerHTML")
            soup = BeautifulSoup(characters, "html.parser")
            d[isbn]['characters'] = [
                character.get_text()
                for character in soup.find_all("div", class_="divcharacternames")
            ]
            status += "Done characters, "
        except Exception:
            d[isbn]['characters'] = list()
            status += 'Failed characters, '

        # Awards (falls back to an empty list)
        try:
            awards = driver.find_element_by_xpath(
                "//td[@fieldname='awards']"
            ).get_attribute("innerHTML")
            soup = BeautifulSoup(awards, "html.parser")
            d[isbn]['awards_and_honors'] = [
                award.get_text()
                for award in soup.find_all("div", class_="divawards")
            ]
            status += "Done awards"
        except Exception:
            d[isbn]['awards_and_honors'] = list()
            status += "Failed awards"
    except Exception as exc:
        # Name the exception type so a timeout or network error can be told
        # apart from a book that genuinely has no search result.
        status += isbn + " not found (" + type(exc).__name__ + ")"
    print("At #" + str(cnt) + " " + status)

At #5001 Done table, Done summary, Done characters, Done awards
At #5002 Done table, Done summary, Done characters, Done awards
At #5003 Done table, Done summary, Done characters, Done awards
At #5004 Done table, Done summary, Done characters, Done awards
At #5005 Done table, Done summary, Done characters, Done awards
At #5006 Done table, Done summary, Done characters, Done awards
At #5007 Done table, Done summary, Done characters, Done awards
At #5008 Done table, Done summary, Done characters, Done awards
At #5009 Done table, Done summary, Done characters, Done awards
At #5010 Done table, Done summary, Done characters, Done awards
At #5011 Done table, Done summary, Done characters, Done awards
At #5012 Done table, Done summary, Done characters, Done awards
At #5013 Done table, Done summary, Done characters, Done awards
At #5014 Done table, Done summary, Done characters, Done awards
At #5015 Done table, Done summary, Done characters, Done awards
At #5016 Done table, Done summary, Done 

At #5130 Done table, Done summary, Done characters, Done awards
At #5131 Done table, Done summary, Done characters, Done awards
At #5132 Done table, Done summary, Done characters, Done awards
At #5133 Done table, Done summary, Done characters, Done awards
At #5134 Done table, Done summary, Done characters, Done awards
At #5135 Done table, Done summary, Done characters, Done awards
At #5136 Done table, Done summary, Done characters, Done awards
At #5137 Done table, Done summary, Done characters, Done awards
At #5138 Done table, Done summary, Done characters, Done awards
At #5139 Done table, Done summary, Done characters, Done awards
At #5140 Done table, Done summary, Done characters, Done awards
At #5141 Done table, Done summary, Done characters, Done awards
At #5142 Done table, Done summary, Done characters, Done awards
At #5143 Done table, Done summary, Done characters, Done awards
At #5144 Done table, Done summary, Done characters, Done awards
At #5145 Done table, Done summary, Done 

At #5259 Done table, Done summary, Done characters, Done awards
At #5260 Done table, Done summary, Done characters, Done awards
At #5261 Done table, Done summary, Done characters, Done awards
At #5262 Done table, Done summary, Done characters, Done awards
At #5263 Done table, Done summary, Done characters, Done awards
At #5264 Done table, Done summary, Done characters, Done awards
At #5265 Done table, Done summary, Done characters, Done awards
At #5266 Done table, Done summary, Done characters, Done awards
At #5267 Done table, Done summary, Done characters, Done awards
At #5268 Done table, Done summary, Done characters, Done awards
At #5269 Done table, Done summary, Done characters, Done awards
At #5270 Done table, Done summary, Done characters, Done awards
At #5271 Done table, Done summary, Done characters, Done awards
At #5272 Done table, Done summary, Done characters, Done awards
At #5273 Done table, Done summary, Done characters, Done awards
At #5274 Done table, Done summary, Done 

At #5388 Done table, Done summary, Done characters, Done awards
At #5389 Done table, Done summary, Done characters, Done awards
At #5390 Done table, Done summary, Done characters, Done awards
At #5391 Done table, Done summary, Done characters, Done awards
At #5392 Done table, Done summary, Done characters, Done awards
At #5393 Done table, Done summary, Done characters, Done awards
At #5394 Done table, Done summary, Done characters, Done awards
At #5395 Done table, Done summary, Done characters, Done awards
At #5396 Done table, Done summary, Done characters, Done awards
At #5397 Done table, Done summary, Done characters, Done awards
At #5398 Done table, Done summary, Done characters, Done awards
At #5399 Done table, Done summary, Done characters, Done awards
At #5400 Done table, Done summary, Done characters, Done awards
At #5401 Done table, Done summary, Done characters, Done awards
At #5402 Done table, Done summary, Done characters, Done awards
At #5403 Done table, Done summary, Done 

At #5517 Done table, Done summary, Done characters, Done awards
At #5518 Done table, Done summary, Done characters, Done awards
At #5519 Done table, Done summary, Done characters, Done awards
At #5520 Done table, Done summary, Done characters, Done awards
At #5521 Done table, Done summary, Done characters, Done awards
At #5522 Done table, Done summary, Done characters, Done awards
At #5523 Done table, Done summary, Done characters, Done awards
At #5524 9781400078677 not found
At #5525 9781400078745 not found
At #5526 Done table, Done summary, Done characters, Done awards
At #5527 Done table, Done summary, Done characters, Done awards
At #5528 9781400079278 not found
At #5529 9781400079377 not found
At #5530 Done table, Done summary, Done characters, Done awards
At #5531 Done table, Done summary, Done characters, Done awards
At #5532 Done table, Done summary, Done characters, Done awards
At #5533 Done table, Done summary, Done characters, Done awards
At #5534 Done table, Done summary, D

At #5647 Done table, Done summary, Done characters, Done awards
At #5648 Done table, Done summary, Done characters, Done awards
At #5649 Done table, Done summary, Done characters, Done awards
At #5650 Done table, Done summary, Done characters, Done awards
At #5651 Done table, Done summary, Done characters, Done awards
At #5652 Done table, Done summary, Done characters, Done awards
At #5653 Done table, Done summary, Done characters, Done awards
At #5654 Done table, Done summary, Done characters, Done awards
At #5655 Done table, Done summary, Done characters, Done awards
At #5656 Done table, Done summary, Done characters, Done awards
At #5657 Done table, Done summary, Done characters, Done awards
At #5658 Done table, Done summary, Done characters, Done awards
At #5659 Done table, Done summary, Done characters, Done awards
At #5660 Done table, Done summary, Done characters, Done awards
At #5661 Done table, Done summary, Done characters, Done awards
At #5662 Done table, Done summary, Done 

At #5777 Done table, Done summary, Done characters, Done awards
At #5778 Done table, Done summary, Done characters, Done awards
At #5779 Done table, Done summary, Done characters, Done awards
At #5780 Done table, Done summary, Done characters, Done awards
At #5781 Done table, Done summary, Done characters, Done awards
At #5782 Done table, Done summary, Done characters, Done awards
At #5783 Done table, Done summary, Done characters, Done awards
At #5784 Done table, Done summary, Done characters, Done awards
At #5785 Done table, Done summary, Done characters, Done awards
At #5786 Done table, Done summary, Done characters, Done awards
At #5787 Done table, Done summary, Done characters, Done awards
At #5788 Done table, Done summary, Done characters, Done awards
At #5789 Done table, Done summary, Done characters, Done awards
At #5790 Done table, Done summary, Done characters, Done awards
At #5791 Done table, Done summary, Done characters, Done awards
At #5792 Done table, Done summary, Done 

At #5906 Done table, Done summary, Done characters, Done awards
At #5907 Done table, Done summary, Done characters, Done awards
At #5908 Done table, Done summary, Done characters, Done awards
At #5909 Done table, Done summary, Done characters, Done awards
At #5910 Done table, Done summary, Done characters, Done awards
At #5911 Done table, Done summary, Done characters, Done awards
At #5912 Done table, Done summary, Done characters, Done awards
At #5913 Done table, Done summary, Done characters, Done awards
At #5914 Done table, Done summary, Done characters, Done awards
At #5915 Done table, Done summary, Done characters, Done awards
At #5916 Done table, Done summary, Done characters, Done awards
At #5917 Done table, Done summary, Done characters, Done awards
At #5918 Done table, Done summary, Done characters, Done awards
At #5919 Done table, Done summary, Done characters, Done awards
At #5920 Done table, Done summary, Done characters, Done awards
At #5921 Done table, Done summary, Done 

At #6035 Done table, Done summary, Done characters, Done awards
At #6036 Done table, Done summary, Done characters, Done awards
At #6037 Done table, Done summary, Done characters, Done awards
At #6038 Done table, Done summary, Done characters, Done awards
At #6039 Done table, Done summary, Done characters, Done awards
At #6040 Done table, Done summary, Done characters, Done awards
At #6041 Done table, Done summary, Done characters, Done awards
At #6042 Done table, Done summary, Done characters, Done awards
At #6043 Done table, Done summary, Done characters, Done awards
At #6044 Done table, Done summary, Done characters, Done awards
At #6045 Done table, Done summary, Done characters, Done awards
At #6046 Done table, Done summary, Done characters, Done awards
At #6047 Done table, Done summary, Done characters, Done awards
At #6048 Done table, Done summary, Done characters, Done awards
At #6049 Done table, Done summary, Done characters, Done awards
At #6050 Done table, Done summary, Done 

At #6164 Done table, Done summary, Done characters, Done awards
At #6165 Done table, Done summary, Done characters, Done awards
At #6166 Done table, Done summary, Done characters, Done awards
At #6167 Done table, Done summary, Done characters, Done awards
At #6168 Done table, Done summary, Done characters, Done awards
At #6169 Done table, Done summary, Done characters, Done awards
At #6170 Done table, Done summary, Done characters, Done awards
At #6171 Done table, Done summary, Done characters, Done awards
At #6172 Done table, Done summary, Done characters, Done awards
At #6173 Done table, Done summary, Done characters, Done awards
At #6174 Done table, Done summary, Done characters, Done awards
At #6175 Done table, Done summary, Done characters, Done awards
At #6176 Done table, Done summary, Done characters, Done awards
At #6177 Done table, Done summary, Done characters, Done awards
At #6178 Done table, Done summary, Done characters, Done awards
At #6179 Done table, Done summary, Done 

At #6295 Done table, Done summary, Done characters, Done awards
At #6296 Done table, Done summary, Done characters, Done awards
At #6297 Done table, Done summary, Done characters, Done awards
At #6298 Done table, Done summary, Done characters, Done awards
At #6299 Done table, Done summary, Done characters, Done awards
At #6300 Done table, Done summary, Done characters, Done awards
At #6301 Done table, Done summary, Done characters, Done awards
At #6302 Done table, Done summary, Done characters, Done awards
At #6303 Done table, Done summary, Done characters, Done awards
At #6304 Done table, Done summary, Done characters, Done awards
At #6305 Done table, Done summary, Done characters, Done awards
At #6306 Done table, Done summary, Done characters, Done awards
At #6307 Done table, Done summary, Done characters, Done awards
At #6308 Done table, Done summary, Done characters, Done awards
At #6309 Done table, Done summary, Done characters, Done awards
At #6310 Done table, Done summary, Done 

At #6424 Done table, Done summary, Done characters, Done awards
At #6425 Done table, Done summary, Done characters, Done awards
At #6426 Done table, Done summary, Done characters, Done awards
At #6427 Done table, Done summary, Done characters, Done awards
At #6428 Done table, Done summary, Done characters, Done awards
At #6429 Done table, Done summary, Done characters, Done awards
At #6430 Done table, Done summary, Done characters, Done awards
At #6431 Done table, Done summary, Done characters, Done awards
At #6432 Done table, Done summary, Done characters, Done awards
At #6433 Done table, Done summary, Done characters, Done awards
At #6434 9781932206081 not found
At #6435 9781932206104 not found
At #6436 Done table, Done summary, Done characters, Done awards
At #6437 Done table, Done summary, Done characters, Done awards
At #6438 Done table, Done summary, Done characters, Done awards
At #6439 Done table, Done summary, Done characters, Done awards
At #6440 Done table, Done summary, Don

In [16]:
# Spot check: is this ISBN present as a key in the results dict `d`?
"9781741045918" in d

False

In [15]:
# Saving the dictionary into the file as its `str()` representation.
# The `with` block closes the file even if the write fails part-way.
with open("/home/pybeast/Desktop/IndicWiki/BooksProject/librarything_dot_com/generated_files/5001-6500.txt", "w") as f:
    f.write(str(d))

In [7]:
# ISBNs to scrape again because they were missing from the earlier results
# (the list was obtained from generated_files/merging_all_generated_files).
missing_list = ['9780060245603', '9780060527990', '9780060936662', '9780060938109', '9780062700254', '9780064406031', '9780064408585', '9780064434980', '9780064435260', '9780071608916', '9780140153224', '9780198321477', '9780198711551', '9780198752752', '9780199120093', '9780199203611', '9780312352189', '9780312935757', '9780312937720', '9780312941468', '9780312948962', '9780312965785', '9780345384225', '9780345461360', '9780345461612', '9780345478252', '9780349113463', '9780374253844', '9780374506803', '9780375700125', '9780375700262', '9780375703874', '9780375704253', '9780375752674', '9780375756603', '9780385333818', '9780393703351', '9780465069903', '9780465081424', '9780505525154', '9780515139709', '9780517588376', '9780521532488', '9780521532525', '9780523401218', '9780571224388', '9780671708634', '9780671722074', '9781439117088', '9780679724773', '9780679727293', '9780744005615', '9780756400439', '9780756401108', '9780756401535', '9780756402211', '9780756403188', '9780756403256', '9781101487556', '9781400076192', '9781400078677', '9781400078745', '9781400079278', '9781400079377', '9781439116982', '9781439117019', '9781480105690', '9781551052700', '9781741045918', '9781741142396', '9781790877799', '9781840220551', '9781840224535', '9781930901353', '9781932206081', '9781932206104']
# Scrape librarything.com for every ISBN in `missing_list`.
#
# For every ISBN the loop searches the site, follows the first result link and
# stores into `d[isbn]`: the book page URL, the single-row stats table, the
# summary, the characters and the awards. One status line is printed per book,
# numbered from 1.
#
# Every handler catches `Exception` rather than using a bare `except:` so that
# a keyboard/kernel interrupt actually stops the run instead of being logged
# as a missing book.
for cnt, isbn in enumerate(missing_list, start=1):
    status = ""
    isbn = str(isbn)
    try:
        url = library_thing_url.format(isbn)
        driver.get(url)

        # Wait until the element gets located in the DOM
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "ajaxcontent"))
        )
        time.sleep(3)

        # First link inside the search-result message points at the book page.
        a_tag = driver.find_element_by_class_name('msg').find_element_by_tag_name("a")
        link = a_tag.get_attribute("href")
        d[isbn] = dict()

        # Adding URL of the book's page on librarything.com's website to the dictionary
        d[isbn]['library_thing_url'] = link
        a_tag.click()

        # Do not change this as it's taking minimum 3 seconds for loading the page even if internet speed is high
        time.sleep(3)

        # Single row table: header cells become keys, content cells the values.
        try:
            wsl_header = driver.find_element_by_class_name("wslheader")
            headers = wsl_header.find_elements_by_tag_name("td")

            wsl_content = driver.find_element_by_class_name("wslcontent")
            contents = wsl_content.find_elements_by_tag_name('td')

            for header_cell, content_cell in zip(headers, contents):
                d[isbn][header_cell.text] = content_cell.text
            status += "Done table, "
        except Exception:
            status += "Failed table, "

        # Summary (falls back to an empty string when it cannot be read)
        try:
            wsl_summary = driver.find_element_by_class_name("wslsummary").get_attribute("innerHTML")
            soup = BeautifulSoup(wsl_summary, "html.parser")
            d[isbn]['summary'] = soup.find("td").text
            status += "Done summary, "
        except Exception:
            d[isbn]['summary'] = ""
            status += 'Failed summary, '

        # People/Characters (falls back to an empty list)
        try:
            characters = driver.find_element_by_xpath(
                "//td[@fieldname='characternames']"
            ).get_attribute("innerHTML")
            soup = BeautifulSoup(characters, "html.parser")
            d[isbn]['characters'] = [
                character.get_text()
                for character in soup.find_all("div", class_="divcharacternames")
            ]
            status += "Done characters, "
        except Exception:
            d[isbn]['characters'] = list()
            status += 'Failed characters, '

        # Awards (falls back to an empty list)
        try:
            awards = driver.find_element_by_xpath(
                "//td[@fieldname='awards']"
            ).get_attribute("innerHTML")
            soup = BeautifulSoup(awards, "html.parser")
            d[isbn]['awards_and_honors'] = [
                award.get_text()
                for award in soup.find_all("div", class_="divawards")
            ]
            status += "Done awards"
        except Exception:
            d[isbn]['awards_and_honors'] = list()
            status += "Failed awards"
    except Exception as exc:
        # Name the exception type so a timeout or network error can be told
        # apart from a book that genuinely has no search result.
        status += isbn + " not found (" + type(exc).__name__ + ")"
    print("At #" + str(cnt) + " " + status)

At #1 Done table, Done summary, Done characters, Done awards
At #2 Done table, Done summary, Done characters, Done awards
At #3 Done table, Done summary, Done characters, Done awards
At #4 Done table, Done summary, Done characters, Done awards
At #5 Done table, Done summary, Done characters, Done awards
At #6 Done table, Done summary, Done characters, Done awards
At #7 Done table, Done summary, Done characters, Done awards
At #8 Done table, Done summary, Done characters, Done awards
At #9 Done table, Done summary, Done characters, Done awards
At #10 9780071608916 not found
At #11 Done table, Done summary, Done characters, Done awards
At #12 Done table, Done summary, Done characters, Done awards
At #13 Done table, Done summary, Done characters, Done awards
At #14 Done table, Done summary, Done characters, Done awards
At #15 Done table, Done summary, Done characters, Done awards
At #16 Done table, Done summary, Done characters, Done awards
At #17 Done table, Done summary, Done characters

In [9]:
# Write the `str()` representation of the results dict to disk.
# The `with` block closes the file even if the write fails part-way.
with open("generated_files/missing-list.txt", "w") as f:
    f.write(str(d))

### Note: Out of all 6500 books, information for 13 books is not present on librarything.com's website