### Code used to obtain data from Google Books

Attributes obtained: 
 - ISBN13, ISBN10, Google Books ID ( Primary Keys of the Book )
 - Name, Subtitle, Description, Page Count of the book 
 - Author(s), Publisher(s), Originally Published Date of the book
 - Language, Genre(s), Maturity Rating of the book
 - Availability of the book ( as an eBook, ePub and PDF )
 - Thumbnail Links to the Cover Images of the book
 - List of books belonging to the same franchise/series if any
 - List of books published by the same publisher(s) if any 
 - List of books written by the same first author if any
 - List of editions of the same book, and details on their ISBN IDs, formats, date of publishing, publisher and page count if any


This notebook also contains the code used to obtain links from OpenLibrary.

#### Google Books Data Collection - Attributes Part 1

The Google Books API was used to obtain the data here: each response was fetched with urllib and parsed with json.

In [None]:
"""
@author: Pranav
"""

import pandas as pd
import urllib.request
import json
import textwrap
import time

# (output key, path inside the first search result, value used when absent)
_FIELD_PATHS = (
    ('book_title', ("volumeInfo", "title"), ""),
    ('subtitle', ("volumeInfo", "subtitle"), ""),
    ('authors', ("volumeInfo", "authors"), ""),
    ('publisher', ("volumeInfo", "publisher"), ""),
    ('published_date', ("volumeInfo", "publishedDate"), ""),
    ('page_count', ("volumeInfo", "pageCount"), ""),
    ('categories', ("volumeInfo", "categories"), ""),
    ('small_thumbnail_link', ("volumeInfo", "imageLinks", "smallThumbnail"), ""),
    ('large_thumbnail_link', ("volumeInfo", "imageLinks", "thumbnail"), ""),
    ('average_rating', ("volumeInfo", "averageRating"), -1),
    ('ratings_count', ("volumeInfo", "ratingsCount"), -1),
    ('maturity_rating', ("volumeInfo", "maturityRating"), ""),
    ('description', ("volumeInfo", "description"), ""),
    ('language', ("volumeInfo", "language"), ""),
    ('preview_link', ("volumeInfo", "previewLink"), ""),
    ('info_link', ("volumeInfo", "infoLink"), ""),
    ('canonical_volume_link', ("volumeInfo", "canonicalVolumeLink"), ""),
    ('self_link', ("selfLink",), ""),
    ('etag', ("etag",), ""),
    ('availability_as_ebook', ("saleInfo", "isEbook"), ""),
    ('cost', ("saleInfo", "retailPrice", "amount"), ""),
)


def _dig(node, *path, default=""):
    """Follow *path* (dict keys / list indexes) into *node*; return *default* if a step is absent."""
    try:
        for step in path:
            node = node[step]
    except (KeyError, IndexError, TypeError):
        return default
    return node


def _identifier_of_type(volume_info, id_type):
    """Return the industry identifier whose ``type`` equals *id_type*, or "" if there is none."""
    identifiers = _dig(volume_info, "volumeInfo", "industryIdentifiers", default=[])
    if not isinstance(identifiers, list):
        return ""
    for entry in identifiers:
        if isinstance(entry, dict) and entry.get("type") == id_type:
            return entry.get("identifier", "")
    return ""


def api_details_provider(isbn, base_api_link="https://www.googleapis.com/books/v1/volumes?q=isbn:"):
    """Look up one ISBN through the Google Books volumes API.

    Args:
        isbn: The ISBN to search for; it is converted with ``str`` and
            percent-encoded before being appended to ``base_api_link``.
        base_api_link: URL prefix the ISBN is appended to. Defaults to the
            Google Books volumes search endpoint.

    Returns:
        ``{'status': 0}`` when the response contains no ``items`` entry.
        Otherwise a dict with ``'status': 1`` plus one key per collected
        attribute, taken from the first search result. Attributes missing
        from the response are ``""``, except ``average_rating`` and
        ``ratings_count`` which are ``-1``.

    Raises:
        urllib.error.URLError: if the request itself fails (not caught here).
        ValueError: if the response body is not valid JSON.
    """
    from urllib.parse import quote

    with urllib.request.urlopen(base_api_link + quote(str(isbn))) as f:
        obj = json.loads(f.read().decode("utf-8"))

    try:
        volume_info = obj["items"][0]
    except (KeyError, IndexError, TypeError):
        return {'status': 0}

    # Pick the identifiers by their declared type rather than by position:
    # the order of industryIdentifiers is not fixed, and entries of other
    # types can appear in the list.
    detailsdict = {
        'status': 1,
        'isbn_13': _identifier_of_type(volume_info, "ISBN_13"),
        'isbn_10': _identifier_of_type(volume_info, "ISBN_10"),
    }
    for key, path, missing in _FIELD_PATHS:
        detailsdict[key] = _dig(volume_info, *path, default=missing)

    return detailsdict


# Enrich rows of isbn_input.csv with Google Books details and checkpoint the
# result to new_output.csv.

# Slice of rows handled by this run (end is exclusive).
API_FIRST_ROW = 2001
API_END_ROW = 4001

# Columns copied verbatim from the dict returned by api_details_provider.
API_DETAIL_COLUMNS = (
    'isbn_13', 'isbn_10', 'book_title', 'subtitle', 'authors', 'publisher',
    'published_date', 'page_count', 'categories', 'small_thumbnail_link',
    'large_thumbnail_link', 'average_rating', 'ratings_count',
    'maturity_rating', 'description', 'language', 'preview_link',
    'info_link', 'canonical_volume_link', 'self_link', 'etag',
    'availability_as_ebook', 'cost',
)

df = pd.read_csv("isbn_input.csv")
print(df.iloc[0]['isbn13'])
df = df.astype(str)
# Clamp the slice so an input shorter than API_END_ROW ends cleanly instead
# of raising IndexError.
for i in range(API_FIRST_ROW, min(API_END_ROW, len(df))):
    time.sleep(0.75)  # pause between consecutive API requests
    x = df.iloc[i]['isbn13']
    d = api_details_provider(x)
    if d['status']==0:
        print(i,"     ",x," not found.")
        continue

    print(i,"     ",x, "found. Update in progress...",end='')
    # average_rating is stored as returned (it may be fractional); it is no
    # longer truncated with int().
    # NOTE(review): 'authors' and 'categories' are lists when present -
    # confirm the input CSV already has these columns so .at can store them.
    for column in API_DETAIL_COLUMNS:
        df.at[i, column] = d[column]

    # Checkpoint after the row is filled in, so the newest row is on disk too.
    df.to_csv('new_output.csv')
    print("Finished.")

# Final save: guarantees the output file exists even if no ISBN was found.
df.to_csv('new_output.csv')
    

#### Google Books Data Collection - Attributes Part 2

The new Google Books website was searched for each book title, and the details were scraped using Selenium.

In [None]:
from time import sleep
from webdriver_manager.chrome import ChromeDriverManager
from datetime import date 
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
import time
import pandas as pd 


from selenium import webdriver  # this cell calls webdriver.Chrome but did not import it

# Columns filled in by this cell:
# book_editions, book_series, more_by_author, similar_books, publisher_collection
#
# NOTE(review): find_element_by_xpath and the positional driver path passed to
# webdriver.Chrome are legacy Selenium APIs - confirm the installed Selenium
# version still provides them.

# Slice of rows handled by this run (end is exclusive).
SCRAPE_FIRST_ROW = 2804
SCRAPE_END_ROW = 3251

# Labels of the per-edition detail lines, in the order they are stored.
EDITION_FIELD_LABELS = ("ISBN", "Format", "Publisher", "Published", "Length")

# (column, tab caption, XPath template of the n-th title, max listings,
#  per-listing banner, end-of-list banner, tab-missing banner)
TAB_SECTIONS = (
    (
        'book_series',
        "Series",
        "//div[@id='bep-tab-series']/div[1]/div[2]/div[1]/div[1]/div[1]/div[1]/div[{}]/div[1]/div[1]/a[1]/div[1]",
        9,
        "-------------SERIES - LISTING {}-----------",
        "-------NUMBER OF SERIES LISTINGS END HERE-----",
        "------SERIES PAGE DOES NOT EXIST-------",
    ),
    (
        'more_by_author',
        "More by author",
        "//div[@id='bep-tab-content']/g-flippy-carousel[1]/div[1]/div[1]/ol[1]/li[5]/span[1]/div[1]/div[1]/div[1]/div[1]/div[3]/div[1]/div[2]/div[1]/div[1]/div[1]/div[1]/div[{}]/div[1]/div[1]/a[1]/div[1]",
        9,
        "-------------MORE BY THE AUTHOR - LISTING {}-----------",
        "-------NUMBER OF MORE BY THE AUTHOR LISTINGS END HERE-----",
        "------MORE BY THE AUTHOR PAGE DOES NOT EXIST-------",
    ),
    (
        'similar_books',
        "Similar books",
        "//div[@id='bep-tab-sideways']/div[1]/div[2]/div[1]/div[1]/div[1]/div[1]/div[{}]/div[1]/div[1]/a[1]/div[1]",
        14,
        "-------------SIMILAR BOOKS - LISTING {}-----------",
        "-------NUMBER OF SIMILAR BOOKS LISTINGS END HERE-----",
        "------SIMILAR BOOKS PAGE DOES NOT EXIST-------",
    ),
    (
        'publisher_collection',
        "Publisher collection",
        "//div[@id='bep-tab-content']/g-flippy-carousel[1]/div[1]/div[1]/ol[1]/li[5]/span[1]/div[1]/div[1]/div[1]/div[1]/div[3]/div[1]/div[2]/div[1]/div[4]/div[1]/div[1]/div[{}]/div[1]/div[1]/a[1]/div[1]",
        14,
        "-------------PUBLISHER COLLECTION - LISTING {}-----------",
        "-------PUBLISHER COLLECTION LISTINGS END HERE-----",
        "------PUBLISHER COLLECTION PAGE DOES NOT EXIST-------",
    ),
)


def _scrape_editions(driver):
    """Open the 'More editions' tab and return one [ISBN, format, publisher, published, length] list per edition.

    Raises whatever Selenium raises when the tab itself cannot be found or clicked.
    """
    time.sleep(2)  # wait for the search result page before looking for the tab
    driver.find_element_by_xpath("//span[text()='More editions']").click()

    editions = []
    for position in range(1, 15):
        try:
            # The title is only probed to detect whether an edition exists at
            # this position; its text is not stored.
            driver.find_element_by_xpath("//div[@id='bep-editions']/div[1]/div[{}]/div[1]/div[1]/div[1]/a[1]/div[1]".format(position)).text
        except Exception:
            print("-------NUMBER OF EDITIONS ENDS HERE-----")
            break

        print("-------------EDITION {}-----------".format(position))
        details = []
        for label in EDITION_FIELD_LABELS:
            try:
                cell = driver.find_element_by_xpath("(//div[text()='{}:  '])[{}]".format(label, position))
                details.append(cell.text.split(':  ')[1])
            except Exception:
                details.append("")  # keep the slot so every edition has five entries
        editions.append(details)
    return editions


def _scrape_tab_titles(driver, tab_caption, item_xpath, max_items, listing_banner, end_banner):
    """Click the tab captioned *tab_caption* and return up to *max_items* listed titles.

    Raises whatever Selenium raises when the tab itself cannot be found or clicked.
    """
    driver.find_element_by_xpath("//span[text()='{}']".format(tab_caption)).click()
    time.sleep(2)  # give the tab content time to render

    titles = []
    for position in range(1, max_items + 1):
        try:
            name = driver.find_element_by_xpath(item_xpath.format(position)).text
        except Exception:
            print(end_banner)
            break
        print(listing_banner.format(position))
        titles.append(name)
    return titles


df = pd.read_csv("books_v5.csv")
df = df.astype({'book_editions':str,'book_series':str,'more_by_author':str,'similar_books':str,'publisher_collection':str})
driver = webdriver.Chrome(ChromeDriverManager().install())

try:
    # Clamp the slice so a file shorter than SCRAPE_END_ROW ends cleanly.
    for j in range(SCRAPE_FIRST_ROW, min(SCRAPE_END_ROW, len(df))):

        title = df.iloc[j]['book_title']
        if pd.isna(title):
            # A missing title cannot be typed into the search box.
            print(j, "   missing title, skipped")
            continue

        # Any book page works as a starting point: it is only used for its search box.
        driver.get(r'https://www.google.co.in/books/edition/The_Four_Loves/69_-CwAAQBAJ?hl=en')

        search_box = driver.find_element_by_xpath("//form[@action='/search']//input[1]")
        search_box.send_keys(title)
        search_box.send_keys(Keys.RETURN)
        print("###############################")
        print("### ",title," #####")
        print("###############################")

        # `except Exception` (not a bare except) keeps the scrape best-effort
        # while still letting Ctrl-C interrupt a long run.
        try:
            lis = _scrape_editions(driver)
            print(lis)
            df.at[j,'book_editions'] = lis
        except Exception:
            print("------EDITIONS PAGE DOES NOT EXIST-------")

        for column, caption, item_xpath, max_items, listing_banner, end_banner, missing_banner in TAB_SECTIONS:
            try:
                names = _scrape_tab_titles(driver, caption, item_xpath, max_items, listing_banner, end_banner)
                print(names)
                # Each column receives the list scraped from its own tab.
                df.at[j, column] = names
            except Exception:
                print(missing_banner)

        # Checkpoint after every book. index=False stops each save/reload
        # cycle of this file from adding another "Unnamed: 0" column.
        df.to_csv("books_v5.csv", index=False)
finally:
    driver.quit()  # always release the browser, even after an error or Ctrl-C


#### OpenLibrary Code

In [None]:
from time import sleep
from webdriver_manager.chrome import ChromeDriverManager
from datetime import date 
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from selenium import webdriver
import time
import pandas as pd 

# Look up every book title on OpenLibrary's advanced search and store the URL
# of the first result in the 'univ_link' column of books_v5.csv.

OPENLIBRARY_END_ROW = 3251  # rows [0, OPENLIBRARY_END_ROW) are processed

df = pd.read_csv("books_v5.csv")
df = df.astype({'univ_link':str})
driver = webdriver.Chrome(ChromeDriverManager().install())

try:
    # Clamp to the file length so a shorter CSV does not raise IndexError.
    for i in range(0, min(OPENLIBRARY_END_ROW, len(df))):
        title = df.iloc[i]['book_title']
        if pd.isna(title):
            # A missing title cannot be typed into the search form.
            print(i, "   missing title, skipped")
            continue

        driver.get(r'https://openlibrary.org/advancedsearch')

        search_title = driver.find_element_by_xpath("(//input[@name='title'])[1]")
        search_title.send_keys(title)

        submit_button = driver.find_element_by_xpath("(//input[@type='submit'])[2]")
        submit_button.click()

        try:
            first_result = driver.find_element_by_xpath("(//a[@class='results'])[1]")
            first_result.click()
            x = driver.current_url
            print(i,"  ",title,"   ",x)
            df.at[i,'univ_link']=x
        except Exception:
            # Best-effort: a title with no search result is reported and
            # skipped. Not a bare except, so Ctrl-C still stops the run.
            print("{} URL not available".format(title))

        # Checkpoint after every title. index=False stops each save/reload
        # cycle of this file from adding another "Unnamed: 0" column.
        df.to_csv("books_v5.csv", index=False)
finally:
    driver.quit()  # always release the browser
    