# Extend Data

Extend to book data with data from OpenLibrary https://openlibrary.org.

```bash
curl 'https://openlibrary.org/api/books?bibkeys=ISBN:9789513114725&jscmd=data&format=json'
```

Original book data structure:  
`"ISBN";"Book-Title";"Book-Author";"Year-Of-Publication";"Publisher";"Image-URL-S";"Image-URL-M";"Image-URL-L"`

OpenLibrary:   
```json
{
   "ISBN:9789513114725":{
    "url":"https://openlibrary.org/books/OL37028318M/Harry_Potter_ja_salaisuuksien_kammio",
      "key":"/books/OL37028318M",
      "title":"Harry Potter ja salaisuuksien kammio",
      "authors":[
         {
            "url":"https://openlibrary.org/authors/OL23919A/J._K._Rowling",
            "name":"J. K. Rowling"
         }
      ],
      "number_of_pages":365,
      "pagination":"365 pages",
      "by_statement":"J.K. Rowling ; suomentanut Jaana Kapari",
      "identifiers":{
         "goodreads":[
            "60408839"
         ],
         "isbn_10":[
            "9513114724"
         ],
      },
      "classifications":{
         "dewey_decimal_class":[
            "Fic"
         ]
      },
      "publishers":[
         {
            "name":"Kustannusosakeyhtiö Tammi"
         }
      ],
      "publish_places":[
         {
            "name":"Helsinki"
         }
      ],
      "publish_date":"2001",
      "subjects": [
         {
            "name":"Fantasy fiction",
            "url":"https://openlibrary.org/subjects/fantasy_fiction"
         },
         {
            "name":"school stories",
            "url":"https://openlibrary.org/subjects/school_stories"
         },
         {
            "name":"Fiction",
            "url":"https://openlibrary.org/subjects/fiction"
         },
      ],
      "excerpts":[
         {
            "text":"Not for the first time, an argument had broken out over breakfast at number four, Privet Drive.",
            "comment":"first sentence"
         }
      ],
      "notes":"Translation of: Harry Potter and the chamber of secrets.\r\n\r\nSequel to: Harry Potter and the philosopher's stone.",
      "ebooks":[
         {
            "preview_url":"https://archive.org/details/harrypotterjasal0000rowl",
            "availability":"borrow",
            "formats":{
               
            },
            "borrow_url":"https://openlibrary.org/books/OL37028318M/Harry_Potter_ja_salaisuuksien_kammio/borrow",
            "checkedout":false
         }
      ],
      "cover":{
         "small":"https://covers.openlibrary.org/b/id/12606939-S.jpg",
         "medium":"https://covers.openlibrary.org/b/id/12606939-M.jpg",
         "large":"https://covers.openlibrary.org/b/id/12606939-L.jpg"
      }
    }
}
```

In [18]:
# Load Data.
import pandas as pd

books = pd.read_csv('./BX-Books-cleaned.csv', sep=";", dtype=str, encoding="utf-8", escapechar='\\')

In [12]:
import requests
import multiprocessing.pool
import multiprocessing as mp
import csv

open_library_api = "https://openlibrary.org/api/books"

def process_response(response, idx, isbn, return_dict, book_data):
    additional_book_data = {
        'excerpts': "",
        'number_of_pages': 0,
        'subjects': [],
        'publish_places': []
    }

    try:
        response.raise_for_status()  # raises exception when not a 2xx response
        if response.status_code != 204:
            response = response.json()
        else:
            print("response error: "+isbn)
            print(response)
    except:
        print("exception: "+isbn)
        
    excerpt = ""
    number_of_pages = 0
    publish_places = []
    subjects = []

    # if book response is not empty (book cannot be found e.g. ISBN:0965404560)
    if(isbn in response):
        if 'excerpts' in response[isbn]:
            excerpt = response[isbn]["excerpts"][0]["text"]

        
        if 'number_of_pages' in response[isbn]:
            number_of_pages = response[isbn]["number_of_pages"]

        
        if 'publish_places' in response[isbn]:
            for place in response[isbn]["publish_places"]:
                publish_places.append(place["name"])

        
        if 'subjects' in response[isbn]:
            for sub in response[isbn]["subjects"]:
                subjects.append(sub["name"])

    additional_book_data["excerpt"] = excerpt.replace('\r','').replace('\n',' ')
    additional_book_data["number_of_pages"] = number_of_pages
    additional_book_data["publish_places"] = publish_places
    additional_book_data["subjects"] = subjects

    return_dict[isbn.split("ISBN:")[1]] = additional_book_data
    book_data["excerpts"].append(excerpt.replace('\r','').replace('\n',' '))
    book_data["number_of_pages"].append(number_of_pages)
    book_data["publish_places"].append(publish_places)
    book_data["subjects"].append(subjects)
    

def fetch_book_data(return_dict, start, end):
    book_data = {} 

    for idx in books[start:end].index:
        isbn_nr = books["ISBN"][idx]
        isbn = f"ISBN:{isbn_nr}"
        book_data[idx] = {
            'excerpts': [],
            'number_of_pages': [],
            'subjects': [],
            'publish_places': [],
            'fails': []
        }

        request = f'{open_library_api}?bibkeys={isbn}&jscmd=data&format=json'

        try:
            response = requests.get(request, headers={'User-Agent': 'Mozilla/5.0'})
            process_response(response, idx, isbn, return_dict, book_data)
        except:
            print("\nfetch book data exception: "+isbn)
            book_data[idx]['fails'].append(isbn)
        finally:
            append_book_data(book_data, idx, start, end)

def append_book_data(data, index, start=0, end=267959):
    book_df = books[index: index+1]

    # print(f"data[index] {data[index]}")

    if data[index]['fails']:
        failures_isbns = {'fails': data[index]["fails"] }
        print(failures_isbns)
        pd.DataFrame(failures_isbns).to_csv(f'./BX-Books-extended-{start}-{end}-fails.csv', index=False, mode='a', header=False, quoting=csv.QUOTE_ALL, sep=';')
        return

    book_df.loc[:,'Excerpt'] = data[index]["excerpts"]
    book_df.loc[:, "Pages"] = data[index]["number_of_pages"]
    book_df.loc[:, "Publish-Places"] = data[index]["publish_places"]
    book_df.loc[:, "Subjects"] = data[index]["subjects"]

    book_df.to_csv(f'./BX-Books-extended-{start}-{end}.csv', index=False, mode='a', header=False, quoting=csv.QUOTE_ALL, sep=';')


# --------------------------------------------------------------------------------
# Parallel part

# protect the entry point
if __name__ == '__main__':
    # create and configure the process pool
    manager = multiprocessing.Manager()
    pool = mp.Pool(5)
    return_dict = manager.dict()
    jobs = []
    offset=0
    rang=45729 # adapt to cleaned book dataset length 228643 / 5 = 45728,6 

    for i in range(5):
        pool.apply_async(fetch_book_data, args=(return_dict, (i*rang)+offset, (i*rang)+rang+offset))

    pool.close()
    pool.join() 

    print('All tasks are done', flush=True)
    print(len(return_dict.keys()))

8

fetch book data exception: ISBN:0195153448
fetch book data exception: ISBN:080652121X

{'fails': ['ISBN:0195153448']}{'fails': ['ISBN:080652121X']}


fetch book data exception: ISBN:0425176428
fetch book data exception: ISBN:1881320189

{'fails': ['ISBN:0425176428']}{'fails': ['ISBN:1881320189']}


fetch book data exception: ISBN:0671870432
fetch book data exception: ISBN:0887841740

{'fails': ['ISBN:0887841740']}{'fails': ['ISBN:0671870432']}


fetch book data exception: ISBN:0452264464
fetch book data exception: ISBN:0002005018

{'fails': ['ISBN:0452264464']}
{'fails': ['ISBN:0002005018']}

fetch book data exception: ISBN:0060973129
{'fails': ['ISBN:0060973129']}

fetch book data exception: ISBN:0061076031
{'fails': ['ISBN:0061076031']}

fetch book data exception: ISBN:0439095026
{'fails': ['ISBN:0439095026']}

fetch book data exception: ISBN:0679425608
{'fails': ['ISBN:0679425608']}

fetch book data exception: ISBN:0609804618
{'fails': ['ISBN:0609804618']}

fetch book data except

In [37]:
# Merge Extended Data to one file.

import pandas as pd 
import csv

# Load extended data
extended_b1 = pd.read_csv('./BX-Books-extended-0-45729.csv', sep=";", dtype=str, encoding="iso-8859-1", escapechar='\\')
extended_b2 = pd.read_csv('./BX-Books-extended-45729-91458.csv', sep=";", dtype=str, encoding="iso-8859-1", escapechar='\\')
extended_b3 = pd.read_csv('./BX-Books-extended-91458-137187.csv', sep=";", dtype=str, encoding="iso-8859-1", escapechar='\\')
extended_b4 = pd.read_csv('./BX-Books-extended-137187-182916.csv', sep=";", dtype=str, encoding="iso-8859-1", escapechar='\\')
extended_b5 = pd.read_csv('./BX-Books-extended-182916-228645.csv', sep=";", dtype=str, encoding="iso-8859-1", escapechar='\\')

full_extended_books = pd.concat([extended_b1, extended_b2, extended_b3, extended_b4, extended_b5], axis=0, join='inner', ignore_index=True, keys=None,
               levels=None, names=None, verify_integrity=False, sort=False, copy=True)

# Save extended data in new file
full_extended_books.to_csv('./BX-Books-extended.csv', header=True, index=False, quoting=csv.QUOTE_ALL, sep=';')

display(full_extended_books[45725:45730])
print(f"Length of books {len(books)}")
print(f"Length of extended books {len(full_extended_books)}")

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,Excerpt,Number-Of-Pages,Publish-Places,Tags
45725,1857997638,The Pit (Phoenix 60P Paperbacks),Doris Lessing,1996,Orion Publishing Co,http://images.amazon.com/images/P/1857997638.0...,http://images.amazon.com/images/P/1857997638.0...,http://images.amazon.com/images/P/1857997638.0...,,64,[],['English fiction']
45726,0140106286,Invention Of Solitude,Paul Auster,1988,Penguin Books,http://images.amazon.com/images/P/0140106286.0...,http://images.amazon.com/images/P/0140106286.0...,http://images.amazon.com/images/P/0140106286.0...,One day there is life.,173,"['New York, N.Y., U.S.A']","['American Authors', 'Authors, American', 'Bio..."
45727,034064415X,Beginner'S Japanese (Teach Yourself: Beginner'S),Helen Gilhooly,1996,Hodder Arnold Teach Yourself,http://images.amazon.com/images/P/034064415X.0...,http://images.amazon.com/images/P/034064415X.0...,http://images.amazon.com/images/P/034064415X.0...,,296,"['London', 'Lincolnwood, Ill']","['English', 'Japanese language', 'Self-instruc..."
45728,006093025X,The Mother Dance: How Children Change Your Life,Harriet Lerner,1999,Perennial,http://images.amazon.com/images/P/006093025X.0...,http://images.amazon.com/images/P/006093025X.0...,http://images.amazon.com/images/P/006093025X.0...,,336,[],"['Family & Relationships', 'Nonfiction', 'Self..."
45729,0753109263,Zelda'S Cut,Philippa Gregory,2003,Ulverscroft Large Print,http://images.amazon.com/images/P/0753109263.0...,http://images.amazon.com/images/P/0753109263.0...,http://images.amazon.com/images/P/0753109263.0...,,0,[],"['Fiction', 'Women novelists in fiction', 'Wom..."


Length of books 228643
Length of extended books 228643
