In [3]:
import requests, zipfile, io, os
import pandas as pd

In [3]:
# Location of the zipped BX CSV dump; the download cell below fetches it when the data is missing.
zip_url = 'http://www2.informatik.uni-freiburg.de/~cziegler/BX/BX-CSV-Dump.zip'

In [1]:
# Create the data directory with the standard library rather than a shell
# escape, so the cell also works where `mkdir -p` is unavailable.
# exist_ok=True keeps the cell safe to re-run.
os.makedirs('data', exist_ok=True)

In [4]:
# Download and unpack the archive only when the books CSV is not already on disk.
if not os.path.exists('data/BX-Books.csv'):
  # A timeout keeps the cell from hanging forever on a stalled connection.
  r = requests.get(zip_url, timeout=60)
  # Fail with a clear HTTP error instead of handing an error page to ZipFile.
  r.raise_for_status()
  # The context manager closes the in-memory archive once extraction is done.
  with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    z.extractall("data/")

In [5]:
# Parse the semicolon-separated, latin-1 encoded books file; backslash is the escape character.
books_df = pd.read_csv(
    'data/BX-Books.csv',
    encoding='latin-1',
    sep=';',
    escapechar="\\",
)
# Show the first rows as a quick check of the parse.
books_df.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [6]:
# Report the frame's dimensions: row count first, then column count.
print(f'The number of rows in the dataset is {len(books_df)}')
print(f'The number of columns in the dataset is {len(books_df.columns)}')

The number of rows in the dataset is 271379
The number of columns in the dataset is 8


In [7]:
# Count distinct ISBN strings (exact match, so case variants count separately).
print(f'The number of Books in the dataset is {books_df["ISBN"].nunique()}')

The number of Books in the dataset is 271379


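Every ISBN is unique as an exact string, so there are no exact-duplicate book records at this point (duplicates that differ only in letter case are examined below).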

In [8]:
# Number of missing values in each column.
books_df.isnull().sum()

ISBN                   0
Book-Title             0
Book-Author            1
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            0
dtype: int64

In [9]:
# Compute all per-column distinct counts in one call, then report them one line per column.
for col, n_unique in books_df.nunique().items():
  print(f'Unique values in column "{col}" is {n_unique}')

Unique values in column "ISBN" is 271379
Unique values in column "Book-Title" is 242154
Unique values in column "Book-Author" is 102027
Unique values in column "Year-Of-Publication" is 116
Unique values in column "Publisher" is 16806
Unique values in column "Image-URL-S" is 271063
Unique values in column "Image-URL-M" is 271063
Unique values in column "Image-URL-L" is 271063


In [10]:
# Rows whose small-image URL already appeared on an earlier row, showing only the ISBN and that URL.
books_df.loc[books_df['Image-URL-S'].duplicated(), ['ISBN', 'Image-URL-S']]

Unnamed: 0,ISBN,Image-URL-S
6636,002542730x,http://images.amazon.com/images/P/002542730X.0...
11926,014062080x,http://images.amazon.com/images/P/014062080X.0...
16299,097089726X,http://images.amazon.com/images/P/097089726X.0...
19267,039592720x,http://images.amazon.com/images/P/039592720X.0...
22340,042512164X,http://images.amazon.com/images/P/042512164X.0...
...,...,...
267039,088404632x,http://images.amazon.com/images/P/088404632X.0...
269316,055215038X,http://images.amazon.com/images/P/055215038X.0...
269571,037376099x,http://images.amazon.com/images/P/037376099X.0...
270304,189481505X,http://images.amazon.com/images/P/189481505X.0...


In [11]:
# Take the small-image URL of one flagged ISBN, then list every row that uses the same URL.
shared_url = books_df.loc[books_df['ISBN'] == '002542730x', 'Image-URL-S'].iloc[0]
books_df[books_df['Image-URL-S'] == shared_url]

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
3739,002542730X,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,http://images.amazon.com/images/P/002542730X.0...,http://images.amazon.com/images/P/002542730X.0...
6636,002542730x,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,http://images.amazon.com/images/P/002542730X.0...,http://images.amazon.com/images/P/002542730X.0...


Some ISBNs appear twice, differing only in letter case (e.g. `002542730X` vs `002542730x`).

In [12]:
# Normalise ISBNs to upper case so case-only variants compare equal.
books_df['ISBN'] = books_df.ISBN.str.upper()

In [13]:
# Distinct ISBN count again, now that the column has been upper-cased.
print(f'The number of Books in the dataset is {books_df["ISBN"].nunique()}')

The number of Books in the dataset is 271065


In [14]:
# Distinct-value count for every column, recomputed after upper-casing the ISBN column.
books_df.nunique()

ISBN                   271065
Book-Title             242154
Book-Author            102027
Year-Of-Publication       116
Publisher               16806
Image-URL-S            271063
Image-URL-M            271063
Image-URL-L            271063
dtype: int64

In [15]:
# Keep the first row for each ISBN and discard later repeats.
books_df = books_df.drop_duplicates(subset=['ISBN'])

In [16]:
# Dimensions of the frame after duplicate ISBN rows were dropped.
print(f'The number of rows in the dataset is {len(books_df)}')
print(f'The number of columns in the dataset is {len(books_df.columns)}')

The number of rows in the dataset is 271065
The number of columns in the dataset is 8


In [17]:
# Distinct ISBN count after de-duplication.
print(f'The number of Books in the dataset is {books_df["ISBN"].nunique()}')

The number of Books in the dataset is 271065


In [18]:
# Distinct-value count for every column, recomputed after duplicate ISBN rows were dropped.
books_df.nunique()

ISBN                   271065
Book-Title             242153
Book-Author            102027
Year-Of-Publication       116
Publisher               16806
Image-URL-S            271063
Image-URL-M            271063
Image-URL-L            271063
dtype: int64

In [19]:
# Preview the first rows of the cleaned frame.
books_df.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [20]:
import bs4
from bs4 import BeautifulSoup
import requests

In [21]:
def get_html(ISBN, timeout=30):
    """Fetch the Goodreads search page for an ISBN and return it parsed.

    Parameters
    ----------
    ISBN : str
        Value sent as the ``q`` query parameter of the Goodreads search URL.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up. Without
        a timeout a stalled connection would block the notebook indefinitely.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with the built-in ``html.parser``. The HTTP
        status is not checked, so an error page is parsed like any other.

    Raises
    ------
    requests.exceptions.RequestException
        On connection failures or when the timeout expires.
    """
    URL = "https://www.goodreads.com/search"
    # Passing the ISBN through `params` lets requests URL-encode the value
    # instead of pasting it into the query string verbatim.
    r = requests.get(URL, params={'q': ISBN}, timeout=timeout)
    return BeautifulSoup(r.content, 'html.parser')

In [26]:
# Fetch the parsed search page for a single sample ISBN to explore the page structure.
ISBN = '0195153448'
html_text = get_html(ISBN)

In [None]:
# Select the element(s) tagged data-testid="genresList"; `select` returns a list, empty when nothing matches.
genresDiv = html_text.select('div[data-testid="genresList"]')
# Display the raw match to inspect its markup.
genresDiv

[<div class="BookPageMetadataSection__genres" data-testid="genresList"><ul aria-label="Top genres for this book" class="CollapsableList"><span tabindex="-1"><span class="BookPageMetadataSection__genrePlainText"><span class="Text Text__body3 Text__subdued">Genres</span></span><span class="BookPageMetadataSection__genreButton"><a class="Button Button--tag-inline Button--small" href="https://www.goodreads.com/genres/mythology"><span class="Button__labelItem">Mythology</span></a></span><span class="BookPageMetadataSection__genreButton"><a class="Button Button--tag-inline Button--small" href="https://www.goodreads.com/genres/history"><span class="Button__labelItem">History</span></a></span><span class="BookPageMetadataSection__genreButton"><a class="Button Button--tag-inline Button--small" href="https://www.goodreads.com/genres/classics"><span class="Button__labelItem">Classics</span></a></span><span class="BookPageMetadataSection__genreButton"><a class="Button Button--tag-inline Button--sm

In [None]:
print(ISBN, end=' - ')
# Walk every <span> nested under the first <span> of the genres list and keep
# only those whose first CSS class marks a genre button.
for genreSpan in genresDiv[0].find('ul').find('span').find_all('span'):
  # A span without a class attribute yields None here; guard it so the [0]
  # lookup cannot raise TypeError (same check getGenres performs).
  span_classes = genreSpan.get('class')
  if span_classes and span_classes[0] == 'BookPageMetadataSection__genreButton':
    print(genreSpan.find('span').getText(), end=', ')

0195153448 - Mythology, History, Classics, Nonfiction, Reference, Textbooks, Religion, 

In [22]:
def getGenres(ISBN, name='', max_retries=5):
  """Scrape the genre labels shown for a book and return them as one string.

  Parameters
  ----------
  ISBN : str
      ISBN passed to ``get_html`` to fetch the page.
  name : str, optional
      Book title; only used in the progress line printed for this book.
  max_retries : int, optional
      How many extra fetches to attempt when the first response contains no
      ``genresList`` section (the default matches the previous fixed limit).

  Returns
  -------
  str
      Genre names joined with ``', '``. An empty string is returned when the
      genres could not be obtained, so the result is always a string (the
      failure case used to return an empty list).
  """
  try:
    html_text = get_html(ISBN)
    genresDiv = html_text.select('div[data-testid="genresList"]')

    # The genres section is not always present in a response, so re-fetch a
    # bounded number of times until it shows up.
    for _ in range(max_retries):
      if genresDiv:
        break
      html_text = get_html(ISBN)
      genresDiv = html_text.select('div[data-testid="genresList"]')

    print(ISBN, name, end=' - ')

    # Every attempt came back without a genres section: report it explicitly
    # rather than relying on an IndexError from genresDiv[0].
    if not genresDiv:
      print('Not able to get')
      return ''

    genres = []
    for genreSpan in genresDiv[0].find('ul').find('span').find_all('span'):
      # Spans without a class attribute yield None and are skipped.
      span_classes = genreSpan.get('class')
      if span_classes and span_classes[0] == 'BookPageMetadataSection__genreButton':
        genre = genreSpan.find('span').getText()
        genres.append(genre)
        print(genre, end=', ')
    print()
    return ', '.join(genres)
  except Exception:
    # Best-effort scraping: network or parsing failures are reported and
    # skipped. Catching Exception (not a bare except) lets KeyboardInterrupt
    # through so a long scraping run can still be stopped.
    print('Not able to get')
    return ''

In [25]:
# Collect scraped genres for the first 1000 books as two parallel lists.
books_genre = {'ISBN': [], 'genres': []}
# Iterate ISBN and title side by side instead of re-filtering the whole frame
# for every ISBN to look up its title.
isbn_title_pairs = zip(books_df['ISBN'].values[:1000], books_df['Book-Title'].values[:1000])
for ind, (ISBN, name) in enumerate(isbn_title_pairs):
  # Running counter printed in front of the line getGenres emits.
  print(ind, end=' ')
  genres = getGenres(ISBN, name)
  books_genre['ISBN'].append(ISBN)
  books_genre['genres'].append(genres)

0 0195153448 Classical Mythology - Mythology, History, Classics, Nonfiction, Reference, Textbooks, Religion, 
1 0002005018 Clara Callan - Fiction, Canada, Historical Fiction, Literary Fiction, Canadian Literature, Historical, Novels, 
2 0060973129 Decision in Normandy - History, World War II, Nonfiction, Military History, War, Military Fiction, World History, 
3 0374157065 Flu: The Story of the Great Influenza Pandemic of 1918 and the Search for the Virus That Caused It - History, Nonfiction, Science, Medical, Medicine, Health, Disease, 
4 0393045218 The Mummies of Urumchi - History, Archaeology, Nonfiction, China, Anthropology, Asia, Ancient History, 
5 0399135782 The Kitchen God's Wife - Fiction, Historical Fiction, China, Asia, Contemporary, Asian Literature, Novels, 
6 0425176428 What If?: The World's Foremost Military Historians Imagine What Might Have Been - History, Nonfiction, Alternate History, Military Fiction, Military History, War, Essays, 
7 0671870432 PLEADING GUILTY - Fi

In [26]:
# Turn the two parallel lists into a frame with one row per scraped book.
books_genres_df = pd.DataFrame(books_genre)
# Preview the first rows.
books_genres_df.head()

Unnamed: 0,ISBN,genres
0,195153448,"Mythology, History, Classics, Nonfiction, Refe..."
1,2005018,"Fiction, Canada, Historical Fiction, Literary ..."
2,60973129,"History, World War II, Nonfiction, Military Hi..."
3,374157065,"History, Nonfiction, Science, Medical, Medicin..."
4,393045218,"History, Archaeology, Nonfiction, China, Anthr..."


 <font color='red'>Need to check whether any of the genres have to be eliminated after pulling all the genres.</font> 

In [28]:
# (rows, columns) of the scraped genres frame.
books_genres_df.shape

(1000, 2)

In [29]:
# Write the scraped ISBN/genres table to a CSV file in the working directory.
# NOTE(review): to_csv is called with its defaults, so the row index is presumably
# written as a leading unnamed column — confirm that this is wanted.
books_genres_df.to_csv('books_genres.csv')