<a href="https://colab.research.google.com/github/jan-de-trop/Goodreads-Scraping-and-EDA/blob/main/Goodreads_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Goodreads Best Books Ever Analysis

### Importing Libraries

In [21]:
#Import libraries
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import time, requests
import seaborn as sns
# Display configuration: widen pandas' text output to 500 characters and
# allow up to 100 columns to be shown before pandas elides them.
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)

### Scraping the Best Books Ever List

In [2]:
#Getting the url using requests module
# URLSTART is the site root; BESTBOOKS is the list path, to which the
# 1-based page number is appended.
URLSTART="https://www.goodreads.com"
BESTBOOKS="/list/show/1.Best_Books_Ever?page="
url = URLSTART+BESTBOOKS+'1'
# Always pass a timeout: without one, requests waits indefinitely and a
# stalled connection would hang the kernel.
page = requests.get(url, timeout=30)

In [3]:


# Download the HTML of the first two list pages.
# page_dict maps a 0-based index to the HTML text of list page index+1.
page_dict={}
for i in range(2):
    url=URLSTART+BESTBOOKS+str(i+1)
    # Timeout so a stalled connection cannot hang the kernel.
    response=requests.get(url, timeout=30)
    # Fail here on an HTTP error instead of storing an error page as if it
    # were list HTML (which would only surface later as empty link lists).
    response.raise_for_status()
    page_dict[i]=response.text



In [4]:
#Import BeautifulSoup
from bs4 import BeautifulSoup

In [5]:
# Predicate for find_all: keep <a> tags whose class list is exactly ['bookTitle'].
# It does not depend on the page, so it is defined once, outside the loop.
dfinder = lambda tag: tag.name == "a" and tag.get('class') == ['bookTitle']

# urldict maps the 0-based list-page index to the hrefs of the book links on that page.
urldict = {}
for i in range(2):
    soup = BeautifulSoup(page_dict[i], 'html.parser')
    table_demographics = soup.find_all(dfinder)
    refs = []
    for ref in table_demographics:
        refs.append(ref['href'])
    urldict[i] = refs


100


In [6]:
from IPython.display import HTML
# Render the matched book-title anchors as HTML for a quick visual check.
# table_demographics holds the result of the last loop iteration in the
# link-extraction cell, i.e. the second list page.
HTML(str(table_demographics))

### Scraping the webpage of each book (well, only a few, really)

In [7]:

#Scraping one of the files
# Smoke test: fetch a single book page (the first book on the second list
# page) before downloading a whole batch.
URLSTART="https://www.goodreads.com"
book_url=URLSTART+urldict[1][0]
# Timeout so a stalled connection cannot hang the kernel.
stuff=requests.get(book_url, timeout=30)

#Check the status of the page
print(stuff.status_code)

#All OK!

200


In [8]:
#Fetching the book pages
#In the interest of time, we fetch only the first 10 books of the first list page (urldict[0]).
#Running this for all 200 books takes 25 min!
# bookdict maps the 0-based position of a book on that page to the HTML text of its page.
bookdict={}
URLSTART="https://www.goodreads.com"
for i in range(10):
    book_url=URLSTART+urldict[0][i]
    # Timeout so a single stalled connection cannot hang the whole batch.
    response=requests.get(book_url, timeout=30)
    # Fail here on an HTTP error instead of storing an error page that the
    # parsing functions would later choke on.
    response.raise_for_status()
    stuff=response.text
    bookdict[i]=stuff



### Parsing each book page to extract information

Now that we have the HTML text for the books, we can extract information from these web pages. 
The following data is extracted:

- Published year
- Rating
- ISBN 
- Title of the book
- Author
- Genres this book fits in.
- Rating count, the number of people who have rated this book

#### Extracting Genres

In [9]:
#Extracting genre
def get_genre(d):
    """Return the genre link targets found in a book page.

    Parameters
    ----------
    d : str
        HTML text of one book page.

    Returns
    -------
    list of str
        The ``href`` of every ``<a class="actionLinkLite bookPageGenreLink">``
        tag, each with a trailing ``'|'`` appended. Repeated links are kept,
        and the list is empty when no tag matches.
    """
    soup = BeautifulSoup(d, 'html.parser')
    genre_links = soup.find_all("a", class_="actionLinkLite bookPageGenreLink")
    hrefs = []
    for link in genre_links:
        hrefs.append(link['href'] + '|')
    return hrefs
print(get_genre(bookdict[1]))

['/genres/fantasy|', '/genres/young-adult|', '/genres/fiction|', '/genres/fantasy|', '/genres/magic|', '/genres/childrens|', '/genres/adventure|', '/genres/audiobook|', '/genres/childrens|', '/genres/middle-grade|', '/genres/classics|', '/genres/science-fiction-fantasy|']


In [10]:
# Inspect the raw <a> tags for one book page: the same selector get_genre uses,
# but showing the whole tag (link text included) rather than only the href.
BeautifulSoup(bookdict[1], 'html.parser').find_all("a", class_="actionLinkLite bookPageGenreLink")

[<a class="actionLinkLite bookPageGenreLink" href="/genres/fantasy">Fantasy</a>,
 <a class="actionLinkLite bookPageGenreLink" href="/genres/young-adult">Young Adult</a>,
 <a class="actionLinkLite bookPageGenreLink" href="/genres/fiction">Fiction</a>,
 <a class="actionLinkLite bookPageGenreLink" href="/genres/fantasy">Fantasy</a>,
 <a class="actionLinkLite bookPageGenreLink" href="/genres/magic">Magic</a>,
 <a class="actionLinkLite bookPageGenreLink" href="/genres/childrens">Childrens</a>,
 <a class="actionLinkLite bookPageGenreLink" href="/genres/adventure">Adventure</a>,
 <a class="actionLinkLite bookPageGenreLink" href="/genres/audiobook">Audiobook</a>,
 <a class="actionLinkLite bookPageGenreLink" href="/genres/childrens">Childrens</a>,
 <a class="actionLinkLite bookPageGenreLink" href="/genres/middle-grade">Middle Grade</a>,
 <a class="actionLinkLite bookPageGenreLink" href="/genres/classics">Classics</a>,
 <a class="actionLinkLite bookPageGenreLink" href="/genres/science-fiction-fa

#### Extract Publishing Year

In [11]:
#Extracting published year
# Any run of four digits is treated as a candidate year.
yearre = r'\d{4}'
def get_year(d):
    """Return the publication year of a book page, or "NA" if none is found.

    The year is read from the second child ``<div>`` of the
    ``<div class="uitext darkGreyText">`` block. When the page also contains a
    ``<nobr class="greyText">`` element (presumably the "first published"
    note -- TODO confirm against a live page), the second four-digit match is
    returned, otherwise the first one.

    Parameters
    ----------
    d : str
        HTML text of one book page.

    Returns
    -------
    str
        A four-digit string, or ``"NA"`` when the expected markup is missing
        or the text contains no four-digit number.
    """
    soup = BeautifulSoup(d, 'html.parser')

    details = soup.find("div", attrs={"class": "uitext darkGreyText"})
    if details is None:
        # Page has no details block at all (e.g. an error page).
        return "NA"

    rows = details.findChildren("div")
    if len(rows) < 2:
        # The row that carries the publication text is absent.
        return "NA"

    yearmatch = re.findall(yearre, rows[1].text)
    has_original = bool(soup.find_all("nobr", attrs={"class": "greyText"}))

    # Prefer the second match when the extra <nobr> note is present, but only
    # if there really are two matches; otherwise fall back to the first one.
    if has_original and len(yearmatch) > 1:
        return yearmatch[1]
    if yearmatch:
        return yearmatch[0]
    return "NA"

#### Extracting the Rating, ISBN, Title of the book, Author and Rating Count

In [13]:
def get_author(d):
    """Return the author link targets found in a book page.

    Parameters
    ----------
    d : str
        HTML text of one book page.

    Returns
    -------
    list of str
        The ``href`` of every ``<a class="authorName">`` tag (author page
        links, not display names); empty when no tag matches.
    """
    author_links = BeautifulSoup(d, 'html.parser').find_all("a", class_="authorName")
    hrefs = []
    for link in author_links:
        hrefs.append(link['href'])
    return hrefs


In [14]:
def get_rating(d):
    """Return the rating text found in a book page.

    Parameters
    ----------
    d : str
        HTML text of one book page.

    Returns
    -------
    list of str
        The text of every ``<span itemprop="ratingValue">`` with newlines
        removed. Other whitespace is left untouched, so values may carry
        leading spaces. Empty when no span matches.
    """
    soup = BeautifulSoup(d, 'html.parser')
    ratings = []
    for span in soup.find_all("span", itemprop="ratingValue"):
        ratings.append(span.text.replace("\n",""))
    return ratings

In [15]:
def get_isbn(d):
    """Return the ISBN text found in a book page.

    Parameters
    ----------
    d : str
        HTML text of one book page.

    Returns
    -------
    list of str
        The text of every ``<span itemprop="isbn">`` with newlines removed;
        empty when the page has no such span.
    """
    soup = BeautifulSoup(d, 'html.parser')
    isbn_spans = soup.find_all("span", itemprop="isbn")
    return [span.text.replace("\n","") for span in isbn_spans]
get_isbn(bookdict[1])

['9780439358071']

In [16]:
def get_ratingCount(d):
    """Return the rating-count values found in a book page.

    Parameters
    ----------
    d : str
        HTML text of one book page.

    Returns
    -------
    list of str
        The ``content`` attribute of every ``<meta itemprop="ratingCount">``
        tag, as strings; empty when no tag matches.
    """
    soup = BeautifulSoup(d, 'html.parser')
    counts = []
    for meta in soup.find_all("meta", itemprop="ratingCount"):
        counts.append(meta['content'])
    return counts
get_ratingCount(bookdict[1])

['2612025']

In [17]:
def get_title(d):
    """Return the title of a book page, or "NA" if the heading is missing.

    Parameters
    ----------
    d : str
        HTML text of one book page.

    Returns
    -------
    str
        Text of the ``<h1 id="bookTitle">`` heading, stripped of surrounding
        whitespace and with any remaining newlines removed. ``"NA"`` (the same
        sentinel get_year uses) when the page has no such heading.
    """
    heading = BeautifulSoup(d, 'html.parser').find("h1", itemprop="name",id="bookTitle",class_="gr-h1 gr-h1--serif")
    if heading is None:
        # find() returns None when nothing matches; without this guard the
        # .get_text() call would raise AttributeError.
        return "NA"
    return heading.get_text().strip().replace("\n","")
get_title(bookdict[1])

'Harry Potter and the Order of the Phoenix'

In [18]:
# Build one record (dict) per downloaded book page; listofdicts later becomes the DataFrame rows.
listofdicts=[]
book={}
for i in range(10):
    page_html=bookdict[i]
    book={
        'title':get_title(page_html),
        'year':get_year(page_html),
        'genre':get_genre(page_html),
        'authors':get_author(page_html),
        'ratings':get_rating(page_html),
        'isbn':get_isbn(page_html),
        'ratingCount':get_ratingCount(page_html),
    }
    listofdicts.append(book)
print(listofdicts[0:3])

[{'title': 'The Hunger Games', 'year': '2008', 'genre': ['/genres/young-adult|', '/genres/fiction|', '/genres/science-fiction|', '/genres/dystopia|', '/genres/fantasy|', '/genres/science-fiction|', '/genres/romance|', '/genres/adventure|', '/genres/young-adult|', '/genres/teen|', '/genres/apocalyptic|', '/genres/post-apocalyptic|', '/genres/action|'], 'authors': ['https://www.goodreads.com/author/show/153394.Suzanne_Collins'], 'ratings': ['  4.32'], 'isbn': ['9780439023481'], 'ratingCount': ['6597683']}, {'title': 'Harry Potter and the Order of the Phoenix', 'year': '2003', 'genre': ['/genres/fantasy|', '/genres/young-adult|', '/genres/fiction|', '/genres/fantasy|', '/genres/magic|', '/genres/childrens|', '/genres/adventure|', '/genres/audiobook|', '/genres/childrens|', '/genres/middle-grade|', '/genres/classics|', '/genres/science-fiction-fantasy|'], 'authors': ['https://www.goodreads.com/author/show/1077326.J_K_Rowling', 'https://www.goodreads.com/author/show/2927.Mary_GrandPr_'], 'r

### Creating a dataframe

In [19]:
# One row per scraped book; the columns are the keys of the record dicts.
df = pd.DataFrame(listofdicts)
df.head()

Unnamed: 0,title,year,genre,authors,ratings,isbn,ratingCount
0,The Hunger Games,2008,"[/genres/young-adult|, /genres/fiction|, /genr...",[https://www.goodreads.com/author/show/153394....,[ 4.32],[9780439023481],[6597683]
1,Harry Potter and the Order of the Phoenix,2003,"[/genres/fantasy|, /genres/young-adult|, /genr...",[https://www.goodreads.com/author/show/1077326...,[ 4.50],[9780439358071],[2612025]
2,To Kill a Mockingbird,1960,"[/genres/classics|, /genres/fiction|, /genres/...",[https://www.goodreads.com/author/show/1825.Ha...,[ 4.28],[],[4679544]
3,Pride and Prejudice,1813,"[/genres/classics|, /genres/fiction|, /genres/...",[https://www.goodreads.com/author/show/1265.Ja...,[ 4.27],[],[3113658]
4,Twilight,2005,"[/genres/young-adult|, /genres/fantasy|, /genr...",[https://www.goodreads.com/author/show/941441....,[ 3.61],[9780316015844],[5138241]


In [20]:
# Persist the scraped table to disk: column names are written as the header row, the index is left out.
df.to_csv("Goodreads.csv", index=False, header=True)

### Loading and examining the data

In [23]:
#Read the data into a dataframe
# NOTE: the CSV round trip does not restore the list-valued columns (genre,
# authors, ratings, isbn, ratingCount); they come back as the string form of
# the list, as the quoted values in the preview show.
df = pd.read_csv("Goodreads.csv")
df.head()

Unnamed: 0,title,year,genre,authors,ratings,isbn,ratingCount
0,The Hunger Games,2008,"['/genres/young-adult|', '/genres/fiction|', '...",['https://www.goodreads.com/author/show/153394...,[' 4.32'],['9780439023481'],['6597683']
1,Harry Potter and the Order of the Phoenix,2003,"['/genres/fantasy|', '/genres/young-adult|', '...",['https://www.goodreads.com/author/show/107732...,[' 4.50'],['9780439358071'],['2612025']
2,To Kill a Mockingbird,1960,"['/genres/classics|', '/genres/fiction|', '/ge...",['https://www.goodreads.com/author/show/1825.H...,[' 4.28'],[],['4679544']
3,Pride and Prejudice,1813,"['/genres/classics|', '/genres/fiction|', '/ge...",['https://www.goodreads.com/author/show/1265.J...,[' 4.27'],[],['3113658']
4,Twilight,2005,"['/genres/young-adult|', '/genres/fantasy|', '...",['https://www.goodreads.com/author/show/941441...,[' 3.61'],['9780316015844'],['5138241']
