# Libraries

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

import langid

import warnings
# Ignore every warning category for the rest of the session.
# NOTE(review): this also hides warnings raised by pandas and the other
# libraries used below -- consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

-----

###### Importing a large books dataset with Goodreads info:
- https://www.kaggle.com/datasets/khushdassani/goodreads-300k-dataset?select=goodreads.csv

In [2]:
# Read the Goodreads export (the first CSV column becomes the index), order the
# books from most to least rated, and renumber the rows from 0.
data = (
    pd.read_csv("goodreads.csv", index_col=0)
    .sort_values(by="rating_count", ascending=False)
    .reset_index(drop=True)
)

data

Unnamed: 0,id,title,authorName,rating,description,rating_count,review_count,img_url,url
0,196458,Harry Potter en die Towenaar se Steen,J.K. Rowling,4.48,Harry Potter dink hy is n doodgewone seun H...,8381457,132374,https://i.gr-assets.com/images/S/compressed.ph...,https://www.goodreads.com/book/show/196458.Har...
1,3,Harry Potter and the Sorcerer's Stone,J.K. Rowling,4.48,Harry Potter s life is miserable His parents...,8379630,132348,https://i.gr-assets.com/images/S/compressed.ph...,https://www.goodreads.com/book/show/3.Harry_Po...
2,86940,هاري بوتر وحجر الفيلسوف,J.K. Rowling,4.48,هاري بوتر وحجر الفلسوف هو الكتاب الأول من س...,8379441,132345,https://i.gr-assets.com/images/S/compressed.ph...,https://www.goodreads.com/book/show/86940._
3,63637,Harry Potter agus an Órchloch,J.K. Rowling,4.48,The Irish translation of the first Harry Pot...,8379251,132338,https://i.gr-assets.com/images/S/compressed.ph...,https://www.goodreads.com/book/show/63637.Harr...
4,49811,הארי פוטר ואבן החכמים,J.K. Rowling,4.48,הארי פוטר מעולם לא היה כוכב של נבחרת קווידיץ ...,8379139,132337,https://i.gr-assets.com/images/S/compressed.ph...,https://www.goodreads.com/book/show/49811._
...,...,...,...,...,...,...,...,...,...
306239,30505,"Rogue Real Estate Investor Collection, 2004 Ed...",Bryan Rundell,0.00,empty,0,0,https://d15be2nos83ntc.cloudfront.net/images/n...,https://www.goodreads.com/book/show/30505.Rogu...
306240,30473,"James Harrison, Pioneering Genius",W.R Lang,0.00,empty,0,0,https://d15be2nos83ntc.cloudfront.net/images/n...,https://www.goodreads.com/book/show/30473.Jame...
306241,30451,Murky Waters: Environmental Effects Of Aquacul...,Rebecca Goldburg,0.00,empty,0,0,https://d15be2nos83ntc.cloudfront.net/images/n...,https://www.goodreads.com/book/show/30451.Murk...
306242,30450,Medlab,Stephen Goldberg,0.00,empty,0,0,https://d15be2nos83ntc.cloudfront.net/images/n...,https://www.goodreads.com/book/show/30450.Medlab


In [3]:
## Ignoring non latin data

# Keep the rows whose title contains at least one ASCII letter.
# `na=False` makes a missing title count as "no match"; without it the mask
# would contain NaN for such rows and boolean indexing would raise.
has_latin_letter = data['title'].str.contains(r'[a-zA-Z]+', na=False)
data = data[has_latin_letter].reset_index(drop=True)
data

Unnamed: 0,id,title,authorName,rating,description,rating_count,review_count,img_url,url
0,196458,Harry Potter en die Towenaar se Steen,J.K. Rowling,4.48,Harry Potter dink hy is n doodgewone seun H...,8381457,132374,https://i.gr-assets.com/images/S/compressed.ph...,https://www.goodreads.com/book/show/196458.Har...
1,3,Harry Potter and the Sorcerer's Stone,J.K. Rowling,4.48,Harry Potter s life is miserable His parents...,8379630,132348,https://i.gr-assets.com/images/S/compressed.ph...,https://www.goodreads.com/book/show/3.Harry_Po...
2,63637,Harry Potter agus an Órchloch,J.K. Rowling,4.48,The Irish translation of the first Harry Pot...,8379251,132338,https://i.gr-assets.com/images/S/compressed.ph...,https://www.goodreads.com/book/show/63637.Harr...
3,49773,Harry Potter e la Pietra Filosofale,J.K. Rowling,4.48,Harry Potter è un predestinato ha una cicatr...,8379139,132337,https://i.gr-assets.com/images/S/compressed.ph...,https://www.goodreads.com/book/show/49773.Harr...
4,47522,Harry Potter und der Stein der Weisen,J.K. Rowling,4.48,Eigentlich hatte Harry geglaubt er sei ein g...,8379100,132337,https://i.gr-assets.com/images/S/compressed.ph...,https://www.goodreads.com/book/show/47522.Harr...
...,...,...,...,...,...,...,...,...,...
305245,30505,"Rogue Real Estate Investor Collection, 2004 Ed...",Bryan Rundell,0.00,empty,0,0,https://d15be2nos83ntc.cloudfront.net/images/n...,https://www.goodreads.com/book/show/30505.Rogu...
305246,30473,"James Harrison, Pioneering Genius",W.R Lang,0.00,empty,0,0,https://d15be2nos83ntc.cloudfront.net/images/n...,https://www.goodreads.com/book/show/30473.Jame...
305247,30451,Murky Waters: Environmental Effects Of Aquacul...,Rebecca Goldburg,0.00,empty,0,0,https://d15be2nos83ntc.cloudfront.net/images/n...,https://www.goodreads.com/book/show/30451.Murk...
305248,30450,Medlab,Stephen Goldberg,0.00,empty,0,0,https://d15be2nos83ntc.cloudfront.net/images/n...,https://www.goodreads.com/book/show/30450.Medlab


In [4]:
## Keeping only the titles in English

def detect_language(text):
    """Return the language code that ``langid`` assigns to ``text``.

    ``langid.classify`` returns a pair; only its first element (the language
    code) is returned and the second element is discarded.
    """
    return langid.classify(text)[0]

# Tag every title with the language code returned by detect_language.
data['title_language'] = data['title'].apply(detect_language)

# Keep the rows tagged 'en'. The explicit copy makes en_data an independent
# frame, so later column assignments on en_data do not act on a selection
# taken from `data`.
en_data = data[data['title_language'] == 'en'].copy()
en_data

Unnamed: 0,id,title,authorName,rating,description,rating_count,review_count,img_url,url,title_language
1,3,Harry Potter and the Sorcerer's Stone,J.K. Rowling,4.48,Harry Potter s life is miserable His parents...,8379630,132348,https://i.gr-assets.com/images/S/compressed.ph...,https://www.goodreads.com/book/show/3.Harry_Po...,en
6,43504,Harry Potter and the Philosopher's Stone,J.K. Rowling,4.48,Harry Potter s life is miserable His parents...,8379055,132336,https://d15be2nos83ntc.cloudfront.net/images/n...,https://www.goodreads.com/book/show/43504.Harr...,en
7,49772,Harri Potter a Maen yr Athronydd,J.K. Rowling,4.48,Rescued from the outrageous neglect of his au...,8378971,132333,https://i.gr-assets.com/images/S/compressed.ph...,https://www.goodreads.com/book/show/49772.Harr...,en
12,2767052,The Hunger Games,Suzanne Collins,4.32,Could you survive on your own in the wild w...,7288063,183095,https://i.gr-assets.com/images/S/compressed.ph...,https://www.goodreads.com/book/show/2767052-th...,en
14,12024,Twilight,Stephenie Meyer,3.62,About three things I was absolutely positive ...,5667779,113710,https://i.gr-assets.com/images/S/compressed.ph...,https://www.goodreads.com/book/show/12024.Twil...,en
...,...,...,...,...,...,...,...,...,...,...
305240,30878,Formula Three.,Ron White,0.00,empty,0,0,https://d15be2nos83ntc.cloudfront.net/images/n...,https://www.goodreads.com/book/show/30878.Form...,en
305243,30719,"The Rise, Fall, and Future of the Internet Eco...",Dan Burstein,0.00,empty,0,0,https://d15be2nos83ntc.cloudfront.net/images/n...,https://www.goodreads.com/book/show/30719.The_...,en
305247,30451,Murky Waters: Environmental Effects Of Aquacul...,Rebecca Goldburg,0.00,empty,0,0,https://d15be2nos83ntc.cloudfront.net/images/n...,https://www.goodreads.com/book/show/30451.Murk...,en
305248,30450,Medlab,Stephen Goldberg,0.00,empty,0,0,https://d15be2nos83ntc.cloudfront.net/images/n...,https://www.goodreads.com/book/show/30450.Medlab,en


In [5]:
## looping a second time

# Run the language detection again on the already-filtered titles and keep the
# rows that are still tagged 'en'. `assign` builds a new frame instead of
# writing a column into en_data (itself a filtered selection of `data`), and
# `.copy()` makes en_data2 independent for the column work done afterwards.
# NOTE(review): the rows displayed for this pass match the ones displayed for
# the first pass, so this step may be redundant -- confirm before removing it.
rechecked = en_data.assign(title_language=en_data['title'].apply(detect_language))
en_data2 = rechecked[rechecked['title_language'] == 'en'].copy()
en_data2

Unnamed: 0,id,title,authorName,rating,description,rating_count,review_count,img_url,url,title_language
1,3,Harry Potter and the Sorcerer's Stone,J.K. Rowling,4.48,Harry Potter s life is miserable His parents...,8379630,132348,https://i.gr-assets.com/images/S/compressed.ph...,https://www.goodreads.com/book/show/3.Harry_Po...,en
6,43504,Harry Potter and the Philosopher's Stone,J.K. Rowling,4.48,Harry Potter s life is miserable His parents...,8379055,132336,https://d15be2nos83ntc.cloudfront.net/images/n...,https://www.goodreads.com/book/show/43504.Harr...,en
7,49772,Harri Potter a Maen yr Athronydd,J.K. Rowling,4.48,Rescued from the outrageous neglect of his au...,8378971,132333,https://i.gr-assets.com/images/S/compressed.ph...,https://www.goodreads.com/book/show/49772.Harr...,en
12,2767052,The Hunger Games,Suzanne Collins,4.32,Could you survive on your own in the wild w...,7288063,183095,https://i.gr-assets.com/images/S/compressed.ph...,https://www.goodreads.com/book/show/2767052-th...,en
14,12024,Twilight,Stephenie Meyer,3.62,About three things I was absolutely positive ...,5667779,113710,https://i.gr-assets.com/images/S/compressed.ph...,https://www.goodreads.com/book/show/12024.Twil...,en
...,...,...,...,...,...,...,...,...,...,...
305240,30878,Formula Three.,Ron White,0.00,empty,0,0,https://d15be2nos83ntc.cloudfront.net/images/n...,https://www.goodreads.com/book/show/30878.Form...,en
305243,30719,"The Rise, Fall, and Future of the Internet Eco...",Dan Burstein,0.00,empty,0,0,https://d15be2nos83ntc.cloudfront.net/images/n...,https://www.goodreads.com/book/show/30719.The_...,en
305247,30451,Murky Waters: Environmental Effects Of Aquacul...,Rebecca Goldburg,0.00,empty,0,0,https://d15be2nos83ntc.cloudfront.net/images/n...,https://www.goodreads.com/book/show/30451.Murk...,en
305248,30450,Medlab,Stephen Goldberg,0.00,empty,0,0,https://d15be2nos83ntc.cloudfront.net/images/n...,https://www.goodreads.com/book/show/30450.Medlab,en


In [6]:
### Removing duplicates

# Drop books that repeat an earlier (title, author) pair, compared
# case-insensitively; the first occurrence of each pair is kept.
# The lower-cased helper columns live only inside this chain, so no temporary
# column is ever written into en_data2 itself.
en_data2 = (
    en_data2
    .assign(
        lower_title=en_data2['title'].str.lower(),
        lower_authorName=en_data2['authorName'].str.lower(),
    )
    .drop_duplicates(subset=['lower_title', 'lower_authorName'])
    .drop(columns=['lower_title', 'lower_authorName'])
)
en_data2

Unnamed: 0,id,title,authorName,rating,description,rating_count,review_count,img_url,url,title_language
1,3,Harry Potter and the Sorcerer's Stone,J.K. Rowling,4.48,Harry Potter s life is miserable His parents...,8379630,132348,https://i.gr-assets.com/images/S/compressed.ph...,https://www.goodreads.com/book/show/3.Harry_Po...,en
6,43504,Harry Potter and the Philosopher's Stone,J.K. Rowling,4.48,Harry Potter s life is miserable His parents...,8379055,132336,https://d15be2nos83ntc.cloudfront.net/images/n...,https://www.goodreads.com/book/show/43504.Harr...,en
7,49772,Harri Potter a Maen yr Athronydd,J.K. Rowling,4.48,Rescued from the outrageous neglect of his au...,8378971,132333,https://i.gr-assets.com/images/S/compressed.ph...,https://www.goodreads.com/book/show/49772.Harr...,en
12,2767052,The Hunger Games,Suzanne Collins,4.32,Could you survive on your own in the wild w...,7288063,183095,https://i.gr-assets.com/images/S/compressed.ph...,https://www.goodreads.com/book/show/2767052-th...,en
14,12024,Twilight,Stephenie Meyer,3.62,About three things I was absolutely positive ...,5667779,113710,https://i.gr-assets.com/images/S/compressed.ph...,https://www.goodreads.com/book/show/12024.Twil...,en
...,...,...,...,...,...,...,...,...,...,...
305240,30878,Formula Three.,Ron White,0.00,empty,0,0,https://d15be2nos83ntc.cloudfront.net/images/n...,https://www.goodreads.com/book/show/30878.Form...,en
305243,30719,"The Rise, Fall, and Future of the Internet Eco...",Dan Burstein,0.00,empty,0,0,https://d15be2nos83ntc.cloudfront.net/images/n...,https://www.goodreads.com/book/show/30719.The_...,en
305247,30451,Murky Waters: Environmental Effects Of Aquacul...,Rebecca Goldburg,0.00,empty,0,0,https://d15be2nos83ntc.cloudfront.net/images/n...,https://www.goodreads.com/book/show/30451.Murk...,en
305248,30450,Medlab,Stephen Goldberg,0.00,empty,0,0,https://d15be2nos83ntc.cloudfront.net/images/n...,https://www.goodreads.com/book/show/30450.Medlab,en


---

###### Best books: keep only books above a minimum rating and a minimum number of ratings, to reduce the dataframe

In [7]:
# From here on `data` refers to the de-duplicated English-title frame.
data = en_data2

# Thresholds a book must reach to be kept.
min_rating = 3.5
min_votes = 1000

rated_high_enough = data["rating"].ge(min_rating)
voted_often_enough = data["rating_count"].ge(min_votes)
best_data = data.loc[rated_high_enough & voted_often_enough].reset_index(drop=True)
best_data

Unnamed: 0,id,title,authorName,rating,description,rating_count,review_count,img_url,url,title_language
0,3,Harry Potter and the Sorcerer's Stone,J.K. Rowling,4.48,Harry Potter s life is miserable His parents...,8379630,132348,https://i.gr-assets.com/images/S/compressed.ph...,https://www.goodreads.com/book/show/3.Harry_Po...,en
1,43504,Harry Potter and the Philosopher's Stone,J.K. Rowling,4.48,Harry Potter s life is miserable His parents...,8379055,132336,https://d15be2nos83ntc.cloudfront.net/images/n...,https://www.goodreads.com/book/show/43504.Harr...,en
2,49772,Harri Potter a Maen yr Athronydd,J.K. Rowling,4.48,Rescued from the outrageous neglect of his au...,8378971,132333,https://i.gr-assets.com/images/S/compressed.ph...,https://www.goodreads.com/book/show/49772.Harr...,en
3,2767052,The Hunger Games,Suzanne Collins,4.32,Could you survive on your own in the wild w...,7288063,183095,https://i.gr-assets.com/images/S/compressed.ph...,https://www.goodreads.com/book/show/2767052-th...,en
4,12024,Twilight,Stephenie Meyer,3.62,About three things I was absolutely positive ...,5667779,113710,https://i.gr-assets.com/images/S/compressed.ph...,https://www.goodreads.com/book/show/12024.Twil...,en
...,...,...,...,...,...,...,...,...,...,...
44458,3401778,The Book on Leadership,John F. MacArthur Jr.,4.14,In The Book on Leadership best selling aut...,1000,89,https://i.gr-assets.com/images/S/compressed.ph...,https://www.goodreads.com/book/show/3401778-th...,en
44459,153664,The Scavenger's Guide to Haute Cuisine,Steven Rinella,4.35,Describes one man s efforts to live on the la...,1000,99,https://i.gr-assets.com/images/S/compressed.ph...,https://www.goodreads.com/book/show/153664.The...,en
44460,421398,China Marine: An Infantryman's Life After Worl...,Eugene B. Sledge,4.25,See E B Sledge s story in the HBO miniserie...,1000,76,https://i.gr-assets.com/images/S/compressed.ph...,https://www.goodreads.com/book/show/421398.Chi...,en
44461,38757,The Price,Arthur Miller,3.82,In a building slated for imminent demolition ...,1000,91,https://d15be2nos83ntc.cloudfront.net/images/n...,https://www.goodreads.com/book/show/38757.The_...,en


In [8]:
# Save the filtered books; the row index is written as the first, unnamed column.
best_data.to_csv(path_or_buf="best_books.csv")

----
----
----

# Authors' birthdays

In [9]:
# Distinct author names, in order of first appearance in best_data.
unique_names = pd.unique(best_data['authorName'])
unique_names

array(['J.K. Rowling', 'Suzanne Collins', 'Stephenie Meyer', ...,
       'Edward Rice', 'Mary Tyler Moore', 'Steven Rinella'], dtype=object)

In [10]:
# Function to get the authors' birthdays from Wikipedia

def get_birthdate_from_wikipedia(person_name, timeout=10):
    """Look up a person's birth date in their English Wikipedia infobox.

    The article URL is built from ``person_name`` with spaces replaced by
    underscores. The date is read from the ``span.bday`` element inside the
    infobox row whose header cell reads "Born".

    Parameters
    ----------
    person_name : str
        Name used as the Wikipedia article title.
    timeout : float, optional
        Seconds to wait for the HTTP response, so that one stalled connection
        cannot hang a long batch of lookups.

    Returns
    -------
    str or float
        The text of the ``bday`` span (e.g. ``'1965-07-31'``), or ``np.nan``
        when the request fails, the page has no matching infobox row, or
        ``person_name`` is not a string.
    """
    from urllib.parse import quote

    try:
        # Percent-encode the title so characters such as '?' or '#' stay part
        # of the article path instead of starting a query string or fragment.
        article = quote(person_name.replace(' ', '_'))
        url = f"https://en.wikipedia.org/wiki/{article}"
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html.parser')

        infobox = soup.find('table', class_='infobox')
        # `string=` is the current name of the argument formerly called `text=`.
        birthdate_row = infobox.find('th', string='Born').parent
        return birthdate_row.find('span', class_='bday').text
    except Exception:
        # Best-effort lookup: any ordinary failure (network error, missing
        # infobox/row/span, non-string name) maps to NaN. Unlike a bare
        # `except:`, KeyboardInterrupt and SystemExit still propagate, so a
        # long batch of lookups can be interrupted.
        return np.nan

In [11]:
# One row per author name, with the birth date looked up on Wikipedia
# (one HTTP request per name; NaN where no date was found).
df_birthdays = pd.DataFrame({'Name': unique_names}).assign(
    Birthdate=lambda frame: frame['Name'].map(get_birthdate_from_wikipedia)
)
df_birthdays

Unnamed: 0,Name,Birthdate
0,J.K. Rowling,1965-07-31
1,Suzanne Collins,1962-08-10
2,Stephenie Meyer,1973-12-24
3,Harper Lee,1926-04-28
4,F. Scott Fitzgerald,1896-09-24
...,...,...
12363,Frances Farmer,1913-09-19
12364,Duane Elmer,
12365,Edward Rice,
12366,Mary Tyler Moore,1936-12-29


In [12]:
# Keep only the authors for whom a birth date was found, renumbered from 0.
df_birthdays_notnan = df_birthdays.dropna(subset=['Birthdate']).reset_index(drop=True)
df_birthdays_notnan

Unnamed: 0,Name,Birthdate
0,J.K. Rowling,1965-07-31
1,Suzanne Collins,1962-08-10
2,Stephenie Meyer,1973-12-24
3,Harper Lee,1926-04-28
4,F. Scott Fitzgerald,1896-09-24
...,...,...
5951,Hervé This,1955-06-05
5952,Charles Le Gai Eaton,1921-01-01
5953,Frances Farmer,1913-09-19
5954,Mary Tyler Moore,1936-12-29


In [13]:
# Save the authors with a known birth date; the row index is written as the
# first, unnamed column.
df_birthdays_notnan.to_csv(path_or_buf="df_birthdays_books.csv")

----
----
