In [1]:
# import libraries
import re
import numpy as np
import pandas as pd
import seaborn as sns
import requests
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from difflib import get_close_matches
from scipy.sparse import csr_matrix
from IPython.display import Image, display
from IPython.core.display import HTML
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
# Directory holding the raw goodbooks-10k CSV files. Kept in a single constant so the
# notebook can be pointed at another checkout by editing one line instead of three.
GOODBOOKS_DIR = "D:/hp/Documents/CDS513/Assignment/goodbooks-10k-master"

# Raw inputs from the goodbooks-10k dataset.
df_book_tags = pd.read_csv(f"{GOODBOOKS_DIR}/book_tags.csv")
df_tags = pd.read_csv(f"{GOODBOOKS_DIR}/tags.csv")
df_books = pd.read_csv(f"{GOODBOOKS_DIR}/books.csv")

# Read from the current working directory.
# NOTE(review): presumably produced by an earlier cleaning notebook - confirm it exists before running.
df_books_cleaned = pd.read_csv("cleaned_books.csv")

In [4]:
def remove_punctuation_numbers(text):
    """Return ``text`` lower-cased with punctuation, whitespace and digits removed.

    Parameters
    ----------
    text : str
        The raw string to normalise (e.g. a tag or genre name).

    Returns
    -------
    str
        Only the letters of ``text``, in their original order, lower-cased.

    Raises
    ------
    TypeError
        If ``text`` is not a string (``re.sub`` rejects non-string input).
    """
    # ``\W`` matches everything that is not a word character, but ``\w`` itself
    # includes the underscore, so ``_`` must be listed explicitly; otherwise
    # "self_help" would stay "self_help" instead of becoming "selfhelp".
    # ``\d`` drops the digits in the same pass.
    letters_only = re.sub(r'[\W\d_]+', '', text)
    return letters_only.lower()


In [7]:
# Attach the tag columns from df_tags to every book/tag row, matching on 'tag_id'.
# A left join keeps every row of df_book_tags even when no matching tag exists.
df_book_tags_merged = df_book_tags.merge(df_tags, on='tag_id', how='left')


In [8]:
# Normalise every value in the 'tag_name' column with `remove_punctuation_numbers`,
# writing the result back into the same column.
normalised_tag_names = df_book_tags_merged['tag_name'].map(remove_punctuation_numbers)
df_book_tags_merged['tag_name'] = normalised_tag_names

In [9]:
# Preview the first five rows of the merged table after tag normalisation.
df_book_tags_merged.head(n=5)

Unnamed: 0,goodreads_book_id,tag_id,count,tag_name
0,1,30574,167697,toread
1,1,11305,37174,fantasy
2,1,11557,34173,favorites
3,1,8717,12986,currentlyreading
4,1,33114,12716,youngadult


In [10]:
# Retrieve the common genre names listed on https://www.goodreads.com/genres?ref=nav_brws_genres
url = "https://www.goodreads.com/genres?ref=nav_brws_genres"
# Send a browser-like User-Agent so the server does not reject the request as automated traffic.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'}
# Always pass a timeout: without one, requests can wait indefinitely on an unresponsive
# server and the notebook would hang on this cell.
response = requests.get(url, headers=headers, timeout=30)
# Raise an exception for bad status codes
response.raise_for_status()

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, "html.parser")
# The CSS selector picks every genre anchor inside the left-hand column of the content box.
genre_links = soup.select("div.bigBoxContent > div.left a.gr-hyperlink")
# Extract the text from each link and strip any leading/trailing whitespace
genres = [link.text.strip() for link in genre_links]

# Fail loudly if nothing matched: an empty genre list would otherwise silently
# filter out every tag in the cells that follow.
if not genres:
    raise RuntimeError("No genres found at " + url + "; the page layout may have changed.")

print(genres)

['Art', 'Biography', 'Business', 'Chick Lit', "Children's", 'Christian', 'Classics', 'Comics', 'Contemporary', 'Cookbooks', 'Crime', 'Ebooks', 'Fantasy', 'Fiction', 'Gay and Lesbian', 'Graphic Novels', 'Historical Fiction', 'History', 'Horror', 'Humor and Comedy', 'Manga', 'Memoir', 'Music', 'Mystery', 'Nonfiction', 'Paranormal', 'Philosophy', 'Poetry', 'Psychology', 'Religion', 'Romance', 'Science', 'Science Fiction', 'Self Help', 'Suspense', 'Spirituality', 'Sports', 'Thriller', 'Travel', 'Young Adult']


In [11]:
# Build the lookup set of normalised genre names (duplicates collapse automatically),
# so they can be compared directly against the normalised tag names.
genre_lexicon = {remove_punctuation_numbers(raw_genre) for raw_genre in genres}
print(genre_lexicon)

{'romance', 'comics', 'humorandcomedy', 'sports', 'business', 'mystery', 'religion', 'historicalfiction', 'horror', 'history', 'crime', 'nonfiction', 'chicklit', 'selfhelp', 'manga', 'fiction', 'philosophy', 'art', 'cookbooks', 'science', 'travel', 'poetry', 'paranormal', 'youngadult', 'christian', 'childrens', 'fantasy', 'music', 'psychology', 'biography', 'ebooks', 'contemporary', 'graphicnovels', 'suspense', 'sciencefiction', 'spirituality', 'thriller', 'memoir', 'classics', 'gayandlesbian'}


In [12]:
# Keep only the rows whose normalised tag name is one of the retrieved genres.
is_genre_tag = df_book_tags_merged['tag_name'].isin(genre_lexicon)
df_book_tags_cleaned = df_book_tags_merged.loc[is_genre_tag]


In [18]:
# Swap the Goodreads identifier for the dataset's own 'book_id': look each
# 'goodreads_book_id' up in df_books, then discard the Goodreads column.
# Note: this rebinds `df_book_tags`, which until now held the raw book_tags.csv frame.
book_id_lookup = df_books[['book_id', 'goodreads_book_id']]
df_book_tags = (
    df_book_tags_cleaned
    .merge(book_id_lookup, how='left', on='goodreads_book_id')
    .drop(columns=['goodreads_book_id'])
)
df_book_tags.head()

Unnamed: 0,tag_id,count,tag_name,book_id
0,11305,37174,fantasy,27
1,33114,12716,youngadult,27
2,11743,9954,fiction,27
3,6953,2408,childrens,27
4,6888,1095,childrens,27


In [19]:
# Restrict the tag rows to books whose 'book_id' also appears in df_books_cleaned.
in_cleaned_books = df_book_tags['book_id'].isin(df_books_cleaned['book_id'])
df_book_tags = df_book_tags.loc[in_cleaned_books]

In [25]:
# Collapse each book's tag names into one space-separated string, indexed by 'book_id'.
# NOTE(review): a tag name can occur more than once per book (different tag_ids can
# normalise to the same name) and every occurrence is kept - confirm this is intended.
combined_tags = df_book_tags.groupby('book_id')['tag_name'].agg(' '.join)

In [27]:
# Turn the book_id-indexed Series into a two-column frame: the index becomes the
# 'book_id' column and the joined tag strings are stored as 'combined_tags'.
df_combined_tags = combined_tags.reset_index(name='combined_tags')
df_combined_tags.head()

Unnamed: 0,book_id,combined_tags
0,1,youngadult fiction fantasy sciencefiction roma...
1,2,fantasy youngadult fiction childrens classics ...
2,4,classics historicalfiction youngadult fiction ...
3,5,classics fiction historicalfiction romance you...
4,6,youngadult fiction romance contemporary ebooks...


In [29]:
# Persist the per-book tag strings to the working directory, omitting the row index.
df_combined_tags.to_csv(path_or_buf="cleaned_book_tags.csv", index=False)