In [1]:
import ast
import warnings

import numpy as np
import pandas as pd

warnings.filterwarnings('ignore')

In [2]:
# Load the raw book export, keep only rows without missing values, switch to
# the column names used by the following cells, and discard `previewLink`.
df = (
    pd.read_csv('../data/books_data.csv')
    .dropna()
    .rename(columns={
        'publishedDate': 'yearPublished',
        'ratingsCount': 'avgRating',
        'description': 'synopsis',
        'image': 'coverImg',
    })
    .drop(columns=['previewLink'])
)
df.head()

Unnamed: 0,Title,synopsis,authors,coverImg,publisher,yearPublished,infoLink,categories,avgRating
5,The Church of Christ: A Biblical Ecclesiology ...,In The Church of Christ: A Biblical Ecclesiolo...,['Everett Ferguson'],http://books.google.com/books/content?id=kVqRa...,Wm. B. Eerdmans Publishing,1996,http://books.google.nl/books?id=kVqRaiPlx88C&d...,['Religion'],5.0
31,Voices from the Farm: Adventures in Community ...,"Twenty-five years ago, at the height of the co...",['Rupert Fike'],http://books.google.com/books/content?id=IjTAB...,Book Publishing Company,2012-08-21,https://play.google.com/store/books/details?id...,['Biography & Autobiography'],1.0
33,The Battleship Bismarck,The Bismarck is perhaps the most famous – and ...,['Stefan Draminski'],http://books.google.com/books/content?id=nxttD...,Bloomsbury Publishing,2018-09-20,https://play.google.com/store/books/details?id...,['History'],1.0
42,Tess and the Highlander,"In 1543, on a windswept isle off of Scotland, ...",['May Mcgoldrick'],http://books.google.com/books/content?id=VmCRS...,Harper Collins,2002-11,http://books.google.nl/books?id=VmCRSPmY3WkC&d...,['Juvenile Fiction'],2.0
43,Beginner's Yoruba (Hippocrene Beginner's Series),"""Beginner's Yoruba"" is now available with two ...",['Kayode J. Fakinlede'],http://books.google.com/books/content?id=xLe4n...,Hippocrene Books,2005,http://books.google.nl/books?id=xLe4nWzeSw0C&d...,['Foreign Language Study'],1.0


Reset avgRating to 0 and convert each published date to its publication year

In [3]:
# Every book gets an integer rating of 0.
# Published dates appear as 'YYYY', 'YYYY-MM' or 'YYYY-MM-DD' in the preview
# above; only the text before the first '-' is kept and stored as an integer.
df = df.assign(
    avgRating=pd.Series(0, index=df.index).astype(int),
    yearPublished=df['yearPublished'].str.split('-').str[0].astype(int),
)


Clean up categories and authors. Remove any titles that have more than one category or more than one author

In [4]:
def _parse_list_cell(text):
    """Turn a stringified list such as "['Fiction']" into a list of strings.

    The `categories` and `authors` cells hold the text form of a Python list.
    Parsing that text with `ast.literal_eval` keeps apostrophes and commas that
    belong to a single entry (e.g. "O'Brian"), whereas deleting every quote
    character and splitting on ',' removes the apostrophe and treats the comma
    as a separator between entries. An empty list parses to zero entries.

    If the cell is not a valid list literal, fall back to removing brackets
    and quote characters and splitting on commas.
    """
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return [str(item) for item in parsed]

    stripped = text
    for char in '[]\'"':
        stripped = stripped.replace(char, '')
    return stripped.split(',')


# Keep only titles with exactly one category, then unwrap that single entry.
# `.copy()` makes the filtered frame independent so the column writes below do
# not target a slice of the previous frame.
df['categories'] = df['categories'].apply(_parse_list_cell)
df = df[df['categories'].apply(len) == 1].copy()
df['categories'] = df['categories'].apply(lambda items: items[0])

# Same treatment for authors: exactly one author per title.
df['authors'] = df['authors'].apply(_parse_list_cell)
df = df[df['authors'].apply(len) == 1].copy()
df['authors'] = df['authors'].apply(lambda items: items[0])

# Publisher names: trim surrounding whitespace and remove commas.
df['publisher'] = df['publisher'].str.strip().str.replace(',', '', regex=False)

# Each row now holds a single value, so use singular column names.
df = df.rename(columns={'categories': 'category', 'authors': 'author'})


Filter to Fiction books

In [5]:
# Keep the books whose category is exactly "Fiction". The result is built as a
# new frame (`drop` and `assign` both return new objects) instead of mutating
# the filtered slice of `df` in place, which is the pattern that raises
# pandas' SettingWithCopyWarning.
fiction_df = (
    df[df['category'] == "Fiction"]
    .drop(columns=['category'])   # constant after the filter
    .assign(genre='')             # empty placeholder column
)
print(f'{fiction_df.shape[0]} fiction books')
fiction_df.head()

10354 fiction books


Unnamed: 0,Title,synopsis,author,coverImg,publisher,yearPublished,infoLink,avgRating,genre
83,Perry Mason in the Case of Too Many Murders,A Los Angeles businessman kills his dinner com...,Thomas Chastain,http://books.google.com/books/content?id=oOdmG...,Avon Books,1990,http://books.google.nl/books?id=oOdmGU8i_H8C&d...,0,
115,From Potter's Field,"The sixth book in the Kay Scarpetta series, fr...",Patricia Cornwell,http://books.google.com/books/content?id=prefg...,Hachette UK,2008,https://play.google.com/store/books/details?id...,0,
144,Edge of Danger,The action-packed new Sean Dillon novel. Maste...,Jack Higgins,http://books.google.com/books/content?id=OgwoC...,HarperCollins UK,2015,https://play.google.com/store/books/details?id...,0,
155,Dead Sexy,In The Still Of The Night The city is in a pan...,Amanda Ashley,http://books.google.com/books/content?id=AiRjA...,Zebra Books,2013,https://play.google.com/store/books/details?id...,0,
196,Days of Grass,"The free humans lived underground, secretive, ...",Tanith Lee,http://books.google.com/books/content?id=r2mgn...,Hachette UK,2013,https://play.google.com/store/books/details?id...,0,


Randomly sample 500 books


In [6]:
# Draw a fixed-size random subset of the fiction books; the fixed seed makes
# the draw repeatable for the same input frame.
SAMPLE_SIZE = 500
RANDOM_SEED = 42
fiction_small_df = fiction_df.sample(n=SAMPLE_SIZE, random_state=RANDOM_SEED)

Define 13 genres

In [7]:
# Candidate genres offered to the classifier.
# Order matters: the classification prompt sends anything that fits nowhere
# else to the *last* entry, so the catch-all label must stay at the end.
genre_labels = [
    # general fiction
    "Contemporary Fiction", "Literary Fiction",
    # plot- and setting-driven genres
    "Mystery/Thriller", "Historical Fiction",
    # speculative genres
    "Science Fiction", "Fantasy",
    "Romance", "Young Adult Fiction",
    "Dystopian Fiction", "Horror",
    "Magical Realism", "Adventure Fiction",
    # catch-all (keep last)
    "Miscellaneous Fiction",
]

Invite gpt-4o-mini to categorize our books into genres

In [8]:
from openai import OpenAI
from key import key


def _normalize_genre(reply, labels):
    """Clean a raw model reply and snap it onto a known label when possible.

    Surrounding whitespace, quotes and backticks are removed. If the cleaned
    text equals one of `labels` ignoring case, that label is returned with its
    original spelling; otherwise the cleaned text is returned unchanged. A
    missing reply (None) becomes an empty string.
    """
    cleaned = (reply or '').strip().strip('`\'"').strip()
    canonical = {str(label).casefold(): label for label in labels}
    return canonical.get(cleaned.casefold(), cleaned)


def make_genres(df, column, labels):
    """Ask gpt-4o-mini to assign one genre to every row of `df`.

    One chat completion request is sent per row.

    Args:
        df: DataFrame to classify. Modified in place: its 'genre' column is
            created or overwritten with one entry per row.
        column: Name of the column that holds the text to classify.
        labels: Candidate genre names. They are interpolated into the prompt
            as given; the prompt tells the model to use the last one for
            anything that fits none of the others.

    Returns:
        None. The result is written to `df['genre']`.
    """
    n = df.shape[0]
    client = OpenAI(api_key=key)

    llm_topics = []
    # Count completed rows starting at 1 so the progress line reflects the
    # rows already evaluated and finishes at exactly 100%.
    for done, (_, row) in enumerate(df.iterrows(), start=1):
        article = row[column]

        prompt = f'''
    Categorize the following synopsis:

    ```{article}```

    Into one of the following genres:

    {labels}

    Any example that does not fit the others goes in the last genre. Return only the genre name.

    '''
        completion = client.chat.completions.create(
            model="gpt-4o-mini",
            temperature=0,
            messages = [
                {'role': 'user', 'content': prompt}
            ]
        )

        reply = completion.choices[0].message.content
        llm_topics.append(_normalize_genre(reply, labels))
        # Integer arithmetic avoids float rounding in the displayed percentage.
        print(f'\rEvaluated {done * 100 // n}%', end='')
    print('\ndone!')

    df['genre'] = llm_topics
    

In [9]:
# Classify each sampled book from its synopsis; the result is written to the
# 'genre' column of `fiction_small_df` in place.
make_genres(df=fiction_small_df, column='synopsis', labels=genre_labels)


Evaluated 100%
done!


In [10]:
def _sorted_lookup(values):
    """Return the sorted distinct entries of `values` and a name -> position dict."""
    names = sorted(values.unique().tolist())
    return names, {name: position for position, name in enumerate(names)}


# The id of an author / publisher / genre is its zero-based position in the
# sorted list of distinct names. A dict lookup gives the same ids as calling
# list.index() for every row, without rescanning the list each time.
authors, author_ids = _sorted_lookup(fiction_small_df['author'])
publishers, publisher_ids = _sorted_lookup(fiction_small_df['publisher'])
genres, genre_ids = _sorted_lookup(fiction_small_df['genre'])

# Replace the three name columns with their ids; the names themselves remain
# available in the `authors`, `publishers` and `genres` lists.
fiction_small_df = (
    fiction_small_df
    .assign(
        authorId=fiction_small_df['author'].map(author_ids),
        publisherId=fiction_small_df['publisher'].map(publisher_ids),
        genreId=fiction_small_df['genre'].map(genre_ids),
    )
    .drop(columns=['author', 'publisher', 'genre'])
)
fiction_small_df

Unnamed: 0,Title,synopsis,coverImg,yearPublished,infoLink,avgRating,authorId,publisherId,genreId
166091,The Hound of the Baskervilles (Signet Classics),Sir Charles Baskerville's sudden heart attack ...,http://books.google.com/books/content?id=JTrUz...,2001,http://books.google.com/books?id=JTrUzgEACAAJ&...,0,25,154,9
159242,The Merchant Prince,“Feist has a natural talent for keeping the re...,http://books.google.com/books/content?id=1A5fU...,2009,https://play.google.com/store/books/details?id...,0,339,87,3
25596,Love is a Four Letter Word,Sex. Yes. She remembered that. Wasn't that the...,http://books.google.com/books/content?id=1p4RE...,2020,https://play.google.com/store/books/details?id...,0,67,28,1
24906,The Return from Troy,PART FOUR OF THE TROY QUARTET Bringing ancient...,http://books.google.com/books/content?id=R9GbD...,2019,https://play.google.com/store/books/details?id...,0,256,91,4
86771,Blind Date Disasters / Eat Your Heart Out) (Ha...,Blind Date As an interior designer and an iden...,http://books.google.com/books/content?id=voEDK...,2010,https://play.google.com/store/books/details?id...,0,193,84,10
...,...,...,...,...,...,...,...,...,...
6499,The Heart is a Lonely Hunter,The Heart is a Lonely Hunter was Carson McCull...,http://books.google.com/books/content?id=eCPaw...,2012,https://play.google.com/store/books/details?id...,0,50,136,6
171946,Whose body?: A Lord Peter Wimsey novel,"There's a corpse in the bathtub, wearing nothi...",http://books.google.com/books/content?id=PkvVB...,2015,https://play.google.com/store/books/details?id...,0,100,157,9
54333,Staring at the Light: A Sarah Fortune Mystery ...,Someone has stolen the only person John Smith ...,http://books.google.com/books/content?id=57hXA...,2014,https://play.google.com/store/books/details?id...,0,129,87,9
55365,The Deep Green Sea: A Novel.,Traces the romance between a Vietnamese woman ...,http://books.google.com/books/content?id=sRpb5...,1998,http://books.google.com/books?id=sRpb55PrO2AC&...,0,352,77,4


Write out CSV files

In [11]:
# Single-column lookup tables. No explicit id column is written: the id used
# in the books table is the zero-based position of the name in each list.
authors_df = pd.Series(authors, name='authorName').to_frame()
publishers_df = pd.Series(publishers, name='publisherName').to_frame()
genres_df = pd.Series(genres, name='genre').to_frame()

# Write the books table first, then the three lookup tables, all without the
# pandas index.
for frame, target in (
    (fiction_small_df, '../data/books.csv'),
    (authors_df, '../data/authors.csv'),
    (publishers_df, '../data/publishers.csv'),
    (genres_df, '../data/genres.csv'),
):
    frame.to_csv(target, index=False)