In [1]:
import ast
import warnings

import numpy as np
import pandas as pd
# NOTE(review): with no category given, this ignores every warning raised after this
# line, including pandas warnings about assigning to a slice of a DataFrame.
warnings.filterwarnings('ignore')

In [2]:
# Load the raw book table, discard rows with any missing value, rename the
# columns to the names used in the rest of the notebook, and drop previewLink.
column_names = {
    'publishedDate': 'yearPublished',
    'ratingsCount': 'avgRating',
    'description': 'synopsis',
    'image': 'coverImg',
}
df = (
    pd.read_csv('../data/books_data.csv')
    .dropna()
    .rename(columns=column_names)
    .drop(columns=['previewLink'])
)
df.head()

Unnamed: 0,Title,synopsis,authors,coverImg,publisher,yearPublished,infoLink,categories,avgRating
5,The Church of Christ: A Biblical Ecclesiology ...,In The Church of Christ: A Biblical Ecclesiolo...,['Everett Ferguson'],http://books.google.com/books/content?id=kVqRa...,Wm. B. Eerdmans Publishing,1996,http://books.google.nl/books?id=kVqRaiPlx88C&d...,['Religion'],5.0
31,Voices from the Farm: Adventures in Community ...,"Twenty-five years ago, at the height of the co...",['Rupert Fike'],http://books.google.com/books/content?id=IjTAB...,Book Publishing Company,2012-08-21,https://play.google.com/store/books/details?id...,['Biography & Autobiography'],1.0
33,The Battleship Bismarck,The Bismarck is perhaps the most famous – and ...,['Stefan Draminski'],http://books.google.com/books/content?id=nxttD...,Bloomsbury Publishing,2018-09-20,https://play.google.com/store/books/details?id...,['History'],1.0
42,Tess and the Highlander,"In 1543, on a windswept isle off of Scotland, ...",['May Mcgoldrick'],http://books.google.com/books/content?id=VmCRS...,Harper Collins,2002-11,http://books.google.nl/books?id=VmCRSPmY3WkC&d...,['Juvenile Fiction'],2.0
43,Beginner's Yoruba (Hippocrene Beginner's Series),"""Beginner's Yoruba"" is now available with two ...",['Kayode J. Fakinlede'],http://books.google.com/books/content?id=xLe4n...,Hippocrene Books,2005,http://books.google.nl/books?id=xLe4nWzeSw0C&d...,['Foreign Language Study'],1.0


Clear avgRating to 0, and convert published dates to year published

In [3]:
# Ratings start from zero for every book.
df['avgRating'] = 0
df['avgRating'] = df['avgRating'].astype(int)

# Dates look like '1996', '2002-11' or '2012-08-21'; keep the part before the
# first '-' and store it as an integer year.
year_text = df['yearPublished'].str.split('-').str[0]
df['yearPublished'] = year_text.astype(int)


Clean up categories and authors. Remove any titles with multiple categories or multiple authors

In [4]:
def _single_entry(raw):
    """Return the only entry of a stringified list such as "['History']".

    The text is parsed as a Python literal so that apostrophes and commas inside
    an entry survive (e.g. "O'Connor" keeps its apostrophe, and one entry that
    contains a comma is not mistaken for two). Text that is not a valid literal
    falls back to stripping brackets/quotes and splitting on commas.

    Returns None when there is not exactly one entry, so the row can be dropped.
    """
    try:
        entries = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        entries = (
            raw.replace('[', '').replace(']', '')
            .replace("'", '').replace('"', '')
            .split(',')
        )
    if not isinstance(entries, (list, tuple)):
        entries = [entries]
    if len(entries) != 1:
        return None
    return str(entries[0])


# Parse both list-like columns, then keep only the rows that have exactly one
# category and exactly one author.
df['categories'] = df['categories'].apply(_single_entry)
df['authors'] = df['authors'].apply(_single_entry)
# .copy() gives an independent frame, so the assignment below does not act on a slice.
df = df.dropna(subset=['categories', 'authors']).copy()

# Trim surrounding whitespace and remove commas from publisher names.
df['publisher'] = df['publisher'].str.strip().str.replace(',', '', regex=False)

df = df.rename(columns={'categories': 'category', 'authors': 'author'})


Filter to Fiction books

In [5]:
# Keep only the rows whose category is exactly "Fiction", drop the now-constant
# category column, and add an empty genre column to be filled in later.
# Building the frame as one chain returns a new DataFrame, so nothing is
# mutated in place on a slice of `df` (which pandas warns about).
fiction_df = (
    df.loc[df['category'] == "Fiction"]
    .drop(columns=['category'])
    .assign(genre='')
)
print(f'{fiction_df.shape[0]} fiction books')
fiction_df.head()

10354 fiction books


Unnamed: 0,Title,synopsis,author,coverImg,publisher,yearPublished,infoLink,avgRating,genre
83,Perry Mason in the Case of Too Many Murders,A Los Angeles businessman kills his dinner com...,Thomas Chastain,http://books.google.com/books/content?id=oOdmG...,Avon Books,1990,http://books.google.nl/books?id=oOdmGU8i_H8C&d...,0,
115,From Potter's Field,"The sixth book in the Kay Scarpetta series, fr...",Patricia Cornwell,http://books.google.com/books/content?id=prefg...,Hachette UK,2008,https://play.google.com/store/books/details?id...,0,
144,Edge of Danger,The action-packed new Sean Dillon novel. Maste...,Jack Higgins,http://books.google.com/books/content?id=OgwoC...,HarperCollins UK,2015,https://play.google.com/store/books/details?id...,0,
155,Dead Sexy,In The Still Of The Night The city is in a pan...,Amanda Ashley,http://books.google.com/books/content?id=AiRjA...,Zebra Books,2013,https://play.google.com/store/books/details?id...,0,
196,Days of Grass,"The free humans lived underground, secretive, ...",Tanith Lee,http://books.google.com/books/content?id=r2mgn...,Hachette UK,2013,https://play.google.com/store/books/details?id...,0,


Randomly sample 500 books


In [23]:
# Fixed sample size and seed, so the same rows are drawn on every run.
SAMPLE_SIZE = 500
SAMPLE_SEED = 5
fiction_small_df = fiction_df.sample(n=SAMPLE_SIZE, random_state=SAMPLE_SEED)

Define 13 fiction genres, plus a "Nonfiction" label to catch any books that are not fiction

In [25]:
# Labels offered to the model, in the order they appear in the prompt.
genre_labels = [
    # fiction genres
    'Contemporary Fiction',
    'Literary Fiction',
    'Mystery/Thriller',
    'Historical Fiction',
    'Science Fiction',
    'Fantasy',
    'Romance',
    'Young Adult Fiction',
    'Dystopian Fiction',
    'Horror',
    'Magical Realism',
    'Adventure Fiction',
    # catch-all for fiction that fits none of the above
    'Miscellaneous Fiction',
    # label for books that are not fiction at all
    'Nonfiction',
]

Invite gpt-4o-mini to categorize our books into genres

In [None]:
from openai import OpenAI
from key import key


def make_genres(df, columns, labels):
    """Ask gpt-4o-mini for a genre for every row of ``df``.

    For each row, the values of ``columns`` are joined with ' - ' and placed in a
    prompt that asks the model to pick one entry from ``labels``. One request is
    sent per row. The replies are stored, in row order, in ``df['genre']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to classify. Modified in place: the 'genre' column is (over)written.
    columns : list of str
        Column names whose values describe the book. The prompt text presents
        them as author, title and synopsis, in that order.
    labels : list of str
        Genre names; interpolated into the prompt as-is.

    Returns
    -------
    None
    """
    n = df.shape[0]
    client = OpenAI(api_key=key)

    genres = []
    # Count from 1 so the progress figure reaches exactly 100% on the last row.
    for count, (_, row) in enumerate(df.iterrows(), start=1):
        article = ' - '.join(str(row[column]) for column in columns)

        # The catch-all label is spelled exactly as in `labels`
        # ("Miscellaneous Fiction") so the reply matches the label list.
        prompt = f'''
    Categorize the following book, by <author> - <title> - <synopsis>:

    ```{article}```

    Into one of the following genres:

    {labels}

    Any example that does not fit the others goes into "Miscellaneous Fiction". Return only the genre name.

    '''
        completion = client.chat.completions.create(
            model="gpt-4o-mini",
            temperature=0,
            messages = [
                {'role': 'user', 'content': prompt}
            ]
        )

        reply = completion.choices[0].message.content
        # Strip stray whitespace/newlines so equality checks on the genre work;
        # an empty or missing reply is stored unchanged.
        genres.append(reply.strip() if reply else reply)
        print(f'\rEvaluated {int(count / n * 100)}%', end='')
    print('\ndone!')

    df['genre'] = genres
    

In [27]:
# Classify the sampled books; make_genres writes its answers into the 'genre' column.
make_genres(
    df=fiction_small_df,
    columns=['author', 'Title', 'synopsis'],
    labels=genre_labels,
)


Evaluated 100%
done!


In [28]:
# Show the first ten rows to spot-check the assigned genres.
fiction_small_df.iloc[:10]

Unnamed: 0,Title,synopsis,author,coverImg,publisher,yearPublished,infoLink,avgRating,genre
161419,I am the Clay,“[Chaim] Potok writes powerfully about the suf...,Chaim Potok,http://books.google.com/books/content?id=Tl6FU...,Fawcett,2010,https://play.google.com/store/books/details?id...,0,Historical Fiction
120175,Bergdorf Blondes : A Novel,"Meet moi, 'a champagne bubble of a girl about ...",Plum Sykes,http://books.google.com/books/content?id=DmRMD...,Bloomsbury Publishing,2018,https://play.google.com/store/books/details?id...,0,Contemporary Fiction
66430,O Little Town of Maggody: An Arly Hanks Mystery,"Country music, greed, and the unique madness o...",Joan Hess,http://books.google.com/books/content?id=AUAED...,Open Road Media,2016,https://play.google.com/store/books/details?id...,0,Mystery/Thriller
87894,Little Myth Marker,"Skeeve, a young magician, and Aahz, his demon ...",Robert Asprin,http://books.google.com/books/content?id=Ta_z_...,Ace Books,2006,http://books.google.com/books?id=Ta_z_X30_eoC&...,0,Fantasy
99265,Winter Winds (Seaside Seasons #4),"In Winter Winds, Gayle Roper concludes her del...",Gayle Roper,http://books.google.com/books/content?id=RQuZc...,Multnomah,2011,https://play.google.com/store/books/details?id...,0,Romance
34674,The Jury (Paul Madriani Novels),"The Attorney, which marked the return of Steve...",Steve Martini,http://books.google.com/books/content?id=8Cc_L...,Penguin,2002,https://play.google.com/store/books/details?id...,0,Mystery/Thriller
32453,Defector,In Moscow Rules Gabriel Allon went up against ...,Daniel Silva,http://books.google.com/books/content?id=FpJbi...,Penguin,2009,https://play.google.com/store/books/details?id...,0,Mystery/Thriller
100535,The Shadow Box,Preparing for an exhibit that includes a piece...,Luanne Rice,http://books.google.com/books/content?id=5PRhz...,Thomas & Mercer,2021,http://books.google.com/books?id=5PRhzQEACAAJ&...,0,Mystery/Thriller
137674,Adolescente: La Clave para ser Libre y Feliz (...,"""Nineteen Eighty-Four: A Novel"", often publish...",George Orwell,http://books.google.com/books/content?id=PEpoE...,epubli,2021,https://play.google.com/store/books/details?id...,0,Dystopian Fiction
20898,"Blow Negative ! (""The Best Novel of the Navy S...","James Webb’s classic, scorching novel of the V...",James Webb,http://books.google.com/books/content?id=uZSOD...,Canelo,2019,http://books.google.com/books?id=uZSODwAAQBAJ&...,0,Historical Fiction


A few non-fiction books may have snuck by

In [29]:
# Rows whose genre is exactly 'Nonfiction'.
labelled_nonfiction = fiction_small_df['genre'].eq('Nonfiction')
fiction_small_df.loc[labelled_nonfiction]

Unnamed: 0,Title,synopsis,author,coverImg,publisher,yearPublished,infoLink,avgRating,genre
120351,Sex a Baller,Mysterious Luva has sexed them all! Ball playe...,Mysterious Luva,http://books.google.com/books/content?id=MMJ1A...,Black Pearl Books,2004,http://books.google.nl/books?id=MMJ1AAAACAAJ&d...,0,Nonfiction
37135,The Portable Enlightenment Reader (The Viking ...,Presents writings from such figures as Immanue...,Various,http://books.google.com/books/content?id=f4oFA...,Penguin Classics,1995,http://books.google.com/books?id=f4oFAQAAIAAJ&...,0,Nonfiction
86584,My Secret Book (Hesperus Classics),Written by one of the greatest poets of all ti...,Francesco Petrarca,http://books.google.com/books/content?id=YkpdA...,Hesperus Press,2002,http://books.google.com/books?id=YkpdAAAAMAAJ&...,0,Nonfiction
187902,The pump house gang,A sprawling collection of essays about the sub...,Tom Wolfe,http://books.google.com/books/content?id=Mjh3E...,Picador,2022,http://books.google.com/books?id=Mjh3EAAAQBAJ&...,0,Nonfiction
195810,Shakespeare Never Did This,An account of Charles Bukowski's 1978 European...,Charles Bukowski,http://books.google.com/books/content?id=f-bqs...,Ecco,2002,http://books.google.com/books?id=f-bqsgEACAAJ&...,0,Nonfiction


In [30]:
# Remove the rows labelled 'Nonfiction'. The explicit .copy() makes the result an
# independent DataFrame, so adding or dropping columns on it later does not
# operate on a slice of the unfiltered frame.
fiction_small_df = fiction_small_df[fiction_small_df['genre'] != 'Nonfiction'].copy()
fiction_small_df.shape

(495, 9)

In [31]:
# Replace the author, publisher and genre names with integer ids. An id is the
# name's position in the sorted list of unique values; the three sorted lists
# stay available as `authors`, `publishers` and `genres`.
authors = sorted(fiction_small_df['author'].unique())
publishers = sorted(fiction_small_df['publisher'].unique())
genres = sorted(fiction_small_df['genre'].unique())

# Dict lookups give each row its id in constant time, instead of scanning the
# list with list.index() once per row.
author_ids = {name: position for position, name in enumerate(authors)}
publisher_ids = {name: position for position, name in enumerate(publishers)}
genre_ids = {name: position for position, name in enumerate(genres)}

# assign/drop return a new frame, so nothing is modified in place.
fiction_small_df = (
    fiction_small_df
    .assign(
        authorId=fiction_small_df['author'].map(author_ids),
        publisherId=fiction_small_df['publisher'].map(publisher_ids),
        genreId=fiction_small_df['genre'].map(genre_ids),
    )
    .drop(columns=['author', 'publisher', 'genre'])
)
fiction_small_df

Unnamed: 0,Title,synopsis,coverImg,yearPublished,infoLink,avgRating,authorId,publisherId,genreId
161419,I am the Clay,“[Chaim] Potok writes powerfully about the suf...,http://books.google.com/books/content?id=Tl6FU...,2010,https://play.google.com/store/books/details?id...,0,66,68,4
120175,Bergdorf Blondes : A Novel,"Meet moi, 'a champagne bubble of a girl about ...",http://books.google.com/books/content?id=DmRMD...,2018,https://play.google.com/store/books/details?id...,0,340,32,1
66430,O Little Town of Maggody: An Arly Hanks Mystery,"Country music, greed, and the unique madness o...",http://books.google.com/books/content?id=AUAED...,2016,https://play.google.com/store/books/details?id...,0,199,130,9
87894,Little Myth Marker,"Skeeve, a young magician, and Aahz, his demon ...",http://books.google.com/books/content?id=Ta_z_...,2006,http://books.google.com/books?id=Ta_z_X30_eoC&...,0,358,5,3
99265,Winter Winds (Seaside Seasons #4),"In Winter Winds, Gayle Roper concludes her del...",http://books.google.com/books/content?id=RQuZc...,2011,https://play.google.com/store/books/details?id...,0,148,117,10
...,...,...,...,...,...,...,...,...,...
12546,A Season of Grace,"In 1910 Minnesota, Nilda Carlson's dreams are ...",http://books.google.com/books/content?id=dhFYD...,2018,http://books.google.nl/books?id=dhFYDwAAQBAJ&d...,0,250,17,4
173549,The Horus Heresy Vol. III: Visions of Treachery,A stunning artefact book for fans of the Horus...,http://books.google.com/books/content?id=x6e0N...,2014,http://books.google.com/books?id=x6e0NAEACAAJ&...,0,8,71,11
206363,Sisters Found,"Follows triplets Hope, Faith, and Charity, who...",http://books.google.com/books/content?id=87wyA...,2013,http://books.google.com/books?id=87wyAgAAQBAJ&...,0,200,81,10
156006,The Story of Ruth,PEN/Hemingway Award Winner: An “enthralling” n...,http://books.google.com/books/content?id=Qp7iB...,2014,https://play.google.com/store/books/details?id...,0,183,80,6


Write out CSV files

In [32]:
# Write the book table and the three name lists to CSV.
# index=False leaves the DataFrame index out of every file.
fiction_small_df.to_csv('../data/books.csv', index=False)

authors_df = pd.DataFrame({'authorName': authors})
publishers_df = pd.DataFrame({'publisherName': publishers})
genres_df = pd.DataFrame({'genre': genres})

authors_df.to_csv('../data/authors.csv', index=False)
publishers_df.to_csv('../data/publishers.csv', index=False)
genres_df.to_csv('../data/genres.csv', index=False)