In [1]:
# import necessary libraries
import pandas as pd
import openai
import concurrent.futures

***For this task, I tried using a few websites/API such as Google Books API which successfully produced some good results but it has a limit of 1000 queries per day so it ran into lots of error (429 specifically) shortly. I also experimented using either oclc numbers or book titles. It turned out that each has its own pros and cons. Oclc numbers are numerical so they are less likely prone to spelling errors than book titles; however, the pattern of oclc numbers need to be fixed.***

***Eventually, I ran into a github repository that listed all sorts of free APIs including Gutendex - JSON web API for Project Gutenberg ebook metadata. I did some experiments and thankfully, the openai lending key and model were compatible. Initially, I did not limit the length for answer so I got a long answer for each question about genre such as: "The book titled "The Minor Mathers: A List of Their Works" likely falls within the genre of non-fiction, specifically in the realms of literary criticism or bibliography. It seems to provide a catalog or examination of the works of the Minor Mathers." This is a detailed answer but since we are interested in the specific genre, it would require a lot of manual work to extract the genre from that answer (though manual work might also mean more accurate.) Thus, I made some changes to the code and now it only extracted the key word for genre.***

***Note: For this task, I copied the entire column "title_display" and paste it into a new csv file named title.csv.***

In [2]:
openai.api_key = ''

# Below code produced more detailed answers 
# def get_genre_openai(title):
#     try:
#         prompt = f"Determine the genre of the book titled '{title}'."
#         response = openai.ChatCompletion.create(
#             model="gpt-4o-mini",
#             messages=[
#                 {"role": "system", "content": "You are a helpful assistant."},
#                 {"role": "user", "content": prompt}
#             ],
#             max_tokens=50
#         )
#         genre = response['choices'][0]['message']['content'].strip()
#         return genre
#     except Exception as e:
#         print(f"Error fetching genre from OpenAI for title '{title}': {e}")
#     return None

# def fetch_genres_concurrently(titles):
#     results = []

#     def fetch_genre(title):
#         genre = get_genre_openai(title)
#         results.append((title, genre))
#         print(f"Title '{title}', Genre: {genre}")

#     with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
#         executor.map(fetch_genre, titles)

#     return results

def get_genre_openai(title):
    try:
        prompt = f"Determine the genre of the book titled '{title}'. Provide a one-word genre."
        response = openai.ChatCompletion.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=50
        )
        genre = response['choices'][0]['message']['content'].strip()
        return genre
    except Exception as e:
        print(f"Error fetching genre from OpenAI for title '{title}': {e}")
    return None

def fetch_genres_concurrently(titles):
    results = []

    def fetch_genre(title):
        genre = get_genre_openai(title)
        results.append((title, genre))
        print(f"Title '{title}', Genre: {genre}")

    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        executor.map(fetch_genre, titles)

    return results

# Load the CSV file with titles -- the titles are extracted from the title columns in the loans_merged.dta with duplicates dropped
file_path = '/Users/haivanle/Documents/Van_coding_assignment/title.csv'
titles_data = pd.read_csv(file_path, header=None)

# Extract titles
titles = titles_data[0].tolist()

# Fetch genres concurrently
genres = fetch_genres_concurrently(titles)

# Create a DataFrame with the results
df_genres = pd.DataFrame(genres, columns=['title', 'genre'])

# Display the final DataFrame
print(df_genres)

# Save the DataFrame to a CSV file -- then import to save as dta in stata
df_genres.to_csv('/Users/haivanle/Documents/Van_coding_assignment/genres.csv', index=False)

Title 'The minor Mathers : a list of their works', Genre: Reference.Title 'The American Academy In Rome at the New York World's Fair and the Golden Gate Exposition 1939', Genre: Nonfiction

Title 'The centaur's booty', Genre: Fantasy
Title 'Four quartets', Genre: Poetry.
Title 'Guide to the Appalachian trail in New England', Genre: Nonfiction
Title 'Juan de Mairena, sentencias, donaires, apuntes y recuerdos de un profesor apócrifo', Genre: Philosophy.
Title 'Diana : the sonnets, and other poems', Genre: Poetry
Title 'Achilles in Scyros', Genre: Historical.
Title 'Science and revolution', Genre: Nonfiction
Title 'Chills and fever, poems', Genre: Poetry
Title 'Two gentlemen in bonds', Genre: Comedy
Title 'Pindare', Genre: Poetry.
Title 'Making printers' typefaces', Genre: Nonfiction
Title 'Specimen of types in general use at the Condé Nast Press', Genre: Typography
Title 'Vor dem Leben : Erzählungen', Genre: Fiction
Title 'Le coup de grâce : roman', Genre: Novel
Title 'The olive tree : a