## Song Info Retrieval
#### Author: Kenneth Leung
- Using LLM agents to retrieve song information

In [1]:
import csv
import json
import os

import box
import pandas as pd
import yaml
from dotenv import load_dotenv
from langchain.agents import (AgentExecutor, AgentType, OpenAIFunctionsAgent,
                              Tool, initialize_agent)
from langchain.agents.format_scratchpad import \
    format_to_openai_function_messages
from langchain.agents.output_parsers import OpenAIFunctionsAgentOutputParser
from langchain.callbacks import get_openai_callback
from langchain.chat_models import ChatOpenAI
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
from langchain.prompts import (ChatPromptTemplate, MessagesPlaceholder,
                               PromptTemplate)
from langchain.tools.render import format_tool_to_openai_function
from langchain_community.tools.ddg_search.tool import DuckDuckGoSearchRun
from langchain_community.utilities.wikipedia import WikipediaAPIWrapper

In [None]:
# Step up one directory so the relative paths used below (config/, data/)
# resolve; presumably the notebook lives one level below the project root.
# NOTE: the move is relative, so re-running this cell goes up another level.
os.chdir(os.pardir)

# Parse the YAML settings and wrap them in a Box so that entries can be read
# as attributes (e.g. cfg.ENVDIR).
with open('config/config.yaml', encoding='utf8') as ymlfile:
    settings = yaml.safe_load(ymlfile)
cfg = box.Box(settings)

# Load environment variables from the dotenv file that the config points to.
load_dotenv(verbose=False, dotenv_path=cfg.ENVDIR)

In [None]:
# Read the input song list; later cells use its "song" and "artist" columns.
df = pd.read_csv(
    "data/input/chart2000-song-2010-decade-0-3-0070.csv",
)
# Preview the first rows (shown as the cell output).
df.head()

___

In [5]:
# Chat model behind the agent: temperature 0 plus a fixed seed forwarded
# through model_kwargs (presumably to keep answers as repeatable as possible).
llm = ChatOpenAI(
    model="gpt-3.5-turbo-1106",
    temperature=0,
    model_kwargs={"seed": 0},
)

In [6]:
# Wrap the Wikipedia API helper as an agent tool. The tool is registered under
# the name "wikipedia" and calls the wrapper's run method.
wikipedia = WikipediaAPIWrapper()
wikipedia_tool = Tool(
    name="wikipedia",
    description=(
        "Useful for when you need to look up the songwriters, genre, "
        "and producers for a song on wikipedia"
    ),
    func=wikipedia.run,
)

In [7]:
# Tools available to the agent.
tools = [wikipedia_tool]

# Convert each tool to its OpenAI function description and bind the resulting
# list to the chat model.
openai_functions = list(map(format_tool_to_openai_function, tools))
llm_with_tools = llm.bind(functions=openai_functions)

In [8]:
# System message for the agent.
#
# The example is a complete, valid JSON object (double-quoted keys and values,
# a comma after every entry but the last, enclosed in curly brackets) so that
# it agrees with the instruction above it and with the json.loads call that
# later parses the agent's answer.
#
# This string is passed to ChatPromptTemplate.from_messages and is therefore
# treated as a prompt template: literal curly brackets must be doubled
# ({{ and }}) so they are not mistaken for template variables. They render as
# single brackets in the message sent to the model.
system_prompt = """
You are a helpful assistant and an expert in all things related to music and songs.
The output must be in a JSON format enclosed in curly brackets, and does not contain any additional details or explanation.

Example:
{{
"genre": "Disco, Pop",
"label": "Sony Music",
"language": "Korean",
"producers": "Alex Boh, Betty, Germaine",
"songwriters": "John Johnson, Adam Smith"
}}
"""

In [9]:
# Chat prompt layout: the system instructions, then the user's request, then a
# placeholder that is filled with the agent's scratchpad messages.
prompt_messages = [
    ("system", system_prompt),
    ("user", "{input}"),
    MessagesPlaceholder(variable_name="agent_scratchpad"),
]
prompt = ChatPromptTemplate.from_messages(prompt_messages)

In [10]:
def _extract_input(inputs):
    """Return the user's request from the agent input mapping."""
    return inputs["input"]


def _build_scratchpad(inputs):
    """Convert the intermediate steps into OpenAI function messages."""
    return format_to_openai_function_messages(inputs["intermediate_steps"])


# Maps the agent input onto the two variables the prompt expects.
agent_inputs = {
    "input": _extract_input,
    "agent_scratchpad": _build_scratchpad,
}

# Pipeline: prepare prompt variables -> render prompt -> call the model with
# the bound tool functions -> parse the model's reply for the agent.
agent = agent_inputs | prompt | llm_with_tools | OpenAIFunctionsAgentOutputParser()

agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=False)

In [11]:
def generate_input_prompt(song, artist):
    """Build the user prompt asking for metadata about one song.

    Args:
        song: Title of the song, interpolated into the prompt.
        artist: Performer of the song, interpolated into the prompt.

    Returns:
        The prompt text: it names the song, lists the five questions
        (genre, record label, main language, producers, songwriters), and
        asks for 'Unknown' where an answer is not known and for a JSON-only
        reply.
    """
    # Each entry is one line of the prompt. The four-space indent and the
    # leading/trailing entries are part of the text and are kept as-is.
    prompt_lines = [
        "",
        "    This is the song to review:",
        f"    - Song title: {song}, performed by {artist}",
        "",
        "    Based on the above song information, accurately answer the following questions in order:",
        "    - What is the genre of the song?",
        "    - What is the name of the record label company?",
        "    - What is the main language of the song?",
        "    - Who are the producers of the song?",
        "    - Who are the songwriters of the song?",
        "",
        "    If you do not know the answer to any of these questions, return the answer as 'Unknown'. Do not make up any answers.",
        "",
        "    Output the above answer strictly in JSON format enclosed with curly brackets. Do not include anything like ```json in the output.",
        "    ",
    ]
    return "\n".join(prompt_lines)

In [None]:
output_file = 'data/output/songs_metadata.csv'

# Column order of the output CSV. Rows are appended through csv.DictWriter
# with these fieldnames, so each value always lands under its own header
# regardless of the order in which the row dict is built.
output_columns = [
    "artist",
    "song",
    "genre",
    "label",
    "language",
    "llm_cost",
    "llm_tokens",
    "producers",
    "songwriters",
]

# First run: create the output folder and a header-only CSV to append to.
if not os.path.exists(output_file):
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    pd.DataFrame(columns=output_columns).to_csv(output_file, index=False)

# Read earlier results once instead of re-reading the file for every song.
# "song" and "artist" are forced to strings so that numeric-looking titles
# compare equal to the values coming from the input data.
df_song_info = pd.read_csv(
    output_file, encoding="utf-8", dtype={"song": str, "artist": str}
)

# Songs are identified by (title, artist): two different artists can share a
# title, and keying on the title alone would silently skip the second one.
processed_songs = set(zip(df_song_info["song"], df_song_info["artist"]))

for _, row in df.iterrows():
    song, artist = row["song"], row["artist"]
    song_key = (str(song), str(artist))
    if song_key in processed_songs:
        continue

    print(f"***** Processing: {song} by {artist} *****")
    input_prompt = generate_input_prompt(song, artist)

    # Only the agent call runs under the callback, which records its cost and
    # token usage.
    with get_openai_callback() as cb:
        response = agent_executor.invoke({"input": input_prompt})
        cost = cb.total_cost
        tokens = cb.total_tokens

    output = response["output"]
    print(output)

    # The model is asked for JSON but may still return something else. Skip
    # that song (it stays unprocessed and is retried on the next run) instead
    # of aborting the whole batch.
    try:
        output_dict = json.loads(output)
    except json.JSONDecodeError as err:
        print(f"!!!!! Skipping {song} by {artist}: output is not valid JSON ({err}) !!!!!")
        continue
    if not isinstance(output_dict, dict):
        print(f"!!!!! Skipping {song} by {artist}: output is not a JSON object !!!!!")
        continue

    new_row = {
        "artist": artist,
        "song": song,
        "genre": output_dict.get("genre"),
        "label": output_dict.get("label"),
        "language": output_dict.get("language"),
        "llm_cost": cost,
        "llm_tokens": tokens,
        "producers": output_dict.get("producers"),
        "songwriters": output_dict.get("songwriters"),
    }

    # Append right away so an interrupted run keeps everything done so far.
    with open(output_file, "a", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=output_columns)
        writer.writerow(new_row)

    processed_songs.add(song_key)
