In [2]:
import sys, json, re, collections
import pandas as pd
from pathlib import Path
from decouple import config
sys.path.append("../src/")
from llm_helpers import openai_client

pd.set_option('display.max_columns', 1000, 'display.width', 1000, 'display.max_rows',1000)

data_dir = Path(".").absolute().parent/"data"
ls = lambda p:print("\n".join(map(str,p.iterdir())))

ls(data_dir)

/home/idan/Documents/llm_workshop/data/sample_apps.parquet


In [3]:
df = pd.read_parquet(data_dir / "sample_apps.parquet").sample(9)
df

Unnamed: 0,bundle_id,title,description,store_url,category_names,ios
39813,com.playrix.township,Township,Township is a unique blend of city-building an...,https://play.google.com/store/apps/details?id=...,"GAME_CASUAL,GAME",False
35146,com.moonactive.coinmaster,Coin Master,Join your Facebook friends and millions of pla...,https://play.google.com/store/apps/details?id=...,"GAME_CASUAL,GAME",False
29751,com.king.candycrushsaga,Candy Crush Saga,Master the legendary match 3 puzzle game from ...,https://play.google.com/store/apps/details?id=...,"GAME_CASUAL,GAME",False
49136,com.tripledot.woodoku,Woodoku - Block Puzzle Games,Woodoku: a wood block puzzle game meets a sudo...,https://play.google.com/store/apps/details?id=...,"GAME_PUZZLE,GAME",False
56472,in.playsimple.wordtrip,Word Trip,WINNER OF THE PRESTIGIOUS ACADEMICS' CHOICE MI...,https://play.google.com/store/apps/details?id=...,"GAME_WORD,GAME",False
39423,com.pinterest,Pinterest,Pinterest is the place to explore inspiration....,https://play.google.com/store/apps/details?id=...,"LIFESTYLE,APPLICATION",False
56776,io.randomco.travel,Travel Town - Merge Adventure,"Explore Travel Town, where you can combine eve...",https://play.google.com/store/apps/details?id=...,"GAME_PUZZLE,GAME",False
38654,com.pandora.android,Pandora - Music & Podcasts,Pandora gives you a personalized listening exp...,https://play.google.com/store/apps/details?id=...,"MUSIC_AND_AUDIO,APPLICATION",False
32145,com.macys.android,Macy's,The latest version of the Macy’s app is better...,https://play.google.com/store/apps/details?id=...,"SHOPPING,APPLICATION",False


In [4]:
categories = df["category_names"].str.lower().str.split(',').explode().value_counts()
categories

game               6
game_casual        3
application        3
game_puzzle        2
game_word          1
lifestyle          1
music_and_audio    1
shopping           1
Name: category_names, dtype: int64

## Naive approach, just Ask nicely.

In [5]:
def openai_ask(prompts):
    response = openai_client.completions.create(
        model="text-davinci-003",
        prompt=prompts,
    )
    ret = [choice.text.strip().lower() for choice in response.choices]
    return ret


In [6]:
prompt = "please choose the most likely category that apply to 'Crossword Jam' from the following list:\n"
prompt+="\n".join(categories.index)
openai_ask(prompt)

['game_puzzle']

In [7]:
openai_ask(prompt)

['game_puzzle']

In [8]:
openai_ask(prompt)

['game_puzzle']

Seem to work pretty, well - let's try multi label?

In [9]:
prompt = "please choose all the categories that apply to 'Crossword Jam' from the following list:\n"
prompt+="\n".join(categories.index)
[sorted(map(str.strip,l.strip().replace(",", "\n").split("\n"))) for l in openai_ask([prompt]*10)]

[['game', 'game_', 'game_casual', 'game_puzzle'],
 ['game', 'game_', 'game_casual', 'game_puzzle'],
 ['game', 'game_', 'game_casual', 'game_puzzle'],
 ['game_casual', 'game_puzzle', 'game_word'],
 ['game', 'game_', 'game_casual', 'game_puzzle'],
 ['game', 'game_', 'game_casual', 'game_puzzle'],
 ['game', 'game_', 'game_casual', 'game_puzzle'],
 ['game', 'game_', 'game_casual', 'game_puzzle'],
 ['game', 'game_', 'game_casual', 'game_puzzle'],
 ['game', 'game_', 'game_casual', 'game_puzzle']]

In [10]:
"game casual" in categories

False

## Using function calls
### Choose most likely class

In [11]:
def classify_most_likely(prompt):
    messages = [{"role": "user", "content": prompt}]
    tools = [
        {
            "type": "function",
            "function": {
                "name": "classify_app",
                "description": "Classify to an enum type",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "class": {"type": "string", "enum": list(categories.index)},
                    },
                    "required": ["class"],
                },
            },
        }
    ]
    response = openai_client.chat.completions.create(
        model="gpt-3.5-turbo-1106",
        messages=messages,
        tools=tools,
        tool_choice={"type": "function", "function": {"name": "classify_app"}}
    )
    ret =  response.choices[0].message.tool_calls[0].function.arguments
    return json.loads(ret)["class"]

classify_most_likely("please choose the most likely category that apply to 'Crossword Jam'")

'game_word'

In [12]:
def classify_multiclass(prompt):
    messages = [{"role": "user", "content": prompt}]
    tools = [
        {
            "type": "function",
            "function": {
                "name": "classify_app",
                "description": "Classify to an enum type",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "classes": {"type": "array", "items": {"type": "string", "enum": list(categories.index)}},
                    },
                    "required": ["classes"],
                },
            },
        }
    ]
    response = openai_client.chat.completions.create(
        model="gpt-3.5-turbo-1106",
        messages=messages,
        tools=tools,
        tool_choice={"type": "function", "function": {"name": "classify_app"}}
    )
    ret =  response.choices[0].message.tool_calls[0].function.arguments
    return json.loads(ret)["classes"]

classify_multiclass("please choose the all the categories that apply to 'Crossword Jam'")

['game', 'game_word', 'game_puzzle', 'application']

In [13]:
classify_multiclass("please choose the all the categories that apply to 'Crossword Jam'")

['game', 'game_word', 'puzzle']

## Question:
Please write ask the model to classify all of the apps using the methods we learnt.

Then create an additional column "jaccard" indicating whether intersection over union of the actual categories and the predicted ones.

Which method was most successful?

In [18]:
print(categories.index)

Index(['game', 'game_casual', 'application', 'game_puzzle', 'game_word', 'lifestyle', 'music_and_audio', 'shopping'], dtype='object')


In [26]:
i = 6
app_description = df["description"].iloc[i]
ground_truth_categories = df["category_names"].iloc[i].lower().split(",")
predicted_categories = classify_multiclass(f"please choose the all the categories that apply to '{app_description}'")

def jaccard(a,b):
    a = set(a)
    b = set(b)
    return len(a.intersection(b))/len(a.union(b))

print(f"{app_description=}")
print(f"{ground_truth_categories=}")
print(f"{predicted_categories=}")
print(f"{jaccard(ground_truth_categories, predicted_categories)=}")


APIConnectionError: Connection error.