# Ronnie Scott's

1. Read data
  a. Combine title and description into a single text field
2. Generate embeddings
3. Assign and save embeddings
4. Train/test split
5. Train random forest
6. Assess, visualise, interpret

In [1]:
import json
from ast import literal_eval

import numpy as np
import pandas as pd
import tiktoken
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

from utils.embeddings_utils import get_embedding

In [2]:
# Load the graded listings and attach the text that gets embedded later:
# one labelled string per row built from the title and description columns.
df = pd.read_csv("data/ronnie_scotts_graded_booter_100.csv").assign(
    combined=lambda frame: "title: " + frame["title"] + " description: " + frame["description"]
)

# Row count first, then a two-row preview as the cell's displayed value.
print(df.shape[0])
df.head(2)

100


Unnamed: 0,Position,title,description,date,stage,image_url,Interested,combined
0,1,Natalie Williams' Soul Family,Soul at its best: Natalie Williams hosts the m...,Sun 19 May - Sun 21 Jul 2024,Main Show,https://cdn.ronniescotts.co.uk/uploads/_listin...,Yes,title: Natalie Williams' Soul Family descripti...
1,2,Sunday Live Music Sessions: Brass Volcanoes Ne...,London's most joyful and spirited New Orleans ...,Sun 19 May 2024,Upstairs @ Ronnie's,https://cdn.ronniescotts.co.uk/uploads/_listin...,No,title: Sunday Live Music Sessions: Brass Volca...


In [8]:
# Export the first 10 graded listings as JSON records, with the "Interested"
# label removed and "Position" renamed to "id".
# The sample is kept under its own name so the full `df` (and its `combined`
# column, which the embedding cells read) is not overwritten.
sample_df = pd.read_csv('data/ronnie_scotts_graded_booter_100.csv')[:10]
output = sample_df.rename(columns={'Position': 'id'})
output = output.drop(["Interested"], axis=1)

# to_json returns JSON text, so parse it with json.loads. ast.literal_eval only
# accepts Python literals: it raises on JSON null/true/false and leaves the
# backslashes of "\/" escapes inside URL strings.
json_output = json.loads(output.to_json(orient='records'))

# File path
file_path = "output.json"

# Write JSON to file
with open(file_path, "w") as file:
    json.dump(json_output, file)

## Embeddings

In [6]:
embedding_model = "text-embedding-3-small"
embedding_encoding = "cl100k_base"
max_tokens = 8000  # the maximum for text-embedding-3-small is 8191

encoding = tiktoken.get_encoding(embedding_encoding)

# Count the tokens in each listing's combined text, then omit any listing that
# is too long to embed.
df["n_tokens"] = df.combined.apply(lambda x: len(encoding.encode(x)))
# .copy() gives an independent frame, so adding the "embedding" column to it
# afterwards does not raise pandas' SettingWithCopyWarning.
df = df[df.n_tokens <= max_tokens].copy()
len(df)

100

No entries exceed 8000 tokens

In [7]:
# The API key must be set in your environment as described in the README:
# https://github.com/openai/openai-python#usage

# get_embedding is called once per row, so this may take a few minutes.
df["embedding"] = df["combined"].map(
    lambda text: get_embedding(text, model=embedding_model)
)
df.to_csv("data/ronnie_scotts_graded_booter_embeddings.csv")

## Split Data

In [9]:
# Hold out 20% of the rows for evaluation; features are the embeddings and the
# target is the "Interested" column. The fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    list(df["embedding"].to_numpy()),
    df["Interested"],
    test_size=0.2,
    random_state=42,
)

In [17]:
# Train a 100-tree random forest on the training embeddings and evaluate it on
# the held-out rows.
# random_state is fixed (same seed as the train/test split) so the fitted
# forest and the printed report are reproducible from run to run.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
preds = clf.predict(X_test)
probas = clf.predict_proba(X_test)  # class probabilities for each test row

# Per-class precision / recall / F1 on the test set.
report = classification_report(y_test, preds)
print(report)

              precision    recall  f1-score   support

          No       0.88      1.00      0.93        14
         Yes       1.00      0.67      0.80         6

    accuracy                           0.90        20
   macro avg       0.94      0.83      0.87        20
weighted avg       0.91      0.90      0.89        20

