In [1]:
import pandas as pd
import tiktoken

from utils.embeddings_utils import get_embedding

import numpy as np
from ast import literal_eval

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [2]:
favourites = pd.read_json('data/edinburgh_favourites.json')
labelled = pd.read_json('data/edinburgh_mommy_50.json')

In [3]:
labelled = labelled[['title', 'description', 'liked']]
favourites = favourites[['title', 'description']]
favourites['liked'] = True



## Data Selection

I have 45 events that she likes, and around 18 that she doesnt. 

In [4]:
data = pd.concat([labelled, favourites.sample(25)], ignore_index=True)

In [5]:
data

Unnamed: 0,title,description,liked
0,Tatty Macleod: Fugue,Time Out 10 Best Comedy Shows at the Fringe 20...,False
1,Jin Hao Li: Swimming in a Submarine,"In his debut hour, Jin Hao walks you through t...",False
2,The Jewel of Africa CANCELLED,A unique physical dance show produced to carry...,False
3,EIFF: Animation Shorts,Animations to arouse the senses. Encounter exc...,True
4,Juliette Burton's Talking Crazy CANCELLED,"In crazy times, what keeps us sane? Comedian, ...",True
...,...,...,...
80,The Big Bite-Size Breakfast Show,Good morning Edinburgh! We're back bringing yo...,True
81,The Sound Inside,UK premiere. Nominated for six Tony Awards inc...,True
82,12th Year! John Hunt Four O'Clock Afternoon Bl...,'Fantastic!' (Jools Holland). 'Boogie-woogie s...,True
83,Every Brilliant Thing,"Ten years on from the play’s debut, a new prod...",True


## Data Preparation

In [6]:
data['combined'] = data['title'] + '; ' + data['description']

In [7]:
embedding_model = "text-embedding-3-small"
embedding_encoding = "cl100k_base"
max_tokens = 8000  # the maximum for text-embedding-3-small is 8191

encoding = tiktoken.get_encoding(embedding_encoding)

# omit reviews that are too long to embed
data["n_tokens"] = data.combined.apply(lambda x: len(encoding.encode(x)))
data = data[data.n_tokens <= max_tokens]
len(data)

85

### Generate Embeddings

In [8]:
# This may take a few minutes
data["embedding"] = data.combined.apply(lambda x: get_embedding(x, model=embedding_model))
data.to_csv("data/edinburgh_training_data_embeddings.csv")

In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    list(data.embedding.values), data.liked, test_size=0.2, random_state=42
)

# train random forest classifier
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)
preds = clf.predict(X_test)
probas = clf.predict_proba(X_test)

report = classification_report(y_test, preds)
print(report)

              precision    recall  f1-score   support

       False       0.50      0.20      0.29         5
        True       0.73      0.92      0.81        12

    accuracy                           0.71        17
   macro avg       0.62      0.56      0.55        17
weighted avg       0.66      0.71      0.66        17



In [10]:
y_train

77     True
42    False
49     True
11    False
30    False
      ...  
20     True
60     True
71     True
14     True
51    False
Name: liked, Length: 68, dtype: bool