In [35]:
from datasets import Dataset, DatasetDict
import pandas as pd

In [36]:
datasets = DatasetDict({
    'train': Dataset.from_pandas(pd.read_csv('../data/train.csv')),
    'eval': Dataset.from_pandas(pd.read_csv('../data/eval.csv')),
    'test': Dataset.from_pandas(pd.read_csv('../data/test.csv'))
})

A simple llm based classifier

In [52]:
from ollama import chat
from pydantic import BaseModel, Field

class Classification(BaseModel):
    message: str = Field(description="The message to be classified")
    label: bool = Field(description="True if the message is calendar event related and False otherwise")
    reason: str = Field(description="The reason for classification")


sample_true = datasets['train'].filter(lambda x: x['calendar_event'] == True)['message'][:10]
sample_false = datasets['train'].filter(lambda x: x['calendar_event'] == False)['message'][:10]


def classify_message(msg: str, model: str) -> Classification:
    prompt = f"""
        You are an expert classifier. Your task is to decide whether a given message is related to calendar event or not 
        use labels: "True" if it is calendar event related and "False" if it is not. 

        Use the examples provided below as guidance.
        True Examples:
        {sample_true}

        False Examples:
        {sample_false}

        Now, classify the following message:
        messsage: {msg}
    """
    response = chat(
        messages = [
            {'role': 'user', 'content': prompt},
        ],
        model=model,
        format=Classification.model_json_schema(),
        options={
            'temperature': 0
        }
    )
    return Classification.model_validate_json(response.message.content)


Filter:   0%|          | 0/3924 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3924 [00:00<?, ? examples/s]

In [53]:
classify_message("Let's get on a call", "llama3.1:latest")

Classification(message="Let's get on a call", label=True, reason='The message suggests scheduling a call, which is related to calendar events.')

In [54]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


def compute_binary_classification_metrics(callable_func, dataset: Dataset) -> dict:
    """
    Computes binary classification metrics for a given callable function and dataset.
    
    Args:
        callable_func (callable): A function that takes a message and a model name as input and returns a Classification object.
        dataset (Dataset): The evaluation dataset containing messages to be classified.
        
    Returns:
        dict: A dictionary containing the computed metrics.
    """
    predictions = []
    
    for message in dataset['message']:
        response = callable_func(message)  # Replace 'your_model_name' with the actual model name
        predictions.append(response.label)
    
    predictions = np.array(predictions)
    true_labels = np.array(dataset['calendar_event'])
    
    accuracy = accuracy_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions, pos_label=True)
    recall = recall_score(true_labels, predictions, pos_label=True)
    f1 = f1_score(true_labels, predictions, pos_label=True)
    conf_matrix = confusion_matrix(true_labels, predictions, labels=[True, False])
    
    metrics = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'confusion_matrix': conf_matrix
    }
    
    return metrics, predictions


In [55]:
from functools import partial

llm_model = partial(classify_message, model='llama3.1:latest')


metrics, predictions = compute_binary_classification_metrics(llm_model, datasets['eval'])
metrics

{'accuracy': 0.7857142857142857,
 'precision': 0.7758620689655172,
 'recall': 0.8035714285714286,
 'f1_score': 0.7894736842105263,
 'confusion_matrix': array([[225,  55],
        [ 65, 215]])}

In [46]:
# is it consistent?
# compute_binary_classification_metrics(llm_model, datasets['eval']) 

# {'accuracy': 0.7375,
#  'precision': 0.726962457337884,
#  'recall': 0.7607142857142857,
#  'f1_score': 0.743455497382199,
#  'confusion_matrix': array([[213,  67],
#         [ 80, 200]])}

In [51]:
def unmatched_predictions(predictions, dataset):
    data = dataset.to_pandas()
    data['predictions'] = predictions
    return data[data['predictions'] != data['calendar_event']]    

unmatched_predictions(predictions, datasets['eval']).sample(10)

Unnamed: 0,filename,message,user,calendar_event,predictions
65,chat_12.json,Hi alice! What's up?,bob,True,False
368,,prolly not,geneo91,False,True
104,chat_90.json,hey there! anyone available for a quick help? ...,e_7,True,False
364,,?,edgarin,False,True
445,,hehe,geneo91,False,True
293,,LOL,drspin,False,True
193,chat_101.json,"also, make sure your database user has the rig...",sarahconnor,True,False
516,,"Orbo: that's gzip, not bzip",eyequeue,False,True
559,,terminal is better,Adrenal,False,True
304,,oh,jdub,False,True


It's slow takes (2min for a single run on the eval set) without reasoning 

metrics:
```json
    {'accuracy': 0.7375,
     'precision': 0.726962457337884,
    'recall': 0.7607142857142857,
    'f1_score': 0.743455497382199,
    'confusion_matrix': array(
          [[213,  67],
           [ 80, 200]])}
```
With reasoning it takes 5 mins with improvement in performance 

metrics:
```json
    {'accuracy': 0.7857142857142857,
    'precision': 0.7758620689655172,
    'recall': 0.8035714285714286,
    'f1_score': 0.7894736842105263,
    'confusion_matrix': array([
            [225,  55],
            [ 65, 215]])}
```