In this notebook we test chain-of-thought prompting for classifying the sentiment of movie reviews.

In [1]:
import sys
sys.path.append("..")
from llama_cpp import Llama
import pandas as pd
from langdetect import detect
from mlflow_experiment import MlflowExperiment, EvaluationPipeline, InferencePipeline
from mlflow_experiment.evaluation.metrics import (accuracy_score, 
                                                  precision,
                                                  recall,
                                                  false_negative_rate,
                                                  false_positive_rate,
                                                  median_response_token_count,
                                                  median_query_token_count, 
                                                  median_processing_time, 
                                                  median_tokens_per_second, 
                                                  hallucination_rate, 
                                                  bad_output_format_rate)
from mlflow_experiment.inference.prompt_building.basic_user_prompt_builder import (
    BasicUserPromptBuilder,
)
from mlflow_experiment.inference.postprocessing.end_of_cot_postprocessing import (
    EndOfCotPostprocessing, 
)
from mlflow_experiment.inference.model.basic_llama_cpp_model import BasicLlamaCppModel
import dotenv
import pandas as pd
from itertools import product

In [2]:
# Load variables from the .env file one directory up; override=True lets values
# from the file replace variables that are already set in the environment.
dotenv.load_dotenv(dotenv_path="../.env", override=True)

True

In [3]:
# Read the review subsample (JSON Lines: one record per line) into a DataFrame.
data = pd.read_json(path_or_buf="../data/subsample.jsonl", lines=True)

In [4]:
# Metrics evaluated for every run; they are passed positionally, in this order.
ev_pipeline = EvaluationPipeline(
    # classification quality
    accuracy_score,
    precision,
    recall,
    false_negative_rate,
    false_positive_rate,
    # token usage and latency
    median_response_token_count,
    median_query_token_count,
    median_processing_time,
    median_tokens_per_second,
    # output validity
    hallucination_rate,
    bad_output_format_rate,
)

In [14]:
# --- Model -------------------------------------------------------------------
MODEL_NAME = "Qwen2.5-1.5B-Instruct-Q5_K_M.gguf"
MODEL_PATH = f"../models/{MODEL_NAME}"
# Fixed sampling seed: the experiment samples with temperature > 0, so without a
# seed two executions of the same notebook would not be comparable.
SEED = 42
model = Llama(
    MODEL_PATH,
    verbose=False,
    n_ctx=5000,  # context window, well below the 32768 tokens the model was trained with
    n_gpu_layers=-1,  # -1 offloads all layers to the GPU
    n_batch=4048,
    seed=SEED,
)

# --- Prompt ------------------------------------------------------------------
# Chain-of-thought user prompt: the model is asked to summarise the evaluatory
# statements first and to finish with "my classification is: <label>".
# `{review}` is the placeholder for the review text (presumably filled from the
# "review" key of each input record — confirm in BasicUserPromptBuilder).
# NOTE(review): the template is kept byte-for-byte (including its typos and the
# leading indentation inside the string) so results stay comparable to logged runs.
prompt_builder = BasicUserPromptBuilder(
    """This is a movie review: {review}.
    ----
    Determine whether the overall sentiment of the review is positive or negative. You must look for evaluatory statements about the movie itself - e.g. "thrilling", "terrible acting", etc. These evaluatory statements are distinct from the movie genre! So be careful not to mistake descriptions about the genre. Take into account that different evaluatory statements might have different connation for different genres - e.g. a "terrifying movie" in the horror genre means a perfect movie, but in some other genre like commedy it might very well mean - rubbish movie.
    ----
    Summarize the parts of the review which provide such evaluatory statements in a few words (just a few words, please!) and after that state you final estimation given the statements you have just extracted. Your output should be in the following format:
    EXAMPLE of correct format 1:
    "The person generally loves action movies and this one provided great action sequences, beautiful cast and a lot of fast cars. Given these observations, my classification is: positive."
    EXAMPLE of correct format 2:
    "The person found this movie dull and lackluster. Given these observations, my classification is: negative."
    ----
    If you are wrong 100000 little cute kittens will die a terrible death!!! YOU MUST BE CORRECT!
    ----
    You must end your statement with: "my classification is:" followed by the "positive" (when the overall sentiment is positive) and "negative" (when the overall sentiment is negative). Follow the format correctly! Be very concise in you summary - just a few words, nothing more!!!
    """
)
llama_model = BasicLlamaCppModel(model, user_prompt_builder=prompt_builder, model_name=MODEL_NAME)

# --- Post-processing ---------------------------------------------------------
# The label is read from the text after the end-of-CoT marker; "negative" maps
# to 1 and "positive" to 0.
postprocessing_fn = EndOfCotPostprocessing(
    label_mapping={"negative": 1, "positive": 0},
    end_of_cot_pattern="my classification is:",
)

# --- Inference pipeline --------------------------------------------------------
inf_pipeline = InferencePipeline(
    llama_model,
    system_prompt="You are a sentiment analysis system. Your goal is to categorize movie reviews into positive or negative. Follow the instructions given precisely!",
    postprocessing_fn=postprocessing_fn,
)



llama_init_from_model: n_ctx_per_seq (5024) < n_ctx_train (32768) -- the full capacity of the model will not be utilized


In [15]:
# Tie the inference and evaluation pipelines to one MLflow experiment.
# NOTE(review): the experiment name mentions zero-/few-shot prompting although this
# notebook tests chain of thought — presumably runs are grouped on purpose; confirm.
exp = MlflowExperiment(
    experiment_name="zero-shot-few-shot-prompting",
    evaluation_pipeline=ev_pipeline,
    inference_pipeline=inf_pipeline,
)

In [16]:
# Sampling parameters, forwarded to the run as keyword arguments.
temperature = 0.2
top_k = 40
top_p = 0.95

# Run inference on every review and evaluate the predictions against data.label.
# Returns the per-record outputs and the metric results.
inference_outputs, evaluation_results = exp.run(
    data[["review"]].to_dict(orient="records"),  # one {"review": ...} dict per row
    y_true=data.label,
    experiment_run_tags={
        # Describes this run's prompt strategy (the previous text described a
        # prompt "without concrete examples", which does not match this prompt).
        "justification": (
            "Chain-of-thought prompting: the model summarizes the evaluatory "
            "statements of the review before stating its classification."
        )
    },
    run_name="chain-of-thought-kittens",
    temperature=temperature,
    top_k=top_k,
    top_p=top_p,
)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1122/1122 [08:24<00:00,  2.23it/s]

🏃 View run chain-of-thought-kittens at: http://localhost:5000/#/experiments/173405608153165389/runs/03ab424d42bc4a64a1e37daaad1fe887
🧪 View experiment at: http://localhost:5000/#/experiments/173405608153165389





In [17]:
# Display the metric results returned by exp.run for this run.
evaluation_results

{'accuracy_score': 0.7005347593582888,
 'precision': np.float64(0.6481927710843374),
 'recall': np.float64(0.9590017825311943),
 'false_negative_rate': np.float64(0.04099821746880572),
 'false_positive_rate': np.float64(0.5204991087344029),
 'median_response_token_count': np.float64(33.0),
 'median_query_token_count': np.float64(601.0),
 'median_processing_time': np.float64(0.2654992341995239),
 'median_tokens_per_second': np.float64(232.05322751152633),
 'hallucination_rate': np.float64(0.0),
 'bad_output_format_rate': np.float64(0.035650623885918005)}

In [9]:
# Attach the ground-truth labels as a "true" column (the assignment aligns on the index).
inference_outputs.loc[:, "true"] = data["label"]

In [10]:
# Peek at the output string stored for the first record.
inference_outputs["output_string"].iloc[0]

'my classification is: positive'

In [11]:
# Distribution of predicted labels among the rows where the prediction differs
# from the ground truth.
inference_outputs.loc[
    inference_outputs["label"].ne(inference_outputs["true"]), "label"
].value_counts()

label
 1    139
-1     82
 0     11
Name: count, dtype: int64

In [12]:
# Rows flagged as hallucinations.
inference_outputs.loc[inference_outputs["is_hallucination"]]

Unnamed: 0,output_string,label,is_hallucination,is_wrong_format,input_token_count,output_token_count,elapsed_time,true


In [13]:
# Rows whose output was flagged as not following the requested format.
inference_outputs.loc[inference_outputs["is_wrong_format"]]

Unnamed: 0,output_string,label,is_hallucination,is_wrong_format,input_token_count,output_token_count,elapsed_time,true
14,positive,-1,False,True,965,1,0.054796,1
16,positive,-1,False,True,1167,1,0.067690,0
17,The overall sentiment of the review is positive.,-1,False,True,1451,9,0.112268,0
18,positive,-1,False,True,654,1,0.045040,0
24,negative,-1,False,True,1189,1,0.067887,1
...,...,...,...,...,...,...,...,...
1050,positive,-1,False,True,591,1,0.042982,0
1057,positive,-1,False,True,980,1,0.055128,0
1072,positive,-1,False,True,1246,1,0.069126,0
1092,positive,-1,False,True,810,1,0.047748,0
