In this notebook we test zero-shot and few-shot prompting for sentiment classification of movie reviews.

In [1]:
import sys
sys.path.append("..")
from llama_cpp import Llama
import pandas as pd
from langdetect import detect
from mlflow_experiment import MlflowExperiment, EvaluationPipeline, InferencePipeline
from mlflow_experiment.evaluation.metrics import (accuracy_score, 
                                                  precision,
                                                  recall,
                                                  false_negative_rate,
                                                  false_positive_rate,
                                                  median_response_token_count,
                                                  median_query_token_count, 
                                                  median_processing_time, 
                                                  median_tokens_per_second, 
                                                  hallucination_rate, 
                                                  bad_output_format_rate)
from mlflow_experiment.inference.prompt_building.basic_user_prompt_builder import (
    BasicUserPromptBuilder,
)
from mlflow_experiment.inference.postprocessing.basic_postprocessing import (
    BasicPostprocessing,
)
from mlflow_experiment.inference.model.basic_llama_cpp_model import BasicLlamaCppModel
import dotenv
import pandas as pd
from itertools import product

In [2]:
# Load environment variables from the project-level .env file; values in the
# file take precedence over variables already set in the process environment.
dotenv.load_dotenv(
    dotenv_path="../.env",
    override=True,
)

True

In [3]:
# Read the test split; the file is JSON Lines (one JSON record per line).
data = pd.read_json(
    "../data/imdb-movie-reviews/test.jsonl",
    lines=True,
)

In [4]:
# File name of the GGUF model under test. It is used to build MODEL_PATH and is
# also passed to BasicLlamaCppModel as `model_name`.
MODEL_NAME = "Qwen2.5-1.5B-Instruct-Q5_K_M.gguf"

In [5]:
# Location of the model weights, relative to this notebook's directory.
MODEL_PATH = f"../models/{MODEL_NAME}"

# Fixed RNG seed: the experiment run samples with temperature > 0, so without
# an explicit seed the outputs (and metrics) differ between kernel restarts.
SEED = 42

# Load the model once; it is wrapped by BasicLlamaCppModel further down.
# Per the llama-cpp-python docs, n_gpu_layers=-1 offloads all layers to the GPU
# and n_ctx / n_batch are the context window and prompt batch size in tokens.
model = Llama(
    MODEL_PATH,
    verbose=False,
    n_ctx=32768,
    n_gpu_layers=-1,
    n_batch=2048,
    seed=SEED,
)

In [6]:
# Evaluation pipeline: every metric listed here shows up as a key in the
# `evaluation_results` dict returned by the experiment run.
ev_pipeline = EvaluationPipeline(
    # classification quality
    accuracy_score,
    precision,
    recall,
    false_negative_rate,
    false_positive_rate,
    # token counts and speed
    median_response_token_count,
    median_query_token_count,
    median_processing_time,
    median_tokens_per_second,
    # output sanity
    hallucination_rate,
    bad_output_format_rate,
)

In [10]:
# User prompt template; `{review}` is the placeholder for the review text.
# The wording is part of the experiment and is kept exactly as it was run.
prompt_builder = BasicUserPromptBuilder(
    """This is a movie review: {review}.
    ----
    Determine whether the overall sentiment of the review is positive or negative. You must look for evaluatory statements about the movie itself - e.g. "thrilling", "terrible acting", etc. These evaluatory statements are distinct from the movie genre! So be careful not to mistake descriptions about the genre. Take into account that different evaluatory statements might have different connation for different genres - e.g. a "terrifying movie" in the horror genre means a perfect movie, but in some other genre like commedy it might very well mean - rubbish movie.
    ----
    Your output: your output should be either "positive" (if the overall sentiment is positive) or "negative" (if the overall sentiment is negative). Do not output nothing more. Output only "positive" or "negative"!
    """
)

# Wrap the loaded llama.cpp model together with the prompt builder.
llama_model = BasicLlamaCppModel(
    model,
    user_prompt_builder=prompt_builder,
    model_name=MODEL_NAME,
)

# Map the model's textual answer to the numeric label: negative -> 1, positive -> 0.
postprocessing_fn = BasicPostprocessing({"negative": 1, "positive": 0})

# Full inference pipeline: model + system prompt + output postprocessing.
inf_pipeline = InferencePipeline(
    llama_model,
    system_prompt=(
        "You are a sentiment analysis system. "
        "Your goal is to categorize movie reviews into positive or negative. "
        "Follow the instructions given precisely!"
    ),
    postprocessing_fn=postprocessing_fn,
)


In [11]:
# Bundle inference and evaluation into one experiment; runs started from `exp`
# are recorded under the experiment name below.
exp = MlflowExperiment(
    experiment_name="zero-shot-few-shot-prompting",
    inference_pipeline=inf_pipeline,
    evaluation_pipeline=ev_pipeline,
)

In [12]:
# Sampling parameters forwarded to exp.run for this run.
temperature = 0.2
top_k = 40
top_p = 0.95

# Run inference over every review and evaluate against the true labels.
# The inputs are passed as a list of {"review": ...} dicts, one per row.
# NOTE(review): the tag and run name mention "danger", but the prompt visible in
# this notebook contains no such text — confirm they describe this prompt.
inference_outputs, evaluation_results = exp.run(
    data[["review"]].to_dict(orient="records"),
    y_true=data.label,
    experiment_run_tags={"justification": "Provide description of the task without concrete examples. Add info about impending danger."},
    # Plain string: the original f-string had no placeholders.
    run_name="zero-shot-danger-upon-writer",
    temperature=temperature,
    top_k=top_k,
    top_p=top_p,
)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [10:49<00:00, 15.39it/s]

🏃 View run zero-shot-danger-upon-writer at: http://localhost:5000/#/experiments/173405608153165389/runs/68f8b09db41d47bf91edd1ed8041bc4f
🧪 View experiment at: http://localhost:5000/#/experiments/173405608153165389





In [13]:
# Display the metrics dict returned by exp.run (one entry per metric in ev_pipeline).
evaluation_results

{'accuracy_score': 0.9321,
 'precision': np.float64(0.9679445527398743),
 'recall': np.float64(0.8938),
 'false_negative_rate': np.float64(0.10619999999999996),
 'false_positive_rate': np.float64(0.0296),
 'median_response_token_count': np.float64(1.0),
 'median_query_token_count': np.float64(433.5),
 'median_processing_time': np.float64(0.059711456298828125),
 'median_tokens_per_second': np.float64(33.49440949048703),
 'hallucination_rate': np.float64(0.0),
 'bad_output_format_rate': np.float64(0.0)}

In [119]:
# Attach the ground-truth label next to each prediction (aligned on the index)
# so misclassified rows can be inspected side by side.
# NOTE(review): execution counts jump from 13 to 119 here — confirm this cell
# still works on the outputs of the run above after Restart & Run All.
inference_outputs["true"] = data["label"]

In [120]:
# Display the per-example inference outputs, including the "true" column
# added in the previous cell.
inference_outputs

Unnamed: 0,output_string,label,is_hallucination,is_wrong_format,input_token_count,output_token_count,elapsed_time,true
0,positive,0,False,False,421,1,0.103696,0
1,negative,1,False,False,600,1,0.095301,1
2,negative,1,False,False,639,1,0.095017,1
3,negative,1,False,False,416,1,0.067424,0
4,positive,0,False,False,570,1,0.080919,0
...,...,...,...,...,...,...,...,...
1116,negative,1,False,False,394,1,0.058962,1
1117,positive,0,False,False,573,1,0.072158,0
1118,negative,1,False,False,800,1,0.083261,1
1119,positive,0,False,False,574,1,0.072770,0
