In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import pandas as pd
from datetime import datetime
import json
from tqdm import tqdm
from functools import partial
import json
import numpy as np
from enum import Enum

In [None]:
from langchain import HuggingFaceHub
from langchain import PromptTemplate, LLMChain
from langchain import OpenAI
from langchain.chat_models.openai import ChatOpenAI
from langchain.llms import HuggingFaceTextGenInference

In [None]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

In [None]:
class NpEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy values to native Python types.

    The standard encoder rejects NumPy scalars and arrays. This subclass
    converts NumPy booleans, integers, floats and arrays to ``bool``, ``int``,
    ``float`` and ``list`` so that metric dictionaries can be serialised.
    """

    def default(self, obj):
        """Return a JSON-serialisable equivalent of ``obj``.

        Raises:
            TypeError: (from the base class) when ``obj`` is not a supported
                NumPy type and not otherwise serialisable.
        """
        # np.bool_ is not a subclass of np.integer, so it needs its own branch.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)

In [None]:
from api_key import HUGGINGFACE_API_KEY
from api_key import OPENAI_API_KEY
from api_key import RUNPOD_API_KEY
# Export the API keys as environment variables. RUNPOD_AI_API_KEY is read back
# with os.getenv when the RunPod client is configured later in this notebook;
# the other two are presumably picked up by the LangChain clients - TODO confirm.
os.environ["HUGGINGFACEHUB_API_TOKEN"] = HUGGINGFACE_API_KEY
os.environ['OPENAI_API_KEY']=OPENAI_API_KEY
os.environ["RUNPOD_AI_API_KEY"] = RUNPOD_API_KEY

In [None]:
import runpod

In [None]:
from ai_utils import perform_message_inference, tidy_responses, STANDARD_RESPONSES

### Setup

In [None]:
# Cleaner for free-text answers: tidy_responses with its `keywords` argument
# pre-bound to STANDARD_RESPONSES (both imported from ai_utils).
word_cleaner = partial(tidy_responses, keywords=STANDARD_RESPONSES)

In [None]:
def blank_cleaner(x):
    """Identity cleaner: return the raw model response unchanged.

    Used as the 'cleaner' entry of an experiment card when the mapper (e.g.
    map_numeric_value) should receive the response exactly as produced.
    Defined with ``def`` rather than a lambda so it has a readable name when
    an experiment card is displayed or appears in a traceback.
    """
    return x

In [None]:
# Benchmark dataset. The 'message' column (model input) and the 'review' column
# (textual label) are the ones used further down in this notebook.
bmks = pd.read_csv('clean_bmk.csv')

In [None]:
class RSVP(Enum):
    """Integer class ids shared by the benchmark labels and the model predictions."""
    # map_rsvp_value looks members up by name after replacing spaces with
    # underscores, so the names must match the label text.
    # map_numeric_value maps the digit strings '1', '2', '3' to the first three
    # members and anything else to Other.
    Attend = 1
    Not_attend = 2
    Possibly_attend = 3
    Other = 4

In [None]:
def map_rsvp_value(text):
    """Map a textual RSVP label to its integer ``RSVP`` class value.

    Surrounding whitespace is removed and inner spaces become underscores, so
    ``'Not attend'`` resolves to ``RSVP.Not_attend``. The lookup is
    case-sensitive.

    Args:
        text: label text, or ``None`` / a non-string value (for example the
            NaN float pandas produces for an empty CSV cell).

    Returns:
        int: the matching ``RSVP`` value. ``RSVP.Other.value`` is returned for
        ``None``, non-string input and any label that is not an ``RSVP`` member
        name, mirroring the fallback in ``map_numeric_value``. This keeps one
        unexpected model response from aborting a whole benchmark run.
    """
    # None, NaN and any other non-string cannot name an enum member.
    if not isinstance(text, str):
        return RSVP.Other.value

    tidy_val = text.strip().replace(' ', '_')
    # __members__ is a name -> member mapping, so .get avoids a KeyError.
    return RSVP.__members__.get(tidy_val, RSVP.Other).value

In [None]:
def map_numeric_value(text):
    """Translate a numeric answer ('1', '2' or '3') into an ``RSVP`` class value.

    Args:
        text: the response string, or ``None``.

    Returns:
        int: ``RSVP.Attend`` for '1', ``RSVP.Not_attend`` for '2',
        ``RSVP.Possibly_attend`` for '3' (surrounding whitespace ignored);
        ``RSVP.Other`` for ``None`` or any other content.
    """
    if text is None:
        return RSVP.Other.value

    by_digit = {
        '1': RSVP.Attend,
        '2': RSVP.Not_attend,
        '3': RSVP.Possibly_attend,
    }
    # Anything that is not exactly one of the three digits counts as Other.
    return by_digit.get(text.strip(), RSVP.Other).value

In [None]:
# Numeric ground-truth class for every benchmark row, derived from the textual
# 'review' label via map_rsvp_value.
bmks['review_class'] = bmks['review'].apply(map_rsvp_value)

In [None]:
# Label distribution of the benchmark set.
bmks.review.value_counts()

In [None]:
def prepare_metrics(true, pred, class_labels):
    """Print and return accuracy, macro-averaged and per-class metrics.

    Adapted from
    https://programtalk.com/python-more-examples/sklearn.metrics.precision_recall_fscore_support/?ipage=4&utm_content=cmp-true

    Args:
        true: sequence of ground-truth class labels.
        pred: sequence of predicted class labels, aligned with ``true``.
        class_labels: the class labels to report on. They are passed to
            scikit-learn as ``labels=`` in sorted order, so every label gets a
            column even when it occurs in neither ``true`` nor ``pred``.

    Returns:
        dict with three keys:
            'accuracy'  - the value returned by ``accuracy_score``;
            'macro'     - the tuple returned by
                          ``precision_recall_fscore_support(average='macro')``
                          (its support entry is ``None``);
            'per_class' - ``{metric name: {class label: value}}`` for
                          "Precision", "Recall", "F-score" and "Support".
    """
    results_headers = ("Precision", "Recall", "F-score", "Support")
    # scikit-learn returns per-class arrays in the order of `labels`; fixing
    # that order once keeps printed columns and stored keys aligned with values.
    ordered_labels = sorted(class_labels)

    results = {}

    print("\nResults on testing set")

    test_accuracy = accuracy_score(true, pred)
    print("\nAccuracy =", test_accuracy)
    results['accuracy'] = test_accuracy

    print("\nMacro-average:")
    macroavg_prfs = precision_recall_fscore_support(
        true, pred, labels=ordered_labels, average='macro')
    for lab, val in zip(results_headers, macroavg_prfs):
        if val is not None:
            print("%-12s%-12.3f" % (lab, val))
        else:
            # Support is not defined for an averaged result.
            print("%-12s%-12s" % (lab, "--"))
    results['macro'] = macroavg_prfs

    print("\nPer-class:")
    perclass_prfs = precision_recall_fscore_support(
        true, pred, labels=ordered_labels)
    # One 12-character column per label, whatever the number of classes.
    print(("%-12s" * (len(ordered_labels) + 1)) % tuple([""] + ordered_labels))

    per_metric = {}
    for lab, vals in zip(results_headers, perclass_prfs):
        # Support holds counts; the other rows are ratios.
        cell_fmt = "%-12i" if lab == "Support" else "%-12.3f"
        print("%-12s" % lab + "".join(cell_fmt % val for val in vals))
        per_metric[lab] = dict(zip(ordered_labels, vals))

    results['per_class'] = per_metric

    return results

### Database

In [None]:
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from database_models import  Experiments_Base, Prompts, Models, Experiments
from sqlalchemy import select

In [None]:
from db_utils import get_prompt, add_prompt, get_prompt_names
from db_utils import get_model, add_model
from db_utils import add_experiment, get_experiment

In [None]:
# SQLite store for experiments. JSON values are serialised with NpEncoder so
# that NumPy values inside the metrics dict can be written.
engine = create_engine('sqlite:///experiments.db', json_serializer=lambda obj: json.dumps(obj, cls=NpEncoder))
# Create the tables declared on Experiments_Base.
Experiments_Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
# Single shared session passed to the db_utils helpers throughout the notebook.
session = Session()

In [None]:
# Show what db_utils.get_prompt_names returns for this session (presumably the
# short names of the stored prompts - TODO confirm).
get_prompt_names(session)

### Run experiments

In [None]:
def _build_llm(exp, llm_link=None):
    """Return the LangChain LLM selected by the experiment card's 'platform'."""
    platform_type = exp['platform']

    if platform_type == 'openai':
        return OpenAI(temperature=0.5)
    if platform_type == 'chat_openai':
        return ChatOpenAI(model='gpt-4', temperature=0.5)
    if platform_type == 'hf_hub':
        return HuggingFaceHub(repo_id=exp['model'],
                              model_kwargs={"temperature": 0.5, "max_length": 128})
    if platform_type == 'run_pod':
        # This platform uses a client the caller has already constructed.
        if llm_link is None:
            raise ValueError("platform 'run_pod' requires a pre-built LLM in llm_link")
        return llm_link

    raise ValueError(
        f"Unknown platform {platform_type!r}; expected one of "
        "'openai', 'chat_openai', 'hf_hub', 'run_pod'"
    )


def run_benchmarking(exp, llm_link=None):
    """Run every prompt on an experiment card against the benchmark messages.

    For each prompt the stored experiment for (model, prompt) is looked up
    first. If one exists its accuracy is printed and nothing is re-run.
    Otherwise every message in ``bmks['message']`` is sent to the model, the
    response is passed through the prompt's cleaner and mapper, and the
    resulting metrics are saved with ``add_experiment``.

    Relies on the notebook-level ``session`` (database) and ``bmks``
    (benchmark DataFrame with 'message' and 'review_class' columns).

    Args:
        exp: experiment card, a dict with keys
            'model'    - model name passed to ``get_model``;
            'platform' - 'openai', 'chat_openai', 'hf_hub' or 'run_pod';
            'prompts'  - list of dicts with 'short_name', 'cleaner', 'mapper'.
        llm_link: pre-built LLM, required when the platform is 'run_pod'.

    Returns:
        The database id of the model.

    Raises:
        TypeError: if ``exp`` is not a dict.
        ValueError: if the platform is unknown, ``llm_link`` is missing for
            'run_pod', or the model or a prompt is not found in the database.
    """
    if not isinstance(exp, dict):
        raise TypeError(f'exp must be a dict, got {type(exp).__name__}')

    print(exp['model'])

    model_obj = get_model(session, exp['model'])
    # get_model yields None for an unregistered model; fail here rather than
    # with an AttributeError on model_obj.id further down.
    if model_obj is None:
        raise ValueError(
            f"Model {exp['model']!r} is not registered; call add_model first")

    llm = _build_llm(exp, llm_link)

    for prompt_data in exp['prompts']:
        short_name = prompt_data["short_name"]
        cleaner = prompt_data['cleaner']
        mapper = prompt_data['mapper']

        print(f'prompt name: {short_name}')

        prompt_obj = get_prompt(session, short_name)
        # Presumably get_prompt returns None for an unknown name, like
        # get_model does - TODO confirm. The check is harmless otherwise.
        if prompt_obj is None:
            raise ValueError(f'Prompt {short_name!r} was not found in the database')

        experiment_check = get_experiment(session, model_obj.id, prompt_obj.id)

        if experiment_check is not None:
            acc = experiment_check.results['accuracy']
            print(f'Experiment previously performed with accuracy {acc}')

        else:
            preds = []
            for tm in tqdm(bmks['message'].to_list()):
                # One inference call per benchmark message.
                raw = perform_message_inference(llm, prompt_obj.prompt_text, tm)

                print(f'raw: {raw}')
                clean = cleaner(raw)
                print(f'clean: {clean}')
                mapped = mapper(clean)
                print(f'mapped: {mapped}')

                preds.append(mapped)

            # Report on every RSVP class (1..4) so results stay comparable
            # across runs.
            class_labels = [member.value for member in RSVP]
            metrics = prepare_metrics(bmks.review_class.to_list(), preds, class_labels)

            add_experiment(session, model_obj.id, prompt_obj.id, 'bmks_81', metrics)

    return model_obj.id

### Ready External LLMs

#### Prepare RunPod and connection

In [None]:
# Hand the RunPod key to the runpod client. It is read from the environment
# variable exported earlier in this notebook, with a placeholder as fallback.
runpod.api_key = os.getenv("RUNPOD_AI_API_KEY", "your_runpod_api_key")

# Fail fast when the key is unusable: either the placeholder (variable unset)
# or an empty value, which would otherwise only fail at pod creation.
if not runpod.api_key or runpod.api_key == "your_runpod_api_key":
    print("It appears that you don't have a RunPod API key")

    raise AssertionError("Missing RunPod API key")

In [None]:
# Smaller model parameters: a single GPU. These values are used when the pod is
# created and in the docker args of the inference server below.
# Leave exactly one repo_id line active; the sizes are rough estimates.
gpu_count = 1
gpu_type="NVIDIA RTX A6000"
# repo_id = 'meta-llama/Llama-2-7b-chat-hf' # ~ 24GB?
repo_id = 'meta-llama/Llama-2-13b-chat-hf' # ~ 26GB?
# repo_id = 'tiiuae/falcon-7b-instruct' # ~ 24GB?

In [None]:
# larger model parameters
# gpu_count = 2
# gpu_type = "NVIDIA A100 80GB PCIe"
# repo_id = 'meta-llama/Llama-2-70b-chat-hf' # ~ 140GB?
# repo_id = 'tiiuae/falcon-40b-instruct' # ~ 140GB?

In [None]:
# Display the GPU type that will be requested for the pod.
gpu_type

In [None]:
# Register the model in the experiments database if it is not there yet.
model_obj = get_model(session, repo_id)
if model_obj is None:
    add_model(session,repo_id)

In [None]:
# Start a RunPod pod running the Hugging Face text-generation-inference image,
# serving repo_id sharded over gpu_count GPUs.
# NOTE(review): the image has no tag pinned, and 'shm-size' looks like a docker
# run flag rather than an environment variable - confirm it has any effect here.
pod = runpod.create_pod(
    name="Party Bot Benchmarking",
    image_name="ghcr.io/huggingface/text-generation-inference",
    gpu_type_id=gpu_type, 
    cloud_type="COMMUNITY",
    docker_args=f"--model-id {repo_id} --num-shard {gpu_count}",
    gpu_count=gpu_count,
    volume_in_gb=195,
    container_disk_in_gb=5,
    ports="80/http",
    volume_mount_path="/data",
    env={'HUGGING_FACE_HUB_TOKEN':HUGGINGFACE_API_KEY, 'MAX_JOBS':4, 'shm-size':'1g'}
)

In [None]:
# Inspect the value returned by create_pod; its "id" entry is used below.
pod

In [None]:
# URL built from the pod id and the exposed port 80 on the RunPod proxy domain.
inference_server_url = f'https://{pod["id"]}-80.proxy.runpod.net'
# LangChain client that talks to the inference server at that URL. The sampling
# settings use the same temperature (0.5) as the other platforms.
llm = HuggingFaceTextGenInference(
    inference_server_url=inference_server_url,
    max_new_tokens=100,
    top_k=10,
    top_p=0.95,
    typical_p=0.95,
    temperature=0.5,
    repetition_penalty=1.03,
)

#### Perform RunPod benchmarking

In [None]:
# Experiment card consumed by run_benchmarking: the model, the platform used to
# reach it, and one entry per prompt. Each 'short_name' is passed to get_prompt
# as written; 'cleaner' is applied to the raw response and 'mapper' turns the
# cleaned text into an RSVP class value.
experiment_card={'model':repo_id,'platform':'run_pod',
           'prompts':[
               {'short_name': 'alpaca style','mapper':map_rsvp_value, 'cleaner':word_cleaner},
               {'short_name': 'alpaca succint','mapper':map_rsvp_value, 'cleaner':word_cleaner},
               {'short_name': 'example','mapper':map_rsvp_value, 'cleaner':word_cleaner},
               {'short_name': 'original bmk','mapper':map_rsvp_value, 'cleaner':word_cleaner},
               {'short_name': 'numeric succint','mapper':map_numeric_value, 'cleaner':blank_cleaner},
                ]}

In [None]:
# Display the card to check it before running.
experiment_card

In [None]:
# Make sure the card's model is registered. run_benchmarking looks it up with
# get_model and uses its id.
model_obj = get_model(session, experiment_card['model'])
if model_obj is None:
    add_model(session, experiment_card['model'])

In [None]:
# Run every prompt on the card against the RunPod-hosted model via `llm`.
model_id = run_benchmarking(experiment_card, llm)

#### Shut down the RunPod pod

In [None]:
# Stop the pod created for this benchmarking run.
runpod.stop_pod(pod["id"])

In [None]:
# Terminate the pod created for this benchmarking run.
runpod.terminate_pod(pod["id"])

### Run against API

In [None]:
# Experiment card for a model reached through the Hugging Face Hub: with the
# 'hf_hub' platform run_benchmarking builds the client itself, using 'model'
# as the repo id. Prompts, cleaners and mappers are the same as for RunPod.
experiment_card={'model':'google/flan-t5-xxl','platform':'hf_hub',
          'prompts':[
               {'short_name': 'alpaca style','mapper':map_rsvp_value, 'cleaner':word_cleaner},
               {'short_name': 'alpaca succint','mapper':map_rsvp_value, 'cleaner':word_cleaner},
               {'short_name': 'example','mapper':map_rsvp_value, 'cleaner':word_cleaner},
               {'short_name': 'original bmk','mapper':map_rsvp_value, 'cleaner':word_cleaner},
               {'short_name': 'numeric succint','mapper':map_numeric_value, 'cleaner':blank_cleaner},
                ]}

In [None]:
# Register the hub model in the experiments database if it is not there yet.
model_obj = get_model(session, experiment_card['model'])
if model_obj is None:
    add_model(session, experiment_card['model'])

In [None]:
# No pre-built LLM is passed: the 'hf_hub' platform constructs its own client.
model_id = run_benchmarking(experiment_card, None)

### Local model running

In [None]:
from transformers import pipeline
from langchain.llms import HuggingFacePipeline
import torch

In [None]:
# Model to run locally. This rebinds repo_id, which the RunPod section also sets.
repo_id = "databricks/dolly-v2-3b"

In [None]:
# Load the model into a local transformers pipeline.
# NOTE(review): trust_remote_code=True lets code shipped in the model repo run
# on this machine - only use it with repos you trust.
generate_text = pipeline(model=repo_id, torch_dtype=torch.bfloat16,
                         trust_remote_code=True, device_map="auto", return_full_text=True)

In [None]:
# Register the local model in the experiments database if it is not there yet.
model_obj = get_model(session, repo_id)
if model_obj is None:
    add_model(session,repo_id)

In [None]:
# Experiment card for the local model. The platform is 'run_pod' even though
# nothing runs on RunPod here: that is the run_benchmarking branch which uses
# the LLM object passed in by the caller instead of constructing one.
experiment_card={'model':repo_id,'platform':'run_pod',
           'prompts':[
               {'short_name': 'alpaca style','mapper':map_rsvp_value, 'cleaner':word_cleaner},
               {'short_name': 'alpaca succint','mapper':map_rsvp_value, 'cleaner':word_cleaner},
               {'short_name': 'example','mapper':map_rsvp_value, 'cleaner':word_cleaner},
               {'short_name': 'original bmk','mapper':map_rsvp_value, 'cleaner':word_cleaner},
               {'short_name': 'numeric succint','mapper':map_numeric_value, 'cleaner':blank_cleaner},
                ]}

In [None]:
# Wrap the local pipeline as a LangChain LLM. This rebinds `llm`, which earlier
# pointed at the RunPod inference client.
llm = HuggingFacePipeline(pipeline=generate_text)

In [None]:
# Run every prompt on the card against the locally loaded model.
model_id = run_benchmarking(experiment_card, llm)

### Review results

In [None]:
from pandas import json_normalize

In [None]:
# Connection of the shared session, handed to pandas.read_sql below.
con = session.connection()

In [None]:
# One row per stored experiment: prompt short name, model name and the stored
# results, ordered by model. Left joins keep experiments whose model or prompt
# row is missing.
results_df = pd.read_sql(sql="""select short_name, model_name, results
from experiments
left join models on experiments.model_id = models.id
left join prompts on experiments.prompt_id = prompts.id
order by model_name
""", con=con)

In [None]:
# Preview the first rows of the raw query result.
results_df.head(3)

In [None]:
# {row index: results value}; each value is parsed with json.loads in the next
# cell, so it is expected to be a JSON string.
results_dict = results_df['results'].to_dict()

In [None]:
# Parse each stored JSON metrics blob, flatten nested keys with '_' and tag the
# resulting one-row frame with the results_df row index it came from.
all_results = [
    json_normalize(json.loads(raw_json), sep='_').assign(idx=row_idx)
    for row_idx, raw_json in results_dict.items()
]

# Stack the rows and index them by the originating results_df row.
all_results_df = pd.concat(all_results).set_index('idx')
all_results_df.head(3)

In [None]:
# Attach the flattened metric columns to the query result, matched on row index.
results_df_join = results_df.join(all_results_df)

In [None]:
# Display the combined results table.
results_df_join

In [None]:
# Export the combined results table to CSV in the working directory.
results_df_join.to_csv('experiment_results.csv')