In [1]:
!pip install -q predibase datasets

& was unexpected at this time.
The value specified in an AutoRun registry key could not be parsed.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gradio 4.37.2 requires urllib3~=2.0, but you have urllib3 1.26.12 which is incompatible.


In [1]:
import warnings

warnings.filterwarnings("ignore")

In [3]:
from datasets import load_dataset
import csv

def hfdataset_to_csv(datalist: list, csv_file_name, max=-1):
  template = {
        "prompt":
        """<|im_start|>system\nYou are an bunisee law expert. Based on the given question, generate a single line answer concisely.  <|im_end|>
<|im_start|>Question\n {content}
<|im_start|>Answer\n""",
        "completion": "{headline}<|im_end|>",
        "split": "train"}


  with open(csv_file_name, 'w', newline='') as csvfile:
    fieldnames = template.keys()
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for i, d in enumerate(datalist):
      if i >= max:
        break
      row = {
          "prompt": template["prompt"].format(content=d["Question"]),
          "completion": template["completion"].format(headline=d["Answer"]),
          "split": "train"
      }
      writer.writerow(row)

In [4]:
def validate_data_csv(csv_file_name):
  """ Make sure it has prompt, completion, and split with all values """
  with open(csv_file_name, 'r') as f:
    reader = csv.DictReader(f)
    for row in reader:
      assert row['prompt']
      assert row['completion']
      assert row['split']

  return True

In [5]:
from tokenizers import Tokenizer
tokenizer = Tokenizer.from_pretrained("upstage/solar-1-mini-tokenizer")

def compute_cost(csv_file_name, price_per_million_tokens=0.5):
  """ Compute the cost of the dataset """

  total_num_of_tokens = 0
  with open(csv_file_name, 'r') as f:
    reader = csv.DictReader(f)
    # get all values
    values = [row['completion']+ " " + row['prompt'] for row in reader]
    for value in values:
      # tokenize
      enc = tokenizer.encode(value)
      num_of_tokens = len(enc.tokens)
      total_num_of_tokens += num_of_tokens


  return total_num_of_tokens / 1000000 * price_per_million_tokens

In [6]:
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split

# Load the dataset
hfdataset = load_dataset("csv", data_files="KBL_dataset.csv")
train_hfdataset = hfdataset["train"]
print(train_hfdataset)
# Convert the dataset to a pandas DataFrame for easier manipulation
train_df = train_hfdataset.to_pandas()

# Split the train dataset into 80% train and 20% test
train_split, test_split = train_test_split(train_df, test_size=0.1, random_state=42)

train_split.reset_index(drop=True, inplace=True)
test_split.reset_index(drop=True, inplace=True)

# Convert back to the Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_split)
test_dataset = Dataset.from_pandas(test_split)

# Print the sizes of the new datasets
print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")
print(train_dataset)

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['Question', 'Answer'],
    num_rows: 1468
})
Train dataset size: 1321
Test dataset size: 147
Dataset({
    features: ['Question', 'Answer'],
    num_rows: 1321
})


In [7]:
dataset_name = "NMT_train_dataset"
csv_file_name = f"{dataset_name}.csv"
try:
  pb_dataset = pb.datasets.get(dataset_name)
  print(f"Dataset found: {pb_dataset}")
except RuntimeError:
  print("Dataset not found, creating...")

  hfdataset_to_csv(train_dataset, csv_file_name, max=1321)

  print(f"Dataset Validation: {validate_data_csv(csv_file_name)}")
  print(f"One step FT Cost: {compute_cost(csv_file_name)} USD")

  #print("Uploading daatset...")
  pb_dataset = pb.datasets.from_file(csv_file_name, name=dataset_name)

Dataset not found, creating...
Dataset Validation: True
One step FT Cost: 0.0592745 USD


In [8]:
repo_name = "KBL-01"
repo = pb.repos.create(name=repo_name, description="Upstage_Hackathon_KBL_Adapter", exists_ok=True)
print(repo)

uuid='3857a37e-6853-45e8-bc2a-b90fe6233237' name='KBL-01' description='Upstage_Hackathon_KBL_Adapter'


In [9]:
adapter = pb.adapters.create(
    config=FinetuningConfig(
        base_model="solar-1-mini-chat-240612",
        epochs=3, # default: 3
        rank=16, # default: 16
    ),
    dataset=pb_dataset, # Also accepts the dataset name as a string
    repo=repo,
    description="initial model with defaults"
)

Successfully requested finetuning of solar-1-mini-chat-240612 as `KBL-01/1`. (Job UUID: 6b60911e-8b86-46f1-a785-6e01b6a55a93).

Watching progress of finetuning job 6b60911e-8b86-46f1-a785-6e01b6a55a93. This call will block until the job has finished. Canceling or terminating this call will NOT cancel or terminate the job itself.

Job is starting. Total queue time: 0:00:45         
Waiting to receive training metrics...

┌────────────┬────────────┬─────────────────┐
│ checkpoint [0m│ train_loss [0m│ validation_loss [0m│
├────────────┼────────────┼─────────────────┤
│     1      [0m│   1.4888   [0m│        --       [0m│
│     2      [0m│   1.5464   [0m│        --       [0m│
└────────────┴────────────┴─────────────────┘


In [10]:
adapter

Adapter(repo='KBL-01', tag=1, archived=False, base_model='solar-1-mini-chat-240612', description='initial model with defaults', artifact_path='6b60911e-8b86-46f1-a785-6e01b6a55a93/9dbcddf4fb634382897ebba29d8f59dd/artifacts/model/model_weights', finetuning_error=None, finetuning_job_uuid='6b60911e-8b86-46f1-a785-6e01b6a55a93')

In [11]:
adapter_id = adapter.repo + "/" + str(adapter.tag)
adapter_id

'KBL-01/1'

In [12]:
input_prompt="""
<|im_start|>system\nYou are an bunisee law expert. Based on the given question, generate a single line answer concisely..<|im_end|>
<|im_start|>Question
What is the website address for the Korean branch of PwC? <|im_end|>
<|im_start|>Answer
"""


In [13]:
# CURL test
import requests
import json

url = "https://serving.app.predibase.com/7ea6d0/deployments/v2/llms/solar-1-mini-chat-240612/generate"


payload = {
    "inputs": input_prompt,
    "parameters": {
        "adapter_id": adapter_id,
        "adapter_source": "pbase",
        "max_new_tokens": 60,
        "temperature": 0.1
    }
}

headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {api_token}"
}

response = requests.post(url, data=json.dumps(payload), headers=headers)

print(response.text)

{"generated_text":"www.pwc.com/kr"}


In [15]:
!pip -q install langchain langchain-upstage solar-as-judge

& was unexpected at this time.
The value specified in an AutoRun registry key could not be parsed.
  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gradio 4.37.2 requires urllib3~=2.0, but you have urllib3 1.26.12 which is incompatible.


In [15]:
lorax_client = pb.deployments.client("solar-1-mini-chat-240612")

In [20]:
# Create test dataset
test_csv_file_name = f"KBL_test_dataset"
hfdataset_to_csv(test_dataset, test_csv_file_name, max=147)

win_results = {"A_wins":0, "B_wins": 0, "tie": 0,  "A_score": 0, "B_score": 0}
with open(test_csv_file_name, 'r') as f:
  reader = csv.DictReader(f)
  for row in reader:
    prompt = row['prompt']
    ground_truth = row['completion']
    A_answer = lorax_client.generate(prompt, max_new_tokens=10).generated_text
    B_answer = lorax_client.generate(prompt, adapter_id=adapter_id, max_new_tokens=1000).generated_text

    A_score, B_score = saj.judge(prompt, A_answer, B_answer, ground_truth)
    print( A_score, B_score, A_answer, B_answer)
    win_results["A_score"] += A_score
    win_results["B_score"] += B_score
    if A_score > B_score:
      win_results["A_wins"] += 1
    elif B_score > A_score:
      win_results["B_wins"] += 1
    else:
      win_results["tie"] += 1
    print(win_results)


0 0 Over the years, Korean products have significantly improved in The quality and popularity of Korean products have significantly improved over the years, with Korean products now being highly popular and sought after in the global market.
{'A_wins': 0, 'B_wins': 0, 'tie': 1, 'A_score': 0, 'B_score': 0}
8 2 Fennec Shand Jang
{'A_wins': 1, 'B_wins': 0, 'tie': 1, 'A_score': 8, 'B_score': 2}
0 0 Yes, there are regulations for advertising certain types of Yes, there are regulations for advertising certain types of products in Korea. For example, advertising for alcoholic beverages is prohibited on television between 6:00 a.m. and 8:00 p.m. and on radio between 6:00 a.m. and 9:00 a.m. and 1:00 p.m. to 5:00 p.m.
{'A_wins': 1, 'B_wins': 0, 'tie': 2, 'A_score': 8, 'B_score': 2}
0 0 The location of PwC's office in PwC Poland
{'A_wins': 1, 'B_wins': 0, 'tie': 3, 'A_score': 8, 'B_score': 2}
10 8 Yes, foreigners are allowed to acquire land in Foreigners are allowed to acquire land in Korea, but 

Task exception was never retrieved
future: <Task finished name='Task-1446' coro=<AsyncClient.aclose() done, defined at c:\Users\emon\anaconda3\Lib\site-packages\httpx\_client.py:1996> exception=AttributeError("'AsyncHttpxClientWrapper' object has no attribute '_transport'")>
Traceback (most recent call last):
  File "c:\Users\emon\anaconda3\Lib\site-packages\httpx\_client.py", line 2003, in aclose
    await self._transport.aclose()
          ^^^^^^^^^^^^^^^
AttributeError: 'AsyncHttpxClientWrapper' object has no attribute '_transport'
