In [1]:
import warnings

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used below — confirm that is intended.
warnings.filterwarnings("ignore")

In [2]:
from predibase import Predibase, FinetuningConfig, DeploymentConfig
from dotenv import load_dotenv
import os
import pandas as pd

# Load environment variables from the .env file
load_dotenv()

# Get a KEY from https://app.predibase.com/
# os.getenv returns None when PREDIBASE_API_KEY is not set.
# NOTE(review): the token is passed on unchecked and is later interpolated into an HTTP Authorization header — consider failing fast here.
api_token: str = os.getenv('PREDIBASE_API_KEY')
# Client object used for every Predibase call in this notebook.
pb = Predibase(api_token=api_token)

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [3]:

def review_data_to_dataframe(review_df, max=-1):
    """Turn labelled reviews into prompt/completion rows for fine-tuning.

    The input is shuffled with a fixed seed (``random_state=42``), optionally
    truncated, and every row is rendered through the prompt template.

    Args:
        review_df: DataFrame with a ``content`` column (review text) and a
            ``human_label`` column (the target label).
        max: Maximum number of rows to keep after shuffling. ``-1`` (the
            default), any other negative value, or ``None`` keeps all rows.
            The name shadows the built-in ``max`` inside this function; it is
            kept because callers pass it by keyword.

    Returns:
        A DataFrame with the columns ``prompt``, ``completion`` and ``split``
        (always in that order, even when no rows are selected).
    """
    template = {
        "prompt": """system\n다음은 해당 업체에 대한 소비자의 리뷰입니다. 해당 리뷰를 positive, neutral, negative 중 하나로 분류하세요.\nreview\n {content}\nclassification\n""",
        "completion": "{label}",
        "split": "train"
    }
    output_columns = ["prompt", "completion", "split"]

    # Shuffle the DataFrame
    shuffled_df = review_df.sample(frac=1, random_state=42).reset_index(drop=True)

    # Limit to max rows if specified. Negative values mean "no limit";
    # passing them to head() would instead drop rows from the end.
    if max is not None and max >= 0:
        shuffled_df = shuffled_df.head(max)

    rows = [
        {
            "prompt": template["prompt"].format(content=content),
            "completion": template["completion"].format(label=label),
            "split": template["split"],
        }
        for content, label in zip(shuffled_df["content"], shuffled_df["human_label"])
    ]

    # Passing the columns explicitly keeps the schema stable for an empty result.
    return pd.DataFrame(rows, columns=output_columns)

In [4]:
from tokenizers import Tokenizer
# Module-level tokenizer; compute_cost_from_dataframe reads this global to count tokens.
tokenizer = Tokenizer.from_pretrained("upstage/solar-1-mini-tokenizer")

def compute_cost_from_dataframe(df, price_per_million_tokens=0.5, encoder=None):
    """Estimate the cost of one pass over the dataset from its token count.

    Each row is tokenized as ``completion + " " + prompt`` and the token
    counts are summed.

    Args:
        df: DataFrame with string columns ``completion`` and ``prompt``.
        price_per_million_tokens: Price charged per one million tokens.
        encoder: Object with an ``encode(text)`` method whose result exposes a
            ``tokens`` sequence. Defaults to the module-level ``tokenizer``.

    Returns:
        The estimated cost as a float (``0.0`` for an empty DataFrame).
    """
    if encoder is None:
        encoder = tokenizer

    # Iterate the two columns directly: DataFrame.apply(axis=1) on an empty
    # frame hands back the frame itself, and looping over that yields column
    # names, which would be tokenized and billed as if they were rows.
    total_num_of_tokens = sum(
        len(encoder.encode(completion + " " + prompt).tokens)
        for completion, prompt in zip(df["completion"], df["prompt"])
    )

    return total_num_of_tokens / 1000000 * price_per_million_tokens

In [5]:
def load_review_data(path):
    """Read the review spreadsheet at ``path`` and return it as a DataFrame.

    Thin wrapper around ``pd.read_excel`` called with its default options.
    """
    return pd.read_excel(path)

In [6]:
# Read the labelled reviews from the spreadsheet and preview the first rows.
review_df = load_review_data(path="review_label_df.xlsx")
review_df.head()

Unnamed: 0,nickname,content,date,visit_cnt,solar_label,solar_label_eval,human_label
0,우히히밍ㅇ,명불허전,8.9.금,1번째 방문,"{""sentiment"": ""positive""}",positive,positive
1,이야기 탐정,제주에서 유명한 김밥맛집 오는정김밥에 갔어요.💕예약을 하력고 전화를 하는데 통화 중...,8.9.금,1번째 방문,"{""sentiment"": ""positive""}",positive,positive
2,jennyscampus,"제 개인적으로는 짜고, 느끼했는데 친언니는 입맛에 맞다고 하네요~^^",8.8.목,1번째 방문,"{""sentiment"": ""negative""}",negative,neutral
3,연희6440,어렵다어려워 힘들게드디여 김밥 영접.당일 예약만 가능한줄 알았는데전날 예약도 가능하...,8.8.목,1번째 방문,"{""sentiment"": ""negative""}",negative,negative
4,낙원컴퍼니,제주 맛집 블로거 디포입니다개인적으로 예전보단 맛이 덜하지만여전히 많은분들께 사랑받...,8.8.목,1번째 방문,"{""sentiment"": ""neutral""}",neutral,neutral


In [7]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows as a test set; random_state fixes the shuffle so the split is repeatable.
train_df, test_df = train_test_split(review_df, test_size=0.1, random_state=42)

In [8]:
# Discard the index carried over from the split and renumber from 0, then show the size and a sample.
train_df = train_df.reset_index(drop=True)
print(len(train_df))
train_df.head(3)

5373


Unnamed: 0,nickname,content,date,visit_cnt,solar_label,solar_label_eval,human_label
0,프린세스BIN,"베스트 이불빵이예요. 맘모스같은데 팥,크림,팥 겹겹이 들어있고 크기도 커요.요런종류...",1.1.월,1번째 방문,"{""sentiment"": ""positive""}",positive,positive
1,A Sun Shower,마농바게트 몇개씩 사가는 이유가 있네요한개구입한개 아쉽네요 마늘빵 좋아하시는분들은 ...,7.23.화,1번째 방문,"{""sentiment"": ""positive""}",positive,positive
2,라꽁비에뜨,넘 귀여운 도넛집💗,1.25.목,3번째 방문,"{""sentiment"": ""positive""}",positive,positive


In [9]:
# Discard the index carried over from the split and renumber from 0, then show the size and a sample.
test_df = test_df.reset_index(drop=True)
print(len(test_df))
test_df.head(3)

597


Unnamed: 0,nickname,content,date,visit_cnt,solar_label,solar_label_eval,human_label
0,사또라레84,빵 맛집 뷰 맛집입니다특히 3층이 전망 예술이네요빨리오셔야할듯 6시까지래요 ㅋㅋ잘쉬...,7.6.토,1번째 방문,"{\n ""sentiment"": ""positive""\n}",positive,positive
1,부산61,역시 맛이 짱이에요,7.22.월,1번째 방문,"{""sentiment"": ""positive""}",positive,positive
2,사랑해냠냠,넓고 쾌적해요 오전이라 빵종류도 많고 넉넉하네요소금빵 정말 너무 맛남 ㅋㅋ커피도 싸...,7.30.화,5번째 방문,"{""sentiment"": ""positive""}",positive,positive


In [11]:
# NOTE(review): `time` is not used anywhere in this cell.
import time

# Reuse the uploaded dataset with this name if the lookup succeeds; otherwise build and upload it.
dataset_name = "jeju_review_240817"
try:
  pb_dataset = pb.datasets.get(dataset_name)
  print(f"Dataset found: {pb_dataset}")
except RuntimeError:
  # A RuntimeError from the lookup is treated as "dataset does not exist yet".
  print("Dataset not found, creating...")

  # Build prompt/completion rows from at most 500 shuffled training reviews.
  review_df_final = review_data_to_dataframe(train_df, max=500)

  # Cost estimate uses the helper's default price per million tokens.
  print(f"One step FT Cost: {compute_cost_from_dataframe(review_df_final)} USD")
  # Write the rows to a local CSV, then upload that file under the same name.
  review_df_final.to_csv(f"{dataset_name}.csv",encoding="utf-8",index=False)
  print("Uploading dataset...")
  pb_dataset = pb.datasets.from_file(f"{dataset_name}.csv", name=dataset_name)

  # NOTE(review): the two lines below look like output pasted from an earlier run — confirm or remove.
  # Dataset Validation: True
  # One step FT Cost: 0.0078415 USD

# FIXME1: how to delete or update the data

Dataset not found, creating...
One step FT Cost: 0.0172775 USD
Uploading dataset...


In [12]:
# Create an adapter repository
# exists_ok=True is passed so that re-running this cell does not fail on an existing repo —
# presumably the existing repo is returned in that case; verify against the SDK.
repo_name = "review-classification-model_test1"
repo = pb.repos.create(name=repo_name, description="Review Classification Experiments", exists_ok=True)
print(repo)

uuid='58d29514-a663-4883-9083-f7504f27e8da' name='review-classification-model_test1' description='Review Classification Experiments'


In [13]:


# Start a fine-tuning job, blocks until training is finished
# (per the SDK's progress message, interrupting this call does not cancel the job itself).
# The result is stored in the repo created above and bound to `adapter` for the following cells.
adapter = pb.adapters.create(
    config=FinetuningConfig(
        base_model="solar-1-mini-chat-240612",
        epochs=3, # default: 3
        rank=16, # default: 16
    ),
    dataset=pb_dataset, # Also accepts the dataset name as a string
    repo=repo,
    description="initial model with defaults"
)

Successfully requested finetuning of solar-1-mini-chat-240612 as `review-classification-model_test1/1`. (Job UUID: 7f92f995-b033-4f0f-a00c-c95f2c91e565).

Watching progress of finetuning job 7f92f995-b033-4f0f-a00c-c95f2c91e565. This call will block until the job has finished. Canceling or terminating this call will NOT cancel or terminate the job itself.
Job is starting. Total queue time: 0:00:46         
Waiting to receive training metrics...
┌────────────┬────────────┬─────────────────┐
│ checkpoint [0m│ train_loss [0m│ validation_loss [0m│
├────────────┼────────────┼─────────────────┤
│     1      [0m│   0.0518   [0m│        --       [0m│
│     2      [0m│   1.0782   [0m│        --       [0m│
│     3      [0m│   0.0096   [0m│        --       [0m│
└────────────┴────────────┴─────────────────┘


In [14]:
# Display the adapter object returned by the fine-tuning call.
adapter

Adapter(repo='review-classification-model_test1', tag=1, archived=False, base_model='solar-1-mini-chat-240612', description='initial model with defaults', artifact_path='7f92f995-b033-4f0f-a00c-c95f2c91e565/8cff6e5bd6724b0091b4fff90a2e3f37/artifacts/model/model_weights', finetuning_error=None, finetuning_job_uuid='7f92f995-b033-4f0f-a00c-c95f2c91e565')

In [15]:
# Adapter identifier in "<repo>/<tag>" form; the lookup and generate calls below take this string.
adapter_id = f"{adapter.repo}/{adapter.tag}"
adapter_id

'review-classification-model_test1/1'

In [16]:
# Get adapter, blocking call if training is still in progress
# Rebinds `adapter` to the object fetched by id and displays it.
adapter = pb.adapters.get(adapter_id)
adapter

Adapter(repo='review-classification-model_test1', tag=1, archived=False, base_model='solar-1-mini-chat-240612', description='initial model with defaults', artifact_path='7f92f995-b033-4f0f-a00c-c95f2c91e565/8cff6e5bd6724b0091b4fff90a2e3f37/artifacts/model/model_weights', finetuning_error=None, finetuning_job_uuid='7f92f995-b033-4f0f-a00c-c95f2c91e565')

In [17]:
# Example prompt with a longer review, wrapped in <|im_start|>/<|im_end|> markers.
# NOTE(review): the next cell reassigns input_prompt before any later cell uses it, so in file order
# this value is never sent — confirm which prompt is intended.
input_prompt="""
<|im_start|>system\n다음은 해당 업체에 대한 소비자의 리뷰입니다. 해당 리뷰를 positive, neutral, negative 중 하나로 분류하세요.<|im_end|>
<|im_start|>review\n 아이가 가장 좋아하는 맛집이라 공항가기전에 들렀어요. 여기 고기국수는 쿰쿰한 냄새가 없어서 좋아요. 김치도 진짜 최고구요ㅠㅠ 돔베고기 짱짱맛있어요... 아이랑 제주여행 간다면 무조건 필수코스로 들릴예정입니다😋😘🫶🏻<|im_end|>
<|im_start|>classification
"""


In [20]:
# One-word review in the same prompt layout; when cells run in file order this is the value
# the generate calls below receive.
input_prompt="""
<|im_start|>system\n다음은 해당 업체에 대한 소비자의 리뷰입니다. 해당 리뷰를 positive, neutral, negative 중 하나로 분류하세요.<|im_end|>
<|im_start|>review\n 굿<|im_end|>
<|im_start|>classification
"""

In [18]:
# Client for the solar-1-mini-chat-240612 deployment; the fine-tuned adapter is selected per request via adapter_id.
# NOTE(review): max_new_tokens=1000 is far above what a one-word label needs (the REST call below uses 20) — confirm.
lorax_client = pb.deployments.client("solar-1-mini-chat-240612")
print(lorax_client.generate(input_prompt, adapter_id=adapter_id, max_new_tokens=1000).generated_text)

positive


In [21]:
# REST API test: send the same prompt to the deployment's HTTP generate endpoint
# (the equivalent of a curl call) instead of going through the SDK client.
import requests
import json

# NOTE(review): the path segment after the host looks tenant-specific and is hard-coded — confirm before sharing.
url = "https://serving.app.predibase.com/7ea6d0/deployments/v2/llms/solar-1-mini-chat-240612/generate"


payload = {
    "inputs": input_prompt,
    "parameters": {
        "adapter_id": adapter_id,
        "adapter_source": "pbase",
        "max_new_tokens": 20,
        "temperature": 0.1
    }
}

headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {api_token}"
}

response = requests.post(url, data=json.dumps(payload), headers=headers)
# Surface HTTP errors (bad token, unknown adapter, ...) here rather than as a confusing failure below.
response.raise_for_status()

# Parse the body as JSON. eval() would execute whatever text the server returned
# and also breaks on JSON literals such as true/false/null.
print(response.json()["generated_text"])

{"generated_text":"positive"}


In [24]:
# Parse the response body as JSON; eval() on remote text would execute it as Python code.
response.json()["generated_text"]

'positive'

In [22]:
# Download adapter
# The archive is written to the working directory and named after the adapter tag (e.g. "1.zip" for tag 1).
pb.adapters.download(adapter_id, dest=f"{adapter.tag}.zip")

Downloading adapter review-classification-model_test1/1 as 1.zip...
Done!
