In [1]:
import warnings

warnings.filterwarnings("ignore")

In [2]:
from predibase import Predibase, FinetuningConfig, DeploymentConfig
from dotenv import load_dotenv
import os
import pandas as pd

# Load environment variables from the .env file
load_dotenv()

# Get a KEY from https://app.predibase.com/
# os.getenv returns None when the variable is unset; default to "" and validate
# so a missing key fails here with a clear message instead of reaching the SDK
# client or being formatted into an Authorization header as "Bearer None".
api_token: str = os.getenv('PREDIBASE_API_KEY', '')
if not api_token:
    raise RuntimeError(
        "PREDIBASE_API_KEY is not set; add it to the environment or to the .env file."
    )
pb = Predibase(api_token=api_token)

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [3]:

def review_data_to_dataframe(review_df, max=-1):
    """Convert labelled reviews into prompt/completion rows for fine-tuning.

    The rows are shuffled with a fixed seed (42) so the selection is
    reproducible, optionally truncated, and then rendered through the prompt
    template.

    Args:
        review_df: DataFrame providing a ``content`` column (inserted into the
            prompt) and a ``human_label`` column (used as the completion).
        max: Number of rows to keep after shuffling; ``-1`` (the default)
            keeps every row. The name shadows the builtin inside this function
            but is kept so existing keyword callers continue to work.

    Returns:
        DataFrame with the columns ``prompt``, ``completion`` and ``split``
        (always ``"train"``). The columns are present even when no rows are
        produced, so downstream column access and CSV export keep working.
    """
    template = {
        "prompt": """system\nÎã§ÏùåÏùÄ Ìï¥Îãπ ÏóÖÏ≤¥Ïóê ÎåÄÌïú ÏÜåÎπÑÏûêÏùò Î¶¨Î∑∞ÏûÖÎãàÎã§. Ìï¥Îãπ Î¶¨Î∑∞Î•º positive, neutral, negative Ï§ë ÌïòÎÇòÎ°ú Î∂ÑÎ•òÌïòÏÑ∏Ïöî.\nreview\n {content}\nclassification\n""",
        "completion": "{label}",
        "split": "train"
    }
    # Fix the output schema up front so an empty result still has its columns.
    columns = list(template)

    # Shuffle the DataFrame
    shuffled_df = review_df.sample(frac=1, random_state=42).reset_index(drop=True)

    # Limit to max rows if specified
    if max != -1:
        shuffled_df = shuffled_df.head(max)

    # Build all records first and create the frame once.
    rows = [
        {
            "prompt": template["prompt"].format(content=content),
            "completion": template["completion"].format(label=label),
            "split": template["split"],
        }
        for content, label in zip(shuffled_df["content"], shuffled_df["human_label"])
    ]

    return pd.DataFrame(rows, columns=columns)

In [4]:
from tokenizers import Tokenizer
# Module-level tokenizer loaded by name; compute_cost_from_dataframe reads it as a global.
tokenizer = Tokenizer.from_pretrained("upstage/solar-1-mini-tokenizer")

def compute_cost_from_dataframe(df, price_per_million_tokens=0.5):
    """Estimate the cost of a dataset from its token count.

    Every row contributes the number of tokens in
    ``completion + " " + prompt`` as produced by the module-level
    ``tokenizer``.

    Args:
        df: DataFrame with ``completion`` and ``prompt`` columns.
        price_per_million_tokens: Price applied per 1,000,000 tokens.

    Returns:
        ``total_tokens / 1_000_000 * price_per_million_tokens``; ``0.0`` for a
        frame without rows.
    """
    # Pair the columns directly instead of ``df.apply(..., axis=1)``: on a
    # zero-row frame ``apply`` returns a DataFrame rather than a Series, and
    # iterating that yields the column labels, which would then be tokenized
    # and counted as if they were data.
    texts = (
        completion + " " + prompt
        for completion, prompt in zip(df["completion"], df["prompt"])
    )

    total_num_of_tokens = sum(len(tokenizer.encode(text).tokens) for text in texts)

    return total_num_of_tokens / 1000000 * price_per_million_tokens

In [5]:
def load_review_data(path):
    """Read the Excel file at ``path`` and return it as a DataFrame."""
    return pd.read_excel(path)

In [6]:
# Load the labelled review spreadsheet and preview its first five rows.
review_df = load_review_data("review_label_df.xlsx")
review_df.head(5)

Unnamed: 0,nickname,content,date,visit_cnt,solar_label,solar_label_eval,human_label
0,Ïö∞ÌûàÌûàÎ∞ç„Öá,Î™ÖÎ∂àÌóàÏ†Ñ,8.9.Í∏à,1Î≤àÏß∏ Î∞©Î¨∏,"{""sentiment"": ""positive""}",positive,positive
1,Ïù¥ÏïºÍ∏∞ ÌÉêÏ†ï,Ï†úÏ£ºÏóêÏÑú Ïú†Î™ÖÌïú ÍπÄÎ∞•ÎßõÏßë Ïò§ÎäîÏ†ïÍπÄÎ∞•Ïóê Í∞îÏñ¥Ïöî.üíïÏòàÏïΩÏùÑ ÌïòÎ†•Í≥† Ï†ÑÌôîÎ•º ÌïòÎäîÎç∞ ÌÜµÌôî Ï§ë...,8.9.Í∏à,1Î≤àÏß∏ Î∞©Î¨∏,"{""sentiment"": ""positive""}",positive,positive
2,jennyscampus,"Ï†ú Í∞úÏù∏Ï†ÅÏúºÎ°úÎäî ÏßúÍ≥†, ÎäêÎÅºÌñàÎäîÎç∞ ÏπúÏñ∏ÎãàÎäî ÏûÖÎßõÏóê ÎßûÎã§Í≥† ÌïòÎÑ§Ïöî~^^",8.8.Î™©,1Î≤àÏß∏ Î∞©Î¨∏,"{""sentiment"": ""negative""}",negative,neutral
3,Ïó∞Ìù¨6440,Ïñ¥Î†µÎã§Ïñ¥Î†§Ïõå ÌûòÎì§Í≤åÎìúÎîîÏó¨ ÍπÄÎ∞• ÏòÅÏ†ë.ÎãπÏùº ÏòàÏïΩÎßå Í∞ÄÎä•ÌïúÏ§Ñ ÏïåÏïòÎäîÎç∞Ï†ÑÎÇ† ÏòàÏïΩÎèÑ Í∞ÄÎä•Ìïò...,8.8.Î™©,1Î≤àÏß∏ Î∞©Î¨∏,"{""sentiment"": ""negative""}",negative,negative
4,ÎÇôÏõêÏª¥ÌçºÎãà,Ï†úÏ£º ÎßõÏßë Î∏îÎ°úÍ±∞ ÎîîÌè¨ÏûÖÎãàÎã§Í∞úÏù∏Ï†ÅÏúºÎ°ú ÏòàÏ†ÑÎ≥¥Îã® ÎßõÏù¥ ÎçúÌïòÏßÄÎßåÏó¨Ï†ÑÌûà ÎßéÏùÄÎ∂ÑÎì§Íªò ÏÇ¨ÎûëÎ∞õ...,8.8.Î™©,1Î≤àÏß∏ Î∞©Î¨∏,"{""sentiment"": ""neutral""}",neutral,neutral


In [7]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows as a test set; the fixed random_state makes the split repeatable.
train_df, test_df = train_test_split(review_df, test_size=0.1, random_state=42)

In [8]:
# Renumber the training rows from zero, then report the row count and show a sample.
train_df = train_df.reset_index(drop=True)
print(train_df.shape[0])
train_df.head(3)

5373


Unnamed: 0,nickname,content,date,visit_cnt,solar_label,solar_label_eval,human_label
0,ÌîÑÎ¶∞ÏÑ∏Ïä§BIN,"Î≤†Ïä§Ìä∏ Ïù¥Î∂àÎπµÏù¥ÏòàÏöî. ÎßòÎ™®Ïä§Í∞ôÏùÄÎç∞ Ìå•,ÌÅ¨Î¶º,Ìå• Í≤πÍ≤πÏù¥ Îì§Ïñ¥ÏûàÍ≥† ÌÅ¨Í∏∞ÎèÑ Ïª§Ïöî.ÏöîÎü∞Ï¢ÖÎ•ò...",1.1.Ïõî,1Î≤àÏß∏ Î∞©Î¨∏,"{""sentiment"": ""positive""}",positive,positive
1,A Sun Shower,ÎßàÎÜçÎ∞îÍ≤åÌä∏ Î™áÍ∞úÏî© ÏÇ¨Í∞ÄÎäî Ïù¥Ïú†Í∞Ä ÏûàÎÑ§ÏöîÌïúÍ∞úÍµ¨ÏûÖÌïúÍ∞ú ÏïÑÏâΩÎÑ§Ïöî ÎßàÎäòÎπµ Ï¢ãÏïÑÌïòÏãúÎäîÎ∂ÑÎì§ÏùÄ ...,7.23.Ìôî,1Î≤àÏß∏ Î∞©Î¨∏,"{""sentiment"": ""positive""}",positive,positive
2,ÎùºÍΩÅÎπÑÏóêÎú®,ÎÑò Í∑ÄÏó¨Ïö¥ ÎèÑÎÑõÏßëüíó,1.25.Î™©,3Î≤àÏß∏ Î∞©Î¨∏,"{""sentiment"": ""positive""}",positive,positive


In [9]:
# Renumber the held-out rows from zero, then report the row count and show a sample.
test_df = test_df.reset_index(drop=True)
print(test_df.shape[0])
test_df.head(3)

597


Unnamed: 0,nickname,content,date,visit_cnt,solar_label,solar_label_eval,human_label
0,ÏÇ¨ÎòêÎùºÎ†à84,Îπµ ÎßõÏßë Î∑∞ ÎßõÏßëÏûÖÎãàÎã§ÌäπÌûà 3Ï∏µÏù¥ Ï†ÑÎßù ÏòàÏà†Ïù¥ÎÑ§ÏöîÎπ®Î¶¨Ïò§ÏÖîÏïºÌï†ÎìØ 6ÏãúÍπåÏßÄÎûòÏöî „Öã„ÖãÏûòÏâ¨...,7.6.ÌÜ†,1Î≤àÏß∏ Î∞©Î¨∏,"{\n ""sentiment"": ""positive""\n}",positive,positive
1,Î∂ÄÏÇ∞61,Ïó≠Ïãú ÎßõÏù¥ Ïß±Ïù¥ÏóêÏöî,7.22.Ïõî,1Î≤àÏß∏ Î∞©Î¨∏,"{""sentiment"": ""positive""}",positive,positive
2,ÏÇ¨ÎûëÌï¥ÎÉ†ÎÉ†,ÎÑìÍ≥† ÏæåÏ†ÅÌï¥Ïöî Ïò§Ï†ÑÏù¥Îùº ÎπµÏ¢ÖÎ•òÎèÑ ÎßéÍ≥† ÎÑâÎÑâÌïòÎÑ§ÏöîÏÜåÍ∏àÎπµ Ï†ïÎßê ÎÑàÎ¨¥ ÎßõÎÇ® „Öã„ÖãÏª§ÌîºÎèÑ Ïã∏...,7.30.Ìôî,5Î≤àÏß∏ Î∞©Î¨∏,"{""sentiment"": ""positive""}",positive,positive


In [11]:
# NOTE(review): `time` is not used anywhere in this cell as shown.
import time

# Reuse the dataset when it can be fetched by name; otherwise build it from the
# training split, write it to a CSV file and upload that file.
dataset_name = "jeju_review_240817"
try:
  pb_dataset = pb.datasets.get(dataset_name)
  print(f"Dataset found: {pb_dataset}")
except RuntimeError:
  # NOTE(review): assumes pb.datasets.get raises RuntimeError for a missing
  # dataset — confirm against the SDK; any other exception type propagates.
  print("Dataset not found, creating...")

  # Shuffle the training split and keep at most 500 prompt/completion rows.
  review_df_final = review_data_to_dataframe(train_df, max=500)

  print(f"One step FT Cost: {compute_cost_from_dataframe(review_df_final)} USD")
  # The CSV is written next to the notebook as "<dataset_name>.csv" and then uploaded.
  review_df_final.to_csv(f"{dataset_name}.csv",encoding="utf-8",index=False)
  print("Uploading dataset...")
  pb_dataset = pb.datasets.from_file(f"{dataset_name}.csv", name=dataset_name)

  # Author's notes (the cost differs from the output recorded for this cell):
  # Dataset Validation: True
  # One step FT Cost: 0.0078415 USD

# FIXME1: how to delete or update the data

Dataset not found, creating...
One step FT Cost: 0.0172775 USD
Uploading daatset...


In [12]:
# Create an adapter repository
# exists_ok=True is passed so the call is safe to repeat — presumably it returns
# the existing repository instead of failing; confirm against the SDK.
repo_name = "review-classification-model_test1"
repo = pb.repos.create(name=repo_name, description="Review Classification Experiments", exists_ok=True)
print(repo)

uuid='58d29514-a663-4883-9083-f7504f27e8da' name='review-classification-model_test1' description='Review Classification Experiments'


In [13]:


# Start a fine-tuning job, blocks until training is finished
# NOTE(review): every run of this cell requests a new fine-tuning job —
# presumably producing a new adapter tag in the repo; consider guarding it.
adapter = pb.adapters.create(
    config=FinetuningConfig(
        base_model="solar-1-mini-chat-240612",
        epochs=3, # default: 3
        rank=16, # default: 16
    ),
    dataset=pb_dataset, # Also accepts the dataset name as a string
    repo=repo,
    description="initial model with defaults"
)

Successfully requested finetuning of solar-1-mini-chat-240612 as `review-classification-model_test1/1`. (Job UUID: 7f92f995-b033-4f0f-a00c-c95f2c91e565).

Watching progress of finetuning job 7f92f995-b033-4f0f-a00c-c95f2c91e565. This call will block until the job has finished. Canceling or terminating this call will NOT cancel or terminate the job itself.
Job is starting. Total queue time: 0:00:46         
Waiting to receive training metrics...
‚îå‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îê
‚îÇ checkpoint [0m‚îÇ train_loss [0m‚îÇ validation_loss [0m‚îÇ
‚îú‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îº‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îº‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î§
‚îÇ     1      [0m‚îÇ   0.0518   [0m‚îÇ        --       [0m‚îÇ
‚îÇ     2      [0m‚îÇ   1.0782   [0m‚îÇ        --       [0m‚îÇ
‚îÇ     3      [0m‚îÇ   0.0096   [0m‚îÇ        --       [0m‚îÇ
‚îî‚îÄ‚îÄ‚îÄ‚îÄ

In [14]:
# Display the adapter object returned by the fine-tuning call.
adapter

Adapter(repo='review-classification-model_test1', tag=1, archived=False, base_model='solar-1-mini-chat-240612', description='initial model with defaults', artifact_path='7f92f995-b033-4f0f-a00c-c95f2c91e565/8cff6e5bd6724b0091b4fff90a2e3f37/artifacts/model/model_weights', finetuning_error=None, finetuning_job_uuid='7f92f995-b033-4f0f-a00c-c95f2c91e565')

In [15]:
# Adapter id in "<repo>/<tag>" form, the form used by the inference calls.
adapter_id = f"{adapter.repo}/{adapter.tag}"
adapter_id

'review-classification-model_test1/1'

In [16]:
# Get adapter, blocking call if training is still in progress
# Re-fetches the adapter by its "<repo>/<tag>" id and displays the result.
adapter = pb.adapters.get(adapter_id)
adapter

Adapter(repo='review-classification-model_test1', tag=1, archived=False, base_model='solar-1-mini-chat-240612', description='initial model with defaults', artifact_path='7f92f995-b033-4f0f-a00c-c95f2c91e565/8cff6e5bd6724b0091b4fff90a2e3f37/artifacts/model/model_weights', finetuning_error=None, finetuning_job_uuid='7f92f995-b033-4f0f-a00c-c95f2c91e565')

In [17]:
# Test prompt with one review, laid out as system / review / classification
# sections wrapped in <|im_start|> ... <|im_end|> markers.
# NOTE(review): the training template in review_data_to_dataframe, as shown in
# this file, has no <|im_start|>/<|im_end|> markers — confirm that the training
# and inference prompt formats are meant to differ.
input_prompt="""
<|im_start|>system\nÎã§ÏùåÏùÄ Ìï¥Îãπ ÏóÖÏ≤¥Ïóê ÎåÄÌïú ÏÜåÎπÑÏûêÏùò Î¶¨Î∑∞ÏûÖÎãàÎã§. Ìï¥Îãπ Î¶¨Î∑∞Î•º positive, neutral, negative Ï§ë ÌïòÎÇòÎ°ú Î∂ÑÎ•òÌïòÏÑ∏Ïöî.<|im_end|>
<|im_start|>review\n ÏïÑÏù¥Í∞Ä Í∞ÄÏû• Ï¢ãÏïÑÌïòÎäî ÎßõÏßëÏù¥Îùº Í≥µÌï≠Í∞ÄÍ∏∞Ï†ÑÏóê Îì§Î†ÄÏñ¥Ïöî. Ïó¨Í∏∞ Í≥†Í∏∞Íµ≠ÏàòÎäî Ïø∞Ïø∞Ìïú ÎÉÑÏÉàÍ∞Ä ÏóÜÏñ¥ÏÑú Ï¢ãÏïÑÏöî. ÍπÄÏπòÎèÑ ÏßÑÏßú ÏµúÍ≥†Íµ¨Ïöî„Ö†„Ö† ÎèîÎ≤†Í≥†Í∏∞ Ïß±Ïß±ÎßõÏûàÏñ¥Ïöî... ÏïÑÏù¥Îûë Ï†úÏ£ºÏó¨Ìñâ Í∞ÑÎã§Î©¥ Î¨¥Ï°∞Í±¥ ÌïÑÏàòÏΩîÏä§Î°ú Îì§Î¶¥ÏòàÏ†ïÏûÖÎãàÎã§üòãüòòü´∂üèª<|im_end|>
<|im_start|>classification
"""


In [20]:
# Second test case: rebinds input_prompt to the same instruction with a much
# shorter review, replacing the value assigned in the earlier prompt cell.
input_prompt="""
<|im_start|>system\nÎã§ÏùåÏùÄ Ìï¥Îãπ ÏóÖÏ≤¥Ïóê ÎåÄÌïú ÏÜåÎπÑÏûêÏùò Î¶¨Î∑∞ÏûÖÎãàÎã§. Ìï¥Îãπ Î¶¨Î∑∞Î•º positive, neutral, negative Ï§ë ÌïòÎÇòÎ°ú Î∂ÑÎ•òÌïòÏÑ∏Ïöî.<|im_end|>
<|im_start|>review\n Íµø<|im_end|>
<|im_start|>classification
"""

In [18]:
# Get a client for the base-model deployment and generate with the fine-tuned
# adapter applied, printing only the generated text.
# NOTE(review): max_new_tokens=1000 is far above the 20 used by the raw HTTP
# request later in this notebook — confirm that a one-word label needs this much.
lorax_client = pb.deployments.client("solar-1-mini-chat-240612")
print(lorax_client.generate(input_prompt, adapter_id=adapter_id, max_new_tokens=1000).generated_text)

positive


In [21]:
# CURL test
# Calls the deployment's REST "generate" endpoint directly, without the SDK client.
import requests
import json

url = "https://serving.app.predibase.com/7ea6d0/deployments/v2/llms/solar-1-mini-chat-240612/generate"


payload = {
    "inputs": input_prompt,
    "parameters": {
        "adapter_id": adapter_id,
        "adapter_source": "pbase",
        "max_new_tokens": 20,
        "temperature": 0.1
    }
}

headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {api_token}"
}

# Explicit timeout so a stalled connection cannot hang the notebook indefinitely.
response = requests.post(url, data=json.dumps(payload), headers=headers, timeout=60)
# Surface HTTP errors directly instead of failing later on an error body
# that has no "generated_text" key.
response.raise_for_status()

# Parse the body as JSON rather than eval(): eval executes whatever text the
# server returns and also breaks on JSON literals such as true/false/null.
print(response.json()["generated_text"])

{"generated_text":"positive"}


In [24]:
# Parse the JSON body instead of eval()-ing server-provided text.
response.json()["generated_text"]

'positive'

In [22]:
# Download adapter
# The destination is the relative path "<tag>.zip", built from adapter.tag.
pb.adapters.download(adapter_id, dest=f"{adapter.tag}.zip")

Downloading adapter review-classification-model_test1/1 as 1.zip...
Done!
