In [2]:
import warnings

warnings.filterwarnings("ignore")

In [3]:
from predibase import Predibase, FinetuningConfig, DeploymentConfig

import os

# Get a KEY from https://app.predibase.com/ and export it as PREDIBASE_API_TOKEN
# so the secret never has to be written into (or committed with) the notebook.
# The 'API_KEY' placeholder remains the fallback when the variable is unset.
api_token = os.environ.get("PREDIBASE_API_TOKEN", 'API_KEY')
pb = Predibase(api_token=api_token)

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [6]:
import csv
import sys
from tokenizers import Tokenizer

# Optionally increase the CSV field size limit to handle large fields.
# The call is currently disabled (left commented out):
# csv.field_size_limit(sys.maxsize)

# Tokenizer used by compute_cost() to count tokens for the cost estimate.
tokenizer = Tokenizer.from_pretrained("upstage/solar-1-mini-tokenizer")

def validate_data_csv(csv_file_name):
    """Check that a training CSV has the required columns and values.

    Args:
        csv_file_name: Path to a UTF-8 CSV file with a header row.

    Returns:
        True when the header contains 'prompt', 'completion' and 'split' and
        every data row has a non-empty value for each of them. A file with a
        valid header and no data rows also returns True.

    Raises:
        AssertionError: If a required column is absent from the header
            (including a completely empty file), or a row has an empty or
            missing value in a required column. The error is raised
            explicitly rather than with ``assert`` so the validation still
            runs when Python is started with ``-O``.
    """
    required_columns = ('prompt', 'completion', 'split')
    # newline='' is what the csv module asks for, so quoted fields that
    # contain line breaks are parsed correctly.
    with open(csv_file_name, 'r', newline='', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        # fieldnames is None for an empty file; treat that as "no columns".
        header = reader.fieldnames or []
        absent = [name for name in required_columns if name not in header]
        if absent:
            raise AssertionError(f"Missing column(s): {', '.join(absent)}")
        for row in reader:
            for name in required_columns:
                # Short rows are padded with None by DictReader, so this
                # catches both empty strings and absent trailing fields.
                if not row[name]:
                    raise AssertionError(
                        f"Missing {name} (line {reader.line_num})"
                    )
    return True

def compute_cost(csv_file_name, price_per_million_tokens=0.5, encoder=None):
    """Estimate the cost of a dataset from its token count.

    For every row, the 'completion' and 'prompt' values are joined with a
    single space, tokenized, and the number of tokens is added to a running
    total.

    Args:
        csv_file_name: Path to a UTF-8 CSV file with 'prompt' and
            'completion' columns.
        price_per_million_tokens: Price charged per one million tokens.
        encoder: Optional object with an ``encode(text)`` method whose result
            exposes a ``tokens`` sequence. Defaults to the module-level
            ``tokenizer``.

    Returns:
        The estimated cost: total tokens / 1,000,000 * price_per_million_tokens.

    Raises:
        KeyError: If the 'prompt' or 'completion' column is absent.
    """
    if encoder is None:
        encoder = tokenizer

    total_num_of_tokens = 0
    # newline='' is what the csv module asks for, so quoted fields that
    # contain line breaks are parsed correctly.
    with open(csv_file_name, 'r', newline='', encoding='utf-8') as f:
        # Rows are streamed; there is no need to hold every text in memory.
        for row in csv.DictReader(f):
            # DictReader pads short rows with None; count those as empty text
            # instead of failing on string concatenation.
            completion = row['completion'] or ""
            prompt = row['prompt'] or ""
            enc = encoder.encode(completion + " " + prompt)
            total_num_of_tokens += len(enc.tokens)

    return total_num_of_tokens / 1000000 * price_per_million_tokens

# Path to your CSV file
# Local training file and the name under which it is registered as a dataset.
csv_file_name = "jeju_itinerary_with_questions_final_clean.csv"
dataset_name = "jeju_itinerary_with_questions"

try:
    # Reuse the dataset if a lookup by name succeeds.
    pb_dataset = pb.datasets.get(dataset_name)
    print(f"Dataset found: {pb_dataset}")
except RuntimeError:
    # A RuntimeError from the lookup is treated as "dataset does not exist yet".
    print("Dataset not found, creating...")

    # Step 1: validate the CSV (prints "Dataset Validation: True" on success).
    validation_result = validate_data_csv(csv_file_name)
    print(f"Dataset Validation: {validation_result}")

    # Step 2: report the estimated cost, derived from the token count.
    estimated_cost = compute_cost(csv_file_name)
    print(f"Estimated Cost: {estimated_cost} USD")

    # Step 3: upload the file under the chosen dataset name.
    print("Uploading dataset...")
    pb_dataset = pb.datasets.from_file(csv_file_name, name=dataset_name)

# FIXME(1): find out how to delete or update a dataset that has already been uploaded.

Dataset found: uuid='595e23f4-5757-4b62-84bd-54bfdd514bc8' name='jeju_itinerary_with_questions' connection_type='file' connection_name='file_uploads' status='connected'


In [7]:
# Adapter repository for this project.
# NOTE(review): exists_ok=True presumably reuses an existing repo of the same name — confirm.
repo_name = "jeju-itinerary-question-generator"
repo = pb.repos.create(
    name=repo_name,
    description="Jeju Itinerary Question Generator",
    exists_ok=True,
)
print(repo)

uuid='fa6337c5-8dc5-4054-8025-09c576d90fef' name='jeju-itinerary-question-generator' description='Jeju Itinerary Question Generator'


In [8]:
# Settings for the fine-tuning run: base model, number of epochs, adapter rank.
finetuning_config = FinetuningConfig(
    base_model="solar-1-mini-chat-240612",
    epochs=2,
    rank=1,
)

# Request the fine-tuning job on the uploaded dataset and store the result in
# the repository. Per the recorded output, this call blocks while it watches
# the job's progress.
adapter = pb.adapters.create(
    config=finetuning_config,
    dataset=pb_dataset,
    repo=repo,
    description="Fine-tuning model for generating questions based on Jeju itinerary",
)

Successfully requested finetuning of solar-1-mini-chat-240612 as `jeju-itinerary-question-generator/3`. (Job UUID: 0219fe35-a117-42fa-82a4-8145d7205653).

Watching progress of finetuning job 0219fe35-a117-42fa-82a4-8145d7205653. This call will block until the job has finished. Canceling or terminating this call will NOT cancel or terminate the job itself.

Job is starting. Total queue time: 0:02:41                  
Waiting to receive training metrics...

┌────────────┬────────────┬─────────────────┐
│ checkpoint │ train_loss │ validation_loss │
├────────────┼────────────┼─────────────────┤
│     1      │   1.9278   │        --       │
│     2      │   1.4703   │        --       │
└────────────┴────────────┴─────────────────┘


In [9]:
# Display the adapter object returned by the fine-tuning request.
adapter

Adapter(repo='jeju-itinerary-question-generator', tag=3, archived=False, base_model='solar-1-mini-chat-240612', description='Fine-tuning model for generating questions based on Jeju itinerary', artifact_path='0219fe35-a117-42fa-82a4-8145d7205653/1e6d3fa804dc41d79de71b20e4b8ef9c/artifacts/model/model_weights', finetuning_error=None, finetuning_job_uuid='0219fe35-a117-42fa-82a4-8145d7205653')

In [10]:
# Build the adapter identifier in "<repo>/<tag>" form and display it.
adapter_id = f"{adapter.repo}/{adapter.tag}"
adapter_id

'jeju-itinerary-question-generator/3'

In [11]:
# Fetch the adapter by its "<repo>/<tag>" identifier and display it.
# This is a blocking call if training is still in progress.
adapter = pb.adapters.get(adapter_id)
adapter

Adapter(repo='jeju-itinerary-question-generator', tag=3, archived=False, base_model='solar-1-mini-chat-240612', description='Fine-tuning model for generating questions based on Jeju itinerary', artifact_path='0219fe35-a117-42fa-82a4-8145d7205653/1e6d3fa804dc41d79de71b20e4b8ef9c/artifacts/model/model_weights', finetuning_error=None, finetuning_job_uuid='0219fe35-a117-42fa-82a4-8145d7205653')

In [6]:
# Example prompt with a system instruction, a user request, and a trailing
# "itinerary" line for the model to continue from.
input_prompt = """
system\nYou are a travel agent who specializes in creating personalized itineraries for Jeju Island. Based on the user's question, generate a detailed itinerary that matches the specified trip duration and interests. Ensure that the itinerary is well-structured and offers a variety of activities and attractions on Jeju Island.
user\nI am planning a 3-day trip to Jeju Island. Can you recommend an itinerary that includes nature hikes and local food experiences?
itinerary
"""

# NOTE(review): hard-coded instead of reusing the id computed from the adapter
# object — presumably so this cell can be run on its own; confirm.
adapter_id = "jeju-itinerary-question-generator/3"

# Client for the base-model deployment; the fine-tuned adapter is chosen per
# request through adapter_id.
lorax_client = pb.deployments.client("solar-1-mini-chat-240612")
response = lorax_client.generate(input_prompt, adapter_id=adapter_id, max_new_tokens=1000)
print(response.generated_text)

Day 1:

1. Arrival at Jeju International Airport
2. Rent a car (optional)
3. Visit Manjanggul Cave, a UNESCO World Heritage Site and one of the largest lava tubes in the world.
4. Have lunch at a local restaurant, try the Jeju black pork buns.
5. Explore the Seongsan Ilchulbong, a UNESCO World Heritage Site and a tuff cone formed by underwater volcanic activity.
6. Dinner at a local restaurant, try the Jeju abalone stew.

Day 2:

1. Start the day with a visit to the Jeju Folk Village Museum to learn about traditional Jeju life.
2. Have lunch at a local restaurant, try the Jeju octopus hot pot.
3. Hike Hallasan National Park, the highest mountain in South Korea and the center of Jeju Island.
4. Dinner at a local restaurant, try the Jeju grilled octopus.

Day 3:

1. Visit the Jeju Arboretum, a botanical garden with over 1,000 species of trees and plants.
2. Have lunch at a local restaurant, try the Jeju seafood pancake.
3. Explore the Jeju Museum of Art and History, which showcases the i

In [13]:
# Download the adapter to a local file named "<tag>.zip" (the recorded run wrote 3.zip).
pb.adapters.download(adapter_id, dest=f"{adapter.tag}.zip")

Downloading adapter jeju-itinerary-question-generator/3 as 3.zip...
Done!
