# Dataset Preprocessing and Upload to AWS S3

**1-Loading your dataset (tsv)**

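Prepare your dataset as a TSV (tab-separated values) file. A TSV is essentially a plain-text file with a different extension, and it is a common format for fine-tuning datasets. Put one sample per row: the loader below reads the file without a header row and uses only the first column.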

**Important note**: Each sample can contain any kind of text, but it must follow a fixed pattern that differentiates the instruction from the response. We use "\<s>[INST]" to mark the beginning of the instruction and "[/INST]" to mark both the end of the instruction and the beginning of the response. Finally, "\</s>" marks the end of the sample.

**Example**: \<s>[INST] How are you today? [/INST] I am good, thank you! \</s>

In [1]:
import pandas as pd

# Read the first column of a headerless TSV file
def load_tsv_to_array(file_path):
    """
    Load the first column of a tab-separated file.

    The file is parsed without a header row, so its first line is treated
    as data. Only column 0 is returned; any additional tab-separated
    columns are ignored.

    Args:
        file_path (str): Path of the TSV file to read.

    Returns:
        numpy.ndarray: The values of the first column, one entry per row.
    """
    frame = pd.read_csv(file_path, header=None, sep='\t')
    first_column = frame[0]
    return first_column.values

# Read the three dataset splits from the working directory
train_set = load_tsv_to_array('./train_set.tsv')
validation_set = load_tsv_to_array('./validation_set.tsv')
test_set_evaluation = load_tsv_to_array('./test_set.tsv')

# Sanity check: show the first sample and the total sample count of each split
for caption, loaded_split in (
    ("Train set loaded: ", train_set),
    ("Validation set loaded: ", validation_set),
    ("Test set loaded: ", test_set_evaluation),
):
    print(caption, loaded_split[:1])
    print("Length: " + str(len(loaded_split)))

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


Train set loaded:  ["<s>[INST] F/35 Para 0-0-0-0 HTN/DM(-/-) # Latent tuberculosis Rifampin 4개월 복용중 11. 말 피임약 복용하다가 깜빡 하여 복용 중단. 다음번 생리 주기에 맞추어 다시 복용 하려고 하였음. 2019-12-03 LMP. 경황이 없어 피임약 복용하지 못하였음. 4일 후 생리 마침. 경구 피임약 복용시작 하지 못하여 다시 다음번 주기에 맞춰 복용시작하려고 복용하지 않음 이후 더이상 생리 없음 2019-02-01 임신 테스트기 사용하였고 양성. 2019-02-02 산부인과 진료 아기집 추정되는 sac 확인되며 주위에 피가고여 있다고 들었으며 일주일 뒤 f/u하기로 함/ 2019-02-09 Low abdominal pain 발생하였고 지속. 심하지 않아 경과관찰. 이 시기부터 갈색 vaginal bleeding발생. 하루 pad 2개 정도 사용 abdominal pain, vaginal bleeding 지속 2019-02-12 LLQ area abdominal pain 악화됨. 산부인과 진료. sac 은 안보이나 자궁 벽은 임신시 변화 소견이라고 들었음. 혈액검사 beta HCG 3000 abdominal pain지속시 응급실 내원하라고 들었고 abdominal pain 지속되어 응급실 내원함. 복부수술(-) [/INST] {'SOCIAL_DETERMINANTS': [{'type': 'F/35', 'start_offset': 0, 'end_offset': 4}, {'type': 'Para 0-0-0-0', 'start_offset': 5, 'end_offset': 17}, {'type': '2019-12-03 LMP', 'start_offset': 128, 'end_offset': 142}, {'type': '임신', 'start_offset': 251, 'end_offset': 253}], 'DIAGNOSIS': [{'type': 'HTN/DM(-/-)', 'start_of

**2-Code for applying the desired schema**

Each LLM expects a different prompt schema, so your dataset must be adapted to it. This code takes the current dataset and applies the corresponding schema. (You do not need to modify this part.)

In [3]:
import pandas as pd
import re

# Convert one "<s>[INST] ... [/INST] ... </s>" sample to the selected schema
def reformat_sample(sample, formatter):
    """
    Split a sample into its instruction and response and re-render them.

    The sample must start with ``<s>[INST]``; the text up to ``[/INST]`` is
    the user text and the text from there up to the first ``</s>`` is the
    model text. Both parts are stripped of surrounding whitespace and may
    span several lines. Anything after the first ``</s>`` is ignored.

    Args:
        sample (str): The sample to be reformatted.
        formatter (function): Callable taking ``(user_text, model_text)``
            and returning the formatted string.

    Returns:
        str: The formatter's output, or the unchanged sample (after printing
        a warning) when the sample does not follow the expected pattern.
    """
    parsed = re.match(
        r'<s>\[INST\]\s*(.*?)\s*\[/INST\]\s*(.*?)\s*</s>',
        sample,
        re.DOTALL,  # let "." cross line breaks so multi-line texts match
    )

    # Best effort: a malformed sample is reported and passed through as-is
    if parsed is None:
        print(f"Warning: Sample format unexpected:\n{sample}\n")
        return sample

    user_text, model_text = (part.strip() for part in parsed.groups())
    return formatter(user_text, model_text)
    
# Apply one registered schema to every sample of a dataset
def reformat_dataset(dataset, schema_name):
    """
    Reformat all samples of a dataset with the formatter of a named schema.

    Args:
        dataset (list): Iterable of samples to be reformatted.
        schema_name (str): Key of the schema in SCHEMA_MAP.

    Returns:
        list: The reformatted samples, in the same order as the input.

    Raises:
        ValueError: If schema_name is not a key of SCHEMA_MAP.
    """
    schema_is_known = schema_name in SCHEMA_MAP
    if not schema_is_known:
        raise ValueError(f"Unknown schema: {schema_name}. Available schemas: {list(SCHEMA_MAP.keys())}")

    formatter = SCHEMA_MAP[schema_name]
    reformatted = []
    for sample in dataset:
        reformatted.append(reformat_sample(sample, formatter))
    return reformatted

# Write a sequence of samples to a headerless TSV file
def save_array_to_tsv(array, file_path):
    """
    Save an array as a tab-separated file, one element per row.

    Neither a header row nor the row index is written.

    Args:
        array: Array-like of samples accepted by pandas.DataFrame.
        file_path (str): Destination path of the TSV file.
    """
    pd.DataFrame(array).to_csv(file_path, header=False, index=False, sep='\t')


**3-Schema Definition**

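Here you can add your own schemas: define a formatting function that takes the user text and the model text, then register it in `SCHEMA_MAP` under the name you want to select it by.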

In [None]:
# Formatting schemas
def gemma_2_format(user_text, model_text):
    """
    Gemma 2 formatting scheme.

    Args:
        user_text (str): Instruction placed in the ``user`` turn.
        model_text (str): Response placed in the ``model`` turn.

    Returns:
        str: The user turn followed by the model turn, each closed with
        ``<end_of_turn>`` and separated by a single newline.
    """
    pieces = [
        "<start_of_turn>user\n",
        f"{user_text}<end_of_turn>\n",
        "<start_of_turn>model\n",
        f"{model_text}<end_of_turn>",
    ]
    return "".join(pieces)

def llama_3_1_format(user_text, model_text):
    """
    Llama 3.1 formatting scheme.

    Args:
        user_text (str): Instruction placed under the ``user`` header.
        model_text (str): Response placed under the ``assistant`` header.

    Returns:
        str: ``<|begin_of_text|>`` followed by the user turn and the
        assistant turn, each closed with ``<|eot_id|>``.
    """
    # NOTE(review): the "\n" after the user turn's <|eot_id|> does not appear
    # in Meta's published Llama 3.1 template - confirm that the prompts built
    # at inference time use the same separator before changing it.
    user_turn = (
        "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
        f"{user_text}<|eot_id|>\n"
    )
    assistant_turn = (
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
        f"{model_text}<|eot_id|>"
    )
    return user_turn + assistant_turn

# Registry of schema name -> formatting function; reformat_dataset looks the
# chosen schema up here, so every selectable schema needs an entry.
SCHEMA_MAP = {}
SCHEMA_MAP["gemma_2"] = gemma_2_format
SCHEMA_MAP["llama_3_1"] = llama_3_1_format
# Add more schemas here

**4-Format and verify the content**

In [5]:

# Example inputs (the real ones are already loaded in step 1 and should look similar to this)
# train_set = ["<s>[INST] User text 1 [/INST] Model text 1 </s>", "<s>[INST] User text 2 [/INST] Model text 2 </s>"]
# validation_set = ["<s>[INST] Validation user [/INST] Validation model </s>"]
# test_set_evaluation = ["<s>[INST] Test user [/INST] Test model </s>"]

# Schema applied to every split; must be a name defined in SCHEMA_MAP
# (for example "gemma_2" or "llama_3_1")
chosen_schema = "gemma_2"

# Reformat the three splits with the chosen schema
train_set_formatted = reformat_dataset(train_set, chosen_schema)
validation_set_formatted = reformat_dataset(validation_set, chosen_schema)
test_set_evaluation_formatted = reformat_dataset(test_set_evaluation, chosen_schema)

# One entry per split: (caption for the check, formatted samples, output file)
formatted_splits = (
    ("Train set formatted sample: ", train_set_formatted, f'./{chosen_schema}_train_set.tsv'),
    ("Validation set formatted sample: ", validation_set_formatted, f'./{chosen_schema}_validation_set.tsv'),
    ("Test set formatted sample: ", test_set_evaluation_formatted, f'./{chosen_schema}_test_set.tsv'),
)

# Sanity check: show the first formatted sample and the size of each split
for caption, formatted_samples, output_path in formatted_splits:
    print(caption, formatted_samples[:1])
    print("Length: " + str(len(formatted_samples)))

# (Optional) Save the reformatted datasets next to the notebook
for caption, formatted_samples, output_path in formatted_splits:
    save_array_to_tsv(formatted_samples, output_path)


Train set formatted sample:  ["<start_of_turn>user\nF/35 Para 0-0-0-0 HTN/DM(-/-) # Latent tuberculosis Rifampin 4개월 복용중 11. 말 피임약 복용하다가 깜빡 하여 복용 중단. 다음번 생리 주기에 맞추어 다시 복용 하려고 하였음. 2019-12-03 LMP. 경황이 없어 피임약 복용하지 못하였음. 4일 후 생리 마침. 경구 피임약 복용시작 하지 못하여 다시 다음번 주기에 맞춰 복용시작하려고 복용하지 않음 이후 더이상 생리 없음 2019-02-01 임신 테스트기 사용하였고 양성. 2019-02-02 산부인과 진료 아기집 추정되는 sac 확인되며 주위에 피가고여 있다고 들었으며 일주일 뒤 f/u하기로 함/ 2019-02-09 Low abdominal pain 발생하였고 지속. 심하지 않아 경과관찰. 이 시기부터 갈색 vaginal bleeding발생. 하루 pad 2개 정도 사용 abdominal pain, vaginal bleeding 지속 2019-02-12 LLQ area abdominal pain 악화됨. 산부인과 진료. sac 은 안보이나 자궁 벽은 임신시 변화 소견이라고 들었음. 혈액검사 beta HCG 3000 abdominal pain지속시 응급실 내원하라고 들었고 abdominal pain 지속되어 응급실 내원함. 복부수술(-)<end_of_turn>\n<start_of_turn>model\n{'SOCIAL_DETERMINANTS': [{'type': 'F/35', 'start_offset': 0, 'end_offset': 4}, {'type': 'Para 0-0-0-0', 'start_offset': 5, 'end_offset': 17}, {'type': '2019-12-03 LMP', 'start_offset': 128, 'end_offset': 142}, {'type': '임신', 'start_offset': 251, 'end_offset': 253}]

**5-Upload content to S3**

In [None]:
import os
import sagemaker
import boto3

# Specify your custom bucket name
bucket_name = "llama-training-s3"
s3_prefix = "llama-training-s3"
role = "WRITE_YOUR_AWS_ROLE_ARN"  # Replace with your actual Role ARN
region_name = sagemaker.Session().boto_region_name  # Detect the region

# Initialize Boto3 and SageMaker sessions, both pinned to the detected region
session = boto3.Session(region_name=region_name)
s3 = session.resource("s3")
sagemaker_session = sagemaker.Session(boto_session=session)

# Create the bucket only when S3 reports that it does not exist. Any other
# failure of the existence check (access denied, missing credentials, network
# problems) is re-raised instead of being mistaken for a missing bucket.
s3_client = s3.meta.client
try:
    s3_client.head_bucket(Bucket=bucket_name)
except s3_client.exceptions.ClientError as err:
    error_code = str(err.response.get("Error", {}).get("Code", ""))
    if error_code not in ("404", "NoSuchBucket", "NotFound"):
        raise
    create_bucket_kwargs = {"Bucket": bucket_name}
    # us-east-1 is the default location and is rejected as an explicit
    # LocationConstraint, so the configuration is only sent for other regions.
    if region_name != "us-east-1":
        create_bucket_kwargs["CreateBucketConfiguration"] = {
            "LocationConstraint": region_name
        }
    s3.create_bucket(**create_bucket_kwargs)
    print(f"Bucket '{bucket_name}' created.")
else:
    print(f"Bucket '{bucket_name}' already exists.")

# Object keys of the uploaded files inside the bucket
train_s3_path = f"{s3_prefix}/train/{chosen_schema}_train_set.tsv"
validation_s3_path = f"{s3_prefix}/validation/{chosen_schema}_validation_set.tsv"
test_s3_path = f"{s3_prefix}/test/{chosen_schema}_test_set.tsv"

# Local paths for storing training and validation data
train_dir = "./train"
validation_dir = "./validation"
test_dir = "./test"
for local_dir in (train_dir, validation_dir, test_dir):
    os.makedirs(local_dir, exist_ok=True)

print(f"Training data will be uploaded to: s3://{bucket_name}/{train_s3_path}")
print(f"Validation data will be uploaded to: s3://{bucket_name}/{validation_s3_path}")
print(f"Testing data will be uploaded to: s3://{bucket_name}/{test_s3_path}")

# Reuse the region-pinned resource created above so the uploads go through
# the same session that checked/created the bucket
s3_resource = s3

# Define local paths to your TSV files (written by the formatting step)
train_tsv_path = f'./{chosen_schema}_train_set.tsv'
validation_tsv_path = f'./{chosen_schema}_validation_set.tsv'
test_tsv_path = f'./{chosen_schema}_test_set.tsv'

# Upload TSV files to S3
target_bucket = s3_resource.Bucket(bucket_name)
for local_tsv_path, s3_key in (
    (train_tsv_path, train_s3_path),
    (validation_tsv_path, validation_s3_path),
    (test_tsv_path, test_s3_path),
):
    target_bucket.upload_file(local_tsv_path, s3_key)

print("TSV files uploaded to S3 successfully.")
