# Data Preparation for Language Model Training

This notebook demonstrates the process of preparing and formatting training data for a language model using the Databricks Dolly dataset and custom QA pairs.

## 1. Import Required Libraries

- Imports the Python libraries used for environment configuration, JSON serialization, and dataset handling.

In [None]:
import os
import json

from typing import Dict, List, Any
from dotenv import load_dotenv
from datasets import Dataset, load_dataset, concatenate_datasets

## 2. HuggingFace Authentication Setup

- Sets up authentication with HuggingFace using environment variables and API token.

In [None]:
# Read the dotenv file that sits one directory above the notebook.
# NOTE(review): '../end.local' looks like it may be a typo for '../.env.local' — confirm the intended filename.
load_dotenv('../end.local')
HF_TOKEN = os.getenv('HF_TOKEN')
# Fail fast when the token is missing or empty so the login command below never runs without it.
if not HF_TOKEN:
    raise ValueError("HF_TOKEN not found in environment variables")

# Log in through the HuggingFace CLI; IPython interpolates {HF_TOKEN} into the shell command.
# NOTE(review): the token is passed as a command-line argument — consider whether a non-shell login path is preferable.
!huggingface-cli login --token {HF_TOKEN}

## 3. Load and Combine Datasets
- Loads the Dolly dataset and custom QA pairs, then combines them into a single dataset.

In [None]:
# Dolly instruction dataset, loaded without a split argument (indexed by split name later on).
dataset = load_dataset(
    "databricks/databricks-dolly-15k",
)

# Custom question/answer pairs read from a local JSON Lines file as a single 'train' split.
qa_dataset = load_dataset(
    "json",
    data_files="../data/qa_pairs.jsonl",
    split="train",
)

def convert_qa_to_dolly_format(example):
    """Map one custom QA record onto the Dolly column layout.

    The record's "question" becomes the instruction and its "answer" becomes
    the response; the context is left empty and the category is fixed to
    "closed_qa".

    Args:
        example: Mapping that provides the "question" and "answer" keys.

    Returns:
        A new dict with the keys instruction, context, response and category.
    """
    dolly_record = dict(
        instruction=example["question"],
        context="",
        response=example["answer"],
        category="closed_qa",
    )
    return dolly_record

# Convert every QA record to the Dolly columns, then drop the QA-only columns
# so only instruction/context/response/category remain.
qa_dataset_converted = (
    qa_dataset
    .map(convert_qa_to_dolly_format)
    .remove_columns(['question', 'answer', 'chunk_index'])
)

# Append the converted QA rows to the Dolly 'train' split.
combined_dataset = concatenate_datasets(
    [dataset['train'], qa_dataset_converted]
)

## 4. Dataset Splitting
- Splits the combined dataset into train (80%), validation (10%), and test (10%) sets.


In [None]:
RANDOM_SEED = 42
TEST_SIZE = 0.2        # share of the combined data held out from training
VALIDATION_SIZE = 0.5  # share of the held-out part that ends up in the test split

# First cut: training rows versus held-out rows.
initial_split = combined_dataset.train_test_split(test_size=TEST_SIZE, seed=RANDOM_SEED)

# Second cut: the held-out rows are divided again; the 'train' side of this
# split is used as validation data and the 'test' side as test data.
validation_test_split = initial_split['test'].train_test_split(test_size=VALIDATION_SIZE, seed=RANDOM_SEED)

final_dataset = dict(
    train=initial_split['train'],
    validation=validation_test_split['train'],
    test=validation_test_split['test'],
)

In [None]:
# Show the first training example whose context field is non-empty.
print(next(row for row in final_dataset['train'] if row['context']))

## 5. Message Formatting
- Formats the data into a structured conversation format with user instructions and assistant responses.

In [None]:
def format_instruction(instruction: str, context: str, response: str) -> List[Dict[str, str]]:
    """
    Build a two-message conversation (user turn, assistant turn) from one record.

    The user message holds the instruction followed by the stripped context,
    or 'N/A' when the context is empty, whitespace-only or None. The assistant
    message is a JSON string (2-space indent, non-ASCII characters kept as-is)
    containing the response plus metadata that records whether a context was
    present.

    Args:
        instruction: Task text shown to the model.
        context: Optional supporting text; may be empty.
        response: Expected answer.

    Returns:
        A list of two dicts, each with "role" and "content" keys.
    """
    # Normalise the context once; an empty string means "no context".
    cleaned_context = context.strip() if context else ""
    has_context = bool(cleaned_context)
    context_text = cleaned_context if has_context else 'N/A'

    user_message = {
        "role": "user",
        "content": f"Instruction: {instruction}\nContext: {context_text}",
    }

    payload = {
        "response": response,
        "metadata": {
            "has_context": has_context,
            "input_type": "text",
        },
    }
    assistant_message = {
        "role": "assistant",
        "content": json.dumps(payload, ensure_ascii=False, indent=2),
    }

    return [user_message, assistant_message]


def process_datasets(dataset: Dict[str, Dataset], random_seed: int = 42, verbose: bool = False) -> Dict[str, Dataset]:
    """
    Convert every split in the mapping to the chat "messages" layout.

    Each split is shuffled with the given seed, every row gets a "messages"
    column built by format_instruction, and the original
    instruction/context/response/category columns are dropped.

    Args:
        dataset: Mapping of split name to the split's dataset.
        random_seed: Seed passed to each split's shuffle.
        verbose: When True, print progress and the resulting size per split.

    Returns:
        A new dict with the same keys holding the converted splits.
    """
    raw_columns = ['instruction', 'context', 'response', 'category']

    def _to_messages(row):
        # Wrap the formatted conversation under the single "messages" key.
        return {
            "messages": format_instruction(
                row["instruction"],
                row["context"],
                row["response"],
            )
        }

    converted = {}

    for name, split in dataset.items():
        if verbose:
            print(f"Processing {name} dataset...")

        shuffled = split.shuffle(seed=random_seed)
        with_messages = shuffled.map(_to_messages)
        converted[name] = with_messages.remove_columns(raw_columns)

        if verbose:
            print(f"{name} dataset processed. Size: {len(converted[name])}")

    return converted

# Process datasets
# process_datasets takes the whole {split name: dataset} mapping and returns a
# dict with the same keys, so all splits are converted in one call and then
# unpacked into the names the saving step expects.
processed_splits = process_datasets(final_dataset, random_seed=RANDOM_SEED)
train_dataset = processed_splits['train']
validation_dataset = processed_splits['validation']
test_dataset = processed_splits['test']

# Display sample output
print("\nSample of transformed data:")
print(train_dataset[0])

## 6. Save Datasets
- Saves each formatted split (train/validation/test) to its own JSON file under `../data/<dataset name>/<split>/`.

In [None]:
def save_datasets_to_json(dataset_name, datasets, base_path="../data"):
    """
    Write every split to <base_path>/<dataset_name>/<split>/<split>_dataset.json.

    Args:
        dataset_name: Name of the folder created under base_path.
        datasets: Mapping of split name to an object exposing to_json(path).
        base_path: Root directory for the output (created if missing).

    Returns:
        A tuple (data_folder, json_paths): the dataset folder and a dict
        mapping each split name to the file written for it.
    """
    data_folder = os.path.join(base_path, dataset_name)
    # Create the dataset folder up front so it exists even when `datasets` is empty.
    os.makedirs(data_folder, exist_ok=True)

    json_paths = {}
    for split_name, split_data in datasets.items():
        target_dir = os.path.join(data_folder, split_name)
        os.makedirs(target_dir, exist_ok=True)

        target_file = os.path.join(target_dir, f"{split_name}_dataset.json")
        split_data.to_json(target_file)
        json_paths[split_name] = target_file

        print(f"{split_name.capitalize()} data saved to: {target_file}")

    print(f"\nAll datasets saved to: {data_folder}")
    return data_folder, json_paths

# Store dataset
DATASET_NAME = "dolly_dataset"

# Write the three processed splits and keep the returned locations for later cells.
data_folder, json_paths = save_datasets_to_json(
    DATASET_NAME,
    {
        'train': train_dataset,
        'validation': validation_dataset,
        'test': test_dataset,
    },
)

In [None]:
# Persist the output folder and the per-split file paths with IPython's %store magic
# so they can be restored elsewhere via `%store -r`.
%store data_folder
%store json_paths