In [None]:


# Mount Google Drive so the dataset and output paths under /content/drive used
# by the later cells are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Install the dataset loader into the notebook runtime before importing it.
!pip install datasets
from datasets import load_dataset
# evaluate + sacrebleu provide the BLEU metric loaded in train_model().
!pip install -q evaluate sacrebleu



**Smaller model: mBART (`facebook/mbart-large-cc25`)**

In [None]:
# Install the training stack; the bare `pip` form relies on IPython automagic.
pip install transformers datasets peft evaluate




In [None]:
# Install required packages
!pip install -q transformers datasets sacrebleu sentencepiece peft evaluate torch

import torch
from transformers import MBartForConditionalGeneration, MBartTokenizer
from datasets import load_dataset
import warnings
warnings.filterwarnings("ignore")

def load_model_from_hf(model_name="facebook/mbart-large-cc25", src_lang="en_XX", tgt_lang="cs_CZ"):
    """
    Load an mBART checkpoint and its tokenizer for the Q&A fine-tuning run.

    Args:
        model_name: Hugging Face Hub id of the checkpoint to load.
        src_lang: Language code assigned to ``tokenizer.src_lang``.
        tgt_lang: Language code assigned to ``tokenizer.tgt_lang``.

    Returns:
        Tuple ``(model, tokenizer)`` with the model already moved to the
        selected device.
    """
    print(f"Loading model: {model_name}")

    tokenizer = MBartTokenizer.from_pretrained(model_name)
    tokenizer.src_lang = src_lang
    tokenizer.tgt_lang = tgt_lang

    # Pick the device first so the dtype can follow it: half precision is only
    # requested on CUDA; the CPU fallback keeps full precision.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    dtype = torch.float16 if device == "cuda" else torch.float32

    model = MBartForConditionalGeneration.from_pretrained(
        model_name,
        torch_dtype=dtype,
        low_cpu_mem_usage=True,
    )
    model = model.to(device)

    print(f"Model loaded successfully and moved to {device}")
    print(f"Model size: {sum(p.numel() for p in model.parameters()) / 1e6:.2f}M parameters")

    return model, tokenizer

# Load the checkpoint when the cell runs. `model` and `tokenizer` become
# notebook globals that the dataset-preparation and training functions in the
# following cell read directly.
if __name__ == "__main__":
    model, tokenizer = load_model_from_hf()

Loading model: facebook/mbart-large-cc25
Model loaded successfully and moved to cuda
Model size: 610.85M parameters


In [None]:
def prepare_dataset(max_length=64, seed=42):
    """
    Load the Czech instruction data, subsample it and tokenize it for training.

    Reads the JSON file from Google Drive, keeps a shuffled fifth of the rows,
    splits that subset 90/10 and tokenizes both parts with the notebook-global
    ``tokenizer``.

    Args:
        max_length: Length that inputs and targets are truncated/padded to.
        seed: Seed for the shuffle and for the train/validation split, so the
            same rows land in each split on every run.

    Returns:
        dict with ``"train"`` and ``"validation"`` tokenized datasets.
    """
    dataset = load_dataset("json", data_files="/content/drive/MyDrive/Czech.json")

    subset_size = len(dataset["train"]) // 5
    dataset = dataset["train"].shuffle(seed=seed).select(range(subset_size))
    dataset = dataset.train_test_split(test_size=0.1, seed=seed)

    pad_token_id = tokenizer.pad_token_id

    def preprocess_function(examples):
        """Tokenize one batch of rows into encoder inputs and masked labels."""
        inputs = [f"{instr} {inp}" for instr, inp in zip(examples["instruction"], examples["input"])]
        targets = examples["output"]

        # `text_target` tokenizes the targets in target mode and stores them
        # under "labels"; it replaces the deprecated as_target_tokenizer().
        model_inputs = tokenizer(
            inputs,
            text_target=targets,
            max_length=max_length,
            truncation=True,
            padding="max_length",
        )

        # Padding positions must be -100 so the loss ignores them; leaving the
        # pad id in place trains the model to predict padding.
        model_inputs["labels"] = [
            [token if token != pad_token_id else -100 for token in sequence]
            for sequence in model_inputs["labels"]
        ]
        return model_inputs

    processed_dataset = {}
    for split in ["train", "test"]:
        processed_dataset[split if split == "train" else "validation"] = dataset[split].map(
            preprocess_function,
            batched=True,
            remove_columns=dataset[split].column_names
        )

    return processed_dataset

def setup_training_args():
    """
    Build the Seq2SeqTrainingArguments for the smaller-model run.

    Evaluation and checkpointing both happen every 100 steps, the best
    checkpoint is reloaded at the end, mixed-precision (fp16) training is on,
    and evaluation generates text capped at 64 tokens.

    Returns:
        A configured ``Seq2SeqTrainingArguments`` instance.
    """
    import inspect

    from transformers import Seq2SeqTrainingArguments

    # `evaluation_strategy` was renamed to `eval_strategy`; use whichever name
    # the installed release accepts so this works across versions.
    accepted = inspect.signature(Seq2SeqTrainingArguments).parameters
    strategy_key = "eval_strategy" if "eval_strategy" in accepted else "evaluation_strategy"

    return Seq2SeqTrainingArguments(
        output_dir="/content/drive/MyDrive/czech_qa_smallu_model",
        **{strategy_key: "steps"},
        eval_steps=100,
        save_strategy="steps",
        learning_rate=5e-4,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        gradient_accumulation_steps=2,
        num_train_epochs=1,
        weight_decay=0.01,
        save_total_limit=1,
        load_best_model_at_end=True,
        logging_steps=50,
        warmup_steps=100,
        fp16=True,
        save_steps=100,
        eval_accumulation_steps=2,
        predict_with_generate=True,
        generation_max_length=64
    )

def setup_lora_config():
    """
    Build the LoRA adapter configuration for the smaller model.

    Rank-8 adapters (alpha 16, dropout 0.05, no bias terms) are attached to the
    query/key/value projections and both feed-forward layers.

    Returns:
        A ``peft.LoraConfig`` for a sequence-to-sequence LM task.
    """
    from peft import LoraConfig

    adapted_layers = ["q_proj", "k_proj", "v_proj", "fc1", "fc2"]
    lora_settings = {
        "r": 8,
        "lora_alpha": 16,
        "target_modules": adapted_layers,
        "lora_dropout": 0.05,
        "bias": "none",
        "task_type": "SEQ_2_SEQ_LM",
    }
    return LoraConfig(**lora_settings)

def train_model(model, dataset, training_args, lora_config):
    """
    Wrap the model with LoRA adapters and fine-tune it with Seq2SeqTrainer.

    Uses the notebook-global ``tokenizer`` for collation and for decoding
    predictions when computing BLEU.

    Args:
        model: Base seq2seq model to adapt.
        dataset: Mapping with ``"train"`` and ``"validation"`` tokenized splits.
        training_args: Arguments object handed to the trainer.
        lora_config: LoRA configuration handed to ``get_peft_model``.

    Returns:
        The trainer after ``train()`` has completed.
    """
    import inspect

    from transformers import Seq2SeqTrainer, DataCollatorForSeq2Seq
    from peft import get_peft_model
    import evaluate
    import numpy as np

    model = get_peft_model(model, lora_config)

    data_collator = DataCollatorForSeq2Seq(
        tokenizer,
        model=model,
        padding=True,
        return_tensors="pt"
    )

    metric = evaluate.load("sacrebleu")

    def compute_metrics(eval_preds):
        """Decode generated ids and references and score them with sacreBLEU."""
        predictions, labels = eval_preds
        if isinstance(predictions, tuple):
            predictions = predictions[0]

        # -100 marks padding/ignored positions and cannot be decoded, so map it
        # back to the pad token in both arrays before decoding.
        pad_id = tokenizer.pad_token_id
        predictions = np.where(predictions != -100, predictions, pad_id)
        labels = np.where(labels != -100, labels, pad_id)

        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        decoded_preds = [pred.strip() for pred in decoded_preds]
        decoded_labels = [label.strip() for label in decoded_labels]

        bleu_score = metric.compute(predictions=decoded_preds, references=[[ref] for ref in decoded_labels])

        return {"bleu": bleu_score["score"]}

    # Newer transformers releases take the tokenizer as `processing_class` and
    # deprecate `tokenizer=`; pass it under whichever name is accepted.
    trainer_params = inspect.signature(Seq2SeqTrainer).parameters
    tokenizer_key = "processing_class" if "processing_class" in trainer_params else "tokenizer"

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["validation"],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        **{tokenizer_key: tokenizer}
    )

    trainer.train()
    return trainer

def save_training_data(trainer, output_dir="/content/drive/MyDrive/czech_qa_smallu_model"):
    """
    Persist the training log, a loss-curve figure and the final model.

    Writes ``training_logs.csv`` and ``training_curves.png`` into ``output_dir``
    and saves the trained model plus the notebook-global ``tokenizer`` into
    ``output_dir/final_model``.

    Args:
        trainer: Trainer whose ``state.log_history`` and ``model`` are saved.
        output_dir: Directory that receives all artifacts; created if missing.
    """
    import os
    import pandas as pd
    import matplotlib.pyplot as plt

    os.makedirs(output_dir, exist_ok=True)

    log_history = pd.DataFrame(trainer.state.log_history)
    log_history.to_csv(f"{output_dir}/training_logs.csv", index=False)

    plt.figure(figsize=(10, 6))
    # Training and evaluation entries are separate rows, so each metric column
    # is NaN on the other kind of row. Drop those rows per metric; otherwise
    # the gaps break the lines and the validation curve is not drawn at all.
    for column, label in (("loss", "Training Loss"), ("eval_loss", "Validation Loss")):
        if column not in log_history.columns or "step" not in log_history.columns:
            continue
        points = log_history.dropna(subset=[column])
        plt.plot(points["step"], points[column], label=label)
    plt.xlabel('Step')
    plt.ylabel('Loss')
    plt.title('Training Curves')
    plt.legend()
    plt.savefig(f"{output_dir}/training_curves.png")
    plt.close()

    final_model_dir = os.path.join(output_dir, "final_model")
    trainer.model.save_pretrained(final_model_dir)
    tokenizer.save_pretrained(final_model_dir)
    print(f"Model and training data saved to {output_dir}")

def main():
    """
    Run the smaller-model pipeline end to end.

    Prepares the tokenized splits, builds the training and LoRA configurations,
    fine-tunes the notebook-global ``model`` and saves the resulting artifacts.
    """
    tokenized_splits = prepare_dataset()
    hyperparameters = setup_training_args()
    adapter_config = setup_lora_config()
    save_training_data(
        train_model(model, tokenized_splits, hyperparameters, adapter_config)
    )

# Kick off the full smaller-model pipeline when this cell is executed.
if __name__ == "__main__":
    main()

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/11943 [00:00<?, ? examples/s]

Map:   0%|          | 0/1328 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss,Bleu
100,6.6437,5.818168,0.276351
200,5.45,5.101542,0.733541
300,5.1354,4.945078,0.488545
400,5.1157,4.94898,1.211878
500,5.1564,4.866698,1.93865
600,5.014,4.855597,2.706766
700,5.2151,4.840298,3.60971


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_clas

Model and training data saved to /content/drive/MyDrive/czech_qa_smallu_model


**Larger model: NLLB (`facebook/nllb-200-distilled-1.3B`)**

In [None]:

!pip install -q transformers datasets sacrebleu sentencepiece peft evaluate torch

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset
import warnings
warnings.filterwarnings("ignore")

def load_model_from_hf(model_name="facebook/nllb-200-distilled-1.3B", src_lang="eng_Latn", tgt_lang="ces_Latn"):
    """
    Load the NLLB checkpoint and its tokenizer for multilingual Q&A.

    Args:
        model_name: Hugging Face Hub id of the checkpoint to load.
        src_lang: Language code assigned to ``tokenizer.src_lang``.
        tgt_lang: Language code assigned to ``tokenizer.tgt_lang``.

    Returns:
        Tuple ``(model, tokenizer)`` with the model already moved to the
        selected device.
    """
    print(f"Loading model: {model_name}")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.src_lang = src_lang
    tokenizer.tgt_lang = tgt_lang

    # Pick the device first so the dtype can follow it: half precision is only
    # requested on CUDA; the CPU fallback keeps full precision.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    dtype = torch.float16 if device == "cuda" else torch.float32

    model = AutoModelForSeq2SeqLM.from_pretrained(
        model_name,
        torch_dtype=dtype,
        low_cpu_mem_usage=True,
    )
    model = model.to(device)

    print(f"Model loaded successfully and moved to {device}")
    print(f"Model size: {sum(p.numel() for p in model.parameters()) / 1e9:.2f}B parameters")

    return model, tokenizer

# Load the NLLB checkpoint when the cell runs. This rebinds the notebook-global
# `model` and `tokenizer` that the functions in the following cells read.
if __name__ == "__main__":
    model, tokenizer = load_model_from_hf()

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.0/104.0 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency reso

tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/808 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/5.48G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

Model loaded successfully and moved to cuda
Model size: 1.37B parameters


In [None]:

from datasets import load_dataset


from transformers import T5Tokenizer


from peft import LoraConfig, get_peft_model


from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
import evaluate


import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt


from transformers import AutoModelForSeq2SeqLM


In [None]:
def prepare_dataset(max_length=48, seed=42):
    """
    Load the Czech instruction data, subsample it and tokenize it for training.

    Reads the JSON file from Google Drive, keeps a shuffled fifth of the rows,
    splits that subset 90/10 and tokenizes both parts with the notebook-global
    ``tokenizer``.

    Args:
        max_length: Length that inputs and targets are truncated/padded to.
        seed: Seed for the shuffle and for the train/validation split, so the
            same rows land in each split on every run.

    Returns:
        dict with ``"train"`` and ``"validation"`` tokenized datasets.
    """
    dataset = load_dataset("json", data_files="/content/drive/MyDrive/Czech.json")
    subset_size = len(dataset["train"]) // 5
    dataset = dataset["train"].shuffle(seed=seed).select(range(subset_size))
    dataset = dataset.train_test_split(test_size=0.1, seed=seed)

    pad_token_id = tokenizer.pad_token_id

    def preprocess_function(examples):
        """Tokenize one batch of rows into encoder inputs and masked labels."""
        inputs = [f"{instr} {inp}" for instr, inp in zip(examples["instruction"], examples["input"])]
        targets = examples["output"]

        # `text_target` tokenizes the targets in target mode and stores them
        # under "labels"; it replaces the deprecated as_target_tokenizer().
        model_inputs = tokenizer(
            inputs,
            text_target=targets,
            max_length=max_length,
            truncation=True,
            padding="max_length",
        )

        # Padding positions must be -100 so the loss ignores them; leaving the
        # pad id in place trains the model to predict padding.
        model_inputs["labels"] = [
            [token if token != pad_token_id else -100 for token in sequence]
            for sequence in model_inputs["labels"]
        ]
        return model_inputs

    processed_dataset = {}
    for split in ["train", "test"]:
        processed_dataset[split if split == "train" else "validation"] = dataset[split].map(
            preprocess_function,
            batched=True,
            remove_columns=dataset[split].column_names
        )

    return processed_dataset

def setup_training_args():
    """
    Build the Seq2SeqTrainingArguments for the larger-model run.

    Evaluation and checkpointing both happen every 500 steps over 3 epochs, the
    best checkpoint is reloaded at the end, and mixed-precision (fp16) training
    is on.

    Returns:
        A configured ``Seq2SeqTrainingArguments`` instance.
    """
    import inspect

    from transformers import Seq2SeqTrainingArguments

    # `evaluation_strategy` was renamed to `eval_strategy`; use whichever name
    # the installed release accepts so this works across versions.
    accepted = inspect.signature(Seq2SeqTrainingArguments).parameters
    strategy_key = "eval_strategy" if "eval_strategy" in accepted else "evaluation_strategy"

    # NOTE(review): no generation_max_length is set here, unlike the
    # smaller-model configuration, so evaluation falls back to the model's own
    # generation default - confirm that is intended.
    return Seq2SeqTrainingArguments(
        output_dir="/content/drive/MyDrive/czech_qa_model",
        **{strategy_key: "steps"},
        eval_steps=500,
        save_strategy="steps",
        learning_rate=2e-4,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        gradient_accumulation_steps=2,
        num_train_epochs=3,
        weight_decay=0.01,
        save_total_limit=1,
        load_best_model_at_end=True,
        logging_steps=100,
        warmup_steps=100,
        fp16=True,
        save_steps=500,
        eval_accumulation_steps=4,
        predict_with_generate=True
    )

def setup_lora_config():
    """
    Build the LoRA adapter configuration for the larger model.

    Rank-16 adapters (alpha 32, dropout 0.1, no bias terms) are attached to the
    query/key/value/output attention projections.

    Returns:
        A ``peft.LoraConfig`` for a sequence-to-sequence LM task.
    """
    from peft import LoraConfig

    adapted_layers = ["q_proj", "k_proj", "v_proj", "o_proj"]
    lora_settings = {
        "r": 16,
        "lora_alpha": 32,
        "target_modules": adapted_layers,
        "lora_dropout": 0.1,
        "bias": "none",
        "task_type": "SEQ_2_SEQ_LM",
    }
    return LoraConfig(**lora_settings)

def train_model(model, dataset, training_args, lora_config):
    """
    Wrap the model with LoRA adapters and fine-tune it with Seq2SeqTrainer.

    Uses the notebook-global ``tokenizer`` for collation and for decoding
    predictions when computing BLEU.

    Args:
        model: Base seq2seq model to adapt.
        dataset: Mapping with ``"train"`` and ``"validation"`` tokenized splits.
        training_args: Arguments object handed to the trainer.
        lora_config: LoRA configuration handed to ``get_peft_model``.

    Returns:
        The trainer after ``train()`` has completed.
    """
    import inspect

    from transformers import Seq2SeqTrainer, DataCollatorForSeq2Seq
    from peft import get_peft_model
    import evaluate
    # Imported locally so the function does not depend on a notebook-level `np`.
    import numpy as np

    model = get_peft_model(model, lora_config)

    data_collator = DataCollatorForSeq2Seq(
        tokenizer,
        model=model,
        padding=True,
        return_tensors="pt"
    )

    metric = evaluate.load("sacrebleu")

    def compute_metrics(eval_preds):
        """Decode generated ids and references and score them with sacreBLEU."""
        predictions, labels = eval_preds
        if isinstance(predictions, tuple):
            predictions = predictions[0]

        # -100 marks padding/ignored positions and cannot be decoded, so map it
        # back to the pad token in both arrays before decoding.
        pad_id = tokenizer.pad_token_id
        predictions = np.where(predictions != -100, predictions, pad_id)
        labels = np.where(labels != -100, labels, pad_id)

        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        decoded_preds = [pred.strip() for pred in decoded_preds]
        decoded_labels = [label.strip() for label in decoded_labels]

        bleu_score = metric.compute(predictions=decoded_preds, references=[[ref] for ref in decoded_labels])

        return {"bleu": bleu_score["score"]}

    # Newer transformers releases take the tokenizer as `processing_class` and
    # deprecate `tokenizer=`; pass it under whichever name is accepted.
    trainer_params = inspect.signature(Seq2SeqTrainer).parameters
    tokenizer_key = "processing_class" if "processing_class" in trainer_params else "tokenizer"

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["validation"],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        **{tokenizer_key: tokenizer}
    )

    trainer.train()
    return trainer

def save_training_data(trainer, output_dir="/content/drive/MyDrive/czech_qa_model"):
    """
    Persist the training log, a loss-curve figure and the final model.

    Writes ``training_logs.csv`` and ``training_curves.png`` into ``output_dir``
    and saves the trained model plus the notebook-global ``tokenizer`` into
    ``output_dir/final_model``.

    Args:
        trainer: Trainer whose ``state.log_history`` and ``model`` are saved.
        output_dir: Directory that receives all artifacts; created if missing.
    """
    import os
    import pandas as pd
    import matplotlib.pyplot as plt

    os.makedirs(output_dir, exist_ok=True)

    log_history = pd.DataFrame(trainer.state.log_history)
    log_history.to_csv(f"{output_dir}/training_logs.csv", index=False)

    plt.figure(figsize=(10, 6))
    # Training and evaluation entries are separate rows, so each metric column
    # is NaN on the other kind of row. Drop those rows per metric; otherwise
    # the gaps break the lines and the validation curve is not drawn at all.
    for column, label in (("loss", "Training Loss"), ("eval_loss", "Validation Loss")):
        if column not in log_history.columns or "step" not in log_history.columns:
            continue
        points = log_history.dropna(subset=[column])
        plt.plot(points["step"], points[column], label=label)
    plt.xlabel('Step')
    plt.ylabel('Loss')
    plt.title('Training Curves')
    plt.legend()
    plt.savefig(f"{output_dir}/training_curves.png")
    plt.close()

    final_model_dir = os.path.join(output_dir, "final_model")
    trainer.model.save_pretrained(final_model_dir)
    tokenizer.save_pretrained(final_model_dir)
    print(f"Model and training data saved to {output_dir}")

def main():
    """
    Run the larger-model pipeline end to end.

    Prepares the tokenized splits, builds the training and LoRA configurations,
    fine-tunes the notebook-global ``model`` and saves the resulting artifacts.
    """
    tokenized_splits = prepare_dataset()
    hyperparameters = setup_training_args()
    adapter_config = setup_lora_config()
    save_training_data(
        train_model(model, tokenized_splits, hyperparameters, adapter_config)
    )

# Kick off the full larger-model pipeline when this cell is executed.
if __name__ == "__main__":
    main()

Map:   0%|          | 0/11943 [00:00<?, ? examples/s]

Map:   0%|          | 0/1328 [00:00<?, ? examples/s]

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss,Bleu
500,4.0205,3.795106,18.114684
1000,3.8894,3.709525,19.216525
1500,3.7878,3.674387,19.0296
2000,3.721,3.65424,19.904381


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_clas

Model and training data saved to /content/drive/MyDrive/czech_qa_model


In [None]:

!pip install transformers huggingface_hub


import os
from getpass import getpass

from huggingface_hub import HfApi


# SECURITY: never commit an access token to the notebook. The token is read
# from the HF_TOKEN environment variable, or prompted for without echo.
# Any token that was previously hard-coded in this cell should be revoked.
YOUR_TOKEN = os.environ.get("HF_TOKEN") or getpass("Hugging Face write token: ")
api = HfApi(token=YOUR_TOKEN)


small_model_path = "/content/drive/MyDrive/czech_qa_smallu_model/final_model"
large_model_path = "/content/drive/MyDrive/czech_qa_model/final_model"

# Hub repository id -> local folder holding the saved adapter and tokenizer.
model_uploads = {
    "koushikkanch/small-model": small_model_path,
    "koushikkanch/large-model": large_model_path,
}

for repo_id, folder_path in model_uploads.items():
    # exist_ok makes the cell safe to re-run once the repository exists.
    api.create_repo(
        repo_id=repo_id,
        token=YOUR_TOKEN,
        exist_ok=True,
        private=False
    )
    api.upload_folder(
        folder_path=folder_path,
        repo_id=repo_id,
        repo_type="model",
        token=YOUR_TOKEN
    )

print("Model folders uploaded successfully!")

adapter_model.safetensors:   0%|          | 0.00/15.0M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/28.4M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/32.2M [00:00<?, ?B/s]