In [1]:
!pip install transformers datasets onnx onnxruntime langchain



In [1]:
import json
from datasets import Dataset
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Load JSON data
file_path = "/content/drive/MyDrive/prompt.json"
with open(file_path, "r") as file:
    data = json.load(file)

# Count the number of instructions
instruction_count = len(data)
print(f"Number of instructions: {instruction_count}")

# Ensure data is in the correct format for Hugging Face Dataset
formatted_data = {
    "instruction": [entry["instruction"] for entry in data],
    "input": [entry["input"] for entry in data],
    "output": [entry["output"] for entry in data]
}

# Load into a Dataset object
dataset = Dataset.from_dict(formatted_data)
print(f"Dataset loaded with {len(dataset)} rows.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Number of instructions: 57
Dataset loaded with 57 rows.


In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Check device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [3]:
# Load model and tokenizer
model_name = "EleutherAI/gpt-neo-1.3B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

# Add padding token if not present
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [6]:
def preprocess(examples):
    combined = [f"Instruction: {instruction}\nResponse: {output}" for instruction, output in zip(examples["instruction"], examples["output"])]
    tokenized = tokenizer(combined, truncation=True, padding="max_length", max_length=128)  # Increased max_length
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

# Tokenize dataset
tokenized_dataset = dataset.map(preprocess, batched=True)
print("Tokenized Dataset:\n", tokenized_dataset)

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./fine_tuned_model",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=50,
    num_train_epochs=30,
    per_device_train_batch_size=1,  # Adjust based on memory
    gradient_accumulation_steps=4,  # Simulate larger batch sizes
    warmup_steps=500,  # Gradual learning rate increase
    weight_decay=0.01,  # Regularization
    fp16=True,  # Mixed precision
    save_total_limit=2,
    learning_rate=5e-6,  # Adjust if necessary
    report_to="none",
    dataloader_num_workers=4,  # Speed up data loading
)

Map:   0%|          | 0/57 [00:00<?, ? examples/s]

Tokenized Dataset:
 Dataset({
    features: ['instruction', 'input', 'output', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 57
})


In [7]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer
)

# Start training
try:
    print("Starting training...")
    trainer.train()
    print("Training completed successfully.")
except Exception as e:
    print(f"Error during training: {e}")

  trainer = Trainer(


Starting training...


Step,Training Loss
50,20.3399
100,19.9394
150,18.8001
200,17.8403
250,15.9953
300,13.6963


Step,Training Loss
50,20.3399
100,19.9394
150,18.8001
200,17.8403
250,15.9953
300,13.6963
350,11.4287
400,9.1782


Training completed successfully.


In [8]:
try:
    model.save_pretrained("./model")
    tokenizer.save_pretrained("./model")
    print("Model and tokenizer saved successfully.")
except Exception as e:
    print(f"Error saving the model: {e}")

Model and tokenizer saved successfully.


In [9]:
import torch
from google.colab import files  # To handle file downloads in Colab

# Define ONNX export path
onnx_export_path = "./model.onnx"

# Create a dummy input for the model
dummy_input = torch.randint(0, tokenizer.vocab_size, (1, 64), device=device)

# Export the model to ONNX
try:
    torch.onnx.export(
        model,  # Fine-tuned model
        dummy_input,  # Dummy input for the model
        onnx_export_path,  # Path where ONNX model will be saved
        input_names=["input_ids"],  # Input names for the model
        output_names=["logits"],  # Output names for the model
        opset_version=14,  # ONNX opset version (adjust as needed)
        dynamic_axes={
            "input_ids": {0: "batch_size", 1: "sequence_length"},
            "logits": {0: "batch_size", 1: "sequence_length"},
        },  # Support variable sequence lengths
    )
    print(f"Model exported to ONNX format at {onnx_export_path}")

    # Download the ONNX file
    files.download(onnx_export_path)
    print("Model downloaded successfully!")
except Exception as e:
    print(f"Error exporting model to ONNX: {e}")

  if sequence_length != 1:
  mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(attn_weights.device)


Model exported to ONNX format at ./model.onnx


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Model downloaded successfully!


In [12]:
import os
import shutil

model_dir = "./model"
drive_model_dir = "/content/drive/MyDrive/saved_models/model"

try:
    # Ensure the local model directory exists
    if not os.path.exists(model_dir):
        raise FileNotFoundError(f"The directory {model_dir} does not exist.")

    # Copy the model directory to Google Drive
    if os.path.exists(drive_model_dir):
        shutil.rmtree(drive_model_dir)  # Remove the old directory if it exists
    shutil.copytree(model_dir, drive_model_dir)

    print(f"Model files saved to Google Drive at {drive_model_dir}")
except Exception as e:
    print(f"Error saving model files to Google Drive: {e}")

Model files saved to Google Drive at /content/drive/MyDrive/saved_models/model


In [13]:
!pwd

/content


In [14]:
!ls

 Constant_1468_attr__value   onnx__MatMul_5276	 onnx__MatMul_5834
 Constant_1633_attr__value   onnx__MatMul_5294	 onnx__MatMul_5837
 fine_tuned_model	     onnx__MatMul_5297	 onnx__MatMul_5840
 fine_tuned_model_II	     onnx__MatMul_5300	 onnx__MatMul_5843
 fine_tuned_model.onnx	     onnx__MatMul_5302	'prompt (1).json'
 onnx__Add_5142		     onnx__MatMul_5304	 prompt.json
 onnx__Add_5145		     onnx__MatMul_5306	 sample_data
 onnx__Add_5148		     onnx__MatMul_5324	 transformer.h.0.ln_1.bias
 onnx__Add_5172		     onnx__MatMul_5327	 transformer.h.0.ln_1.weight
 onnx__Add_5175		     onnx__MatMul_5330	 transformer.h.0.ln_2.bias
 onnx__Add_5178		     onnx__MatMul_5332	 transformer.h.0.ln_2.weight
 onnx__Add_5202		     onnx__MatMul_5334	 transformer.h.10.ln_1.bias
 onnx__Add_5205		     onnx__MatMul_5336	 transformer.h.10.ln_1.weight
 onnx__Add_5208		     onnx__MatMul_5354	 transformer.h.10.ln_2.bias
 onnx__Add_5232		     onnx__MatMul_5357	 transformer.h.10.ln_2.weight
 onnx__Add_5235		     onnx_

In [15]:
import os
if os.path.exists(onnx_export_path):
    print("ONNX model file exists!")
else:
    print("ONNX model file was not created.")

ONNX model file exists!


In [18]:
from google.colab import files
files.download("fine_tuned_model.onnx")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [20]:
!ls

 Constant_1468_attr__value   onnx__MatMul_5274	 onnx__MatMul_5816
 Constant_1633_attr__value   onnx__MatMul_5276	 onnx__MatMul_5834
 fine_tuned_model	     onnx__MatMul_5294	 onnx__MatMul_5837
 fine_tuned_model_II	     onnx__MatMul_5297	 onnx__MatMul_5840
 fine_tuned_model_II.zip     onnx__MatMul_5300	 onnx__MatMul_5843
 fine_tuned_model.onnx	     onnx__MatMul_5302	'prompt (1).json'
 onnx__Add_5142		     onnx__MatMul_5304	 prompt.json
 onnx__Add_5145		     onnx__MatMul_5306	 sample_data
 onnx__Add_5148		     onnx__MatMul_5324	 transformer.h.0.ln_1.bias
 onnx__Add_5172		     onnx__MatMul_5327	 transformer.h.0.ln_1.weight
 onnx__Add_5175		     onnx__MatMul_5330	 transformer.h.0.ln_2.bias
 onnx__Add_5178		     onnx__MatMul_5332	 transformer.h.0.ln_2.weight
 onnx__Add_5202		     onnx__MatMul_5334	 transformer.h.10.ln_1.bias
 onnx__Add_5205		     onnx__MatMul_5336	 transformer.h.10.ln_1.weight
 onnx__Add_5208		     onnx__MatMul_5354	 transformer.h.10.ln_2.bias
 onnx__Add_5232		     onnx__Mat

In [22]:
files.download("fine_tuned_model_II.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [24]:
from google.colab import files
import shutil

# Path to your directory
directory_path = "./fine_tuned_model_II"

# Zip the directory
shutil.make_archive("fine_tuned_model_II", "zip", directory_path)

# Download the zip file
files.download("fine_tuned_model_II.zip")


KeyboardInterrupt: 

In [25]:
from google.colab import drive
import shutil

# Mount Google Drive
drive.mount('/content/drive')

# Copy directory to Google Drive
shutil.copytree("./fine_tuned_model_II", "/content/drive/MyDrive/fine_tuned_model_II")

print("Directory saved to Google Drive.")


Mounted at /content/drive
Directory saved to Google Drive.


In [26]:
!cd fine_tuned_model_II

In [29]:
!cd fine_tuned_model_II

In [30]:
!pwd

/content
