# Fine Tuning Modelo Qwen Coder 0.5B Instruct

In [None]:
!pip install -U bitsandbytes --q

In [None]:
!pip install -U trl transformers accelerate peft datasets bitsandbytes --q

In [None]:
 #--- GOOGLE DRIVE INTEGRATION ---
def setup_google_drive():
    """Set up Google Drive for saving models"""
    try:
        from google.colab import drive
        drive.mount('/content/drive')
        print("Google Drive mounted successfully!")
        return True
    except ImportError:
        print("Not running in Colab. Google Drive integration skipped.")
        return False
    except Exception as e:
        print(f"Error mounting Google Drive: {e}")
        return False

def save_to_drive(model_path, drive_path):
    """Save model to Google Drive"""
    if not os.path.exists(drive_path):
        os.makedirs(drive_path, exist_ok=True)

    # Copy model files to Drive
    import shutil
    try:
        shutil.copytree(model_path, os.path.join(drive_path, os.path.basename(model_path)), dirs_exist_ok=True)
        print(f"Model saved to Google Drive: {drive_path}")
        return True
    except Exception as e:
        print(f"Error saving to Google Drive: {e}")
        return False

In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
from datasets import load_dataset, Dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
import re

# Parameters config
MODEL_NAME = "Qwen/Qwen2.5-Coder-0.5B-Instruct"
MAX_LENGTH = 1024
BATCH_SIZE = 8
GRADIENT_ACCUMULATION_STEPS = 2
LEARNING_RATE = 2e-4
NUM_EPOCHS = 2
WARMUP_STEPS = 50

# Google Drive path
DRIVE_PATH = "/content/drive/MyDrive/ds-coder-model-final"

# DATASET PREPROCESSING
def filter_ds_libraries(example):
    """Enhanced filter for data science libraries"""
    ds_keywords = [
        "pandas", "sklearn", "scikit-learn", "seaborn", "matplotlib", "numpy",
        "pd.", "np.", "plt.", "sns.", "train_test_split", "RandomForest",
        "LinearRegression", "DataFrame", "Series", "groupby", "merge", "join",
        "corr", "cov", "hist", "plot", "scatter", "bar", "heatmap", "imshow"
    ]

    content = (example.get("instruction", "") + example.get("output", "")).lower()
    return any(keyword.lower() in content for keyword in ds_keywords)

def validate_code_quality(example):
    """Additional filter to ensure code quality"""
    code = example.get("output", "")

    # Basic Python syntax validation
    if not code.strip():
        return False

    # Check for reasonable code structure
    lines = code.split('\n')
    if len(lines) < 3:  # Too short
        return False

    # Check for common data science patterns
    good_patterns = ['import', 'def ', 'class ', 'for ', 'if ', 'try:', 'with ']
    if not any(pattern in code for pattern in good_patterns):
        return False

    # Remove examples with excessive special characters (might be garbage)
    if len(re.findall(r'[^a-zA-Z0-9\s\w._(),\[\]{}:;"\']', code)) > 10:
        return False

    return True

def format_dataset(example):
    """Format the dataset into instruction-following format"""
    instruction = example.get("instruction", "")
    code = example.get("output", "")

    # If instruction is empty, try to generate one from the code
    if not instruction.strip():
        # Extract function name or create generic instruction
        func_match = re.search(r'def (\w+)', code)
        if func_match:
            instruction = f"Write a Python function called {func_match.group(1)}"
        else:
            instruction = "Write Python code for data science task"

    formatted_text = f"""### Instruction: {instruction}

    ### Code:
    {code}

    ### End"""

    return {"text": formatted_text}

print("Loading and filtering dataset...")
# Load dataset
dataset = load_dataset("jtatman/python-code-dataset-500k", split="train")

Loading and filtering dataset...


In [None]:
# Apply filters
print("Applying data science filters...")
ds_dataset = dataset.filter(filter_ds_libraries)
ds_dataset = ds_dataset.filter(validate_code_quality)

print(f"Filtered dataset size: {len(ds_dataset)}")

# Take a larger sample for better training (25K samples)
ds_dataset = ds_dataset.shuffle(seed=42).select(range(min(25000, len(ds_dataset))))

Applying data science filters...


Filter:   0%|          | 0/559515 [00:00<?, ? examples/s]

Filter:   0%|          | 0/143417 [00:00<?, ? examples/s]

Filtered dataset size: 17924


In [None]:
# Show first 3 examples
for i in range(min(3, len(ds_dataset))):
    example = ds_dataset[i]
    print(f"\nExample {i+1}:")

    # Also show the original components if available
    if "instruction" in example and "output" in example:
        print(f"Original Instruction: {example['instruction']}")
        print(f"Original Output: {example['output']}")


Example 1:
Original Instruction: Write a Python 3 function for
converting a list into a comma separated string for displaying select multiple values in emails.
Original Output: def format_value(value):
    """
    Convert a list into a comma separated string, for displaying
    select multiple values in emails.
    """
    if isinstance(value, list):
        value = ", ".join([v.strip() for v in value])
    return value

Example 2:
Original Instruction: How would you code a function in Python 3 to
apply a reset instruction to a qubit.
Original Output: def _add_qasm_reset(self, qubit):
        """Apply a reset instruction to a qubit.

        Args:
            qubit (int): the qubit being rest

        This is done by doing a simulating a measurement
        outcome and projecting onto the outcome state while
        renormalizing.
        """
        # get measure outcome
        outcome, probability = self._get_measure_outcome(qubit)
        # update quantum state
        if outcome 

In [None]:
from IPython.utils.sysinfo import pprint
from pprint import pprint

# format_dataset returns a dictionary, so we need to access the 'text' key
formatted_example = format_dataset(ds_dataset[100])
pprint(formatted_example['text'])

('### Instruction: Create a Python program to die arbitary shape black and '
 'white using matplotlib\n'
 '\n'
 '    ### Code:\n'
 '    import matplotlib.pyplot as plt\n'
 '\n'
 '# Generating random coordinates\n'
 'x = np.random.randint(0,100,100)\n'
 'y = np.random.randint(0,100,100)\n'
 '\n'
 '# Use the coordinates to plot a black and white shape\n'
 "plt.scatter(x, y, c='black')\n"
 "plt.fill_between(x, y, color='white')\n"
 'plt.show()\n'
 '\n'
 '    ### End')


In [None]:
ds_dataset.to_csv("./instruct_python_samples.txt")

Creating CSV from Arrow format:   0%|          | 0/18 [00:00<?, ?ba/s]

15600272

In [None]:
# Formatting the dataset
ds_dataset = ds_dataset.map(format_dataset, remove_columns=ds_dataset.column_names)


# Use 8-bit quantization for memory efficiency
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    bnb_8bit_compute_dtype=torch.float16
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

# Set special tokens
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

Formatting dataset...


Map:   0%|          | 0/17924 [00:00<?, ? examples/s]

Setting up model and tokenizer...


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

In [None]:

model = prepare_model_for_kbit_training(model)

# Simpler LoRA tuning for faster training
lora_config = LoraConfig(
    r=32,  # Reduced rank
    lora_alpha=64,
    target_modules=["q_proj", "v_proj", "o_proj"],  # Fewer modules
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

In [None]:
def tokenize_function(examples):
    """Tokenize the formatted text"""
    return tokenizer(
        examples["text"],
        truncation=True,
        padding=True,
        max_length=MAX_LENGTH,
        return_overflowing_tokens=False,
    )

print("Tokenizing dataset...")
tokenized_dataset = ds_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Split into train/validation
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)

# Get data splits
train_dataset = split_dataset["train"]
val_dataset = split_dataset["test"]

Tokenizing dataset...


Map:   0%|          | 0/17924 [00:00<?, ? examples/s]

In [None]:
print(train_dataset.shape, val_dataset.shape)

(16131, 2) (1793, 2)


In [None]:
# Training settings
training_args = TrainingArguments(
    output_dir="./ds-coder-model",
    overwrite_output_dir=True,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    warmup_steps=WARMUP_STEPS,
    learning_rate=LEARNING_RATE,
    fp16=True,
    logging_steps=50,
    save_steps=500,
    eval_steps=500,
    eval_strategy="steps",
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    ddp_find_unused_parameters=False,
    report_to="none"  # Disable wandb/tensorboard for now
)

In [None]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

In [None]:
from transformers import TrainerCallback

class MetricsCallback(TrainerCallback):
    """Custom callback to track data science-specific metrics"""
    def __init__(self):
        self.train_losses = []
        self.eval_losses = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is not None:
            if "loss" in logs:
                self.train_losses.append(logs["loss"])
            if "eval_loss" in logs:
                self.eval_losses.append(logs["eval_loss"])

            # Print progress
            if state.global_step % 100 == 0:
                print(f"Step {state.global_step}, Loss: {logs.get('loss', 'N/A')}")

metrics_callback = MetricsCallback()

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    callbacks=[metrics_callback],
)

In [None]:

print("Starting training...")
trainer.train()


print("\nEvaluating model...")
eval_results = trainer.evaluate()
print(f"Final evaluation loss: {eval_results['eval_loss']:.4f}")


# Mount Google Drive and save there
drive_mounted = setup_google_drive()

if drive_mounted:
    success = save_to_drive("./ds-coder-model-final", DRIVE_PATH)
    if success:
        print(f"✅ Model successfully saved to Google Drive: {DRIVE_PATH}")
    else:
        print("❌ Failed to save to Google Drive")
else:
    print("⚠️ Google Drive not mounted. Model saved only locally.")

Starting training...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss
500,1.4264,1.401094
1000,1.3682,1.383191
1500,1.3203,1.375345
2000,1.3299,1.370611


Step 100, Loss: 1.4884
Step 200, Loss: 1.3922
Step 300, Loss: 1.394
Step 400, Loss: 1.4446
Step 500, Loss: 1.4264
Step 500, Loss: N/A


  return fn(*args, **kwargs)


Step 600, Loss: 1.4109
Step 700, Loss: 1.3804
Step 800, Loss: 1.3769
Step 900, Loss: 1.3602
Step 1000, Loss: 1.3682
Step 1000, Loss: N/A


  return fn(*args, **kwargs)


Step 1100, Loss: 1.3467
Step 1200, Loss: 1.3228
Step 1300, Loss: 1.3178
Step 1400, Loss: 1.3477
Step 1500, Loss: 1.3203
Step 1500, Loss: N/A


  return fn(*args, **kwargs)


Step 1600, Loss: 1.3211
Step 1700, Loss: 1.313
Step 1800, Loss: 1.3215
Step 1900, Loss: 1.3213
Step 2000, Loss: 1.3299
Step 2000, Loss: N/A


  return fn(*args, **kwargs)



Evaluating model...




Final evaluation loss: 1.3706
Saving model...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive mounted successfully!


NameError: name 'os' is not defined

In [None]:
import os

if drive_mounted:
    success = save_to_drive("./ds-coder-model", DRIVE_PATH)
    if success:
        print(f"✅ Model successfully saved to Google Drive: {DRIVE_PATH}")
    else:
        print("❌ Failed to save to Google Drive")
else:
    print("⚠️ Google Drive not mounted. Model saved only locally.")

Model saved to Google Drive: /content/drive/MyDrive/ds-coder-model-final
✅ Model successfully saved to Google Drive: /content/drive/MyDrive/ds-coder-model-final
