In [1]:
import google.protobuf
print(google.protobuf.__version__)

6.33.2


In [2]:
import os
from pathlib import Path
import pandas as pd
import json
import yaml


In [3]:
csv_file = "./ci_cd_logs.csv"
df = pd.read_csv(csv_file)
print(f"Loaded {len(df)} rows from {csv_file}")
print(f"\nColumns: {df.columns.tolist()}")
print(f"\nData types:\n{df.dtypes}")
print(f"\nStatus distribution:\n{df['status'].value_counts()}")
print(f"\nFirst few rows:")
print(df.head())

Loaded 800 rows from ./ci_cd_logs.csv

Columns: ['timestamp', 'pipeline_id', 'stage_name', 'job_name', 'task_name', 'status', 'message', 'commit_id', 'branch', 'user', 'environment']

Data types:
timestamp      object
pipeline_id    object
stage_name     object
job_name       object
task_name      object
status         object
message        object
commit_id      object
branch         object
user           object
environment    object
dtype: object

Status distribution:
status
skipped    219
running    206
failed     194
success    181
Name: count, dtype: int64

First few rows:
                  timestamp pipeline_id stage_name           job_name  \
0  2024-03-02 01:05:07+0000  pipe-txnem      Build  deploy_to_staging   
1  2024-07-22 19:55:41+0000  pipe-hjahz      Build     run_unit_tests   
2  2024-03-01 23:03:43+0000  pipe-vcsbx   Analysis      deploy_to_dev   
3  2024-06-02 12:21:00+0000  pipe-pnvzk       Test      deploy_to_dev   
4  2024-04-17 07:59:29+0000  pipe-mwkkl       Test 

In [26]:
jsonl_file = "./train.jsonl"

# Filter for failures (you can adjust this filter)
failed_logs = df[df['status'].str.lower().isin(['failed', 'error', 'failure'])].copy()
print(f"\nFound {len(failed_logs)} failed pipeline logs")

if len(failed_logs) == 0:
    print("‚ö†Ô∏è  No failed logs found. Using all logs for training.")
    failed_logs = df.copy()

# Create training examples
training_examples = []

for idx, row in failed_logs.iterrows():
    # Build a context-rich instruction
    context_parts = []
    
    # Add each field if it's not null
    if pd.notna(row['pipeline_id']):
        context_parts.append(f"Pipeline ID: {row['pipeline_id']}")
    if pd.notna(row['stage_name']):
        context_parts.append(f"Stage: {row['stage_name']}")
    if pd.notna(row['job_name']):
        context_parts.append(f"Job: {row['job_name']}")
    if pd.notna(row['task_name']):
        context_parts.append(f"Task: {row['task_name']}")
    if pd.notna(row['branch']):
        context_parts.append(f"Branch: {row['branch']}")
    if pd.notna(row['environment']):
        context_parts.append(f"Environment: {row['environment']}")
    if pd.notna(row['status']):
        context_parts.append(f"Status: {row['status']}")
    if pd.notna(row['message']):
        context_parts.append(f"Error: {row['message']}")
    
    context = "\n".join(context_parts)
    
    # Create instruction
    instruction = f"""Analyze this CI/CD pipeline failure and provide a solution:

{context}

What went wrong and how should we fix it?"""
    
    # Create a realistic response (you'll want to enhance this with actual solutions)
    # For now, creating template responses based on common CI/CD issues
    message_lower = str(row['message']).lower()
    
    # Pattern matching for common CI/CD errors
    if any(word in message_lower for word in ['timeout', 'timed out']):
        solution = f"""The pipeline timed out in the {row['stage_name']} stage. 

Recommended fixes:
1. Increase timeout values in your CI/CD configuration
2. Optimize the {row['task_name']} task to run faster
3. Check for network issues or slow dependencies
4. Consider parallelizing tasks if possible"""
    
    elif any(word in message_lower for word in ['test', 'failed', 'assertion']):
        solution = f"""Tests failed in the {row['stage_name']} stage.

Recommended fixes:
1. Review the test failure logs for specific assertions
2. Check if recent code changes in commit {row['commit_id'][:8]} broke functionality
3. Verify test data and mocks are properly configured
4. Run tests locally to reproduce and debug
5. Check for environment-specific issues in {row['environment']}"""
    
    elif any(word in message_lower for word in ['dependency', 'package', 'module', 'import']):
        solution = f"""Dependency or import error in {row['stage_name']}.

Recommended fixes:
1. Update your requirements.txt or package.json with correct versions
2. Clear dependency cache and reinstall
3. Check for version conflicts between packages
4. Verify all required dependencies are listed in your config files
5. Consider using a dependency lock file"""
    
    elif any(word in message_lower for word in ['permission', 'denied', 'unauthorized']):
        solution = f"""Permission or access error in {row['stage_name']}.

Recommended fixes:
1. Check CI/CD service account permissions
2. Verify secrets and credentials are properly configured
3. Review branch protection rules for {row['branch']}
4. Ensure user {row['user']} has necessary access rights
5. Check file/directory permissions in the {row['environment']} environment"""
    
    elif any(word in message_lower for word in ['build', 'compile', 'syntax']):
        solution = f"""Build or compilation error in {row['stage_name']}.

Recommended fixes:
1. Check syntax errors in recent commit {row['commit_id'][:8]}
2. Verify build configuration files are correct
3. Ensure all build dependencies are available
4. Check for environment-specific compilation issues
5. Review build logs for specific error details"""
    
    elif any(word in message_lower for word in ['deploy', 'deployment']):
        solution = f"""Deployment failed in {row['environment']} environment.

Recommended fixes:
1. Verify deployment credentials and configurations
2. Check if the {row['environment']} environment is accessible
3. Review resource availability (disk space, memory)
4. Validate deployment manifests and configurations
5. Check for conflicts with existing deployments"""
    
    else:
        solution = f"""The pipeline failed in the {row['stage_name']} stage with status: {row['status']}.

Recommended fixes:
1. Review the full error message: "{row['message']}"
2. Check logs for commit {row['commit_id'][:8]} on branch {row['branch']}
3. Verify the {row['task_name']} task configuration
4. Test the build locally in a similar environment
5. Check recent changes by user {row['user']}
6. Review {row['environment']} environment settings"""
    
    # Create conversation format
    example = {
        "messages": [
            {"role": "user", "content": instruction},
            {"role": "assistant", "content": solution}
        ]
    }
    
    training_examples.append(example)

# Write to JSONL
with open(jsonl_file, 'w') as f:
    for example in training_examples:
        f.write(json.dumps(example) + '\n')

print(f"\n‚úÖ Created {jsonl_file} with {len(training_examples)} training examples")

# Show a sample
print("\nüìã Sample training example:")
print(json.dumps(training_examples[0], indent=2))



Found 194 failed pipeline logs

‚úÖ Created ./train.jsonl with 194 training examples

üìã Sample training example:
{
  "messages": [
    {
      "role": "user",
      "content": "Analyze this CI/CD pipeline failure and provide a solution:\n\nPipeline ID: pipe-mwkkl\nStage: Test\nJob: build_and_test\nTask: test\nBranch: branch_xqp\nStatus: failed\nError: Task execution failed.\n\nWhat went wrong and how should we fix it?"
    },
    {
      "role": "assistant",
      "content": "Tests failed in the Test stage.\n\nRecommended fixes:\n1. Review the test failure logs for specific assertions\n2. Check if recent code changes in commit a4e872fb broke functionality\n3. Verify test data and mocks are properly configured\n4. Run tests locally to reproduce and debug\n5. Check for environment-specific issues in nan"
    }
  ]
}


In [4]:
import google.protobuf
print(google.protobuf.__version__)

6.33.2


In [None]:
# pip uninstall protobuf -y
#

: 

In [31]:
# Run this cell to create the corrected YAML config
yaml_content = f"""
model:
  model_name: "HuggingFaceTB/SmolLM2-135M-Instruct"
  model_max_length: 1024
  torch_dtype_str: "bfloat16"
  trust_remote_code: true
  attn_implementation: "sdpa"
  load_pretrained_weights: true

data:
  train:
    datasets:
      - dataset_name: "text_sft"
        dataset_path: "{jsonl_file}"
        split: "train"

peft:
  lora_r: 16
  lora_alpha: 32
  lora_dropout: 0.05
  lora_target_modules:
    - q_proj
    - v_proj
    - k_proj
    - o_proj

training:
  trainer_type: "TRL_SFT"
  use_peft: true
  per_device_train_batch_size: 4
  gradient_accumulation_steps: 2
  learning_rate: 3e-4
  max_steps: {min(1000, len(training_examples) * 3)}
  logging_steps: 10
  save_steps: 200
  run_name: "cicd_auto_healer"
  output_dir: "./output"
  save_final_model: true
  warmup_steps: 100
  lr_scheduler_type: "cosine"
  weight_decay: 0.01
"""

yaml_path = "./train.yaml"
with open(yaml_path, "w") as f:
    f.write(yaml_content)

print(f"‚úÖ YAML config saved to {yaml_path}")
print("\nüìù Key change: dataset_name is now 'text_sft' (not 'oumi_sft')")

‚úÖ YAML config saved to ./train.yaml

üìù Key change: dataset_name is now 'text_sft' (not 'oumi_sft')


In [None]:
# !pip install --upgrade protobuf


: 

In [16]:
import google.protobuf
print(google.protobuf.__version__)

6.33.2


In [None]:
pip install -U protobuf>=6.32

Note: you may need to restart the kernel to use updated packages.


: 

In [32]:
from oumi.core.configs import TrainingConfig
from oumi.train import train

# Fix HF warnings
os.environ["TOKENIZERS_PARALLELISM"] = "false"

try:
    # Load config
    config = TrainingConfig.from_yaml(yaml_path)
    
    print("\nüìã Training Configuration:")
    print(f"  Model: {config.model.model_name}")
    print(f"  Max length: {config.model.model_max_length}")
    print(f"  Dataset: {len(training_examples)} examples")
    print(f"  Using PEFT: {config.training.use_peft}")
    print(f"  LoRA rank: {config.peft.lora_r}")
    print(f"  Batch size: {config.training.per_device_train_batch_size}")
    print(f"  Max steps: {config.training.max_steps}")
    print(f"  Output: {config.training.output_dir}")
    
    # Start training
    print("\nüöÄ Starting training...")
    train(config)
    
    print("\n‚úÖ Fine-tuning complete! Model saved in ./output")
    
except Exception as e:
    print(f"\n‚ùå Error during training: {e}")
    import traceback
    traceback.print_exc()



üìã Training Configuration:
  Model: HuggingFaceTB/SmolLM2-135M-Instruct
  Max length: 1024
  Dataset: 194 examples
  Using PEFT: True
  LoRA rank: 16
  Batch size: 4
  Max steps: 582
  Output: ./output

üöÄ Starting training...
[2025-12-14 13:17:29,047][oumi][rank0][pid:33000][MainThread][INFO]][torch_utils.py:80] Torch version: 2.8.0+cu128. NumPy version: 2.2.6
[2025-12-14 13:17:29,048][oumi][rank0][pid:33000][MainThread][INFO]][torch_utils.py:82] CUDA is not available!
[2025-12-14 13:17:29,050][oumi][rank0][pid:33000][MainThread][INFO]][train.py:154] Oumi version: 0.5.0
[2025-12-14 13:17:29,068][oumi][rank0][pid:33000][MainThread][INFO]][train.py:318] Training config saved to output/telemetry/training_config.yaml
[2025-12-14 13:17:29,610][oumi][rank0][pid:33000][MainThread][INFO]][models.py:544] Using the model's built-in chat template for model 'HuggingFaceTB/SmolLM2-135M-Instruct'.
[2025-12-14 13:17:29,612][oumi][rank0][pid:33000][MainThread][INFO]][base_map_dataset.py:91] Crea

Generating train split: 9 examples [00:00, 832.26 examples/s]

[2025-12-14 13:17:29,647][oumi][rank0][pid:33000][MainThread][INFO]][base_map_dataset.py:312] TextSftJsonLinesDataset: features=dict_keys(['input_ids', 'attention_mask'])



Generating train split: 194 examples [00:00, 1945.14 examples/s]

[2025-12-14 13:17:29,782][oumi][rank0][pid:33000][MainThread][INFO]][base_map_dataset.py:376] Finished transforming dataset (TextSftJsonLinesDataset)! Speed: 1444.58 examples/sec. Examples: 194. Duration: 0.1 sec. Transform workers: 1.
[2025-12-14 13:17:29,785][oumi][rank0][pid:33000][MainThread][INFO]][models.py:260] Building model using device_map: auto (DeviceRankInfo(world_size=1, rank=0, local_world_size=1, local_rank=0))...
[2025-12-14 13:17:29,786][oumi][rank0][pid:33000][MainThread][INFO]][models.py:336] Using model class: <class 'transformers.models.auto.modeling_auto.AutoModelForCausalLM'> to instantiate model.





[2025-12-14 13:17:30,619][oumi][rank0][pid:33000][MainThread][INFO]][train.py:463] Building PEFT model...
[2025-12-14 13:17:30,730][oumi][rank0][pid:33000][MainThread][INFO]][torch_utils.py:288] 
Model Parameters Summary:
üî¢ Total     parameters: 136,358,208
üîó Embedding parameters: 28,311,552
üéØ Trainable parameters: 1,843,200
üîí Frozen    parameters: 134,515,008 (98.65%)

[2025-12-14 13:17:30,731][oumi][rank0][pid:33000][MainThread][INFO]][train.py:486] Skipping dataset preparation for TRL_SFT trainer since the dataset is already processed.
[2025-12-14 13:17:31,000][oumi][rank0][pid:33000][MainThread][INFO]][torch_profiler_utils.py:164] PROF: Torch Profiler disabled!


The model is already on multiple devices. Skipping the move to device specified in `args`.


[2025-12-14 13:17:31,100][oumi][rank0][pid:33000][MainThread][INFO]][device_utils.py:343] GPU Metrics Before Training: GPU runtime info: None.
[2025-12-14 13:17:31,101][oumi][rank0][pid:33000][MainThread][INFO]][train.py:558] Training init time: 2.055s
[2025-12-14 13:17:31,102][oumi][rank0][pid:33000][MainThread][INFO]][train.py:559] Starting training... (TrainerType.TRL_SFT, transformers: 4.57.1)


Step,Training Loss


KeyboardInterrupt: 

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
print("üîÑ Loading your fine-tuned CI/CD Auto-Healer model...")

model_path = "./output"  # Your trained model location
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

print(f"‚úÖ Model loaded successfully!")
print(f"üìç Model location: {model_path}")
print(f"üß† Model size: {sum(p.numel() for p in model.parameters()) / 1e6:.1f}M parameters")
print(f"üíæ Device: {model.device}")


üîÑ Loading your fine-tuned CI/CD Auto-Healer model...


`torch_dtype` is deprecated! Use `dtype` instead!


‚úÖ Model loaded successfully!
üìç Model location: ./output
üß† Model size: 136.4M parameters
üíæ Device: cpu


In [7]:
# Add this to a new cell in your notebook

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# 1. Load your fine-tuned model
print("üîÑ Loading your fine-tuned CI/CD Auto-Healer model...")

model_path = "./output"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

# Set padding token if not set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(f"‚úÖ Model loaded successfully!\n")

# 2. Create a test function with better response extraction
def diagnose_pipeline_failure(
    pipeline_id,
    stage_name,
    job_name,
    task_name,
    branch,
    status,
    message,
    commit_id="unknown",
    environment="unknown"
):
    """Generate diagnosis and fix for pipeline failure"""
    
    # Build the prompt in the same format as training data
    prompt = f"""Analyze this CI/CD pipeline failure and provide a solution:

Pipeline ID: {pipeline_id}
Stage: {stage_name}
Job: {job_name}
Task: {task_name}
Branch: {branch}
Status: {status}
Error: {message}

What went wrong and how should we fix it?"""
    
    # Apply chat template if available
    if hasattr(tokenizer, 'chat_template') and tokenizer.chat_template:
        messages = [{"role": "user", "content": prompt}]
        formatted_prompt = tokenizer.apply_chat_template(
            messages, 
            tokenize=False, 
            add_generation_prompt=True
        )
    else:
        formatted_prompt = prompt
    
    # Tokenize and generate
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=400,
            temperature=0.7,
            do_sample=True,
            top_p=0.9,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )
    
    # Decode the full response
    full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Try to extract just the assistant's response
    # Method 1: Split by the prompt
    if "What went wrong and how should we fix it?" in full_response:
        response = full_response.split("What went wrong and how should we fix it?")[-1].strip()
    # Method 2: Split by assistant marker if present
    elif "<|assistant|>" in full_response:
        response = full_response.split("<|assistant|>")[-1].strip()
    # Method 3: Just remove the input prompt
    else:
        response = full_response.replace(formatted_prompt, "").strip()
    
    # If response is empty, return the full output for debugging
    if not response or len(response) < 10:
        response = full_response
    
    return response

# 3. Test with examples from your actual data
print("üß™ Testing with real pipeline failures from your dataset:\n")
print("=" * 80)

# Example 1: Test failure from your data
print("\nüìã TEST 1: Test Stage Failure")
print("-" * 80)
try:
    diagnosis = diagnose_pipeline_failure(
        pipeline_id="pipe-mwkkl",
        stage_name="Test",
        job_name="build_and_test",
        task_name="test",
        branch="branch_xqp",
        status="failed",
        message="Task execution failed.",
        commit_id="a4e872fb"
    )
    print(f"Response length: {len(diagnosis)} chars\n")
    print(diagnosis)
except Exception as e:
    print(f"Error: {e}")
    import traceback
    traceback.print_exc()

# Example 2: Build failure
print("\n" + "=" * 80)
print("\nüìã TEST 2: Build Stage Failure")
print("-" * 80)
try:
    diagnosis = diagnose_pipeline_failure(
        pipeline_id="pipe-abc123",
        stage_name="Build",
        job_name="compile_project",
        task_name="build",
        branch="feature/new-api",
        status="failed",
        message="Compilation error: undefined reference to 'parseConfig'",
        commit_id="def456gh"
    )
    print(f"Response length: {len(diagnosis)} chars\n")
    print(diagnosis)
except Exception as e:
    print(f"Error: {e}")

# Example 3: Deployment failure
print("\n" + "=" * 80)
print("\nüìã TEST 3: Deployment Failure")
print("-" * 80)
try:
    diagnosis = diagnose_pipeline_failure(
        pipeline_id="pipe-deploy-99",
        stage_name="Deploy",
        job_name="deploy_to_production",
        task_name="deploy",
        branch="main",
        status="failed",
        message="Permission denied: unable to access deployment credentials",
        environment="production"
    )
    print(f"Response length: {len(diagnosis)} chars\n")
    print(diagnosis)
except Exception as e:
    print(f"Error: {e}")

print("\n" + "=" * 80)
print("\n‚úÖ Testing complete!")
print("\nüí° To test with your own failures, use:")
print("   diagnosis = diagnose_pipeline_failure(pipeline_id, stage, job, task, branch, status, message)")

üîÑ Loading your fine-tuned CI/CD Auto-Healer model...
‚úÖ Model loaded successfully!

üß™ Testing with real pipeline failures from your dataset:


üìã TEST 1: Test Stage Failure
--------------------------------------------------------------------------------
Response length: 334 chars

assistant
Tests failed in the Test stage.

Recommended fixes:
1. Review the test failure logs for specific assertions
2. Check if recent code changes in commit 8f7a2c9e broke functionality
3. Verify test data and mocks are properly configured
4. Run tests locally to reproduce and debug
5. Check for environment-specific issues in nan


üìã TEST 2: Build Stage Failure
--------------------------------------------------------------------------------
Response length: 335 chars

assistant
Tests failed in the Build stage.

Recommended fixes:
1. Review the test failure logs for specific assertions
2. Check if recent code changes in commit 97c1c47a broke functionality
3. Verify test data and mocks are proper

In [8]:
def diagnose_and_display(
    pipeline_id, stage_name, job_name, task_name, 
    branch, status, message, commit_id="unknown", environment="unknown"
):
    """Diagnose and display with clean formatting"""
    
    print(f"üîç FAILURE DETAILS:")
    print(f"   Pipeline: {pipeline_id}")
    print(f"   Stage: {stage_name} ‚Üí Job: {job_name} ‚Üí Task: {task_name}")
    print(f"   Branch: {branch}")
    print(f"   Error: {message}")
    print(f"\nü§ñ MODEL DIAGNOSIS:\n")
    
    diagnosis = diagnose_pipeline_failure(
        pipeline_id, stage_name, job_name, task_name,
        branch, status, message, commit_id, environment
    )
    
    # Clean up the response
    diagnosis = diagnosis.replace("assistant ", "").strip()
    print(diagnosis)
    print("\n" + "=" * 90 + "\n")
    
    return diagnosis

# Test with your CSV data
print("üìä TESTING MODEL WITH REAL CI/CD FAILURES\n")
print("=" * 90 + "\n")

# Load your data
df = pd.read_csv("./ci_cd_logs.csv")
failed_logs = df[df['status'].str.lower() == 'failed'].copy()

# Test with 5 random failures
test_samples = failed_logs.sample(min(5, len(failed_logs)))

results = []
for idx, (_, row) in enumerate(test_samples.iterrows(), 1):
    print(f"TEST {idx}/5")
    print("-" * 90)
    
    diagnosis = diagnose_and_display(
        pipeline_id=row['pipeline_id'],
        stage_name=row['stage_name'],
        job_name=row['job_name'],
        task_name=row['task_name'],
        branch=row['branch'],
        status=row['status'],
        message=row['message'],
        commit_id=str(row.get('commit_id', 'unknown'))[:8],
        environment=str(row.get('environment', 'unknown'))
    )
    
    results.append({
        'test_num': idx,
        'stage': row['stage_name'],
        'job': row['job_name'],
        'error': row['message'][:50] + "...",
        'diagnosis_length': len(diagnosis),
        'has_recommendations': 'Recommended fixes:' in diagnosis or '1.' in diagnosis
    })

# Summary
print("\n" + "=" * 90)
print("üìà TESTING SUMMARY")
print("=" * 90)

summary_df = pd.DataFrame(results)
print(f"\nTotal tests: {len(results)}")
print(f"Average diagnosis length: {summary_df['diagnosis_length'].mean():.0f} chars")
print(f"Tests with recommendations: {summary_df['has_recommendations'].sum()}/{len(results)}")

print("\nüìã Breakdown by stage:")
print(summary_df[['test_num', 'stage', 'job', 'diagnosis_length']].to_string(index=False))

print("\n‚úÖ Model testing complete!")

üìä TESTING MODEL WITH REAL CI/CD FAILURES


TEST 1/5
------------------------------------------------------------------------------------------
üîç FAILURE DETAILS:
   Pipeline: pipe-gvqcj
   Stage: Test ‚Üí Job: run_integration_tests ‚Üí Task: checkout
   Branch: branch_lou
   Error: Task execution failed.

ü§ñ MODEL DIAGNOSIS:

assistant
Tests failed in the Test stage.

Recommended fixes:
1. Review the test failure logs for specific assertions
2. Check if recent code changes in commit 983090d4 broke functionality
3. Verify test data and mocks are properly configured
4. Run tests locally to reproduce and debug
5. Check for environment-specific issues in nan


TEST 2/5
------------------------------------------------------------------------------------------
üîç FAILURE DETAILS:
   Pipeline: pipe-bmxgt
   Stage: Test ‚Üí Job: run_unit_tests ‚Üí Task: deploy
   Branch: branch_xnw
   Error: Task execution failed.

ü§ñ MODEL DIAGNOSIS:

assistant
Tests failed in the Test stage.

Rec

In [9]:
# Compare your fine-tuned model with the base model



print("üîÑ Loading BASE model for comparison...")
base_model_name = "HuggingFaceTB/SmolLM2-135M-Instruct"
base_tokenizer = AutoTokenizer.from_pretrained(base_model_name)
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)
if base_tokenizer.pad_token is None:
    base_tokenizer.pad_token = base_tokenizer.eos_token

print("‚úÖ Base model loaded!\n")

def get_base_model_response(prompt):
    """Get response from base model"""
    if hasattr(base_tokenizer, 'chat_template') and base_tokenizer.chat_template:
        messages = [{"role": "user", "content": prompt}]
        formatted_prompt = base_tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
    else:
        formatted_prompt = prompt
    
    inputs = base_tokenizer(formatted_prompt, return_tensors="pt").to(base_model.device)
    
    with torch.no_grad():
        outputs = base_model.generate(
            **inputs,
            max_new_tokens=400,
            temperature=0.7,
            do_sample=True,
            top_p=0.9,
            pad_token_id=base_tokenizer.pad_token_id,
            eos_token_id=base_tokenizer.eos_token_id
        )
    
    full_response = base_tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Extract response
    if "What went wrong and how should we fix it?" in full_response:
        response = full_response.split("What went wrong and how should we fix it?")[-1].strip()
    elif "<|assistant|>" in full_response:
        response = full_response.split("<|assistant|>")[-1].strip()
    else:
        response = full_response.replace(formatted_prompt, "").strip()
    
    return response.replace("assistant ", "").strip()

# Test case
test_prompt = """Analyze this CI/CD pipeline failure and provide a solution:

Pipeline ID: pipe-test-001
Stage: Test
Job: run_integration_tests
Task: test
Branch: feature/auth
Status: failed
Error: Test suite timed out after 15 minutes

What went wrong and how should we fix it?"""

print("=" * 90)
print("üß™ COMPARISON TEST: Integration Test Timeout")
print("=" * 90)

print("\nü§ñ YOUR FINE-TUNED MODEL:")
print("-" * 90)
finetuned_response = diagnose_pipeline_failure(
    "pipe-test-001", "Test", "run_integration_tests", "test",
    "feature/auth", "failed", "Test suite timed out after 15 minutes"
).replace("assistant ", "").strip()
print(finetuned_response)

print("\n\nüìò BASE MODEL (not fine-tuned):")
print("-" * 90)
base_response = get_base_model_response(test_prompt)
print(base_response)

print("\n\n" + "=" * 90)
print("üìä COMPARISON METRICS")
print("=" * 90)
print(f"Fine-tuned response length: {len(finetuned_response)} chars")
print(f"Base model response length: {len(base_response)} chars")
print(f"\nFine-tuned has structured fixes: {'Recommended fixes:' in finetuned_response or '1.' in finetuned_response}")
print(f"Base model has structured fixes: {'Recommended fixes:' in base_response or '1.' in base_response}")
print(f"\nFine-tuned mentions specifics: {any(word in finetuned_response.lower() for word in ['timeout', 'test', 'pipeline'])}")
print(f"Base model mentions specifics: {any(word in base_response.lower() for word in ['timeout', 'test', 'pipeline'])}")

üîÑ Loading BASE model for comparison...




‚úÖ Base model loaded!

üß™ COMPARISON TEST: Integration Test Timeout

ü§ñ YOUR FINE-TUNED MODEL:
------------------------------------------------------------------------------------------
assistant
Tests failed in the Test stage.

Recommended fixes:
1. Review the test failure logs for specific assertions
2. Check if recent code changes in commit 9c782cbe broke functionality
3. Verify test data and mocks are properly configured
4. Run tests locally to reproduce and debug
5. Check for environment-specific issues in nan


üìò BASE MODEL (not fine-tuned):
------------------------------------------------------------------------------------------
assistant
The pipeline failed due to a bug in the `test_integration_tests` test suite. Specifically, the `test_integration_tests` test suite failed to complete after 15 minutes.

The issue lies in the `run_integration_tests` test, which includes a test suite that includes a test called `test_integration_tests`. The `test_integration_tests` test 