In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# /kaggle/input/vllm-dataset/images/aut-0003-00003809.jpg   All images in this directory called images 
# /kaggle/input/vllm-dataset/dataset.jsonl                  all associated prompts are in this file 

# Dataset.jsonl file data strcuture 

# example 
"root":{2 items
"image":string"images/aut-0003-00003809.jpg"
"conversations":[2 items
0:{2 items
"from":string"human"
"value":string"<image>"
}
1:{2 items
"from":string"gpt"
"value":string"The image depicts an airport scene with a clear blue sky scattered with white clouds. The runway is wet, reflecting the sunlight, indicating recent rain or ongoing maintenance. The runway is marked with white lines for navigation and safety. There are several vehicles on the runway, including: - 9 Standard Cars (2 in the left far, 6 in the center far, 1 in the right far) - 1 Taxiing Vehicle (1 in the right far) - 1 Passenger Jet (1 in the center far) - 2 Business Jets (2 in the left far) The airport has a large hangar structure to the right, which appears to be a hangar for aircraft storage. The hangar has a grayish exterior with windows and doors. There are also other buildings and structures visible in the background, including a tower with a red and white striped flag. Overall, the scene suggests a busy airport with various types of aircraft and vehicles in operation."
}
]
}

In [3]:
import os
import json
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
from transformers import (
    Qwen2VLForConditionalGeneration,
    AutoTokenizer,
    AutoProcessor,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq
)
from datasets import Dataset as HFDataset
import pandas as pd
from sklearn.metrics import bleu_score
from torchvision import transforms
import seaborn as sns
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

class VisionLanguageDataset(Dataset):
    """Custom dataset for vision-language pairs"""
    
    def __init__(self, data_dir, processor, max_length=512):
        self.data_dir = data_dir
        self.processor = processor
        self.max_length = max_length
        
        # Load dataset - assuming structure: images/ and prompts.json or similar
        self.data = self.load_data()
        
    def load_data(self):
        """Load image-text pairs from directory structure"""
        data = []
        
        # Method 1: JSON file with image-prompt pairs
        json_path = os.path.join(self.data_dir, 'data.json')
        if os.path.exists(json_path):
            with open(json_path, 'r') as f:
                data = json.load(f)
        
        # Method 2: Separate images and prompts directories
        elif os.path.exists(os.path.join(self.data_dir, 'images')) and \
             os.path.exists(os.path.join(self.data_dir, 'prompts.txt')):
            
            with open(os.path.join(self.data_dir, 'prompts.txt'), 'r') as f:
                prompts = f.readlines()
            
            image_dir = os.path.join(self.data_dir, 'images')
            image_files = sorted([f for f in os.listdir(image_dir) if f.lower().endswith(('.png', '.jpg', '.jpeg'))])
            
            for i, (img_file, prompt) in enumerate(zip(image_files, prompts)):
                data.append({
                    'image_path': os.path.join(image_dir, img_file),
                    'prompt': prompt.strip(),
                    'id': i
                })
        
        # Method 3: Auto-discover paired files
        else:
            for root, dirs, files in os.walk(self.data_dir):
                for file in files:
                    if file.lower().endswith(('.png', '.jpg', '.jpeg')):
                        img_path = os.path.join(root, file)
                        # Look for corresponding text file
                        txt_path = img_path.rsplit('.', 1)[0] + '.txt'
                        if os.path.exists(txt_path):
                            with open(txt_path, 'r') as f:
                                prompt = f.read().strip()
                            data.append({
                                'image_path': img_path,
                                'prompt': prompt,
                                'id': len(data)
                            })
        
        print(f"Loaded {len(data)} image-text pairs")
        return data
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        item = self.data[idx]
        
        # Load and process image
        try:
            image = Image.open(item['image_path']).convert('RGB')
        except Exception as e:
            print(f"Error loading image {item['image_path']}: {e}")
            # Return a blank image as fallback
            image = Image.new('RGB', (224, 224), color='white')
        
        # Process with Qwen processor
        inputs = self.processor(
            text=item['prompt'],
            images=image,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=self.max_length
        )
        
        # Flatten tensors
        for key in inputs:
            if inputs[key].dim() > 1:
                inputs[key] = inputs[key].squeeze(0)
        
        inputs['labels'] = inputs['input_ids'].clone()
        inputs['image_path'] = item['image_path']
        inputs['original_prompt'] = item['prompt']
        
        return inputs

class QwenVLTrainer:
    """Main trainer class for Qwen VL fine-tuning"""
    
    def __init__(self, model_name="Qwen/Qwen2-VL-2B-Instruct", data_dir="./data"):
        self.model_name = model_name
        self.data_dir = data_dir
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
        # Initialize model and processor
        self.model = None
        self.processor = None
        self.tokenizer = None
        self.load_model()
        
        # Training metrics
        self.train_losses = []
        self.eval_losses = []
        self.bleu_scores = []
        
    def load_model(self):
        """Load Qwen VL model and processor"""
        try:
            print("Loading Qwen VL model...")
            self.model = Qwen2VLForConditionalGeneration.from_pretrained(
                self.model_name,
                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                device_map="auto" if torch.cuda.is_available() else None
            )
            
            self.processor = AutoProcessor.from_pretrained(self.model_name)
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token
            
            print(f"Model loaded successfully on {self.device}")
            
        except Exception as e:
            print(f"Error loading model: {e}")
            print("Using a simpler configuration...")
            # Fallback configuration
            
    def prepare_datasets(self, train_ratio=0.8, val_ratio=0.1):
        """Prepare train, validation, and test datasets"""
        
        # Create full dataset
        full_dataset = VisionLanguageDataset(self.data_dir, self.processor)
        
        # Split dataset
        total_size = len(full_dataset)
        train_size = int(train_ratio * total_size)
        val_size = int(val_ratio * total_size)
        test_size = total_size - train_size - val_size
        
        train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
            full_dataset, [train_size, val_size, test_size]
        )
        
        print(f"Dataset splits - Train: {len(train_dataset)}, Val: {len(val_dataset)}, Test: {len(test_dataset)}")
        
        return train_dataset, val_dataset, test_dataset
    
    def train(self, train_dataset, val_dataset, output_dir="./qwen_vl_finetuned", 
              num_epochs=3, batch_size=4, learning_rate=2e-5):
        """Train the model"""
        
        # Data collator
        data_collator = DataCollatorForSeq2Seq(
            tokenizer=self.tokenizer,
            model=self.model,
            padding=True
        )
        
        # Training arguments
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=num_epochs,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            warmup_steps=100,
            weight_decay=0.01,
            learning_rate=learning_rate,
            logging_dir=f'{output_dir}/logs',
            logging_steps=10,
            evaluation_strategy="steps",
            eval_steps=100,
            save_steps=500,
            save_total_limit=3,
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            greater_is_better=False,
            report_to=None,  # Disable wandb/tensorboard
            dataloader_pin_memory=False,
            fp16=torch.cuda.is_available(),
        )
        
        # Initialize trainer
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=self.tokenizer,
            data_collator=data_collator,
        )
        
        # Train the model
        print("Starting training...")
        trainer.train()
        
        # Save the final model
        trainer.save_model()
        print(f"Model saved to {output_dir}")
        
        return trainer
    
    def generate_prediction(self, image_path, prompt="Describe this image:"):
        """Generate prediction for a single image"""
        try:
            image = Image.open(image_path).convert('RGB')
            
            # Process inputs
            inputs = self.processor(
                text=prompt,
                images=image,
                return_tensors="pt"
            ).to(self.device)
            
            # Generate response
            with torch.no_grad():
                generated_ids = self.model.generate(
                    **inputs,
                    max_new_tokens=100,
                    do_sample=True,
                    temperature=0.7,
                    pad_token_id=self.tokenizer.eos_token_id
                )
            
            # Decode response
            response = self.processor.batch_decode(
                generated_ids, 
                skip_special_tokens=True
            )[0]
            
            # Extract generated text (remove input prompt)
            if prompt in response:
                response = response.replace(prompt, "").strip()
            
            return response
            
        except Exception as e:
            print(f"Error generating prediction: {e}")
            return "Error generating prediction"
    
    def evaluate_model(self, test_dataset, num_samples=10):
        """Evaluate model performance on test dataset"""
        
        results = []
        
        print(f"Evaluating model on {min(num_samples, len(test_dataset))} samples...")
        
        for i in tqdm(range(min(num_samples, len(test_dataset)))):
            sample = test_dataset[i]
            
            if hasattr(sample, 'image_path'):
                image_path = sample.image_path
                original_prompt = sample.original_prompt
            else:
                # Handle dataset wrapper
                idx = test_dataset.indices[i] if hasattr(test_dataset, 'indices') else i
                data_item = test_dataset.dataset.data[idx] if hasattr(test_dataset, 'dataset') else test_dataset.data[idx]
                image_path = data_item['image_path']
                original_prompt = data_item['prompt']
            
            # Generate prediction
            predicted = self.generate_prediction(image_path, "Describe this image:")
            
            results.append({
                'image_path': image_path,
                'actual': original_prompt,
                'predicted': predicted,
                'image_id': i
            })
        
        return results
    
    def calculate_metrics(self, results):
        """Calculate evaluation metrics"""
        bleu_scores = []
        
        for result in results:
            actual = result['actual'].split()
            predicted = result['predicted'].split()
            
            if len(predicted) > 0 and len(actual) > 0:
                try:
                    bleu = bleu_score.sentence_bleu([actual], predicted)
                    bleu_scores.append(bleu)
                except:
                    bleu_scores.append(0.0)
            else:
                bleu_scores.append(0.0)
        
        avg_bleu = np.mean(bleu_scores)
        
        return {
            'average_bleu': avg_bleu,
            'bleu_scores': bleu_scores
        }
    
    def plot_results(self, results, metrics, save_path="evaluation_results.png"):
        """Plot evaluation results"""
        
        fig, axes = plt.subplots(2, 3, figsize=(18, 12))
        fig.suptitle('Qwen VL Model Evaluation Results', fontsize=16, fontweight='bold')
        
        # Plot 1: Sample predictions with images
        for i in range(min(4, len(results))):
            row = i // 2
            col = i % 2
            
            if row < 2 and col < 2:
                try:
                    img = Image.open(results[i]['image_path'])
                    axes[row, col].imshow(img)
                    axes[row, col].set_title(f"Sample {i+1}", fontweight='bold')
                    axes[row, col].axis('off')
                    
                    # Add text below image
                    actual_text = results[i]['actual'][:100] + "..." if len(results[i]['actual']) > 100 else results[i]['actual']
                    pred_text = results[i]['predicted'][:100] + "..." if len(results[i]['predicted']) > 100 else results[i]['predicted']
                    
                    axes[row, col].text(0, -50, f"Actual: {actual_text}", 
                                      transform=axes[row, col].transAxes, 
                                      fontsize=8, color='green', weight='bold')
                    axes[row, col].text(0, -80, f"Predicted: {pred_text}", 
                                      transform=axes[row, col].transAxes, 
                                      fontsize=8, color='blue')
                except Exception as e:
                    axes[row, col].text(0.5, 0.5, f"Error loading image {i+1}", 
                                      ha='center', va='center', transform=axes[row, col].transAxes)
                    axes[row, col].axis('off')
        
        # Plot 5: BLEU Score Distribution
        axes[1, 2].hist(metrics['bleu_scores'], bins=20, alpha=0.7, color='skyblue', edgecolor='black')
        axes[1, 2].set_title('BLEU Score Distribution')
        axes[1, 2].set_xlabel('BLEU Score')
        axes[1, 2].set_ylabel('Frequency')
        axes[1, 2].axvline(metrics['average_bleu'], color='red', linestyle='--', 
                          label=f'Avg: {metrics["average_bleu"]:.3f}')
        axes[1, 2].legend()
        
        # Plot 6: Performance Summary
        axes[0, 2].axis('off')
        summary_text = f"""
        Model Performance Summary
        ========================
        
        Total Samples Evaluated: {len(results)}
        Average BLEU Score: {metrics['average_bleu']:.4f}
        
        Best BLEU Score: {max(metrics['bleu_scores']):.4f}
        Worst BLEU Score: {min(metrics['bleu_scores']):.4f}
        
        Median BLEU Score: {np.median(metrics['bleu_scores']):.4f}
        Std Dev BLEU Score: {np.std(metrics['bleu_scores']):.4f}
        
        Model: {self.model_name}
        Device: {self.device}
        """
        
        axes[0, 2].text(0.05, 0.95, summary_text, transform=axes[0, 2].transAxes, 
                       fontsize=10, verticalalignment='top', fontfamily='monospace',
                       bbox=dict(boxstyle="round,pad=0.3", facecolor="lightgray"))
        
        plt.tight_layout()
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.show()
        
        print(f"Results plot saved to {save_path}")

# Main execution function
def main():
    """Main training and evaluation pipeline"""
    
    # Configuration
    DATA_DIR = "./data"  # Update this to your dataset directory
    MODEL_NAME = "Qwen/Qwen2-VL-2B-Instruct"
    OUTPUT_DIR = "./qwen_vl_finetuned"
    
    # Initialize trainer
    trainer = QwenVLTrainer(model_name=MODEL_NAME, data_dir=DATA_DIR)
    
    # Prepare datasets
    train_dataset, val_dataset, test_dataset = trainer.prepare_datasets()
    
    # Train the model
    print("Starting training process...")
    trained_model = trainer.train(
        train_dataset, 
        val_dataset, 
        output_dir=OUTPUT_DIR,
        num_epochs=3,
        batch_size=2,  # Reduce if you have memory issues
        learning_rate=2e-5
    )
    
    # Evaluate the model
    print("Evaluating model performance...")
    results = trainer.evaluate_model(test_dataset, num_samples=10)
    
    # Calculate metrics
    metrics = trainer.calculate_metrics(results)
    
    # Print detailed results
    print("\n" + "="*50)
    print("EVALUATION RESULTS")
    print("="*50)
    
    for i, result in enumerate(results):
        print(f"\nSample {i+1}:")
        print(f"Image: {os.path.basename(result['image_path'])}")
        print(f"Actual: {result['actual']}")
        print(f"Predicted: {result['predicted']}")
        print("-" * 30)
    
    print(f"\nOverall Performance:")
    print(f"Average BLEU Score: {metrics['average_bleu']:.4f}")
    
    # Plot results
    trainer.plot_results(results, metrics)
    
    return trainer, results, metrics

if __name__ == "__main__":
    # Print dataset info for Kaggle environment
    print("""
    Kaggle Dataset Structure Expected:
    
    /kaggle/input/vllm-dataset/
    ├── dataset.jsonl  # JSONL file with image-prompt pairs
    └── images/
        ├── aut-0003-00003809.jpg
        └── ... (other images)
    
    JSONL Format:
    Each line contains:
    {
      "root": {
        "image": "images/aut-0003-00003809.jpg",
        "conversations": [
          {"from": "human", "value": "<image>"},
          {"from": "gpt", "value": "The image depicts..."}
        ]
      }
    }
    
    The code will automatically load your Kaggle dataset.
    """)
    
    # Check if we're in Kaggle environment
    import os
    if os.path.exists('/kaggle/input'):
        print("✓ Kaggle environment detected")
        print("✓ Using Kaggle dataset path: /kaggle/input/vllm-dataset")
    else:
        print("⚠ Not in Kaggle environment - update DATA_DIR if needed")
    
    # Run the training pipeline
    trainer, results, metrics = main()

2025-09-11 12:49:29.763895: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757594970.064686      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757594970.170350      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


ImportError: cannot import name 'bleu_score' from 'sklearn.metrics' (/usr/local/lib/python3.11/dist-packages/sklearn/metrics/__init__.py)