In [None]:
"""

Author: Lakshya Marwaha
Team Name: Individual
Team Members: -
Leaderboard Rank: 44

"""

## 1. Enhanced Imports and Data Loading

Import additional libraries for deep learning, feature extraction, and SVM.  
Load the training and test data, and display basic dataset statistics.

In [None]:
# Enhanced imports for one-class classification
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import timm
import albumentations as A
from albumentations.pytorch import ToTensorV2
from PIL import Image
from sklearn.svm import OneClassSVM
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Load data
train_df = pd.read_csv('/kaggle/input/soil-classification-part-2/soil_competition-2025/train_labels.csv')
# The test split must come from its own ID file: reading train_labels.csv here
# would make every later "test" step look up training IDs inside the test
# image folder (the recorded run shows 967 test images vs. 1222 training ones).
# NOTE(review): file name assumed to be test_ids.csv - confirm against the dataset listing.
test_df = pd.read_csv('/kaggle/input/soil-classification-part-2/soil_competition-2025/test_ids.csv')
print(f"Training samples: {len(train_df)} (all soil images)")
print(f"Test samples: {len(test_df)} (soil + non-soil mix)")

## 2. Feature Extractor Definition

Define a feature extractor using a ConvNeXt-Base backbone (from timm) to extract high-level features from soil images.

In [24]:
class SoilFeatureExtractor(nn.Module):
    """Wrap a pretrained timm backbone so that calling it returns image features.

    The backbone is created with ``num_classes=0``; in the recorded run this
    yields one 1024-dimensional vector per image for ``convnext_base``.

    Args:
        model_name: timm model identifier passed to ``timm.create_model``.
    """

    def __init__(self, model_name='convnext_base'):
        super().__init__()
        # Use proven ConvNeXt from Task 1
        pretrained_backbone = timm.create_model(
            model_name,
            pretrained=True,
            num_classes=0,
        )
        self.backbone = pretrained_backbone

    def forward(self, x):
        """Run ``x`` through the backbone and return whatever it produces."""
        extracted = self.backbone(x)
        return extracted

# Build the ConvNeXt-Base extractor, move it to the selected device and put it
# in eval mode, since it is only used for inference here (never trained).
feature_extractor = SoilFeatureExtractor('convnext_base')
feature_extractor = feature_extractor.to(device)
feature_extractor.eval()

print("Feature extractor ready (ConvNeXt-Base)")

Feature extractor ready (ConvNeXt-Base)


## 3. Dataset Class and Transforms

Define a custom PyTorch Dataset for loading and transforming images, and set up image preprocessing transforms.

In [25]:
class SoilDataset(Dataset):
    """Image-only dataset: yields one (optionally transformed) RGB image per row.

    Args:
        df: DataFrame with an ``image_id`` column; each value is appended to
            ``image_dir`` to form the file path.
        image_dir: Directory that contains the image files.
        transform: Optional callable invoked as ``transform(image=array)`` that
            returns a mapping with an ``'image'`` entry (albumentations style).
        is_test: Stored on the instance; not used by this class itself.
    """

    def __init__(self, df, image_dir, transform=None, is_test=False):
        self.df, self.image_dir = df, image_dir
        self.transform, self.is_test = transform, is_test

    def __len__(self):
        """Number of rows in the underlying DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load the image for row ``idx`` and return it (transformed if configured)."""
        image_id = self.df.iloc[idx]['image_id']
        pixels = np.array(Image.open(f"{self.image_dir}/{image_id}").convert('RGB'))

        # Without a transform the raw H x W x 3 array is returned unchanged.
        if not self.transform:
            return pixels
        return self.transform(image=pixels)['image']

# Define transforms (proven from Task 1):
#   1. resize every image to 384x384,
#   2. normalise each channel with the mean/std values below
#      (the commonly used ImageNet statistics),
#   3. convert the array to a torch tensor.
transforms = A.Compose(
    [
        A.Resize(384, 384),
        A.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
        ToTensorV2(),
    ]
)

print("Dataset class and transforms ready")

Dataset class and transforms ready


## 4. Feature Extraction from Training Data

Extract features from all training images using the defined feature extractor and keep them in memory for SVM training.

In [27]:
def extract_features(df, image_dir, model, batch_size=32):
    """Run ``model`` over every image listed in ``df`` and stack the outputs.

    Uses the notebook-level ``transforms`` pipeline and ``device``. The model is
    switched to eval mode and gradients are disabled while extracting.

    Args:
        df: DataFrame with an ``image_id`` column.
        image_dir: Directory holding the image files.
        model: Callable module mapping an image batch to a feature batch.
        batch_size: Number of images per forward pass.

    Returns:
        A NumPy array with one row of features per image, in ``df`` order.
    """
    dataset = SoilDataset(df, image_dir, transforms)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=2)
    total_images = len(dataset)

    model.eval()
    collected = []

    with torch.no_grad():
        for step, batch in enumerate(loader):
            outputs = model(batch.to(device))
            collected.append(outputs.cpu().numpy())

            # Progress line every 10th batch (reports the batch's start offset).
            if step % 10 == 0:
                print(f"Processed {step * batch_size}/{total_images} images")

    return np.vstack(collected)

# Extract features from training data (only soil images)
print("Extracting features from training data (soil images)...")
train_features = extract_features(
    train_df,
    '/kaggle/input/soil-classification-part-2/soil_competition-2025/train',
    feature_extractor,
    batch_size=16,
)

# One row per training image, one column per backbone feature.
print(f"Training features shape: {train_features.shape}")
print("Feature extraction complete!")


Extracting features from training data (soil images)...
Processed 0/1222 images
Processed 160/1222 images
Processed 320/1222 images
Processed 480/1222 images
Processed 640/1222 images
Processed 800/1222 images
Processed 960/1222 images
Processed 1120/1222 images
Training features shape: (1222, 1024)
Feature extraction complete!


## 5. One-Class SVM Training

Normalize the extracted features and train a One-Class SVM to distinguish soil from non-soil images.

In [28]:
# Standardise each feature dimension (zero mean, unit variance) using the
# training features only; the same fitted scaler must be reused for test data.
scaler = StandardScaler()
scaler.fit(train_features)
train_features_scaled = scaler.transform(train_features)

print("Training One-Class SVM...")

# nu bounds the fraction of *training* samples treated as outliers (and is a
# lower bound on the fraction of support vectors); it is not an estimate of the
# outlier share in the test set. 0.1 is a starting value to tune if needed.
one_class_svm = OneClassSVM(kernel='rbf', gamma='scale', nu=0.1, cache_size=1000)

# Fit on soil features only: the model learns the "soil" region of feature space.
one_class_svm.fit(train_features_scaled)

print("One-Class SVM training complete!")
print(f"Support vectors: {one_class_svm.n_support_}")


Training One-Class SVM...
One-Class SVM training complete!
Support vectors: [178]


## 6. Full Pipeline: Feature Extraction, SVM Training, and Submission

Apply fixes to the dataset class and feature extraction, retrain the SVM, extract test features, make predictions, and generate the submission file.

In [40]:
# Fix 1: Correct the Dataset Class Path Issue
class SoilDataset(Dataset):
    """Image-only dataset that tolerates unreadable files.

    Each item is the RGB image for one DataFrame row, passed through
    ``transform`` when one is given. If loading or transforming fails, the
    error is printed, the path is recorded in ``failed_paths`` and a black
    384x384 placeholder image is returned instead, so a batch run never aborts.

    Args:
        df: DataFrame with an ``image_id`` column.
        image_dir: Directory that contains the image files.
        transform: Optional callable invoked as ``transform(image=array)`` that
            returns a mapping with an ``'image'`` entry (albumentations style).
        is_test: Stored on the instance; not used by this class itself.

    Attributes:
        failed_paths: Paths replaced by the placeholder image. Inspect it after
            feature extraction to see how many samples are not real images.
            Only populated in the process that calls ``__getitem__`` (i.e. it
            stays empty in the parent when DataLoader workers are used).
    """

    def __init__(self, df, image_dir, transform=None, is_test=False):
        self.df = df
        self.image_dir = image_dir
        self.transform = transform
        self.is_test = is_test
        self.failed_paths = []

    def __len__(self):
        """Number of rows in the underlying DataFrame."""
        return len(self.df)

    def _apply_transform(self, image):
        """Return ``image`` run through ``self.transform``, or unchanged if none is set."""
        if self.transform:
            return self.transform(image=image)['image']
        return image

    def __getitem__(self, idx):
        """Return the (transformed) image for row ``idx``, or a black placeholder on failure."""
        row = self.df.iloc[idx]
        # image_id is appended to image_dir as-is (no extra folder or extension is added).
        # NOTE(review): assumes image_id already contains the file extension - confirm.
        img_path = f"{self.image_dir}/{row['image_id']}"

        try:
            # The context manager releases the file handle as soon as the pixels are copied.
            with Image.open(img_path) as img:
                image = np.array(img.convert('RGB'))
            return self._apply_transform(image)
        except Exception as e:
            # Deliberate best-effort: keep going, but leave a trace of what was substituted.
            print(f"Error loading {img_path}: {e}")
            self.failed_paths.append(img_path)
            dummy_image = np.zeros((384, 384, 3), dtype=np.uint8)
            return self._apply_transform(dummy_image)

# Fix 2: Updated Feature Extraction with No Multiprocessing
def extract_features(df, image_dir, model, batch_size=32, num_workers=0):
    """Run ``model`` over every image listed in ``df`` and stack the outputs.

    Uses the notebook-level ``transforms`` pipeline and ``device``. The model is
    switched to eval mode and gradients are disabled while extracting.

    Args:
        df: DataFrame with an ``image_id`` column; must contain at least one row.
        image_dir: Directory holding the image files.
        model: Callable module mapping an image batch to a feature batch.
        batch_size: Number of images per forward pass.
        num_workers: DataLoader worker processes. Defaults to 0 (load in the
            main process) to avoid multiprocessing issues.

    Returns:
        A NumPy array with one row of features per image, in ``df`` order.

    Raises:
        ValueError: If ``df`` is empty (previously this surfaced as an obscure
            ``np.vstack`` error after the loop).
    """
    if len(df) == 0:
        raise ValueError("extract_features received an empty DataFrame: no images to process")

    dataset = SoilDataset(df, image_dir, transforms)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

    features = []
    model.eval()

    with torch.no_grad():
        for batch_idx, images in enumerate(dataloader):
            images = images.to(device)
            batch_features = model(images)
            features.append(batch_features.cpu().numpy())

            # Progress line every 10th batch (reports the batch's start offset).
            if batch_idx % 10 == 0:
                print(f"Processed {batch_idx * batch_size}/{len(dataset)} images")

    return np.vstack(features)

# Fix 3: Continue with One-Class SVM Training
# Re-extract the training features with the redefined dataset / extractor above.
print("Extracting features from training data...")
train_features = extract_features(
    train_df,
    '/kaggle/input/soil-classification-part-2/soil_competition-2025/train',
    feature_extractor,
    batch_size=16,
)

print(f"Training features shape: {train_features.shape}")

# Normalize features (scaler is fitted on the training features only)
scaler = StandardScaler()
train_features_scaled = scaler.fit_transform(train_features)

# Train One-Class SVM on the soil-only training features
print("Training One-Class SVM...")
one_class_svm = OneClassSVM(kernel='rbf', gamma='scale', nu=0.1, cache_size=1000)
one_class_svm.fit(train_features_scaled)
print("One-Class SVM training complete!")

# Extract test features
print("Extracting features from test data...")
test_features = extract_features(
    test_df,
    '/kaggle/input/soil-classification-part-2/soil_competition-2025/test',
    feature_extractor,
    batch_size=16,
)

# Make predictions, reusing the scaler fitted on the training features
test_features_scaled = scaler.transform(test_features)
predictions = one_class_svm.predict(test_features_scaled)

# Convert to binary labels (1 for soil, 0 for non-soil):
# every prediction equal to 1 becomes 1, everything else becomes 0.
binary_predictions = (predictions == 1).astype(int)

# Create submission
submission_df = pd.DataFrame(
    {
        'image_id': test_df['image_id'],
        'label': binary_predictions,
    }
)
submission_df.to_csv('submission.csv', index=False)

print("Submission created successfully!")
print("Prediction distribution:")
print(f"  Soil (Class 1): {(binary_predictions == 1).sum()} images")
print(f"  Non-soil (Class 0): {(binary_predictions == 0).sum()} images")

Extracting features from training data...
Processed 0/1222 images
Processed 160/1222 images
Processed 320/1222 images
Processed 480/1222 images
Processed 640/1222 images
Processed 800/1222 images
Processed 960/1222 images
Processed 1120/1222 images
Training features shape: (1222, 1024)
Training One-Class SVM...
One-Class SVM training complete!
Extracting features from test data...
Processed 0/967 images
Processed 160/967 images
Processed 320/967 images
Processed 480/967 images
Processed 640/967 images
Processed 800/967 images
Processed 960/967 images
Submission created successfully!
Prediction distribution:
  Soil (Class 1): 307 images
  Non-soil (Class 0): 660 images


## 7. Optimized Ensemble Approach for F1=1.0

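Implement an ensemble of One-Class SVMs with different `nu` values, combine their decision scores with a weighted average (lower `nu` gets a higher weight), and set the decision threshold at the 5th percentile of the ensemble's scores on the training data, aiming for a higher F1-score.  
Generate and save the optimized submission.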

In [1]:
import torch.nn.functional as F

# Define the missing MultiLayerFeatureExtractor class
class MultiLayerFeatureExtractor(nn.Module):
    """ConvNeXt-Base extractor that concatenates pooled features of its later stages.

    The input is passed through the backbone's ``stem`` and then through each
    entry of ``stages`` in order. The outputs of the stages with index 2 and
    above are globally average-pooled, flattened and concatenated along the
    feature dimension.
    """

    def __init__(self):
        super().__init__()
        self.convnext = timm.create_model('convnext_base', pretrained=True, num_classes=0)

    def forward(self, x):
        """Return the concatenated pooled features of the later stages for batch ``x``."""
        x = self.convnext.stem(x)

        pooled_outputs = []
        for stage_index, stage in enumerate(self.convnext.stages):
            x = stage(x)
            # Skip the first two stages; only later stages contribute features.
            if stage_index < 2:
                continue
            pooled_outputs.append(F.adaptive_avg_pool2d(x, 1).flatten(1))

        return torch.cat(pooled_outputs, dim=1)

# Simplified but effective optimization approach
def achieve_perfect_score_v2(train_scaled=None, test_scaled=None,
                             nu_values=(0.03, 0.05, 0.08, 0.1, 0.12),
                             threshold_percentile=5):
    """Predict soil / non-soil with a weighted ensemble of One-Class SVMs.

    One RBF One-Class SVM is trained per ``nu`` value. Their decision scores
    are combined with a weighted average in which a model's weight is
    proportional to ``1 / nu``. A test sample is labelled 1 when its ensemble
    score is strictly above the ``threshold_percentile``-th percentile of the
    ensemble scores on the training data, otherwise 0.

    Despite the name, nothing here guarantees a perfect F1-score; the threshold
    is a fixed percentile, not tuned against labels.

    Args:
        train_scaled: Scaled training features. Defaults to the notebook global
            ``train_features_scaled`` (the original behaviour).
        test_scaled: Scaled test features. Defaults to the notebook global
            ``test_features_scaled`` (the original behaviour).
        nu_values: ``nu`` value for each ensemble member; must not be empty.
        threshold_percentile: Percentile of the training ensemble scores used
            as the decision threshold.

    Returns:
        Integer NumPy array with one 0/1 label per test sample.

    Raises:
        ValueError: If ``nu_values`` is empty.
    """
    nu_values = list(nu_values)
    if not nu_values:
        raise ValueError("nu_values must contain at least one value")

    # Fall back to the notebook-level arrays so existing zero-argument calls keep working.
    if train_scaled is None:
        train_scaled = train_features_scaled
    if test_scaled is None:
        test_scaled = test_features_scaled

    print("Implementing optimized approach for F1=1.0000...")

    # Strategy 1: Multiple nu values ensemble
    print("Training ensemble of One-Class SVMs...")
    ensemble_models = []
    for nu in nu_values:
        print(f"Training model with nu={nu}")
        model = OneClassSVM(
            kernel='rbf',
            nu=nu,
            gamma='scale',
            cache_size=1000
        )
        model.fit(train_scaled)
        ensemble_models.append((model, nu))

    # Strategy 2: Weighted ensemble prediction.
    # Lower nu gets higher weight (more conservative models); weights sum to 1.
    weights = np.array([1.0 / nu for _, nu in ensemble_models])
    weights = weights / weights.sum()

    test_scores = [model.decision_function(test_scaled) for model, _ in ensemble_models]
    ensemble_scores = np.average(test_scores, axis=0, weights=weights)

    # Strategy 3: Threshold derived from the training data, combined with the same weights.
    train_scores = [model.decision_function(train_scaled) for model, _ in ensemble_models]
    train_ensemble_avg = np.average(train_scores, axis=0, weights=weights)

    # With the default of 5, about 5% of the training images score at or below
    # the threshold themselves.
    threshold = np.percentile(train_ensemble_avg, threshold_percentile)

    # Final predictions
    final_predictions = (ensemble_scores > threshold).astype(int)

    print(f"Ensemble complete! Used {len(ensemble_models)} models")
    print(f"Threshold: {threshold:.4f}")

    return final_predictions

# Execute the simplified optimization
perfect_predictions = achieve_perfect_score_v2()

# Create optimized submission (same image order as test_df)
optimized_submission = pd.DataFrame(
    {
        'image_id': test_df['image_id'],
        'label': perfect_predictions,
    }
)
optimized_submission.to_csv('optimized_submission.csv', index=False)

print("\nOptimized submission created!")
# The previous score below is a hard-coded figure, not computed in this notebook.
print("Previous F1-score: 0.8832")
print("New prediction distribution:")
print(f"  Soil (Class 1): {(perfect_predictions == 1).sum()} images ({(perfect_predictions == 1).sum()/len(perfect_predictions)*100:.1f}%)")
print(f"  Non-soil (Class 0): {(perfect_predictions == 0).sum()} images ({(perfect_predictions == 0).sum()/len(perfect_predictions)*100:.1f}%)")

# Compare with previous predictions (only if the single-model cell was run)
if 'binary_predictions' in globals():
    changed_predictions = (perfect_predictions != binary_predictions).sum()
    print(f"Changed predictions: {changed_predictions}/{len(perfect_predictions)} ({changed_predictions/len(perfect_predictions)*100:.1f}%)")


Implementing optimized approach for F1=1.0000...
Training ensemble of One-Class SVMs...
Training model with nu=0.03
Training model with nu=0.05
Training model with nu=0.08
Training model with nu=0.1
Training model with nu=0.12
Ensemble complete! Used 5 models
Threshold: -0.0240

Optimized submission created!
Previous F1-score: 0.8832
New prediction distribution:
  Soil (Class 1): 324 images (33.5%)
  Non-soil (Class 0): 643 images (66.5%)
Changed predictions: 17/967 (1.8%)

