<a href="https://colab.research.google.com/github/joaosMart/fish-species-class-siglip/blob/main/Code/species-classification/Learning_Curve_ResNet_50_fine_tuned.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ResNet-50 Learning Curve Analysis for Fish Classification

This notebook implements the learning curve experiments from:

"Temporal Aggregation of Vision-Language Features for High-Accuracy Fish Classification in Automated Monitoring"

The analysis trains ResNet-50 models with varying amounts of training data across multiple random seeds to evaluate model performance and data efficiency for classifying salmonid species.

## Requirements

Install required packages:

```bash
!pip install torch torchvision scikit-learn scipy matplotlib seaborn pandas numpy tqdm Pillow decord
```

### Dataset Structure
Place your dataset in the following structure:
```
dataset/
├── Bleikja/          # Arctic Char images
├── Lax/              # Atlantic Salmon images  
└── Urriði/           # Brown Trout images

```



## Configuration and Imports

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, Subset
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torchvision.models as models
from torchvision import transforms
from sklearn.metrics import accuracy_score, f1_score
import time
import copy
import pandas as pd
import seaborn as sns
from tqdm.auto import tqdm
import gc
import os
import random
from collections import namedtuple
from PIL import Image
from pathlib import Path
from scipy import stats


# Set style for plots: select the Matplotlib style sheet and the default
# seaborn colour palette used by every figure created later in the notebook.
# NOTE(review): the 'seaborn-v0_8' style name presumably requires a recent
# Matplotlib release -- confirm it exists in your environment.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

## Configuration

In [None]:

# Dataset configuration
# Root folder that contains one sub-directory per class.
DATASET_PATH = './dataset'  # Update this path to your dataset location
CLASS_NAMES = ['Bleikja', 'Lax', 'Urriði']  # Arctic Char, Atlantic Salmon, Brown/Sea Trout

# Training configuration
BATCH_SIZE = 128  # Adjust based on GPU memory
NUM_EPOCHS = 40  # Upper bound on epochs per training run
PATIENCE = 3  # Early-stopping patience (epochs without improvement)
LEARNING_RATE = 4.37e-04  # Optimized learning rate from paper
NUM_RUNS = 10  # Number of random seeds for statistical significance
BASE_SEED = 42  # Run k uses seed BASE_SEED + k

# Learning curve configuration
TRAIN_FRACTIONS = [0.05, 0.2, 0.4, 0.6, 0.8, 1.0]  # Fractions of training data to use

# Echo the effective settings so they are recorded in the cell output.
print(f"Configuration:")
print(f"Dataset path: {DATASET_PATH}")
print(f"Device: {'CUDA' if torch.cuda.is_available() else 'CPU'}")
print(f"Batch size: {BATCH_SIZE}")
print(f"Number of runs: {NUM_RUNS}")

## Utility Functions

In [None]:
def set_seed(seed):
    """Seed every random number generator the notebook relies on.

    Python's ``random``, NumPy and PyTorch are all seeded with ``seed``.
    When CUDA is available, all GPUs are seeded as well and cuDNN is put in
    deterministic mode with its benchmark autotuner switched off.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # Nothing GPU-specific to configure on CPU-only machines.
    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

def print_dataset_info(dataset_path, class_names=None):
    """Print a per-class image count for the dataset directory.

    Images are counted with a case-insensitive ``.jpg`` / ``.jpeg`` /
    ``.png`` suffix filter that skips hidden files, so the totals also cover
    upper-case and ``.jpeg`` files rather than only ``*.jpg`` and ``*.png``.

    Args:
        dataset_path: Root directory containing one sub-directory per class.
        class_names: Class sub-directory names to report on. Defaults to the
            module-level ``CLASS_NAMES``.

    Returns:
        ``True`` if at least one image was found, ``False`` if the dataset
        path does not exist or no images were found.
    """
    if class_names is None:
        class_names = CLASS_NAMES

    dataset_path = Path(dataset_path)
    if not dataset_path.exists():
        print(f"⚠️  Dataset path {dataset_path} not found!")
        print("Please update DATASET_PATH or ensure dataset is in correct location.")
        return False

    print(f"\n📁 Dataset Information:")
    print(f"Location: {dataset_path}")

    image_suffixes = ('.jpg', '.jpeg', '.png')
    total_images = 0
    for class_name in class_names:
        class_path = dataset_path / class_name
        if not class_path.is_dir():
            print(f"  ❌ {class_name}: Directory not found")
            continue

        # Lower-case the suffix so files such as IMG_01.JPG are counted too.
        image_count = sum(
            1 for f in class_path.iterdir()
            if f.is_file()
            and f.suffix.lower() in image_suffixes
            and not f.name.startswith('.')
        )
        print(f"  {class_name}: {image_count} images")
        total_images += image_count

    print(f"  Total: {total_images} images")
    return total_images > 0

# Check dataset: print the per-class image counts for DATASET_PATH and warn
# when print_dataset_info reports a missing path or zero images. This only
# prints a warning; it does not stop the notebook.
if not print_dataset_info(DATASET_PATH):
    print("\n⚠️  Please ensure your dataset is properly structured before continuing.")


## Dataset Classes

In [None]:
class FishFrameDataset(Dataset):
    """Image-folder dataset: one sub-directory of ``root_dir`` per class.

    Class names are the sorted, non-hidden sub-directory names; the label of
    a sample is the position of its class in that sorted list. Image files
    inside each class directory are indexed in sorted order so that sample
    indices (and therefore seeded random splits) do not depend on the order
    in which the filesystem happens to list files.

    Attributes set by ``__init__``:
        root_dir: ``Path`` of the dataset root.
        transform: Optional callable applied to each RGB ``PIL`` image.
        classes: Sorted list of class (directory) names.
        class_to_idx: Mapping from class name to integer label.
        samples: List of ``(image_path_str, class_name)`` tuples.
    """

    # File suffixes (compared lower-cased) that are treated as images.
    IMAGE_SUFFIXES = ('.jpg', '.jpeg', '.png')

    def __init__(self, root_dir, transform=None):
        self.root_dir = Path(root_dir)
        self.transform = transform

        print("📂 Scanning directories...")
        # Get all valid class directories
        self.classes = sorted(
            d.name for d in self.root_dir.iterdir()
            if d.is_dir() and not d.name.startswith('.')
        )
        self.class_to_idx = {cls_name: idx for idx, cls_name in enumerate(self.classes)}

        # Index dataset
        self.samples = []
        print("🔍 Indexing dataset...")
        for class_name in tqdm(self.classes, desc="Loading classes"):
            class_dir = self.root_dir / class_name
            # sorted(): directory listing order is filesystem dependent.
            image_files = sorted(
                f for f in class_dir.iterdir()
                if f.is_file()
                and f.suffix.lower() in self.IMAGE_SUFFIXES
                and not f.name.startswith('.')
            )
            self.samples.extend((str(img_path), class_name) for img_path in image_files)

        print(f"✅ Found {len(self.samples)} images across {len(self.classes)} classes")

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(image, label_tensor)`` for sample ``idx``.

        If the image cannot be decoded, the error is printed and the next
        sample (wrapping around) is returned instead. Each sample is tried at
        most once, so a dataset of unreadable files fails instead of
        recursing forever.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
            RuntimeError: If no sample in the dataset could be decoded.
        """
        n_samples = len(self.samples)
        position = int(idx)
        if not -n_samples <= position < n_samples:
            raise IndexError(
                f"index {position} is out of range for dataset of size {n_samples}"
            )

        for offset in range(n_samples):
            img_path, class_name = self.samples[(position + offset) % n_samples]
            try:
                with Image.open(img_path) as image:
                    image = image.convert('RGB')
            except Exception as e:
                # Best effort: skip files that cannot be decoded.
                print(f"❌ Error reading image {img_path}: {str(e)}")
                continue

            # Transform errors are programming errors, not bad files, so they
            # are deliberately raised rather than skipped.
            if self.transform:
                image = self.transform(image)

            return image, torch.tensor(self.class_to_idx[class_name])

        raise RuntimeError(f"No readable image found under {self.root_dir}")

class TransformSubset(Dataset):
    """View onto selected items of another dataset, with its own transform.

    ``indices`` picks which items of ``dataset`` are exposed (in that order);
    ``transform`` is applied to the image part of every item fetched.
    """

    def __init__(self, dataset, indices, transform):
        self.dataset, self.indices, self.transform = dataset, indices, transform

    def __len__(self):
        return len(self.indices)

    def __getitem__(self, idx):
        # Translate the subset position into an index of the wrapped dataset.
        sample, target = self.dataset[self.indices[idx]]
        return (self.transform(sample) if self.transform else sample), target


## Model Architecture

In [None]:
class Bottleneck(nn.Module):
    """Residual unit built from a 1x1, a 3x3 and a 1x1 convolution.

    The unit outputs ``expansion * out_channels`` feature maps. Any spatial
    stride is applied by the 3x3 convolution. When ``downsample`` is true the
    skip connection goes through a strided 1x1 convolution plus batch norm so
    that it matches the main path; otherwise the input is added unchanged.
    """

    expansion = 4

    def __init__(self, in_channels, out_channels, stride=1, downsample=False):
        super().__init__()
        wide_channels = self.expansion * out_channels

        # 1x1 convolution to the narrow width.
        self.conv1 = nn.Conv2d(in_channels, out_channels,
                               kernel_size=1, stride=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)

        # 3x3 convolution; this is the layer that carries the stride.
        self.conv2 = nn.Conv2d(out_channels, out_channels,
                               kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)

        # 1x1 convolution up to the expanded width.
        self.conv3 = nn.Conv2d(out_channels, wide_channels,
                               kernel_size=1, stride=1, bias=False)
        self.bn3 = nn.BatchNorm2d(wide_channels)

        self.relu = nn.ReLU(inplace=True)

        # Optional projection for the skip connection.
        self.downsample = (
            nn.Sequential(
                nn.Conv2d(in_channels, wide_channels,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(wide_channels),
            )
            if downsample
            else None
        )

    def forward(self, x):
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        shortcut = x if self.downsample is None else self.downsample(x)

        out += shortcut
        return self.relu(out)

class ResNet(nn.Module):
    """Configurable ResNet whose forward pass returns logits and features.

    ``config`` unpacks to ``(block, n_blocks, channels)``: the residual block
    class, the number of blocks in each of the four stages, and the base
    channel width of each stage. ``forward`` returns ``(logits, features)``
    where ``features`` is the flattened pooled activation fed to ``fc``.
    """

    def __init__(self, config, output_dim):
        super().__init__()

        block, n_blocks, channels = config
        # Running input width; updated by get_resnet_layer after every stage.
        self.in_channels = channels[0]

        assert len(n_blocks) == len(channels) == 4

        # Stem: 7x7 strided convolution followed by 3x3 max pooling.
        self.conv1 = nn.Conv2d(3, self.in_channels,
                               kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(self.in_channels)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Four residual stages; every stage after the first uses stride 2.
        stage_strides = (1, 2, 2, 2)
        for number, (depth, width, stride) in enumerate(
                zip(n_blocks, channels, stage_strides), start=1):
            stage = self.get_resnet_layer(block, depth, width, stride=stride)
            setattr(self, f'layer{number}', stage)

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(self.in_channels, output_dim)

    def get_resnet_layer(self, block, n_blocks, channels, stride=1):
        """Build one stage of ``n_blocks`` blocks and advance ``in_channels``."""
        stage_width = block.expansion * channels
        # The first block projects its skip path whenever the width changes.
        needs_projection = self.in_channels != stage_width

        blocks = [block(self.in_channels, channels, stride, needs_projection)]
        blocks.extend(block(stage_width, channels) for _ in range(n_blocks - 1))

        self.in_channels = stage_width
        return nn.Sequential(*blocks)

    def forward(self, x):
        x = self.maxpool(self.relu(self.bn1(self.conv1(x))))

        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)

        pooled = self.avgpool(x)
        h = pooled.view(pooled.shape[0], -1)
        return self.fc(h), h


## Training Functions

In [None]:
class EarlyStopping:
    """Track a validation loss and signal when training should stop.

    Call the instance once per epoch with the current validation loss. A
    checkpoint of ``model.state_dict()`` is written whenever the loss is not
    worse than ``best_loss + min_delta``; otherwise an internal counter is
    incremented and ``early_stop`` becomes ``True`` once it reaches
    ``patience``.

    Args:
        patience: Number of consecutive non-improving calls before stopping.
        min_delta: Slack added to the best loss before a call counts as
            non-improving. Note that a loss up to ``min_delta`` *above* the
            best one still resets the counter and replaces the checkpoint.
        verbose: Print counter updates and checkpoint messages.
    """

    def __init__(self, patience=7, min_delta=0, verbose=False):
        self.patience = patience
        self.min_delta = min_delta
        self.verbose = verbose
        self.counter = 0
        self.best_loss = None
        self.early_stop = False
        # float('inf') instead of np.Inf: that alias was removed in NumPy 2.0.
        self.val_loss_min = float('inf')

    def __call__(self, val_loss, model, path='checkpoint.pt'):
        """Record ``val_loss``; save ``model`` to ``path`` unless it got worse."""
        if self.best_loss is None:
            # First call: always take the loss as the baseline.
            self.best_loss = val_loss
            self.save_checkpoint(val_loss, model, path)
        elif val_loss > self.best_loss + self.min_delta:
            self.counter += 1
            if self.verbose:
                print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_loss = val_loss
            self.save_checkpoint(val_loss, model, path)
            self.counter = 0

    def save_checkpoint(self, val_loss, model, path):
        """Write ``model.state_dict()`` to ``path`` and remember ``val_loss``."""
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}). Saving model...')
        torch.save(model.state_dict(), path)
        self.val_loss_min = val_loss

def train_model_with_early_stopping(model, train_loader, criterion, optimizer, scheduler,
                                   device, num_epochs=20, patience=5, verbose=True):
    """Fit ``model`` and stop once the training loss stops improving.

    Each epoch runs one pass over ``train_loader``; the mean per-batch
    training loss is given to ``scheduler.step`` and is also the quantity
    monitored for early stopping. Training ends after ``num_epochs`` epochs
    or after ``patience`` consecutive epochs without a strictly lower loss.
    ``model`` must return a ``(logits, features)`` pair.

    Returns:
        The same ``model`` object, loaded with the weights of the epoch that
        had the lowest training loss.
    """
    best_state = copy.deepcopy(model.state_dict())
    lowest_loss = float('inf')
    stale_epochs = 0

    for epoch_idx in range(num_epochs):
        epoch_label = f"Epoch {epoch_idx+1}/{num_epochs}"
        model.train()
        loss_sum, n_correct, n_seen = 0.0, 0, 0

        # Train for one epoch
        batches = tqdm(train_loader, desc=epoch_label, disable=not verbose)
        for inputs, labels in batches:
            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            logits, _ = model(inputs)
            batch_loss = criterion(logits, labels)
            batch_loss.backward()
            optimizer.step()

            loss_value = batch_loss.item()
            loss_sum += loss_value
            predictions = logits.max(1)[1]
            n_seen += labels.size(0)
            n_correct += predictions.eq(labels).sum().item()

            batches.set_postfix({'loss': loss_value, 'acc': 100.*n_correct/n_seen})

        epoch_loss = loss_sum / len(train_loader)
        epoch_acc = n_correct / n_seen

        if verbose:
            print(f"{epoch_label} - Loss: {epoch_loss:.4f}, Acc: {epoch_acc*100:.2f}%")

        # Update learning rate scheduler
        scheduler.step(epoch_loss)

        # Early stopping check: a strictly lower loss resets the counter.
        if epoch_loss < lowest_loss:
            lowest_loss = epoch_loss
            best_state = copy.deepcopy(model.state_dict())
            stale_epochs = 0
            continue

        stale_epochs += 1
        if stale_epochs >= patience:
            if verbose:
                print(f"Early stopping triggered at epoch {epoch_idx+1}")
            break

    # Load best model weights
    model.load_state_dict(best_state)
    return model

def evaluate_model(model, data_loader, device, metrics):
    """Run ``model`` over ``data_loader`` and compute the requested metrics.

    The model is switched to eval mode and must return ``(logits, features)``.
    Supported entries of ``metrics`` are ``'accuracy'``, ``'f1_weighted'``
    and ``'f1_macro'``; any other entry is ignored. Returns a dict that maps
    each supported, requested metric name to its value.
    """
    model.eval()
    predictions, targets = [], []

    with torch.no_grad():
        for inputs, labels in data_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            logits, _ = model(inputs)

            predictions.extend(logits.max(1)[1].cpu().numpy())
            targets.extend(labels.cpu().numpy())

    # Calculate metrics; the scorers are wrapped in lambdas so that only the
    # requested ones are evaluated.
    scorers = (
        ('accuracy', lambda: accuracy_score(targets, predictions)),
        ('f1_weighted', lambda: f1_score(targets, predictions, average='weighted')),
        ('f1_macro', lambda: f1_score(targets, predictions, average='macro')),
    )
    return {name: compute() for name, compute in scorers if name in metrics}

## Learning Curve Experiment

In [None]:
def run_learning_curve_experiment(dataset_path=DATASET_PATH,
                                 train_sizes=None,
                                 num_runs=NUM_RUNS,
                                 batch_size=BATCH_SIZE,
                                 num_epochs=NUM_EPOCHS,
                                 patience=PATIENCE,
                                 metrics=None,
                                 class_names=CLASS_NAMES,
                                 base_seed=BASE_SEED):
    """Run learning curve experiment across multiple random seeds.

    For every run a fresh random 80/20 train/test split is drawn. For every
    requested training-set size, a random subset of the training part is used
    to fine-tune an ImageNet-pretrained ResNet-50, which is then evaluated on
    the test part. All results are written to ``learning_curve_results.csv``.

    Args:
        dataset_path: Root directory with one sub-directory per class.
        train_sizes: ``None`` (use ``TRAIN_FRACTIONS``), a list of floats in
            ``(0, 1]`` interpreted as fractions of the training part, or a
            list of absolute sample counts. Counts are clamped to
            ``[1, size of the training part]``.
        num_runs: Number of repetitions; run ``k`` uses seed ``base_seed + k``.
        batch_size: Batch size for the train and test loaders.
        num_epochs: Maximum number of epochs per training.
        patience: Early-stopping patience in epochs.
        metrics: Metric names understood by ``evaluate_model``. Defaults to
            ``['accuracy', 'f1_weighted', 'f1_macro']``.
        class_names: Class names; their number sets the model's output size.
        base_seed: Seed of the first run.

    Returns:
        ``(train_sizes_abs, results_df)``: the absolute training sizes used
        and a DataFrame with one row per (run, training size).

    Raises:
        ValueError: If the dataset is too small to split, or it contains
            more class directories than ``class_names`` has entries.
    """
    # Avoid a shared mutable default argument.
    if metrics is None:
        metrics = ['accuracy', 'f1_weighted', 'f1_macro']

    print(f"🚀 Starting learning curve experiment with {num_runs} runs...")
    print(f"📊 Metrics: {', '.join(metrics)}")
    print(f"🏷️  Classes: {', '.join(class_names)}")

    # Storage for results across all runs
    all_results = []

    # Define transforms
    pretrained_size = 224
    pretrained_means = [0.485, 0.456, 0.406]
    pretrained_stds = [0.229, 0.224, 0.225]

    train_transforms = transforms.Compose([
        transforms.Resize((pretrained_size, pretrained_size)),
        transforms.RandomRotation(5),
        transforms.RandomHorizontalFlip(0.5),
        transforms.RandomCrop(pretrained_size, padding=10),
        transforms.ToTensor(),
        transforms.Normalize(mean=pretrained_means, std=pretrained_stds)
    ])

    eval_transforms = transforms.Compose([
        transforms.Resize((pretrained_size, pretrained_size)),
        transforms.ToTensor(),
        transforms.Normalize(mean=pretrained_means, std=pretrained_stds)
    ])

    # Create base dataset
    print(f"📂 Loading dataset from {dataset_path}...")
    base_dataset = FishFrameDataset(root_dir=dataset_path)

    # Calculate split sizes
    total_size = len(base_dataset)
    train_val_size = int(0.8 * total_size)  # 80% for train+val
    if train_val_size < 1:
        raise ValueError(
            f"Dataset at {dataset_path} has {total_size} image(s); "
            "at least 2 are required for a train/test split."
        )

    # Convert train_sizes to absolute numbers if needed
    if train_sizes is None:
        train_fractions = TRAIN_FRACTIONS
        train_sizes_abs = [int(frac * train_val_size) for frac in train_fractions]
    elif all(isinstance(size, float) for size in train_sizes) and all(0 < size <= 1 for size in train_sizes):
        train_sizes_abs = [int(frac * train_val_size) for frac in train_sizes]
    else:
        train_sizes_abs = train_sizes

    # Clamp so that the reported size always equals the number of samples
    # actually used, and so tiny fractions never yield an empty training set.
    train_sizes_abs = [max(1, min(int(n), train_val_size)) for n in train_sizes_abs]

    # Model configuration
    ResNetConfig = namedtuple('ResNetConfig', ['block', 'n_blocks', 'channels'])
    resnet50_config = ResNetConfig(
        block=Bottleneck,
        n_blocks=[3, 4, 6, 3],
        channels=[64, 128, 256, 512]
    )

    OUTPUT_DIM = len(class_names)
    if len(base_dataset.classes) > OUTPUT_DIM:
        raise ValueError(
            f"Dataset contains {len(base_dataset.classes)} class directories "
            f"({', '.join(base_dataset.classes)}) but only {OUTPUT_DIM} class names were given."
        )

    # Class weights for balanced loss: inverse class frequency, counted from
    # the dataset that was actually loaded instead of hard-coded counts.
    label_ids = torch.tensor(
        [base_dataset.class_to_idx[name] for _, name in base_dataset.samples],
        dtype=torch.long
    )
    class_counts = torch.bincount(label_ids, minlength=OUTPUT_DIM).float()
    # clamp: a class without samples must not produce an infinite weight.
    class_weights = 1.0 / class_counts.clamp(min=1.0)
    class_weights = class_weights / class_weights.sum()

    # Device setup (identical for every run, so done once)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    class_weights_device = class_weights.to(device)

    # Run experiments across multiple seeds
    for run in range(num_runs):
        print(f"\n{'='*60}")
        print(f"🔄 Starting run {run+1}/{num_runs}")

        # Set seed for this run
        current_seed = base_seed + run
        set_seed(current_seed)
        print(f"🎲 Using seed: {current_seed}")

        # Create train/test split for this run
        indices = torch.randperm(total_size)
        train_val_indices = indices[:train_val_size]
        test_indices = indices[train_val_size:]

        # Create test dataset
        test_dataset = TransformSubset(base_dataset, test_indices, eval_transforms)
        test_loader = DataLoader(
            test_dataset,
            batch_size=batch_size,
            shuffle=False,
            num_workers=0,
            pin_memory=torch.cuda.is_available()
        )

        # For each training size in this run
        for n_train_samples in train_sizes_abs:
            print(f"\n📈 Training with {n_train_samples} samples ({n_train_samples/train_val_size*100:.1f}% of training data)")

            # Create subset for training (a fresh random draw for every size)
            shuffled_train_indices = train_val_indices[torch.randperm(len(train_val_indices))]
            train_subset_indices = shuffled_train_indices[:n_train_samples]
            train_subset = TransformSubset(base_dataset, train_subset_indices, train_transforms)

            # Create DataLoader
            train_loader = DataLoader(
                train_subset,
                batch_size=batch_size,
                shuffle=True,
                num_workers=0,
                pin_memory=torch.cuda.is_available()
            )

            # Initialize model with pre-trained weights
            pretrained_model = models.resnet50(weights='DEFAULT')
            IN_FEATURES = pretrained_model.fc.in_features
            fc = nn.Linear(IN_FEATURES, OUTPUT_DIM)
            pretrained_model.fc = fc

            # Initialize custom ResNet model; it shares parameter names with
            # torchvision's ResNet-50, so the state dict loads directly.
            model = ResNet(resnet50_config, OUTPUT_DIM)
            model.load_state_dict(pretrained_model.state_dict())
            model = model.to(device)

            # Layer-wise learning rates (as in paper)
            params = [
                {'params': model.conv1.parameters(), 'lr': LEARNING_RATE / 10},
                {'params': model.bn1.parameters(), 'lr': LEARNING_RATE / 10},
                {'params': model.layer1.parameters(), 'lr': LEARNING_RATE / 8},
                {'params': model.layer2.parameters(), 'lr': LEARNING_RATE / 6},
                {'params': model.layer3.parameters(), 'lr': LEARNING_RATE / 4},
                {'params': model.layer4.parameters(), 'lr': LEARNING_RATE / 2},
                {'params': model.fc.parameters()}
            ]

            # Loss function with class weights
            criterion = nn.CrossEntropyLoss(weight=class_weights_device)

            # Optimizer and scheduler. The deprecated `verbose` argument is
            # intentionally not passed: recent PyTorch releases reject it.
            optimizer = optim.Adam(params, lr=LEARNING_RATE)
            scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3)

            # Train the model
            start_time = time.time()
            model = train_model_with_early_stopping(
                model=model,
                train_loader=train_loader,
                criterion=criterion,
                optimizer=optimizer,
                scheduler=scheduler,
                device=device,
                num_epochs=num_epochs,
                patience=patience,
                verbose=True
            )
            end_time = time.time()
            training_time = end_time - start_time

            # Evaluate on test data
            test_metrics = evaluate_model(model, test_loader, device, metrics)

            # Store results
            result = {
                'run': run,
                'seed': current_seed,
                'train_size': n_train_samples,
                'training_time': training_time
            }
            for metric in metrics:
                result[f'test_{metric}'] = test_metrics[metric]

            all_results.append(result)

            # Print results for this training size
            print(f"⏱️  Training completed in {training_time:.1f} seconds")
            print("📊 Test metrics:")
            for metric in metrics:
                print(f"   {metric}: {test_metrics[metric]:.4f}")

            # Clean up GPU memory
            del model, optimizer, scheduler
            torch.cuda.empty_cache()
            gc.collect()
            time.sleep(1)

    # Convert results to DataFrame
    results_df = pd.DataFrame(all_results)

    # Save results to CSV
    results_df.to_csv('learning_curve_results.csv', index=False)
    print(f"\n💾 Results saved to 'learning_curve_results.csv'")

    return train_sizes_abs, results_df

## Visualization Functions

In [None]:

def _learning_curve_statistics(results_df, train_sizes, metrics):
    """Return per-training-size mean, std and 95% CI half-width of each metric."""
    rows = []
    # dict.fromkeys drops duplicate sizes while keeping their order.
    for train_size in dict.fromkeys(train_sizes):
        size_data = results_df[results_df['train_size'] == train_size]
        n = len(size_data)
        row = {'train_size': train_size}

        for metric in metrics:
            test_values = size_data[f'test_{metric}']
            std = test_values.std()
            row[f'test_{metric}_mean'] = test_values.mean()
            row[f'test_{metric}_std'] = std
            # 95% confidence interval from the t distribution. It is undefined
            # for fewer than two runs; use 0 so the curve can still be drawn.
            if n > 1:
                row[f'test_{metric}_ci95'] = stats.t.ppf(0.975, n - 1) * std / np.sqrt(n)
            else:
                row[f'test_{metric}_ci95'] = 0.0

        rows.append(row)

    # Build the frame once instead of concatenating row by row.
    return pd.DataFrame(rows).sort_values('train_size')


def plot_learning_curve_with_confidence(results_df, train_sizes, metrics, class_names, figsize=(14, 12)):
    """Plot learning curve with confidence intervals from multiple runs.

    One subplot is drawn per metric, showing the mean test score against the
    training-set size with a shaded 95% confidence band. The figure is saved
    to ``learning_curve_resnet50.png`` and shown, and a text summary
    comparing the smallest and largest training size is printed.

    Args:
        results_df: DataFrame with a ``train_size`` column and one
            ``test_<metric>`` column per entry of ``metrics``.
        train_sizes: Training sizes to include in the plot.
        metrics: Metric names to plot.
        class_names: Class names shown in the figure title.
        figsize: Matplotlib figure size.
    """
    # Calculate statistics for each training size (sorted by train_size)
    stats_df = _learning_curve_statistics(results_df, train_sizes, metrics)

    # Create figure
    fig, axes = plt.subplots(len(metrics), 1, figsize=figsize, sharex=True)
    if len(metrics) == 1:
        axes = [axes]

    # Color palette
    colors = sns.color_palette("viridis", len(metrics))

    # Plot each metric
    for i, metric in enumerate(metrics):
        ax = axes[i]

        # Get data
        test_mean = stats_df[f'test_{metric}_mean']
        test_ci95 = stats_df[f'test_{metric}_ci95']

        # Plot curve
        ax.plot(stats_df['train_size'], test_mean, 'o-', color=colors[i],
                linewidth=2, markersize=8, label=f'Test {metric}')

        # Plot confidence interval
        ax.fill_between(stats_df['train_size'],
                       test_mean - test_ci95,
                       test_mean + test_ci95,
                       alpha=0.3, color=colors[i],
                       label='95% CI')

        # Formatting
        metric_name = metric.replace('_', ' ').title()
        ax.set_title(f'{metric_name} vs Training Size', fontsize=14, fontweight='bold')
        ax.set_ylabel(f'{metric_name}', fontsize=12)
        ax.grid(True, alpha=0.3)
        ax.legend(loc='best')

        # Set y-axis limits for better visualization. The lower bound is 0
        # (not 0.5) so that scores below 0.5 are not cut off the plot.
        y_min = max(0.0, test_mean.min() - test_ci95.max() - 0.05)
        y_max = min(1.0, test_mean.max() + test_ci95.max() + 0.05)
        # Matplotlib rejects NaN/inf limits; keep autoscaling in that case.
        if np.isfinite(y_min) and np.isfinite(y_max) and y_min < y_max:
            ax.set_ylim(y_min, y_max)

    # Format x-axis
    axes[-1].set_xlabel('Number of Training Examples', fontsize=12)

    # Overall title
    fig.suptitle(f'Learning Curves for Fish Classification\n({", ".join(class_names)})',
                fontsize=16, fontweight='bold', y=0.98)

    plt.tight_layout()
    plt.subplots_adjust(top=0.93)
    plt.savefig('learning_curve_resnet50.png', dpi=300, bbox_inches='tight')
    plt.show()

    # Print summary statistics
    print("\n📈 Learning Curve Summary:")
    print("="*50)
    smallest = stats_df.iloc[0]
    largest = stats_df.iloc[-1]
    for metric in metrics:
        print(f"\n{metric.replace('_', ' ').title()}:")
        baseline = smallest[f'test_{metric}_mean']
        improvement = largest[f'test_{metric}_mean'] - baseline
        # A zero baseline has no meaningful relative improvement.
        improvement_pct = improvement / baseline * 100 if baseline != 0 else float('nan')

        print(f"  Smallest dataset: {baseline:.4f}")
        print(f"  Largest dataset:  {largest[f'test_{metric}_mean']:.4f}")
        print(f"  Improvement:      {improvement:.4f} ({improvement_pct:.1f}%)")

def print_final_summary(results_df, metrics, class_names):
    """Print final summary of results."""
    print("\n🎯 Final Results Summary")
    print("="*50)

    # Overall performance
    print(f"\nAverage performance across all runs:")
    for metric in metrics:
        mean_val = results_df[f'test_{metric}'].mean()
        std_val = results_df[f'test_{metric}'].std()
        print(f"  {metric.replace('_', ' ').title()}: {mean_val:.4f} ± {std_val:.4f}")

    # Best performance
    print(f"\nBest performance (single run):")
    for metric in metrics:
        best_val = results_df[f'test_{metric}'].max()
        best_run = results_df.loc[results_df[f'test_{metric}'].idxmax()]
        print(f"  {metric.replace('_', ' ').title()}: {best_val:.4f} (Run {best_run['run']+1}, Seed {best_run['seed']})")

    # Data efficiency analysis
    print(f"\n💡 Data Efficiency Insights:")
    largest_size = results_df['train_size'].max()
    largest_results = results_df[results_df['train_size'] == largest_size]

    for metric in metrics:
        # Find minimum data needed to reach 95% of best performance
        best_score = largest_results[f'test_{metric}'].mean()
        target_score = 0.95 * best_score

        efficient_size = None
        for size in sorted(results_df['train_size'].