In [None]:
# 🧪 Soil Image Classification Challenge - Google Colab Notebook

## 📦 Step 1: Upload Dataset (zip file)
from google.colab import files
import os
import zipfile

# Open the Colab file picker; the returned mapping is kept in `uploaded`
# but the archive is read back from disk under its expected fixed name.
uploaded = files.upload()

# Unpack the competition archive into the current working directory.
with zipfile.ZipFile("soil_classification-2025.zip", mode='r') as archive:
    archive.extractall(path=".")

# Show the top-level contents of the extracted dataset folder.
print("Extracted files:")
extracted_entries = os.listdir("soil_classification-2025")
print(extracted_entries)

## 🧰 Step 2: Install Required Libraries
# IPython/Colab shell escape (not plain Python syntax): run pip in quiet mode
# to install the efficientnet_pytorch and scikit-learn packages that the
# imports in Step 3 rely on.
!pip install -q efficientnet_pytorch scikit-learn

## ⚙️ Step 3: Import Libraries
import os
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from PIL import Image
import torch.nn as nn
import torch.optim as optim
from efficientnet_pytorch import EfficientNet
from tqdm import tqdm

## 📁 Step 4: Set Paths (Updated for your ZIP structure)
# Locations inside the extracted archive. Note that `submission_file` is the
# CSV that is read back at inference time to enumerate the test images.
train_csv = 'soil_classification-2025/train_labels.csv'
train_dir = 'soil_classification-2025/train'
test_dir = 'soil_classification-2025/test'
submission_file = 'soil_classification-2025/test_ids.csv'

## ⚙️ Step 5: Define Constants and Helpers
IMG_SIZE, BATCH_SIZE = 224, 32
EPOCHS = 10
SEED = 42

# Prefer the GPU when one is visible to PyTorch, otherwise fall back to CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')
torch.manual_seed(SEED)

# Class names in index order; both lookup tables are derived from this tuple
# so they cannot drift apart.
_SOIL_CLASSES = ('Alluvial soil', 'Black Soil', 'Clay soil', 'Red soil')
label2idx = {name: position for position, name in enumerate(_SOIL_CLASSES)}
idx2label = dict(enumerate(_SOIL_CLASSES))

class SoilDataset(Dataset):
    """Map-style dataset of soil images listed in a DataFrame.

    Every row of ``df`` must carry an ``image_id`` column naming a file
    inside ``img_dir``. In training mode a ``soil_type`` column is read as
    well and translated to an integer through the module-level ``label2idx``
    table.

    Args:
        df: Table of samples; rows are accessed positionally via ``.iloc``.
        img_dir: Directory that contains the image files.
        transform: Optional callable applied to the loaded RGB PIL image.
        train: When true, items are ``(image, label_index)``; otherwise
            they are ``(image, image_id)``.
    """

    def __init__(self, df, img_dir, transform=None, train=True):
        self.df = df
        self.img_dir = img_dir
        self.transform = transform
        self.train = train

    def __len__(self):
        """Return the number of rows (samples) in the backing table."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load, convert and transform the sample at position ``idx``.

        Raises:
            KeyError: In training mode, if the row's ``soil_type`` is not a
                key of ``label2idx``.
        """
        row = self.df.iloc[idx]
        image_path = os.path.join(self.img_dir, row['image_id'])
        # Image.open is lazy and keeps the file open; the with-block closes
        # the handle deterministically once convert() has produced its own
        # in-memory copy, instead of leaving that to garbage collection.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")
        if self.transform:
            image = self.transform(image)
        if self.train:
            label = label2idx[row['soil_type']]
            return image, label
        return image, row['image_id']

# Per-channel mean/std used for normalisation in both pipelines (these are
# the widely published ImageNet statistics).
_NORM_MEAN = [0.485, 0.456, 0.406]
_NORM_STD = [0.229, 0.224, 0.225]
_TARGET_SIZE = (IMG_SIZE, IMG_SIZE)

# Training pipeline: resize, then random flip / rotation / colour jitter
# before tensor conversion and normalisation.
_train_steps = [
    transforms.Resize(_TARGET_SIZE),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    transforms.ToTensor(),
    transforms.Normalize(_NORM_MEAN, _NORM_STD),
]
train_transforms = transforms.Compose(_train_steps)

# Evaluation pipeline: the same resize and normalisation, with no random
# augmentation.
_test_steps = [
    transforms.Resize(_TARGET_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(_NORM_MEAN, _NORM_STD),
]
test_transforms = transforms.Compose(_test_steps)

def get_model():
    """Build a pretrained EfficientNet-B0 with a 4-way head on ``DEVICE``."""
    network = EfficientNet.from_pretrained('efficientnet-b0', num_classes=4)
    network = network.to(DEVICE)
    return network

def train_one_fold(model, train_loader, val_loader,
                   checkpoint_path='best_model.pth', num_classes=4):
    """Train ``model`` and checkpoint the epoch with the best worst-class F1.

    Runs ``EPOCHS`` epochs of Adam (lr=1e-4) with cross-entropy loss on
    ``DEVICE``. After every epoch the per-class F1 scores are computed on
    ``val_loader``; the selection metric is the minimum of those scores, and
    the model's ``state_dict`` is saved whenever that metric improves.

    Args:
        model: Network to optimise; expected to already live on ``DEVICE``.
        train_loader: Iterable yielding ``(images, labels)`` training batches.
        val_loader: Iterable yielding ``(images, labels)`` validation batches.
        checkpoint_path: File the best ``state_dict`` is written to.
        num_classes: Number of class indices (``0 .. num_classes - 1``) to
            score.

    Returns:
        The best minimum per-class F1 seen across epochs, or ``-1.0`` if no
        epoch ran.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-4)
    class_ids = list(range(num_classes))
    # Start below any attainable F1 so the first epoch always writes a
    # checkpoint. With a start of 0 and a strict '>' test, a run whose
    # worst-class F1 stayed at 0.0 never saved anything, and the later
    # torch.load() of the checkpoint failed or picked up a stale file.
    best_f1 = -1.0

    for epoch in range(EPOCHS):
        model.train()
        for images, labels in tqdm(train_loader, desc=f'Epoch {epoch+1}'):
            images, labels = images.to(DEVICE), labels.to(DEVICE)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

        model.eval()
        val_preds, val_labels = [], []
        with torch.no_grad():
            for images, labels in val_loader:
                images = images.to(DEVICE)
                outputs = model(images)
                preds = torch.argmax(outputs, dim=1).cpu().numpy()
                val_preds.extend(preds)
                val_labels.extend(labels.numpy())

        # average=None yields one F1 per entry of `labels` in a single call,
        # replacing one f1_score call per class.
        scores = f1_score(val_labels, val_preds, labels=class_ids, average=None)
        min_f1 = float(np.min(scores))
        print(f"Min F1-score: {min_f1:.4f}, All: {np.round(scores, 4)}")

        if min_f1 > best_f1:
            best_f1 = min_f1
            torch.save(model.state_dict(), checkpoint_path)

    print(f"Best Min F1-score: {best_f1:.4f}")
    return best_f1

## 🧠 Step 6: Train Model
from sklearn.model_selection import train_test_split

# Load the labelled samples and hold out 20% for validation, keeping the
# soil_type proportions equal in both parts.
df = pd.read_csv(train_csv)
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['soil_type'],
    random_state=SEED,
)

# Augmentation is applied to the training split only; validation uses the
# deterministic evaluation pipeline.
train_ds = SoilDataset(train_df, train_dir, transform=train_transforms)
val_ds = SoilDataset(val_df, train_dir, transform=test_transforms)
train_loader = DataLoader(train_ds, shuffle=True, batch_size=BATCH_SIZE)
val_loader = DataLoader(val_ds, shuffle=False, batch_size=BATCH_SIZE)

model = get_model()
train_one_fold(model, train_loader, val_loader)

## 🔍 Step 7: Inference and Submission
# Restore the weights of the best epoch and switch to inference mode.
best_weights = torch.load('best_model.pth')
model.load_state_dict(best_weights)
model.eval()

# The test-id CSV drives which images are scored; train=False makes the
# dataset yield (image, image_id) pairs.
test_df = pd.read_csv(submission_file)
test_ds = SoilDataset(test_df, test_dir, test_transforms, train=False)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)

all_preds, all_ids = [], []
with torch.no_grad():
    for batch_images, batch_ids in tqdm(test_loader, desc="Inference"):
        logits = model(batch_images.to(DEVICE))
        all_preds.extend(logits.argmax(dim=1).cpu().numpy())
        all_ids.extend(batch_ids)

# Translate class indices back to soil-type names for the submission file.
predicted_names = [idx2label[class_idx] for class_idx in all_preds]
submission = pd.DataFrame({'image_id': all_ids, 'soil_type': predicted_names})
submission.to_csv('submission.csv', index=False)
print("Saved submission.csv")

## 📤 Step 8: Download Submission
from google.colab import files
files.download('submission.csv')