# Set up de librerías y configuración

In [None]:
# Install ml-collections, which provides the ConfigDict used for the notebook configuration.
!pip install ml-collections

In [None]:
import pandas as pd
import numpy as np

import cv2
import os
import random
from pathlib import Path
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchvision
from torch.optim.lr_scheduler import CyclicLR
from torch.optim import Adam
import torch.nn.functional as F

from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

from albumentations import Compose, Normalize, Resize
from albumentations.pytorch import ToTensorV2

import ml_collections
from tqdm import tqdm

In [None]:
# Download the hackathon train/test CSVs and the data archive into the current
# working directory, then extract the archive (unzip's file listing is discarded).
!wget https://challenges-asset-files.s3.us-east-2.amazonaws.com/data_sets/Data-Science/4+-+events/SchneiderElectricEuropeanHackathon22/train.csv
!wget https://challenges-asset-files.s3.us-east-2.amazonaws.com/data_sets/Data-Science/4+-+events/SchneiderElectricEuropeanHackathon22/test.csv
!wget https://challenges-asset-files.s3.us-east-2.amazonaws.com/data_sets/Data-Science/4+-+events/SchneiderElectricEuropeanHackathon22/train_test_data.zip
!unzip train_test_data.zip > /dev/null

In [None]:
# Central notebook configuration: data paths, seed, batch size, class count and device.
cfg = ml_collections.ConfigDict()
# The download cell saves and extracts everything into the current working
# directory, so resolve paths from there instead of a platform-specific absolute
# path (this is /kaggle/working on Kaggle and /content on Colab).
cfg.base_path = Path.cwd()
cfg.train_csv_path = cfg.base_path / "train.csv"
cfg.test_csv_path = cfg.base_path / "test.csv"
cfg.data_dir = cfg.base_path / "train_test_data"
cfg.seed = 23
cfg.batch_size = 16
cfg.num_classes = 3
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
cfg.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
def seed_torch(seed):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and all CUDA devices), exports ``PYTHONHASHSEED`` and switches cuDNN
    to deterministic mode.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one; this call is a no-op
    # when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may select different kernels from run to run, so it
    # has to be off for the deterministic flag to give repeatable results.
    torch.backends.cudnn.benchmark = False

In [None]:
# Seed all random number generators with the configured seed before any data or model work.
seed_torch(cfg.seed)

# Análisis exploratorio de datos básico

In [None]:
# Human-readable class names keyed by integer label (0, 1, 2), listed in label order.
label_to_text = dict(enumerate((
    "Plantation",
    "Grassland/Shrubland",
    "Smallholder Agriculture",
)))

In [None]:
# Read the train and test tables from the CSV paths defined in the config.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (cfg.train_csv_path, cfg.test_csv_path)
)

Como podemos ver, el conjunto de entrenamiento está desbalanceado entre clases:

In [None]:
# Count how many training rows carry each label value.
train_df["label"].value_counts()

# Datasets

Validación cruzada estratificada (stratified k-fold)

In [None]:
# Work on a copy so the raw training table is left untouched.
folds = train_df.copy()
train_labels = folds["label"].to_numpy()

# Stratified 5-fold split: every row is tagged with the number of the split in
# which it belongs to the validation part.
kf = StratifiedKFold(n_splits=5)
fold_of_row = np.full(len(folds), -1, dtype=int)
for fold, (train_index, val_index) in enumerate(kf.split(folds.values, train_labels)):
    fold_of_row[val_index] = fold

folds['fold'] = fold_of_row
# Persist the assignment (without the index column) so the split can be reused.
folds.to_csv('folds.csv', index=False)

In [None]:
# Preview the first rows, including the 'fold' column added by the split.
folds.head()

In [None]:
class ZeroDeforestationDataset(Dataset):
    """Image-classification dataset backed by a DataFrame.

    Every row must provide an ``example_path`` column (path of the image file,
    read with OpenCV) and a ``label`` column (the classification target).

    Args:
        df: DataFrame with ``example_path`` and ``label`` columns.
        transform: Optional callable invoked as ``transform(image=image)`` that
            returns a mapping whose ``'image'`` entry is the transformed image.
    """

    def __init__(self, df, transform=None):
        # __getitem__ looks rows up by the labels 0..len-1, so normalise the
        # index here instead of relying on every caller to have reset it.
        self.df = df.reset_index(drop=True)
        self.transform = transform

    def __len__(self):
        """Return the number of samples (rows of the DataFrame)."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(image, label)``.

        Raises:
            FileNotFoundError: If OpenCV cannot read the image file.
        """
        img_path = str(self.df.loc[idx, "example_path"])
        image = cv2.imread(img_path)
        # cv2.imread signals failure by returning None rather than raising;
        # fail here with the offending path instead of inside cvtColor.
        if image is None:
            raise FileNotFoundError(f"Could not read image: {img_path}")
        # OpenCV decodes to BGR channel order; convert to RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        label = self.df.loc[idx, "label"]

        if self.transform:
            image = self.transform(image=image)['image']

        return image, label

In [None]:
def get_transforms():
    """Build the preprocessing pipeline applied to every image.

    Returns:
        An albumentations ``Compose`` that normalises each of the three
        channels with mean 0.5 and std 0.5 and then applies ``ToTensorV2``.
    """
    per_channel = [0.5] * 3
    steps = [
        Normalize(mean=list(per_channel), std=list(per_channel)),
        ToTensorV2(),
    ]
    return Compose(steps)

In [None]:
def train_on_fold(fold, model, epochs=200, lr=0.01):
    """Train ``model`` on every fold except ``fold`` and validate on ``fold``.

    Reads the module-level ``folds`` DataFrame and ``cfg`` configuration.
    After each epoch the macro F1 score and the mean validation loss are
    printed, and the model weights are saved to ``fold{fold}_best_score.pth``
    and/or ``fold{fold}_best_loss.pth`` whenever the respective metric improves.

    Args:
        fold: Number of the fold held out for validation.
        model: ``nn.Module`` producing one logit per class; moved to ``cfg.device``.
        epochs: Number of passes over the training data.
        lr: Learning rate for the Adam optimizer.
    """
    trn_idx = folds[folds['fold'] != fold].index
    val_idx = folds[folds['fold'] == fold].index

    training_data = ZeroDeforestationDataset(folds.loc[trn_idx].reset_index(drop=True), transform=get_transforms())
    valid_data = ZeroDeforestationDataset(folds.loc[val_idx].reset_index(drop=True), transform=get_transforms())

    train_dataloader = DataLoader(training_data, batch_size=cfg.batch_size, shuffle=True)
    valid_dataloader = DataLoader(valid_data, batch_size=cfg.batch_size, shuffle=False)

    # Ground truth in the order the (unshuffled) validation loader yields it.
    val_targets = folds.loc[val_idx, "label"].values

    model.to(cfg.device)

    best_score = 0.
    best_loss = np.inf

    optimizer = Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        model.train()
        average_loss = 0.

        for images, labels in tqdm(train_dataloader, desc=f"train {epoch}"):
            images = images.to(cfg.device)
            labels = labels.to(cfg.device)

            optimizer.zero_grad()
            y_preds = model(images)
            loss = criterion(y_preds, labels)
            # Without the backward pass no gradients exist and optimizer.step()
            # leaves the weights unchanged.
            loss.backward()
            optimizer.step()

            average_loss += loss.item() / len(train_dataloader)

        model.eval()
        average_val_loss = 0.
        preds = np.zeros((len(valid_data)))
        offset = 0

        # No gradients are needed anywhere in validation, including the loss.
        with torch.no_grad():
            for images, labels in tqdm(valid_dataloader, desc=f"valid {epoch}"):
                images = images.to(cfg.device)
                labels = labels.to(cfg.device)

                y_preds = model(images)
                loss = criterion(y_preds, labels)
                average_val_loss += loss.item() / len(valid_dataloader)

                batch_preds = y_preds.argmax(1).to("cpu").numpy()
                preds[offset: offset + len(batch_preds)] = batch_preds
                offset += len(batch_preds)

        score = f1_score(val_targets, preds, average='macro')
        print(f"Epoch {epoch} | average train loss: {average_loss:.5f} | average val loss: {average_val_loss:.5f} | F1: {score:.5f}")

        # Two independent checkpoints: best macro F1 and lowest validation loss.
        if score > best_score:
            best_score = score
            print(f"Saving new model with best score {best_score:.5f}")
            torch.save(model.state_dict(), f'fold{fold}_best_score.pth')
        if average_val_loss < best_loss:
            best_loss = average_val_loss
            print(f"Saving new model with best loss: {best_loss:.4f}")
            torch.save(model.state_dict(), f'fold{fold}_best_loss.pth')

In [None]:
# Re-seed right before training so the runs below start from the same RNG state.
seed_torch(cfg.seed)
# NOTE(review): only folds 2, 3 and 4 are trained here although the split
# defines folds 0-4 — confirm that skipping folds 0 and 1 is intentional.
for fold in range(2, 5):
    # Fresh MobileNetV3-Small with pretrained weights for every fold.
    model = torchvision.models.mobilenet_v3_small(pretrained=True)
    model.avgpool = nn.AdaptiveAvgPool2d(1)
    # Swap the final classifier layer so the head emits one logit per class;
    # the class count comes from the config rather than a literal.
    model.classifier[-1] = nn.Linear(model.classifier[-1].in_features, cfg.num_classes)
    train_on_fold(fold, model)