<a href="https://colab.research.google.com/github/javadan/CGIAR-Crop-Damage-Classification-Challenge/blob/main/Image_tiling_CGIAR_2023.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

############################### MOUNT GDRIVE ###################################

from google.colab import drive
drive.mount('/content/drive')

############################### INSTALL TIMM ###################################

!pip install timm

import torch.multiprocessing as mp
mp.set_start_method('spawn', force=True)

############################### IMPORTS ########################################

from tqdm import tqdm
from fastai.vision.all import PILImage
from fastcore.parallel import *
from pathlib import Path
from collections import defaultdict
from fastai.vision.all import *
from timm import create_model
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from fastai.callback.all import CutMix
from fastai.test_utils import *
from datetime import datetime
from fastai.losses import FocalLossFlat
import torch.nn.functional as F
import shutil
from torch.utils.data import Dataset
from torchvision.transforms import ToTensor
import glob

############################### HYPER PARAMS ###################################

# --- Reproducibility / cross-validation ---
SEED = 42
N_FOLDS = 3

# --- Optimisation ---
BATCH_SIZE = 20
EPOCHS = 5
INIT_LR = 3e-4  # earlier experiments used 2e-4
NUM_WORKER = 8
PATIENCE = 3
# Note: there is no IMGSZ constant; the input size is determined from the model name.

# --- Model selection ---
MODEL_BASE = 'efficientvit_l2.r384_in1k'
SAVE_NAME = MODEL_BASE

# Architectures to train / ensemble. Other candidates that were tried:
#   'tiny_vit_21m_224.dist_in22k', 'caformer_s36.sail_in22k'
model_bases = [MODEL_BASE]

model_scores = dict()


!cp /content/drive/MyDrive/cgiar/*.csv /content/cgiar

############################### GLOBALS SETUP ##################################

# Changed to save locally
MODELS_DIR = '/content/models/'
DATASET_DIR = '/content/cgiar/'




         ################ SKIP TRAINING?  ########


def _find_fold_checkpoints(model_base):
    """Return the fold .pth files found in any '<model_base>_*' directory under MODELS_DIR."""
    return glob.glob(f'{MODELS_DIR}/{model_base}_*/{model_base}_fold*.pth')


# Per model base: the list of checkpoint files that already exist.
existing_model_files = dict(zip(model_bases, map(_find_fold_checkpoints, model_bases)))

# Training is skipped only when no model base is missing its checkpoints.
skip_training = not any(len(found) == 0 for found in existing_model_files.values())








# Timestamp identifying this run; it is part of the run directory name.
current_datetime = f"{datetime.now():%Y%m%d-%H%M%S}"
MODELS_PATH = os.path.join(MODELS_DIR, "_".join((MODEL_BASE, current_datetime)))

# Make sure the run directory and the dataset directory exist.
for _required_dir in (MODELS_PATH, DATASET_DIR):
    os.makedirs(_required_dir, exist_ok=True)

set_seed(SEED, reproducible=True)

############################### COPY TO LOCAL ##################################

# Target directory where images will be copied
local_image_dir = '/content/images'

# Check if the directory exists and has files
if not os.path.exists(local_image_dir) or not os.listdir(local_image_dir):
    print("Copying and unzipping files to local directory...")

    # Create the target directory if it doesn't exist
    if not os.path.exists(local_image_dir):
        os.makedirs(local_image_dir)

    # Copy the zip file
    !cp /content/drive/MyDrive/cgiar/images/images.zip /content/

    !cp /content/drive/MyDrive/cgiar/*.csv /content/cgiar

    # Unzip the file into the target directory
    !unzip -q /content/images.zip -d /content/images
else:
    print("Files already copied to local directory.")



############################### CUSTOM IMAGE DATASET ###########################
class CustomImageDataset(Dataset):
    """Dataset backed by a DataFrame with a 'path' and a 'target' column.

    Each item is an ``(image_tensor, label_tensor)`` pair: the image at
    ``row['path']`` converted with ``ToTensor`` and the integer class index
    that ``label_mapping`` assigns to ``row['target']``.
    """

    def __init__(self, dataframe, label_mapping, vocab):
        self.df = dataframe
        self.label_mapping = label_mapping  # string label -> integer index
        self.vocab = vocab
        self.to_tensor = ToTensor()  # PIL image -> tensor

    def __len__(self):
        """Number of rows in the backing DataFrame."""
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Load the row at position `idx` and return (image tensor, long label tensor)."""
        record = self.df.iloc[idx]
        source_path = record['path']
        class_index = self.label_mapping[record['target']]
        pixels = self.to_tensor(PILImage.create(source_path))
        return pixels, torch.tensor(class_index, dtype=torch.long)

    def new_empty(self):
        """Return a dataset with the same columns, mapping and vocab but no rows."""
        empty_frame = pd.DataFrame(columns=self.df.columns)
        return CustomImageDataset(empty_frame, self.label_mapping, self.vocab)


############################### CUSTOM LR SCHEDULE #############################
class CustomLRSchedule(Callback):
    """Callback that multiplies each parameter group's 'lr' by a fixed factor after every epoch.

    NOTE(review): this writes to the dicts returned by ``self.learn.opt.param_groups``;
    confirm that the optimizer in use returns live dicts there, otherwise the
    update has no effect.
    """

    def __init__(self, lr_reduction_factor=0.9):
        self.lr_reduction_factor = lr_reduction_factor

    def after_epoch(self):
        """Scale the learning rate of every parameter group in place."""
        decay = self.lr_reduction_factor
        for group in self.learn.opt.param_groups:
            group['lr'] *= decay


############################### HELPER METHODS #################################



############ Function to determine the input size for a given model
def get_input_size_for_model(model_base):
    """Return the square input resolution implied by a model name.

    Args:
        model_base: model identifier, e.g. 'efficientvit_l2.r384_in1k'.

    Returns:
        384 when the name contains '384', otherwise 224 (both for names that
        contain '224' and as the default).
    """
    if '384' in model_base:
        return 384
    # The previous check looked for '244' (and returned 244), a typo for the
    # 224 resolution.
    elif '224' in model_base:
        return 224
    else:
        return 224  # default size


# Input resolution for the configured MODEL_BASE; it is also passed to
# DataPreparation as the chunk size further down.
model_input_size = get_input_size_for_model(MODEL_BASE)


############ Function to copy files to Google Drive
def copy_to_gdrive(source, destination):
    """Recursively copy `source` to `destination` with ``cp -r`` and report the outcome.

    The command goes through the shell, exactly as the former ``!cp`` line did,
    so wildcards in `source` keep working. Unlike the ``!`` magic, a failing
    ``cp`` now raises, so the error branch is reachable and the success message
    is printed only when the copy actually succeeded.

    Args:
        source: path (or shell pattern) to copy from.
        destination: path to copy to.

    Returns:
        None. Failures are reported with a printed message, not raised.
    """
    import subprocess  # local import: only this helper shells out from Python

    try:
        # check=True turns a non-zero exit status into CalledProcessError.
        subprocess.run(f"cp -r {source} {destination}", shell=True, check=True)
    except Exception as e:
        print(f"Error copying {source} to {destination}: {e}")
    else:
        print(f"Copied {source} to {destination} successfully.")


############################### SET UP SCORES FILE #############################

# CSV that accumulates one score row per model/fold; created with just its
# header row on first use.
scores_file_path = '/content/drive/MyDrive/models/model_scores.csv'
if not os.path.exists(scores_file_path):
    # The parent folder may not exist yet; open()/to_csv would fail without it.
    os.makedirs(os.path.dirname(scores_file_path), exist_ok=True)
    # Written with pandas (explicitly imported above) rather than `csv`, which
    # this notebook never imports by name.
    pd.DataFrame(columns=['DateTime', 'Model', 'Fold', 'Score']).to_csv(scores_file_path, index=False)




############################### DATA PREPARATION ###############################
class DataPreparation:
    """Tiles training images into fixed-size chunks and builds fold-aware dataloaders.

    Attributes set in ``__init__``:
        dataset_dir: directory holding the dataset CSV files.
        image_dir: directory the image files named in the CSV are read from.
        chunk_size: side length of the square chunks; ``None`` disables tiling
            and resizing.
        chunk_counts_per_class: number of images/chunks produced so far per
            class label (input to ``compute_class_weights``).
    """

    def __init__(self, dataset_dir, image_dir, chunk_size):
        self.dataset_dir = dataset_dir
        self.image_dir = image_dir
        self.chunk_size = chunk_size
        self.chunk_counts_per_class = defaultdict(int)  # Initialize chunk counts for each class

    ############# CHUNK IT UP ##################################################

    def process_image(self, file_path, output_dir, class_label):
        """Yield ``(path, image_id)`` for every chunk of one image, writing missing chunk files.

        An image that fits inside one chunk (or any image when ``chunk_size``
        is ``None``) produces a single file ``<image_id>.jpg``; larger images
        produce ``<image_id>_chunk_<i>_<j>.jpg`` files on a grid. Yielded paths
        are relative to the current working directory.

        Args:
            file_path: path of the source image.
            output_dir: directory the chunk files are written to (created if missing).
            class_label: label whose counter in ``chunk_counts_per_class`` is increased.
        """
        # Ensure the output directory exists
        os.makedirs(output_dir, exist_ok=True)

        image = PILImage.create(file_path)
        w, h = image.size
        image_id = os.path.basename(file_path).split('.')[0]

        # Convert output_dir to a relative path
        relative_output_dir = os.path.relpath(output_dir)

        if self.chunk_size is None or (w <= self.chunk_size and h <= self.chunk_size):
            # Count the single output here. Previously the counter was only
            # updated after this early return, so small images never
            # contributed to the class weights.
            self.chunk_counts_per_class[class_label] += 1
            resized_path = os.path.join(relative_output_dir, f"{image_id}.jpg")
            if not os.path.exists(resized_path):
                if self.chunk_size is None:
                    # No target size configured: keep the image as it is.
                    image.save(resized_path)
                else:
                    image.resize((self.chunk_size, self.chunk_size)).save(resized_path)
            yield resized_path, image_id
            return

        n_chunks_x = max(1, w // self.chunk_size)
        n_chunks_y = max(1, h // self.chunk_size)
        self.chunk_counts_per_class[class_label] += n_chunks_x * n_chunks_y

        grid = [(i, j) for i in range(n_chunks_x) for j in range(n_chunks_y)]
        chunk_paths = {
            (i, j): os.path.join(relative_output_dir, f"{image_id}_chunk_{i}_{j}.jpg")
            for i, j in grid
        }
        # If any chunk file is missing, all chunks of this image are rewritten.
        regenerate = not all(os.path.exists(path) for path in chunk_paths.values())

        for i, j in grid:
            chunk_path = chunk_paths[(i, j)]
            if regenerate:
                start_x = i * self.chunk_size
                start_y = j * self.chunk_size
                box = (start_x, start_y, start_x + self.chunk_size, start_y + self.chunk_size)
                chunk = image.crop(box).resize((self.chunk_size, self.chunk_size))
                chunk.save(chunk_path)
            yield chunk_path, image_id

    def compute_class_weights(self):
        """Return ``{class_label: total_count / class_count}`` from the counts gathered so far."""
        total_chunks = sum(self.chunk_counts_per_class.values())
        class_weights = {cls: total_chunks / count for cls, count in self.chunk_counts_per_class.items()}
        return class_weights

    ####### MAP THE CHUNKS TO THE DAMAGE AND ORIGINAL ##########################

    @staticmethod
    def _assign_folds(df, kfold):
        """Return `df` with a fresh 0..n-1 index and a 'fold' column holding each row's validation fold.

        ``kfold.split`` yields positional indices, so the index is reset first
        and the assignment is positional; rows outside every validation split
        keep fold -1.
        """
        df = df.reset_index(drop=True)
        df['fold'] = -1
        fold_col = df.columns.get_loc('fold')
        for i, (_, val_idx) in enumerate(kfold.split(df, df['target'])):
            df.iloc[val_idx, fold_col] = i
        return df

    @staticmethod
    def _build_label_mapping(train_data):
        """Return ``(label -> index mapping, vocab list)`` in order of first appearance in `train_data`."""
        vocab = list(train_data['target'].unique())
        label_mapping = {label: idx for idx, label in enumerate(vocab)}
        return label_mapping, vocab

    def prepare_train_data(self, data, kfold, output_dir, batch_size=100, max_samples=None):
        """Chunk every training image and return one row per chunk.

        Args:
            data: DataFrame with at least 'filename' and 'damage' columns.
            kfold: splitter exposing ``split(X, y)`` that yields
                ``(train_idx, val_idx)`` positional index pairs.
            output_dir: directory the chunk files are written to.
            batch_size: number of images per progress-bar step.
            max_samples: when given, a random sample of that many images
                (seeded with ``SEED``) is used.

        Returns:
            DataFrame with columns 'image_id', 'path', 'target' and 'fold'.
        """
        df = data.copy()
        df['image_id'] = df['filename'].apply(lambda x: x.split('.')[0])
        df = df.drop_duplicates(subset='image_id', keep='first')
        df['target'] = df['damage']

        if max_samples is not None:
            df = df.sample(n=max_samples, random_state=SEED)

        # Dropping duplicates leaves gaps in the index; folds are assigned
        # positionally on a reset index so they land on the intended rows.
        df = self._assign_folds(df, kfold)

        chunk_data = []

        for batch_start in tqdm(range(0, len(df), batch_size), desc="Processing images"):
            batch_end = min(batch_start + batch_size, len(df))
            for _, row in df.iloc[batch_start:batch_end].iterrows():
                file_path = os.path.join(self.image_dir, row['filename'])
                class_label = row['target']
                for chunk_path, image_id in self.process_image(file_path, output_dir, class_label):
                    if not os.path.exists(chunk_path):
                        print(f"File not found: {chunk_path}")
                        continue
                    chunk_data.append({'image_id': image_id, 'path': chunk_path, 'target': row['target'], 'fold': row['fold']})

        # Explicit columns keep the frame usable even when no chunk was produced.
        return pd.DataFrame(chunk_data, columns=['image_id', 'path', 'target', 'fold'])

    ####### LOAD DATA INTO DATASETS ############################################

    def get_dataloaders(self, train_data, fold):
        """Build train/validation dataloaders for one fold.

        Rows whose 'fold' equals `fold` form the validation set; all other rows
        form the training set. The label mapping is derived from the whole of
        `train_data`, so class indices are the same for every fold.
        """
        train_df = train_data[train_data['fold'] != fold].reset_index(drop=True)
        valid_df = train_data[train_data['fold'] == fold].reset_index(drop=True)

        label_mapping, vocab = self._build_label_mapping(train_data)

        train_dataset = CustomImageDataset(train_df, label_mapping, vocab=vocab)
        valid_dataset = CustomImageDataset(valid_df, label_mapping, vocab=vocab)

        train_dl = DataLoader(train_dataset, batch_size=BATCH_SIZE, num_workers=0, shuffle=True)
        valid_dl = DataLoader(valid_dataset, batch_size=BATCH_SIZE, num_workers=0)

        return DataLoaders(train_dl, valid_dl)







############################### TRAIN EVAL CODE

class ModelTraining:
    """Runs per-fold training, export and per-epoch test submissions for one architecture.

    The constructor only stores its arguments; ``train_and_evaluate_model``
    reads ``models_dir``, ``n_folds``, ``epochs``, ``init_lr``, ``device`` and
    ``num_classes``.
    """

    def __init__(self, models_dir, seed, n_folds, batch_size, imgsz, epochs, init_lr, num_worker, patience, device, num_classes):
        self.models_dir = models_dir
        self.seed = seed
        self.n_folds = n_folds
        self.batch_size = batch_size
        self.imgsz = imgsz
        self.epochs = epochs
        self.init_lr = init_lr
        self.num_worker = num_worker
        self.patience = patience
        self.device = device
        self.num_classes = num_classes

    @staticmethod
    def _weights_for_vocab(class_weights, vocab, device):
        """Return a float tensor of class weights ordered like `vocab`, placed on `device`."""
        return torch.FloatTensor([class_weights[label] for label in vocab]).to(device)

    @staticmethod
    def _record_score(scores_file_path, current_datetime, model_name, fold, score):
        """Append one ``DateTime, Model, Fold, Score`` row to the scores CSV (header added if the file is new/empty)."""
        row = pd.DataFrame([[current_datetime, model_name, fold, score]],
                           columns=['DateTime', 'Model', 'Fold', 'Score'])
        needs_header = not os.path.exists(scores_file_path) or os.path.getsize(scores_file_path) == 0
        row.to_csv(scores_file_path, mode='a', header=needs_header, index=False)

    def train_and_evaluate_model(self, model_base, train_data, current_datetime, scores_file_path, test_and_submission, data_prep):
        """Train `model_base` on every fold, exporting and testing after each epoch.

        Args:
            model_base: architecture name passed to ``create_model``.
            train_data: chunk-level DataFrame with 'path', 'target' and 'fold' columns.
            current_datetime: run timestamp written to the scores file.
            scores_file_path: CSV that receives one validation-loss row per epoch,
                keyed as ``'<model_base>_fold<fold>'``.
            test_and_submission: object providing ``test_and_save_submission``.
            data_prep: object providing ``compute_class_weights`` and ``get_dataloaders``.
        """
        class_weights = data_prep.compute_class_weights()
        print("Device:", self.device)

        for fold in range(self.n_folds):
            print(f"Training {model_base} - Fold {fold}")

            model = create_model(model_base, pretrained=False, num_classes=self.num_classes).to(self.device)
            dls = data_prep.get_dataloaders(train_data, fold)

            # Order the loss weights by the dataloaders' own vocab so that
            # weight k always belongs to class index k.
            weights = self._weights_for_vocab(class_weights, dls.vocab, self.device)
            print("Weights tensor:", weights)

            learn = Learner(dls, model, loss_func=torch.nn.CrossEntropyLoss(weight=weights))
            print(f"Starting training for {model_base}, fold {fold}")

            for epoch in range(self.epochs):
                learn.model.train()
                learn.fit_one_cycle(1, self.init_lr)
                learn.model.eval()
                val_loss = learn.validate()[0]

                # Persist the validation loss; the saved-model ensembling step
                # looks scores up in this file by '<model_base>_fold<fold>'.
                try:
                    self._record_score(scores_file_path, current_datetime, f'{model_base}_fold{fold}', fold, float(val_loss))
                except OSError as e:
                    print(f"Could not record score in {scores_file_path}: {e}")

                export_path = f'{self.models_dir}/{model_base}_fold{fold}_epoch{epoch}.pkl'
                try:
                    learn.export(export_path)
                    print(f"Model exported successfully: {export_path}")

                    # Perform testing and save submission
                    test_and_submission.test_and_save_submission(learn, f'{DATASET_DIR}Test.csv', f'{DATASET_DIR}SampleSubmission.csv', f'{self.models_dir}/submission_{model_base}_fold{fold}_epoch{epoch}.csv', model_base, self.device)
                    print(f"Testing and submission saved for epoch {epoch}")
                except Exception as e:
                    # Best effort: a failed export/test must not abort the remaining epochs.
                    print(f"Error during model export or testing: {e}")

            print(f"Training completed for {model_base}, fold {fold}.")

#################################### TESTING ###################################


class TestAndSubmission:
    """Chunk-wise inference on test images and writing of submission CSV files."""

    def __init__(self, models_dir, dataset_dir, number_of_classes):
        self.models_dir = models_dir
        self.dataset_dir = dataset_dir
        self.number_of_classes = number_of_classes

    def predict_from_chunks(self, model, image_path, model_base, device):
        """Return the class probabilities of one image, averaged over its chunks.

        The chunk size comes from ``get_input_size_for_model(model_base)``. An
        image narrower or shorter than one chunk is resized to a single chunk;
        otherwise it is cut into a grid of full chunks. Each chunk is passed
        through `model` and the softmax outputs are averaged.
        """
        image = PILImage.create(image_path)
        chunk_size = get_input_size_for_model(model_base)
        to_tensor = ToTensor()

        w, h = image.size

        # Collect the pieces that will be fed to the model.
        if w < chunk_size or h < chunk_size:
            pieces = [image.resize((chunk_size, chunk_size))]
        else:
            pieces = [
                image.crop((col * chunk_size, row * chunk_size,
                            col * chunk_size + chunk_size, row * chunk_size + chunk_size))
                for col in range(w // chunk_size)
                for row in range(h // chunk_size)
            ]

        model.eval()
        all_probs = []
        with torch.no_grad():
            for piece in pieces:
                batch = to_tensor(piece).unsqueeze(0).to(device)
                logits = model(batch)
                all_probs.append(F.softmax(logits, dim=1).cpu().numpy())

        # Average the per-chunk probability rows into a single vector.
        return np.mean(np.vstack(all_probs), axis=0)

    def test_and_save_submission(self, learn, test_df_path, submission_df_path, submission_output_path, model_base, device):
        """Predict every test image with ``learn.model`` and write a submission CSV.

        The sample submission at `submission_df_path` is loaded, one column per
        label in ``learn.dls.vocab`` is filled with the predicted probabilities,
        and the result is written to `submission_output_path`.
        """
        test_df = pd.read_csv(test_df_path)
        test_df['path'] = test_df['filename'].map(lambda x: f'images/images/{x}')

        model = learn.model
        per_image = [self.predict_from_chunks(model, img_path, model_base, device)
                     for img_path in test_df['path']]

        # One row per test image, one column per class.
        all_probs = np.vstack(per_image)

        submission_df = pd.read_csv(submission_df_path)
        for column_index, label in enumerate(learn.dls.vocab):
            submission_df[label] = all_probs[:, column_index]

        submission_df.to_csv(submission_output_path, index=False)
        print(f"Submission file saved to: {submission_output_path}")




#################################### TOP LEVEL CODE ############################


#################################### PREP TRAIN DATA ###########################

# Device the model runs on: 'cuda' when torch reports a GPU, otherwise 'cpu'
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

train = pd.read_csv(f'{DATASET_DIR}Train.csv')
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

# Directory the image chunks are written to
output_dir = '/content/chunks'
os.makedirs(output_dir, exist_ok=True)

# The class count is hard-coded because training (and therefore train_data) may
# be skipped; when training it could be train_data['target'].nunique().
unique_classes = 5

# Pipeline objects: data preparation, training, and test/submission
data_prep = DataPreparation(
    dataset_dir=DATASET_DIR,
    image_dir='images/images/',
    chunk_size=model_input_size,
)
model_training = ModelTraining(
    models_dir=MODELS_DIR,
    seed=SEED,
    n_folds=N_FOLDS,
    batch_size=BATCH_SIZE,
    imgsz=model_input_size,
    epochs=EPOCHS,
    init_lr=INIT_LR,
    num_worker=NUM_WORKER,
    patience=PATIENCE,
    device=device,
    num_classes=unique_classes,
)
test_sub = TestAndSubmission(
    models_dir=MODELS_DIR,
    dataset_dir=DATASET_DIR,
    number_of_classes=unique_classes,
)




# Set to an integer (e.g. 250) to train on a random subset for a quick test run.
MAX_SAMPLES = None

if skip_training:
    print("Skipping training as .pth files exist for all model bases.")
else:
    # Chunk the training images, then train every configured architecture.
    train_data = data_prep.prepare_train_data(train, skf, output_dir, BATCH_SIZE, max_samples=MAX_SAMPLES)

    for model_base in model_bases:
        model_training.train_and_evaluate_model(model_base, train_data, current_datetime, scores_file_path, test_sub, data_prep)




##############TESTING
def find_recent_model_dirs(model_bases, models_dir):
    """Map each model base to the first matching directory that holds a fold checkpoint.

    Directories matching ``<models_dir>/<model_base>_*`` are visited in reverse
    sorted order. A directory qualifies when, for any fold in
    ``range(N_FOLDS)``, it contains ``<model_base>_fold<fold>.pkl`` or a
    ``<model_base>_fold<fold>_epoch*.pth`` file. Model bases without a
    qualifying directory map to ``None``.
    """
    def _holds_checkpoint(model_dir, model_base):
        for fold in range(N_FOLDS):
            if glob.glob(f'{model_dir}/{model_base}_fold{fold}.pkl'):
                return True
            if glob.glob(f'{model_dir}/{model_base}_fold{fold}_epoch*.pth'):
                return True
        return False

    recent_model_dirs = {}
    for model_base in model_bases:
        candidates = sorted(glob.glob(f'{models_dir}/{model_base}_*'), reverse=True)
        recent_model_dirs[model_base] = next(
            (candidate for candidate in candidates if _holds_checkpoint(candidate, model_base)),
            None,
        )
    return recent_model_dirs

def get_model_architecture(model_name, num_classes):
    """Create the `model_name` architecture with `num_classes` outputs and ``pretrained=False``."""
    return create_model(model_name, pretrained=False, num_classes=num_classes)

def test_with_saved_models(model_bases, scores_file_path, device, test_sub, test_df, recent_model_dirs):
    scores_df = pd.read_csv(scores_file_path)
    model_weights = scores_df.groupby('Model')['Score'].min().apply(lambda x: 1/x).to_dict()
    weighted_ensemble = None
    models_found = False

    for model_base in model_bases:
        for fold in range(N_FOLDS):
            pkl_path = f'{recent_model_dirs[model_base]}/{model_base}_fold{fold}.pkl'
            pth_paths = glob.glob(f'{recent_model_dirs[model_base]}/{model_base}_fold{fold}_epoch*.pth')

            if os.path.exists(pkl_path):
                model_path = pkl_path
            elif pth_paths:
                model_path = pth_paths[0]  # Take the first file if there are multiple
            else:
                print(f"No model file found for {model_base} fold {fold}")
                continue

            models_found = True
            try:
                learn = load_learner(model_path, cpu=False)
                learn.model.to(device)
            except Exception as e:
                print(f"Error loading model from {model_path}: {e}")
                continue

            model_weight = model_weights[f'{model_base}_fold{fold}']
            if weighted_ensemble is None:
                weighted_ensemble = np.zeros((len(test_df), len(learn.dls.vocab)))

            preds = np.array([test_sub.predict_from_chunks(learn, img_path, model_base, device) for img_path in test_df['path']])
            weighted_ensemble += preds * model_weight

    if not models_found:
        print("No models were found for any base or fold. Cannot proceed with testing.")
        return None

    normalized_preds = weighted_ensemble / sum(model_weights.values())

    for i, label in enumerate(learn.dls.vocab):
        test_df[label] = normalized_preds[:, i]

    return test_df


# Inference from previously saved models; runs only when training was skipped.
if skip_training:
    recent_model_dirs = find_recent_model_dirs(model_bases, MODELS_DIR)
    print(recent_model_dirs)

    has_any_model_dir = any(found is not None for found in recent_model_dirs.values())
    if not has_any_model_dir:
        print("No models were found for any base or fold. Cannot proceed with testing.")
    else:
        test_df = pd.read_csv(f'{DATASET_DIR}Test.csv')
        test_df['path'] = test_df['filename'].map(lambda x: f'images/images/{x}')
        test_df = test_with_saved_models(model_bases, scores_file_path, device, test_sub, test_df, recent_model_dirs)

        if test_df is None:
            print("Testing could not be completed. No submission file created.")
        else:
            submission_dir = f"{MODELS_PATH}/submission"
            os.makedirs(submission_dir, exist_ok=True)
            submission_path = f"{submission_dir}/{MODEL_BASE}_final.csv"

            # Keep the sample submission's ID column and attach every test_df
            # column except 'path' by joining on 'ID'.
            sample_submission_df = pd.read_csv(f"{DATASET_DIR}SampleSubmission.csv")[['ID']]
            prediction_columns = ['ID'] + list(test_df.columns.drop(['ID', 'path']))
            sample_submission_df = pd.merge(sample_submission_df, test_df[prediction_columns], on='ID')

            sample_submission_df.to_csv(submission_path, index=False)
            print(f"Final submission file saved to: {submission_path}")



Mounted at /content/drive
Collecting timm
  Downloading timm-0.9.12-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: timm
Successfully installed timm-0.9.12
cp: target '/content/cgiar' is not a directory
Copying and unzipping files to local directory...


Processing images: 100%|██████████| 1304/1304 [12:18<00:00,  1.77it/s]


Weights tensor: tensor([  2.3493,   2.6517,   5.6584, 437.7974,  54.9067], device='cuda:0')
Device: cuda
Training efficientvit_l2.r384_in1k - Fold 0
Starting training for efficientvit_l2.r384_in1k, fold 0


epoch,train_loss,valid_loss,time
0,1.359639,1.28319,2:59:54


Model exported successfully: /content/models//efficientvit_l2.r384_in1k_fold0_epoch0.pkl
Submission file saved to: /content/models//submission_efficientvit_l2.r384_in1k_fold0_epoch0.csv
Testing and submission saved for epoch 0


epoch,train_loss,valid_loss,time


KeyboardInterrupt: 

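Optionally re-weight the predicted class probabilities after inference: each class column is scaled by the ratio of a modified class weight to the original one, and the rows are then renormalized to sum to 1.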

In [None]:
import pandas as pd

# Submission produced by the training cell
file_path = '/content/models/submission_efficientvit_l2.r384_in1k_fold0_epoch0.csv'
df = pd.read_csv(file_path)

# Class weights used for the loss, and the hand-tuned replacements
original_weights = [2.3493, 2.6517, 5.6584, 437.7974, 54.9067]
modified_weights = [1.5, 2.4, 25, 320, 70]

# Per-class scale factor, applied positionally to columns 1 to 5
adjustment_factors = [new / old for new, old in zip(modified_weights, original_weights)]
df.iloc[:, 1:6] = df.iloc[:, 1:6].mul(adjustment_factors, axis=1)

# Renormalize each row so the five probabilities sum to 1, then round
row_totals = df.iloc[:, 1:6].sum(axis=1)
df.iloc[:, 1:6] = df.iloc[:, 1:6].div(row_totals, axis=0)
df.iloc[:, 1:6] = df.iloc[:, 1:6].round(5)

# Write the adjusted submission
adjusted_file_path = '/content/models/adjusted_submission.csv'
df.to_csv(adjusted_file_path, index=False)


Convert each row of the submission to a one-hot prediction (1 for the most probable class, 0 for the others) instead of probabilities, and count how many times each class is picked.

In [None]:
import pandas as pd
import numpy as np

# Path of the re-weighted submission written by the previous cell
file_path = '/content/models/adjusted_submission.csv'

# Load the adjusted submission data
df_submission = pd.read_csv(file_path)

# Function to apply the transformation
def apply_max_to_one_np(row):
    """Return `row` with its value columns replaced by a one-hot vector.

    The first element of `row` is kept as is. Among the remaining elements,
    the position of the maximum becomes 1 and all others become 0 (ties go to
    the first maximum, as with ``np.argmax``).

    Args:
        row: Series whose first element is an identifier and whose remaining
            elements are comparable scores.

    Returns:
        A new Series with the same index as `row`.
    """
    scores = row.iloc[1:].to_numpy()
    # Build a fresh array instead of writing into `scores`: to_numpy() may
    # return a read-only view of the row's data.
    one_hot = np.zeros(len(scores), dtype=scores.dtype)
    one_hot[np.argmax(scores)] = 1
    return pd.Series(np.concatenate(([row.iloc[0]], one_hot)), index=row.index)

# Cast every column other than 'ID' to float64
value_columns = [col for col in df_submission.columns if col != 'ID']
for value_column in value_columns:
    df_submission[value_column] = df_submission[value_column].astype('float64')

# Turn every row into a one-hot prediction
transformed_df_submission_np = df_submission.apply(apply_max_to_one_np, axis=1)

# Number of rows assigned to each class (sum of the 1s per column)
column_counts = transformed_df_submission_np.iloc[:, 1:].sum()
print("Column Counts:", column_counts, sep="\n")

# Write the one-hot submission to disk
output_path = '/content/models/transformed_submission.csv'
transformed_df_submission_np.to_csv(output_path, index=False)

# Show where the file was saved
output_path


Column Counts:
DR       1638
G        3624
ND       1110
WD       1530
other     761
dtype: int64


'/content/models/transformed_submission.csv'