# Ez lesz a baseline CNN model, amit mi készítünk és tanítunk

In [None]:
import os
from PIL import Image, ImageFile
from torchvision import datasets, transforms, models
import torch
from torch.utils.data import DataLoader, Dataset
from torch import nn, optim
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
import numpy as np
import seaborn as sns
import pandas as pd

# Let PIL load truncated (incompletely stored) image files instead of raising an error
ImageFile.LOAD_TRUNCATED_IMAGES = True

**Mount our drive folder and define the .zip path**

In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem (this cell only works inside Google Colab)
drive.mount('/content/drive')

# Path of the image ZIP archive on the mounted Drive; it is passed to the dataset class later
zip_file_path = '/content/drive/MyDrive/ADATELEMZÉS_HF/top_100_species_images.zip'

**Transformations** (converting from raw input data to processable data)

In [None]:
from torchvision import transforms

# Preprocessing applied to every image: resize, convert to a tensor, normalise per channel
transform = transforms.Compose(
    [
        # Fixed 128x128 input resolution (the network's size comments assume this)
        transforms.Resize(size=(128, 128)),
        # PIL image -> PyTorch tensor
        transforms.ToTensor(),
        # Per-channel mean / std normalisation (the widely used ImageNet statistics)
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

**Define our custom convolutional layer**

In [None]:
import torch
import torch.nn as nn

# Convolutional module (Conv+ReLU+BatchNorm)
class Conv(nn.Module):
    """3x3 convolution followed by ReLU and then batch normalisation.

    Args:
        in_channels: number of channels of the incoming feature map.
        channels: number of channels produced by the convolution.
        stride: stride of the convolution (default 1).
    """

    def __init__(self, in_channels, channels, stride=1):
        super().__init__()

        # Bias-free 3x3 convolution with one pixel of zero padding on each side
        self.conv = nn.Conv2d(
            in_channels,
            channels,
            kernel_size=3,
            stride=stride,
            padding=1,
            bias=False,
        )

        # Batch normalisation over the convolution's output channels
        self.bn = nn.BatchNorm2d(channels)

    def forward(self, x):
        """Return ``bn(relu(conv(x)))``."""
        activated = torch.relu(self.conv(x))
        return self.bn(activated)

**Our convolutional network**

In [None]:
class ConvNet(nn.Module):
    """Baseline CNN: four (conv, conv, strided conv) stages, pooling, 1x1 classifier.

    Args:
        base_channels: channel width of the first stage; every downscaling
            convolution doubles it.
        in_channels: number of channels of the input image.
        num_classes: number of logits produced per image.

    The spatial sizes noted in the comments assume 128x128 inputs.
    """

    def __init__(self, base_channels=16, in_channels=3, num_classes=100):
        super().__init__()
        width = base_channels

        # Stage 1: two convolutions, then a strided convolution that doubles the channels
        self.c11 = Conv(in_channels, width)
        self.c12 = Conv(width, width)
        self.d1 = Conv(width, width * 2, 2)        # 128x128 --> 64x64

        # Stage 2
        self.c21 = Conv(width * 2, width * 2)
        self.c22 = Conv(width * 2, width * 2)
        self.d2 = Conv(width * 2, width * 4, 2)    # 64x64 --> 32x32

        # Stage 3
        self.c31 = Conv(width * 4, width * 4)
        self.c32 = Conv(width * 4, width * 4)
        self.d3 = Conv(width * 4, width * 8, 2)    # 32x32 --> 16x16

        # Stage 4
        self.c41 = Conv(width * 8, width * 8)
        self.c42 = Conv(width * 8, width * 8)
        self.d4 = Conv(width * 8, width * 16, 2)   # 16x16 --> 8x8

        # A fifth stage (c51, c52, d5) is currently disabled.

        # Three 2x2 average poolings shrink the 8x8 map to 1x1
        self.avgpool1 = nn.AvgPool2d(kernel_size=2, stride=2)  # 8x8 --> 4x4
        self.avgpool2 = nn.AvgPool2d(kernel_size=2, stride=2)  # 4x4 --> 2x2
        self.avgpool3 = nn.AvgPool2d(kernel_size=2, stride=2)  # 2x2 --> 1x1

        # 1x1 convolution acting as the final classifier
        self.classifier = nn.Conv2d(width * 16, num_classes, kernel_size=1)

    def forward(self, x):
        """Run the network and return the flattened per-image output."""
        stages = (
            (self.c11, self.c12, self.d1),
            (self.c21, self.c22, self.d2),
            (self.c31, self.c32, self.d3),
            (self.c41, self.c42, self.d4),
        )
        for first, second, downscale in stages:
            x = downscale(second(first(x)))

        # Spatial reduction by repeated average pooling
        # (author's open question, translated: "multicollinear?")
        for pool in (self.avgpool1, self.avgpool2, self.avgpool3):
            x = pool(x)

        logits = self.classifier(x)

        # Flatten to (batch_size, -1); this is (batch_size, num_classes) for 128x128 inputs
        batch = logits.size(0)
        return logits.view(batch, -1)

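We have to look up the label of each picture in the metadata CSV: the picture's file name (`image_path`) identifies its `species`, and each of the top 100 species is mapped to an integer class label.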

In [None]:
# Read the training metadata table from Drive
df = pd.read_csv('/content/drive/MyDrive/ADATELEMZÉS_HF/FungiCLEF2023_train_metadata_PRODUCTION.csv')

# The 100 species with the most rows, most frequent first
species_counts = df['species'].value_counts()
top_100_species = species_counts.head(100).index.tolist()

# Keep only the rows that belong to one of those species
is_top_species = df['species'].isin(top_100_species)
df_top_100 = df[is_top_species]

# Reduce the table to (image_path, species) pairs
top_100_pairs = df_top_100[['image_path', 'species']]

# Report the size of the filtered table
print(f"Final dataframe shape: {top_100_pairs.shape}")

unique_species_ids = top_100_pairs['species'].nunique()
print(f"Number of unique species: {unique_species_ids}")
print(top_100_species)

# Integer class label of a species = its position in the frequency-ordered list
species_labels = dict(zip(top_100_species, range(len(top_100_species))))

print(species_labels)

# Defining a custom dataset for loading our data and labels

In [None]:
import os
from PIL import Image
from torch.utils.data import Dataset
import zipfile

# Define a custom dataset class for loading our images and labels
class CustomDataset(Dataset):
    """Images stored inside a ZIP archive, labelled through a metadata table.

    The archive is expected to contain a top-level folder named like the archive
    itself (file name without extension). Of the ``.JPG`` files inside that
    folder only every 10th one (in archive order) is kept.

    Args:
        zip_file_path: path of the ZIP archive holding the images.
        top_100_df: DataFrame with an ``image_path`` column (image file names)
            and a ``species`` column.
        transform: optional callable applied to each loaded RGB image.

    Labels are obtained by mapping the species name through the module-level
    ``species_labels`` dictionary.

    Raises:
        IndexError: if a sampled image has no row in ``top_100_df``.
    """

    def __init__(self, zip_file_path, top_100_df, transform=None):
        self.zip_file_path = zip_file_path
        self.top_100_df = top_100_df
        self.transform = transform
        self.image_paths = []  # Paths of the kept images inside the ZIP file
        self.labels = []       # Integer label of each kept image
        image_count = 0
        label_count = 0
        matched_files = 0      # Number of image files seen so far (drives the 1-in-10 sampling)

        print("Zip folder path:", zip_file_path)

        # Build the file-name -> species lookup once instead of scanning the
        # DataFrame for every image. The first row wins for duplicated names.
        unique_rows = top_100_df.drop_duplicates(subset='image_path')
        species_by_image = dict(zip(unique_rows['image_path'], unique_rows['species']))

        with zipfile.ZipFile(zip_file_path, 'r') as zip_file:
            # The images live in a folder named after the archive
            folder_name = os.path.splitext(os.path.basename(zip_file_path))[0]
            print("Folder name:", folder_name)

            for file_info in zip_file.infolist():
                file_name = file_info.filename

                is_image = (
                    file_name.startswith(folder_name + '/')
                    and file_name.endswith('.JPG')
                    and not file_name.startswith('__MACOSX/')
                )
                if not is_image:
                    continue

                # Keep the 1st, 11th, 21st, ... matching file
                keep = matched_files % 10 == 0
                matched_files += 1
                if not keep:
                    continue

                # The metadata table identifies images by bare file name
                image_name = os.path.basename(file_name)
                if image_name not in species_by_image:
                    raise IndexError(
                        f"No metadata row found for image '{image_name}'"
                    )
                label = species_labels.get(species_by_image[image_name])

                self.image_paths.append(file_name)
                image_count += 1
                self.labels.append(label)
                label_count += 1

                # Progress output: once per 100 kept images
                if label_count % 100 == 0:
                    print(label_count)

        print(len(set(self.labels)))
        print("ZIP file path:", zip_file_path)
        print("Subfolder name:", folder_name)
        print("Image count:", image_count)
        print("Label count:", label_count)

    def __len__(self):
        """Return the number of kept images."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Load image ``idx`` as RGB, apply the transform and return ``(image, label)``."""
        img_path = self.image_paths[idx]
        label = self.labels[idx]

        # The archive is reopened per item, so no open handle is stored on the dataset.
        with zipfile.ZipFile(self.zip_file_path, 'r') as zip_file:
            with zip_file.open(img_path) as image_file:
                # convert() decodes the pixel data, so the handle can be closed afterwards
                image = Image.open(image_file).convert("RGB")

        if self.transform:
            image = self.transform(image)

        return image, label

**Initialize dataset and dataloader**

In [None]:
# Build the dataset from the ZIP archive and the (image_path, species) table
dataset = CustomDataset(zip_file_path, top_100_pairs, transform=transform)

# Shuffled mini-batches of 16 samples
train_loader = DataLoader(dataset=dataset, shuffle=True, batch_size=16)


**Function to instantiate the network**

In [None]:
# Select the compute device: the GPU when CUDA is available, otherwise the CPU
haveCuda = torch.cuda.is_available()
device = torch.device("cuda" if haveCuda else "cpu")
print(f"Using device: {device}")
print(haveCuda)


def createNet():
    """Build a fresh ConvNet and move it to the GPU when CUDA is available."""
    fresh_net = ConvNet()
    return fresh_net.cuda() if haveCuda else fresh_net


net = createNet()

**Loss**

In [None]:
def createLoss():
    """Return the training criterion, a cross-entropy loss."""
    criterion = nn.CrossEntropyLoss()
    return criterion

**Optimizer**

In [None]:
from torch import optim

# Weight decay is the relative weight of the L2 regularization term
def createOptimizer():
    """Return an Adam optimizer over the parameters of the module-level ``net``."""
    adam = optim.Adam(params=net.parameters(), lr=1e-5, weight_decay=1e-6)
    return adam

**Learning rate scheduler (we will adjust the actual learning rate gradually)**

In [None]:
# Number of epochs (full passes over the dataset); also the scheduler's T_max
numEpoch = 5


def createScheduler():
    """Return a cosine-annealing LR scheduler bound to the module-level ``optimizer``.

    The schedule spans ``numEpoch`` scheduler steps and ends at ``eta_min=1e-2``.

    NOTE(review): ``eta_min`` (1e-2) is larger than the learning rate that
    createOptimizer passes to Adam (1e-5), so stepping this scheduler would
    move the learning rate *up* towards 1e-2 -- confirm this is intended.
    """
    return optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=numEpoch, eta_min=1e-2)

Progress bar to make the training more interesting

In [None]:
from IPython.display import HTML, display

def progress(value, max=100):
    """Return an IPython ``HTML`` object that renders a full-width progress bar.

    Args:
        value: current progress value.
        max: value that corresponds to a full bar (default 100).
    """
    # The attributes are separated by whitespace only; a comma between them
    # is not valid HTML attribute syntax.
    return HTML("""
        <progress
            value='{value}'
            max='{max}'
            style='width: 100%'
        >
            {value}
        </progress>
    """.format(value=value, max=max))

In [None]:
# Build the objects used for training. The order matters: createOptimizer()
# reads the global `net`, and createScheduler() reads the global `optimizer`.
net = createNet()
loss = createLoss()
optimizer = createOptimizer()
scheduler = createScheduler()

**Define a training for a given model and dataset**

In [None]:
def train_epoch(model, dataloader, optimizer, criterion, device):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        model: network to train; it is switched to training mode.
        dataloader: iterable yielding ``(inputs, labels)`` batches.
        optimizer: optimizer that updates the model's parameters.
        criterion: loss function called as ``criterion(outputs, labels)``.
        device: device the batches are moved to before the forward pass.

    Returns:
        ``(epoch_loss, epoch_acc)``: the sample-weighted mean loss and the
        fraction of correctly predicted samples, both as Python floats.
    """
    model.train()
    running_loss = 0.0
    running_corrects = 0
    total = 0

    for batch_idx, (inputs, labels) in enumerate(dataloader):
        print("Itt vagyok:", batch_idx + 1)
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        _, preds = torch.max(outputs, 1)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        # criterion returns a batch mean, so weight it by the batch size
        running_loss += loss.item() * inputs.size(0)
        running_corrects += torch.sum(preds == labels).item()
        total += labels.size(0)
        print("Total correct:", running_corrects)
        print("Total:", total)

    epoch_loss = running_loss / total
    # running_corrects is a plain int (accumulated via .item()), so ordinary
    # division already yields a float; calling .double() on it would fail.
    epoch_acc = running_corrects / total

    return epoch_loss, epoch_acc

**Define evaluation for a given model and dataset (no change in gradients, just evaluating the model)**

In [None]:
def evaluate(model, dataloader, criterion, device):
    """Evaluate ``model`` on ``dataloader`` without computing gradients.

    Args:
        model: network to evaluate; it is switched to evaluation mode.
        dataloader: iterable yielding ``(inputs, labels)`` batches.
        criterion: loss function called as ``criterion(outputs, labels)``.
        device: device the batches are moved to before the forward pass.

    Returns:
        ``(epoch_loss, epoch_acc)``: the sample-weighted mean loss and the
        fraction of correctly predicted samples, both as Python floats.
    """
    model.eval()
    running_loss = 0.0
    running_corrects = 0
    total = 0

    with torch.no_grad():
        for batch_idx, (inputs, labels) in enumerate(dataloader):
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            loss = criterion(outputs, labels)

            # criterion returns a batch mean, so weight it by the batch size
            running_loss += loss.item() * inputs.size(0)
            running_corrects += torch.sum(preds == labels).item()
            total += labels.size(0)

    epoch_loss = running_loss / total
    # running_corrects is a plain int (accumulated via .item()), so ordinary
    # division already yields a float; calling .double() on it would fail.
    epoch_acc = running_corrects / total

    return epoch_loss, epoch_acc

**Reset weights on a model (needed between folds, so that every fold trains a fresh model)**

In [None]:
def reset_weights(m):
    """Re-initialise module ``m`` in place if it defines ``reset_parameters``.

    Meant to be used as ``model.apply(reset_weights)``, which visits every
    sub-module. Besides ``Conv2d`` and ``Linear`` this also covers
    normalisation layers such as ``BatchNorm2d``, whose affine parameters and
    running statistics would otherwise carry over to the next training run.
    Modules without a ``reset_parameters`` method are left untouched.
    """
    reset = getattr(m, "reset_parameters", None)
    if callable(reset):
        reset()

**Define k-fold training with a given number of epochs in each split**

In [None]:
from sklearn.model_selection import KFold
from torch.utils.data import SubsetRandomSampler

def train_kfold(model, dataset, batch_size, num_folds, num_epochs, optimizer, criterion, device):
    """Train and validate ``model`` with k-fold cross-validation.

    Every fold starts from re-initialised weights and an empty optimizer
    state, so no information leaks from one fold into the next. When the
    function returns, ``model`` holds the weights trained in the last fold.

    Args:
        model: network to train; its weights are re-initialised at the start
            of every fold.
        dataset: dataset that is split into folds by index.
        batch_size: batch size of the training and validation loaders.
        num_folds: number of folds.
        num_epochs: number of epochs trained in each fold.
        optimizer: optimizer bound to ``model``'s parameters.
        criterion: loss function.
        device: device used for training and evaluation.

    Returns:
        ``(train_losses, val_losses, train_accuracies, val_accuracies)``:
        four lists with one entry per trained epoch, in fold order.
    """
    # Fixed seed so that the fold assignment is reproducible
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)

    train_losses = []
    val_losses = []
    train_accuracies = []
    val_accuracies = []

    for fold, (train_idx, val_idx) in enumerate(kfold.split(dataset)):
        print(f"Fold {fold+1}")

        # Fresh weights and fresh optimizer statistics (e.g. Adam's moment
        # estimates) for every fold. Resetting here, rather than after the
        # fold, leaves the last fold's trained weights in the model on return.
        model.apply(reset_weights)
        optimizer.state.clear()

        # Both loaders draw from the same dataset, restricted to the fold's indices
        train_sampler = SubsetRandomSampler(train_idx)
        val_sampler = SubsetRandomSampler(val_idx)

        train_loader = DataLoader(dataset, batch_size=batch_size, sampler=train_sampler)
        val_loader = DataLoader(dataset, batch_size=batch_size, sampler=val_sampler)

        for epoch in range(num_epochs):
            train_loss, train_acc = train_epoch(model, train_loader, optimizer, criterion, device)
            val_loss, val_acc = evaluate(model, val_loader, criterion, device)

            train_losses.append(train_loss)
            val_losses.append(val_loss)
            train_accuracies.append(train_acc)
            val_accuracies.append(val_acc)

            print(f"Epoch {epoch+1}: Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

    return train_losses, val_losses, train_accuracies, val_accuracies

# Training loop

In [None]:
# Cross-validation settings. These variables are passed to train_kfold below,
# so changing them here changes the run.
num_folds = 10
num_epochs = 3
batch_size = 16

# Train the model using k-fold cross-validation
train_losses, val_losses, train_accuracies, val_accuracies = train_kfold(
    net, dataset, batch_size=batch_size, num_folds=num_folds, num_epochs=num_epochs,
    optimizer=optimizer, criterion=loss, device=device)

# Save the model
torch.save(net.state_dict(), '/content/drive/MyDrive/ADATELEMZÉS_HF/trained_model.pth')

In [None]:
from sklearn.metrics import f1_score, classification_report, confusion_matrix

# Collect the predictions of the PyTorch network `net`.
# NOTE(review): the visible cells define no held-out test split, so the full
# `dataset` is used here and the metrics are optimistic. Point `eval_dataset`
# at a real test set once one is available.
eval_dataset = dataset
eval_loader = DataLoader(eval_dataset, batch_size=16, shuffle=False)

net.eval()
batch_logits = []
batch_targets = []
with torch.no_grad():
    for inputs, labels in eval_loader:
        batch_logits.append(net(inputs.to(device)).cpu())
        batch_targets.append(labels)

# Raw network outputs and true labels as NumPy arrays
y_pred = torch.cat(batch_logits).numpy()
y_test = torch.cat(batch_targets).numpy()

# Predicted class = index of the largest logit
y_pred_classes = y_pred.argmax(axis=1)

macro_f1 = f1_score(y_test, y_pred_classes, average='macro')
print(f"Macro F1 score: {macro_f1:.4f}")

print("Classification Report:\n", classification_report(y_test, y_pred_classes))