In [1]:
import torch
from Helper.ml_models import MapillaryTrainedModel

import os

# Architecture to instantiate (e.g. 'fcn_resnet101' or 'deeplabv3_resnet50').
model_name = 'fcn_resnet101'

# Placeholder directory passed to the model as folder_path; it is created
# here if it does not exist yet, which is enough to test initialisation only.
dummy_folder = "dummy_folder_for_testing"
os.makedirs(dummy_folder, exist_ok=True)

# Constructor arguments are gathered first so the configuration reads as one unit.
model_kwargs = dict(
    model_name=model_name,
    width=520,
    height=520,
    weights_name='',
    folder_path=dummy_folder,
    start_epoch='latest',
)
model_instance = MapillaryTrainedModel(**model_kwargs)

# Report the device of the first model parameter and the number of classes.
first_parameter = next(model_instance.model.parameters())
print("Modellgerät:", first_parameter.device)
print("Anzahl der Klassen (num_classes):", model_instance.num_classes)


2025-03-04 11:54:28.068419: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Using CUDA GPU
Model loaded: fcn_resnet101 | Device: cuda 
Error loading Model with Epoch latest: Error(s) in loading state_dict for FCN:
	size mismatch for classifier.4.weight: copying a param with shape torch.Size([20, 512, 1, 1]) from checkpoint, the shape in current model is torch.Size([124, 512, 1, 1]).
	size mismatch for classifier.4.bias: copying a param with shape torch.Size([20]) from checkpoint, the shape in current model is torch.Size([124]).
Skipping local .pth load due to error above.
Modellgerät: cuda:0
Anzahl der Klassen (num_classes): 124


In [2]:
import os
import ray.cloudpickle as pickle

# Path of the checkpoint to inspect; adjust it to the actual storage location.
checkpoint_path = "/home/jan/studienarbeit/Studienarbeit-CODE_Semantische_Segmentation/HyperparameterLOG/fcn_resnet101/train_hyper_c75247d6_1_auto_cast=True,batch_size=4,learning_rate=0.0001,max_epochs=100,weight_decay=0.0000_2025-02-22_22-23-45/checkpoint_000099/checkpoint.pkl"

if os.path.isfile(checkpoint_path):
    # NOTE(security): unpickling can execute arbitrary code; only load
    # checkpoint files that come from a trusted source.
    with open(checkpoint_path, "rb") as fp:
        checkpoint_data = pickle.load(fp)

    # Look up both classifier parameters independently. Checking only the
    # weight key and then indexing the bias key would raise a KeyError for a
    # checkpoint that contains one but not the other.
    model_state = checkpoint_data["model_state"]
    cp_weight = model_state.get("classifier.4.weight")
    cp_bias = model_state.get("classifier.4.bias")

    if cp_weight is None:
        print("Parameter 'classifier.4.weight' nicht im Checkpoint gefunden.")
    else:
        weight_shape = cp_weight.shape
        print("Checkpoint classifier.4.weight shape:", weight_shape)

    if cp_bias is None:
        print("Parameter 'classifier.4.bias' nicht im Checkpoint gefunden.")
    else:
        bias_shape = cp_bias.shape
        print("Checkpoint classifier.4.bias shape:", bias_shape)
else:
    print("Kein Checkpoint gefunden an:", checkpoint_path)


Checkpoint classifier.4.weight shape: torch.Size([124, 512, 1, 1])
Checkpoint classifier.4.bias shape: torch.Size([124])


In [3]:
import os

# Directory to scan. Point this at whatever is passed to the model as
# folder_path; note that the earlier test cell used a dummy folder instead.
model_folder = "/home/jan/studienarbeit/Studienarbeit-CODE_Semantische_Segmentation/HyperparameterLOG"  # or the folder actually used for your models

# Keep only the direct entries of model_folder whose name ends in '.pth'
# (os.listdir does not descend into subdirectories).
pth_files = list(filter(lambda entry_name: entry_name.endswith('.pth'), os.listdir(model_folder)))
print("Gefundene .pth-Dateien im Modellordner:", pth_files)


Gefundene .pth-Dateien im Modellordner: []


In [4]:
# -----------------------------
# Evaluation metrics: confusion matrix, mIoU, mean pixel accuracy, FWIoU, Dice
# -----------------------------
import pandas as pd
from Helper.ml_models import TrainedModel, K_Fold_Dataset

def compute_confusion_matrix(predicted, ground_truth, num_classes):
    """Build a ``num_classes x num_classes`` confusion matrix from label tensors.

    Rows index the ground-truth class and columns the predicted class.

    Args:
        predicted: Tensor of predicted class indices with the same shape as
            ``ground_truth``. Values are not range-checked here.
        ground_truth: Tensor of reference class indices. Entries outside
            ``[0, num_classes)`` are excluded from the count.
        num_classes: Number of classes.

    Returns:
        An integer tensor of shape ``(num_classes, num_classes)`` whose entry
        ``[g, p]`` counts the elements with ground truth ``g`` predicted as ``p``.
    """
    mask = (ground_truth >= 0) & (ground_truth < num_classes)
    # Promote to int64 before forming the flat index: with a narrow dtype such
    # as uint8, ``num_classes * ground_truth`` would silently wrap around and
    # the counts would land in the wrong cells.
    gt_valid = ground_truth[mask].to(torch.long)
    pred_valid = predicted[mask].to(torch.long)
    label = num_classes * gt_valid + pred_valid
    count = torch.bincount(label, minlength=num_classes**2)
    return count.reshape(num_classes, num_classes)

def compute_miou(confusion_matrix):
    """Compute mean and per-class intersection-over-union from a confusion matrix.

    Row sums are treated as ground-truth counts and column sums as prediction
    counts. A small constant (1e-6) is added to each union so classes with an
    empty union yield an IoU of 0 instead of a division by zero; such classes
    still take part in the mean.

    Args:
        confusion_matrix: Square 2-D tensor of counts.

    Returns:
        Tuple ``(mean_iou, per_class_iou)``: a Python float and a 1-D tensor.
    """
    true_positive = confusion_matrix.diag()
    per_class_union = confusion_matrix.sum(dim=1) + confusion_matrix.sum(dim=0) - true_positive
    per_class_iou = true_positive / (per_class_union + 1e-6)
    return per_class_iou.mean().item(), per_class_iou

def compute_mean_pixel_accuracy(confusion_matrix):
    """Compute mean and per-class pixel accuracy from a confusion matrix.

    Per-class accuracy is the diagonal entry divided by the row sum (plus
    1e-6 to avoid division by zero); the mean is taken over all classes.

    Args:
        confusion_matrix: Square 2-D tensor of counts.

    Returns:
        Tuple ``(mean_accuracy, per_class_accuracy)``: a Python float and a
        1-D tensor.
    """
    correct = confusion_matrix.diag()
    row_totals = confusion_matrix.sum(dim=1)
    per_class_accuracy = correct / (row_totals + 1e-6)
    return per_class_accuracy.mean().item(), per_class_accuracy

def compute_fwiou(confusion_matrix):
    """Compute the frequency-weighted IoU from a confusion matrix.

    Each class IoU is weighted by that class's share of the total count
    (row sum divided by the sum of all entries) and the weighted values are
    summed.

    Args:
        confusion_matrix: Square 2-D tensor of counts.

    Returns:
        The frequency-weighted IoU as a Python float. An all-zero matrix
        yields 0.0.
    """
    total_pixels = confusion_matrix.sum()
    ground_truth_set = confusion_matrix.sum(1)
    intersection = torch.diag(confusion_matrix)
    union = ground_truth_set + confusion_matrix.sum(0) - intersection
    iou = intersection / (union + 1e-6)
    # With an all-zero matrix the numerator is 0 and the total is 0, which
    # would give NaN. Substituting 1 for a zero total returns 0.0 instead,
    # matching what the other metrics in this cell return for that input.
    safe_total = torch.where(total_pixels == 0, torch.ones_like(total_pixels), total_pixels)
    weighted_iou = (ground_truth_set * iou) / safe_total
    return weighted_iou.sum().item()

def compute_dice_coefficient(confusion_matrix):
    """Compute mean and per-class Dice coefficient from a confusion matrix.

    Per class, Dice is twice the diagonal entry divided by the sum of the
    row total and the column total (plus 1e-6 to avoid division by zero).

    Args:
        confusion_matrix: Square 2-D tensor of counts.

    Returns:
        Tuple ``(mean_dice, per_class_dice)``: a Python float and a 1-D tensor.
    """
    overlap = confusion_matrix.diag()
    combined_size = confusion_matrix.sum(dim=1) + confusion_matrix.sum(dim=0)
    per_class_dice = (2 * overlap) / (combined_size + 1e-6)
    return per_class_dice.mean().item(), per_class_dice


In [5]:
import torch
from tqdm import tqdm
from Helper.ml_models import MapillaryTrainedModel, MapillaryDataLoader
NUM_CLASSES = 124

# Data loader configured with the Mapillary Vistas training/validation image
# and annotation directories.
mapillary_loader = MapillaryDataLoader(
    train_images_dir="/home/jan/studienarbeit/Studienarbeit-CODE_Semantische_Segmentation/Mapillary_Vistas/training/images",
    train_annotations_dir="/home/jan/studienarbeit/Studienarbeit-CODE_Semantische_Segmentation/Mapillary_Vistas/training_own",
    val_images_dir="/home/jan/studienarbeit/Studienarbeit-CODE_Semantische_Segmentation/Mapillary_Vistas/validation/images",
    val_annotations_dir="/home/jan/studienarbeit/Studienarbeit-CODE_Semantische_Segmentation/Mapillary_Vistas/validation_own"
)

# Take the test dataset from the loader so it is defined for the steps below.
test_dataset = mapillary_loader.test_dataset  # in case it has not been set yet

# Select the target device (CUDA if available, otherwise CPU).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Zielgerät:", device)

# Fetch a single (image, annotation) pair from the test dataset.
sample_image, sample_annotation = test_dataset[0]

# Move both tensors to the target device (no-op if they are already there).
sample_image_device = sample_image.to(device)
sample_annotation_device = sample_annotation.to(device)

# Run inference with model_instance, which must have been created in an earlier cell.
# NOTE(review): the original comment referred to `model_loaded`, but the code
# uses `model_instance` — confirm which model is meant to be evaluated here.
output = model_instance.inference(sample_image_device)
print("Output befindet sich auf:", output.device)

# Derive the predicted class map and report which device each tensor is on.
# assumes output is laid out as (batch, classes, H, W) — TODO confirm
predicted = output.argmax(1).squeeze(0)
print("Predicted Tensor befindet sich auf:", predicted.device)
print("Annotation befindet sich auf:", sample_annotation_device.device)

# Compute the confusion matrix; both inputs are moved to the CPU before the call.
conf_mat = compute_confusion_matrix(predicted.cpu(), sample_annotation_device.cpu(), NUM_CLASSES)
print("Confusion Matrix (vor .to(device)) befindet sich auf:", conf_mat.device)

# Transfer the confusion matrix to the target device.
conf_mat = conf_mat.to(device)
print("Confusion Matrix (nach .to(device)) befindet sich auf:", conf_mat.device)



Zielgerät: cuda
Output befindet sich auf: cuda:0
Predicted Tensor befindet sich auf: cuda:0
Annotation befindet sich auf: cuda:0
Confusion Matrix (vor .to(device)) befindet sich auf: cpu
Confusion Matrix (nach .to(device)) befindet sich auf: cuda:0


In [6]:
import os
import ray.cloudpickle as pickle

# Checkpoint path taken from the error message; adjust it to your concrete case.
checkpoint_path = "/home/jan/studienarbeit/Studienarbeit-CODE_Semantische_Segmentation/HyperparameterLOG/fcn_resnet101/train_hyper_c75247d6_1_auto_cast=True,batch_size=4,learning_rate=0.0001,max_epochs=100,weight_decay=0.0000_2025-02-22_22-23-45/checkpoint_000099/checkpoint.pkl"

if os.path.isfile(checkpoint_path):
    # NOTE(security): unpickling can execute arbitrary code; only load
    # checkpoint files that come from a trusted source.
    with open(checkpoint_path, "rb") as fp:
        checkpoint_data = pickle.load(fp)

    # Fetch the classifier tensors; .get returns None for a missing key.
    cp_weight = checkpoint_data["model_state"].get("classifier.4.weight", None)
    cp_bias = checkpoint_data["model_state"].get("classifier.4.bias", None)

    # Bind the *_shape names to actual shapes (or None) rather than to the
    # tensors themselves, so the names match what they hold.
    weight_shape = None if cp_weight is None else cp_weight.shape
    bias_shape = None if cp_bias is None else cp_bias.shape

    if weight_shape is not None and bias_shape is not None:
        print("Checkpoint classifier.4.weight shape:", weight_shape)
        print("Checkpoint classifier.4.bias shape:", bias_shape)
    else:
        print("Die Schlüssel 'classifier.4.weight' oder 'classifier.4.bias' wurden im Checkpoint nicht gefunden.")
else:
    print("Kein Checkpoint gefunden an:", checkpoint_path)


Checkpoint classifier.4.weight shape: torch.Size([124, 512, 1, 1])
Checkpoint classifier.4.bias shape: torch.Size([124])


In [7]:
# Step 3: inspect the classifier-layer shapes of a freshly instantiated model.
model_instance = MapillaryTrainedModel(
    model_name=model_name, width=520, height=520, weights_name='',
    folder_path="dummy_folder_for_testing",  # dummy path or any other test path
    start_epoch='latest',
)

# Read the parameters from the model's state dict and print each shape in turn.
state_dict_model = model_instance.model.state_dict()
classifier_weight = state_dict_model["classifier.4.weight"]
print("Model classifier.4.weight shape:", classifier_weight.shape)
classifier_bias = state_dict_model["classifier.4.bias"]
print("Model classifier.4.bias shape:", classifier_bias.shape)


Using CUDA GPU
Model loaded: fcn_resnet101 | Device: cuda 
Error loading Model with Epoch latest: Error(s) in loading state_dict for FCN:
	size mismatch for classifier.4.weight: copying a param with shape torch.Size([20, 512, 1, 1]) from checkpoint, the shape in current model is torch.Size([124, 512, 1, 1]).
	size mismatch for classifier.4.bias: copying a param with shape torch.Size([20]) from checkpoint, the shape in current model is torch.Size([124]).
Skipping local .pth load due to error above.
Model classifier.4.weight shape: torch.Size([124, 512, 1, 1])
Model classifier.4.bias shape: torch.Size([124])


In [8]:
import os
import ray.cloudpickle as pickle
from Helper.ml_models import MapillaryTrainedModel

def load_checkpointed_model_ray(model_name, checkpoint_path):
    """Debug helper: build a model, print the checkpoint's classifier shapes, try to load it.

    Args:
        model_name: Architecture name forwarded to ``MapillaryTrainedModel``.
        checkpoint_path: Path to a pickled checkpoint that contains a
            ``"model_state"`` entry (a state dict).

    Returns:
        The ``MapillaryTrainedModel`` instance. If loading the state dict
        raises, the error is printed and the instance is returned anyway, so
        its weights are not guaranteed to match the checkpoint.
    """
    # Instantiate the wrapper; the number of classes is decided by
    # MapillaryTrainedModel itself (124 in the recorded runs).
    loaded_model = MapillaryTrainedModel(model_name=model_name, width=520, height=520, weights_name='')

    # NOTE(security): unpickling can execute arbitrary code; only load
    # checkpoint files that come from a trusted source.
    with open(checkpoint_path, "rb") as fp:
        checkpoint_data = pickle.load(fp)

    # Debug output: shapes of the classifier parameters stored in the checkpoint.
    # A missing key prints None instead of failing on `None.shape`, so the
    # load attempt below still runs and reports the real problem.
    model_state = checkpoint_data["model_state"]
    for param_key in ("classifier.4.weight", "classifier.4.bias"):
        cp_tensor = model_state.get(param_key)
        cp_shape = None if cp_tensor is None else cp_tensor.shape
        print(f"DEBUG: Checkpoint {param_key} shape:", cp_shape)

    # Best-effort load: this is a debugging aid, so a failure is reported
    # rather than propagated.
    try:
        loaded_model.model.load_state_dict(model_state, strict=True)
    except Exception as e:
        print("Error loading checkpoint:", e)

    return loaded_model

# Adjust this path to the checkpoint you actually want to debug.
checkpoint_path = "/home/jan/studienarbeit/Studienarbeit-CODE_Semantische_Segmentation/HyperparameterLOG/fcn_resnet101/train_hyper_c75247d6_1_auto_cast=True,batch_size=4,learning_rate=0.0001,max_epochs=100,weight_decay=0.0000_2025-02-22_22-23-45/checkpoint_000099/checkpoint.pkl"
model_name = "fcn_resnet101"

# Load the model through the debug function load_checkpointed_model_ray.
model_loaded = load_checkpointed_model_ray(model_name, checkpoint_path)


Using CUDA GPU
Model loaded: fcn_resnet101 | Device: cuda 
Error loading Model with Epoch latest: Error(s) in loading state_dict for FCN:
	size mismatch for classifier.4.weight: copying a param with shape torch.Size([20, 512, 1, 1]) from checkpoint, the shape in current model is torch.Size([124, 512, 1, 1]).
	size mismatch for classifier.4.bias: copying a param with shape torch.Size([20]) from checkpoint, the shape in current model is torch.Size([124]).
Skipping local .pth load due to error above.
DEBUG: Checkpoint classifier.4.weight shape: torch.Size([124, 512, 1, 1])
DEBUG: Checkpoint classifier.4.bias shape: torch.Size([124])


In [9]:
import os
import ray.cloudpickle as pickle

# Adjust the path to your checkpoint if necessary.
checkpoint_path = "/home/jan/studienarbeit/Studienarbeit-CODE_Semantische_Segmentation/HyperparameterLOG/fcn_resnet101/train_hyper_c75247d6_1_auto_cast=True,batch_size=4,learning_rate=0.0001,max_epochs=100,weight_decay=0.0000_2025-02-22_22-23-45/checkpoint_000099/checkpoint.pkl"

with open(checkpoint_path, "rb") as fp:
    checkpoint_data = pickle.load(fp)

# Walk the state dict in key order and print the shape of every entry whose
# key mentions "classifier"; all other entries are skipped.
model_state = checkpoint_data["model_state"]
for key, tensor in sorted(model_state.items()):
    if "classifier" not in key:
        continue
    print(f"{key}: {tensor.shape}")


classifier.0.weight: torch.Size([512, 2048, 3, 3])
classifier.1.bias: torch.Size([512])
classifier.1.num_batches_tracked: torch.Size([])
classifier.1.running_mean: torch.Size([512])
classifier.1.running_var: torch.Size([512])
classifier.1.weight: torch.Size([512])
classifier.4.bias: torch.Size([124])
classifier.4.weight: torch.Size([124, 512, 1, 1])


In [10]:
import os
import ray.cloudpickle as pickle
from Helper.ml_models import MapillaryTrainedModel

# Instantiate the model with skip_local_load=True, which is meant to prevent
# the local .pth checkpoint from being loaded.
# NOTE(review): the recorded output of this cell still shows the local .pth
# load being attempted and failing — verify that the constructor honours the flag.
model_name = "fcn_resnet101"
dummy_folder = "dummy_folder_for_testing"
os.makedirs(dummy_folder, exist_ok=True)

model_instance = MapillaryTrainedModel(
    model_name=model_name, width=520, height=520, weights_name='',
    folder_path=dummy_folder, start_epoch='latest',
    skip_local_load=True,  # important: intended to skip the local .pth checkpoint
)

# Confirm the initialisation, then report classifier shape and class count.
print("Modell initialisiert mit skip_local_load=True")
classifier_weight = model_instance.model.state_dict()["classifier.4.weight"]
print("Model classifier.4.weight shape:", classifier_weight.shape)
print("Anzahl der Klassen (num_classes):", model_instance.num_classes)

# Jetzt laden wir den Ray Tune Checkpoint (das .pkl), der korrekt 124 Klassen hat.
def load_checkpointed_model_ray(model_instance, checkpoint_path):
    """Load a pickled checkpoint's ``"model_state"`` into an existing model wrapper.

    Note: an earlier cell defines a function with the same name whose first
    parameter is a model *name*; running this cell replaces that definition.

    Args:
        model_instance: Object exposing a ``.model`` attribute with a
            ``load_state_dict`` method (here a ``MapillaryTrainedModel``).
        checkpoint_path: Path to a pickled checkpoint that contains a
            ``"model_state"`` entry (a state dict).

    Returns:
        The same ``model_instance``. If loading raises, the error is printed
        and the instance is returned anyway, so its weights are not
        guaranteed to match the checkpoint.
    """
    # NOTE(security): unpickling can execute arbitrary code; only load
    # checkpoint files that come from a trusted source.
    with open(checkpoint_path, "rb") as fp:
        checkpoint_data = pickle.load(fp)

    # Debug output. A missing key prints None instead of failing on
    # `None.shape`, so the load attempt below still runs and reports the
    # actual mismatch.
    model_state = checkpoint_data["model_state"]
    for param_key in ("classifier.4.weight", "classifier.4.bias"):
        cp_tensor = model_state.get(param_key)
        cp_shape = None if cp_tensor is None else cp_tensor.shape
        print(f"DEBUG: Checkpoint {param_key} shape:", cp_shape)

    # Best-effort load: failures are reported, not propagated.
    try:
        model_instance.model.load_state_dict(model_state, strict=True)
        print("Checkpoint erfolgreich geladen.")
    except Exception as e:
        print("Fehler beim Laden des Checkpoints:", e)

    return model_instance

# Adjust the path to your actual checkpoint.
checkpoint_path = "/home/jan/studienarbeit/Studienarbeit-CODE_Semantische_Segmentation/HyperparameterLOG/fcn_resnet101/train_hyper_c75247d6_1_auto_cast=True,batch_size=4,learning_rate=0.0001,max_epochs=100,weight_decay=0.0000_2025-02-22_22-23-45/checkpoint_000099/checkpoint.pkl"

# Load the checkpoint into the model_instance created earlier in this cell.
model_loaded = load_checkpointed_model_ray(model_instance, checkpoint_path)


Using CUDA GPU
Model loaded: fcn_resnet101 | Device: cuda 
Error loading Model with Epoch latest: Error(s) in loading state_dict for FCN:
	size mismatch for classifier.4.weight: copying a param with shape torch.Size([20, 512, 1, 1]) from checkpoint, the shape in current model is torch.Size([124, 512, 1, 1]).
	size mismatch for classifier.4.bias: copying a param with shape torch.Size([20]) from checkpoint, the shape in current model is torch.Size([124]).
Skipping local .pth load due to error above.
Modell initialisiert mit skip_local_load=True
Model classifier.4.weight shape: torch.Size([124, 512, 1, 1])
Anzahl der Klassen (num_classes): 124
DEBUG: Checkpoint classifier.4.weight shape: torch.Size([124, 512, 1, 1])
DEBUG: Checkpoint classifier.4.bias shape: torch.Size([124])
Checkpoint erfolgreich geladen.
