In [1]:
from fastai.vision.all import *
from fastcore.parallel import *
import fastai
import pandas as pd
from pathlib import Path
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from fastai.callback.tracker import SaveModelCallback
import timm
import os
from PIL import Image
import torchvision.transforms as transforms




In [None]:
# Colab-only: mount Google Drive so the dataset and checkpoints below are reachable.
from google.colab import drive
drive.mount('/content/drive')

# Instantiate the pretrained EfficientFormerV2-S2 once and keep a local copy of its weights.
model = timm.create_model("efficientformerv2_s2", pretrained=True)
torch.save(model.state_dict(), "efficientformerv2_s2_weights.pth")

metadata_path = Path("/content/drive/MyDrive/CS675-repo/isic_2024_data/train-metadata.csv")
df = pd.read_csv(metadata_path)

# Columns that are excluded from the feature lists built below.
columns_to_drop = ['copyright_license', 'attribution', 'image_type', 'iddx_1', 'iddx_2', 'iddx_3', 'iddx_4',
                   'iddx_5', 'iddx_full', 'mel_mitotic_index', 'mel_thick_mm', 'tbp_tile_type',
                   'tbp_lv_dnn_lesion_confidence', 'lesion_id']
cat_names = ['sex', 'anatom_site_general', 'tbp_lv_location', 'tbp_lv_location_simple']

# Categorical column names listed in a side CSV; keep only string entries that exist in `df`.
new_cat_columns = pd.read_csv('/content/drive/MyDrive/CS675-repo/isic_2024_data/new_cat_columns.csv')['new_cat_columns'].tolist()
new_cat_columns = [col for col in new_cat_columns if isinstance(col, str) and col in df.columns]

# Continuous features = every remaining column. `new_cat_columns` is excluded as well so
# that a column is never fed to the model both as an embedding and as a continuous value
# (it is label-encoded in place later, which would turn the "continuous" copy into codes).
non_continuous = set(cat_names) | set(new_cat_columns) | {'target', 'isic_id', 'patient_id'} | set(columns_to_drop)
cont_names = [x for x in df.columns if x not in non_continuous]
y_col = 'target'
image_col = 'isic_id'

In [3]:
image_dir = '/content/drive/MyDrive/CS675-repo/isic_2024_data/train-image/image'

# Keep only rows whose JPEG is present. The directory is listed once instead of calling
# os.path.exists for every row, which is one filesystem round-trip per row on the mount.
available_images = set(os.listdir(image_dir)) if os.path.isdir(image_dir) else set()
df = df[(df['isic_id'].astype(str) + '.jpg').isin(available_images)].reset_index(drop=True)

# Integer-encode the categorical columns. `cat.codes` maps NaN to -1, which is not a valid
# embedding index, so missing values get their own explicit "Missing" level first.
for col in new_cat_columns:
    as_category = df[col].astype('category')
    if as_category.isnull().any():
        if "Missing" not in as_category.cat.categories:
            as_category = as_category.cat.add_categories("Missing")
        as_category = as_category.fillna("Missing")
    df[col] = as_category.cat.codes



In [26]:
import os
import numpy as np
from sklearn.preprocessing import LabelEncoder

image_dir = '/content/drive/MyDrive/CS675-repo/isic_2024_data/train-image/image'

# Keep only rows whose JPEG is present; one directory listing replaces a per-row
# os.path.exists call.
available_images = set(os.listdir(image_dir)) if os.path.isdir(image_dir) else set()
df = df[(df['isic_id'].astype(str) + '.jpg').isin(available_images)].reset_index(drop=True)

# Rows without a label cannot be used for training or validation.
df = df.dropna(subset=[y_col]).reset_index(drop=True)

# Label-encode every categorical column on its string form (NaN becomes the level "nan").
for col in new_cat_columns:
    df[col] = LabelEncoder().fit_transform(df[col].astype(str))

# Treat +/-inf as missing, then drop rows with any missing continuous feature.
df[cont_names] = df[cont_names].replace([np.inf, -np.inf], np.nan)
df = df.dropna(subset=cont_names).reset_index(drop=True)


In [11]:
# Re-encode each categorical column as integer codes, giving NaN its own "Missing" level.
for col in new_cat_columns:
    categorical = df[col].astype('category')
    has_missing = categorical.isnull().any()
    if has_missing:
        categorical = categorical.cat.add_categories("Missing").fillna("Missing")
    df[col] = categorical.cat.codes


In [None]:
from fastai.tabular.model import TabularModel

class ImageTabDataset(Dataset):
    """Dataset yielding ``(img, x_cat, x_cont, y)`` for one row of ``df``.

    The image is read from ``<image_dir>/<isic_id>.jpg``, converted to RGB, resized to
    ``image_size`` x ``image_size``, converted to a tensor and normalised with the
    mean/std constants below.

    Args:
        df: frame holding an ``isic_id`` column plus the feature and target columns.
        image_dir: directory containing the ``.jpg`` files.
        cat_cols: names of the integer-coded categorical columns.
        cont_cols: names of the continuous columns.
        y_col: name of the target column.
        image_size: side length the image is resized to.
    """

    def __init__(self, df, image_dir, cat_cols, cont_cols, y_col, image_size=224):
        self.df = df.reset_index(drop=True)
        self.image_dir = image_dir
        self.cat_cols = cat_cols
        self.cont_cols = cont_cols
        self.y_col = y_col
        self.transform = transforms.Compose([
            transforms.Resize((image_size, image_size)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
        self._cache_tabular()

    def _cache_tabular(self):
        """Convert ids, features and targets to NumPy arrays once.

        Doing this per item via ``df.iloc[idx]`` builds an object-dtype Series and
        re-casts it on every access; the values themselves never change.
        """
        self._image_ids = self.df['isic_id'].to_numpy()
        self._cat = self.df[list(self.cat_cols)].to_numpy(dtype='int64')
        self._cont = self.df[list(self.cont_cols)].to_numpy(dtype='float32')
        self._targets = self.df[self.y_col].to_numpy()

    def _tabular_item(self, idx):
        """Return ``(x_cat, x_cont, y)`` tensors for row ``idx``."""
        x_cat = torch.tensor(self._cat[idx], dtype=torch.long)
        x_cont = torch.tensor(self._cont[idx], dtype=torch.float)
        y = torch.tensor(self._targets[idx], dtype=torch.long)
        return x_cat, x_cont, y

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        img_path = os.path.join(self.image_dir, f"{self._image_ids[idx]}.jpg")
        img = Image.open(img_path).convert("RGB")
        img = self.transform(img)

        x_cat, x_cont, y = self._tabular_item(idx)
        return img, x_cat, x_cont, y


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Stratified 75/25 split on the target, with a fixed seed for reproducibility.
train_indices, valid_indices = train_test_split(
    df.index,
    test_size=0.25,
    stratify=df[y_col],
    random_state=42
)

# `train_test_split` was given index *labels*, so select rows by label with `.loc`.
# `.iloc` only gave the same rows while the index happened to be a fresh 0..n-1 range.
train_df = df.loc[train_indices]
valid_df = df.loc[valid_indices]

train_dataset = ImageTabDataset(train_df, image_dir, new_cat_columns, cont_names, y_col)
valid_dataset = ImageTabDataset(valid_df, image_dir, new_cat_columns, cont_names, y_col)

# Only the training loader shuffles; validation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=4)
valid_loader = DataLoader(valid_dataset, batch_size=128, shuffle=False, num_workers=4)
dls = DataLoaders(train_loader, valid_loader)

def get_emb_szs(df, new_cat_columns):
    """Return one ``(num_embeddings, embedding_dim)`` pair per categorical column.

    ``num_embeddings`` is the number of levels plus one spare slot. For numeric
    (integer-coded) columns the level count is at least ``max code + 1``, so the table
    still covers every code when ``df`` is a subset in which some codes do not occur.
    ``embedding_dim`` is half of ``num_embeddings``, capped at 50.

    Args:
        df: frame containing the categorical columns.
        new_cat_columns: names of the categorical columns.

    Returns:
        list of ``(int, int)`` tuples in the order of ``new_cat_columns``.
    """
    emb_szs = []
    for col in new_cat_columns:
        values = df[col]
        n_levels = int(values.nunique())
        if len(values) and pd.api.types.is_numeric_dtype(values):
            max_code = values.max()
            if pd.notna(max_code):
                # Codes index the embedding table directly, so gaps must be covered.
                n_levels = max(n_levels, int(max_code) + 1)
        n_emb = n_levels + 1
        emb_szs.append((n_emb, min(50, n_emb // 2)))
    return emb_szs

# The integer codes were assigned on the full `df`, so size the embedding tables from it:
# a level that only appears in the validation rows would otherwise index past the table.
emb_szs = get_emb_szs(df, new_cat_columns)
n_cont = len(cont_names)
# Number of distinct target values seen in training = number of output logits.
out_sz = len(train_df[y_col].unique())

class ImageTabularModel_1(nn.Module):
    """Late-fusion classifier: frozen ResNet-50 image branch + fastai ``TabularModel``.

    Each branch emits ``out_sz`` values; the two are concatenated and mapped to
    ``out_sz`` logits by a final linear layer. Only the replaced ResNet ``fc`` layer,
    the tabular branch and the fusion head have trainable parameters.

    Args:
        emb_szs: ``(num_embeddings, embedding_dim)`` pairs for the categorical inputs.
        n_cont: number of continuous inputs.
        out_sz: number of output logits.
        layers: hidden layer sizes passed to ``TabularModel``.
        ps: dropout setting passed to ``TabularModel``.
    """

    def __init__(self, emb_szs, n_cont, out_sz, layers, ps=0.5):
        super().__init__()
        # Name the weights explicitly: `weights=True` is the deprecated legacy form and
        # resolves to these same ImageNet-1k V1 weights after emitting a warning.
        self.cnn = models.resnet50(weights="IMAGENET1K_V1")
        # Freeze the pretrained backbone ...
        for param in self.cnn.parameters():
            param.requires_grad = False
        # ... then swap in a fresh, trainable classification layer.
        num_ftrs = self.cnn.fc.in_features
        self.cnn.fc = nn.Linear(num_ftrs, out_sz)
        for param in self.cnn.fc.parameters():
            param.requires_grad = True
        self.tab_net = TabularModel(emb_szs, n_cont, out_sz, layers, ps)
        self.head = nn.Linear(out_sz * 2, out_sz)

    def forward(self, x_img, x_cat, x_cont):
        """Return fused logits for an image batch and its tabular features."""
        if x_cat.dtype != torch.long:
            # Embedding lookups require integer (long) indices.
            x_cat = x_cat.long()
        img_out = self.cnn(x_img)
        tab_out = self.tab_net(x_cat, x_cont)
        return self.head(torch.cat([img_out, tab_out], dim=1))

class ImageTabularModel_2(nn.Module):
    """Late-fusion classifier: EfficientFormerV2-S2 image branch + fastai ``TabularModel``.

    The image branch's classifier output is concatenated with the ``out_sz`` outputs of
    the tabular branch and mapped to ``out_sz`` logits by a linear head.

    Args:
        emb_szs: ``(num_embeddings, embedding_dim)`` pairs for the categorical inputs.
        n_cont: number of continuous inputs.
        out_sz: number of output logits.
        layers: hidden layer sizes passed to ``TabularModel``.
        ps: dropout setting passed to ``TabularModel``.
    """

    def __init__(self, emb_szs, n_cont, out_sz, layers, ps=0.5):
        super().__init__()
        self.cnn = timm.create_model("efficientformerv2_s2", pretrained=True)
        # NOTE(review): `fc` is never called in forward(); it is kept so the set of
        # parameters (and therefore saved checkpoints) stays the same.
        self.fc = nn.Linear(self.cnn.num_features, out_sz)
        self.tab_net = TabularModel(emb_szs, n_cont, out_sz, layers, ps)
        # Head input = image-branch classifier width + tabular width. The previous
        # literal 1002 was only correct for a 1000-class backbone with out_sz == 2.
        self.head = nn.Linear(self.cnn.num_classes + out_sz, out_sz)

    def forward(self, x_img, x_cat, x_cont):
        """Return fused logits for an image batch and its tabular features."""
        if x_cat.dtype != torch.long:
            # Embedding lookups require integer (long) indices.
            x_cat = x_cat.long()
        img_out = self.cnn(x_img)
        tab_out = self.tab_net(x_cat, x_cont)
        return self.head(torch.cat([img_out, tab_out], dim=1))

# Directory on the mounted Drive where checkpoints are written; created if missing.
model_dir = Path("/content/drive/MyDrive/CS675-repo/models")
model_dir.mkdir(parents=True, exist_ok=True)

# Loss shared by both learners.
loss_func = CrossEntropyLossFlat()

# Build both fusion models with the same tabular configuration and move them to `device`.
model_1 = ImageTabularModel_1(emb_szs, n_cont, out_sz, layers=[512, 256, 128], ps=0.5).to(device)
model_2 = ImageTabularModel_2(emb_szs, n_cont, out_sz, layers=[512, 256, 128], ps=0.5).to(device)

# Wrap in DataParallel; from here on the underlying network is reachable as `.module`.
model_1 = torch.nn.DataParallel(model_1)
model_2 = torch.nn.DataParallel(model_2)

from fastai.callback.tracker import SaveModelCallback

from fastai.callback.core import Callback

class SaveEachEpochCallback(Callback):
    """Write a checkpoint named ``epoch_<n>_model1`` at the end of every epoch."""

    def after_epoch(self):
        """Save the learner under the current epoch number and report the file name."""
        filename = f"epoch_{self.epoch}_model1"
        self.learn.save(filename)
        print(f"✅ Saved model: {filename}.pth")


# Learner for the ResNet-50 fusion model. Two checkpoint callbacks are attached:
# SaveEachEpochCallback writes one file per epoch, SaveModelCallback monitors valid_loss.
learn_1 = Learner(
    dls,
    model_1,
    loss_func=loss_func,
    opt_func=partial(Adam, lr=0.001),
    metrics=accuracy,
    cbs=[
        SaveEachEpochCallback(),
        SaveModelCallback(monitor='valid_loss')
    ],
    wd=1e-3,
    # Same directory as `model_dir` defined earlier in this cell.
    model_dir=Path("/content/drive/MyDrive/CS675-repo/models")
)


# Learner for the EfficientFormerV2 fusion model; the valid_loss-monitored checkpoint
# is written under the name 'best_model_2'.
learn_2 = Learner(
    dls, model_2, loss_func=loss_func, opt_func=partial(Adam, lr=0.001),
    metrics=accuracy,
    cbs=SaveModelCallback(monitor='valid_loss', fname='best_model_2'),
    wd=1e-3,
    model_dir=model_dir
)




In [None]:
# NOTE(review): `custom_lr` is assigned here but never passed to `fine_tune` (or used
# anywhere else in the visible notebook), so it has no effect — confirm whether
# `base_lr=custom_lr` was intended.
custom_lr = 0.005
# Snapshot the starting weights under the name 'model', then train model 1.
learn_1.save('model')
learn_1.fine_tune(5)
#learn_2.fine_tune(5)

In [None]:
# Snapshot the starting weights of model 2 under the name 'model1', then train it.
learn_2.save('model1')
#learn_1.fine_tune(5)
learn_2.fine_tune(5)

In [34]:
# Feature columns of the training frame, in order, without the label.
# The test metadata is loaded, encoded with `process_data` and aligned to the training
# columns in the following cell, where `process_data` and `df_test` are defined. Doing
# that work here referenced both names before they existed (NameError on a fresh kernel).
train_columns = [c for c in df.columns if c != 'target']


In [35]:
from torchvision import transforms as T
from PIL import Image
from io import BytesIO
import numpy as np
import h5py
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

def process_data(df, cat_names):
    """Drop unused columns, add a per-patient image count, impute and one-hot encode.

    Steps:
      1. Drop every column of the module-level ``columns_to_drop`` that is present.
      2. Add ``numb_pic``: number of rows sharing the row's ``patient_id``.
      3. Fill missing ``age_approx`` / ``sex`` with the column's most frequent value;
         a column with no observed value at all is left untouched.
      4. One-hot encode the columns of ``cat_names`` that are present in the frame.

    Args:
        df: metadata frame; must contain ``patient_id``. The input is not modified.
        cat_names: names of the categorical columns to one-hot encode.

    Returns:
        ``(frame, new_cat_columns)`` where ``new_cat_columns`` lists the columns of the
        result whose name starts with ``<cat_name>_``.
    """
    df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])
    df['numb_pic'] = df.groupby('patient_id')['patient_id'].transform('count')

    for col in ('age_approx', 'sex'):
        if col in df.columns:
            modes = df[col].mode()
            # mode() is empty when the column holds only NaN; indexing it would raise.
            if not modes.empty:
                df[col] = df[col].fillna(modes.iloc[0])

    # Encode only the categorical columns that exist, so a frame that lacks one of them
    # (or has already been encoded) does not raise KeyError.
    present_cats = [name for name in cat_names if name in df.columns]
    if present_cats:
        df = pd.get_dummies(df, columns=present_cats, prefix=present_cats)
    new_cat_columns = [col for col in df.columns if any(col.startswith(name + '_') for name in cat_names)]
    return df, new_cat_columns

ROOT_DIR = "/content/drive/MyDrive/CS675-repo/isic_2024_data/"
# os.path.join avoids the doubled slash produced by f'{ROOT_DIR}/...' with a trailing '/'.
TEST_HDF = os.path.join(ROOT_DIR, 'test-image.hdf5')
TEST_CSV = os.path.join(ROOT_DIR, 'test-metadata.csv')

df_test = pd.read_csv(TEST_CSV)
df_test, _ = process_data(df_test, cat_names)

# Align the test frame to the training feature columns (label excluded), in training
# order. Columns the test frame lacks are created and filled with 0 instead of raising
# KeyError; previously only `new_cat_columns` were back-filled before the selection.
# NOTE(review): zero-filling makes the cell run but hides any train/test encoding
# mismatch for the affected columns — confirm both frames use the same preprocessing.
train_feature_cols = [c for c in df.columns if c != 'target']
df_test = df_test.reindex(columns=train_feature_cols, fill_value=0)

In [None]:
class CombinedDataset_test(Dataset):
    """Test-time dataset reading encoded images from an HDF5 file plus tabular features.

    Each item is ``(img, x_cat, x_cont, dummy_y)``; ``dummy_y`` is a constant 0 so the
    batch has the same structure as a training batch.

    The HDF5 file is opened lazily on first access rather than in ``__init__``, so every
    DataLoader worker process opens its own handle instead of sharing one created in the
    parent process.

    Args:
        df: test frame with an ``isic_id`` column and the feature columns.
        file_hdf: path of the HDF5 file holding one encoded image per ``isic_id`` key.
        cat_names: names of the integer-coded categorical columns.
        cont_names: names of the continuous columns.
        transforms: optional callable applied to the resized PIL image.
        target_size: ``(width, height)`` the image is resized to before ``transforms``.
    """

    def __init__(self, df, file_hdf, cat_names, cont_names, transforms=None, target_size=(224, 224)):
        self.df = df.reset_index(drop=True)
        self.file_hdf = file_hdf
        self.fp_hdf = None  # opened on first use, see _hdf()
        self.isic_ids = df['isic_id'].values
        self.cat_names = cat_names
        self.cont_names = cont_names
        self.transforms = transforms
        self.target_size = target_size

    def _hdf(self):
        """Return this process's read-only HDF5 handle, opening it on first use."""
        if self.fp_hdf is None:
            self.fp_hdf = h5py.File(self.file_hdf, mode="r")
        return self.fp_hdf

    def close(self):
        """Close the HDF5 handle if it is open; it reopens on the next access."""
        if self.fp_hdf is not None:
            self.fp_hdf.close()
            self.fp_hdf = None

    def __getstate__(self):
        # An open file handle must not travel to worker processes when the dataset
        # is pickled; each copy reopens the file itself.
        state = self.__dict__.copy()
        state['fp_hdf'] = None
        return state

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        isic_id = self.isic_ids[index]
        img = Image.open(BytesIO(self._hdf()[isic_id][()])).convert('RGB')
        img = img.resize(self.target_size)
        if self.transforms:
            img = self.transforms(img)

        row = self.df.iloc[index]
        x_cat = torch.tensor(row[self.cat_names].astype('int32').values, dtype=torch.long)
        x_cont = torch.tensor(row[self.cont_names].astype('float32').values, dtype=torch.float)
        dummy_y = torch.tensor(0, dtype=torch.long)
        return img, x_cat, x_cont, dummy_y

# Test-time image pipeline: resize, convert to tensor, normalise with the same mean/std
# constants used by ImageTabDataset.
data_transforms = T.Compose([
    T.Resize((224, 224)),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# `DataLoader` here is the torch.utils.data class imported in the previous cell.
test_dataset = CombinedDataset_test(df_test, TEST_HDF, new_cat_columns, cont_names, transforms=data_transforms)
test_loader = DataLoader(test_dataset, batch_size=64, num_workers=4, shuffle=False)

# Put both models in evaluation mode before predicting.
learn_1.model.eval()
learn_2.model.eval()

def get_model_predictions(learn, test_loader):
    """Return the predicted probability of class 1 for every item in ``test_loader``.

    ``Learner.get_preds`` applies an activation to the raw model output; when none is
    given it uses the loss function's own activation (softmax for
    ``CrossEntropyLossFlat``). Applying ``F.softmax`` to the returned values again
    therefore squashed every probability towards 0.5. The activation is now passed
    explicitly so softmax is applied exactly once, whatever loss the learner uses.

    Args:
        learn: fastai ``Learner`` whose model emits ``(batch, n_classes)`` logits.
        test_loader: data loader to predict on.

    Returns:
        1-D NumPy array with the class-1 probability per item.
    """
    probs, _ = learn.get_preds(dl=test_loader, act=lambda logits: F.softmax(logits, dim=1))
    return probs[:, 1].cpu().numpy()

# Class-1 probabilities from each model on the test loader.
probs_1 = get_model_predictions(learn_1, test_loader)
probs_2 = get_model_predictions(learn_2, test_loader)

# Ensemble: unweighted mean of the two models' probabilities.
avg_probs = (probs_1 + probs_2) / 2

# One row per test image; ids come from the dataset, in loader order (shuffle=False).
submission_df = pd.DataFrame({
    "isic_id": test_dataset.isic_ids,
    "target": avg_probs
})

# Write the submission file to the working directory and preview the first rows.
submission_df.to_csv("submission.csv", index=False)
submission_df.head()
