In [1]:
import os
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm
import re


import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torchvision.models import resnet50, convnext_base, convnext_tiny, convnext_small, convnext_large
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from PIL import Image
from sklearn.model_selection import train_test_split
from torchvision.transforms.functional import to_tensor



import warnings
warnings.filterwarnings("ignore")

# Seed every RNG the notebook relies on. Seeding NumPy alone is not enough:
# DataLoader shuffling and the torchvision random transforms draw from torch's
# global generator, so it is seeded here too.
SEED = 1234
np.random.seed(SEED)
torch.manual_seed(SEED)

# Prefer the GPU when one is visible; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [2]:
# Load the training labels. The displayed frame has a `file_id` column and a
# `cell_line` column, plus an `Unnamed: 0` column that pandas creates for the
# unnamed first CSV column.
data = pd.read_csv('y_train.csv')
data

Unnamed: 0,file_id,cell_line
0,1,MCF7
1,2,RT4
2,3,U-2 OS
3,4,RT4
4,5,A549
...,...,...
9627,9628,PC-3
9628,9629,HEK 293
9629,9630,RT4
9630,9631,PC-3


In [4]:
# Number of samples per cell line, most frequent first. The counts below are
# imbalanced (2100 for the largest class vs. 608 for the smallest).
label_counts = data.cell_line.value_counts()
label_counts

RT4         2100
CACO-2      1626
HEK 293     1378
MCF7        1082
U-2 OS       775
U-251 MG     768
PC-3         663
HeLa         632
A549         608
Name: cell_line, dtype: int64

In [5]:
def combine_images(data, img_dir, output_dir):
    """Merge each sample's three single-channel PNGs into one RGB PNG.

    For every row of ``data`` the ``file_id`` is zero-padded to five
    characters and used to locate ``<id>_blue.png``, ``<id>_red.png`` and
    ``<id>_yellow.png`` in ``img_dir``. Each image is converted to 8-bit
    greyscale and the three are stacked as RGB planes in the order
    red -> R, blue -> G, yellow -> B. The result is saved to ``output_dir``
    as ``<id>_combined.png``.

    Args:
        data: DataFrame with a ``file_id`` column.
        img_dir: directory containing the per-channel source images.
        output_dir: destination directory; created if it does not exist.
    """
    os.makedirs(output_dir, exist_ok=True)

    for _, record in data.iterrows():
        # Shared zero-padded prefix of every file belonging to this sample.
        stem = str(record['file_id']).zfill(5)

        blue_plane = Image.open(os.path.join(img_dir, f"{stem}_blue.png")).convert('L')
        red_plane = Image.open(os.path.join(img_dir, f"{stem}_red.png")).convert('L')
        yellow_plane = Image.open(os.path.join(img_dir, f"{stem}_yellow.png")).convert('L')

        merged = Image.merge("RGB", (red_plane, blue_plane, yellow_plane))
        merged.save(os.path.join(output_dir, f"{stem}_combined.png"))

In [56]:
#combine_images(data, img_dir="images_train/images_train/", output_dir="images_combined/")

In [23]:
#train_data, val_data = train_test_split(
#    data, test_size=1/3, random_state=42, stratify=data['cell_line'])


In [24]:
#train_data.to_csv("train_data.csv", index=False)
#val_data.to_csv("val_data.csv", index=False)


In [3]:
class CellLineDataset(Dataset):
    """Dataset of combined RGB cell images labelled by cell line.

    Each item is ``(image, class_index)``. The image is read from
    ``<img_dir>/<file_id zero-padded to 5>_combined.png`` and passed through
    ``transform`` when one is given.

    Args:
        img_dir: directory containing the ``*_combined.png`` images.
        labels_file: CSV with ``file_id`` and ``cell_line`` columns. When
            omitted the dataset has no indexable rows: ``len()`` is 0 and any
            index raises ``IndexError``.
        transform: optional callable applied to the opened PIL image.
    """

    def __init__(self, img_dir, labels_file=None, transform=None):
        self.img_dir = img_dir
        self.transform = transform
        # Always define these attributes so that __len__ and callers reading
        # class_to_idx never hit an AttributeError in unlabeled mode.
        self.labels_df = None
        self.class_to_idx = {}
        self.has_labels = bool(labels_file)
        if self.has_labels:
            self.labels_df = pd.read_csv(labels_file)
            # Class indices follow first-appearance order in the CSV, so two
            # CSVs with different row order yield different mappings.
            self.class_to_idx = {class_name: i for i, class_name in enumerate(
                self.labels_df["cell_line"].unique())}

    def __len__(self):
        """Return the number of labelled rows (0 when no labels were given)."""
        if not self.has_labels:
            return 0
        return len(self.labels_df)

    def __getitem__(self, idx):
        """Return ``(image, class_index)`` for row ``idx``.

        Raises:
            IndexError: if the dataset was built without a labels file.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        if not self.has_labels:
            raise IndexError(f"No matching row found for index {idx}")

        row = self.labels_df.iloc[idx]
        sample_id = row['file_id']
        img_path = os.path.join(self.img_dir, f"{str(sample_id).zfill(5)}_combined.png")
        img = Image.open(img_path)

        if self.transform:
            img = self.transform(img)

        label = self.class_to_idx[row['cell_line']]
        return img, label


In [4]:
def calculate_mean_std(loader):
    """Estimate per-channel mean and std over every image the loader yields.

    Each batch is reshaped to ``(size(0), size(1), -1)``; the statistics are
    taken over the last axis for every image, summed over the whole loader,
    and divided by the total number of images. The returned std is therefore
    the average of per-image stds, not the std of all pixels pooled together.

    Args:
        loader: iterable of ``(images, _)`` batches. Assumes ``images`` is
            laid out as (batch, channels, ...) -- TODO confirm for other callers.

    Returns:
        ``(mean, std)``, each with one entry per index along dimension 1.
    """
    mean_total = 0.
    std_total = 0.
    image_count = 0.
    for images, _ in tqdm(loader):
        n_images = images.size(0)
        # Collapse all remaining dimensions into one axis per channel.
        flat = images.view(n_images, images.size(1), -1)
        mean_total += flat.mean(2).sum(0)
        std_total += flat.std(2).sum(0)
        image_count += n_images

    mean_total /= image_count
    std_total /= image_count
    return mean_total, std_total

In [6]:
# Training hyperparameters shared by the cells below.
# epochs: number of passes over the training set in the training loop.
epochs = 100
# batch_size: batch size passed to every DataLoader in this notebook.
batch_size =64
# lr: initial learning rate given to the optimizer.
lr = 0.001

In [7]:
# Pass over the un-augmented training images once (ToTensor only) to obtain
# the per-channel mean/std later fed to transforms.Normalize.
transform_to_tensor = transforms.Compose([transforms.ToTensor()])

raw_train_data = CellLineDataset(
    img_dir="images_combined/", labels_file="y_train.csv", transform=transform_to_tensor)

# NOTE(review): shuffle=True does not affect the statistics, which are sums
# over all batches.
raw_train_loader = DataLoader(raw_train_data, batch_size=batch_size, shuffle=True)

mean, std = calculate_mean_std(raw_train_loader)

  0%|          | 0/151 [00:00<?, ?it/s]

100%|██████████| 151/151 [00:08<00:00, 17.41it/s]


In [8]:
# transform = transforms.Compose([
#     transforms.RandomRotation(30),
#     transforms.RandomResizedCrop(64),
#     transforms.RandomHorizontalFlip(),
#     transforms.ToTensor(),
#     transforms.Normalize(mean=mean, std=std)
# ])

# Training-time augmentation pipeline. The geometric and colour augmentations
# run on the PIL image; ToTensor + Normalize then standardise it with the
# dataset statistics computed above, and RandomErasing (10% probability)
# zeroes a random patch of the normalised tensor. Output crops are 112 pixels.
transform = transforms.Compose([
    transforms.RandomRotation(30),
    transforms.RandomResizedCrop(112),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    transforms.RandomHorizontalFlip(),
    transforms.RandomAffine(degrees=20, translate=(0.1,0.1), scale=(0.8, 1.2)),
    transforms.ToTensor(),
    transforms.Normalize(mean=mean, std=std),
    transforms.RandomErasing(p=0.1, scale=(0.02, 0.33), ratio=(0.3, 3.3), value=0, inplace=False)
])

# NOTE(review): these rebind raw_train_data / raw_train_loader to the
# augmenting pipeline, but none of the later visible cells read them --
# training uses train_dataset / train_loader instead.
raw_train_data = CellLineDataset(
    img_dir="images_combined/", labels_file="y_train.csv", transform=transform)

raw_train_loader = DataLoader(raw_train_data, batch_size=batch_size, shuffle=True)



In [48]:
# transform = transforms.Compose([
#     transforms.RandomRotation(30),
#     transforms.RandomResizedCrop(224),
#     transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
#     transforms.RandomHorizontalFlip(),
#     transforms.RandomAffine(degrees=20, translate=(0.1,0.1), scale=(0.8, 1.2)),
#     transforms.Normalize(mean=mean, std=std),
#     transforms.RandomErasing(p=0.1, scale=(0.02, 0.33), ratio=(0.3, 3.3), value=0, inplace=False)
# ])

In [13]:
# train_dataset = CellLineDataset(
#     img_dir="images_combined/", labels_file="train_data.csv", transform=transform)
# val_dataset = CellLineDataset(
#     img_dir="images_combined/", labels_file='val_data.csv', transform=transform)


In [9]:
# Training set: every row of y_train.csv, with the augmenting `transform`.
# No validation split is held out here (the split cells above are commented out).
train_dataset = CellLineDataset(
    img_dir="images_combined/",
    labels_file="y_train.csv",
    transform=transform
)

In [14]:
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# test_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


In [10]:
# Batches of `batch_size` samples, reshuffled every epoch.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True
)

In [11]:
#model = convnext_tiny(weights=torchvision.models.ConvNeXt_Tiny_Weights.IMAGENET1K_V1, progress=True)
#model.classifier[2] = torch.nn.Linear(in_features=1024, out_features=9)
#model = model.to(device)  
# Start from the ImageNet-pretrained ConvNeXt-Base backbone.
model = convnext_base(weights=torchvision.models.ConvNeXt_Base_Weights.IMAGENET1K_V1, progress=True)

# Replace the 1000-way ImageNet head with a fresh 9-way classifier (one output
# per cell line). Assigning to `out_features` only edits an attribute -- the
# layer's weight and bias tensors keep their shape -- so the layer itself has
# to be swapped out. `in_features` is read from the existing layer rather than
# hard-coded so the line stays correct for other ConvNeXt sizes.
num_classes = 9
model.classifier[2] = nn.Linear(model.classifier[2].in_features, num_classes)
model = model.to(device)


In [12]:
# Unweighted cross-entropy over the class logits.
criterion = nn.CrossEntropyLoss()
# Adam over all model parameters with L2 weight decay of 1e-3.
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-3)
#optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
# Multiply the learning rate by 0.1 every 7 scheduler steps.
# NOTE(review): starting from lr=0.001 this gives 1e-4 after 7 epochs, 1e-5
# after 14 and 1e-6 after 21, so most of the 100 configured epochs run at a
# negligible learning rate -- consistent with the plateau in the training log.
exp_lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)


In [13]:
# Fine-tune the model for `epochs` passes over the training set, printing the
# sample-weighted mean loss after each pass and then advancing the scheduler.
for epoch in range(1, epochs + 1):
    model.train()
    loss_sum = 0.0

    for batch_images, batch_labels in train_loader:
        batch_images = batch_images.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()
        batch_loss = criterion(model(batch_images), batch_labels)
        batch_loss.backward()
        optimizer.step()

        # Weight by batch size so a smaller final batch does not skew the mean.
        loss_sum += batch_loss.item() * batch_images.size(0)

    epoch_loss = loss_sum / len(train_loader.dataset)
    print('Epoch {}/{} - Training Loss: {:.4f}'.format(epoch, epochs, epoch_loss))

    exp_lr_scheduler.step()


Epoch 1/100 - Training Loss: 2.2731
Epoch 2/100 - Training Loss: 2.0002
Epoch 3/100 - Training Loss: 1.8523
Epoch 4/100 - Training Loss: 1.7296
Epoch 5/100 - Training Loss: 1.5989
Epoch 6/100 - Training Loss: 1.5285
Epoch 7/100 - Training Loss: 1.4223
Epoch 8/100 - Training Loss: 1.1874
Epoch 9/100 - Training Loss: 1.1181
Epoch 10/100 - Training Loss: 1.0924
Epoch 11/100 - Training Loss: 1.0779
Epoch 12/100 - Training Loss: 1.0493
Epoch 13/100 - Training Loss: 1.0284
Epoch 14/100 - Training Loss: 1.0116
Epoch 15/100 - Training Loss: 0.9695
Epoch 16/100 - Training Loss: 0.9604
Epoch 17/100 - Training Loss: 0.9592
Epoch 18/100 - Training Loss: 0.9699
Epoch 19/100 - Training Loss: 0.9490
Epoch 20/100 - Training Loss: 0.9563
Epoch 21/100 - Training Loss: 0.9698
Epoch 22/100 - Training Loss: 0.9574
Epoch 23/100 - Training Loss: 0.9509
Epoch 24/100 - Training Loss: 0.9495
Epoch 25/100 - Training Loss: 0.9372
Epoch 26/100 - Training Loss: 0.9473
Epoch 27/100 - Training Loss: 0.9484
Epoch 28/1

: 

: 

In [13]:
# Persist only the model's parameters (state_dict), not the full module.
torch.save(model.state_dict(), 'ConvNext_2.pth')

In [12]:
# Rebuild a ConvNeXt-Small and overwrite its weights with a saved checkpoint.
# NOTE(review): this loads 'ConvNext_1.pth', not the 'ConvNext_2.pth' written
# by the save cell in this notebook -- confirm which checkpoint is intended.
model = convnext_small(weights=torchvision.models.ConvNeXt_Small_Weights.IMAGENET1K_V1, progress=True)
# NOTE(review): assigning `out_features` only changes an attribute; the layer's
# weight/bias tensors keep their original shape, so the head still emits the
# original number of logits. Left unchanged here because load_state_dict needs
# the head shape to match the checkpoint -- TODO confirm the checkpoint's head
# shape and replace the layer with nn.Linear(..., 9) if it has 9 outputs.
model.classifier[2].out_features = 9
model.load_state_dict(torch.load('ConvNext_1.pth'))
model = model.to(device)

In [20]:
def combine_images(img_dir, output_dir):
    """Merge every sample found in ``img_dir`` into one RGB PNG per sample.

    Sample ids are discovered from the file names: any entry starting with
    ``<digits>_`` contributes its numeric prefix. Entries without such a
    prefix (hidden files, sub-directories, stray notes) are ignored. For each
    id, ``<id>_blue.png``, ``<id>_red.png`` and ``<id>_yellow.png`` are
    converted to 8-bit greyscale and stacked as RGB planes in the order
    red -> R, blue -> G, yellow -> B, then saved to ``output_dir`` as
    ``<id>_combined.png``. Ids are zero-padded to five characters.

    Args:
        img_dir: directory containing the per-channel source images.
        output_dir: destination directory; created if it does not exist.
    """
    os.makedirs(output_dir, exist_ok=True)

    # re.match returns None for names without a numeric prefix; skip those
    # instead of crashing on `.group`.
    file_ids = set()
    for filename in os.listdir(img_dir):
        prefix = re.match(r'(\d+)_', filename)
        if prefix:
            file_ids.add(prefix.group(1))

    # Sorted only so that processing order is reproducible between runs.
    for file_id in sorted(file_ids):
        stem = str(file_id).zfill(5)

        blue_img = Image.open(os.path.join(img_dir, f"{stem}_blue.png")).convert('L')
        red_img = Image.open(os.path.join(img_dir, f"{stem}_red.png")).convert('L')
        yellow_img = Image.open(os.path.join(img_dir, f"{stem}_yellow.png")).convert('L')

        combined_img = Image.merge("RGB", (red_img, blue_img, yellow_img))
        combined_img.save(os.path.join(output_dir, f"{stem}_combined.png"))

In [21]:
#combine_images(img_dir="images_test/images_test/", output_dir="images_test/")

In [23]:
class CellLineDataset(Dataset):
    """Combined-image dataset usable with or without a labels file.

    Labeled mode (``labels_file`` given): one item per CSV row, returned as
    ``(image, class_index)``. The image is
    ``<img_dir>/<file_id zero-padded to 5>_combined.png``, matching the names
    written by ``combine_images``.

    Unlabeled mode: one item per ``*_combined.png`` file in ``img_dir``
    (sorted by name), returned as ``(image, file_id)`` where ``file_id`` is
    the numeric file-name prefix as a string without leading zeros.

    Args:
        img_dir: directory containing the ``*_combined.png`` images.
        labels_file: optional CSV with ``file_id`` and ``cell_line`` columns.
        transform: optional callable applied to the opened PIL image.
    """

    def __init__(self, img_dir, labels_file=None, transform=None):
        self.img_dir = img_dir
        self.transform = transform
        self.labels_df = pd.read_csv(labels_file) if labels_file else None
        self.has_labels = self.labels_df is not None
        if self.has_labels:
            # Class indices follow first-appearance order in the CSV.
            self.class_to_idx = {class_name: i for i, class_name in enumerate(
                self.labels_df["cell_line"].unique())}
        # Filled on first use by _combined_files(); only used in unlabeled mode.
        self._file_list = None

    def _combined_files(self):
        """Return the sorted ``*_combined.png`` names in ``img_dir``.

        The listing is cached after the first call so the directory is not
        re-read and re-sorted for every item; files added afterwards are not
        picked up by an existing instance.
        """
        if self._file_list is None:
            self._file_list = sorted(
                f for f in os.listdir(self.img_dir) if f.endswith("_combined.png"))
        return self._file_list

    def _image_stem(self, idx):
        """Return the file-name prefix (before ``_combined.png``) for ``idx``."""
        if self.has_labels:
            # The CSV stores plain integers while files on disk are
            # zero-padded to five characters.
            return str(self.labels_df.iloc[idx]['file_id']).zfill(5)
        file_name = self._combined_files()[idx]
        return os.path.splitext(file_name)[0].replace('_combined', '')

    def __len__(self):
        """Return the number of CSV rows, or of combined images if unlabeled."""
        if self.has_labels:
            return len(self.labels_df)
        return len(self._combined_files())

    def __getitem__(self, idx):
        """Return ``(image, class_index)`` or, when unlabeled, ``(image, file_id)``."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        stem = self._image_stem(idx)
        img = Image.open(os.path.join(self.img_dir, f"{stem}_combined.png"))

        if self.transform:
            img = self.transform(img)

        if self.has_labels:
            # Convert label to integer
            label = self.class_to_idx[self.labels_df.iloc[idx]['cell_line']]
            return img, label

        # Strip the zero padding so the id matches the submission format.
        return img, str(int(stem))






In [24]:
# Reuse the training label mapping so predicted indices decode to the same
# class names the model was trained against.
class_to_idx = train_dataset.class_to_idx
idx_to_class = {idx: class_name for class_name, idx in class_to_idx.items()}

# Inference must be deterministic, so the random training augmentations
# (rotation, random crop, jitter, flip, affine, erasing) are not applied.
# Images are resized to the 112x112 training crop size and normalised with
# the same dataset statistics.
eval_transform = transforms.Compose([
    transforms.Resize((112, 112)),
    transforms.ToTensor(),
    transforms.Normalize(mean=mean, std=std),
])

test_dataset = CellLineDataset(
    img_dir="images_test/", transform=eval_transform)
print(f"Test dataset length: {len(test_dataset)}")

# shuffle=False keeps predictions aligned with the sorted file order.
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

Test dataset length: 6869


In [25]:
# Run the model over the test set and collect one predicted class name per file.
model.eval()
predictions = []
file_ids = []
n_known_classes = len(idx_to_class)
with torch.no_grad():
    for inputs, file_id in tqdm(test_loader, desc='Predicting'):
        inputs = inputs.to(device)
        outputs = model(inputs)
        # Take the argmax over the logits that map to a known class only. If
        # the classifier head emits more outputs than there are classes, an
        # unrestricted argmax could return an index missing from idx_to_class
        # and raise KeyError. When the head has exactly n_known_classes
        # outputs the slice is a no-op.
        preds = outputs[:, :n_known_classes].argmax(dim=1)
        predictions.extend(idx_to_class[pred] for pred in preds.tolist())
        file_ids.extend(file_id)

# Save predictions to a CSV file
df_predictions = pd.DataFrame({'file_id': file_ids, 'cell_line': predictions})
df_predictions.to_csv('predictions_convnext_1.csv', index=False)




Predicting:   0%|          | 0/108 [00:00<?, ?it/s]

Predicting: 100%|██████████| 108/108 [00:56<00:00,  1.90it/s]
