In [1]:
import os
from matplotlib import pyplot as plt
import pandas as pd; pd.options.mode.chained_assignment = None
import numpy as np
import scipy
import torch
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Dataset
import torchvision.models as models
import torch.nn as nn
import torch.optim as optim
from PIL import Image

In [2]:
#LOAD THE NEW TRAIN AND TEST

DATA_DIR = "/kaggle/input/amia-public-challenge-2024/"
TRAIN_DIR = os.path.join(DATA_DIR, "train/train")
TEST_DIR = os.path.join(DATA_DIR, "test/test")


# Capture all the relevant full train/test paths
TRAIN_DICOM_PATHS = [os.path.join(TRAIN_DIR, f_name) for f_name in os.listdir(TRAIN_DIR)]
TEST_DICOM_PATHS = [os.path.join(TEST_DIR, f_name) for f_name in os.listdir(TEST_DIR)]
print(f"\n... The number of training files is {len(TRAIN_DICOM_PATHS)} ...")
print(f"... The number of testing files is {len(TEST_DICOM_PATHS)} ...")

# Define paths to the relevant csv files
TRAIN_CSV = os.path.join(DATA_DIR, "train.csv")
# SS_CSV = os.path.join(DATA_DIR, "sample_submission.csv")

# Create the relevant dataframe objects
train_df = pd.read_csv(TRAIN_CSV)
# ss_df_org = pd.read_csv(SS_CSV)

print("\n\nTRAIN DATAFRAME\n\n")
display(train_df.head(3))


TEST_CSV = os.path.join(DATA_DIR, "test.csv")
# SS_CSV = os.path.join(DATA_DIR, "sample_submission.csv")
# Create the relevant dataframe objects
test_df = pd.read_csv(TEST_CSV)


IMG_SIZE_CSV = os.path.join(DATA_DIR, "img_size.csv")
img_size_df = pd.read_csv(IMG_SIZE_CSV)



... The number of training files is 8573 ...
... The number of testing files is 6427 ...


TRAIN DATAFRAME




Unnamed: 0,image_id,class_name,class_id,rad_id,x_min,y_min,x_max,y_max
0,bM8C97htulC9fHKIDurJHquCXr1KZuug,No finding,14,R5,,,,
1,0FDQVdLgDKI1sRnPL94LzVh9EvXDVM9m,Aortic enlargement,0,R10,1148.0,503.0,1466.0,823.0
2,Dwk2TnGJFaMhyi3OfCrhdZG9ppGglC5w,Consolidation,4,R8,264.0,732.0,550.0,1119.0


Custom dataset in pytorch

In [3]:
batch_size = 64 # HERE ADJUST IF NEEDED

In [4]:
 # HERE ADJUST IF NEEDED

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    transforms.Grayscale(num_output_channels=3)
])

# Define a custom dataset class
class CustomDataset(Dataset):
    def __init__(self, dataframe, root_dir, transform=None, train=True):
        self.dataframe = dataframe
        self.root_dir = root_dir
        self.transform = transform
        self.train = train

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        img_name = self.dataframe.iloc[idx, 0] + '.png'  # Assuming images are stored as png files
        img_path = os.path.join(self.root_dir, img_name)
        image = Image.open(img_path)
        #convert to color
        # Convert to RGB if the image is grayscale
        if image.mode != 'RGB':
            image = image.convert('RGB')
        if self.train:
#             if (self.dataframe.iloc[idx, 2]==14):
#                 class_id = 0 # normal
#             else:
#                 class_id = 1 # lesion
            class_id = self.dataframe.iloc[idx, 2]  # Class ID
        else:
            class_id = 0 # we don't know the label for the test.
        if self.transform:
                image = self.transform(image)

        return image, class_id


In [5]:
# Create custom datasets and dataloaders
#train_dataset = CustomDataset(dataframe=train_df, root_dir=TRAIN_DIR, train=True, transform=transform)
#train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = CustomDataset(dataframe=test_df, root_dir=TEST_DIR,train=False,transform=transform)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size,shuffle=False)

In [6]:
# Define model architecture
model = models.resnet18(pretrained=True)
num_ftrs = model.fc.in_features
# model.fc = nn.Linear(num_ftrs, 2)  # Binary classification
model.fc = nn.Linear(num_ftrs, 15)  # Binary classification

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
num_epochs = 5
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 144MB/s] 

cuda:0





In [18]:
'''

model.to(device)
model.train()

for epoch in range(num_epochs):
    running_loss = 0.0
    for i, (inputs, labels) in enumerate(train_dataloader, 0):  # Unpack data from dataloader
        inputs, labels = inputs.to(device), labels.to(device)  # Move data to device

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        if i % 10 == 9:    # Print every 10 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 10))
            running_loss = 0.0

print('Finished Training')
'''

"model.to(device)\nmodel.train()\n\nfor epoch in range(num_epochs):\n    running_loss = 0.0\n    for i, (inputs, labels) in enumerate(train_dataloader, 0):  # Unpack data from dataloader\n        inputs, labels = inputs.to(device), labels.to(device)  # Move data to device\n\n        optimizer.zero_grad()\n\n        outputs = model(inputs)\n        loss = criterion(outputs, labels)\n        loss.backward()\n        optimizer.step()\n\n        running_loss += loss.item()\n        if i % 10 == 9:    # Print every 10 mini-batches\n            print('[%d, %5d] loss: %.3f' %\n                  (epoch + 1, i + 1, running_loss / 10))\n            running_loss = 0.0\n\nprint('Finished Training')\n"

In [8]:
#torch.save(model.state_dict(), 'amia-resnet18_128batch_inference.pth')

In [9]:
#torch.save(model,'amia-resnet18-128b-model.pth')


In [9]:
# HERE PUT THE MODEL WE WANT TO PREDICT

model.load_state_dict(torch.load("/kaggle/input/resnet18-amia-b128-epoch5/pytorch/resnet18-v2-b64-epoch15/1/amia-resnet18_64batch_15epochs_inference.pth"))

<All keys matched successfully>

In [10]:
model.to(device)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [11]:
import torch.nn.functional as F
# Inference on the test dataset
model.eval()
test_predictions = []
test_probabilities = []
with torch.no_grad():
    for i, data in enumerate (test_dataloader):
        print("doing batch",i)
        # Unpack the data batch
        inputs, _ = data  # Assuming data is a tuple (images, labels)

        # Move inputs to the device
        inputs = inputs.to(device)

        # Move model to the same device as inputs
        model = model.to(device)

        # Perform inference
        outputs = model(inputs)

        # Get predictions
        _, predicted = torch.max(outputs, 1)

        # Get class probabilities
        probabilities,_ = torch.max(F.softmax(outputs, dim=1),1)

        # Convert tensors to CPU numpy arrays
        test_predictions.extend(predicted.cpu().numpy())
        test_probabilities.extend(probabilities.cpu().numpy())
#         print (test_probabilities)

# Assuming you want to add predictions to the test dataframe
test_df['predicted_class'] = test_predictions
test_df['probabilities'] = test_probabilities
print(test_df.head())


doing batch 0
doing batch 1
doing batch 2
doing batch 3
doing batch 4
doing batch 5
doing batch 6
doing batch 7
doing batch 8
doing batch 9
doing batch 10
doing batch 11
doing batch 12
doing batch 13
doing batch 14
doing batch 15
doing batch 16
doing batch 17
doing batch 18
doing batch 19
doing batch 20
doing batch 21
doing batch 22
doing batch 23
doing batch 24
doing batch 25
doing batch 26
doing batch 27
doing batch 28
doing batch 29
doing batch 30
doing batch 31
doing batch 32
doing batch 33
doing batch 34
doing batch 35
doing batch 36
doing batch 37
doing batch 38
doing batch 39
doing batch 40
doing batch 41
doing batch 42
doing batch 43
doing batch 44
doing batch 45
doing batch 46
doing batch 47
doing batch 48
doing batch 49
doing batch 50
doing batch 51
doing batch 52
doing batch 53
doing batch 54
doing batch 55
doing batch 56
doing batch 57
doing batch 58
doing batch 59
doing batch 60
doing batch 61
doing batch 62
doing batch 63
doing batch 64
doing batch 65
doing batch 66
doing

In [12]:
test_df['Prob']  = test_df['probabilities'].max()
    
test_df

Unnamed: 0,image_id,predicted_class,probabilities,Prob
0,3r9OdPSdvQ58qI3VUFUeSKyCvxBpFc0c,13,0.339730,1.0
1,LO2jAm8E96Ih87wJVoqiOXHixrwPMeOm,14,1.000000,1.0
2,PN7S4HbhNp4fht9TTc6DXGOKGkeRTR7W,14,1.000000,1.0
3,l7f2KDvrnrh26v4aYgi0Slj7lVBZMQIL,14,1.000000,1.0
4,if5Pqu95xLUtURzAo72YiSg8GNzJb1F3,14,1.000000,1.0
...,...,...,...,...
21984,k576EmhRJuLOIBHFyzH7LRcr2JbYFnHM,14,1.000000,1.0
21985,yFiQoOEOTP6yO3KMmiAQ5zkBjdww7icn,14,0.999976,1.0
21986,yg7B1t1DO9tMk2uJV0SkqA82y97SPZHa,14,1.000000,1.0
21987,1oO2FHrNonZqP9i854X6sio2hZj4R4h0,5,0.348019,1.0


In [13]:
test_df.to_csv('test_predictions.csv', index=False) #SAVE THIS RESULT! 

In [16]:
'''test_df = pd.read_csv('test_predictions.csv')
test_df.head()'''

# change 0 for 14
# change 1 for 0
test_df['predicted_class'] = test_df['predicted_class'].replace(0, 14)
test_df['predicted_class'] = test_df['predicted_class'].replace(1, 0)

def prepare_pred(row):
    prob = str(row['probabilities']) if not pd.isna(row['probabilities']) else '0'  # Convert NaN to '0'
    if row['predicted_class'] == 14: 
        return '14 ' + prob + ' 0 0 1 1'
    else:
        return str(row['predicted_class']) + ' ' + prob + ' 700 700 1000 1000'
    
# Apply the function to add text to specific rows
test_df['PredictionString'] = test_df.apply(prepare_pred, axis=1)

# # Create a new column by concatenating 'predicted_class' and 'Prob' as strings
# test_df['PredictionString'] = test_df['predicted_class'].astype(str) + ' ' + test_df['Prob'].astype(str) + ' 0 0 1 1'

# Create a new DataFrame with 'image_id' and 'new_column'
test_df_submit = test_df[['image_id', 'PredictionString']]

print(test_df_submit)

test_df_submit = test_df_submit.drop_duplicates()
# duplicate_rows = test_df_submit[test_df_submit.duplicated()]

# if duplicate_rows.empty:
#     print("No duplicate rows found.")
# else:
#     print("Duplicate rows found:")
#     print(duplicate_rows)


#### Part of Submit in the Challenge form ####


# Concatenar los resultados de filas con IDs repetidos
df_concatenado = test_df_submit.groupby('image_id')['PredictionString'].apply(' '.join).reset_index()

# Guardar el dataframe en un archivo CSV
df_concatenado.to_csv('submission.csv', index=False)


df_concatenado


                               image_id  \
0      3r9OdPSdvQ58qI3VUFUeSKyCvxBpFc0c   
1      LO2jAm8E96Ih87wJVoqiOXHixrwPMeOm   
2      PN7S4HbhNp4fht9TTc6DXGOKGkeRTR7W   
3      l7f2KDvrnrh26v4aYgi0Slj7lVBZMQIL   
4      if5Pqu95xLUtURzAo72YiSg8GNzJb1F3   
...                                 ...   
21984  k576EmhRJuLOIBHFyzH7LRcr2JbYFnHM   
21985  yFiQoOEOTP6yO3KMmiAQ5zkBjdww7icn   
21986  yg7B1t1DO9tMk2uJV0SkqA82y97SPZHa   
21987  1oO2FHrNonZqP9i854X6sio2hZj4R4h0   
21988  8Q8fPobVc11InzzHAKDfjH2emkfnEdnC   

                              PredictionString  
0      13 0.3397303819656372 700 700 1000 1000  
1                               14 1.0 0 0 1 1  
2                               14 1.0 0 0 1 1  
3                               14 1.0 0 0 1 1  
4                               14 1.0 0 0 1 1  
...                                        ...  
21984                           14 1.0 0 0 1 1  
21985            14 0.9999759197235107 0 0 1 1  
21986                           14 1.0 0 0

Unnamed: 0,image_id,PredictionString
0,00X4Pb5TcOhWWwrDwn9UoRDJhwYRuusp,14 1.0 0 0 1 1
1,00eCz0yTwisqK7dgZKrdhLh4cMP9FewR,14 1.0 0 0 1 1
2,00wsXaGGLhOo977BBHmhbKVNu02fWdPl,14 1.0 0 0 1 1
3,02IEFam0BlSztSMY3YeA9svnDJOxTKDg,13 0.306283563375473 700 700 1000 1000
4,02fQeJYiEhOeebwkwE8wsD0FPyz8EWHD,14 1.0 0 0 1 1
...,...,...
6422,zxCwOtAINzbYU681ZHjc8GZvtOz9ErEr,14 1.0 0 0 1 1
6423,zxq5d7Jh3j2DTwdFqXMmH1OLUFRweQBE,14 0.44158539175987244 0 0 1 1
6424,zxxt4VNvrRQHUL58LBI4zDb11JZZ5NKz,14 1.0 0 0 1 1
6425,zyD6VqKYEQArknozKmmitQJEjhWqGxZI,14 1.0 0 0 1 1


In [20]:
# Inference on the test dataset
test_predictions = []
test_probabilities = []
all_labels = []

with torch.no_grad():
    for i, data in enumerate(test_dataloader):
        print("doing batch", i)
        # Unpack the data batch
        inputs, labels = data  # Assuming data is a tuple (images, labels)

        # Move inputs to the device
        inputs, labels = inputs.to(device), labels.to(device)

        # Perform inference
        outputs = model(inputs)

        # Get predictions
        _, predicted = torch.max(outputs, 1)

        # Get class probabilities
        probabilities = F.softmax(outputs, dim=1).max(dim=1)[0]

        # Convert tensors to CPU numpy arrays
        test_predictions.extend(predicted.cpu().numpy())
        test_probabilities.extend(probabilities.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

doing batch 0
doing batch 1
doing batch 2
doing batch 3
doing batch 4
doing batch 5
doing batch 6
doing batch 7
doing batch 8
doing batch 9
doing batch 10
doing batch 11
doing batch 12
doing batch 13
doing batch 14
doing batch 15
doing batch 16
doing batch 17
doing batch 18
doing batch 19
doing batch 20
doing batch 21
doing batch 22
doing batch 23
doing batch 24
doing batch 25
doing batch 26
doing batch 27
doing batch 28
doing batch 29
doing batch 30
doing batch 31
doing batch 32
doing batch 33
doing batch 34
doing batch 35
doing batch 36
doing batch 37
doing batch 38
doing batch 39
doing batch 40
doing batch 41
doing batch 42
doing batch 43
doing batch 44
doing batch 45
doing batch 46
doing batch 47
doing batch 48
doing batch 49
doing batch 50
doing batch 51
doing batch 52
doing batch 53
doing batch 54
doing batch 55
doing batch 56
doing batch 57
doing batch 58
doing batch 59
doing batch 60
doing batch 61
doing batch 62
doing batch 63
doing batch 64
doing batch 65
doing batch 66
doing

FileNotFoundError: [Errno 2] No such file or directory: 'test.csv'

In [23]:
from sklearn.metrics import precision_recall_fscore_support


In [24]:
# Assuming you want to add predictions to the test dataframe
test_df = pd.read_csv('/kaggle/input/amia-public-challenge-2024/test.csv')
test_df['predicted_class'] = test_predictions
test_df['probabilities'] = test_probabilities
print(test_df.head())

# Calculate the F1, Precision, and Recall scores for each class
precision, recall, f1, _ = precision_recall_fscore_support(all_labels, test_predictions, average=None)

for i in range(len(precision)):
    print(f'Class {i}:')
    print(f'Precision: {precision[i]}')
    print(f'Recall: {recall[i]}')
    print(f'F1 Score: {f1[i]}')

                           image_id  predicted_class  probabilities
0  3r9OdPSdvQ58qI3VUFUeSKyCvxBpFc0c               13        0.33973
1  LO2jAm8E96Ih87wJVoqiOXHixrwPMeOm               14        1.00000
2  PN7S4HbhNp4fht9TTc6DXGOKGkeRTR7W               14        1.00000
3  l7f2KDvrnrh26v4aYgi0Slj7lVBZMQIL               14        1.00000
4  if5Pqu95xLUtURzAo72YiSg8GNzJb1F3               14        1.00000
Class 0:
Precision: 1.0
Recall: 0.13906953476738368
F1 Score: 0.24418093983311373
Class 1:
Precision: 0.0
Recall: 0.0
F1 Score: 0.0
Class 2:
Precision: 0.0
Recall: 0.0
F1 Score: 0.0
Class 3:
Precision: 0.0
Recall: 0.0
F1 Score: 0.0
Class 4:
Precision: 0.0
Recall: 0.0
F1 Score: 0.0
Class 5:
Precision: 0.0
Recall: 0.0
F1 Score: 0.0
Class 6:
Precision: 0.0
Recall: 0.0
F1 Score: 0.0
Class 7:
Precision: 0.0
Recall: 0.0
F1 Score: 0.0
Class 8:
Precision: 0.0
Recall: 0.0
F1 Score: 0.0
Class 9:
Precision: 0.0
Recall: 0.0
F1 Score: 0.0
Class 10:
Precision: 0.0
Recall: 0.0
F1 Score: 0.0
Class 11:

  _warn_prf(average, modifier, msg_start, len(result))
