In [55]:
import os
import cv2
import pickle
import random as rnd
import numpy as np
import pandas as pd

In [56]:
# Image Preprocessing Function with default image size
def preprocess(image_path, image_size=224):
    """Load an image from disk and return it as a channels-first tensor.

    The file is read with OpenCV, resized to ``image_size`` x ``image_size``,
    converted from OpenCV's BGR channel order to RGB and divided by 255.

    Args:
        image_path: Path of the image file to read.
        image_size: Side length, in pixels, of the square output image.

    Returns:
        A ``torch.Tensor`` laid out as ``(channels, image_size, image_size)``.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``image_path`` (missing file,
            unsupported format or corrupt data).
    """
    image = cv2.imread(image_path)
    # cv2.imread reports failure by returning None rather than raising; without
    # this check the problem only shows up as an opaque error in cv2.resize.
    if image is None:
        raise FileNotFoundError(f'Could not read image: {image_path!r}')
    image = cv2.resize(image, (image_size, image_size))
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image = image / 255.0
    # (H, W, C) -> (C, H, W), the layout torchvision transforms expect for tensors.
    return torch.tensor(image).permute(2, 0, 1)

In [57]:
# Create DataFrame from Directory Structure
def create_dataframe(datadir):
    """List every image stored under ``datadir/<split>/<label>/<image>``.

    Directory listings are sorted so the record order (and therefore any seeded
    split performed on it later) is reproducible across machines; entries that
    are not directories at the split/label level, or not regular files at the
    image level, are skipped instead of aborting the walk.

    Args:
        datadir: Root directory containing one sub-directory per split.

    Returns:
        list[dict]: One record per image with the keys ``'is_train'`` (0 when
        the split directory is named ``'test'``, 1 otherwise), ``'label'`` (the
        label directory name with ``'healthy'`` replaced by ``'Healthy'``) and
        ``'image_path'`` (the joined path of the image).
    """
    records = []
    for split_name in sorted(os.listdir(datadir)):
        split_dir = os.path.join(datadir, split_name)
        if not os.path.isdir(split_dir):
            # Stray files (e.g. README, .DS_Store) are not splits.
            continue
        is_train = 0 if split_name == 'test' else 1
        for label_name in sorted(os.listdir(split_dir)):
            image_dir = os.path.join(split_dir, label_name)
            if not os.path.isdir(image_dir):
                continue
            # Directory names differ only in capitalisation between classes.
            label = label_name.replace('healthy', 'Healthy')
            for file_name in sorted(os.listdir(image_dir)):
                image_path = os.path.join(image_dir, file_name)
                if not os.path.isfile(image_path):
                    continue
                records.append({
                    'is_train': is_train,
                    'label': label,
                    'image_path': image_path,
                })
    return records


In [58]:
# Data Loading and Preparation
datadir = '/kaggle/input/lungs-disease-data/data'

# One row per image: is_train flag, class label and file path.
result_dataframe = pd.DataFrame.from_records(create_dataframe(datadir))

# Split on the is_train flag; the flag itself is dropped from both frames.
train_dataframe = (
    result_dataframe
    .loc[result_dataframe['is_train'].eq(1)]
    .drop(columns='is_train')
)
test_dataframe = (
    result_dataframe
    .loc[result_dataframe['is_train'].eq(0)]
    .drop(columns='is_train')
)

In [59]:
# Label Encoding
from sklearn.preprocessing import LabelEncoder

# Learn the class-name -> integer mapping from the training labels only, then
# apply that same mapping to both frames so the codes agree across splits.
# NOTE(review): the 'label' columns are overwritten in place, so re-running this
# cell re-fits the encoder on already-encoded integers.
le = LabelEncoder().fit(train_dataframe['label'].values)
for frame in (train_dataframe, test_dataframe):
    frame['label'] = le.transform(frame['label'].values)

In [60]:
# Per-class image counts in the training frame (labels are the integer codes from `le`).
train_dataframe['label'].value_counts()

label
1    111
0     70
2     70
Name: count, dtype: int64

In [61]:
# Per-class image counts in the test frame (labels are the integer codes from `le`).
test_dataframe['label'].value_counts()

label
1    26
0    20
2    20
Name: count, dtype: int64

In [62]:
# Sanity check: do the train and test frames contain the same label codes?
# NOTE(review): value_counts() orders its index by frequency, so this element-wise
# comparison also requires both frames to rank the classes identically (and to have
# the same number of classes); comparing the label sets would be order-independent.
test_dataframe['label'].value_counts().keys() == train_dataframe['label'].value_counts().keys()

array([ True,  True,  True])

In [63]:
# Data Splitting
from sklearn.model_selection import train_test_split

all_train_paths = train_dataframe['image_path'].values
all_train_labels = train_dataframe['label'].values

# Stratified 80/20 train/validation split with a fixed seed, so each class keeps
# its proportion in both parts.
train_images, validation_images, train_labels, validation_labels = train_test_split(
    all_train_paths,
    all_train_labels,
    test_size=0.2,
    stratify=all_train_labels,
    random_state=42,
    shuffle=True,
)

# The test frame is used as-is.
test_images = test_dataframe['image_path'].values
test_labels = test_dataframe['label'].values


In [64]:
# Shapes of the path/label arrays for the train, validation and test splits.
for split_paths, split_targets in (
    (train_images, train_labels),
    (validation_images, validation_labels),
    (test_images, test_labels),
):
    print(split_paths.shape, split_targets.shape)

(200,) (200,)
(51,) (51,)
(66,) (66,)


In [65]:
# Class codes and their counts for the train, validation and test splits.
for split_targets in (train_labels, validation_labels, test_labels):
    print(np.unique(split_targets, return_counts=True))

(array([0, 1, 2]), array([56, 88, 56]))
(array([0, 1, 2]), array([14, 23, 14]))
(array([0, 1, 2]), array([20, 26, 20]))


In [76]:
import torch
from torch import nn, optim
import torchvision
from torchvision import models, transforms, datasets
from torchvision.models import VGG16_Weights
from torch.utils.data import DataLoader
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report

In [77]:
# Path and label arrays for the test set.
# NOTE(review): this repeats the assignment already made in the data-splitting cell.
test_images, test_labels = test_dataframe['image_path'].values, test_dataframe['label'].values

In [78]:
# Define Transformations for Data Augmentation
# Per-channel statistics conventionally used with ImageNet-pretrained torchvision models.
NORMALIZE_MEAN = [0.485, 0.456, 0.406]
NORMALIZE_STD = [0.229, 0.224, 0.225]

augmentation_steps = [
    transforms.ToPILImage(),            # tensor -> PIL image for the random ops
    transforms.RandomResizedCrop(224),  # random crop, rescaled to 224x224
    transforms.RandomHorizontalFlip(),  # random left/right mirror
    transforms.ToTensor(),              # back to a float tensor
    transforms.Normalize(mean=NORMALIZE_MEAN, std=NORMALIZE_STD),
]
transform = transforms.Compose(augmentation_steps)

In [79]:
# Model Definition: pretrained VGG16 with a new classification head
# `weights=` replaces the deprecated `pretrained=True` flag; IMAGENET1K_V1 is the
# checkpoint that `pretrained=True` used to load.
vgg16_model = models.vgg16(weights=VGG16_Weights.IMAGENET1K_V1)

# Freeze the convolutional feature extractor so only the classifier is trained.
for param in vgg16_model.features.parameters():
    param.requires_grad = False

# Swap the last classifier layer for one with an output per label in train_labels.
num_features = vgg16_model.classifier[6].in_features
vgg16_model.classifier[6] = nn.Linear(num_features, len(np.unique(train_labels)))

In [80]:
# Loss Function and Optimizer
LEARNING_RATE = 0.01
MOMENTUM = 0.9
LR_STEP_SIZE = 30  # epochs between learning-rate decays
LR_GAMMA = 0.1     # multiplicative decay applied at each step

criterion = nn.CrossEntropyLoss()

# Only the classifier parameters are handed to the optimizer.
optimizer = optim.SGD(
    vgg16_model.classifier.parameters(),
    lr=LEARNING_RATE,
    momentum=MOMENTUM,
)

# NOTE(review): with a step size of 30 the rate is never reduced during the
# 25-epoch run launched at the end of the notebook — confirm this is intended.
scheduler = optim.lr_scheduler.StepLR(
    optimizer,
    step_size=LR_STEP_SIZE,
    gamma=LR_GAMMA,
)

In [81]:
def calculate_metrics(y_true, y_pred):
    """Compute macro-averaged F1, precision and recall.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        tuple: ``(f1, precision, recall)``, each computed with ``average='macro'``.
    """
    scorers = (f1_score, precision_score, recall_score)
    f1, precision, recall = (
        scorer(y_true, y_pred, average='macro') for scorer in scorers
    )
    return f1, precision, recall

In [84]:
# Training and Evaluation Functions
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    """Train ``model`` on the module-level ``train_images`` / ``train_labels``.

    Each epoch iterates over shuffled batches of 32 (path, label) pairs, loads
    every image with ``preprocess`` + ``transform``, takes one optimizer step per
    batch, steps the scheduler once, and prints macro F1/precision/recall
    computed over the predictions made during that epoch.

    Args:
        model: Network to train; batches are moved to the device of its parameters.
        criterion: Loss callable taking ``(outputs, labels)``.
        optimizer: Optimizer updating the trainable parameters of ``model``.
        scheduler: Learning-rate scheduler, stepped once per epoch.
        num_epochs: Number of passes over the training data.

    Returns:
        list[dict]: One entry per epoch with the keys ``'loss'`` (batch losses
        averaged with batch-size weights), ``'f1'``, ``'precision'`` and ``'recall'``.
    """
    # Follow the model's device so a model moved to GPU does not crash on CPU inputs.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    # Built once; with shuffle=True a new permutation is drawn on every iteration.
    loader = DataLoader(list(zip(train_images, train_labels)), batch_size=32, shuffle=True)

    history = []
    for epoch in range(num_epochs):
        model.train()
        true_labels, predictions = [], []
        loss_sum, sample_count = 0.0, 0
        for image_paths, labels in loader:
            inputs = torch.stack([transform(preprocess(path)) for path in image_paths]).to(device)
            # The default collate already yields a tensor; only dtype/device need fixing.
            labels = torch.as_tensor(labels, dtype=torch.long).to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            _, preds = torch.max(outputs, 1)
            loss_sum += loss.item() * labels.size(0)
            sample_count += labels.size(0)
            true_labels.extend(labels.tolist())
            predictions.extend(preds.tolist())

        scheduler.step()
        f1, precision, recall = calculate_metrics(true_labels, predictions)
        print(f'Epoch {epoch+1}/{num_epochs}: F1={f1:.2f}, Precision={precision:.2f}, Recall={recall:.2f}')
        history.append({
            'loss': loss_sum / max(sample_count, 1),
            'f1': f1,
            'precision': precision,
            'recall': recall,
        })
    return history

In [85]:
def evaluate_model(model, image_paths, labels, batch_size=32, eval_transform=None):
    """Evaluate ``model`` on a labelled set of images and print a report.

    Images are loaded with ``preprocess`` and pushed through the model in
    batches with gradient tracking disabled. Accuracy and the classification
    report are printed; all metrics are also returned.

    Args:
        model: Network to evaluate; batches are moved to the device of its parameters.
        image_paths: Sequence of image file paths.
        labels: Ground-truth labels aligned with ``image_paths``.
        batch_size: Number of images per forward pass.
        eval_transform: Callable applied to each preprocessed image. Defaults to
            a deterministic pipeline (PIL round-trip, tensor conversion and
            normalisation) without the random crop/flip used for training.

    Returns:
        dict: ``'accuracy'`` (percentage), ``'f1'``, ``'precision'``,
        ``'recall'``, ``'report'`` and ``'predictions'``.

    Raises:
        ValueError: If no images are given or the paths and labels differ in length.
    """
    if eval_transform is None:
        # Random augmentation at evaluation time makes the reported metrics
        # non-deterministic, so only the deterministic steps are kept here.
        # The mean/std must match the normalisation used for training.
        eval_transform = transforms.Compose([
            transforms.ToPILImage(),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])

    image_paths = list(image_paths)
    true_labels = np.asarray(labels).tolist()
    if not image_paths:
        raise ValueError('evaluate_model requires at least one image')
    if len(image_paths) != len(true_labels):
        raise ValueError('image_paths and labels must have the same length')

    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    predictions = []
    # no_grad avoids building an autograd graph; batching bounds peak memory.
    with torch.no_grad():
        for start in range(0, len(image_paths), batch_size):
            batch_paths = image_paths[start:start + batch_size]
            inputs = torch.stack([eval_transform(preprocess(path)) for path in batch_paths]).to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            predictions.extend(predicted.tolist())

    accuracy = float(100 * sum(np.array(true_labels) == np.array(predictions)) / len(true_labels))
    f1, precision, recall = calculate_metrics(true_labels, predictions)
    cr = classification_report(true_labels, predictions)
    print(f'Accuracy: {accuracy:.2f}%')
    print(f'Classification Report:\n{cr}')
    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'report': cr,
        'predictions': predictions,
    }

In [86]:
# Entry point: train the VGG16 classifier for 25 epochs, then report
# accuracy and the classification report on the test images.
if __name__ == '__main__':
    train_model(vgg16_model, criterion, optimizer, scheduler, num_epochs=25)
    evaluate_model(vgg16_model, test_images, test_labels)

Epoch 1/25: F1=0.53, Precision=0.53, Recall=0.53


KeyboardInterrupt: 