In [1]:
import pandas as pd
import os
from PIL import Image

In [60]:
def create_image_df(folder_path, is_positive):
    """Build a DataFrame describing every image file directly inside a folder.

    Parameters
    ----------
    folder_path : str
        Directory to scan (not recursive). Only entries whose name ends in
        ``.jpg``, ``.jpeg``, ``.png`` or ``.gif`` (any letter case) are used.
    is_positive : bool
        ``False`` labels every image ``'normal'``. ``True`` derives the label
        from the file name: ``'bacteria'`` if that substring is present,
        otherwise ``'virus'`` if that substring is present, otherwise the
        generic ``'pneumonia'``.

    Returns
    -------
    pandas.DataFrame
        One row per image with the columns ``File_Name``, ``Image`` (the PIL
        image object, pixel data already loaded) and ``y_val`` (the label).
        The columns are present even when the folder contains no images.

    Raises
    ------
    OSError
        If ``folder_path`` cannot be listed or an image file cannot be read.

    Notes
    -----
    Every image is fully decoded and kept in memory, so large folders can use
    a lot of RAM.
    """
    image_extensions = ('.jpg', '.jpeg', '.png', '.gif')
    columns = ['File_Name', 'Image', 'y_val']

    # List to store one record (file name, image, label) per image
    file_data = []

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order reproducible across machines and runs.
    for file in sorted(os.listdir(folder_path)):
        # Skip anything that is not an image file (case-insensitive match)
        if not file.lower().endswith(image_extensions):
            continue

        if not is_positive:
            y_val = 'normal'
        elif 'bacteria' in file:
            y_val = 'bacteria'
        elif 'virus' in file:
            y_val = 'virus'
        else:
            y_val = 'pneumonia'

        image_path = os.path.join(folder_path, file)
        with Image.open(image_path) as image:
            # Image.open is lazy: it only reads the header. The pixel data
            # must be read before the `with` block closes the underlying
            # file, otherwise the stored image can never be decoded later.
            image.load()
            file_data.append({'File_Name': file, 'Image': image, 'y_val': y_val})

    # Explicit columns so that an empty folder still yields the expected schema
    return pd.DataFrame(file_data, columns=columns)

In [61]:
# Build one frame per (split, class) folder; is_positive selects the labelling rule.
# -- test split
normal_test_df = create_image_df(folder_path='./test/NORMAL', is_positive=False)
pneumonia_test_df = create_image_df(folder_path='./test/PNEUMONIA', is_positive=True)
# -- train split
normal_train_df = create_image_df(folder_path='./train/NORMAL', is_positive=False)
pneumonia_train_df = create_image_df(folder_path='./train/PNEUMONIA', is_positive=True)
# -- val split
normal_val_df = create_image_df(folder_path='./val/NORMAL', is_positive=False)
pneumonia_val_df = create_image_df(folder_path='./val/PNEUMONIA', is_positive=True)

# Preview the normal training frame
normal_train_df

Unnamed: 0,File_Name,Image,y_val
0,IM-0115-0001.jpeg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,normal
1,IM-0117-0001.jpeg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,normal
2,IM-0119-0001.jpeg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,normal
3,IM-0122-0001.jpeg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,normal
4,IM-0125-0001.jpeg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,normal
...,...,...,...
1336,NORMAL2-IM-1406-0001.jpeg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,normal
1337,NORMAL2-IM-1412-0001.jpeg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,normal
1338,NORMAL2-IM-1419-0001.jpeg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,normal
1339,NORMAL2-IM-1422-0001.jpeg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,normal


In [64]:
# Preview the pneumonia training frame (File_Name, Image, y_val).
pneumonia_train_df

Unnamed: 0,File_Name,Image,y_val
0,person1000_bacteria_2931.jpeg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,bacteria
1,person1000_virus_1681.jpeg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,virus
2,person1001_bacteria_2932.jpeg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,bacteria
3,person1002_bacteria_2933.jpeg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,bacteria
4,person1003_bacteria_2934.jpeg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,bacteria
...,...,...,...
3870,person99_virus_183.jpeg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,virus
3871,person9_bacteria_38.jpeg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,bacteria
3872,person9_bacteria_39.jpeg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,bacteria
3873,person9_bacteria_40.jpeg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,bacteria


In [62]:
# Label distribution of the frame built from ./train/NORMAL.
normal_train_df.loc[:, 'y_val'].value_counts()

y_val
normal    1341
Name: count, dtype: int64

In [63]:
# Label distribution of the frame built from ./train/PNEUMONIA.
pneumonia_train_df.loc[:, 'y_val'].value_counts()

y_val
bacteria    2530
virus       1345
Name: count, dtype: int64

In [66]:
# Stack the normal rows on top of the pneumonia rows and renumber the index from 0.
train_df = pd.concat(
    objs=(normal_train_df, pneumonia_train_df),
    axis=0,
    ignore_index=True,
)
train_df

Unnamed: 0,File_Name,Image,y_val
0,IM-0115-0001.jpeg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,normal
1,IM-0117-0001.jpeg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,normal
2,IM-0119-0001.jpeg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,normal
3,IM-0122-0001.jpeg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,normal
4,IM-0125-0001.jpeg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,normal
...,...,...,...
5211,person99_virus_183.jpeg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,virus
5212,person9_bacteria_38.jpeg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,bacteria
5213,person9_bacteria_39.jpeg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,bacteria
5214,person9_bacteria_40.jpeg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,bacteria


In [67]:
# Stack the normal rows on top of the pneumonia rows and renumber the index from 0.
test_df = pd.concat(
    objs=(normal_test_df, pneumonia_test_df),
    axis=0,
    ignore_index=True,
)
test_df

Unnamed: 0,File_Name,Image,y_val
0,IM-0001-0001.jpeg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,normal
1,IM-0003-0001.jpeg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,normal
2,IM-0005-0001.jpeg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,normal
3,IM-0006-0001.jpeg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,normal
4,IM-0007-0001.jpeg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,normal
...,...,...,...
619,person96_bacteria_465.jpeg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,bacteria
620,person96_bacteria_466.jpeg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,bacteria
621,person97_bacteria_468.jpeg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,bacteria
622,person99_bacteria_473.jpeg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,bacteria


In [68]:
# Stack the normal rows on top of the pneumonia rows and renumber the index from 0.
val_df = pd.concat(
    objs=(normal_val_df, pneumonia_val_df),
    axis=0,
    ignore_index=True,
)
val_df

Unnamed: 0,File_Name,Image,y_val
0,NORMAL2-IM-1427-0001.jpeg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,normal
1,NORMAL2-IM-1430-0001.jpeg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,normal
2,NORMAL2-IM-1431-0001.jpeg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,normal
3,NORMAL2-IM-1436-0001.jpeg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,normal
4,NORMAL2-IM-1437-0001.jpeg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,normal
5,NORMAL2-IM-1438-0001.jpeg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,normal
6,NORMAL2-IM-1440-0001.jpeg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,normal
7,NORMAL2-IM-1442-0001.jpeg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,normal
8,person1946_bacteria_4874.jpeg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,bacteria
9,person1946_bacteria_4875.jpeg,<PIL.JpegImagePlugin.JpegImageFile image mode=...,bacteria


In [69]:
# Separate each split into features (the 'Image' column) and targets (the 'y_val' column).
X_train, y_train = train_df['Image'], train_df['y_val']
X_test, y_test = test_df['Image'], test_df['y_val']
X_val, y_val = val_df['Image'], val_df['y_val']

In [70]:
# Inspect the training labels (the 'y_val' column of train_df).
y_train

0         normal
1         normal
2         normal
3         normal
4         normal
          ...   
5211       virus
5212    bacteria
5213    bacteria
5214    bacteria
5215    bacteria
Name: y_val, Length: 5216, dtype: object