In [2]:
import sys
from pathlib import Path

ROOT = Path.cwd().resolve().parent.parent
DATA_PIPELINE = ROOT / 'data-pipeline' / 'src'

sys.path.append(str(DATA_PIPELINE))

In [3]:
# data_pipeline/ingestion ?
from data_pipeline.utils.common import load_config
from data_pipeline.constants import CONFIG_FILE_PATH

config = load_config(CONFIG_FILE_PATH)

[2024-12-29 13:23:13,802: INFO: common: Loaded yaml from /Users/ismasadou/Documents/kuude/ocular-detection/data-pipeline/src/data_pipeline/config.yml]


In [4]:
## Download dataset
from data_pipeline.ingestion.fetch_kaggle import download_datasets

datasets = config['datasets']
# download_datasets(datasets)

In [5]:
## Validate dataset
metadata_file = config['datasets'][0]['metadata_file']
dataset_name = config['datasets'][0]['name']
image_dir_name = config['datasets'][0]['image_folder']

dataset_dir = ROOT / 'data-pipeline' / 'outputs' / 'raw' / dataset_name

In [6]:
import pandas as pd

metadata = pd.read_csv(dataset_dir / metadata_file)
metadata.head()

Unnamed: 0,ID,Patient Age,Patient Sex,Left-Fundus,Right-Fundus,Left-Diagnostic Keywords,Right-Diagnostic Keywords,N,D,G,C,A,H,M,O,filepath,labels,target,filename
0,0,69,Female,0_left.jpg,0_right.jpg,cataract,normal fundus,1,0,0,0,0,0,0,0,../input/ocular-disease-recognition-odir5k/ODI...,['N'],"[1, 0, 0, 0, 0, 0, 0, 0]",0_right.jpg
1,1,57,Male,1_left.jpg,1_right.jpg,normal fundus,normal fundus,1,0,0,0,0,0,0,0,../input/ocular-disease-recognition-odir5k/ODI...,['N'],"[1, 0, 0, 0, 0, 0, 0, 0]",1_right.jpg
2,2,42,Male,2_left.jpg,2_right.jpg,laser spot，moderate non proliferative retinopathy,moderate non proliferative retinopathy,0,1,0,0,0,0,0,1,../input/ocular-disease-recognition-odir5k/ODI...,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",2_right.jpg
3,4,53,Male,4_left.jpg,4_right.jpg,macular epiretinal membrane,mild nonproliferative retinopathy,0,1,0,0,0,0,0,1,../input/ocular-disease-recognition-odir5k/ODI...,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",4_right.jpg
4,5,50,Female,5_left.jpg,5_right.jpg,moderate non proliferative retinopathy,moderate non proliferative retinopathy,0,1,0,0,0,0,0,0,../input/ocular-disease-recognition-odir5k/ODI...,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",5_right.jpg


In [7]:
def check_null_or_empty(metadata):
    null_or_empty = metadata.isnull() | (metadata == '')

    rows_with_null_or_empty = null_or_empty.any(axis=1)

    if rows_with_null_or_empty.any():
        row_count = rows_with_null_or_empty.sum()
        print(f'Found {row_count} rows with null or empty values.')
        for idx in metadata[rows_with_null_or_empty].index:
            missing = metadata.columns[null_or_empty.loc[idx]].tolist()
            print(f'Row {idx} has missing/empty values for columns: {missing}')
    else:
        print('No missing or empty values found in the dataset.')

check_null_or_empty(metadata)


No missing or empty values found in the dataset.


In [15]:
metadata.describe()

Unnamed: 0,ID,Patient Age,N,D,G,C,A,H,M,O
count,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0,6392.0
mean,2271.150814,57.857947,0.328692,0.332134,0.062109,0.062891,0.049906,0.031758,0.047872,0.248436
std,1417.559018,11.727737,0.469775,0.471016,0.241372,0.242786,0.217768,0.17537,0.213513,0.432139
min,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,920.75,51.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2419.5,59.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,3294.0,66.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,4784.0,91.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [19]:
import re

# Get all unique filenames and their counts
filename_counts = metadata['filename'].value_counts()
filename_counts = filename_counts.sort_index()
print(filename_counts)

# Extract numbers from filenames and check for pairs

# Extract numbers and sides from filenames
metadata['number'] = metadata['filename'].apply(lambda x: re.findall(r'(\d+)_', x)[0])
metadata['side'] = metadata['filename'].apply(lambda x: re.findall(r'_(left|right)', x)[0])

# Create a pivot table to check for pairs
pivot_table = metadata.pivot_table(index='number', columns='side', values='filename', aggfunc='count', fill_value=0)

# Find numbers without pairs
no_pairs = pivot_table[(pivot_table['left'] == 0) | (pivot_table['right'] == 0)]
print(f'Numbers without pairs: {no_pairs.index.tolist()}')

# Sort metadata by number and then by side
metadata = metadata.sort_values(by=['number', 'side'])

# Check for repeated filenames
repeated_filenames = metadata[metadata.duplicated('filename', keep=False)]
if not repeated_filenames.empty:
    print(f'Repeated filenames found:\n{repeated_filenames}')
else:
    print('No repeated filenames found.')

filename
0_left.jpg        1
0_right.jpg       1
1005_right.jpg    1
1006_left.jpg     1
1006_right.jpg    1
                 ..
999_right.jpg     1
99_left.jpg       1
99_right.jpg      1
9_left.jpg        1
9_right.jpg       1
Name: count, Length: 6392, dtype: int64
Numbers without pairs: ['1005', '1014', '1018', '1020', '1033', '1061', '1062', '1065', '1066', '1077', '108', '1089', '1095', '1096', '1116', '1121', '1123', '1127', '1130', '1137', '114', '1142', '1145', '1148', '1156', '1162', '1177', '1228', '124', '1242', '1243', '1254', '1263', '1273', '1274', '1310', '1319', '1369', '138', '141', '1412', '1442', '1456', '147', '1475', '151', '154', '1540', '155', '1560', '1566', '1567', '1571', '1573', '1574', '1581', '1585', '1591', '1595', '1597', '1598', '1614', '1626', '1628', '1638', '1640', '1642', '1643', '1652', '1657', '1659', '1662', '1664', '1677', '1683', '1706', '1710', '1716', '1799', '1801', '183', '1865', '188', '192', '195', '1965', '1968', '197', '2', '201', '2010

In [22]:
image_dir = dataset_dir / image_dir_name

# Check if every filename in the metadata exists in image_dir
missing_files = [f for f in metadata['filename'] if not (image_dir / f).exists()]
if missing_files:
    print(f'Missing files in image_dir: {missing_files}')
else:
    print('All files in metadata exist in image_dir.')

# Check if all of the images in the dir are part of the metadata
all_files_in_dir = set(f.name for f in image_dir.glob('*'))
metadata_files = set(metadata['filename'])
extra_files = all_files_in_dir - metadata_files
if extra_files:
    print(f'Extra files in image_dir not listed in metadata: {extra_files}')
else:
    print('All files in image_dir are listed in metadata.')

All files in metadata exist in image_dir.
All files in image_dir are listed in metadata.
