In [1]:
import sys
from pathlib import Path

ROOT = Path.cwd().resolve().parent.parent
DATA_PIPELINE = ROOT / 'data-pipeline' / 'src'

sys.path.append(str(DATA_PIPELINE))



In [2]:
from data_pipeline.utils.common import load_config
from data_pipeline.constants import CONFIG_FILE_PATH

config = load_config(CONFIG_FILE_PATH)

[2024-12-28 22:18:01,610: INFO: common: Loaded yaml from /Users/ismasadou/Documents/kuude/ocular-detection/data-pipeline/src/data_pipeline/config.yml]


In [3]:
from data_pipeline.ingestion.fetch_kaggle import download_datasets

datasets = config['datasets']
metadata_file = config['datasets'][0]['metadata_file']
dataset_name = config['datasets'][0]['name']
image_dir_name = config['datasets'][0]['image_folder']

dataset_dir = ROOT / 'data-pipeline' / 'outputs' / 'raw' / dataset_name

import pandas as pd

metadata = pd.read_csv(dataset_dir / metadata_file)
metadata.head()

Unnamed: 0,ID,Patient Age,Patient Sex,Left-Fundus,Right-Fundus,Left-Diagnostic Keywords,Right-Diagnostic Keywords,N,D,G,C,A,H,M,O,filepath,labels,target,filename
0,0,69,Female,0_left.jpg,0_right.jpg,cataract,normal fundus,1,0,0,0,0,0,0,0,../input/ocular-disease-recognition-odir5k/ODI...,['N'],"[1, 0, 0, 0, 0, 0, 0, 0]",0_right.jpg
1,1,57,Male,1_left.jpg,1_right.jpg,normal fundus,normal fundus,1,0,0,0,0,0,0,0,../input/ocular-disease-recognition-odir5k/ODI...,['N'],"[1, 0, 0, 0, 0, 0, 0, 0]",1_right.jpg
2,2,42,Male,2_left.jpg,2_right.jpg,laser spot，moderate non proliferative retinopathy,moderate non proliferative retinopathy,0,1,0,0,0,0,0,1,../input/ocular-disease-recognition-odir5k/ODI...,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",2_right.jpg
3,4,53,Male,4_left.jpg,4_right.jpg,macular epiretinal membrane,mild nonproliferative retinopathy,0,1,0,0,0,0,0,1,../input/ocular-disease-recognition-odir5k/ODI...,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",4_right.jpg
4,5,50,Female,5_left.jpg,5_right.jpg,moderate non proliferative retinopathy,moderate non proliferative retinopathy,0,1,0,0,0,0,0,0,../input/ocular-disease-recognition-odir5k/ODI...,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",5_right.jpg


In [4]:
import ast

def parse_target_column(metadata: pd.DataFrame) -> pd.DataFrame:
    def parse_target(x):
        try:
            return ast.literal_eval(x)
        except (ValueError, SyntaxError):
            return []

    metadata['parsed_target'] = metadata['target'].apply(parse_target)
    return metadata


Unnamed: 0,ID,Patient Age,Patient Sex,Left-Fundus,Right-Fundus,Left-Diagnostic Keywords,Right-Diagnostic Keywords,N,D,G,C,A,H,M,O,filepath,labels,target,filename,parsed_target
0,0,69,Female,0_left.jpg,0_right.jpg,cataract,normal fundus,1,0,0,0,0,0,0,0,../input/ocular-disease-recognition-odir5k/ODI...,['N'],"[1, 0, 0, 0, 0, 0, 0, 0]",0_right.jpg,"[1, 0, 0, 0, 0, 0, 0, 0]"
1,1,57,Male,1_left.jpg,1_right.jpg,normal fundus,normal fundus,1,0,0,0,0,0,0,0,../input/ocular-disease-recognition-odir5k/ODI...,['N'],"[1, 0, 0, 0, 0, 0, 0, 0]",1_right.jpg,"[1, 0, 0, 0, 0, 0, 0, 0]"
2,2,42,Male,2_left.jpg,2_right.jpg,laser spot，moderate non proliferative retinopathy,moderate non proliferative retinopathy,0,1,0,0,0,0,0,1,../input/ocular-disease-recognition-odir5k/ODI...,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",2_right.jpg,"[0, 1, 0, 0, 0, 0, 0, 0]"
3,4,53,Male,4_left.jpg,4_right.jpg,macular epiretinal membrane,mild nonproliferative retinopathy,0,1,0,0,0,0,0,1,../input/ocular-disease-recognition-odir5k/ODI...,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",4_right.jpg,"[0, 1, 0, 0, 0, 0, 0, 0]"
4,5,50,Female,5_left.jpg,5_right.jpg,moderate non proliferative retinopathy,moderate non proliferative retinopathy,0,1,0,0,0,0,0,0,../input/ocular-disease-recognition-odir5k/ODI...,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",5_right.jpg,"[0, 1, 0, 0, 0, 0, 0, 0]"


In [30]:
def preprocess_metadata(metadata: pd.DataFrame) -> pd.DataFrame:
    metadata['parsed_gender'] = metadata['Patient Sex'].map({'Male': 0, 'Female': 1})
    metadata = parse_target_column(metadata)
    return metadata

metadata = preprocess_metadata(metadata)
metadata.head()


Unnamed: 0,ID,Patient Age,Patient Sex,Left-Fundus,Right-Fundus,Left-Diagnostic Keywords,Right-Diagnostic Keywords,N,D,G,...,H,M,O,filepath,labels,target,filename,parsed_target,parsed_gender,age_group
0,0,69,Female,0_left.jpg,0_right.jpg,cataract,normal fundus,1,0,0,...,0,0,0,../input/ocular-disease-recognition-odir5k/ODI...,['N'],"[1, 0, 0, 0, 0, 0, 0, 0]",0_right.jpg,"[1, 0, 0, 0, 0, 0, 0, 0]",1,60-69
1,1,57,Male,1_left.jpg,1_right.jpg,normal fundus,normal fundus,1,0,0,...,0,0,0,../input/ocular-disease-recognition-odir5k/ODI...,['N'],"[1, 0, 0, 0, 0, 0, 0, 0]",1_right.jpg,"[1, 0, 0, 0, 0, 0, 0, 0]",0,50-59
2,2,42,Male,2_left.jpg,2_right.jpg,laser spot，moderate non proliferative retinopathy,moderate non proliferative retinopathy,0,1,0,...,0,0,1,../input/ocular-disease-recognition-odir5k/ODI...,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",2_right.jpg,"[0, 1, 0, 0, 0, 0, 0, 0]",0,40-49
3,4,53,Male,4_left.jpg,4_right.jpg,macular epiretinal membrane,mild nonproliferative retinopathy,0,1,0,...,0,0,1,../input/ocular-disease-recognition-odir5k/ODI...,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",4_right.jpg,"[0, 1, 0, 0, 0, 0, 0, 0]",0,50-59
4,5,50,Female,5_left.jpg,5_right.jpg,moderate non proliferative retinopathy,moderate non proliferative retinopathy,0,1,0,...,0,0,0,../input/ocular-disease-recognition-odir5k/ODI...,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",5_right.jpg,"[0, 1, 0, 0, 0, 0, 0, 0]",1,40-49


In [6]:
def get_age_distribution(metadata: pd.DataFrame, age_col: str = "Patient Age") -> pd.DataFrame:
    if age_col not in metadata.columns:
        raise ValueError(f"Column '{age_col}' not found in metadata")
    age_stats = metadata[age_col].describe().to_frame().T
    return age_stats
    

age_stats = get_age_distribution(metadata)
age_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Patient Age,6392.0,57.857947,11.727737,1.0,51.0,59.0,66.0,91.0


In [7]:
# now with bins of 10 years
def get_age_group_distribution(metadata: pd.DataFrame, age_col: str = "Patient Age", bin_size: int = 10) -> pd.DataFrame:
    if age_col not in metadata.columns:
        raise ValueError(f"Column '{age_col}' not found in metadata")

    bins = range(0, metadata[age_col].max() + bin_size, bin_size)
    labels = [f'{i}-{i+bin_size-1}' for i in range(0, metadata[age_col].max(), bin_size)]

    metadata['age_group'] = pd.cut(metadata[age_col], bins=bins, labels=labels)
    age_distribution = metadata['age_group'].value_counts().sort_index().reset_index()
    age_distribution.columns = ['age_group', 'count']
    return age_distribution

age_distribution = get_age_group_distribution(metadata)
age_distribution

Unnamed: 0,age_group,count
0,0-9,28
1,10-19,10
2,20-29,68
3,30-39,309
4,40-49,1097
5,50-59,2148
6,60-69,1994
7,70-79,623
8,80-89,113
9,90-99,2


In [8]:
def get_gender_distribution(metadata: pd.DataFrame, gender_col: str = "Patient Sex") -> pd.DataFrame:
    gender_counts = metadata[gender_col].value_counts().reset_index()
    gender_counts.columns = ['Gender', 'Count']

    gender_counts['Proportion'] = gender_counts['Count'] / gender_counts['Count'].sum()
    return gender_counts

gender_distribution = get_gender_distribution(metadata)
gender_distribution

Unnamed: 0,Gender,Count,Proportion
0,Male,3424,0.53567
1,Female,2968,0.46433


In [9]:
import numpy as np

def get_disease_distribution(metadata: pd.DataFrame, target_col: str = "parsed_target", labels_mapper: list[str] = None) -> pd.DataFrame:
    # Extract the one-hot encoded target values
    target_values = np.array(metadata[target_col].tolist())
    
    # Sum the one-hot encoded values along the columns to get the count of each disease
    disease_counts = target_values.sum(axis=0)
    
    if labels_mapper is None:
        labels_mapper = ['N', 'D', 'G', 'C', 'A', 'H', 'M', 'O']
    
    # Create a DataFrame with the disease labels and their corresponding counts
    disease_distribution = pd.DataFrame({
        'Disease': labels_mapper,
        'Count': disease_counts
    })
    
    return disease_distribution

def get_disease_distribution_via_labels(metadata: pd.DataFrame, target_col: str = "labels", labels_mapper: list[str] = None) -> pd.DataFrame:
    disease_counts = metadata[target_col].value_counts().reset_index()
    disease_counts.columns = ['Disease', 'Count']
    return disease_counts

disease_distribution = get_disease_distribution(metadata)
print(disease_distribution)

disease_distribution_via_labels = get_disease_distribution_via_labels(metadata)
print(disease_distribution_via_labels)

  Disease  Count
0       N   2873
1       D   1608
2       G    284
3       C    293
4       A    266
5       H    128
6       M    232
7       O    708
  Disease  Count
0   ['N']   2873
1   ['D']   1608
2   ['O']    708
3   ['C']    293
4   ['G']    284
5   ['A']    266
6   ['M']    232
7   ['H']    128


In [10]:
# now get correlation between diseases and age / gender
def get_disease_correlation(metadata: pd.DataFrame) -> pd.DataFrame:

    numeric_df = metadata.select_dtypes(include=[np.number])
    disease_correlation = numeric_df.corr()
    
    return disease_correlation

disease_correlation = get_disease_correlation(metadata)
print(disease_correlation)

                     ID  Patient Age         N         D         G         C  \
ID             1.000000    -0.148889  0.291269  0.311308 -0.164544 -0.084622   
Patient Age   -0.148889     1.000000 -0.066347 -0.091023  0.100070  0.189807   
N              0.291269    -0.066347  1.000000 -0.493628 -0.180131 -0.181097   
D              0.311308    -0.091023 -0.493628  1.000000 -0.104401 -0.081076   
G             -0.164544     0.100070 -0.180131 -0.104401  1.000000 -0.050538   
C             -0.084622     0.189807 -0.181097 -0.081076 -0.050538  1.000000   
A             -0.187629     0.058467 -0.160428 -0.112809  0.000557 -0.059295   
H             -0.044079    -0.026488 -0.126773  0.040872  0.008842 -0.032139   
M             -0.152493    -0.025278 -0.156958 -0.102116 -0.018233 -0.058012   
O             -0.623431     0.071769 -0.402450 -0.023391 -0.030945 -0.065133   
parsed_gender -0.058814     0.092731 -0.032065 -0.030487 -0.022536  0.060547   

                      A         H      

In [11]:
def get_distribution_metrics(metadata: pd.DataFrame) -> pd.DataFrame:
    metrics = pd.DataFrame(columns=['Column', 'Skewness', 'Kurtosis'])
    numeric_df = metadata.select_dtypes(include=[np.number])

    for col in numeric_df.columns:
        skewness = numeric_df[col].skew()
        kurtosis = numeric_df[col].kurtosis()
        metrics = pd.concat([metrics, pd.DataFrame({'Column': [col], 'Skewness': [skewness], 'Kurtosis': [kurtosis]})], ignore_index=True)

    return metrics

distribution_metrics = get_distribution_metrics(metadata)
print(distribution_metrics)

           Column  Skewness   Kurtosis
0              ID  0.061031  -1.242108
1     Patient Age -0.697314   2.047403
2               N  0.728793  -1.469320
3               D  0.713007  -1.492088
4               G  3.629485  11.176657
5               C  3.607380  11.016638
6               A  4.134993  15.102895
7               H  5.341711  26.542181
8               M  4.236460  15.952583
9               O  1.164639  -0.643818
10  parsed_gender  0.143076  -1.980149


  metrics = pd.concat([metrics, pd.DataFrame({'Column': [col], 'Skewness': [skewness], 'Kurtosis': [kurtosis]})], ignore_index=True)


In [23]:
from scipy import stats

def detect_outliers_zscore(metadata: pd.DataFrame, column: str = None, threshold: int = 3) -> pd.DataFrame:
    outliers = pd.DataFrame(columns=['Column', 'Outliers'])
    numeric_df = metadata.select_dtypes(include=[np.number])

    for col in numeric_df.columns:
        z_scores = np.abs((numeric_df[col] - numeric_df[col].mean()) / numeric_df[col].std())
        outliers = pd.concat([outliers, pd.DataFrame({'Column': [col], 'Outliers': [z_scores[z_scores > threshold].count()],'Vals': [z_scores[z_scores > threshold]]})], ignore_index=True)

    z_scores = stats.zscore(numeric_df)
    abs_z_scores = np.abs(z_scores)
    o3 = numeric_df.iloc[np.where(abs_z_scores > threshold)[0]]
    outliers_2 = (abs_z_scores > threshold).all(axis=1)

    return outliers, outliers_2, o3

outliers_zscore, o2, o3 = detect_outliers_zscore(metadata)
print(outliers_zscore)
print("*"*10)
print(o2)
print("*"*10)
print(o3)
print(outliers_zscore['Column'])

           Column Outliers                                               Vals
0              ID        0               Series([], Name: ID, dtype: float64)
1     Patient Age       40  1013    4.848160
1218    4.848160
1219    4.84...
2               N        0                Series([], Name: N, dtype: float64)
3               D        0                Series([], Name: D, dtype: float64)
4               G      397  34      3.885665
78      3.885665
129     3.88...
5               C      401  20      3.864947
65      3.864947
86      3.86...
6               A      319  34      4.362871
39      4.362871
44      4.36...
7               H      203  10      5.521133
19      5.521133
23      5.52...
8               M      306  11      4.459347
14      4.459347
16      4.45...
9               O        0                Series([], Name: O, dtype: float64)
10  parsed_gender        0    Series([], Name: parsed_gender, dtype: float64)
**********
0       False
1       False
2       False
3       Fal

In [27]:
import pandas as pd
import numpy as np
from scipy import stats
from typing import List, Optional, Dict, Any

def detect_outliers_zscore(
    metadata: pd.DataFrame,
    columns: Optional[List[str]] = None,
    threshold: float = 3.0
) -> Dict[str, Any]:
    """
    Detects outliers in specified numerical columns of a DataFrame using the Z-score method.

    Args:
        metadata (pd.DataFrame): The input DataFrame containing the data.
        columns (Optional[List[str]]): List of numerical columns to analyze for outliers.
                                        If None, all numerical columns are analyzed.
        threshold (float): The Z-score threshold to identify outliers. Default is 3.0.

    Returns:
        Dict[str, Any]: A dictionary containing:
            - 'outlier_counts': DataFrame with columns ['Column', 'Outlier_Count'] indicating the number of outliers per column.
            - 'outlier_rows': DataFrame containing all rows that have at least one outlier in the specified columns.
            - 'detailed_outliers': DataFrame listing each outlier with its column, index, and Z-score.
    """
    # Select numerical columns if columns not specified
    if columns is None:
        numeric_df = metadata.select_dtypes(include=[np.number])
    else:
        # Ensure specified columns are in the DataFrame and numerical
        missing_cols = [col for col in columns if col not in metadata.columns]
        if missing_cols:
            raise KeyError(f"The following specified columns are not in the DataFrame: {missing_cols}")
        numeric_df = metadata[columns].select_dtypes(include=[np.number])
        if numeric_df.empty:
            raise ValueError("No numerical columns found in the specified columns.")

    if numeric_df.empty:
        raise ValueError("No numerical columns available for outlier detection.")

    # Compute Z-scores using scipy.stats.zscore (axis=0 for column-wise)
    z_scores = stats.zscore(numeric_df, nan_policy='omit')
    
    # Replace NaN Z-scores with 0 to handle constant columns
    z_scores = np.nan_to_num(z_scores)

    # Create a boolean DataFrame where True indicates an outlier
    abs_z_scores = np.abs(z_scores)
    outlier_mask = abs_z_scores > threshold

    # Outlier counts per column
    outlier_counts = pd.DataFrame({
        'Column': numeric_df.columns,
        'Outlier_Count': outlier_mask.sum(axis=0)
    })

    # Identify rows with any outliers
    rows_with_outliers = metadata[outlier_mask.any(axis=1)]

    # Detailed outliers with column, index, and Z-score
    detailed_outliers_list = []
    for col_idx, col in enumerate(numeric_df.columns):
        outlier_indices = np.where(outlier_mask[:, col_idx])[0]
        for idx in outlier_indices:
            detailed_outliers_list.append({
                'Index': metadata.index[idx],
                'Column': col,
                'Value': numeric_df.iloc[idx, col_idx],
                'Z-score': z_scores[idx, col_idx]
            })
    detailed_outliers = pd.DataFrame(detailed_outliers_list)

    return {
        'outlier_counts': outlier_counts,
        'outlier_rows': rows_with_outliers,
        'detailed_outliers': detailed_outliers
    }


outliers = detect_outliers_zscore(metadata)
print(outliers['outlier_counts'])
print(outliers['outlier_rows'])
print(len(outliers['detailed_outliers']))
# print(outliers['detailed_outliers'])

           Column  Outlier_Count
0              ID              0
1     Patient Age             40
2               N              0
3               D              0
4               G            397
5               C            401
6               A            319
7               H            203
8               M            306
9               O              0
10  parsed_gender              0
        ID  Patient Age Patient Sex    Left-Fundus    Right-Fundus  \
10      11           60      Female    11_left.jpg    11_right.jpg   
11      13           60      Female    13_left.jpg    13_right.jpg   
14      16           54      Female    16_left.jpg    16_right.jpg   
16      18           58        Male    18_left.jpg    18_right.jpg   
19      23           47        Male    23_left.jpg    23_right.jpg   
...    ...          ...         ...            ...             ...   
6302  4579           58        Male  4579_left.jpg  4579_right.jpg   
6303  4581           43        Male  4581_le

In [29]:
def detect_outliers_iqr(df: pd.DataFrame, 
                        column: str, 
                        multiplier: float = 1.5) -> pd.DataFrame:
    """
    Detects outliers in a specified column using the IQR method.
    
    Args:
        df (pd.DataFrame): The metadata dataframe.
        column (str): The column to check for outliers.
        multiplier (float): The IQR multiplier to define outlier thresholds.
    
    Returns:
        pd.DataFrame: DataFrame containing the outlier data points.
    """
    if column not in df.columns:
        raise KeyError(f"Column '{column}' not found in DataFrame.")
    
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    print(Q1, Q3, IQR)
    lower_bound = Q1 - multiplier * IQR
    upper_bound = Q3 + multiplier * IQR
    print(lower_bound, upper_bound)
    
    outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]
    return outliers

outliers_iqr = detect_outliers_iqr(metadata, 'Patient Age')
print(outliers_iqr)

51.0 66.0 15.0
28.5 88.5
        ID  Patient Age Patient Sex    Left-Fundus    Right-Fundus  \
33      42           89        Male    42_left.jpg    42_right.jpg   
75      91           28        Male    91_left.jpg    91_right.jpg   
597    689           28      Female   689_left.jpg   689_right.jpg   
741    858           25        Male   858_left.jpg   858_right.jpg   
893   1058           28      Female  1058_left.jpg  1058_right.jpg   
...    ...          ...         ...            ...             ...   
5945  4178           27      Female  4178_left.jpg  4178_right.jpg   
5996  4232           26      Female  4232_left.jpg  4232_right.jpg   
6052  4291           23        Male  4291_left.jpg  4291_right.jpg   
6144  4392           17        Male  4392_left.jpg  4392_right.jpg   
6305  4583           28        Male  4583_left.jpg  4583_right.jpg   

                    Left-Diagnostic Keywords  \
33                             normal fundus   
75                             normal 