In [None]:
# import sys
# IN_COLAB = 'google.colab' in sys.modules

# if IN_COLAB:
#     !pip install deriva
#     !pip install bdbag
#     !pip install --upgrade --force pydantic
#     !pip install git+https://github.com/informatics-isi-edu/deriva-ml git+https://github.com/informatics-isi-edu/eye-ai-ml

In [None]:
# Name of the folder under the user's home directory that holds the local GitHub checkouts.
repo_dir = "Repos"   # Set this to be where your GitHub repos are located.
# Load IPython's autoreload extension in mode 2 so edited modules are re-imported
# before each cell executes.
%load_ext autoreload
%autoreload 2

# Put ~/<repo_dir>/eye-ai-ml at the front of the import path so the `eye_ai`
# imports in later cells are looked up there first.
import sys
from pathlib import Path
sys.path.insert(0, str(Path.home() / repo_dir / "eye-ai-ml"))

In [None]:
# Prerequisites

import json
import os
from eye_ai.eye_ai import EyeAI
import pandas as pd
from pathlib import Path, PurePath
import logging
# import torch

# Configure the root logger: INFO and above, timestamped records.
# force=True replaces any handlers that were installed before this cell ran.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    force=True,
)

In [None]:

from deriva.core.utils.globus_auth_utils import GlobusNativeLogin
catalog_id = "eye-ai" #@param
host = 'www.eye-ai.org'

# Authenticate against the catalog host through Globus, reusing an existing session if present.
gnl = GlobusNativeLogin(host=host)
if not gnl.is_logged_in([host]):
    # The flags request a login flow without a local server or browser, with refresh
    # tokens, and with the bdbag keychain updated.
    gnl.login(
        [host],
        no_local_server=True,
        no_browser=True,
        refresh_tokens=True,
        update_bdbag_keychain=True,
    )
    print("Login Successful")
else:
    print("You are already logged in.")

In [None]:
# Variables to configure the rest of the notebook.

cache_dir = '/data'        # Directory in which to cache materialized BDBags for datasets
working_dir = '/data'    # Directory in which to place output files for later upload.

configuration_rid = "2-C8W6" # RID of the execution configuration passed to EA.execution_init
# The configuration file must list the dataset bags in this order, because later cells
# read bag_paths[0], [1] and [2] as train, validation and test:
#   bag_url=["minid: train", "minid: valid", "minid: test"]



In [None]:
# Catalog client shared by all later cells (execution setup, training, evaluation, upload).
EA = EyeAI(
    hostname=host,
    catalog_id=catalog_id,
    cache_dir=cache_dir,
    working_dir=working_dir,
)

In [None]:
# @title Initiate an Execution
# Initialise an execution from the configuration RID. Later cells read bag_paths,
# assets_paths, working_dir, vocabs and execution_rid from the returned record.
configuration_records = EA.execution_init(configuration_rid=configuration_rid)
# Show the record's fields for inspection (presumably a pydantic model, given model_dump — confirm).
configuration_records.model_dump()

In [None]:
# Display the execution configuration record.
configuration_records

# 5 classes and Full fundus are converted to 2 classes, similar to the van quality labels
# "good" and "bad": the first 3 classes are good and the last 2 are bad.

In [None]:



# import os
# import shutil
# import pandas as pd
# from pathlib import Path

# def create_LACDHS_quality_dataset(train_dir: str, validation_dir: str, test_dir: str, output_dir: str, exclude_train: list = [], exclude_valid: list = []) -> tuple:
#     """
#     Creates a dataset for LACDHS image quality classification by organizing images into train, valid, and test folders
#     based on their Subject_image_quality from the subject table.

#     Parameters:
#     - train_dir (str): Path to the raw train dataset bag.
#     - validation_dir (str): Path to the raw validation dataset bag.
#     - test_dir (str): Path to the raw test dataset bag.
#     - output_dir (str): Path to the output directory where the organized dataset will be created.
#     - exclude_train (list): List of image RIDs to exclude from the train set.
#     - exclude_valid (list): List of image RIDs to exclude from the validation set.

#     Returns:
#     - tuple: A tuple containing the paths to the train, validation, and test directories.
#     """
#     def process_dataset(bag_path: str, output_subdir: str, exclude_list: list = []):
#         image_csv_path = os.path.join(bag_path, 'data', 'Image.csv')
#         observation_csv_path = os.path.join(bag_path, 'data', 'Observation.csv')
#         subject_csv_path = os.path.join(bag_path, 'data', 'Subject.csv')

#         image_df = pd.read_csv(image_csv_path)
#         observation_df = pd.read_csv(observation_csv_path)
#         subject_df = pd.read_csv(subject_csv_path)

#         # Merge dataframes
#         merged_df = image_df.merge(observation_df, left_on='Observation', right_on='RID')
#         merged_df = merged_df.merge(subject_df, left_on='Subject', right_on='RID')

#         # Filter for specific quality categories
#         quality_categories = ['69XP', '69XJ', '69XT', '69XM', '69XR']
#         merged_df = merged_df[merged_df['Subject_image_quality'].isin(quality_categories)]

#         image_root_path = os.path.join(bag_path, 'data', 'assets', 'Image')

#         for _, row in merged_df.iterrows():
#             if row['RID_x'] not in exclude_list:  # RID_x is the Image RID
#                 quality = row['Subject_image_quality']
#                 filename = row['Filename']
#                 src_path = os.path.join(image_root_path, filename)
#                 dst_dir = os.path.join(output_dir, output_subdir, quality)
#                 os.makedirs(dst_dir, exist_ok=True)
#                 dst_path = os.path.join(dst_dir, filename)
#                 shutil.copy2(src_path, dst_path)

#     # Process train dataset
#     process_dataset(train_dir, 'train', exclude_train)

#     # Process validation dataset
#     process_dataset(validation_dir, 'valid', exclude_valid)

#     # Process test dataset
#     process_dataset(test_dir, 'test')

#     train_path = os.path.join(output_dir, 'train')
#     valid_path = os.path.join(output_dir, 'valid')
#     test_path = os.path.join(output_dir, 'test')

#     return train_path, valid_path, test_path


import os
import shutil
import pandas as pd
from pathlib import Path
from tqdm import tqdm

def create_LACDHS_binary_quality_dataset(train_dir: str, validation_dir: str, test_dir: str, output_dir: str) -> tuple:
    """
    Build a binary image-quality dataset from three LACDHS dataset bags.

    For each bag, ``Image.csv``, ``Observation.csv`` and ``Subject.csv`` under ``<bag>/data``
    are joined (Image.Observation -> Observation.RID, then Subject -> Subject.RID). Rows whose
    ``Subject_image_quality`` is one of the five known quality RIDs are kept, that RID is mapped
    to one of two binary quality RIDs ('690J' good, '692J' bad), and the image file is copied
    from ``<bag>/data/assets/Image`` into ``<output_dir>/<split>/<binary quality RID>/``.

    Parameters:
    - train_dir (str): Path to the raw train dataset bag.
    - validation_dir (str): Path to the raw validation dataset bag.
    - test_dir (str): Path to the raw test dataset bag.
    - output_dir (str): Path to the output directory where the organized dataset will be created.

    Returns:
    - tuple: ``(train_path, valid_path, test_path)``, i.e. ``output_dir`` joined with
      'train', 'valid' and 'test'. A split directory is only created on disk when at
      least one image was copied into it.

    Raises:
    - FileNotFoundError: if one of the CSV files or a referenced image file does not exist.
    - KeyError: if a column used for joining, filtering or copying is missing from the CSVs.
    """
    # Subject_image_quality RID -> binary quality RID (also used as the class folder name).
    quality_mapping = {
        '69XT': '690J',  # Excellent -> 690J (eye-ai:690J, Good)
        '69XP': '690J',  # Adequate -> 690J (eye-ai:690J, Good)
        '69XJ': '690J',  # Good -> 690J (eye-ai:690J, Good)
        '69XM': '692J',  # Insufficient for Full Interpretation -> 692J (eye-ai:692J, Bad)
        '69XR': '692J',  # Insufficient for Any Interpretation -> 692J (eye-ai:692J, Bad)
    }

    def process_dataset(bag_path: str, output_subdir: str):
        """Copy the images of one bag into ``<output_dir>/<output_subdir>/<binary quality>/``."""
        data_dir = os.path.join(bag_path, 'data')
        image_df = pd.read_csv(os.path.join(data_dir, 'Image.csv'))
        observation_df = pd.read_csv(os.path.join(data_dir, 'Observation.csv'))
        subject_df = pd.read_csv(os.path.join(data_dir, 'Subject.csv'))

        # Image -> Observation -> Subject, so every image row carries its subject's quality.
        merged_df = image_df.merge(observation_df, left_on='Observation', right_on='RID')
        merged_df = merged_df.merge(subject_df, left_on='Subject', right_on='RID')

        # Keep only the known quality RIDs. The explicit copy makes the filtered frame
        # independent, so adding a column below does not raise SettingWithCopyWarning.
        has_known_quality = merged_df['Subject_image_quality'].isin(list(quality_mapping))
        merged_df = merged_df[has_known_quality].copy()
        merged_df['binary_quality'] = merged_df['Subject_image_quality'].map(quality_mapping)

        image_root_path = os.path.join(data_dir, 'assets', 'Image')
        split_dir = os.path.join(output_dir, output_subdir)

        # Create each class directory once instead of once per image.
        for binary_quality in merged_df['binary_quality'].unique():
            os.makedirs(os.path.join(split_dir, binary_quality), exist_ok=True)

        # Only two columns are needed per row, so iterate them directly rather than via iterrows().
        label_and_file = zip(merged_df['binary_quality'], merged_df['Filename'])
        for binary_quality, filename in tqdm(label_and_file, total=len(merged_df), desc=f"Processing {output_subdir}"):
            shutil.copy2(
                os.path.join(image_root_path, filename),
                os.path.join(split_dir, binary_quality, filename),
            )

    print("Starting dataset creation...")

    # Process the three splits in a fixed order: train, valid, test.
    process_dataset(train_dir, 'train')
    process_dataset(validation_dir, 'valid')
    process_dataset(test_dir, 'test')

    train_path = os.path.join(output_dir, 'train')
    valid_path = os.path.join(output_dir, 'valid')
    test_path = os.path.join(output_dir, 'test')

    print("Dataset creation completed.")
    return train_path, valid_path, test_path



In [None]:
# Display the execution's working directory; the next cell passes it as output_dir
# to the dataset builder.
configuration_records.working_dir

In [None]:
# @title Data Preprocessing (build the binary image-quality dataset)
# Raw dataset bags, in the order train / validation / test.
train_dir = configuration_records.bag_paths[0]
validation_dir = configuration_records.bag_paths[1]
test_dir = configuration_records.bag_paths[2]

# Copy the images of each bag into <working_dir>/<split>/<binary quality>/.
train_path, valid_path, test_path = create_LACDHS_binary_quality_dataset(
    train_dir=str(train_dir),
    validation_dir=str(validation_dir),
    test_dir=str(test_dir),
    output_dir=str(configuration_records.working_dir),
)

# Report where each split was written.
print(f"Train dataset path: {train_path}")
print(f"Validation dataset path: {valid_path}")
print(f"Test dataset path: {test_path}")

In [None]:
import os

def count_files(directory):
    """Return the number of regular files directly inside ``directory`` (sub-folders are not counted)."""
    return sum(
        1
        for entry in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, entry))
    )

def analyze_lacdhs_angle_dataset(base_path):
    """
    Print the number of files per class folder for each dataset split under ``base_path``.

    The splits ``train``, ``valid`` and ``test`` are checked in that order. A missing split
    is reported and skipped. For an existing split, every sub-directory is listed with its
    file count (via ``count_files``), followed by the total for the split. Files lying
    directly in the split folder are ignored.
    """
    for split in ('train', 'valid', 'test'):
        split_dir = os.path.join(base_path, split)
        if not os.path.exists(split_dir):
            print(f"{split} folder not found")
            continue

        print(f"\nAnalyzing {split} folder:")

        split_total = 0
        for entry in os.listdir(split_dir):
            class_dir = os.path.join(split_dir, entry)
            if not os.path.isdir(class_dir):
                # Only class sub-folders are counted.
                continue
            n_images = count_files(class_dir)
            print(f"  {entry}: {n_images} images")
            split_total += n_images

        print(f"Total images in {split}: {split_total}")

# Usage: analyse the directory the dataset builder wrote to (it was given
# configuration_records.working_dir as output_dir) instead of a hard-coded, user-specific path.
base_path = str(configuration_records.working_dir)
analyze_lacdhs_angle_dataset(base_path)





In [None]:
import os
import random
import matplotlib.pyplot as plt
from PIL import Image

def visualize_lacdhs_quality_dataset(base_path, samples_per_angle=6):
    """
    Show a grid of randomly chosen sample images for every class folder of each dataset split.

    For each of ``train``, ``valid`` and ``test`` under ``base_path`` one figure is drawn with
    one row per class sub-folder and up to ``samples_per_angle`` images per row. Every image is
    titled with its filename and every row is labelled with its class folder name. Splits that
    are missing or contain no class folders are reported and skipped.

    Parameters:
    - base_path (str): Directory containing the ``train``/``valid``/``test`` folders.
    - samples_per_angle (int): Maximum number of images shown per class folder (grid columns).

    Returns:
    - None. Figures are shown with ``plt.show()`` and progress messages are printed.
    """
    main_folders = ['train', 'valid', 'test']

    for main_folder in main_folders:
        main_folder_path = os.path.join(base_path, main_folder)
        if not os.path.exists(main_folder_path):
            print(f"{main_folder} folder not found")
            continue

        print(f"\nVisualizing samples from {main_folder} folder:")

        # Sorted so the row order does not depend on the filesystem's listing order.
        angle_folders = sorted(
            f for f in os.listdir(main_folder_path)
            if os.path.isdir(os.path.join(main_folder_path, f))
        )
        if not angle_folders:
            # plt.subplots cannot build a grid with zero rows.
            print(f"No class folders found in {main_folder}")
            continue

        # Calculate grid size
        n_rows = len(angle_folders)
        n_cols = samples_per_angle

        # squeeze=False always returns a 2-D array of axes, so axes[i, j] also works
        # when there is a single class folder or a single column.
        fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols*3, n_rows*3.5), squeeze=False)
        fig.suptitle(f'Sample Images from {main_folder.capitalize()} Set', fontsize=16)

        # Hide ticks and frames on every cell (including cells left empty) while keeping
        # labels drawable; axis('off') would also suppress the row labels set below.
        for ax in axes.flat:
            ax.set_xticks([])
            ax.set_yticks([])
            for spine in ax.spines.values():
                spine.set_visible(False)

        for i, angle_folder in enumerate(angle_folders):
            angle_folder_path = os.path.join(main_folder_path, angle_folder)
            image_files = sorted(
                f for f in os.listdir(angle_folder_path)
                if f.lower().endswith(('.png', '.jpg', '.jpeg'))
            )

            if len(image_files) < samples_per_angle:
                print(f"Warning: Not enough images in {angle_folder}. Using all available images.")
                selected_files = image_files
            else:
                selected_files = random.sample(image_files, samples_per_angle)

            # Label the row with its class folder on the first column.
            axes[i, 0].set_ylabel(angle_folder, rotation=0, labelpad=40, va='center', fontsize=10)

            for j, image_file in enumerate(selected_files):
                ax = axes[i, j]
                # Close the file handle as soon as the pixels have been handed to matplotlib.
                with Image.open(os.path.join(angle_folder_path, image_file)) as img:
                    ax.imshow(img)

                # Add image filename as title for each subplot
                ax.set_title(image_file, fontsize=8)

        plt.tight_layout()
        plt.subplots_adjust(top=0.95, bottom=0.05, left=0.2, right=0.98)
        plt.show()
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)

        # Print confirmation of angles
        print(f"Angles in {main_folder} set:")
        for angle in angle_folders:
            print(f"  - {angle}")

# Usage (call left disabled): point at the directory the dataset builder wrote to
# (configuration_records.working_dir) instead of a hard-coded, user-specific path.
base_path = str(configuration_records.working_dir)
# visualize_lacdhs_quality_dataset(base_path)

In [None]:

# Directory for this run's output assets: <EA.working_dir>/Execution_Assets/<first asset type name>.
output_path = os.path.join(
    str(EA.working_dir),
    "Execution_Assets",
    configuration_records.vocabs['Execution_Asset_Type'][0].name,
)
# makedirs with exist_ok=True creates missing parent folders and lets the cell be re-run
# without failing when the directory already exists.
os.makedirs(output_path, exist_ok=True)

In [None]:
# Display the directory that training and evaluation write their outputs to.
output_path

In [None]:
# Path of the first asset listed by the execution configuration; later cells open it
# as the best-hyperparameters JSON file.
best_hyper_parameters_json_path = str(configuration_records.assets_paths[0])

In [None]:
# Display the hyperparameter file path for a quick sanity check.
best_hyper_parameters_json_path

In [None]:
import json

# Load the hyperparameter JSON file into `data` (a later cell displays it again).
with open(best_hyper_parameters_json_path, 'r') as file:
    data = json.load(file)

# Pretty-print the loaded hyperparameters with 4-space indentation.
print(json.dumps(data, indent=4))

In [None]:
# @title Execute Training algorithm

from eye_ai.models.vgg19_lacdhs_quality_train import main

# Run training inside the EA.execution context for this execution RID.
# The context object is not bound: it was unused, and binding it to `exec`
# shadowed the Python builtin of the same name.
with EA.execution(execution_rid=configuration_records.execution_rid):
    main(
        train_path=train_path,
        valid_path=valid_path,
        output_path=output_path,
        best_hyperparameters_json_path=best_hyper_parameters_json_path,
        # NOTE(review): the evaluation cell loads "<output_path>/<this name>.h5" — keep the two in sync.
        model_name="VGG19_Catalog_LAC_DHS_Quality_Trained_model_June_30_2024",
    )


In [None]:
# Display the hyperparameters that were loaded from the JSON file.
data

In [None]:
# @title Execute Evaluation algorithm

from eye_ai.models.vgg19_lacdhs_quality_predict import predict_and_evaluate
# Run evaluation on the test split inside the EA.execution context for this execution RID.
# The context object is not bound: it was unused, and binding it to `exec`
# shadowed the Python builtin of the same name.
with EA.execution(execution_rid=configuration_records.execution_rid):
    predict_and_evaluate(
        # NOTE(review): file name must match the model_name used by the training cell (plus ".h5").
        model_path=output_path + '/VGG19_Catalog_LAC_DHS_Quality_Trained_model_June_30_2024.h5',
        image_path=test_path,
        output_dir=output_path,
        best_hyperparameters_json_path=best_hyper_parameters_json_path,
    )

In [None]:
# @title Save Execution Assets (model) and Metadata
# Upload the outputs of this execution and keep the call's result in `uploaded_assets`.
# NOTE(review): the meaning of the positional `True` is not visible in this notebook —
# confirm against EyeAI.execution_upload.
uploaded_assets = EA.execution_upload(configuration_records.execution_rid, True)