<a href="https://colab.research.google.com/github/informatics-isi-edu/eye-ai-exec/blob/main/notebooks/VGG19_Diagnosis_Train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# VGG19 Training --- Fine tuning on Van's Labels

This notebook is used to train a VGG19 model for glaucoma diagnosis.

In [7]:
# import sys
# IN_COLAB = 'google.colab' in sys.modules

# if IN_COLAB:
#     !pip install deriva
#     !pip install bdbag
#     !pip install --upgrade --force pydantic
#     !pip install git+https://github.com/informatics-isi-edu/deriva-ml git+https://github.com/informatics-isi-edu/eye-ai-ml

In [8]:
repo_dir = "Repos"   # Folder under your home directory that holds your GitHub checkouts.
# Reload imported modules before each cell runs so edits to the local
# checkout are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Put <home>/<repo_dir>/eye-ai-ml at the front of the import path so Python
# finds the model modules from the local checkout first.
import sys
from pathlib import Path
sys.path.insert(0, str(Path.home() / repo_dir / "eye-ai-ml"))

In [9]:
# Prerequisites

import json
import os
from eye_ai.eye_ai import EyeAI
import pandas as pd
from pathlib import Path, PurePath
import logging
# import torch

# Emit INFO-and-above records as "<timestamp> - <level> - <message>".
# force=True removes handlers already attached to the root logger, so
# re-running this cell does not stack duplicate handlers.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    force=True,
)

In [10]:

from deriva.core.utils.globus_auth_utils import GlobusNativeLogin
catalog_id = "eye-ai" #@param
host = 'www.eye-ai.org'

# Authenticate against the catalog host through Globus, reusing an existing
# session when there is one.
gnl = GlobusNativeLogin(host=host)
if not gnl.is_logged_in([host]):
    # NOTE(review): no_local_server / no_browser presumably select the
    # copy-paste login flow suited to remote kernels — confirm.
    gnl.login(
        [host],
        no_local_server=True,
        no_browser=True,
        refresh_tokens=True,
        update_bdbag_keychain=True,
    )
    print("Login Successful")
else:
    print("You are already logged in.")

2024-07-01 07:39:04,787 - INFO - Creating client of type <class 'globus_sdk.services.auth.client.native_client.NativeAppAuthClient'> for service "auth"
2024-07-01 07:39:04,788 - INFO - Finished initializing AuthLoginClient. client_id='8ef15ba9-2b4a-469c-a163-7fd910c9d111', type(authorizer)=<class 'globus_sdk.authorizers.base.NullAuthorizer'>


You are already logged in.


Connect to the Eye-AI catalog. Configure it to store data in local cache and working directories, then initialize Eye-AI for the pending execution based on the provided configuration file.

In [11]:
# Variables to configure the rest of the notebook.

cache_dir = '/data'        # Directory in which to cache materialized BDBags for datasets
working_dir = '/data'    # Directory in which to place output files for later upload.

configuration_rid = "2-C94P" # RID of the execution configuration passed to EA.execution_init
# To use different datasets, edit the configuration file so that
# bag_url=["minid: train", "minid: Valid", "minid: test"]


In [12]:
# Create the Eye-AI client for the catalog on `host`, using the cache and
# working directories chosen in the configuration cell.
EA = EyeAI(
    hostname=host,
    catalog_id=catalog_id,
    cache_dir=cache_dir,
    working_dir=working_dir,
)

2024-07-01 07:39:04,843 - INFO - Creating client of type <class 'globus_sdk.services.auth.client.native_client.NativeAppAuthClient'> for service "auth"
2024-07-01 07:39:04,844 - INFO - Finished initializing AuthLoginClient. client_id='8ef15ba9-2b4a-469c-a163-7fd910c9d111', type(authorizer)=<class 'globus_sdk.authorizers.base.NullAuthorizer'>


In [13]:
# @title Initiate an Execution
# Initialise the execution described by `configuration_rid`. The returned
# record is used below for its bag_paths, assets_paths, vocabs and execution_rid.
configuration_records = EA.execution_init(configuration_rid=configuration_rid)
# Show the record as a plain dict (the last expression of a cell is displayed).
configuration_records.model_dump()

2024-07-01 07:39:05,396 - INFO - File [/data/sreenidhi/EyeAI_working/Execution_Metadata/Execution_Config-vgg19_catalog_model_training_LACDHS_van_finetuning_graded_labels_sreenidhi_june_30_2024.json] transfer successful. 1.05 KB transferred. Elapsed time: 0:00:00.000126.
2024-07-01 07:39:05,397 - INFO - Verifying MD5 checksum for downloaded file [/data/sreenidhi/EyeAI_working/Execution_Metadata/Execution_Config-vgg19_catalog_model_training_LACDHS_van_finetuning_graded_labels_sreenidhi_june_30_2024.json]
2024-07-01 07:39:05,414 - INFO - Configuration validation successful!
2024-07-01 07:39:14,690 - INFO - File [/data/sreenidhi/EyeAI_working/Execution_Assets/best_hyperparameters_exluding_no_optic_disc_images_june_24_2024.json] transfer successful. 0.69 KB transferred. Elapsed time: 0:00:00.000076.
2024-07-01 07:39:14,690 - INFO - Verifying SHA256 checksum for downloaded file [/data/sreenidhi/EyeAI_working/Execution_Assets/best_hyperparameters_exluding_no_optic_disc_images_june_24_2024.jso

{'caching_dir': PosixPath('/data'),
 'working_dir': PosixPath('/data/sreenidhi/EyeAI_working'),
 'vocabs': {'Workflow_Type': [{'name': 'VGG19_Catalog_Model_LACDHS_Van_Finetuning',
    'rid': '2-C94T'}],
  'Execution_Asset_Type': [{'name': 'VGG19_Catalog_Model_LACDHS_Van_Finetuning',
    'rid': '2-C94W'}]},
 'execution_rid': '2-C96R',
 'workflow_rid': '2-C8ZM',
 'bag_paths': [PosixPath('/data/2-277G_6aa1a6861eee5a79bce4bf071065355f95a066c2a1ff326089d43048a7e0f185/Dataset_2-277G'),
  PosixPath('/data/2-277J_81c873a311aa6a67cf2eef44bd9056cb19181b299a6e44327ea3553616f18725/Dataset_2-277J'),
  PosixPath('/data/2-36BW_772f62deab4b12b67bf6fa0cd347a095ec28f75aa11c9c9f068e22ee390aec36/Dataset_2-36BW'),
  PosixPath('/data/2-39FY_1d2a0510049e238d0206d75476122ce12750ea9a5da642328afc62d52bd34813/Dataset_2-39FY'),
  PosixPath('/data/2-277M_8c4b855c2752e098580a5bb0d1b63a8cedde4462805fe74cddc912a72fb39963/Dataset_2-277M')],
 'assets_paths': [PosixPath('/data/sreenidhi/EyeAI_working/Execution_Assets/be

In [14]:
# Display the record's repr for inspection.
configuration_records

ConfigurationRecord(caching_dir=PosixPath('/data'), working_dir=PosixPath('/data/sreenidhi/EyeAI_working'), vocabs={'Workflow_Type': [Term(name='VGG19_Catalog_Model_LACDHS_Van_Finetuning', rid='2-C94T')], 'Execution_Asset_Type': [Term(name='VGG19_Catalog_Model_LACDHS_Van_Finetuning', rid='2-C94W')]}, execution_rid='2-C96R', workflow_rid='2-C8ZM', bag_paths=[PosixPath('/data/2-277G_6aa1a6861eee5a79bce4bf071065355f95a066c2a1ff326089d43048a7e0f185/Dataset_2-277G'), PosixPath('/data/2-277J_81c873a311aa6a67cf2eef44bd9056cb19181b299a6e44327ea3553616f18725/Dataset_2-277J'), PosixPath('/data/2-36BW_772f62deab4b12b67bf6fa0cd347a095ec28f75aa11c9c9f068e22ee390aec36/Dataset_2-36BW'), PosixPath('/data/2-39FY_1d2a0510049e238d0206d75476122ce12750ea9a5da642328afc62d52bd34813/Dataset_2-39FY'), PosixPath('/data/2-277M_8c4b855c2752e098580a5bb0d1b63a8cedde4462805fe74cddc912a72fb39963/Dataset_2-277M')], assets_paths=[PosixPath('/data/sreenidhi/EyeAI_working/Execution_Assets/best_hyperparameters_exluding_no

In [15]:
# Read the 'ID' column of the two exclusion CSVs; the resulting lists are
# passed as `exclude_list` when cropping the training and validation splits.
# NOTE(review): indices 1 and 2 depend on the asset order in the execution
# configuration — confirm if the configuration changes.
exclude_train, exclude_valid = (
    pd.read_csv(configuration_records.assets_paths[asset_idx])['ID'].to_list()
    for asset_idx in (1, 2)
)

In [23]:
# @title Data Preprocessing (Filtering Image.csv for just Field_2 Images)
# Raw dataset bags for each split, selected by position in the configured bag list.
# NOTE(review): indices 2/3/4 depend on the bag order in the execution configuration.
train_dir = configuration_records.bag_paths[2] # path to the raw train dataset
validation_dir = configuration_records.bag_paths[3]
test_dir = configuration_records.bag_paths[4]

# Crop each split into its own folder under the working directory. The train
# and validation splits skip the image IDs in their exclusion lists; the test
# split is cropped without an exclusion list.
train_cropped_image_path, train_cropped_csv = EA.create_cropped_images(
    str(train_dir),
    output_dir=f"{EA.working_dir}/train",
    crop_to_eye=True,
    exclude_list=exclude_train,
)
validation_cropped_image_path, validation_cropped_csv = EA.create_cropped_images(
    str(validation_dir),
    output_dir=f"{EA.working_dir}/valid",
    crop_to_eye=True,
    exclude_list=exclude_valid,
)
test_cropped_image_path, test_cropped_csv = EA.create_cropped_images(
    str(test_dir),
    output_dir=f"{EA.working_dir}/test",
    crop_to_eye=True,
)


In [26]:
# # without no optic disc images

import os

def count_files(directory):
    """Return the number of regular files directly inside ``directory``.

    Sub-directories are not counted and the listing is not recursive.
    """
    return sum(
        1
        for entry in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, entry))
    )


def analyze_directory(base_path, main_folders=('train', 'test', 'valid')):
    """Print per-subfolder and total file counts for each dataset split.

    For every name in ``main_folders`` this looks for
    ``<base_path>/<name>/Image_cropped`` and prints the number of files in each
    of its immediate sub-directories, followed by the total for the split.
    Missing folders are reported and skipped.

    Args:
        base_path: Directory that contains the split folders.
        main_folders: Names of the split folders to inspect, in print order.
            Defaults to the three splits used by this notebook.

    Returns:
        None. The report is written to standard output.
    """
    for main_folder in main_folders:
        main_folder_path = os.path.join(base_path, main_folder)
        # isdir (rather than exists) so a stray file with the split's name is
        # reported as missing instead of failing later in os.listdir.
        if not os.path.isdir(main_folder_path):
            print(f"{main_folder} folder not found")
            continue

        print(f"\nAnalyzing {main_folder} folder:")

        image_cropped_path = os.path.join(main_folder_path, 'Image_cropped')
        if not os.path.isdir(image_cropped_path):
            print("Image_cropped folder not found")
            continue

        total_files = 0
        # os.listdir returns entries in arbitrary order; sort so the report is
        # identical from run to run and machine to machine.
        for subfolder in sorted(os.listdir(image_cropped_path)):
            subfolder_path = os.path.join(image_cropped_path, subfolder)
            if os.path.isdir(subfolder_path):
                file_count = count_files(subfolder_path)
                print(f"  {subfolder}: {file_count} files")
                total_files += file_count

        print(f"Total files in {main_folder}: {total_files}")

# Report on the train/, valid/ and test/ folders written by the cropping step.
# They live under the execution working directory, so derive the path from EA
# rather than hard-coding one user's absolute path.
base_path = str(EA.working_dir)
analyze_directory(base_path)




Analyzing train folder:
  2SKC_No_Glaucoma: 1653 files
  2SKA_Suspected_Glaucoma: 1770 files
Total files in train: 3423

Analyzing test folder:
  2SKC_No_Glaucoma: 526 files
  2SKA_Suspected_Glaucoma: 568 files
Total files in test: 1094

Analyzing valid folder:
  2SKC_No_Glaucoma: 423 files
  2SKA_Suspected_Glaucoma: 438 files
Total files in valid: 861


In [27]:

# Folder for this run's output assets, named after the first configured
# Execution_Asset_Type term. It is later passed to the training entry point.
output_path = str(EA.working_dir) + "/Execution_Assets/" + configuration_records.vocabs['Execution_Asset_Type'][0].name
# makedirs with exist_ok=True keeps this cell re-runnable: os.mkdir raised
# FileExistsError on a second run and failed if the parent was missing.
os.makedirs(output_path, exist_ok=True)

In [44]:
# Display the resolved output directory for inspection.
output_path

'/data/sreenidhi/EyeAI_working/Execution_Assets/VGG19_Catalog_Model_LACDHS_Van_Finetuning'

In [45]:
# First downloaded execution asset, converted to a string path; it is handed to
# the training entry point as `best_hyperparameters_json_path`.
# NOTE(review): index 0 depends on the asset order in the execution configuration.
best_hyper_parameters_json_path = str(configuration_records.assets_paths[0])

In [46]:
# Display the hyperparameter file path for inspection.
best_hyper_parameters_json_path

'/data/sreenidhi/EyeAI_working/Execution_Assets/best_hyperparameters_exluding_no_optic_disc_images_june_24_2024.json'

In [51]:
# @title Execute Training algorithm
from eye_ai.models.vgg19_diagnosis_fine_tune_train import main

# Run fine-tuning inside the execution context for this run's execution RID.
# The cropped train/valid/test folders, the output folder and the
# hyperparameter JSON come from the earlier cells; the starting model is the
# asset at index 3 of the configured assets.
# NOTE(review): binding the context manager to `exec` shadows the builtin of
# the same name; rename it once no other cell is known to rely on that name.
with EA.execution(execution_rid=configuration_records.execution_rid) as exec:
    main(
        train_path=train_cropped_image_path,
        valid_path=validation_cropped_image_path,
        test_path=test_cropped_image_path,
        output_path=output_path,
        best_hyperparameters_json_path=best_hyper_parameters_json_path,
        model_name="VGG19_Catalog_LAC_DHS_Cropped_Data_exlcuding_no_Optic_disc_fundus_Trained_model_June_24_2024_Van_Fine_Tuned",
        original_model_path=str(configuration_records.assets_paths[3]),
    )
                    


Found 3423 images belonging to 2 classes.
Found 861 images belonging to 2 classes.
Found 1094 images belonging to 2 classes.
train_generator.class_indices :  {'2SKC_No_Glaucoma': 0, '2SKA_Suspected_Glaucoma': 1}
validation_generator.class_indices :  {'2SKC_No_Glaucoma': 0, '2SKA_Suspected_Glaucoma': 1}
test_generator.class_indices :  {'2SKC_No_Glaucoma': 0, '2SKA_Suspected_Glaucoma': 1}
Layer input_2: trainable = True
Layer vgg19: trainable = True
  Inner Layer input_1: trainable = True
  Inner Layer block1_conv1: trainable = True
  Inner Layer block1_conv2: trainable = True
  Inner Layer block1_pool: trainable = True
  Inner Layer block2_conv1: trainable = True
  Inner Layer block2_conv2: trainable = True
  Inner Layer block2_pool: trainable = True
  Inner Layer block3_conv1: trainable = True
  Inner Layer block3_conv2: trainable = True
  Inner Layer block3_conv3: trainable = True
  Inner Layer block3_conv4: trainable = True
  Inner Layer block3_pool: trainable = True
  Inner Layer bl

2024-07-01 08:49:25,612 - INFO - Test results - [0.4419810175895691, 0.8901255130767822, 0.8184044361114502, 0.8180987238883972]


Model Eval results: [0.4419810175895691, 0.8901255130767822, 0.8184044361114502, 0.8180987238883972]


  saving_api.save_model(
2024-07-01 08:49:25,842 - INFO - VGG19_Catalog_LAC_DHS_Cropped_Data_exlcuding_no_Optic_disc_fundus_Trained_model_June_24_2024_Van_Fine_Tuned Model fine-tuned, Model and training history are saved successfully.


In [52]:
# @title Save Execution Assets (model) and Metadata
# Upload the outputs recorded for this execution RID and keep the call's result.
# NOTE(review): the meaning of the positional `True` is not visible here —
# confirm against execution_upload's signature and prefer passing it by keyword.
uploaded_assets = EA.execution_upload(configuration_records.execution_rid, True)

2024-07-01 08:50:16,314 - INFO - Initializing uploader: GenericUploader v1.7.1 [Python 3.10.13, Linux-5.10.210-201.852.amzn2.x86_64-x86_64-with-glibc2.26]
2024-07-01 08:50:16,316 - INFO - Creating client of type <class 'globus_sdk.services.auth.client.native_client.NativeAppAuthClient'> for service "auth"
2024-07-01 08:50:16,317 - INFO - Finished initializing AuthLoginClient. client_id='8ef15ba9-2b4a-469c-a163-7fd910c9d111', type(authorizer)=<class 'globus_sdk.authorizers.base.NullAuthorizer'>
2024-07-01 08:50:16,353 - INFO - Checking for updated configuration...
2024-07-01 08:50:16,469 - INFO - Updated configuration found.
2024-07-01 08:50:16,471 - INFO - Scanning files in directory [/data/sreenidhi/EyeAI_working/Execution_Assets/VGG19_Catalog_Model_LACDHS_Van_Finetuning]...
2024-07-01 08:50:16,474 - INFO - Including file: [/data/sreenidhi/EyeAI_working/Execution_Assets/VGG19_Catalog_Model_LACDHS_Van_Finetuning/VGG19_Catalog_LAC_DHS_Cropped_Data_exlcuding_no_Optic_disc_fundus_Trained_