# 1.) Imports

In [None]:
from google.colab import drive
import os
import pandas as pd
import nibabel as nib

# 2.) Download and Extract Data

In [None]:
# Attach Google Drive to the Colab filesystem at /content/drive
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [None]:
# Download every OASIS cross-sectional disc archive into Drive and unpack it
# next to the archive. A disc whose download or extraction fails is reported
# and skipped; the remaining discs are still processed.
import subprocess

# Define base path
base_path = '/content/drive/MyDrive/BrainAgeRegression/data'
os.makedirs(base_path, exist_ok=True)

# Disc numbers to download and extract (1 through 12)
disc_numbers = list(range(1, 13))

# Base URL; the disc number and '.tar.gz' are appended per archive
base_url = 'https://download.nrg.wustl.edu/data/oasis_cross-sectional_disc'

for i in disc_numbers:
    filename = f'oasis_cross-sectional_disc{i}.tar.gz'
    url = f'{base_url}{i}.tar.gz'
    file_path = os.path.join(base_path, filename)

    # Download. The command is passed as an argument list (no shell quoting
    # involved) and check=True turns a non-zero wget exit status into an
    # exception instead of a misleading success message.
    print(f'📥 Downloading disc {i}...')
    try:
        subprocess.run(['wget', '-q', '-O', file_path, url], check=True)
    except (subprocess.CalledProcessError, OSError) as err:
        print(f'❌ Failed to download {filename}: {err}')
        # A failed transfer can leave an empty or truncated output file
        # behind; remove it so it is not mistaken for a valid archive.
        if os.path.exists(file_path):
            os.remove(file_path)
        continue
    print(f'✅ Downloaded {filename}')

    # Extract. The per-file listing (-v) is omitted; success or failure is
    # reported once per archive instead.
    print(f'📂 Extracting {filename}...')
    try:
        subprocess.run(['tar', '-xzf', file_path, '-C', base_path], check=True)
    except (subprocess.CalledProcessError, OSError) as err:
        print(f'❌ Failed to extract {filename}: {err}')
        continue
    print(f'✅ Extracted disc {i}\n')

# 3.) Convert .hdr + .img into .nii.gz

In [None]:
# Convert each session's atlas-registered Analyze image (.hdr + .img) under
# PROCESSED/MPRAGE/T88_111 into a single compressed NIfTI file named after the
# session folder, written to <base_path>/nifti.
import glob
import os
import nibabel as nib

base_path = '/content/drive/MyDrive/BrainAgeRegression/data'
nifti_path = os.path.join(base_path, 'nifti')
os.makedirs(nifti_path, exist_ok=True)

disc_folders = [f'disc{n}' for n in range(1, 13)]

for disc in disc_folders:
    disc_path = os.path.join(base_path, disc)
    if not os.path.isdir(disc_path):
        # A disc that was never downloaded/extracted should not abort the run.
        print(f'⚠️ Missing disc folder, skipping: {disc_path}')
        continue

    for folder in sorted(os.listdir(disc_path)):
        subj_path = os.path.join(disc_path, folder)
        if not (os.path.isdir(subj_path) and folder.startswith('OAS1_')):
            continue

        # Match any "_mpr_n<k>_" variant rather than only "n4".
        # NOTE(review): OASIS-1 appears to encode the number of averaged scans
        # in this part of the name (n3, n4, ...) — confirm against the OASIS
        # fact sheet. The "masked_gfc" images are not matched by this pattern.
        t88_dir = os.path.join(subj_path, 'PROCESSED', 'MPRAGE', 'T88_111')
        pattern = os.path.join(
            glob.escape(t88_dir),
            f'{glob.escape(folder)}_mpr_n*_anon_111_t88_gfc.hdr',
        )
        hdr_candidates = sorted(glob.glob(pattern))
        if not hdr_candidates:
            print(f'⚠️ No T88_111 gfc header found for {folder}, skipping')
            continue
        hdr_file = hdr_candidates[0]

        # Broad catch is deliberate: one unreadable scan is reported and the
        # loop moves on to the next session.
        try:
            img = nib.load(hdr_file)
            out_file = os.path.join(nifti_path, f'{folder}.nii.gz')
            nib.save(img, out_file)
            print(f'✅ Converted {folder}')
        except Exception as e:
            print(f'❌ Failed to convert {folder}: {e}')


In [None]:
# Dataset root on the mounted Drive
base_path = '/content/drive/MyDrive/BrainAgeRegression/data'
# 'nifti' subfolder of the dataset root
nifti_path = os.path.join(base_path, 'nifti')

In [None]:
# Count only compressed NIfTI volumes; any other entry in the folder is
# ignored so the reported total matches its label.
nifti_files = [
    name for name in os.listdir(nifti_path)
    if name.endswith('.nii.gz')
]
print(f'🧠 Total NIfTI files: {len(nifti_files)}')


# 4.) Load the Cross-Sectional Metadata Spreadsheet (.xlsx) and Match Scans to Metadata

In [None]:
# Keep only the metadata rows whose session has a NIfTI file in nifti_dir,
# then save that subset as CSV.

# Paths
nifti_dir = '/content/drive/MyDrive/BrainAgeRegression/data/nifti'
xlsx_path = '/content/drive/MyDrive/BrainAgeRegression/data/oasis_cross-sectional.xlsx'

# Load Excel metadata
df = pd.read_excel(xlsx_path)
df['ID'] = df['ID'].astype(str).str.strip()

# Extract subject ID from Excel (e.g., "OAS1_0001" from "OAS1_0001_MR1").
# expand=False returns a Series, which is what a single-column assignment needs.
df['SubjectID'] = df['ID'].str.extract(r'(OAS1_\d{4})', expand=False)

# Session IDs from NIfTI filenames (the filename minus the '.nii.gz' suffix)
nifti_suffix = '.nii.gz'
nifti_sessions = {
    f[:-len(nifti_suffix)]
    for f in os.listdir(nifti_dir)
    if f.endswith(nifti_suffix)
}

# Subject IDs that have at least one scan (same meaning as before)
nifti_ids = {session.split('_MR')[0] for session in nifti_sessions}

# Filter metadata on the full session ID. Matching on the subject ID alone
# would keep a row for a session that has no file whenever another session of
# the same subject does.
# NOTE(review): assumes the spreadsheet's ID values equal the session names
# used for the NIfTI files (e.g. "OAS1_0001_MR1") — confirm.
matched_df = df[df['ID'].isin(nifti_sessions)].copy()
print(f'✅ Matched {len(matched_df)} sessions '
      f'({matched_df["SubjectID"].nunique()} subjects)')

# Report scans that have no metadata row so they are not silently dropped
unlabelled = sorted(nifti_sessions - set(df['ID']))
if unlabelled:
    print(f'⚠️ {len(unlabelled)} NIfTI files have no metadata row: {unlabelled}')

# Optional: Save matched metadata
matched_csv_path = '/content/drive/MyDrive/BrainAgeRegression/matched_metadata.csv'
matched_df.to_csv(matched_csv_path, index=False)
print(f'💾 Saved matched metadata to {matched_csv_path}')
