In [1]:
import GEOparse
# GEO series accessions to download.
study = "GSE236927"
geo_id_pbmc = "GSE157007"
geo_ids = [study, geo_id_pbmc]

# Fetch each series with GEOparse in the same order as `geo_ids`, so that
# gse_objects[i] always corresponds to geo_ids[i].
gse_objects = [GEOparse.get_GEO(geo=accession) for accession in geo_ids]
gse_circrna, gse_pbmc = gse_objects
import re # Import regex module for parsing

# --- Filter PBMC samples with a usable FI score from GSE236927 ---
#
# Result: `selected_pbmc_samples`, a dict {gsm_id: GSM object} holding every
# sample of the target series that
#   * is identified as PBMC (by source name or title),
#   * has a title of the form "PBMC_<number>" whose number is not excluded,
#   * carries a non-empty, non-placeholder frailty-index characteristic.

# Patterns are compiled once here instead of on every loop iteration.
SAMPLE_NUMBER_RE = re.compile(r'PBMC_(\d+)', re.IGNORECASE)
FI_LABEL_RE = re.compile(r'(frailty index|fi score)\s*:', re.IGNORECASE)
# Lower-cased placeholder strings that mean "no FI score recorded".
FI_MISSING_VALUES = frozenset({'na', 'n/a', 'not available', 'unknown'})


def _first_value(metadata, key):
    """Return the first entry of ``metadata[key]``, or ``''`` if missing/empty.

    Guards against a key that is present but maps to an empty list, which
    would otherwise raise ``IndexError`` on ``[0]``.
    """
    values = metadata.get(key) or ['']
    return values[0]


def _has_fi_score(characteristics):
    """Return True if any characteristic line holds a usable FI score.

    A line qualifies when it is labelled "frailty index:" or "fi score:"
    (case-insensitive) and the text after the first colon is neither empty
    nor one of the ``FI_MISSING_VALUES`` placeholders.
    """
    for entry in characteristics:
        if not FI_LABEL_RE.search(entry):
            continue
        value = entry.split(':', 1)[-1].strip()
        if value and value.lower() not in FI_MISSING_VALUES:
            return True
    return False


# Locate the GSE object for the target series (first match wins).
target_geo_id = "GSE236927"
gse_circrna = next(
    (gse for geo_id, gse in zip(geo_ids, gse_objects) if geo_id == target_geo_id),
    None,
)

# Defined up front so the name exists even when the series is not found.
selected_pbmc_samples = {}

if gse_circrna is None:
    # Best-effort: report and leave `selected_pbmc_samples` empty.
    print(f"Error: {target_geo_id} not found in the provided gse_objects list.")
else:
    # Sample numbers to drop (the N in titles such as "PBMC_N").
    excluded_sample_numbers = {2, 8, 14, 15, 18, 21, 28}

    print(f"--- Filtering samples from {target_geo_id} ---")
    print(f"Excluding sample numbers: {sorted(excluded_sample_numbers)}")

    processed_samples = 0
    skipped_excluded = 0
    skipped_not_pbmc = 0
    skipped_no_fi = 0
    skipped_parsing_error = 0

    for gsm_id, gsm in gse_circrna.gsms.items():
        processed_samples += 1
        try:
            title = _first_value(gsm.metadata, 'title')
            source_name = _first_value(gsm.metadata, 'source_name_ch1')

            # 1. PBMC check. This runs BEFORE the title is parsed: a title
            #    matching "PBMC_<n>" always contains "pbmc", so checking
            #    afterwards could never fail and non-PBMC samples would be
            #    mis-reported as parsing errors.
            if 'pbmc' not in source_name.lower() and 'pbmc' not in title.lower():
                skipped_not_pbmc += 1
                continue

            # 2. Sample number from the title, then the exclusion list.
            number_match = SAMPLE_NUMBER_RE.search(title)
            if number_match is None:
                skipped_parsing_error += 1
                continue
            if int(number_match.group(1)) in excluded_sample_numbers:
                skipped_excluded += 1
                continue

            # 3. A usable FI score must be present.
            if not _has_fi_score(gsm.metadata.get('characteristics_ch1', [])):
                skipped_no_fi += 1
                continue
        except Exception as e:
            # Per-sample boundary: one malformed record must not abort the
            # whole pass, so report it and count it with the parsing errors.
            print(f"Error processing sample {gsm_id}: {e}. Skipping.")
            skipped_parsing_error += 1
            continue

        # All checks passed.
        selected_pbmc_samples[gsm_id] = gsm

    # --- Report Filtering Summary ---
    print(f"\nFiltering complete for {target_geo_id}:")
    print(f"  - Total samples processed: {processed_samples}")
    print(f"  - Skipped (Excluded Number): {skipped_excluded}")
    print(f"  - Skipped (Not PBMC): {skipped_not_pbmc}")
    print(f"  - Skipped (FI Score Unavailable): {skipped_no_fi}")
    print(f"  - Skipped (Parsing/Other Error): {skipped_parsing_error}")
    print(f"  - Final selected PBMC samples: {len(selected_pbmc_samples)}")

    # Sanity check against the expected cohort size (n=88).
    expected_count = 88
    if len(selected_pbmc_samples) == expected_count:
        print(f"Successfully selected the expected {expected_count} samples.")
    else:
        print(f"Warning: Selected {len(selected_pbmc_samples)} samples, but expected {expected_count}.")

# `selected_pbmc_samples` now holds the filtered GSM objects for later cells.
print()  # Blank line to separate this cell's output from what follows


29-Apr-2025 12:54:01 DEBUG utils - Directory ./ already exists. Skipping.
29-Apr-2025 12:54:01 INFO GEOparse - Downloading ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE206nnn/GSE206762/soft/GSE206762_family.soft.gz to ./GSE206762_family.soft.gz
100%|██████████| 3.41k/3.41k [00:00<00:00, 108kB/s]
29-Apr-2025 12:54:02 DEBUG downloader - Size validation passed
29-Apr-2025 12:54:02 DEBUG downloader - Moving /tmp/tmplwv2f77u to /cis/home/iessien1/Documents/injury_atlas/GSE206762_family.soft.gz
29-Apr-2025 12:54:02 DEBUG downloader - Successfully downloaded ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE206nnn/GSE206762/soft/GSE206762_family.soft.gz
29-Apr-2025 12:54:02 INFO GEOparse - Parsing ./GSE206762_family.soft.gz: 
29-Apr-2025 12:54:02 DEBUG GEOparse - DATABASE: GeoMiame
29-Apr-2025 12:54:02 DEBUG GEOparse - SERIES: GSE206762
29-Apr-2025 12:54:02 DEBUG GEOparse - PLATFORM: GPL20795
29-Apr-2025 12:54:02 DEBUG GEOparse - SAMPLE: GSM6261950
29-Apr-2025 12:54:02 DEBUG GEOparse - SAMPLE: GSM6261951


--- Metadata for GSE206762 ---
Title: CircRNA in Frailty
Contact Name: David,,Otaegui
Contributors:
  - David,,Otaegui
  - Leire,,Iparraguirre
  - Ainhoa,,Alberro
  - Itziar,,Vergara
  - Ander,,Matheu

Expression Type: Expression profiling by high throughput sequencing
Overall Design: Frail vs Robust pools analysis

Summary:
This work describes for the first time a different circular RNA (circRNAs) expression pattern between frail and robust individuals. Moreover, the level of some circRNAs is modulated after a physical intervention. These results suggest that they could be used as minimally invasive biomarkers of frailty. Frailty is an intermediate and reversible geriatric syndrome that often precedes dependency. Therefore, its identification is essential to prevent dependency. Diverse molecules have been proposed as biomarkers of frailty, but none of them has reached clinical practice. Recently, circular RNAs have emerged as new non-coding RNAs. Their regulatory role together with th