In [None]:
#!/usr/bin/env python3
"""
Longevity Genes Details Fetcher

Fetches additional gene details from the HGNC, Ensembl, and NCBI APIs, plus
aliases and description text scraped from GenAge gene pages, to enrich the
master_longivity_genes.csv file with additional gene information.
"""

import os
import sys
import csv
import requests
import pandas as pd
from typing import Dict, List, Any, Optional, Tuple
import logging
from datetime import datetime
import time
import json
import re
from bs4 import BeautifulSoup

In [None]:
# Configure logging
# Root logging setup: INFO level, each record rendered as
# "<timestamp> - <level name> - <message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# Module logger used by the fetch_*, enrich, save and main functions in this file.
logger = logging.getLogger(__name__)

In [None]:
# Configuration parameters
# Base endpoints of the external services queried by the fetch_* functions.
HGNC_API_URL = "https://rest.genenames.org/fetch/symbol"  # the gene symbol is appended as a path segment
ENSEMBL_API_BASE = "https://rest.ensembl.org"  # "/lookup/symbol/homo_sapiens/<symbol>" is appended
NCBI_EUTILS_BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"  # trailing slash: "esearch.fcgi"/"efetch.fcgi" are appended directly
GENAGE_BASE_URL = "https://genomics.senescence.info"  # "/genes/entry.php?..." is appended
REQUEST_TIMEOUT = 30  # passed as `timeout=` to every requests.get call in this file
RATE_LIMIT_DELAY = 0.34  # NCBI requires <= 3 requests per second
HUMAN_TAXONOMY_ID = 9606  # not referenced by any code visible in this file

In [None]:
# Input and output file paths
# Both are bare file names, so they resolve against the current working directory.
INPUT_CSV = "master_longivity_genes.csv"  # read by read_input_csv()
OUTPUT_CSV = "master_longivity_genes_enriched.csv"  # written by save_enriched_data() and by main() outside test mode

In [None]:
def clean_references_from_text(text: str) -> str:
    """
    Remove bracketed numeric reference citations such as [2210] from text.

    A citation is a pair of square brackets that contains only numbers and
    number ranges, separated by commas or semicolons. All of the following are
    removed: [123], [123, 456], [123-125], [123, 456, 789] and mixed forms such
    as [2210, 2215-2217]. Ranges may use a hyphen or an en dash, and optional
    whitespace is tolerated around every number and separator.

    After the citations are removed, whitespace left in front of punctuation is
    dropped (" ." -> "."), runs of whitespace are collapsed to a single space,
    and the result is stripped.

    Args:
        text (str): Input text that may contain reference citations

    Returns:
        str: Cleaned text with reference citations removed. Falsy input
        (empty string, None) is returned unchanged.
    """
    if not text:
        return text

    # A single citation item: a number, optionally extended to a range ("12-15").
    citation_item = r'\d+(?:\s*[-\u2013]\s*\d+)?'
    # A whole citation: one item followed by any number of separated items.
    # One pattern covers every format, so mixed lists are removed in full
    # instead of being left behind by narrower per-format patterns.
    citation_pattern = (
        r'\[\s*' + citation_item + r'(?:\s*[,;]\s*' + citation_item + r')*\s*\]'
    )
    cleaned_text = re.sub(citation_pattern, '', text)

    # Remove spaces before punctuation marks (e.g., " ." -> ".")
    cleaned_text = re.sub(r'\s+([.,;:!?)])', r'\1', cleaned_text)

    # Collapse whatever whitespace the removed citations left behind
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)

    return cleaned_text.strip()

In [None]:
def read_input_csv() -> pd.DataFrame:
    """
    Load the gene table stored at ``INPUT_CSV``.

    Returns:
        pd.DataFrame: The parsed CSV; the number of rows read is logged.

    Raises:
        Exception: Whatever ``pd.read_csv`` raises is logged as an error and
        then re-raised unchanged.
    """
    try:
        genes_table = pd.read_csv(INPUT_CSV)
        logger.info(f"Read {len(genes_table)} genes from {INPUT_CSV}")
    except Exception as exc:
        logger.error(f"Error reading input CSV: {exc}")
        raise
    return genes_table

In [None]:
def fetch_hgnc_data(gene_symbol: str) -> Dict[str, Any]:
    """
    Look up one gene symbol in the HGNC REST service.

    Sends a GET to ``{HGNC_API_URL}/{gene_symbol}`` asking for JSON and reads
    the first entry of ``response.docs`` in the reply.

    Args:
        gene_symbol (str): Symbol appended to the HGNC fetch URL as-is.

    Returns:
        Dict[str, Any]: Keys ``hgnc_gene_id``, ``gene_symbol_aliases``
        (comma-joined), ``entrez_id``, ``ensembl_gene_id`` and ``chromosome``.
        An empty dict is returned on HTTP 404, on a reply without
        ``response.docs``, on an empty ``docs`` list, or on any exception
        (which is logged as a warning rather than raised).
    """
    try:
        reply = requests.get(
            f"{HGNC_API_URL}/{gene_symbol}",
            headers={
                'Accept': 'application/json',
                'User-Agent': 'Longevity-Genes-Fetcher/1.0'
            },
            timeout=REQUEST_TIMEOUT,
        )

        if reply.status_code == 404:
            logger.warning(f"No HGNC data found for {gene_symbol}")
            return {}

        reply.raise_for_status()
        payload = reply.json()

        # The records live under payload["response"]["docs"]; anything else is unexpected.
        if not ('response' in payload and 'docs' in payload['response']):
            logger.warning(f"Unexpected HGNC response format for {gene_symbol}")
            return {}

        records = payload['response']['docs']
        if not records:
            logger.warning(f"No HGNC docs found for {gene_symbol}")
            return {}

        # Only the first record is used.
        record = records[0]

        # The location field looks like "1p36.33"; keep just the leading
        # chromosome name (1-99, X or Y) of its first whitespace-separated token.
        chromosome = ''
        location = record.get('location', '')
        location_tokens = location.split() if location else []
        if location_tokens:
            leading = re.match(r'^([1-9][0-9]?|X|Y)', location_tokens[0])
            if leading:
                chromosome = leading.group(1)

        alias_symbols = record.get('alias_symbol')
        joined_aliases = ', '.join(alias_symbols) if alias_symbols else ''

        return {
            'hgnc_gene_id': record.get('hgnc_id', ''),
            'gene_symbol_aliases': joined_aliases,
            'entrez_id': record.get('entrez_id', ''),
            'ensembl_gene_id': record.get('ensembl_gene_id', ''),
            'chromosome': chromosome,
        }

    except Exception as exc:
        logger.warning(f"Error fetching HGNC data for {gene_symbol}: {exc}")
        return {}

In [None]:
def fetch_ensembl_data(gene_symbol: str) -> Dict[str, Any]:
    """
    Fetch Ensembl gene details for a human gene symbol.

    Calls ``{ENSEMBL_API_BASE}/lookup/symbol/homo_sapiens/{gene_symbol}`` with
    ``expand=1`` so the reply carries the gene's transcripts and their exons,
    then derives an exon count from them.

    Args:
        gene_symbol (str): Symbol appended to the lookup URL as-is.

    Returns:
        Dict[str, Any]: Keys ``ensembl_geneid``, ``chromosome``, ``assembly``
        (always the literal ``'GRCh38'``), ``gene_type``, ``number_of_exons``,
        ``gene_start_position`` and ``gene_end_position``.
        ``number_of_exons`` is the exon count of the transcript flagged
        ``is_canonical`` when that count is positive, otherwise the largest
        exon count over all transcripts. When no exon information could be
        read it is ``''`` (not ``0``), so that "missing" is spelled the same way
        as every other absent field and is not mistaken for a real count.
        An empty dict is returned on HTTP 404 or on any exception (logged as a
        warning rather than raised).
    """
    try:
        headers = {
            'Content-Type': 'application/json',
            'Accept': 'application/json',
            'User-Agent': 'Longevity-Genes-Fetcher/1.0'
        }

        # Search for gene using symbol with expanded data including transcripts
        url = f"{ENSEMBL_API_BASE}/lookup/symbol/homo_sapiens/{gene_symbol}"
        params = {
            'expand': 1,  # Include transcripts to get exon count and protein info
            'format': 'full'
        }

        response = requests.get(url, headers=headers, params=params, timeout=REQUEST_TIMEOUT)

        if response.status_code == 404:
            logger.warning(f"No Ensembl data found for {gene_symbol}")
            return {}

        response.raise_for_status()
        data = response.json()

        max_exon_count = 0        # largest exon count seen on any transcript
        canonical_exon_count = 0  # exon count of the canonical transcript, if any

        # Transcript parsing is best-effort: a malformed transcript section must
        # not discard the gene-level fields collected below.
        try:
            raw_transcripts = data.get('Transcript')

            # Normalise to (transcript_id, transcript) pairs; the section is
            # accepted either as a dict keyed by id or as a list of transcripts.
            if isinstance(raw_transcripts, dict):
                transcript_pairs = list(raw_transcripts.items())
            elif isinstance(raw_transcripts, list):
                transcript_pairs = [
                    (entry.get('id', ''), entry)
                    for entry in raw_transcripts
                    if isinstance(entry, dict)
                ]
            else:
                transcript_pairs = []

            for transcript_id, transcript in transcript_pairs:
                # Skip entries without an id or without a usable body.
                if not transcript_id or not transcript or not isinstance(transcript, dict):
                    continue

                exons = transcript.get('Exon', [])
                exon_count = len(exons) if isinstance(exons, (dict, list)) else 0

                max_exon_count = max(max_exon_count, exon_count)
                if transcript.get('is_canonical', False):
                    canonical_exon_count = exon_count
        except Exception as e:
            logger.warning(f"Error processing transcript data for {gene_symbol}: {e}")

        # Use canonical exon count if available, otherwise use max
        final_exon_count = canonical_exon_count if canonical_exon_count > 0 else max_exon_count

        ensembl_data = {
            'ensembl_geneid': data.get('id', ''),
            'chromosome': data.get('seq_region_name', ''),
            'assembly': 'GRCh38',  # Ensembl uses GRCh38 as default
            'gene_type': data.get('biotype', ''),  # Gene type/biotype
            # A count of 0 only arises when no exon data was found, so report
            # it as missing instead of writing a misleading 0 to the output.
            'number_of_exons': final_exon_count if final_exon_count > 0 else '',
            'gene_start_position': data.get('start', ''),
            'gene_end_position': data.get('end', ''),
        }

        return ensembl_data

    except Exception as e:
        logger.warning(f"Error fetching Ensembl data for {gene_symbol}: {e}")
        return {}

In [None]:
def fetch_ncbi_data(gene_symbol: str) -> Dict[str, Any]:
    """
    Fetch the NCBI Gene ID (and chromosome, when present) via E-utilities.

    Two requests are made: ``esearch.fcgi`` to resolve the symbol to a Gene ID
    (first hit of ``"<symbol>[gene name] AND human[orgn]"``), then
    ``efetch.fcgi`` to download that record as XML. Every request is followed
    by a ``RATE_LIMIT_DELAY`` pause, including requests that fail and the
    early-return path where no ID is found, so that a run of failing lookups
    cannot exceed the NCBI request-rate limit.

    Args:
        gene_symbol (str): Symbol inserted into the esearch term.

    Returns:
        Dict[str, Any]: Keys ``ncbi_gene_id`` and ``chromosome`` (``''`` when
        the XML has no ``<Chromosome>`` element). An empty dict is returned
        when the search yields no ID or on any exception (logged as a warning
        rather than raised).
    """
    def _paced_get(url: str, params: Dict[str, Any]):
        """Issue one E-utilities GET and always pause afterwards, even on failure."""
        try:
            response = requests.get(url, params=params, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
            return response
        finally:
            time.sleep(RATE_LIMIT_DELAY)  # Rate limiting

    try:
        # Step 1: Search for gene ID
        search_response = _paced_get(
            f"{NCBI_EUTILS_BASE}esearch.fcgi",
            {
                'db': 'gene',
                'term': f"{gene_symbol}[gene name] AND human[orgn]",
                'retmode': 'json',
                'retmax': 1
            },
        )
        search_data = search_response.json()

        id_list = search_data.get('esearchresult', {}).get('idlist', [])

        if not id_list:
            logger.warning(f"No NCBI gene ID found for {gene_symbol}")
            return {}

        gene_id = id_list[0]

        # Step 2: Fetch gene details
        fetch_response = _paced_get(
            f"{NCBI_EUTILS_BASE}efetch.fcgi",
            {
                'db': 'gene',
                'id': gene_id,
                'retmode': 'xml'
            },
        )

        # Lightweight text extraction of the first <Chromosome>...</Chromosome>
        # element rather than a full XML parse.
        # NOTE(review): confirm that the efetch XML for db=gene actually carries
        # a <Chromosome> element; if it does not, this always yields ''.
        xml_content = fetch_response.text
        chromosome = ''
        chromosome_match = re.search(r'<Chromosome>(.*?)</Chromosome>', xml_content, re.DOTALL)
        if chromosome_match:
            chromosome = chromosome_match.group(1).strip()

        return {
            'ncbi_gene_id': gene_id,
            'chromosome': chromosome
        }

    except Exception as e:
        logger.warning(f"Error fetching NCBI data for {gene_symbol}: {e}")
        return {}

In [None]:
def _split_alias_candidates(raw_text: str, gene_symbol: str) -> List[str]:
    """Split a delimited alias string and keep pieces that are longer than one
    character and differ (case-insensitively) from the gene symbol."""
    kept = []
    for piece in re.split(r'[,;|/\n]', raw_text):
        candidate = piece.strip()
        if candidate and candidate.lower() != gene_symbol.lower() and len(candidate) > 1:
            kept.append(candidate)
    return kept


def _extract_genage_description(page_lines: List[str]) -> str:
    """Return the text that follows the first line mentioning "description",
    collapsed to single spaces, or "" when nothing usable follows it."""
    section_markers = ['cytogenetic', 'protein information', 'gene ontology', 'protein interactions', 'homologs']

    for position, line in enumerate(page_lines):
        if not ('description' in line.lower() and position < len(page_lines) - 1):
            continue

        collected = []
        for raw_line in page_lines[position + 1:]:
            candidate = raw_line.strip()
            if not candidate:
                continue
            # A short line that is all upper-case or names a known later
            # section is treated as the next heading: stop collecting.
            if len(candidate) < 100 and (
                candidate.isupper()
                or any(marker in candidate.lower() for marker in section_markers)
            ):
                break
            # Very short lines are skipped.
            if len(candidate) > 10:
                collected.append(candidate)
            # Stop once enough text has been gathered.
            if len('\n'.join(collected)) > 10000:
                break

        if not collected:
            return ""

        description = re.sub(r'\s+', ' ', '\n'.join(collected).strip())
        if len(description) > 10000:
            description = description[:10000] + "..."
        return description

    return ""


def fetch_genage_data(gene_symbol: str) -> Dict[str, Any]:
    """
    Scrape the GenAge gene page for aliases and a description.

    Two entry URLs are tried in order (``?hgnc=`` first, then ``?symbol=``).
    A page is ignored when the status is not 200, when its text contains
    "No query selected", or when its text is shorter than 5000 characters.
    Aliases are gathered by four passes over the page (alias headings, table
    rows, regex patterns on the text, and the line after the first line
    mentioning "aliases"), then de-duplicated case-insensitively with
    placeholder terms removed.

    Args:
        gene_symbol (str): Symbol inserted into the GenAge query string as-is.

    Returns:
        Dict[str, Any]: May contain ``genage_aliases`` (comma-joined),
        ``genage_description`` (reference citations removed) and
        ``genage_relevance`` (True when the page text mentions ageing, aging,
        longevity or senescence). The first URL that yields any of these wins.
        An empty dict is returned when neither URL yields data; errors are
        logged at debug level and never raised.
    """
    try:
        request_headers = {
            'User-Agent': 'Longevity-Genes-Fetcher/1.0'
        }

        # ?hgnc=SYMBOL is tried first, ?symbol=SYMBOL is the fallback.
        candidate_urls = [
            f"{GENAGE_BASE_URL}/genes/entry.php?hgnc={gene_symbol}",
            f"{GENAGE_BASE_URL}/genes/entry.php?symbol={gene_symbol}",
        ]

        for url in candidate_urls:
            try:
                response = requests.get(url, headers=request_headers, timeout=REQUEST_TIMEOUT)
                if response.status_code != 200:
                    continue

                soup = BeautifulSoup(response.text, 'html.parser')
                page_text = soup.get_text()

                # Reject the "No query selected" page and pages with too little text.
                if "No query selected" in page_text or len(page_text) < 5000:
                    continue

                page_lines = page_text.split('\n')
                aliases = []

                # Pass 1: an "Aliases"/"Alias" heading followed by sibling blocks.
                # Only the first matching heading is examined.
                for heading in soup.find_all(['h2', 'h3', 'h4']):
                    if heading.get_text().strip().lower() not in ['aliases', 'alias']:
                        continue
                    sibling = heading.find_next_sibling()
                    while sibling:
                        if sibling.name in ['p', 'div', 'td']:
                            alias_text = sibling.get_text().strip()
                            if alias_text and alias_text.lower() != gene_symbol.lower():
                                aliases.extend(_split_alias_candidates(alias_text, gene_symbol))
                        elif sibling.name in ['h2', 'h3', 'h4', 'table']:
                            # The next heading or table ends the alias section.
                            break
                        sibling = sibling.find_next_sibling()
                    break

                # Pass 2: table rows whose first cell names an alias-like field.
                alias_field_terms = ['alias', 'aliases', 'synonym', 'alternative', 'other names']
                for table in soup.find_all('table'):
                    for row in table.find_all('tr'):
                        cells = row.find_all(['td', 'th'])
                        if len(cells) < 2:
                            continue
                        key = cells[0].get_text().strip().lower()
                        value = cells[1].get_text().strip()
                        if not any(term in key for term in alias_field_terms):
                            continue
                        if value and value.lower() != gene_symbol.lower():
                            aliases.extend(_split_alias_candidates(value, gene_symbol))

                # Pass 3: regex patterns applied to the flattened page text.
                alias_patterns = [
                    r'Aliases\s*\n\s*([^\n]+)',  # "Aliases" followed by content on the next line
                    r'Aliases[:\s]*([^\n\r]+)',  # "Aliases:" format
                    r'HGNC symbol\s*\n\s*([^\n]+)\s*\n\s*Aliases\s*\n\s*([^\n]+)',  # multi-line pattern
                ]
                for pattern in alias_patterns:
                    for match in re.findall(pattern, page_text, re.IGNORECASE | re.MULTILINE):
                        if isinstance(match, tuple):
                            # Several capture groups: use each non-blank group.
                            fragments = [group for group in match if group.strip()]
                        else:
                            fragments = [match]
                        for fragment in fragments:
                            # Non-breaking spaces (\xa0) are normalised before splitting.
                            normalised = fragment.replace('\xa0', ' ').strip()
                            aliases.extend(_split_alias_candidates(normalised, gene_symbol))

                # Pass 4: the line right after the first line mentioning "aliases".
                for position, line in enumerate(page_lines):
                    if 'aliases' in line.lower() and position < len(page_lines) - 1:
                        next_line = page_lines[position + 1].strip().replace('\xa0', ' ').strip()
                        if next_line and next_line.lower() != gene_symbol.lower():
                            aliases.extend(_split_alias_candidates(next_line, gene_symbol))
                        break

                # De-duplicate case-insensitively (first spelling wins) and
                # drop generic placeholder text.
                exclude_terms = {'common name', 'n/a', 'none', 'unknown', 'not available', 'na'}
                unique_aliases = []
                seen = set()
                for alias in aliases:
                    trimmed = alias.strip()
                    folded = trimmed.lower()
                    if not trimmed or folded in seen or folded in exclude_terms:
                        continue
                    unique_aliases.append(trimmed)
                    seen.add(folded)

                genage_data = {}
                if unique_aliases:
                    genage_data['genage_aliases'] = ', '.join(unique_aliases)

                # Description text, with reference citations removed.
                description_text = _extract_genage_description(page_lines)
                if description_text:
                    genage_data['genage_description'] = clean_references_from_text(description_text)

                # Flag pages whose text mentions ageing-related vocabulary.
                lowered_page = page_text.lower()
                if any(term in lowered_page for term in ['ageing', 'aging', 'longevity', 'senescence']):
                    genage_data['genage_relevance'] = True

                # The first URL that produced anything wins.
                if genage_data:
                    return genage_data

            except Exception as e:
                logger.debug(f"Error accessing GenAge URL {url} for {gene_symbol}: {e}")
                continue

        return {}

    except Exception as e:
        logger.debug(f"Error fetching GenAge data for {gene_symbol}: {e}")
        return {}

In [None]:
def enrich_gene_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Enrich every gene in ``df`` with data from HGNC, Ensembl, NCBI and GenAge.

    For each row the four fetchers are called in that order and their results
    merged into one record:

    * aliases and the HGNC ID come from HGNC; GenAge aliases are appended and
      the combined list is de-duplicated case-insensitively, with the gene
      symbol itself and generic placeholder terms removed;
    * the chromosome is taken from HGNC, else Ensembl, else NCBI;
    * gene type, exon count, coordinates and assembly come from Ensembl;
    * the NCBI Gene ID comes from NCBI;
    * the description comes from GenAge.

    Fields that no source provides stay as empty strings (``assembly`` defaults
    to ``'GRCh38'``). ``created_at`` and ``updated_at`` share one timestamp per
    record. Progress is reported by row position, so the index of ``df`` may be
    of any type and need not be contiguous.

    Args:
        df (pd.DataFrame): Must contain ``gene_symbol`` and ``gene_name`` columns.

    Returns:
        pd.DataFrame: One row per input row, with the columns in the fixed
        order listed in ``column_order`` below.

    Raises:
        KeyError: If ``gene_symbol`` or ``gene_name`` is missing from ``df``.
    """
    total_genes = len(df)
    logger.info(f"Starting to enrich {total_genes} genes with API data")

    # Generic terms that must never be reported as aliases
    exclude_terms = {'common name', 'n/a', 'none', 'unknown', 'not available', 'na'}

    enriched_records = []

    # Count rows by position: the index label is not a reliable row number
    # (it may be non-contiguous after filtering, or not an integer at all).
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        gene_symbol = row['gene_symbol']
        gene_name = row['gene_name']

        logger.info(f"Processing {position}/{total_genes}: {gene_symbol}")

        # One timestamp per record so created_at and updated_at agree exactly
        timestamp = datetime.now().isoformat()

        # Initialize the enriched record with original data and new columns
        enriched_record = {
            'gene_symbol': gene_symbol,
            'gene_name': gene_name,
            'gene_symbol_aliases': '',
            'hgnc_gene_id': '',
            'ncbi_gene_id': '',
            'ensembl_geneid': '',
            'chromosome': '',
            'assembly': 'GRCh38',  # Default assembly
            'gene_type': '',
            'number_of_exons': '',
            'gene_start_position': '',
            'gene_end_position': '',
            'description': '',
            'created_at': timestamp,
            'updated_at': timestamp
        }

        # Fetch data from HGNC
        hgnc_data = fetch_hgnc_data(gene_symbol)
        if hgnc_data:
            enriched_record.update({
                'gene_symbol_aliases': hgnc_data.get('gene_symbol_aliases', ''),
                'hgnc_gene_id': hgnc_data.get('hgnc_gene_id', ''),
            })
            # Use HGNC chromosome if available
            if hgnc_data.get('chromosome'):
                enriched_record['chromosome'] = hgnc_data['chromosome']

        # Fetch data from Ensembl
        ensembl_data = fetch_ensembl_data(gene_symbol)
        if ensembl_data:
            enriched_record.update({
                'ensembl_geneid': ensembl_data.get('ensembl_geneid', ''),
                'assembly': ensembl_data.get('assembly', 'GRCh38'),
                'gene_type': ensembl_data.get('gene_type', ''),
                'number_of_exons': ensembl_data.get('number_of_exons', ''),
                'gene_start_position': ensembl_data.get('gene_start_position', ''),
                'gene_end_position': ensembl_data.get('gene_end_position', ''),
            })
            # Use Ensembl chromosome if HGNC didn't provide one
            if not enriched_record['chromosome'] and ensembl_data.get('chromosome'):
                enriched_record['chromosome'] = ensembl_data['chromosome']

        # Fetch data from NCBI
        ncbi_data = fetch_ncbi_data(gene_symbol)
        if ncbi_data:
            enriched_record['ncbi_gene_id'] = ncbi_data.get('ncbi_gene_id', '')
            # Use NCBI chromosome if neither HGNC nor Ensembl provided one
            if not enriched_record['chromosome'] and ncbi_data.get('chromosome'):
                enriched_record['chromosome'] = ncbi_data['chromosome']

        # Fetch data from GenAge to get additional aliases and description
        genage_data = fetch_genage_data(gene_symbol)
        if genage_data:
            genage_aliases = genage_data.get('genage_aliases', '')
            if genage_aliases:
                # Combine HGNC aliases (first) with GenAge aliases
                hgnc_aliases = enriched_record.get('gene_symbol_aliases', '')
                all_aliases = [
                    alias.strip()
                    for source in (hgnc_aliases, genage_aliases)
                    for alias in source.split(',')
                    if alias.strip()
                ]

                # Remove duplicates, the original gene symbol, and generic placeholder text
                unique_aliases = []
                seen = set()
                for alias in all_aliases:
                    folded = alias.lower()
                    if (folded != gene_symbol.lower() and
                            folded not in seen and
                            folded not in exclude_terms):
                        unique_aliases.append(alias)
                        seen.add(folded)

                # Only overwrite the HGNC aliases when something survived the filter
                if unique_aliases:
                    enriched_record['gene_symbol_aliases'] = ', '.join(unique_aliases)

            # Handle GenAge description (already cleaned of references)
            if genage_data.get('genage_description'):
                enriched_record['description'] = genage_data.get('genage_description', '')

        enriched_records.append(enriched_record)

        # Add delay between requests to be respectful to APIs
        time.sleep(0.1)

        # Progress update every 10 genes
        if position % 10 == 0:
            logger.info(f"Processed {position}/{total_genes} genes")

    # Build the frame once from the collected records
    enriched_df = pd.DataFrame(enriched_records)

    # Reorder columns to match requested order
    column_order = [
        'gene_symbol',
        'gene_symbol_aliases',
        'gene_name',
        'hgnc_gene_id',
        'ncbi_gene_id',
        'ensembl_geneid',
        'chromosome',
        'assembly',
        'gene_type',
        'number_of_exons',
        'gene_start_position',
        'gene_end_position',
        'description',
        'created_at',
        'updated_at'
    ]

    # Ensure all columns exist (even for an empty input) and reorder
    enriched_df = enriched_df.reindex(columns=column_order)
    logger.info(f"Enriched {len(enriched_df)} genes with API data")

    return enriched_df

In [None]:
def save_enriched_data(df: pd.DataFrame) -> None:
    """
    Write ``df`` to ``OUTPUT_CSV`` (without the index) and log coverage counts.

    The summary reports the total row count and, for the HGNC ID, Ensembl ID,
    NCBI ID and chromosome columns, how many rows hold something other than an
    empty string.

    Args:
        df (pd.DataFrame): Enriched gene table containing those four columns.

    Raises:
        Exception: Any error from writing or summarising is logged as an error
        and then re-raised unchanged.
    """
    def filled(column: str) -> int:
        """Number of rows whose value in ``column`` is not the empty string."""
        return len(df[df[column] != ''])

    try:
        df.to_csv(OUTPUT_CSV, index=False)
        logger.info(f"Saved enriched data to {OUTPUT_CSV}")

        # Summary statistics
        logger.info("Enrichment Summary:")
        logger.info(f"  Total genes: {len(df)}")
        logger.info(f"  Genes with HGNC data: {filled('hgnc_gene_id')}")
        logger.info(f"  Genes with Ensembl data: {filled('ensembl_geneid')}")
        logger.info(f"  Genes with NCBI data: {filled('ncbi_gene_id')}")
        logger.info(f"  Genes with chromosome info: {filled('chromosome')}")

    except Exception as exc:
        logger.error(f"Error saving enriched data: {exc}")
        raise

In [None]:
def main(test_mode: bool = False):
    """
    Run the whole enrichment: read the input CSV, enrich it, write the result.

    Args:
        test_mode (bool): When True only the first 5 genes are processed and
            the result goes to "master_longivity_genes_enriched_test.csv"
            instead of ``OUTPUT_CSV``.

    Returns:
        None. Returns early, without writing anything, when the input is empty.

    Any exception is logged as an error and the process is terminated with
    ``sys.exit(1)``.
    """
    try:
        if not test_mode:
            logger.info("Starting longevity genes details enrichment")
            output_file = OUTPUT_CSV
        else:
            logger.info("Starting longevity genes details enrichment (TEST MODE - first 5 genes only)")
            output_file = "master_longivity_genes_enriched_test.csv"

        genes = read_input_csv()
        if genes.empty:
            logger.warning("Input CSV is empty. Exiting.")
            return

        # Test mode works on a 5-row sample of the input.
        if test_mode:
            genes = genes.head(5)
            logger.info("TEST MODE: Processing only first 5 genes")

        enriched_df = enrich_gene_data(genes)

        enriched_df.to_csv(output_file, index=False)
        logger.info(f"Saved enriched data to {output_file}")

        def filled(column: str) -> int:
            """Number of rows whose value in ``column`` is not the empty string."""
            return len(enriched_df[enriched_df[column] != ''])

        # A gene has position info only when both coordinates are present.
        has_both_positions = (
            (enriched_df['gene_start_position'] != '')
            & (enriched_df['gene_end_position'] != '')
        )

        # Summary statistics
        logger.info("Enrichment Summary:")
        logger.info(f"  Total genes: {len(enriched_df)}")
        logger.info(f"  Genes with HGNC data: {filled('hgnc_gene_id')}")
        logger.info(f"  Genes with Ensembl data: {filled('ensembl_geneid')}")
        logger.info(f"  Genes with NCBI data: {filled('ncbi_gene_id')}")
        logger.info(f"  Genes with chromosome info: {filled('chromosome')}")
        logger.info(f"  Genes with gene type: {filled('gene_type')}")
        logger.info(f"  Genes with exon count: {filled('number_of_exons')}")
        logger.info(f"  Genes with position info: {len(enriched_df[has_both_positions])}")
        logger.info(f"  Genes with description: {filled('description')}")

        logger.info("Longevity genes details enrichment completed successfully")

    except Exception as exc:
        logger.error(f"Enrichment failed: {exc}")
        sys.exit(1)

In [None]:
if __name__ == "__main__":
    import sys
    # Test mode is requested only when the first command-line argument is
    # exactly "--test"; the slice is empty when no argument was given.
    test_mode = sys.argv[1:2] == ["--test"]
    main(test_mode=test_mode)