In [1]:
# --------------------------
# Load Libraries
# --------------------------

# Standard library
import argparse
import ast
import gzip
import os
import sys
from pathlib import Path
from typing import Dict, List, Optional, Tuple

# Third-party
import pandas as pd
import session_info
from pyhere import here


# --------------------------
# Parsing the file data
# --------------------------

def open_vcf_gz(path: str):
    """Open a VCF file for text-mode reading, with or without gzip compression.

    The decision is made purely on the file name: a path whose string form
    ends in ``.gz`` is opened through ``gzip`` in text mode, anything else is
    opened as a regular text file. The caller is responsible for closing the
    returned handle (it can be used as a context manager).
    """
    is_gzipped = str(path).endswith('.gz')
    if is_gzipped:
        return gzip.open(path, 'rt')
    return open(path, 'r')

def parse_header_and_ancestries(path) -> Tuple[Optional[List[str]], Optional[List[str]]]:
    """Read a VCF header and extract the ancestry labels and sample names.

    The file is scanned from the top and reading stops at the ``#CHROM``
    column-header line, so no variant records are consumed.

    Parameters
    ----------
    path : str or path-like
        VCF file to read; handed to ``open_vcf_gz``, so both plain-text and
        ``.gz`` files are accepted.

    Returns
    -------
    ancestries : list or None
        The Python literal found after the first ``:`` on a ``##`` meta line
        containing the word ``Ancestries`` (e.g. ``['CEU', 'PUR', 'YRI']``).
        If several such lines exist the last one wins. ``None`` when no such
        line exists or its value cannot be parsed.
    sample_names : list of str or None
        Column names of the ``#CHROM`` line from the first sample column
        onward. ``None`` when the file has no ``#CHROM`` line.
    """
    ancestries = None
    sample_names = None
    with open_vcf_gz(path) as handle:
        for line in handle:
            if line.startswith('##'):
                if 'Ancestries' in line:
                    try:
                        # literal_eval only accepts Python literals, so the
                        # header text is never executed.
                        ancestries = ast.literal_eval(line.split(':', 1)[1].strip())
                    except Exception:
                        # Best effort: a malformed meta line means "unknown".
                        ancestries = None
                continue
            if line.startswith('#CHROM'):
                columns = line.strip().split('\t')
                if 'Sample_1' in columns:
                    start_idx = columns.index('Sample_1')
                elif 'FORMAT' in columns:
                    # Standard VCF layout: sample columns follow FORMAT, so
                    # fixed fields (ID, REF, ...) are not reported as samples.
                    start_idx = columns.index('FORMAT') + 1
                else:
                    # Last resort: historical default slicing.
                    start_idx = 2
                sample_names = columns[start_idx:]
                break
    return ancestries, sample_names

def parse_ancestry_token(token: str) -> List[int]:
    """Convert one per-sample ancestry field into a list of integer codes.

    Only the part of ``token`` before the first ``:`` is considered; it is
    split on ``|`` and every piece is converted with ``int``.

    Returns an empty list when ``token`` is ``None``, is the single character
    ``'.'``, or contains any piece that is not a valid integer.
    """
    if token is None:
        return []
    if token == '.':
        return []

    # Drop everything from the first ':' onward.
    ancestry_field = token.partition(':')[0]

    codes: List[int] = []
    for piece in ancestry_field.split('|'):
        try:
            codes.append(int(piece))
        except ValueError:
            # One unparsable piece marks the whole token as missing.
            return []
    return codes


In [4]:
# Print the path returned by pyhere's here() as a sanity check of where the
# notebook is anchored.
print(here())

.
/gpfs/projects/p32505/users/manuel/rfmix_reader-benchmarking


In [29]:
# Path to the VCF to inspect (gzipped or plain text), relative to the
# notebook's working directory.
vcf_path = "../_m/chr22.vcf.gz"

# Read only the header: ancestry labels from the '##' meta lines and sample
# names from the '#CHROM' line. Either value is None if the corresponding
# header line is not found.
ancestries, sample_names = parse_header_and_ancestries(vcf_path)

# NOTE(review): the second print emits every sample name; consider showing
# len(sample_names) and a short slice instead.
print("Ancestries:", ancestries)
print("Samples:", sample_names)


Ancestries: ['CEU', 'PUR', 'YRI']
Samples: ['Sample_1', 'Sample_2', 'Sample_3', 'Sample_4', 'Sample_5', 'Sample_6', 'Sample_7', 'Sample_8', 'Sample_9', 'Sample_10', 'Sample_11', 'Sample_12', 'Sample_13', 'Sample_14', 'Sample_15', 'Sample_16', 'Sample_17', 'Sample_18', 'Sample_19', 'Sample_20', 'Sample_21', 'Sample_22', 'Sample_23', 'Sample_24', 'Sample_25', 'Sample_26', 'Sample_27', 'Sample_28', 'Sample_29', 'Sample_30', 'Sample_31', 'Sample_32', 'Sample_33', 'Sample_34', 'Sample_35', 'Sample_36', 'Sample_37', 'Sample_38', 'Sample_39', 'Sample_40', 'Sample_41', 'Sample_42', 'Sample_43', 'Sample_44', 'Sample_45', 'Sample_46', 'Sample_47', 'Sample_48', 'Sample_49', 'Sample_50', 'Sample_51', 'Sample_52', 'Sample_53', 'Sample_54', 'Sample_55', 'Sample_56', 'Sample_57', 'Sample_58', 'Sample_59', 'Sample_60', 'Sample_61', 'Sample_62', 'Sample_63', 'Sample_64', 'Sample_65', 'Sample_66', 'Sample_67', 'Sample_68', 'Sample_69', 'Sample_70', 'Sample_71', 'Sample_72', 'Sample_73', 'Sample_74', 'Sa

In [8]:
# Scratch check that open_vcf_gz returns a text-mode handle for vcf_path.
# NOTE(review): the handle is never closed in the visible cells; prefer
# `with open_vcf_gz(vcf_path) as fh: ...`.
# NOTE(review): the recorded output names a different file than the vcf_path
# assigned in the cell above, and this cell's execution count is lower, so the
# output looks stale - re-run with Restart & Run All.
test = open_vcf_gz(vcf_path)

test

<_io.TextIOWrapper name='../_m/chr11.vcf.gz' encoding='UTF-8'>