In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import re
from collections import defaultdict
from scipy import stats
import math
import sys

In [None]:
# Path configuration for the whole notebook. Every path is derived from the two
# absolute roots below, so these are the only lines to edit on another machine.
projectDir = Path("/home/_shared/jscliu/project/2025/Flagship/analysis/secondary/recessive")
referenceDir = Path("/home/_shared/jscliu/project/2025/Flagship/reference")
dataDir = projectDir/"05.gene_level/data"
summaryDir = projectDir/"05.gene_level/summary"

# Reference
hg38_gtf = Path("/home/_shared/database/reference/Homo_sapiens.GRCh38.111.gtf.gz")
gene_panel = referenceDir/"hkgi_recessive.ext200bp.bed"
sample_info = dataDir/"recessive_founder_info.csv"
acmg_tiers_csv = dataDir/"acmg_genes.csv"
rm_parent = dataDir/"exclude_parents.txt"
acmg_genes_hkgp_naming = dataDir/"acmg_gene_list.hkgp_naming.txt"
mackenzies_genes_hkgp_naming = dataDir/"mackenzies_gene_list.hkgp_naming.txt"
hkgpT3_genes_hkgp_naming = dataDir/"HKGP-tier3_gene_list.hkgp_naming.txt"
sample_trio = dataDir/"sample_trios.chinese.2024-11-12.csv"

# Input
var_csv = projectDir/"03.filtering/summary/recessive.var.filtered.csv"
hkgi_HBA_sv_csv = projectDir/"04b.HBA/summary/HBA_per_sample_result.csv"
hkgi_SMN_sv_csv = projectDir/"04a.SMN/summary/filtered_smn_output.tsv"
sv_csv = projectDir/"04c.sv/summary/SF_recessive_manual_curated.csv"

# gnomAD v4 tables: per-gene carrier rates (read first), X-linked genes
# (appended to the first table later), and the per-ancestry sample counts.
gnomad_gene_csv = dataDir/"gnomADv4_suppTable6.csv"
gnomad_xgene_csv = dataDir/"gnomADv4_suppTable7.csv"
gnomad_ancestry_landscape = dataDir/"gnomAD_v4_ethnicity_landscape.csv"

# List of variants to be manually removed
rm_var_csv = dataDir/"recessive.rm_var_manual.csv"

# Output
recessive_final_csv = summaryDir/"recessive.cross_studies_comparison.csv"

In [None]:
def get_panel_genes(txt):
    """Read a gene-panel text file with one gene symbol per line.

    Parameters
    ----------
    txt : str or path-like
        Path of the text file to read.

    Returns
    -------
    list of str
        Gene symbols in file order, stripped of surrounding whitespace.
        Blank lines are skipped so a trailing newline never yields an
        empty-string "gene".
    """
    with open(txt, 'r') as f:
        stripped_lines = (line.strip() for line in f)
        return [symbol for symbol in stripped_lines if symbol]
# Load the three gene-panel lists (one symbol per line) as plain Python lists.
acmg_panel_genes = get_panel_genes(acmg_genes_hkgp_naming)
hkgi_panel_genes = get_panel_genes(hkgpT3_genes_hkgp_naming)
mackenzies_panel_genes = get_panel_genes(mackenzies_genes_hkgp_naming)

## Cohort count and sex

In [None]:
# Founder table; the first CSV column becomes the index and is used later as the sample ID.
founder_info_df = pd.read_csv(sample_info, index_col=[0])

In [None]:
# Per-founder inferred sex, plus the cohort sizes used as denominators later on.
sex_by_founder = founder_info_df['inferred_sex']
founder_sex_d = sex_by_founder.to_dict()
founder_cnt = len(founder_info_df)
founder_male_cnt = int((sex_by_founder == 'Male').sum())
founder_female_cnt = int((sex_by_founder == 'Female').sum())

## gnomad_sample_size

In [None]:
gnomad_ancestry_df = pd.read_csv(gnomad_ancestry_landscape, index_col=[0])
# Keep one row per ancestry label (the index). De-duplicating on the index rather
# than on the count values guarantees that two ancestries which happen to share
# the same count both survive; keep='last' mirrors dict "last key wins" semantics.
ancestry_counts = gnomad_ancestry_df.loc[
    ~gnomad_ancestry_df.index.duplicated(keep='last'), 'Ancestry_count'
]
# Counts may be formatted with thousands separators ("1,234"); str() also covers
# the case where pandas already parsed the column as integers.
gnomad_sample_size_d = { a: int(str(c).replace(",", "")) for a, c in ancestry_counts.items() }

## Gene panels

In [None]:
# Handle gene panels
panel_columns = ['#chr', 'start', 'end', 'gene', 'score', 'strand']
gene_df = pd.read_table(gene_panel, comment='#', names=panel_columns)

# Panel entries containing "aka" are split on ",aka": the first part is kept as the
# gene symbol and the second part is recorded in gene_map_d as a name that maps to it.
gene_map_d = {}
gene_list = []
for raw_name in gene_df['gene'].tolist():
    if "aka" in raw_name:
        name_parts = re.split(",aka", raw_name)
        gene_map_d[name_parts[1]] = name_parts[0]
        gene_list.append(name_parts[0])
    else:
        gene_list.append(raw_name)

# Gene -> chromosome lookup keyed by the kept symbol; plain entries override
# alias-derived ones on a key clash because they are merged last.
raw_gene_chr_d:dict = gene_df.set_index("gene").to_dict()['#chr']
aliased_chr_d = {}
plain_chr_d = {}
for raw_name, chrom in raw_gene_chr_d.items():
    if 'aka' in raw_name:
        aliased_chr_d[re.split(',aka', raw_name)[0]] = chrom
    else:
        plain_chr_d[raw_name] = chrom
gene_chr_d = {**aliased_chr_d, **plain_chr_d}
gene_chr_d['HBA1/HBA2'] = 'chr16'

## hg38_gtf

In [None]:
# Column names for the nine GTF fields; lines starting with '#' are skipped.
gtf_cols = [
    "seqname",
    "source",
    "feature",
    "start",
    "end",
    "score",
    "strand",
    "frame",
    "attribute",
]
hg38_df = pd.read_table(hg38_gtf, comment='#', names=gtf_cols)
# Only gene-level records are needed for the gene -> chromosome lookup.
hg38_df = hg38_df[hg38_df['feature'] == 'gene'].copy()

In [None]:
def extract_gene(attribute):
    """Extract the gene symbol from a GTF attribute string.

    Parameters
    ----------
    attribute : str
        The attribute field of a GTF record, e.g.
        'gene_id "ENSG..."; gene_name "CFTR"; ...'.

    Returns
    -------
    str or float
        The quoted value following ``gene_name``, or ``np.nan`` when the
        attribute has no ``gene_name`` entry.
    """
    # Capture everything between the quotes so that symbols containing
    # characters such as '.' or '_' (e.g. "Y_RNA") are not dropped.
    m = re.search(r'gene_name "([^"]+)"', attribute)
    if m:
        return m.group(1)
    return np.nan
# Derive the gene symbol and a 'chr'-prefixed chromosome name for every gene record.
hg38_df['gene'] = hg38_df['attribute'].map(extract_gene)
hg38_df['chr'] = hg38_df['seqname'].map("chr{}".format)

## Internal landscape on variant level

In [None]:
# Filtered variant table; the first CSV column becomes the index (later used to drop curated variants).
var_df = pd.read_csv(var_csv, index_col=[0])

In [None]:
# Columns holding delimited lists: the gene column is '.'-separated, the rest '%'-separated.
# Missing values are left as NaN; everything else becomes a Python list.
tblisted_c:list = ['recessive_genes', 'participant_id', 'QUAL', 'variant_info_record', 'variant_record']
for c in tblisted_c:
    delimiter = "." if c == tblisted_c[0] else "%"
    var_df[c] = var_df[c].apply(
        lambda x, delimiter=delimiter: np.nan if pd.isna(x) else x.split(delimiter)
    )

In [None]:
# Explode by recessive_genes
# One output row per (variant, gene); the index label is repeated for multi-gene variants.
exploded_var_df = var_df.explode("recessive_genes")

In [None]:
# Normalise the gene column in three steps, applied in this order:
#   1. drop the ",aka..." suffix from panel-style names,
#   2. replace names listed in gene_map_d with their mapped symbol,
#   3. consider HBA1 and HBA2 altogether under the label "HBA1/HBA2".
exploded_var_df['recessive_genes'] = (
    exploded_var_df['recessive_genes']
    .apply(lambda g: re.split(",aka", g)[0] if "aka" in g else g)
    .apply(lambda g: gene_map_d[g] if g in gene_map_d else g)
    .apply(lambda g: "HBA1/HBA2" if g in ["HBA1", "HBA2"] else g)
)

# Remove variants after manual curation

In [None]:
# Manually curated removal list; its 'cpra' column is matched against the variant table index below.
rm_var_df = pd.read_csv(rm_var_csv)

In [None]:
# Drop every row whose index label is on the manual removal list
# (a label that is absent from the index raises KeyError).
curated_out_variants = rm_var_df['cpra'].tolist()
exploded_var_df = exploded_var_df.drop(index=curated_out_variants)

# Get subset and gene list for different gene panel

In [None]:
# Reuse the get_panel_genes helper defined near the top of the notebook instead of
# repeating the same open/strip loop three times.
acmg_genes = get_panel_genes(acmg_genes_hkgp_naming)
mackenzies_genes = get_panel_genes(mackenzies_genes_hkgp_naming)
hkgpT3_genes = get_panel_genes(hkgpT3_genes_hkgp_naming)

In [None]:
# Per-panel subsets of the exploded variant table (independent copies).
acmg_var_df = exploded_var_df[exploded_var_df['recessive_genes'].isin(acmg_genes)].copy()
mackenzies_var_df = exploded_var_df[exploded_var_df['recessive_genes'].isin(mackenzies_genes)].copy()
hkgpT3_var_df = exploded_var_df[exploded_var_df['recessive_genes'].isin(hkgpT3_genes)].copy()

In [None]:
# Panel genes that actually have at least one variant row in each subset.
acmg_found_genes:set = set(acmg_var_df['recessive_genes'])
mackenzies_found_genes:set = set(mackenzies_var_df['recessive_genes'])
hkgpT3_found_genes:set = set(hkgpT3_var_df['recessive_genes'])

## Convert to carrier level

In [None]:
# gene -> set of participant IDs carrying at least one variant in that gene.
# A single pass over the rows; each row contributes its whole participant list.
gene_carrier_d = defaultdict(set)
for gene, participants in zip(exploded_var_df['recessive_genes'], exploded_var_df['participant_id']):
    gene_carrier_d[gene].update(participants)

In [None]:
# Append HBA and SMN carrier to gene_carrier_d
hkgi_hba_sv_df = pd.read_csv(hkgi_HBA_sv_csv)
# Labels in the HBA per-sample result that count as carrying an HBA deletion.
carrier_labels = ['SEA', '3.7K', '4.2K', 'SEA + 3.7K', 'SEA + 4.2K', '4.2K_hom', '3.7K_hom']
hkgi_hba_sv_carrier:set = set(hkgi_hba_sv_df.loc[hkgi_hba_sv_df['Label'].isin(carrier_labels), 'SRE ID'].tolist())

# NOTE(review): the input file is named *.tsv but is read with read_csv's default
# comma separator — confirm the file really is comma-delimited.
hkgi_SMN_sv_df = pd.read_csv(hkgi_SMN_sv_csv)
# 'isCarrier' is used directly as a boolean mask; the last three characters of
# 'Sample' are stripped (presumably a suffix on the sample ID — TODO confirm).
hkgi_smn_sv_carrier:set = set([ s[:-3] for s in hkgi_SMN_sv_df.loc[hkgi_SMN_sv_df['isCarrier'], 'Sample'].tolist() ])

# NOTE(review): unlike sv_df further down, these two carrier sets are not
# restricted to founder_info_df.index here — confirm the upstream files only
# contain founders, otherwise the carrier counts and denominators disagree.
gene_carrier_d['HBA1/HBA2'] = gene_carrier_d['HBA1/HBA2'] | hkgi_hba_sv_carrier 
gene_carrier_d['SMN1'] = gene_carrier_d['SMN1'] | hkgi_smn_sv_carrier

# Append other SV to gene_carrier_d

In [None]:
# Read sv_csv and keep only the calls whose sample_id is in the founder table
sv_df = pd.read_csv(sv_csv)
founder_ids = founder_info_df.index.tolist()
sv_df = sv_df[sv_df['sample_id'].isin(founder_ids)].copy()

In [None]:
# Explode variants by "gene_symbols"
# 'gene_symbols' is a comma-separated string; after explode there is one row per (SV, gene).
sv_df['recessive_genes'] = sv_df['gene_symbols'].apply(lambda x: x.split(','))
gene_sv_df = sv_df.explode('recessive_genes')
# Take the first SVTYPE=<letters> tag from INFO; raises IndexError if INFO has no such tag.
gene_sv_df['SVTYPE'] = gene_sv_df['INFO'].apply(lambda x: re.findall(r'SVTYPE=[A-Za-z]+', x)[0].split('=')[-1])
# Identifier built as chrom-start-end-SVTYPE.
gene_sv_df['id'] = gene_sv_df.apply(lambda r: f"{r.chrom}-{r.start}-{r.end}-{r.SVTYPE}", axis=1)

In [None]:
# Map gene names through gene_map_d, then keep only SVs hitting genes on the panel list
gene_sv_df['recessive_genes'] = gene_sv_df['recessive_genes'].map(lambda g: gene_map_d.get(g, g))
sv_hits_panel_gene = gene_sv_df['recessive_genes'].isin(gene_list)
carrier_gene_sv_df = gene_sv_df[sv_hits_panel_gene].copy()

In [None]:
# Merge the SV carriers of each gene into the per-gene carrier sets.
for gene in carrier_gene_sv_df['recessive_genes'].unique():
    rows_for_gene = carrier_gene_sv_df['recessive_genes'] == gene
    gene_sv_carrier:set = set(carrier_gene_sv_df.loc[rows_for_gene, 'sample_id'].tolist())
    gene_carrier_d[gene] = gene_carrier_d[gene].union(gene_sv_carrier)

### Remove male from gene_carrier_d if the gene is X-linked

In [None]:
# For chrX genes keep female carriers only (as a list); other genes keep their carrier set unchanged.
sex_adjusted_gene_carrier_d = {
    gene: (
        [c for c in carriers if founder_sex_d[c]=='Female']
        if gene_chr_d[gene] == 'chrX'
        else carriers
    )
    for gene, carriers in gene_carrier_d.items()
}

In [None]:
# gene -> [carrier count, carrier list]
carrier_cnt_d = {}
for g, s in sex_adjusted_gene_carrier_d.items():
    carrier_cnt_d[g] = [len(s), list(s)]

# Panel genes without any carrier get a zero count; names containing 'HBA' are
# skipped because they are already represented by the combined "HBA1/HBA2" key.
missing_d = {
    g: [0, np.nan]
    for g in gene_list
    if g not in carrier_cnt_d and 'HBA' not in g
}
carrier_cnt_d = {**carrier_cnt_d, **missing_d}

### Construct hkgi_gene_df

In [None]:
# One row per gene with its carrier count and carrier list, plus the panel chromosome.
hkgi_gene_df = pd.DataFrame(
    carrier_cnt_d,
    index=['carrier_count_hkgi', 'carriers_hkgi'],
).transpose()
hkgi_gene_df['chr'] = [gene_chr_d[g] for g in hkgi_gene_df.index]

In [None]:
# Calculate the carrier rate. Pay attention to X-linked genes:
# their denominator is the number of female founders only.
hkgi_gene_df['sample_size_hkgi'] = [
    founder_female_cnt if chrom == 'chrX' else founder_cnt
    for chrom in hkgi_gene_df['chr']
]
# The rate is rounded to five decimals via string formatting.
hkgi_gene_df['carrier_rate_hkgi'] = [
    float(f"{count/size:.5f}")
    for count, size in zip(hkgi_gene_df['carrier_count_hkgi'], hkgi_gene_df['sample_size_hkgi'])
]
hkgi_gene_df = hkgi_gene_df.sort_values(by=['carrier_rate_hkgi'], ascending=False)

In [None]:
# Calculate the 95% Confidence interval for hkgi's carrier_rate
def calc_CI(successes, trials, confidence=0.95):
    """Normal-approximation confidence interval for a proportion.

    Parameters
    ----------
    successes : int
        Number of positive observations (here: carriers).
    trials : int
        Number of observations (here: sample size).
    confidence : float, default 0.95
        Two-sided confidence level.

    Returns
    -------
    pd.Series
        Two values, ``[lower, upper]``. When ``trials`` is 0 both are 0.
        The same type is returned on every path so that a row-wise
        ``DataFrame.apply`` always expands into two columns.

    Notes
    -----
    The bounds are ``p -/+ z * sqrt(p * (1 - p) / trials)`` and are not
    clipped, so the lower bound can be negative for small ``p``.
    """
    if trials == 0:
        return pd.Series([0.0, 0.0])
    # Point estimate and its standard error
    p = successes / trials
    se = np.sqrt(p * (1 - p) / trials)
    # Z-score for the given two-sided confidence level
    z = stats.norm.ppf((1 + confidence) / 2)
    margin_of_error = z * se
    return pd.Series([p - margin_of_error, p + margin_of_error])

# Row-wise interval from each gene's carrier count and sample size; the two
# returned values fill the lower and upper bound columns respectively.
hkgi_gene_df[['carrier_rate_CI_Lower_hkgi', 'carrier_rate_CI_Upper_hkgi']] = hkgi_gene_df.apply(
    lambda r: calc_CI(r.carrier_count_hkgi, r.sample_size_hkgi), axis=1
)

## Append carrier rates from gnomAD v4

In [None]:
# Work on a copy so the internal per-gene table stays untouched by the merges below.
master_df = hkgi_gene_df.copy()

In [None]:
# 'ND' cells are read as missing values; the first column is the gene index.
gnomad_gene_df = pd.read_csv(gnomad_gene_csv, index_col=[0], na_values=['ND'])
# Manually drop duplicated genes due to synonyms
genes_to_drop = ["GBA", "SPATA5"]
gnomad_gene_df = gnomad_gene_df.drop(index=genes_to_drop)
# Population codes, in the column order used for the gnomAD-derived columns.
pop_rank = [
    'EAS', 'SAS', 'AMR', 'ASJ',
    'MID', 'NFE', 'FIN', 'AFR',
]

In [None]:
# Adjust gene symbols to match
# HBA1 and HBA2 are both relabelled "HBA1/HBA2" to match the internal table.
# NOTE(review): if the gnomAD table lists HBA1 and HBA2 as separate rows, the new
# index contains "HBA1/HBA2" twice and the later outer merge will duplicate that
# gene — confirm against the input file.
gnomad_gene_df['gene'] = gnomad_gene_df.apply(lambda r: "HBA1/HBA2" if r.name in ["HBA1", "HBA2"] else r.name, axis=1)
# Names listed in gene_map_d are replaced by the symbol they map to.
gnomad_gene_df['gene'] = gnomad_gene_df['gene'].apply(lambda g: gene_map_d[g] if g in gene_map_d.keys() else g)
gnomad_gene_df.set_index('gene', inplace=True)

In [None]:
# Add x-linked genes
gnomad_xgene_df = pd.read_csv(gnomad_xgene_csv, index_col=[0], na_values=['ND'])
# Restrict the first table to the X-linked table's columns so both frames stack
# cleanly; a KeyError here means the two tables do not share those columns.
gnomad_gene_df = pd.concat([
    gnomad_gene_df.loc[:, gnomad_xgene_df.columns.tolist()], 
    gnomad_xgene_df
])

In [None]:
# Rename the "<POP> Genomes GCF" columns to "carrier_rate_gnomAD-<POP>" and keep only those
rate_column_names = {
    f"{pop_code} Genomes GCF": f"carrier_rate_gnomAD-{pop_code}"
    for pop_code in pop_rank
}
gnomad_gene_df = gnomad_gene_df.rename(columns=rate_column_names)
gnomad_gene_df = gnomad_gene_df.loc[:, list(rate_column_names.values())]

In [None]:
# Attach each population's sample size as a constant column.
gnomad_gene_df = gnomad_gene_df.assign(**{
    f"sample_size_gnomAD-{pop_code}": gnomad_sample_size_d[pop_code]
    for pop_code in pop_rank
})

In [None]:
# Merge df: outer join on the gene index, so genes present in only one table are kept
master_df = pd.merge(
    master_df,
    gnomad_gene_df,
    how='outer',
    left_index=True,
    right_index=True,
)

## Map the chromosomes of the genes

In [None]:
# Label the chr
# Primary lookup: gene symbol -> chromosome taken from the GTF gene records.
gtf_gene_chr_d = hg38_df.set_index('gene')['chr'].to_dict()
# Fallback lookup for symbols that are assigned by hand.
manual_gene_chr_d = dict([
    ('ARMC4', 'chr10'),
    ('CPAP', 'chr13'),
    ('DNAAF19', 'chr17'),
    ('HBA1/HBA2', 'chr16'),
    ('SPATA5L1', 'chr15'),
    ('TRU-TCA1-1', 'chr19'),
    ('TTC26', 'chr7'),
])

def find_gene_chr(gene, gtf_gene_chr_d, manual_gene_chr_d):
    """Return the chromosome of `gene`.

    The GTF-derived mapping is consulted first, then the manual mapping;
    `np.nan` is returned when neither contains the gene.
    """
    for chr_lookup in (gtf_gene_chr_d, manual_gene_chr_d):
        if gene in chr_lookup:
            return chr_lookup[gene]
    return np.nan
        
# Overwrite 'chr' for every gene in the merged table (GTF first, manual fallback, else NaN).
master_df['chr'] = [
    find_gene_chr(gene_symbol, gtf_gene_chr_d, manual_gene_chr_d)
    for gene_symbol in master_df.index
]

## Classify genes into tiers by carrier frequency, as suggested in other publications

In [None]:
def classify_tier(gene, chr, freq, pop, freq_lowCI=None, is_asian=False): 
    """Assign a screening tier to a gene for one population.

    Rules are applied in order; the first match wins:
      1. SMN1 / CFTR                                  -> 'tier_1'
      2. HBA1/HBA2 or HBB when `is_asian` is set      -> 'tier_1'
      3. gene on the population-specific manual list  -> 'tier_3_pop'
      4. missing `freq`                               -> 'tier_4'
      5. chrX gene: `freq_lowCI` (or `freq` when no lower bound is given)
         >= 0.000025 -> 'tier_3_xlink', otherwise 'tier_4'
      6. `freq` >= 0.01 -> 'tier_2'; >= 0.005 -> 'tier_3'; otherwise 'tier_4'
    """
    # ---- Manual classification ----
    # Pan-ethnic tier 1
    if gene in ('SMN1', 'CFTR'):
        return 'tier_1'
    # Population-specific tier 1 recommended by ACOG
    if is_asian and gene in ('HBA1/HBA2', 'HBB'):
        return 'tier_1'

    # Population-specific tier 3 (i.e, Table 5)
    manual_tier3_genes_by_pop = {
        'AFR': ["HBA1/HBA2"],
        'AMR': ["HBA1/HBA2", "HPS1", "HPS3", "FXN"],
        'ASJ': ["HBA1/HBA2", "ELP1", "DLD", "NEB", "CLRN1", "BLM"],
        'EAS': [],
        'FIN': ["HBA1/HBA2", "FXN"],
        'MID': ["HBA1/HBA2"],
        'NFE': ["HBA1/HBA2", "FXN"],
        'SAS': [],
    }
    if gene in manual_tier3_genes_by_pop.get(pop, ()):
        return 'tier_3_pop'

    # ---- Classification by frequency ----
    # No frequency available: lowest tier
    if pd.isna(freq):
        return 'tier_4'

    # X-linked genes use their own threshold, on the lower CI bound when supplied
    if chr == 'chrX':
        chrX_freq = freq_lowCI if freq_lowCI is not None else freq
        return 'tier_3_xlink' if chrX_freq >= 0.000025 else 'tier_4'

    # Remaining genes: thresholds checked from the highest tier down
    for tier_name, min_freq in (('tier_2', 0.01), ('tier_3', 0.005)):
        if freq >= min_freq:
            return tier_name
    return 'tier_4'

In [None]:
# Internal cohort: X-linked genes are judged on the lower bound of the carrier-rate CI
pop = 'hkgi'
master_df[f'tier_{pop}'] = master_df.apply(
    lambda r: classify_tier(r.name, r.chr, r.carrier_rate_hkgi, pop, 
        freq_lowCI=r.carrier_rate_CI_Lower_hkgi, is_asian=True), 
    axis=1
)

# Other Asian populations
asian_pops = ['EAS', 'SAS']
for pop in asian_pops:
    master_df[f'tier_gnomAD-{pop}'] = master_df.apply(
        lambda r: classify_tier(r.name, r.chr, r[f'carrier_rate_gnomAD-{pop}'], pop, is_asian=True), 
        axis=1
    )

# Remaining populations. Each population is listed exactly once so that no tier
# column is computed twice.
other_pops = ['AMR', 'ASJ', 'MID', 'NFE', 'FIN', 'AFR']
for pop in other_pops:
    master_df[f'tier_gnomAD-{pop}'] = master_df.apply(
        lambda r: classify_tier(r.name, r.chr, r[f'carrier_rate_gnomAD-{pop}'], pop), axis=1
    )

## Append the ACMG tiers as columns

In [None]:
# Read acmg tier csv as df
acmg_df = pd.read_csv(acmg_tiers_csv)
# Replace names listed in gene_map_d with the symbol they map to.
acmg_df['gene'] = acmg_df['OMIM gene name'].apply(lambda g: gene_map_d[g] if g in gene_map_d.keys() else g)
# One 'Table' value per gene; the first row wins when a gene is listed more than once.
acmg_tiers_d = acmg_df.drop_duplicates(subset=['gene']).set_index('gene').to_dict()['Table']
# Keep only the text before the first '.' of the table label.
acmg_tiers_d = { g: t.split('.')[0] for g, t in acmg_tiers_d.items() }
acmg_tiers_d['HBA1/HBA2'] = 'Table 5'
acmg_tiers_df = pd.DataFrame(acmg_tiers_d, index=['acmg_table']).T
# The separate HBA1 / HBA2 rows are superseded by the combined key added above;
# this drop raises KeyError if either symbol is absent from the ACMG file.
acmg_tiers_df.drop(index=['HBA1', 'HBA2'], inplace=True)

In [None]:
# Pan-ethnic classification
def classify_acmg_tier(r):
    """Map one row of the ACMG table frame to its pan-ethnic tier.

    `r.name` is the gene symbol and `r['acmg_table']` the ACMG table label.
    SMN1 and CFTR are always 'tier_1'; Tables 1-2 give 'tier_2';
    Tables 3, 4 and 6 give 'tier_3'; anything else gives 'tier_4'.
    """
    if r.name in ('SMN1', 'CFTR'):
        return 'tier_1'

    tier_by_table = {
        'Table 1': 'tier_2',
        'Table 2': 'tier_2',
        'Table 3': 'tier_3',
        'Table 4': 'tier_3',
        'Table 6': 'tier_3',
    }
    return tier_by_table.get(r['acmg_table'], 'tier_4')
# Pan-ethnic tier for every ACMG gene, derived row by row from its table label.
acmg_tiers_df['acmg_panethnic_tier'] = acmg_tiers_df.apply(classify_acmg_tier, axis=1)

In [None]:
# Seed one "acmg_<tier column>" per tier column of master_df with the pan-ethnic tier;
# population-specific overrides are applied afterwards.
tier_columns = [name for name in master_df.columns if name.startswith('tier')]
for tier_column in tier_columns:
    acmg_tiers_df["acmg_" + tier_column] = acmg_tiers_df['acmg_panethnic_tier']

In [None]:
# Manual assign table 5 (SMN1 already in tier 1)
asian_pops = ['hkgi', 'gnomAD-EAS', 'gnomAD-SAS']
asian_spec_tab5_genes = ['HBA1/HBA2', 'HBB']    # To tier 1
for pop in asian_pops:
    acmg_tiers_df.loc[asian_spec_tab5_genes, f'acmg_tier_{pop}'] = 'tier_1'

# Population-specific Table 5 genes that are raised to tier 3
amr_spec_tab5_genes = ['HBA1/HBA2', "HPS1", "HPS3", "FXN"]
asj_spec_tab5_genes = ['HBA1/HBA2', "ELP1", "DLD", "NEB", "CLRN1", "BLM"]
mid_spec_tab5_genes = ['HBA1/HBA2']
nfe_spec_tab5_genes = ['HBA1/HBA2', "FXN"]
fin_spec_tab5_genes = ['HBA1/HBA2', "FXN"]
afr_spec_tab5_genes = ['HBA1/HBA2']

tab5_tier3_genes_by_pop = {
    'AMR': amr_spec_tab5_genes,
    'ASJ': asj_spec_tab5_genes,
    'MID': mid_spec_tab5_genes,
    'NFE': nfe_spec_tab5_genes,
    'FIN': fin_spec_tab5_genes,
    'AFR': afr_spec_tab5_genes,
}
for tab5_pop, tab5_genes in tab5_tier3_genes_by_pop.items():
    acmg_tiers_df.loc[tab5_genes, f'acmg_tier_gnomAD-{tab5_pop}'] = 'tier_3'

In [None]:
# Left join on the gene index: every master_df gene is kept, ACMG columns are NaN where absent.
acmg_tier_columns_df = acmg_tiers_df.drop(columns=['acmg_table', 'acmg_panethnic_tier'])
master_df = pd.merge(
    master_df,
    acmg_tier_columns_df,
    how='left',
    left_index=True,
    right_index=True,
)

In [None]:
# Classify as 'tier 4' if not mentioned in ACMG
acmg_tier_cols = [col for col in master_df.columns if col.startswith('acmg_tier')]
master_df[acmg_tier_cols] = master_df[acmg_tier_cols].fillna('tier_4')

## Determine whether the gene is X-linked or not

In [None]:
# True only for genes mapped to chrX; a missing chromosome counts as not X-linked.
master_df['is_xlinked'] = master_df['chr'] == 'chrX'

## Calculate the at-risk couple rate

In [None]:
# Calculate the number of virtual couples
# Autosomal genes: every unordered pair of founders, regardless of sex.
all_combinations = math.comb(founder_cnt, 2)
# X-linked genes: every (male founder, female founder) pairing.
xlinked_combinations = founder_male_cnt*founder_female_cnt

In [None]:
# List and count all at-risk couples in HKGP
def list_at_risk_couples(carriers:list, is_xlinked:bool, founder_sex_d:dict):
    """List and count the virtual at-risk couples for one gene.

    Parameters
    ----------
    carriers : list or NaN
        Carrier sample IDs of the gene; NaN when the gene has no carrier list.
    is_xlinked : bool
        Whether the gene is X-linked.
    founder_sex_d : dict
        Sample ID -> sex label; entries labelled 'Male' are the partners
        considered for X-linked genes.

    Returns
    -------
    pd.Series
        ``[couples, count]`` where `couples` is a set of "idA.idB" strings
        (the two IDs sorted, joined with '.') and `count` its size.
        A missing carrier list yields ``[set(), 0]``.
    """
    # NaN is the only value that differs from itself, i.e. no carrier list
    if carriers != carriers:
        return pd.Series([set(), 0])

    couples = set()
    if is_xlinked:
        # X-linked: each carrier paired with every male in the cohort
        males = [sample for sample, sex in founder_sex_d.items() if sex=='Male']
        for carrier in carriers:
            couples.update(".".join(sorted([carrier, male])) for male in males)
    else:
        # Autosomal recessive: every pair of two different carriers
        for first in carriers:
            for second in carriers:
                if first != second:
                    couples.add(".".join(sorted([first, second])))

    return pd.Series([couples, len(couples)])

# Per gene: the set of virtual at-risk couples and its size.
at_risk_couple_results = master_df.apply(
    lambda row: list_at_risk_couples(row['carriers_hkgi'], row['is_xlinked'], founder_sex_d),
    axis=1,
)
master_df[['at_risks_couples_hkgi', 'at_risks_couples_count_hkgi']] = at_risk_couple_results

In [None]:
# Specific to HBA1/HBA2: remove false-positive in at-risk couples
# Sample ID ('SRE ID') -> label from the HBA per-sample result table.
hba_label_d:dict = hkgi_hba_sv_df.set_index('SRE ID').to_dict()['Label']
def remove_false_arc(arc_set:set, hba_label_d):
    """Keep only the couples in which at least one partner's label contains 'SEA'.

    Parameters
    ----------
    arc_set : set of str
        Couple identifiers of the form "idA.idB".
    hba_label_d : dict
        Sample ID -> HBA label. Every ID appearing in `arc_set` must be a
        key, otherwise KeyError is raised.

    Returns
    -------
    set of str
        The subset of `arc_set` whose joined partner labels contain 'SEA'.
    """
    true_arc_set = set()
    for arc in arc_set:
        # Labels of both partners joined into one string, e.g. "SEA.3.7K"
        comb = ".".join(hba_label_d[s] for s in arc.split("."))
        if 'SEA' in comb:
            true_arc_set.add(arc)
    return true_arc_set
# Filter the HBA1/HBA2 couple set; every other gene keeps its set unchanged.
master_df['at_risks_couples_hkgi'] = master_df.apply(
    lambda r: r.at_risks_couples_hkgi if r.name!='HBA1/HBA2' else remove_false_arc(r.at_risks_couples_hkgi, hba_label_d), 
    axis=1
)
# The count column was filled before this filtering step; refresh it so that the
# count (and the rate derived from it) matches the filtered couple sets.
master_df['at_risks_couples_count_hkgi'] = master_df['at_risks_couples_hkgi'].apply(len)

In [None]:
# Calculate the at_risks_couple_rate_hkgi:
# count divided by the number of possible couples for that inheritance mode
master_df['at_risks_couples_rate_hkgi'] = master_df.apply(
    lambda row: row['at_risks_couples_count_hkgi'] / (
        xlinked_combinations if row.is_xlinked else all_combinations
    ),
    axis=1,
)

## Extract the actual at-risk couple frequencies

In [None]:
# Trio table; the mother and father lab-ID columns are used below to build real couples.
trio_df = pd.read_csv(sample_trio)

In [None]:
# Remove trios where the mother and(or) father are not founders
def keep_trio(r, incl_founders):
    """Return True when both parents of trio row `r` are in `incl_founders`.

    The last three characters of each parent's lab ID are dropped before
    the membership test.
    """
    mother_id = r['mother_sre_lab_id'][:-3]
    father_id = r['father_sre_lab_id'][:-3]
    return mother_id in incl_founders and father_id in incl_founders
# Keep only the trios whose mother and father are both founders
trio_founder_ids = founder_info_df.index.tolist()
both_parents_are_founders = trio_df.apply(lambda r: keep_trio(r, trio_founder_ids), axis=1)
trio_df = trio_df.loc[both_parents_are_founders, :].copy()

# Assign a couple ID: the two parent IDs (suffix removed), sorted and joined with '.'
trio_df['couple_id'] = [
    ".".join(sorted([mother[:-3], father[:-3]]))
    for mother, father in zip(trio_df['mother_sre_lab_id'], trio_df['father_sre_lab_id'])
]

all_couple_ids:set = set(trio_df['couple_id'].tolist())

In [None]:
# Append per-gene at risk couples: the real couples that are also virtual at-risk couples
actual_at_risks_couples_hkgi:list = [    # List of sets
    np.nan if pd.isna(couples) else all_couple_ids.intersection(couples)
    for couples in master_df['at_risks_couples_hkgi']
]

master_df['actual_at_risks_couples_hkgi'] = pd.Series(
    actual_at_risks_couples_hkgi, index=master_df.index
)

In [None]:
# Number of real at-risk couples per gene, and that number as a share of all real couples.
total_couple_cnt = len(all_couple_ids)
master_df['actual_at_risks_couples_count_hkgi'] = master_df['actual_at_risks_couples_hkgi'].map(len)
master_df['actual_at_risks_couples_rate_hkgi'] = master_df['actual_at_risks_couples_count_hkgi'].apply(
    lambda couple_cnt: couple_cnt/total_couple_cnt
)

## Subset columns for output

In [None]:
# Columns written to the final table, grouped by theme.
export_cols:list = [
    # internal cohort
    "is_xlinked",
    "carrier_count_hkgi",
    "sample_size_hkgi",
    "carrier_rate_hkgi",
    "carrier_rate_CI_Lower_hkgi",
    "carrier_rate_CI_Upper_hkgi",
    # gnomAD sample sizes and carrier rates
    "sample_size_gnomAD-EAS",
    "sample_size_gnomAD-NFE",
    "sample_size_gnomAD-AFR",
    "carrier_rate_gnomAD-EAS",
    "carrier_rate_gnomAD-NFE",
    "carrier_rate_gnomAD-AFR",
    # ACMG-based tiers
    "acmg_tier_hkgi",
    "acmg_tier_gnomAD-EAS",
    "acmg_tier_gnomAD-NFE",
    "acmg_tier_gnomAD-AFR",
    # frequency-based tiers
    "tier_hkgi",
    "tier_gnomAD-EAS",
    "tier_gnomAD-NFE",
    "tier_gnomAD-AFR",
    # at-risk couples
    "at_risks_couples_count_hkgi",
    "at_risks_couples_rate_hkgi",
    "actual_at_risks_couples_count_hkgi",
    "actual_at_risks_couples_rate_hkgi",
]
export_df = master_df[export_cols].sort_values(by=['carrier_rate_hkgi'], ascending=False)

In [None]:
# Write the final table; the gene index is saved as a column named 'gene'.
export_df.to_csv(recessive_final_csv, index=True, index_label='gene')