## Setup

The following packages are required to run the analysis. If not already installed, the packages will be installed using `pip`.

In [None]:
!pip install numpy
!pip install pandas
!pip install scipy
!pip install hmmlearn
!pip install statsmodels
!pip install mpl_scatter_density
!pip install tqdm
!pip install colorama
!pip install pyliftover

In [3]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from tqdm import tqdm
import statsmodels.api as sm
import scipy.stats as stats
import math
import glob
from pyliftover import LiftOver
import requests
import warnings

# Suppress all warnings to keep the output clean
warnings.filterwarnings("ignore")

## Specify project directories

Define the paths to the data and results directories used in the project.

In [5]:
# Specify the directories containing data and results
data_path = '/oak/stanford/groups/mrivas/projects/wgs-constraint-llm/data/'
results_path = '/oak/stanford/groups/mrivas/projects/wgs-constraint-llm/osthoag/wgs-constraint-llm/results/'

# Specify the paths to the specific data files
gene_annotation_file_path = data_path + 'gencode.v44.basic.annotation.gtf.gz'
scz_variants_file_path = data_path + 'scz.tsv.gz'
constraint_predictions_file_path = results_path + f'HMM_rgc_0.9_over20_chr2_predictions_rgc_wes.tsv.gz'
alpha_missense_file_path = data_path + 'AlphaMissense_hg38.tsv.gz'

## Define helper methods

In [6]:
# Function to calculate effect size and variance for variants
def calculate_effect_size_and_variance(cases_df):
    ALT_AJ = cases_df['ac_case']
    ALT_ExAC = cases_df['ac_ctrl']
    REF_AJ = cases_df['an_case'] - cases_df['ac_case']
    REF_ExAC = cases_df['an_ctrl'] - cases_df['ac_ctrl']

    # Logarithmic transformation of effect size calculation
    cases_df['effect_size'] = np.log(((0.5 + ALT_AJ) * (0.5 + REF_ExAC)) / ((0.5 + REF_AJ) * (0.5 + ALT_ExAC)))
    # Variance calculation for the effect size
    cases_df['var_effect_size'] = (1 / (0.5 + REF_AJ) + 1 / (0.5 + REF_ExAC) + 1 / (0.5 + ALT_AJ) + 1 / (0.5 + ALT_ExAC))
    
    return cases_df

# Initialize liftover object to convert coordinates from hg19 to hg38
lo = LiftOver('hg19', 'hg38')

# Function to apply liftover to a single row
def liftover(row):
    lifted = lo.convert_coordinate(row['chr'], row['pos'])
    if lifted:
        # Return the first lifted coordinate if successful
        return lifted[0][0], int(lifted[0][1])
    else:
        # Return the original chromosome with position 0 if liftover fails
        return row['chr'], 0

## Load Data
This section loads the gene annotation data from a GTF file and extracts relevant information such as gene ID, gene type, gene name, and transcript details. The data is then filtered to include only protein-coding regions.

In [7]:
# Read the GTF file into a pandas DataFrame
gene_df = pd.read_csv(gene_annotation_file_path, sep='\t', comment='#', header=None, 
                      names=['chr', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute'], 
                      dtype={'start': int, 'end': int})

# Extract 'gene_id' from attributes
gene_df['gene_id'] = gene_df['attribute'].str.extract(r'gene_id "(.*?)"')

# Extract 'gene_type' from attributes
gene_df['gene_type'] = gene_df['attribute'].str.extract(r'gene_type "(.*?)"')

# Extract 'gene_name' from attributes
gene_df['gene_name'] = gene_df['attribute'].str.extract(r'gene_name "(.*?)"')

# Extract 'transcript_id' from attributes
gene_df['transcript_id'] = gene_df['attribute'].str.extract(r'transcript_id "(.*?)"')

# Extract 'transcript' and 'num' from transcript_id
gene_df[['transcript', 'transcript_num']] = gene_df['transcript_id'].str.split('.', expand=True)

# Extract 'transcript_name' from attributes
gene_df['transcript_name'] = gene_df['attribute'].str.extract(r'transcript_name "(.*?)"')

# Drop the original attribute column
gene_df = gene_df.drop('attribute', axis=1)

# Filter rows for protein-coding regions
gene_df = gene_df[(gene_df['gene_type'] == 'protein_coding') & (gene_df['feature'] == 'CDS')]

# Standardize the gene_id by removing any version numbers (i.e., text after the dot)
gene_df['std_gene_id'] = gene_df['gene_id'].str.split('.').str[0]

# Display the DataFrame
gene_df

Unnamed: 0,chr,source,feature,start,end,score,strand,frame,gene_id,gene_type,gene_name,transcript_id,transcript,transcript_num,transcript_name,std_gene_id
60,chr1,HAVANA,CDS,65565,65573,.,+,0,ENSG00000186092.7,protein_coding,OR4F5,ENST00000641515.2,ENST00000641515,2,OR4F5-201,ENSG00000186092
63,chr1,HAVANA,CDS,69037,70005,.,+,0,ENSG00000186092.7,protein_coding,OR4F5,ENST00000641515.2,ENST00000641515,2,OR4F5-201,ENSG00000186092
236,chr1,HAVANA,CDS,450743,451678,.,-,0,ENSG00000284733.2,protein_coding,OR4F29,ENST00000426406.4,ENST00000426406,4,OR4F29-201,ENSG00000284733
304,chr1,HAVANA,CDS,685719,686654,.,-,0,ENSG00000284662.2,protein_coding,OR4F16,ENST00000332831.5,ENST00000332831,5,OR4F16-201,ENSG00000284662
524,chr1,HAVANA,CDS,924432,924948,.,+,0,ENSG00000187634.13,protein_coding,SAMD11,ENST00000616016.5,ENST00000616016,5,SAMD11-209,ENSG00000187634
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1998488,chrM,ENSEMBL,CDS,10470,10763,.,+,0,ENSG00000212907.2,protein_coding,MT-ND4L,ENST00000361335.1,ENST00000361335,1,MT-ND4L-201,ENSG00000212907
1998495,chrM,ENSEMBL,CDS,10760,12137,.,+,0,ENSG00000198886.2,protein_coding,MT-ND4,ENST00000361381.2,ENST00000361381,2,MT-ND4-201,ENSG00000198886
1998509,chrM,ENSEMBL,CDS,12337,14145,.,+,0,ENSG00000198786.2,protein_coding,MT-ND5,ENST00000361567.2,ENST00000361567,2,MT-ND5-201,ENSG00000198786
1998515,chrM,ENSEMBL,CDS,14149,14673,.,-,0,ENSG00000198695.2,protein_coding,MT-ND6,ENST00000361681.2,ENST00000361681,2,MT-ND6-201,ENSG00000198695


## Load and Filter Schizophrenia Variant Data
This section reads in the schizophrenia variant data, splits the chromosome and position information, filters out unwanted chromosomes and variant types, and merges it with the gene annotation data to include gene names.

In [8]:
# Read the schizophrenia variant data into a DataFrame
scz_variant_results_df = pd.read_csv(scz_variants_file_path, sep='\t')

# Split the locus into chromosome and position, and convert the position to an integer
scz_variant_results_df[['chr', 'pos']] = scz_variant_results_df['locus'].str.split(':', expand=True)
scz_variant_results_df['chr'] = 'chr' + scz_variant_results_df['chr'].astype(str)
scz_variant_results_df['pos'] = pd.to_numeric(scz_variant_results_df['pos'], errors='coerce').fillna(0).astype(int)

filter_consequence_list = [
    "3_prime_UTR_variant",
    "5_prime_UTR_variant",
    "coding_sequence_variant",
    "downstream_gene_variant",
    "intergenic_variant",
    "intron_variant",
    "mature_miRNA_variant",
     "non_coding_transcript_exon_variant",
    "non_coding_transcript_variant",
    "null",
    "synonymous_variant",
   "upstream_gene_variant",
    "splice_region_variant",
    "stop_retained_variant",
    "protein_altering_variant"
]

pLoF_consequence_list = [
    'stop_gained',
    'splice_acceptor_variant',
    'splice_donor_variant',
    'frameshift_variant'
]

# Apply the liftover function to each row in the DataFrame
scz_variant_results_df[['chr', 'pos']] = scz_variant_results_df.apply(liftover, axis=1, result_type='expand')

# Filter out rows related to sex chromosomes and unrelated consequences
scz_variant_results_df = scz_variant_results_df[(scz_variant_results_df['chr'] != "chrX") &
                                                (scz_variant_results_df['chr'] != "chrY") &
                                                (scz_variant_results_df['chr'] != "chrMT")]
scz_variant_results_df = scz_variant_results_df[~scz_variant_results_df['consequence'].isin(filter_consequence_list)]
scz_variant_results_df = scz_variant_results_df[~scz_variant_results_df['consequence'].isna()]

# Merge the filtered variants with gene names from the gene annotation data
scz_variant_results_df = pd.merge(scz_variant_results_df, gene_df[['std_gene_id', 'gene_name']].drop_duplicates(), left_on='gene_id', right_on='std_gene_id', how='left').drop('std_gene_id', axis=1)

scz_variant_results_df

Unnamed: 0,locus,alleles,gene_id,consequence,hgvsc,hgvsp,cadd,mpc,polyphen,group,...,est,se,qp,i2,in_analysis,source,k,chr,pos,gene_name
0,1:139290,"[""TG"",""T""]",ENSG00000237683,frameshift_variant,c.19delC,p.Gln7LysfsTer98,,,,AFR (genomes),...,,,,,True,genomes,,chr1,139290,
1,1:139290,"[""TG"",""T""]",ENSG00000237683,frameshift_variant,c.19delC,p.Gln7LysfsTer98,,,,meta,...,,,,,True,genomes,,chr1,139290,
2,1:819959,"[""C"",""T""]",ENSG00000269308,splice_acceptor_variant,c.35-2C>T,,0.001,,,Estonian_genome_EST,...,0.091681,0.688460,,,,,,chr1,884579,
3,1:819959,"[""C"",""T""]",ENSG00000269308,splice_acceptor_variant,c.35-2C>T,,0.001,,,GPC_genomes_AFR,...,0.078467,0.082274,,,,,,chr1,884579,
4,1:819959,"[""C"",""T""]",ENSG00000269308,splice_acceptor_variant,c.35-2C>T,,0.001,,,GPC_genomes_AMR,...,-0.057189,0.896890,,,,,,chr1,884579,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9104927,22:51214273,"[""CA"",""C""]",ENSG00000079974,frameshift_variant,c.223delT,p.Trp75GlyfsTer38,,,,meta,...,,,,,True,"gnomAD exomes, gnomAD genomes",,chr22,50775845,RABL2B
9104928,22:51215097,"[""C"",""A""]",ENSG00000079974,splice_donor_variant,c.217+1G>T,,23.000,,,EUR (exomes),...,,,,,True,exomes,,chr22,50776669,RABL2B
9104929,22:51215097,"[""C"",""A""]",ENSG00000079974,splice_donor_variant,c.217+1G>T,,23.000,,,meta,...,,,,,True,exomes,,chr22,50776669,RABL2B
9104930,22:51215153,"[""G"",""T""]",ENSG00000079974,stop_gained,c.162C>A,p.Tyr54Ter,36.000,,,EUR (exomes),...,,,,,True,exomes,,chr22,50776725,RABL2B


## Filter Variants for Analysis
Further filtering of variants is done to include only those with a total allele count (ac_ctrl + ac_case) of 5 or less, and with non-zero allele numbers in both cases and controls.

In [9]:
# Filter variants dataframe for analysis
variants_df = scz_variant_results_df

# Filter variants where the sum of allele counts in cases and controls is less than or equal to 5
# and where both case and control allele numbers are non-zero
variants_df = variants_df[(variants_df['ac_ctrl'] + variants_df['ac_case'] <= 5) &
                          (variants_df['an_case'] > 0) &
                          (variants_df['an_ctrl'] > 0)]

## Load Constraint Predictions
Load saved HMM predictions for constraint data, which will be used in building the unified model.

In [10]:
# Load in saved predictions for the constraint dataset
constraint_predictions_df = pd.read_csv(constraint_predictions_file_path, sep='\t')

# Display the DataFrame
constraint_predictions_df

Unnamed: 0,chr,pos,prob_0,prob_1,observation
0,chr1,925922,9.218631e-34,1.000000,0.0
1,chr1,925923,2.515628e-03,0.997484,1.0
2,chr1,925924,2.518103e-03,0.997482,0.0
3,chr1,925925,4.543584e-05,0.999955,1.0
4,chr1,925926,2.596411e-03,0.997404,1.0
...,...,...,...,...,...
28933587,chr22,50777975,5.029692e-05,0.999950,1.0
28933588,chr22,50777976,7.197833e-05,0.999928,1.0
28933589,chr22,50777977,3.225875e-02,0.967741,1.0
28933590,chr22,50777978,5.462209e-01,0.453779,0.0


## Load Missense Pathogenicity Data
Load and preprocess missense pathogenicity data, including extracting transcript details and renaming columns for consistency.

In [11]:
# Read the missense pathogenicity data into a pandas DataFrame
alpha_missense_df = pd.read_csv(alpha_missense_file_path, sep='\t', header=3)

# Rename columns to standard labels
alpha_missense_df.rename(columns={"#CHROM": "chr", "POS": "pos", "REF": "ref", "ALT": 'alt'}, inplace=True)

# Extract 'transcript' and 'transcript_num' from transcript_id
alpha_missense_df[['transcript', 'transcript_num']] = alpha_missense_df['transcript_id'].str.split('.', expand=True)

# Display the DataFrame
alpha_missense_df

Unnamed: 0,chr,pos,ref,alt,genome,uniprot_id,transcript_id,protein_variant,am_pathogenicity,am_class,transcript,transcript_num
0,chr1,69094,G,T,hg38,Q8NH21,ENST00000335137.4,V2L,0.2937,likely_benign,ENST00000335137,4
1,chr1,69094,G,C,hg38,Q8NH21,ENST00000335137.4,V2L,0.2937,likely_benign,ENST00000335137,4
2,chr1,69094,G,A,hg38,Q8NH21,ENST00000335137.4,V2M,0.3296,likely_benign,ENST00000335137,4
3,chr1,69095,T,C,hg38,Q8NH21,ENST00000335137.4,V2A,0.2609,likely_benign,ENST00000335137,4
4,chr1,69095,T,A,hg38,Q8NH21,ENST00000335137.4,V2E,0.2922,likely_benign,ENST00000335137,4
...,...,...,...,...,...,...,...,...,...,...,...,...
71697551,chrY,57196925,T,G,hg38,Q01113,ENST00000244174.10_PAR_Y,F521C,0.1903,likely_benign,ENST00000244174,10_PAR_Y
71697552,chrY,57196925,T,C,hg38,Q01113,ENST00000244174.10_PAR_Y,F521S,0.2045,likely_benign,ENST00000244174,10_PAR_Y
71697553,chrY,57196925,T,A,hg38,Q01113,ENST00000244174.10_PAR_Y,F521Y,0.1440,likely_benign,ENST00000244174,10_PAR_Y
71697554,chrY,57196926,C,G,hg38,Q01113,ENST00000244174.10_PAR_Y,F521L,0.5879,likely_pathogenic,ENST00000244174,10_PAR_Y


## Build Unified Constraint, Pathogenicity, and pLoF Model
This section creates a unified model that integrates constraint predictions, pathogenicity predictions, and loss-of-function (pLoF) indicators to assess the association with schizophrenia.

In [12]:
# Subset the constraint predictions to include only relevant columns
constraint_predictions_df = constraint_predictions_df[['chr', 'pos', 'prob_0']]

# Group by chromosome and position, and find the maximum 'am_pathogenicity' value
max_am_pathogenicity_df = alpha_missense_df.groupby(['chr', 'pos'])['am_pathogenicity'].max().reset_index()

# Copy the variants dataframe to create a pLoF-specific dataframe
pLoF_ind_variants_df = variants_df

# Create an indicator column for pLoF variants (1 for pLoF, 0 otherwise)
pLoF_ind_variants_df['pLoF_ind'] = (variants_df['consequence'] == "pLoF").astype('int32')

# Subset the pLoF dataframe to include only useful columns
pLoF_ind_variants_df = pLoF_ind_variants_df[['chr', 'pos', 'gene_id', 'gene_name', 'group', 'ac_case', 'an_case', 'ac_ctrl', 'an_ctrl', 'pLoF_ind']]

# Calculate effect size and variance for pLoF variants
pLoF_ind_variants_df = calculate_effect_size_and_variance(pLoF_ind_variants_df)

# Merge constraint predictions, pathogenicity predictions, and pLoF variants based on chromosome and position
constraint_pathogenicity_pLoF_df = pd.merge(pLoF_ind_variants_df, pd.merge(constraint_predictions_df, max_am_pathogenicity_df, on=['chr', 'pos'], how='inner'), on=['chr', 'pos'], how='inner')

# Save the merged dataframe to a compressed CSV file for further analysis
constraint_pathogenicity_pLoF_df.to_csv(results_path + f"constraint_am_scz_pLoF.tsv.gz", index=False, compression='gzip', sep='\t')

# Display the DataFrame
constraint_pathogenicity_pLoF_df

Unnamed: 0,chr,pos,gene_id,gene_name,group,ac_case,an_case,ac_ctrl,an_ctrl,pLoF_ind,effect_size,var_effect_size,prob_0,am_pathogenicity
0,chr1,925946,ENSG00000187634,SAMD11,meta,0,48496,3,194644,0,-0.556242,2.285740,0.000050,0.9876
1,chr1,925948,ENSG00000187634,SAMD11,meta,0,48496,1,194644,0,0.291066,2.666692,0.000040,0.9460
2,chr1,925952,ENSG00000187634,SAMD11,EUR (exomes),1,17748,0,38148,0,1.863854,2.666749,0.000050,0.9687
3,chr1,925952,ENSG00000187634,SAMD11,meta,1,48496,0,194644,0,2.488316,2.666692,0.000050,0.9687
4,chr1,925957,ENSG00000187634,SAMD11,AFR (genomes),1,4490,0,2340,0,0.447236,2.667317,0.009844,0.8327
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5418790,chr22,50769528,ENSG00000079974,RABL2B,meta,0,48496,1,194644,0,0.291066,2.666692,0.015377,0.2788
5418791,chr22,50775792,ENSG00000079974,RABL2B,EST (genomes),0,522,1,4562,0,1.068170,2.668800,0.383440,0.4733
5418792,chr22,50775792,ENSG00000079974,RABL2B,meta,0,48496,1,194644,0,0.291066,2.666692,0.383440,0.4733
5418793,chr22,50775820,ENSG00000079974,RABL2B,EUR-N (exomes),4,14554,1,22374,0,1.528864,0.889002,0.396517,0.9987


## Meta-Regression Model for Schizophrenia
This section builds a meta-regression model for each gene, incorporating constraint, pathogenicity, and pLoF information. The results are saved and displayed for further analysis.

In [13]:
# Read the data from the file containing constraint, pathogenicity, and pLoF information for schizophrenia genes
constraint_pathogenicity_pLoF_df = pd.read_csv(results_path + f'constraint_am_scz_pLoF.tsv.gz', sep='\t')

# Initialize a list to store the results of the meta-regression model
meta_model_results = []

# Compute the negative log of the probabilities for constraint and pathogenicity to use as predictors
input_df = constraint_pathogenicity_pLoF_df
input_df[['log_constraint', 'log_pathogenicity']] = -np.log(1-input_df[['prob_0', 'am_pathogenicity']])

# Group the data by gene_id, gene_name, and group to build models for each gene separately
grouped_gene_data = input_df.groupby(['gene_id', 'gene_name', 'group'])

# Loop over each gene group and build a meta-regression model
for gene_key, gene_data in tqdm(grouped_gene_data, desc="Processing genes", unit="gene"):
    gene_id, gene_name, group = gene_key
    
    # Prepare the design matrix (X) and response variable (y) for the regression model
    X = sm.add_constant(gene_data[['log_constraint', 'log_pathogenicity', 'pLoF_ind']])
    y = gene_data['effect_size']

    # Weights are the inverse of the variance of the effect size
    weights = 1 / gene_data['var_effect_size']

    try:
        # Fit a weighted least squares (WLS) regression model
        model = sm.WLS(y, X, weights=weights, missing='drop').fit()
        
        # Append relevant results to the meta_model_results list
        meta_model_results.append({
            'gene_id': gene_id,
            'gene_name': gene_name,
            'group': group,
            'p_constraint': model.pvalues['log_constraint'],
            'p_pathogenicity': model.pvalues['log_pathogenicity'],
            'p_pLoF': model.pvalues['pLoF_ind'],
            'p_const': model.pvalues['const'],
            'p_unified': model.f_pvalue
        })

    except Exception as e:
        # Handle any errors that arise during model fitting
#         print(f"Error processing gene {gene_name} in group {group}: {e}")
        pass

# Convert the list of results to a DataFrame for further analysis
unified_model_df = pd.DataFrame(meta_model_results)

# Save the DataFrame containing p-values from the meta-regression models to a compressed CSV file
unified_model_df.to_csv(results_path + "schizophrenia_unified_model_pvalues.tsv", index=False, sep='\t')

# Display the contents of the DataFrame
unified_model_df

Processing genes:  99%|█████████████████████████████████████████████████████████████████████████████████████████████████▊ | 185603/187765 [14:42<00:10, 210.39gene/s]


Unnamed: 0,gene_id,gene_name,group,p_constraint,p_pathogenicity,p_pLoF,p_const,p_unified
0,ENSG00000000419,DPM1,AFR (genomes),0.451656,0.266461,,0.250178,0.419736
1,ENSG00000000419,DPM1,AMR (exomes),0.654983,0.648871,,0.576296,0.803001
2,ENSG00000000419,DPM1,EAS (exomes),0.356055,0.675121,,0.782929,0.562411
3,ENSG00000000419,DPM1,EUR (exomes),0.331273,0.012301,,0.000020,0.035529
4,ENSG00000000419,DPM1,EUR-N (exomes),0.484493,0.208346,,0.059904,0.375240
...,...,...,...,...,...,...,...,...
150756,ENSG00000273217,ENSG00000273217,EUR (exomes),,,,,
150757,ENSG00000273217,ENSG00000273217,meta,,,,,
150758,ENSG00000273274,ZBTB8B,AFR (genomes),0.386409,0.743476,,0.302428,0.678464
150759,ENSG00000273274,ZBTB8B,EST (genomes),,,,,


In [14]:
# Read the saved model results
unified_model_df = pd.read_csv(results_path + "schizophrenia_unified_model_pvalues.tsv", sep='\t')

# Filter the DataFrame for significant p-values or specific gene names for closer examination
filter_mask = (unified_model_df['p_unified'] < 1e-4) | (unified_model_df['p_constraint'] < 1e-4)
gene_name_mask = (unified_model_df['gene_name'] == 'CACNA1A')
gene_name_mask2 = (unified_model_df['gene_name'].str.contains('CACNA1'))
gene_name_mask3 = (unified_model_df['gene_name'].isin(['POLG', 'DYNC1H1', 'GLRA1', 'PTEN', 'BRAT1', 'ALDH7A1', 'GAL']))

# Display the filtered DataFrame sorted by the unified p-value
pd.set_option('display.max_rows', 250)
unified_model_df[filter_mask].sort_values('p_unified')

Unnamed: 0,gene_id,gene_name,group,p_constraint,p_pathogenicity,p_pLoF,p_const,p_unified
66142,ENSG00000139323,POC1B,AMR (exomes),0.0,,,0.0,0.0
73801,ENSG00000145242,EPHA5,AFR (exomes),0.0,0.0,,0.0,0.0
146111,ENSG00000225921,NOL7,EUR-N (exomes),0.0,0.0,,0.0,0.0
121952,ENSG00000180917,CMTR2,FIN (exomes),0.0,0.0,,0.0,0.0
66968,ENSG00000139971,ARMH4,FIN (exomes),,0.0,,0.0,0.0
...,...,...,...,...,...,...,...,...
116804,ENSG00000176473,WDR25,EST (genomes),0.0,0.0,,0.0,
124756,ENSG00000183137,CEP57L1,EST (genomes),0.0,,,0.0,
125983,ENSG00000184012,TMPRSS2,EST (genomes),0.0,0.0,,0.0,
132814,ENSG00000188655,RNASE9,AMR (exomes),0.0,0.0,,0.0,


In [7]:
# filter_mask = (unified_model_df['p_comb'] < 1e-4) | (unified_model_df['p_value_overall'] < 1e-4)
filter_mask = (unified_model_df['p_unified'] < 1e-4) | (unified_model_df['p_constraint'] < 1e-4)
group_mask = (unified_model_df['group'] == 'DBS') | (unified_model_df['group'] == 'SWE')
gene_name_mask = (unified_model_df['gene_name'] == 'SLC2A1')

pd.set_option('display.max_rows', 550)
unified_model_df[filter_mask].sort_values('p_comb')

Unnamed: 0,gene_id,gene_name,group,p_prob_0,p_pathogenicity,p_const,p_overall,p_pLoF,p_comb
91053,ENSG00000163104,SMARCAD1,EST (genomes),0.0,0.0,0.0,0.0,0.1026751,0.0
138210,ENSG00000198914,POU3F3,meta,0.0,0.0,0.0,0.0,0.1994571,0.0
104788,ENSG00000169660,HEXD,EST (genomes),0.0,,0.0,0.0,1.0,0.0
88414,ENSG00000161265,U2AF1L4,EST (genomes),,,0.0,0.0,,0.0
82579,ENSG00000155393,HEATR3,EST (genomes),0.0,0.0,0.0,0.0,,0.0
63750,ENSG00000138134,STAMBPL1,FIN (exomes),,0.0,0.0,0.0,1.0,0.0
18715,ENSG00000099917,MED15,FIN (exomes),,,0.0,0.0,,0.0
41085,ENSG00000118655,DCLRE1B,EST (genomes),0.0,0.0,0.0,0.0,,0.0
133573,ENSG00000196792,STRN3,FIN (exomes),,0.0,0.0,0.0,,0.0
122339,ENSG00000182667,NTM,AMR (exomes),,,0.0,0.0,,0.0
