## Setup

In [1]:
!pip install numpy
!pip install pandas
!pip install scipy
!pip install hmmlearn
!pip install statsmodels



In [1]:
import gzip
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
import statsmodels.api as sm
from hmmlearn import hmm
from scipy.stats import fisher_exact
from tqdm import tqdm

# Suppress all warnings
warnings.filterwarnings("ignore")

In [3]:
# Project directories on Sherlock. Both values end with '/' because file
# names are appended to them by plain string concatenation.
data_path = '/oak/stanford/groups/mrivas/projects/wgs-constraint-llm/data/'
results_path = '/oak/stanford/groups/mrivas/projects/wgs-constraint-llm/osthoag/wgs-constraint-llm/results/'

# UKB variant table (gzip-compressed); it is read in the "Load data" section,
# which relies on its F_MISS and ID columns.
ukb_variants_file_path = data_path + 'ukb24310_c19.qced_nonrel.vmiss.gz'

## Define helper methods

In [4]:
def get_sequence(coverage_df, variants_df):
    """Build a 0/1 variant indicator restricted to the covered positions.

    Parameters
    ----------
    coverage_df : DataFrame
        Must have an integer 'pos' column; each value marks a position that is
        kept in the output. The values are used directly as array indices.
    variants_df : DataFrame
        Must have an integer 'pos' column; each value marks a position that
        carries a variant. The values are used directly as array indices.

    Returns
    -------
    observations : numpy.ndarray of float
        One entry per kept position: 1.0 if a variant is present, else 0.0.
    positions : numpy.ndarray of int
        The kept positions themselves, in ascending order and aligned with
        ``observations``.
    """
    covered_idx = coverage_df['pos'].to_numpy()
    variant_idx = variants_df['pos'].to_numpy()

    # Positions are used as indices, so the arrays need max(pos) + 1 slots
    n_slots = max(coverage_df['pos'].max(), variants_df['pos'].max()) + 1

    # Boolean mask: True at every position listed in coverage_df
    is_covered = np.zeros(n_slots, dtype=bool)
    is_covered[covered_idx] = True

    # Float indicator: 1 at every position listed in variants_df
    has_variant = np.zeros(n_slots)
    has_variant[variant_idx] = 1

    # Keep only the covered positions, together with their coordinates
    positions = np.flatnonzero(is_covered)
    observations = has_variant[is_covered]

    return observations, positions

def get_HMM_predictions(observations, model, order=2):
    """Return the model's per-window state probabilities for a 0/1 sequence.

    The sequence is cut into overlapping windows of length ``order``; window
    ``k`` holds ``observations[k], ..., observations[k + order - 1]``. Each
    window is summarised as a pair (number of zeros, number of ones) and the
    resulting count matrix is passed to ``model.predict_proba``.

    Parameters
    ----------
    observations : numpy.ndarray
        1-D array of 0/1 values.
    model : object
        Anything exposing ``predict_proba(X)``.
    order : int, default 2
        Window length. ``len(observations) - order`` windows are produced.

    Returns
    -------
    Whatever ``model.predict_proba`` returns for the count matrix.
    """
    # Column k is the sequence shifted by k, so each row is one window
    windows = np.stack(
        [observations[shift:shift - order] for shift in range(order)],
        axis=1,
    )

    # Per-window counts of each symbol: column 0 = zeros, column 1 = ones
    n_zeros = (windows == 0).sum(axis=1)
    n_ones = (windows == 1).sum(axis=1)
    window_counts = np.column_stack([n_zeros, n_ones])

    return model.predict_proba(window_counts)

def fit_HMM(observations, order=2):
    """Fit a two-state MultinomialHMM to windowed counts of a 0/1 sequence.

    The sequence is cut into overlapping windows of length ``order``; window
    ``k`` holds ``observations[k], ..., observations[k + order - 1]``. Each
    window is summarised as a pair (number of zeros, number of ones), so every
    row of the training matrix sums to ``order``.

    Parameters
    ----------
    observations : numpy.ndarray
        1-D array of 0/1 values.
    order : int, default 2
        Window length.

    Returns
    -------
    The fitted ``hmm.MultinomialHMM`` instance (2 components, random_state=10).
    """
    # Column k is the sequence shifted by k, so each row is one window
    windows = np.stack(
        [observations[shift:shift - order] for shift in range(order)],
        axis=1,
    )

    # Per-window counts of each symbol: column 0 = zeros, column 1 = ones
    n_zeros = (windows == 0).sum(axis=1)
    n_ones = (windows == 1).sum(axis=1)
    window_counts = np.column_stack([n_zeros, n_ones])

    # Fixed seed keeps the fit reproducible between runs
    model = hmm.MultinomialHMM(n_components=2, random_state=10)
    model.fit(window_counts)

    return model

def ols_regression(predictions_df):
    """Regress the observed variant indicator on the HMM's 'prob_0' column.

    Fits ``observation ~ const + prob_0`` by ordinary least squares, prints
    the statsmodels summary, and returns the overall F-test.

    Parameters
    ----------
    predictions_df : DataFrame
        Must provide the columns 'prob_0' (predictor) and 'observation'
        (response).

    Returns
    -------
    f_statistic : float
        F-statistic of the fitted model.
    p_value_f_statistic : float
        p-value associated with that F-statistic.
    """
    # Add a constant column so the model estimates an intercept
    design = sm.add_constant(predictions_df['prob_0'])

    # Fit the linear regression model
    fitted = sm.OLS(predictions_df['observation'], design).fit()

    # Build and print the summary once. The previous code set
    # `pvalues_precision` on a separate, throwaway summary() result; every
    # summary() call returns a new object, so that assignment never reached
    # the printed table and only cost a second summary computation.
    print(fitted.summary())

    # Overall F-test of the regression
    return fitted.fvalue, fitted.f_pvalue

def plot_hist_from_predictions(predictions_df):
    """Show side-by-side histograms of the observations and of 'prob_0'.

    Parameters
    ----------
    predictions_df : DataFrame
        Must provide the columns 'observation' and 'prob_0'.

    Returns
    -------
    None. The figure is displayed with ``plt.show()``.
    """
    # (column to plot, panel title, x-axis label) for the left and right panel
    panels = (
        ('observation', 'Histogram of Observations', 'Has a variant'),
        ('prob_0', 'Histogram of Predictions', 'Probability of 0'),
    )

    # One row, two columns
    fig, axs = plt.subplots(1, 2, figsize=(12, 5))

    for ax, (column, title, xlabel) in zip(axs, panels):
        ax.hist(predictions_df[column], edgecolor="blue")
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel('Frequency')

    # Adjust layout so titles and labels are not clipped
    plt.tight_layout()

    plt.show()

## Load data

In [5]:
# Read the compressed file into a pandas DataFrame
ukb_variants_df = pd.read_csv(ukb_variants_file_path, compression='gzip', sep='\t')

# Keep only variants with F_MISS < 0.1. Take an explicit copy so the column
# assignments below write to an independent frame rather than to a filtered
# slice (assigning to a slice is what triggers pandas' SettingWithCopyWarning).
ukb_variants_df = ukb_variants_df.loc[ukb_variants_df['F_MISS'] < 0.1].copy()

# Expand the 'chr:pos:ref:alt' ID column into separate columns
ukb_variants_df[['chr', 'pos', 'ref', 'alt']] = ukb_variants_df['ID'].str.split(':', expand=True)

# Convert pos to int (the split yields strings) so it can be used as an array index
ukb_variants_df['pos'] = ukb_variants_df['pos'].astype(int)

ukb_variants_df

Unnamed: 0,#CHROM,ID,MISSING_CT,OBS_CT,F_MISS,chr,pos,ref,alt
167,19,chr19:61721:C:G,38465,393852,0.097664,chr19,61721,C,G
168,19,chr19:61722:A:C,36449,393852,0.092545,chr19,61722,A,C
169,19,chr19:61722:A:G,36449,393852,0.092545,chr19,61722,A,G
170,19,chr19:61724:T:A,33923,393852,0.086131,chr19,61724,T,A
171,19,chr19:61727:G:A,30490,393852,0.077415,chr19,61727,G,A
...,...,...,...,...,...,...,...,...,...
28664428,19,chr19:58607407:G:T,38549,393852,0.097877,chr19,58607407,G,T
28664429,19,chr19:58607408:G:A,38520,393852,0.097803,chr19,58607408,G,A
28664430,19,chr19:58607408:G:T,38520,393852,0.097803,chr19,58607408,G,T
28664431,19,chr19:58607409:G:T,38675,393852,0.098197,chr19,58607409,G,T


## Get predictions for UKB chromosome 19 from the HMM trained on AoU chromosome 2

In [2]:
# Coverage files are named <prefix><chromosome number><suffix>
coverage_file_prefix = data_path + 'gnomad.genomes.r3.0.1.coverage_0.5_over10_chr'
coverage_file_suffix = '.summary.tsv.gz'

# Window length passed to the HMM helper functions
order = 2

# Load the HMM that was previously trained on AoU chromosome 2.
# NOTE: joblib.load unpickles the file, so only load models from a trusted location.
hmm_file_path = results_path + 'HMM_aou_wgs_chr2_model.joblib'
model = joblib.load(hmm_file_path)

# Read the chromosome 19 coverage file as a single 'pos' column
chr19_coverage_df = pd.read_csv(
    ''.join((coverage_file_prefix, '19', coverage_file_suffix)),
    names=['pos'],
    sep='\t',
)

chr19_coverage_df

In [None]:
# Read the chromosome 19 coverage file as a single 'pos' column
chr_coverage_df = pd.read_csv(
    ''.join((coverage_file_prefix, '19', coverage_file_suffix)),
    names=['pos'],
    sep='\t',
)

# The UKB variant table is used as-is
chr_variants_df = ukb_variants_df

# Build the 0/1 observation sequence and the matching genomic positions
observations, positions = get_sequence(chr_coverage_df, chr_variants_df)

# Apply the already-trained HMM (nothing is fitted in this cell)
probabilities = get_HMM_predictions(observations, model, order=order)

# One row per window start: the helper returns len(observations) - order rows,
# so the last `order` positions/observations are dropped to stay aligned
chr_predictions_df = pd.DataFrame({
    'chr': 'chr19',
    'pos': positions[:-order],
    'prob_0': probabilities[:, 0],
    'prob_1': probabilities[:, 1],
    'observation': observations[:-order],
})

# Checkpoint chr_predictions_df to disk to avoid a memory bottleneck
chr_predictions_df.to_csv(
    results_path + "HMM_ukb_constraint_predictions_chr19.tsv.gz",
    sep='\t',
    compression='gzip',
    index=False,
)

## Train HMM on UKB chromosome 19 and get predictions

In [7]:
# Coverage files are named <prefix><chromosome number><suffix>
coverage_file_prefix = data_path + 'gnomad.genomes.r3.0.1.coverage_0.5_over10_chr'
coverage_file_suffix = '.summary.tsv.gz'

# Window length passed to the HMM helper functions
order = 2

# Destination for the HMM trained in this cell
hmm_file_path = results_path + 'HMM_ukb_wgs_chr19_model.joblib'

# Inputs: the UKB variant table as-is (no per-chromosome filtering happens
# here) and the chromosome 19 coverage file
chr_variants_df = ukb_variants_df
chr_coverage_df = pd.read_csv(
    ''.join((coverage_file_prefix, '19', coverage_file_suffix)),
    names=['pos'],
    sep='\t',
)

# Build the 0/1 observation sequence and the matching genomic positions
observations, positions = get_sequence(chr_coverage_df, chr_variants_df)

# Fit the HMM on the UKB chromosome 19 observations
model = fit_HMM(observations, order=order)

# Persist the fitted model so it does not have to be retrained
joblib.dump(model, hmm_file_path)

# Score the same observations with the freshly fitted model
probabilities = get_HMM_predictions(observations, model, order=order)

# One row per window start: the helper returns len(observations) - order rows,
# so the last `order` positions/observations are dropped to stay aligned
chr_predictions_df = pd.DataFrame({
    'chr': 'chr19',
    'pos': positions[:-order],
    'prob_0': probabilities[:, 0],
    'prob_1': probabilities[:, 1],
    'observation': observations[:-order],
})

# Checkpoint chr_predictions_df to disk to avoid a memory bottleneck
chr_predictions_df.to_csv(
    results_path + "HMM_ukb19_constraint_predictions_chr19.tsv.gz",
    sep='\t',
    compression='gzip',
    index=False,
)

MultinomialHMM has undergone major changes. The previous version was implementing a CategoricalHMM (a special case of MultinomialHMM). This new implementation follows the standard definition for a Multinomial distribution (e.g. as in https://en.wikipedia.org/wiki/Multinomial_distribution). See these issues for details:
https://github.com/hmmlearn/hmmlearn/issues/335
https://github.com/hmmlearn/hmmlearn/issues/340


## Compare results to AoU Data

In [5]:
# Load the two chromosome 19 prediction tables (AoU and UKB)
aou_chr19_predictions_df = pd.read_csv(
    results_path + "HMM_aou_constraint_predictions_chr19.tsv.gz",
    sep='\t',
    compression='gzip',
)
ukb_chr19_predictions_df = pd.read_csv(
    results_path + "HMM_ukb19_constraint_predictions_chr19.tsv.gz",
    sep='\t',
    compression='gzip',
)

# Inner join on ('chr', 'pos'); columns present in both tables get the
# '_aou' / '_ukb' suffix
merged_df = aou_chr19_predictions_df.merge(
    ukb_chr19_predictions_df,
    on=['chr', 'pos'],
    suffixes=('_aou', '_ukb'),
)

merged_df

Unnamed: 0,chr,pos,prob_0_aou,prob_1_aou,observation_aou,prob_0_ukb,prob_1_ukb,observation_ukb
0,chr19,60645,2.621388e-35,1.000000,0.0,1.000000,3.210649e-16,0.0
1,chr19,60646,6.635288e-01,0.336471,0.0,0.981195,1.880520e-02,0.0
2,chr19,60647,8.721584e-01,0.127842,0.0,0.975906,2.409405e-02,0.0
3,chr19,60648,9.377551e-01,0.062245,0.0,0.974418,2.558151e-02,0.0
4,chr19,60649,9.583747e-01,0.041625,0.0,0.974000,2.599985e-02,0.0
...,...,...,...,...,...,...,...,...
55042752,chr19,58607370,5.260989e-04,0.999474,0.0,0.001104,9.988959e-01,0.0
55042753,chr19,58607371,2.120975e-06,0.999998,1.0,0.000008,9.999916e-01,1.0
55042754,chr19,58607372,2.271861e-06,0.999998,1.0,0.001104,9.988959e-01,1.0
55042755,chr19,58607373,3.873230e-03,0.996127,1.0,0.001105,9.988955e-01,0.0


In [7]:
# Response: UKB observation; predictor: UKB 'prob_0' plus an intercept column
y = merged_df['observation_ukb']
X = sm.add_constant(merged_df['prob_0_ukb'])

# Binomial-family GLM, i.e. a logistic regression
model = sm.GLM(y, X, family=sm.families.Binomial()).fit()

print(model.pseudo_rsquared('McFadden'))
print(model.summary())

0.4297016502484097
                 Generalized Linear Model Regression Results                  
Dep. Variable:        observation_ukb   No. Observations:             55042757
Model:                            GLM   Df Residuals:                 55042755
Model Family:                Binomial   Df Model:                            1
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -2.0827e+07
Date:                Sun, 18 Feb 2024   Deviance:                   4.1654e+07
Time:                        17:35:52   Pearson chi2:                 3.27e+07
No. Iterations:                    13   Pseudo R-squ. (CS):             0.4346
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.9509      0.000 

In [6]:
# Response: AoU observation; predictor: UKB 'prob_0' plus an intercept column
y = merged_df['observation_aou']
X = sm.add_constant(merged_df['prob_0_ukb'])

# Binomial-family GLM, i.e. a logistic regression
model = sm.GLM(y, X, family=sm.families.Binomial()).fit()

print(model.pseudo_rsquared('McFadden'))
print(model.summary())

0.03993771719471406
                 Generalized Linear Model Regression Results                  
Dep. Variable:        observation_aou   No. Observations:             55042757
Model:                            GLM   Df Residuals:                 55042755
Model Family:                Binomial   Df Model:                            1
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -3.4186e+07
Date:                Sun, 18 Feb 2024   Deviance:                   6.8371e+07
Time:                        17:31:12   Pearson chi2:                 5.54e+07
No. Iterations:                     5   Pseudo R-squ. (CS):            0.05036
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.2899      0.000

In [8]:
# Response: AoU 'prob_0'; predictor: UKB 'prob_0' plus an intercept column
y = merged_df['prob_0_aou']
X = sm.add_constant(merged_df['prob_0_ukb'])

# Ordinary least squares (both variables are continuous probabilities,
# so this is a linear rather than a logistic regression)
model = sm.OLS(y, X).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:             prob_0_aou   R-squared:                       0.095
Model:                            OLS   Adj. R-squared:                  0.095
Method:                 Least Squares   F-statistic:                 5.767e+06
Date:                Sun, 18 Feb 2024   Prob (F-statistic):               0.00
Time:                        17:38:12   Log-Likelihood:            -1.8981e+07
No. Observations:            55042757   AIC:                         3.796e+07
Df Residuals:                55042755   BIC:                         3.796e+07
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.1957   5.71e-05   3430.245      0.0