## Setup

In [None]:
!pip install numpy
!pip install pandas
!pip install statsmodels

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
import seaborn as sns
import statsmodels.api as sm
import gzip
import warnings

# Suppress all warnings
# NOTE(review): with no category/module argument this filter hides every warning
# for the rest of the kernel session (e.g. NumPy RuntimeWarnings from element-wise
# divisions and pandas warnings about assigning to a filtered frame). Consider
# narrowing or removing it while debugging.
warnings.filterwarnings("ignore")

In [None]:
# Project locations on Sherlock. Every directory string keeps its trailing slash
# because file names are appended to it by plain string concatenation.
project_root = '/oak/stanford/groups/mrivas/projects/wgs-constraint-llm/'
data_path = project_root + 'data/'
results_path = project_root + 'osthoag/wgs-constraint-llm/results/'

# Coverage summary file inside the data directory
coverage_file_path = f'{data_path}gnomad.exomes.v4.0.coverage.summary.tsv.bgz'

## Load data

In [None]:
# Absolute path prefix (results directory + model name) shared by the model's files
model_name = results_path + 'HMM_rgc_0.9_over20_chr2'

# Both prediction tables are gzip-compressed, tab-separated files
tsv_gz_options = dict(compression='gzip', sep='\t')
aou_predictions_df = pd.read_csv(f'{model_name}_predictions_aou_wes.tsv.gz', **tsv_gz_options)
rgc_predictions_df = pd.read_csv(f'{model_name}_predictions_rgc_wes.tsv.gz', **tsv_gz_options)

# Join the two cohorts on ('chr', 'pos') with the default inner join; columns that
# exist in both tables get an '_aou' / '_rgc' suffix so they can be told apart
merged_df = aou_predictions_df.merge(
    rgc_predictions_df,
    on=['chr', 'pos'],
    suffixes=('_aou', '_rgc'),
)

merged_df

In [None]:
# Drop the prob_1_* columns before saving. errors='ignore' keeps this cell
# re-runnable: a second execution finds the columns already gone and would
# otherwise raise KeyError.
merged_df = merged_df.drop(columns=['prob_1_aou', 'prob_1_rgc'], errors='ignore')

# Save results for future analysis
merged_df.to_csv(model_name + '_joint_predictions_aou_rgc_wes.tsv.gz', index=False, compression='gzip', sep='\t')

## Compare predictions for AoU data and RGC data for WES (excluding chr2)

In [None]:
# Define model name (and absolute path)
model_name = results_path + 'HMM_rgc_0.9_over20_chr2'

# Reload the joint predictions from disk so this section can run on its own
merged_df = pd.read_csv(model_name + '_joint_predictions_aou_rgc_wes.tsv.gz', compression='gzip', sep='\t')

# Filter out chromosome 2. The explicit copy makes the result an independent
# frame, so columns added to it later are unambiguous writes rather than
# assignments to a slice of the frame that was just loaded.
merged_df = merged_df[merged_df['chr'] != 'chr2'].copy()

### Perform $\chi^2$ analysis for joint distribution

In [None]:
# Define the bins: equal-width probability intervals between 0 and 1
bins = np.arange(0, 1.1, .1)
n_bins = len(bins) - 1

# Bucket the DataFrame to obtain marginal distributions
merged_df['prob_0_aou_bin'] = pd.cut(merged_df['prob_0_aou'], bins=bins, precision=2)
merged_df['prob_0_rgc_bin'] = pd.cut(merged_df['prob_0_rgc'], bins=bins, precision=2)

# Marginal bin frequencies in bin order (empty bins are kept with frequency 0);
# their indexes also serve as the heatmap tick labels
marginal_aou = merged_df['prob_0_aou_bin'].value_counts(normalize=True).sort_index()
marginal_rgc = merged_df['prob_0_rgc_bin'].value_counts(normalize=True).sort_index()

# Observed contingency table: AoU bins on rows, RGC bins on columns.
# It is built from the categorical codes so that every bin combination gets a
# cell, whichever bins happen to be empty in the data. Rows whose value falls
# outside the bins in either column (code -1) are excluded.
aou_codes = merged_df['prob_0_aou_bin'].cat.codes.to_numpy()
rgc_codes = merged_df['prob_0_rgc_bin'].cat.codes.to_numpy()
in_range = (aou_codes >= 0) & (rgc_codes >= 0)
flat_cell = aou_codes[in_range].astype(np.int64) * n_bins + rgc_codes[in_range]
actual_joint = np.bincount(flat_cell, minlength=n_bins * n_bins).reshape(n_bins, n_bins)

# Expected counts under independence, derived from the observed table's own
# margins so that observed and expected tables share the same grand total
row_counts = actual_joint.sum(axis=1)
col_counts = actual_joint.sum(axis=0)
expected_joint = np.outer(row_counts, col_counts) / actual_joint.sum()

# Cells with an expected count of 0 give 0/0 = NaN, which is mapped to 0
with np.errstate(divide='ignore', invalid='ignore'):
    # Relative deviation of observed from expected counts
    percent_diff = np.nan_to_num((actual_joint - expected_joint) / expected_joint)

    # Per-cell contribution to the chi-squared statistic
    chi_sqr = np.nan_to_num((expected_joint - actual_joint) ** 2 / expected_joint)

# One entry per panel, in row-major order: (matrix, title, log colour scale?)
panels = [
    (expected_joint, 'Expected Joint Distribution', True),
    (actual_joint, 'Actual Joint Distribution', True),
    (percent_diff, 'Percent Difference of Observed vs Expected Joint Distribution', False),
    (chi_sqr, 'Chi-Squared Statistic for Observed vs Expected Joint Distribution', False),
]

# 2 x 2 grid of heatmaps
fig, axs = plt.subplots(2, 2, figsize=(12, 12))

for panel_ax, (matrix, title, log_scale) in zip(axs.flat, panels):
    # Only the two count matrices are drawn on a logarithmic colour scale
    scale_kws = {'norm': LogNorm()} if log_scale else {}
    sns.heatmap(
        matrix,
        cmap='Blues',
        annot=False,
        fmt=".3f",
        cbar=True,
        xticklabels=marginal_rgc.index,
        yticklabels=marginal_aou.index,
        ax=panel_ax,
        **scale_kws,
    )
    panel_ax.set_xlabel('RGC Constraint Probability')
    panel_ax.set_ylabel('AoU Constraint Probability')
    panel_ax.set_title(title)
    # Put the lowest probability bin at the bottom of the y axis
    panel_ax.invert_yaxis()

# Adjust layout
plt.tight_layout()

# Show the plot
plt.show()

In [None]:
from scipy import stats

# Pearson chi-squared statistic: sum of the per-cell contributions
chi_sqr_statistic = np.sum(chi_sqr)

# Total number of observations in the contingency table
n = np.sum(actual_joint)

# An independence test on an r x c table has (r - 1) * (c - 1) degrees of
# freedom; it does not depend on the sample size. Rows/columns without any
# observation carry no information and are not counted.
occupied_rows = np.count_nonzero(actual_joint.sum(axis=1))
occupied_cols = np.count_nonzero(actual_joint.sum(axis=0))
dof = (occupied_rows - 1) * (occupied_cols - 1)

# The survival function is 1 - cdf without the rounding loss for tiny p-values
p_val = stats.chi2.sf(chi_sqr_statistic, dof)
print(chi_sqr_statistic, n, p_val)
print('degrees of freedom:', dof)

In [None]:
# Tick labels for the bin edges 0.0, 0.1, ..., 1.0 (shared by both axes)
xticklabels = [f'{i:.1f}' for i in np.arange(0, 1.1, 0.1)]
yticklabels = [f'{i:.1f}' for i in np.arange(0, 1.1, 0.1)]


def _plot_bin_heatmap(matrix, title, save_name, title_kws=None, **heatmap_kws):
    """Draw one bin-by-bin heatmap, save it under `results_path`, and show it.

    Parameters
    ----------
    matrix : 2-D array-like
        Values to plot (AoU bins on rows, RGC bins on columns).
    title : str
        Axes title.
    save_name : str
        File name appended to `results_path` for `plt.savefig`.
    title_kws : dict, optional
        Extra keyword arguments for `set_title` (e.g. position).
    **heatmap_kws
        Extra keyword arguments for `sns.heatmap` (e.g. `norm`).

    Returns
    -------
    (fig, ax) : the created figure and the heatmap axes.
    """
    fig, ax = plt.subplots(1, figsize=(7, 6))
    ax = sns.heatmap(matrix, cmap='Blues', annot=False, fmt=".3f", cbar=True, ax=ax, **heatmap_kws)
    ax.set_title(title, fontsize=14, **(title_kws or {}))
    ax.set_xlabel('RGC Constraint Probability', fontsize=12)
    ax.set_ylabel('AoU Constraint Probability', fontsize=12)
    ax.invert_yaxis()
    # Ticks sit on the cell edges so they can be labelled with the bin edges
    ax.set_xticks(range(len(xticklabels)))
    ax.set_xticklabels(xticklabels)
    ax.set_yticks(range(len(yticklabels)))
    ax.set_yticklabels(yticklabels)

    # Adjust layout
    plt.tight_layout()

    # Save before showing the figure
    plt.savefig(results_path + save_name)
    plt.show()
    return fig, ax


# Plot actual joint distribution with log scale
fig, ax1 = _plot_bin_heatmap(
    actual_joint,
    'Observed Joint Distribution',
    "Figure 5a: observed joint distribution of RGC vs AoU WES constraint predictions",
    norm=LogNorm(),
)

# Plot chi-squared statistic (raw string: '\c' is not a valid escape sequence)
fig, ax2 = _plot_bin_heatmap(
    chi_sqr,
    r'$\chi^2$ for Observed vs Expected Joint Distribution',
    "Figure 2: chi-square statistics of RGC vs AoU WES constraint predictions",
    title_kws=dict(y=1, x=0.55),
)

### Perform correspondence analysis for joint distribution

In [None]:
import numpy as np
from scipy.sparse.linalg import svds
from sklearn.preprocessing import StandardScaler

# Calculate row and column totals
row_totals = np.sum(actual_joint, axis=1)
col_totals = np.sum(actual_joint, axis=0)

# Add a small constant to avoid division by zero
row_totals = row_totals + 1e-10
col_totals = col_totals + 1e-10

# Calculate row and column profiles (each row / column divided by its total)
row_profiles = actual_joint / row_totals[:, None]
col_profiles = actual_joint / col_totals[None, :]

# Replace inf or NaN values with a finite number
row_profiles = np.nan_to_num(row_profiles, nan=0.0, posinf=0.0, neginf=0.0)
col_profiles = np.nan_to_num(col_profiles, nan=0.0, posinf=0.0, neginf=0.0)

# Matrix to decompose: profile difference scaled by 1/sqrt(total) on both sides.
# NOTE(review): textbook correspondence analysis decomposes the standardized
# residuals of the table (observed minus independence model); this uses the
# difference between row and column profiles instead - confirm this is intended.
D_r = np.diag(1.0 / np.sqrt(row_totals))
D_c = np.diag(1.0 / np.sqrt(col_totals))
S = D_r @ (row_profiles - col_profiles) @ D_c

# Perform singular value decomposition
U, s, Vt = svds(S, k=min(S.shape) - 1)

# svds does not return the singular triplets largest-first. Sort them in
# descending order so that columns 0 and 1 of the coordinates below are the
# two leading axes.
order = np.argsort(s)[::-1]
U, s, Vt = U[:, order], s[order], Vt[order, :]

# Calculate row and column coordinates
F_r = D_r @ U @ np.diag(np.sqrt(s))
F_c = D_c @ Vt.T @ np.diag(np.sqrt(s))

# Standardize the coordinates: the scaler is fitted on the row coordinates and
# the same centring/scaling is then applied to the column coordinates
scaler = StandardScaler()
F_r = scaler.fit_transform(F_r)
F_c = scaler.transform(F_c)

In [None]:
import matplotlib.pyplot as plt

# Hand-tuned (dx, dy) label shifts for individual AoU points, keyed by bin
# position; every other AoU label is drawn exactly at its point
aou_label_shift = {
    5: (-0.25, -0.1),
    2: (0, 0.025),
    9: (0, -0.05),
}

# Create a new figure
plt.figure(figsize=(10, 10))

# AoU bins (rows): points in blue, each annotated with its bin interval
plt.scatter(F_r[:, 0], F_r[:, 1], color='blue', label='AoU')
for i, label in enumerate(marginal_aou.index):
    dx, dy = aou_label_shift.get(i, (0, 0))
    plt.text(F_r[i, 0] + dx, F_r[i, 1] + dy, label, color='blue')

# RGC bins (columns): points in red, labels shifted left of and below the point
plt.scatter(F_c[:, 0], F_c[:, 1], color='red', label='RGC')
for (x, y), label in zip(F_c[:, :2], marginal_rgc.index):
    plt.text(x - 0.75, y - 0.075, label, color='red')

# Legend and title
plt.legend()
plt.title("BiPlot of AoU and RGC joint distribution bins")

# Show the plot
plt.show()

### Evaluate correlation via linear regression (OLS)

In [None]:
# Rows used for the plot: everything except chromosome 2
non_chr2_df = merged_df[merged_df['chr'] != 'chr2']

# Appearance of the hexbin joint plot
hex_style = dict(
    kind='hex',
    cmap='Blues',
    norm=LogNorm(),              # logarithmic colour scale for the hexbin counts
    marginal_kws=dict(bins=10),  # 10 bins in each marginal histogram
    gridsize=9,
)

# `ax` is the grid object returned by sns.jointplot, not a single Axes
ax = sns.jointplot(data=non_chr2_df, x='prob_0_aou', y='prob_0_rgc', **hex_style)

# Axis labels and figure title (placed slightly above the top of the figure)
ax.set_axis_labels('AoU Constraint Probability', 'RGC Constraint Probability', fontsize=11)
ax.fig.suptitle(r'Joint Distribution of RGC and AoU Constraint Probabilities for WES', y=1.05, x=0.5, fontsize=12)

# Dedicated axes at the right edge of the figure to hold the colourbar
cbar_ax = ax.fig.add_axes([1, 0.1, 0.03, 0.6])
cb = plt.colorbar(cax=cbar_ax)
cb.set_label('Log-Scaled Count')

# Save, then show the plot
plt.savefig(results_path + "Figure 2: RGC vs AoU WES constraint predictions")
plt.show()

In [None]:
# Response: RGC probability; predictor: AoU probability plus an intercept column
y = merged_df['prob_0_rgc']
X = sm.add_constant(merged_df['prob_0_aou'])

# Fit an ordinary least squares (linear) regression
model = sm.OLS(y, X).fit()

print(model.summary())