### Housekeeping (HK) vs Transcription Factor (TF) NaN Distribution

In [None]:
import pandas as pd
import pickle

# Read the tab-separated matrix; its first column becomes the row index.
df = pd.read_csv(
    "/home/logs/jtorresb/yeastformer/yeast/yeast_data/output/yeast_master_matrix_shifted.csv",
    sep='\t',
    index_col=0,
)

# Unpickle the two gene-ID collections (housekeeping first, then transcription factors).
with open("/home/logs/jtorresb/yeastformer/yeast/yeast_data/genes_info/hk_genes.pkl", "rb") as hk_file:
    housekeeping_genes = pickle.load(hk_file)

with open("/home/logs/jtorresb/yeastformer/yeast/yeast_data/genes_info/tf_genes.pkl", "rb") as tf_file:
    transcription_factors = pickle.load(tf_file)

# Drop every ID that is not a row label of the matrix, so the .loc lookups
# below cannot fail on an unknown gene.
row_labels = df.index
housekeeping_genes = [gene_id for gene_id in housekeeping_genes if gene_id in row_labels]
transcription_factors = [gene_id for gene_id in transcription_factors if gene_id in row_labels]

# Per-gene NaN counts: select the group's rows, flag missing cells, sum across columns.
hk_rows = df.loc[housekeeping_genes]
tf_rows = df.loc[transcription_factors]
hk_genes_nans = hk_rows.isnull().sum(axis="columns")
tf_genes_nans = tf_rows.isnull().sum(axis="columns")

# Group-level summaries: mean NaNs per gene and total NaNs over the whole group.
avg_nans_hk = hk_genes_nans.mean()
total_nans_hk = hk_genes_nans.sum()
avg_nans_tf = tf_genes_nans.mean()
total_nans_tf = tf_genes_nans.sum()

# Report the averages first ...
print(f"Average number of missing values (NaNs) per housekeeping gene: {avg_nans_hk}")
print(f"Average number of missing values (NaNs) per transcription factor gene: {avg_nans_tf}")

# ... then the totals.
print(f"Total number of missing values (NaNs) in housekeeping genes: {total_nans_hk}")
print(f"Total number of missing values (NaNs) in transcription factor genes: {total_nans_tf}")

### Median Check: Housekeeping (HK) vs Transcription Factors (TF)

#### Plot

In [None]:
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Unpickle the per-gene normalization factors (mapping: gene_id -> factor).
with open("/home/logs/jtorresb/yeastformer/yeast/yeast_data/output/yeast_median_dict_quantile.pkl", "rb") as factors_file:
    normalization_factors = pickle.load(factors_file)

# Unpickle the housekeeping gene IDs.
with open("/home/logs/jtorresb/yeastformer/yeast/yeast_data/genes_info/hk_genes.pkl", "rb") as hk_file:
    housekeeping_genes = pickle.load(hk_file)

# Unpickle the transcription factor gene IDs.
with open("/home/logs/jtorresb/yeastformer/yeast/yeast_data/genes_info/tf_genes.pkl", "rb") as tf_file:
    transcription_factors = pickle.load(tf_file)

# Collect the factors of each group; IDs without an entry in the mapping are skipped.
all_values = list(normalization_factors.values())
housekeeping_values = [
    normalization_factors[gene_id]
    for gene_id in housekeeping_genes
    if gene_id in normalization_factors
]
tf_values = [
    normalization_factors[gene_id]
    for gene_id in transcription_factors
    if gene_id in normalization_factors
]

# Mean factor per group, computed once and shared by the plot and the printout.
group_means = [np.mean(all_values), np.mean(housekeeping_values), np.mean(tf_values)]

# Two-column frame for seaborn. This binds the name `df` to the 3-row summary.
df = pd.DataFrame({
    "Normalization Factor": group_means,
    "Category": ["All genes", "Housekeeping", "Transcription factors"]
})

# One bar per category, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(5, 4))
sns.barplot(data=df, x="Category", y="Normalization Factor",
            palette=["#4878d0", "#e07b39", "#6aaf48"], ax=ax)

ax.set_xlabel("")
ax.set_ylabel("Average Non-zero Median Expression")
ax.set_title("Average Normalization Factors")
plt.xticks(rotation=20)

plt.show()

# Raw numbers behind the bars, in the same order as the categories.
print(group_means)

#### Statistical Significance

In [None]:
import random
import matplotlib.pyplot as plt
import pickle
import numpy as np

# Fixed seed for the Monte Carlo sampler so the p-value and histogram are
# identical on every Restart & Run All.
RANDOM_SEED = 42

# Load normalization factors (dictionary: {gene_id: normalization_factor})
# NOTE: pickle.load can execute code embedded in the file; only load trusted project artifacts.
with open("/home/logs/jtorresb/yeastformer/yeast/yeast_data/output/yeast_median_dict_quantile.pkl", "rb") as f:
    gene_median_nonzero_dict = pickle.load(f)

# Load housekeeping gene IDs
with open("/home/logs/jtorresb/yeastformer/yeast/yeast_data/genes_info/hk_genes.pkl", "rb") as f:
    housekeeping_genes = pickle.load(f)

# Keep only housekeeping genes that have an entry in the normalization factors dictionary
hk_values = [gene_median_nonzero_dict[gene] for gene in housekeeping_genes if gene in gene_median_nonzero_dict]
n = len(hk_values)

if n == 0:
    print("No housekeeping gene values found in the normalization factors dictionary.")
else:
    # Observed statistic: mean factor over the housekeeping genes
    observed_mean = np.mean(hk_values)

    # Number of Monte Carlo draws. Each draw is a random subset of n genes taken
    # without replacement from all genes in the dictionary, not a permutation.
    mcmax = 9999
    mean_array = np.zeros(mcmax)
    all_genes = list(gene_median_nonzero_dict.keys())

    # Dedicated generator: reproducible and independent of the global `random` state
    rng = random.Random(RANDOM_SEED)

    # Build the empirical null distribution of the mean for gene sets of size n
    for i in range(mcmax):
        sampled_keys = rng.sample(all_genes, n)
        sampled_values = [gene_median_nonzero_dict[k] for k in sampled_keys]
        mean_array[i] = np.mean(sampled_values)

    # One-sided (upper-tail) empirical p-value. The +1 in numerator and denominator
    # counts the observed statistic as one of the draws, so p can never be exactly 0.
    p_value = (np.sum(mean_array >= observed_mean) + 1) / (mcmax + 1)
    print(f"Observed mean for housekeeping genes: {observed_mean}")
    print(f"P-value: {p_value}")

    # Plot histogram of the sampled null distribution and mark the observed mean
    plt.figure(figsize=(10, 6))
    plt.hist(mean_array, bins=30, color='skyblue', edgecolor='black')
    plt.axvline(observed_mean, color='red', linestyle='dashed', linewidth=2,
                label=f'Observed Housekeeping Mean: {observed_mean:.3f}')
    plt.title('Empirical Null Distribution of Mean Normalization Factors\nvia Random Sampling', fontsize=14)
    plt.xlabel('Mean Normalization Factor', fontsize=12)
    plt.ylabel('Frequency (Number of Samples)', fontsize=12)
    plt.legend(fontsize=12)
    plt.grid(True)
    plt.show()

In [None]:
import random
import matplotlib.pyplot as plt
import pickle
import numpy as np

# Fixed seed for the Monte Carlo sampler so the p-value and histogram are
# identical on every Restart & Run All.
RANDOM_SEED = 42

# Load normalization factors (dictionary: {gene_id: normalization_factor})
# NOTE: pickle.load can execute code embedded in the file; only load trusted project artifacts.
with open("/home/logs/jtorresb/yeastformer/yeast/yeast_data/output/yeast_median_dict_quantile.pkl", "rb") as f:
    gene_median_nonzero_dict = pickle.load(f)

# Load transcription factor (TF) gene IDs
with open("/home/logs/jtorresb/yeastformer/yeast/yeast_data/genes_info/tf_genes.pkl", "rb") as f:
    tf_genes = pickle.load(f)

# Keep only TF genes that have an entry in the normalization factors dictionary
tf_values = [gene_median_nonzero_dict[gene] for gene in tf_genes if gene in gene_median_nonzero_dict]
n = len(tf_values)

if n == 0:
    print("No transcription factor values found in the normalization factors dictionary.")
else:
    # Observed statistic: mean factor over the transcription factors
    observed_mean = np.mean(tf_values)

    # Number of Monte Carlo draws. Each draw is a random subset of n genes taken
    # without replacement from all genes in the dictionary, not a permutation.
    mcmax = 9999
    mean_array = np.zeros(mcmax)
    all_genes = list(gene_median_nonzero_dict.keys())

    # Dedicated generator: reproducible and independent of the global `random` state
    rng = random.Random(RANDOM_SEED)

    # Build the empirical null distribution of the mean for gene sets of size n
    for i in range(mcmax):
        sampled_keys = rng.sample(all_genes, n)
        sampled_values = [gene_median_nonzero_dict[k] for k in sampled_keys]
        mean_array[i] = np.mean(sampled_values)

    # One-sided (upper-tail) empirical p-value. The +1 in numerator and denominator
    # counts the observed statistic as one of the draws, so p can never be exactly 0.
    # NOTE(review): this tests "TF mean is HIGHER than random gene sets". If the
    # hypothesis is that TFs are lower, the comparison should be `<=` - confirm.
    p_value = (np.sum(mean_array >= observed_mean) + 1) / (mcmax + 1)
    print(f"Observed mean for transcription factors: {observed_mean}")
    print(f"P-value: {p_value}")

    # Plot histogram of the sampled null distribution and mark the observed mean
    plt.figure(figsize=(10, 6))
    plt.hist(mean_array, bins=30, color='skyblue', edgecolor='black')
    plt.axvline(observed_mean, color='red', linestyle='dashed', linewidth=2,
                label=f'Observed TF Mean: {observed_mean:.3f}')
    plt.title('Empirical Null Distribution of Mean Normalization Factors\nvia Random Sampling for Transcription Factors', fontsize=14)
    plt.xlabel('Mean Normalization Factor', fontsize=12)
    plt.ylabel('Frequency (Number of Samples)', fontsize=12)
    plt.legend(fontsize=12)
    plt.grid(True)
    plt.show()

In [None]:
import random
import matplotlib.pyplot as plt
import pickle
import numpy as np

# Fixed seed for the permutation shuffles so the p-value and histogram are
# identical on every Restart & Run All.
RANDOM_SEED = 42

# Load normalization factors (dictionary: {gene_id: normalization_factor})
# NOTE: pickle.load can execute code embedded in the file; only load trusted project artifacts.
with open("/home/logs/jtorresb/yeastformer/yeast/yeast_data/output/yeast_median_dict_quantile.pkl", "rb") as f:
    gene_median_nonzero_dict = pickle.load(f)

# Load housekeeping gene IDs
with open("/home/logs/jtorresb/yeastformer/yeast/yeast_data/genes_info/hk_genes.pkl", "rb") as f:
    housekeeping_genes = pickle.load(f)

# Load transcription factor (TF) gene IDs
with open("/home/logs/jtorresb/yeastformer/yeast/yeast_data/genes_info/tf_genes.pkl", "rb") as f:
    tf_genes = pickle.load(f)

# Keep only genes that have an entry in the normalization factors dictionary
hk_values = [gene_median_nonzero_dict[gene] for gene in housekeeping_genes if gene in gene_median_nonzero_dict]
tf_values = [gene_median_nonzero_dict[gene] for gene in tf_genes if gene in gene_median_nonzero_dict]

n_hk = len(hk_values)
n_tf = len(tf_values)

if n_hk == 0 or n_tf == 0:
    print("No valid housekeeping or transcription factor gene values found in the normalization factors dictionary.")
else:
    # Observed statistic: difference of group means (HK - TF)
    observed_diff = np.mean(hk_values) - np.mean(tf_values)

    # Number of label permutations
    mcmax = 9999
    diff_array = np.zeros(mcmax)
    # New list (concatenation), so the in-place shuffles below leave hk_values/tf_values untouched
    combined_values = hk_values + tf_values

    # Dedicated generator: reproducible and independent of the global `random` state
    rng = random.Random(RANDOM_SEED)

    # Permutation test: reshuffle the pooled values, then split them back into
    # pseudo-groups with the original sizes (first n_hk vs. the rest).
    for i in range(mcmax):
        rng.shuffle(combined_values)
        perm_hk = combined_values[:n_hk]
        perm_tf = combined_values[n_hk:]
        diff_array[i] = np.mean(perm_hk) - np.mean(perm_tf)

    # Two-tailed empirical p-value on |difference|. The +1 in numerator and denominator
    # counts the observed statistic as one of the permutations, so p can never be exactly 0.
    p_value = (np.sum(np.abs(diff_array) >= np.abs(observed_diff)) + 1) / (mcmax + 1)
    print(f"Observed mean difference (HK - TF): {observed_diff}")
    print(f"P-value: {p_value}")

    # Plot histogram of the permutation distribution and mark the observed difference
    plt.figure(figsize=(10, 6))
    plt.hist(diff_array, bins=30, color='skyblue', edgecolor='black')
    plt.axvline(observed_diff, color='red', linestyle='dashed', linewidth=2,
                label=f'Observed Difference: {observed_diff:.3f}')
    plt.title('Permutation Test: Housekeeping vs Transcription Factors', fontsize=14)
    plt.xlabel('Mean Normalization Factor Difference (HK - TF)', fontsize=12)
    plt.ylabel('Frequency (Number of Samples)', fontsize=12)
    plt.legend(fontsize=12)
    plt.grid(True)
    plt.show()


In [19]:
# `pickle` is imported here so the cell does not depend on an earlier cell
# having been executed first.
import pickle

from scipy.stats import mannwhitneyu

# Load normalization factors (dictionary: {gene_id: normalization_factor})
# NOTE: pickle.load can execute code embedded in the file; only load trusted project artifacts.
with open("/home/logs/jtorresb/yeastformer/yeast/yeast_data/output/yeast_median_dict_quantile.pkl", "rb") as f:
    gene_median_nonzero_dict = pickle.load(f)

# Load housekeeping gene IDs
with open("/home/logs/jtorresb/yeastformer/yeast/yeast_data/genes_info/hk_genes.pkl", "rb") as f:
    housekeeping_genes = pickle.load(f)

# Load transcription factor (TF) gene IDs
with open("/home/logs/jtorresb/yeastformer/yeast/yeast_data/genes_info/tf_genes.pkl", "rb") as f:
    tf_genes = pickle.load(f)

# Keep only genes that have an entry in the normalization factors dictionary
hk_values = [gene_median_nonzero_dict[gene] for gene in housekeeping_genes if gene in gene_median_nonzero_dict]
tf_values = [gene_median_nonzero_dict[gene] for gene in tf_genes if gene in gene_median_nonzero_dict]

# Same empty-group guard as the permutation-test cell: the rank test is
# meaningless when either sample has no observations.
if len(hk_values) == 0 or len(tf_values) == 0:
    print("No valid housekeeping or transcription factor gene values found in the normalization factors dictionary.")
else:
    # Two-sided Wilcoxon rank-sum test (Mann-Whitney U test) on the two groups' factors
    stat, p_value = mannwhitneyu(hk_values, tf_values, alternative='two-sided')

    print(f"Mann-Whitney U Test Statistic: {stat}")
    print(f"P-value: {p_value}")

Mann-Whitney U Test Statistic: 1476.0
P-value: 0.08143345758570157
