In [None]:
from __future__ import print_function
import numpy as np
import matplotlib.pyplot as plt
import math
import sys
import pandas as pd
import scipy.stats as stats
import statsmodels.api as sm
from statsmodels.genmod.families import NegativeBinomial, Gamma
from statsmodels.discrete.count_model import ZeroInflatedNegativeBinomialP

from scipy.stats import expon, nbinom, norm, poisson
from scipy.optimize import minimize

import warnings
warnings.filterwarnings("ignore")


# now the DCIS count data is found in an RDA file, which we apparently read using 'pyreadr'
import pyreadr

import os

# thread issues
# Each variable below is set to 12 threads (the earlier "Limit to 1 thread"
# wording did not match the values).
# NOTE(review): these are assigned after numpy/scipy/statsmodels were imported
# above; numeric back-ends typically read such variables when they are loaded,
# so the settings may not take effect here -- TODO confirm or move them earlier.
os.environ['OMP_NUM_THREADS'] = '12'  # cap OpenMP at 12 threads
os.environ['MKL_NUM_THREADS'] = '12'  # cap MKL at 12 threads (if used)
os.environ['NUMEXPR_NUM_THREADS'] = '12'  # cap NumExpr at 12 threads (if used)



# to convert Ensembl gene IDs to RefSeq gene names
# pyreadr.read_r returns a dict-like result; the table is stored under the key None.
gene_convert = pyreadr.read_r('/path/to/gene_info/ensemble_to_refseq_gene_name_table.rds')
gene_convert = gene_convert[None]
# lookup built from the table's "gene_id" and "gene_name" columns
id_to_name = {gene_id: gene_name for gene_id, gene_name in zip(gene_convert["gene_id"], gene_convert["gene_name"])}


In [None]:
### Parameters ###
# Module-level switches read by the fitting functions and dataset cells below.

# Outlier removal via trimming: when the flag is on, `trim_percent` percent of
# each gene's sorted values is dropped from both ends before fitting.
trim_percent = 1  # 1% usually gets rid of most extreme outliers
trim_means_flag = True

# A gene is kept only if it is expressed in more than this fraction (0-1) of patients.
# 0.0 -> patient stats (all genes with at least 1 read)
# 0.2 -> AIC stats of genes with >20% expression
express_percent_limit = 0.0

# Library-size adjustment (fractional method) applied by library_adjust().
adjust_for_lib = False

# Compute the "AIC distance from NB" summaries; False saves time on a full run.
calc_AIC_dist = False

# Restrict the comparison to NB vs ZINB only.
# Trimming removes zeroes, so it is switched off for the NB/ZINB comparison.
NB_ZINB_only = False
if NB_ZINB_only == True:
    trim_percent = 0

# When True, the zero-inflated models (ZIP, ZINB) are excluded from the comparison.
no_ZI_AICs = False


In [None]:
# non-AIC related functions used in this program    

# computation of gene average, fraction of zeroes, and library size
def dataset_stats_generator(df, draw_zero_distribution = False, dataset_name = ""):
    """Summarise a count table and optionally plot its zero/mean distributions.

    Rows are treated as genes/transcripts and columns as samples: per-sample
    zero fractions are divided by ``df.shape[0]`` and per-gene zero fractions
    by ``df.shape[1]``.

    Parameters
    ----------
    df : table of counts (rows = genes, columns = samples).
    draw_zero_distribution : when True, four histograms are drawn with
        matplotlib (zero fraction per gene, zero fraction per sample,
        log-binned row means, and row means >= 10,000) and the log-bin range
        is printed.
    dataset_name : text appended to the titles of the two mean-expression plots.

    Returns
    -------
    tuple
        ``(avg_library_size, avg_zeroes, avg_mean_expression,
        stdev_mean_expression)`` -- mean column sum (rounded to 0 decimals),
        mean per-sample zero fraction, mean of the row means, and standard
        deviation of the row means (each rounded to 3 decimals).
    """
    n_genes = df.shape[0]
    n_samples = df.shape[1]

    # Per-sample totals, zero fractions along both axes, and per-gene means.
    library_sizes = df.sum(axis = 0)
    zero_mask = (df == 0)
    fraction_zero_samples = zero_mask.sum(axis=0) / n_genes
    fraction_zero_genes = zero_mask.sum(axis=1) / n_samples
    row_means = df.mean(axis=1)

    if draw_zero_distribution:
        # The two zero-fraction histograms share everything except data and labels.
        zero_panels = (
            (fraction_zero_genes, 'Fraction of Zeroes (Genes)', "Fraction of Zeroes per Gene"),
            (fraction_zero_samples, 'Fraction of Zeroes (Samples)', "Fraction of Zeroes per Sample"),
        )
        for values, x_label, heading in zero_panels:
            plt.hist(values, bins=100, color='blue', alpha=0.7)
            plt.xlabel(x_label)
            plt.ylabel('Count')
            plt.title(heading)
            plt.show()

        # Small offsets keep the log10 of the bin limits finite when a mean is 0.
        lowest = np.min(row_means + 0.0000001)
        highest = np.max(row_means + 1)
        print(lowest, highest)

        # Log-spaced bins for the distribution of per-gene means.
        log_bins = np.logspace(np.log10(lowest), np.log10(highest), 500)
        plt.hist(row_means, bins=log_bins, color='blue', alpha=0.7, log=True)
        plt.xlabel('Means of Transcript Counts')
        plt.ylabel('Counts (log-scaled)')
        plt.title("Distribution of Mean Counts Per Transcript: " + dataset_name)
        plt.show()

        # Same distribution restricted to very highly expressed genes.
        high_means = row_means[row_means >= 10000]
        plt.hist(high_means, bins=100, color='blue', alpha=0.7)
        plt.xlabel('Means of Gene Expression (means >=10,000 only)')
        plt.ylabel('Counts (log-scaled)')
        plt.yscale('log')
        plt.title("Distribution of Mean Counts Per Gene: " + dataset_name)
        plt.show()

    # Dataset-level averages of the per-sample / per-gene quantities.
    avg_library_size = np.round(np.sum(library_sizes) / n_samples, decimals = 0)
    avg_zeroes = np.round(np.sum(fraction_zero_samples) / n_samples, decimals = 3)
    avg_mean_expression = np.round(np.mean(row_means), decimals = 3)
    stdev_mean_expression = np.round(np.std(row_means), decimals = 3)

    return avg_library_size, avg_zeroes, avg_mean_expression, stdev_mean_expression

# Simulating some data for illustration
#data = np.random.negative_binomial(10, 0.5, 1000)

def fit_to_nb_plot(data, plotrange = 30):
    """Plot a histogram of ``data`` with a moment-matched negative binomial PMF.

    The NB parameters are estimated by the method of moments in the
    ``scipy.stats.nbinom`` parameterisation (mean = n(1-p)/p,
    variance = n(1-p)/p**2), i.e. ``p = mean/var`` and
    ``n = mean**2 / (var - mean)``.

    Parameters
    ----------
    data : sequence of counts.
    plotrange : histogram bin edges are ``range(plotrange)``.

    Raises
    ------
    ValueError
        If the sample variance does not exceed the sample mean, in which case
        no negative binomial matches the moments.
    """
    # Estimating parameters directly from data
    mean = np.mean(data)
    var = np.var(data)
    if not var > mean:
        raise ValueError("negative binomial moment fit requires variance > mean")

    # scipy's nbinom takes the success probability p = mean/var; the previous
    # code passed 1 - mean/var, which describes a different distribution.
    p = mean / var
    n = mean * p / (1 - p)

    # Plotting: reuse the bin edges returned by hist so the PMF is evaluated on
    # the same integer grid (previously `bins` was an undefined name here).
    _, bins, _ = plt.hist(data, bins=range(plotrange), align='left', density=True, alpha=0.6, color='g')
    plt.plot(bins[:-1], nbinom.pmf(bins[:-1], n, p), 'ro-', lw=2)
    plt.title("Negative Binomial Fit")
    plt.show()

# adjust for library sizes
def library_adjust(data):
    """Optionally rescale counts to a common library size (fraction method).

    When the module-level flag ``adjust_for_lib`` is truthy, every value is
    divided by its column sum, multiplied by 10,000,000 and rounded; otherwise
    ``data`` is returned unchanged.
    """
    if not adjust_for_lib:
        return data

    per_sample_total = data.sum(axis=0)
    return np.round((data / per_sample_total) * 10000000)

In [None]:
# functions to compute ZINB

def zinb_loglike(params, counts):
    """Negative log-likelihood of a zero-inflated negative binomial.

    Parameters
    ----------
    params : ``(mu, theta, pi)`` -- NB mean, NB dispersion and zero-inflation
        probability.
    counts : array of observed counts.

    Returns
    -------
    float
        ``-sum(log-likelihood)``, suitable for minimisation.
    """
    mu, theta, pi = params

    # Convert (mu, theta) to scipy's (n, p) parameterisation.
    p = 1 / (1 + mu/theta)
    n = mu * p / (1 - p)

    # A zero can come from the inflation component or from the NB itself;
    # any other count must come from the NB component.
    zero_term = np.log(pi + (1 - pi) * np.exp(nbinom.logpmf(0, n, p)))
    nonzero_term = np.log(1 - pi) + nbinom.logpmf(counts, n, p)

    per_observation = np.where(counts == 0, zero_term, nonzero_term)
    return -np.sum(per_observation)

def calculate_aic(loglik, k):
    """Return the Akaike information criterion, ``2*k - 2*loglik``.

    ``loglik`` is the maximised log-likelihood and ``k`` the number of fitted
    parameters.
    """
    penalty = 2*k
    return penalty - 2*loglik

def fit_zinb_and_calculate_aic(counts):
    """Fit a zero-inflated negative binomial by maximum likelihood and return its AIC.

    The optimiser starts from the sample mean, the sample variance and a
    zero-inflation probability of 0.9, and minimises ``zinb_loglike`` subject
    to ``mu >= 0``, ``theta >= 0`` and ``0 <= pi <= 1``.

    Parameters
    ----------
    counts : array of observed counts.

    Returns
    -------
    tuple
        ``(mu, theta, pi, aic)`` at the optimiser's solution. The values can be
        NaN when the optimisation does not produce a finite likelihood; callers
        are expected to handle that.
    """
    initial_params = np.array([np.mean(counts), np.var(counts), 0.9])
    bounds = [(0, None), (0, None), (0, 1)]

    # `args` must be a real 1-tuple: `(counts)` is just `counts`, which only
    # worked because scipy wraps non-tuple values and would misbehave for a
    # tuple of counts.
    result = minimize(zinb_loglike, initial_params, args=(counts,), bounds=bounds)
    mu, theta, pi = result.x

    # The objective is the negative log-likelihood, so flip the sign back.
    loglik = -result.fun

    k = 3  # Number of parameters (mu, theta, pi)
    aic = calculate_aic(loglik, k)
    return mu, theta, pi, aic

In [None]:
# functions to compute ZIP
def zip_loglike(params, counts):
    """Negative log-likelihood of a zero-inflated Poisson.

    Parameters
    ----------
    params : ``(mu, pi)`` -- Poisson mean and zero-inflation probability.
    counts : array of observed counts.

    Returns
    -------
    float
        ``-sum(log-likelihood)``, suitable for minimisation.
    """
    mu, pi = params

    # Zeros mix the inflation component with the Poisson's own zero mass;
    # positive counts can only come from the Poisson component.
    zero_term = np.log(pi + (1 - pi) * np.exp(poisson.logpmf(0, mu)))
    nonzero_term = np.log(1 - pi) + poisson.logpmf(counts, mu)

    per_observation = np.where(counts == 0, zero_term, nonzero_term)
    return -np.sum(per_observation)

def calculate_aic(loglik, k):
    """Return the AIC for log-likelihood ``loglik`` and ``k`` parameters.

    This repeats the identical definition given with the ZINB helpers, so
    re-running this cell rebinds the name without changing behaviour.
    """
    twice_loglik = 2*loglik
    return 2*k - twice_loglik

def fit_zip_and_calculate_aic(counts):
    """Fit a zero-inflated Poisson by maximum likelihood and return its AIC.

    The optimiser starts from the sample mean and a zero-inflation probability
    of 0.9, and minimises ``zip_loglike`` subject to ``mu >= 0`` and
    ``0 <= pi <= 1``.

    Parameters
    ----------
    counts : array of observed counts.

    Returns
    -------
    tuple
        ``(mu, pi, aic)`` at the optimiser's solution; values can be NaN when
        the optimisation does not produce a finite likelihood.
    """
    initial_params = np.array([np.mean(counts), 0.9])
    bounds = [(0, None), (0, 1)]

    # `args` must be a real 1-tuple: `(counts)` is just `counts`, which only
    # worked because scipy wraps non-tuple values and would misbehave for a
    # tuple of counts.
    result = minimize(zip_loglike, initial_params, args=(counts,), bounds=bounds)
    mu, pi = result.x

    # The objective is the negative log-likelihood, so flip the sign back.
    loglik = -result.fun
    k = 2  # Number of parameters for ZIP model (mu, pi)
    aic = calculate_aic(loglik, k)
    return mu, pi, aic

In [None]:
def compare_distributions(aic_values):
    """Pick the distribution with the lowest AIC.

    Parameters
    ----------
    aic_values : mapping with the keys "NB", "Gaussian", "Poisson",
        "Exponential", "Zero-Inflated Poisson" and
        "Zero-Inflated Negative Binomial".

    Returns
    -------
    tuple
        ``(label, aic)`` where ``label`` is one of "NB", "Gaussian",
        "Poisson", "Exponential", "ZIP", "ZINB". On ties the candidate listed
        first in that order wins.
    """
    # (short label, key in aic_values), in tie-breaking order.
    label_and_key = (
        ("NB", "NB"),
        ("Gaussian", "Gaussian"),
        ("Poisson", "Poisson"),
        ("Exponential", "Exponential"),
        ("ZIP", "Zero-Inflated Poisson"),
        ("ZINB", "Zero-Inflated Negative Binomial"),
    )
    candidates = [(label, aic_values[key]) for label, key in label_and_key]

    best_label, best_aic = candidates[0]
    for label, aic in candidates[1:]:
        # Strict comparison keeps the earliest candidate when values tie.
        if aic < best_aic:
            best_label, best_aic = label, aic

    return best_label, best_aic



# this is the function that computes AIC for all distributions (Gaussian, Exponential, Negative Binomial, ZIP, ZINB)
# and reports which distribution is lowest
# row - a vector of expressions
def manual_aic(row):
    """Fit the candidate distributions to one gene's counts and report the AIC winner.

    Candidates are negative binomial (NB), Gaussian, Poisson, exponential,
    zero-inflated Poisson (ZIP) and zero-inflated negative binomial (ZINB).
    Behaviour is controlled by the module-level settings ``trim_means_flag``,
    ``trim_percent`` (in percent), ``NB_ZINB_only`` and ``no_ZI_AICs``.

    Parameters
    ----------
    row : vector of expression values for one gene; values are rounded to
        integers before fitting.

    Returns
    -------
    tuple
        ``(best_distribution, nb_theta, zinb_theta, zinb_pi)``.
        ``best_distribution`` is the label returned by ``compare_distributions``.
        When the (trimmed) row sums to zero nothing can be fitted and
        ``("ZEROES", nan, nan, nan)`` is returned, so callers that unpack the
        results with ``zip(*...)`` always receive 4-tuples.
    """
    row = np.round(row) # it must be count data

    # trimming to remove outliers: drop the same number of values from both
    # ends of the sorted row
    if (trim_means_flag):
        elements_to_trim = int(np.floor(trim_percent / 100.0 * len(row)))
        sorted_data = np.sort(row)

        if (elements_to_trim > 0):
            row = sorted_data[elements_to_trim:-elements_to_trim]
        else:
            row = sorted_data

    if (sum(row) <= 0):
        # Previously a bare "ZEROES" string was returned here; unpacking it
        # alongside 4-tuples split the string into characters.
        return "ZEROES", np.nan, np.nan, np.nan

    # Intercept-only design matrix shared by the statsmodels fits.
    X = sm.add_constant(np.ones(len(row)))

    # Exponential: closed-form MLE (rate = 1/mean), 1 parameter
    lambda_exp = 1 / np.mean(row)
    log_likelihood_exp = np.sum(expon.logpdf(row, scale=1/lambda_exp))
    aic_exp = 2*1 - 2*log_likelihood_exp

    # Negative binomial: intercept-only statsmodels fit; params are
    # (intercept on the log scale, alpha)
    res = sm.NegativeBinomial(row, X).fit(start_params=[1,1], disp=0)
    const = res.params[0]
    alpha = res.params[1]

    mu = np.exp(const)
    p = 1/(1+np.exp(const)*alpha)

    # NOTE(review): mu*(1-p)/p equals mu**2 * alpha algebraically, whereas the
    # ZINB theta returned below is the NB size parameter used by zinb_loglike;
    # the two "theta" values may not be on the same scale -- confirm intent.
    nb_theta = mu * (1 - p) / p

    aic_nb = res.aic

    # ZINB: numerical MLE
    mu, zinb_theta, zinb_pi, aic_zinb = fit_zinb_and_calculate_aic(row)

    if (NB_ZINB_only is False):
        # Gaussian, fitted through a GLM
        model_gaussian = sm.GLM(row, X, family=sm.families.Gaussian()).fit(disp=0)
        aic_gauss = model_gaussian.aic

        # Poisson
        model_poisson = sm.Poisson(row, X).fit(disp=0)
        aic_pois = model_poisson.aic

        # ZIP: numerical MLE
        mu, pi, aic_zip = fit_zip_and_calculate_aic(row)
    else:
        # These models are skipped entirely; a huge AIC keeps them from winning.
        aic_zip = 100000000
        aic_gauss = 100000000
        aic_pois = 100000000

    # sometimes ZIP and ZINB can be NaN if there are no zeroes, and NB can
    # also fail; replace any NaN/inf AIC with a large sentinel so it cannot win
    aic_scores = {'aic_zip': aic_zip, 'aic_zinb': aic_zinb, 'aic_nb': aic_nb, 'aic_pois': aic_pois, 'aic_gauss': aic_gauss, 'aic_exp': aic_exp}

    for key in aic_scores:
        if np.isnan(aic_scores[key]) | np.isinf(aic_scores[key]):
            aic_scores[key] = 100000000

    aic_zip, aic_zinb, aic_nb, aic_pois, aic_gauss, aic_exp = aic_scores.values()

    # in certain analyses, we might only want to look at certain distributions
    # so we will make the AIC score high for those we don't care about
    if (NB_ZINB_only == True):
        aic_zip, aic_pois, aic_gauss, aic_exp = 10000000, 10000000, 10000000, 10000000
    if (no_ZI_AICs == True):
        aic_zip, aic_zinb = 10000000, 10000000

    best_distribution, best_aic = compare_distributions({
        "NB": aic_nb,
        "Gaussian": aic_gauss,
        "Poisson": aic_pois,
        "Exponential": aic_exp,
        "Zero-Inflated Poisson": aic_zip,
        "Zero-Inflated Negative Binomial": aic_zinb
    })

    return best_distribution, nb_theta, zinb_theta, zinb_pi


In [None]:
# this version of the function computes the same values but instead returns the 
# differential between the NB AIC and other distributions

# row - a vector of expressions
def manual_aic_distfromNB(row):
    """Report how far the NB AIC is from the best-fitting distribution's AIC.

    All candidate AICs are computed in closed form from sample moments (no
    numerical optimisation). Behaviour is controlled by the module-level
    settings ``trim_means_flag``, ``trim_percent`` (in percent),
    ``NB_ZINB_only`` and ``no_ZI_AICs``.

    NOTE(review): the ZINB and ZIP likelihoods here give zeros probability
    ``pi`` (the observed zero fraction) and non-zeros ``(1 - pi)`` times the
    untruncated NB/Poisson mass, which differs from the mixture likelihood
    used by ``zinb_loglike`` / ``zip_loglike`` -- confirm this approximation
    is intended.

    Parameters
    ----------
    row : vector of expression values for one gene; values are rounded to
        integers before fitting.

    Returns
    -------
    number
        ``-9999`` when the (trimmed) row sums to zero; ``0`` when NB has the
        strictly lowest AIC; otherwise ``(aic_nb - aic_best) / aic_best`` for
        the distribution with the strictly lowest AIC, falling back to Poisson
        when no other candidate is a strict winner.
    """
    row = np.round(row) # make it count data

    # trimming to remove outliers: drop the same number of values from both
    # ends of the sorted row
    if (trim_means_flag):
        elements_to_trim = int(np.floor(trim_percent / 100.0 * len(row)))
        sorted_data = np.sort(row)

        if (elements_to_trim > 0):
            row = sorted_data[elements_to_trim:-elements_to_trim]
        else:
            row = sorted_data

    if (sum(row) <= 0):
        return -9999

    # Exponential parameters
    lambda_exp = 1 / np.mean(row)
    log_likelihood_exp = np.sum(expon.logpdf(row, scale=1/lambda_exp))
    aic_exp = 2*1 - 2*log_likelihood_exp  # 1 parameter for exponential

    # NB parameters (method of moments); the small offsets avoid exact
    # zero denominators
    mu_sample = np.mean(row)
    var_sample = np.var(row)
    if (mu_sample == var_sample):
        var_sample = var_sample + 0.0000000000001
    r_estimated = mu_sample**2 / (var_sample - mu_sample)

    if (mu_sample + r_estimated) == 0:
        r_estimated = r_estimated + 0.0000000000001
    p_estimated = r_estimated / (mu_sample + r_estimated)
    log_likelihood_nb = np.sum(nbinom.logpmf(row, r_estimated, p_estimated))
    aic_nb = 2*2 - 2*log_likelihood_nb  # 2 parameters for NB

    # ZINB parameters: pi is the observed zero fraction, NB part reuses r and p
    pi_zinb = np.mean(row == 0)
    log_likelihood_zeros = np.sum(np.log(pi_zinb) * (row == 0))
    log_likelihood_non_zeros = np.sum(np.log(1 - pi_zinb) + nbinom.logpmf(row[row != 0], r_estimated, p_estimated))
    log_likelihood_zinb = log_likelihood_zeros + log_likelihood_non_zeros
    aic_zinb = 2*3 - 2*log_likelihood_zinb  # 3 parameters for ZINB: pi, r, and p

    # Gaussian parameters
    mu_gauss = np.mean(row)
    sigma_gauss = np.std(row)
    log_likelihood_gauss = np.sum(norm.logpdf(row, mu_gauss, sigma_gauss))
    aic_gauss = 2*2 - 2*log_likelihood_gauss  # 2 parameters for Gaussian: mu and sigma

    # Poisson parameters
    lambda_pois = np.mean(row)
    log_likelihood_pois = np.sum(poisson.logpmf(row, lambda_pois))
    aic_pois = 2*1 - 2*log_likelihood_pois  # 1 parameter for Poisson: lambda

    # ZIP parameters: pi is the observed zero fraction, lambda the mean of the non-zeros
    pi_zip = np.mean(row == 0)
    lambda_zip = np.mean(row[row != 0])
    log_likelihood_zeros_zip = np.sum(np.log(pi_zip) * (row == 0))
    log_likelihood_non_zeros_zip = np.sum(np.log(1 - pi_zip) + poisson.logpmf(row[row != 0], lambda_zip))
    log_likelihood_zip = log_likelihood_zeros_zip + log_likelihood_non_zeros_zip
    aic_zip = 2*2 - 2*log_likelihood_zip  # 2 parameters for ZIP: pi and lambda

    # ZIP and ZINB can be NaN if there are no zeroes; similarly, NB is NaN if Mean > Variance.
    # Any non-finite AIC (NaN or +/-inf, matching the check in manual_aic) is
    # replaced by a very high value so that it cannot win and cannot turn the
    # returned ratio into inf/NaN.
    aic_scores = {'aic_zip': aic_zip, 'aic_zinb': aic_zinb, 'aic_nb': aic_nb, 'aic_pois': aic_pois, 'aic_gauss': aic_gauss, 'aic_exp': aic_exp}

    for key in aic_scores:
        if np.isnan(aic_scores[key]) | np.isinf(aic_scores[key]):
            aic_scores[key] = 1000000000

    aic_zip, aic_zinb, aic_nb, aic_pois, aic_gauss, aic_exp = aic_scores.values()

    # Distributions excluded from the analysis get a high AIC so they cannot win.
    if (NB_ZINB_only == True):
        aic_zip, aic_pois, aic_gauss, aic_exp = 10000000, 10000000, 10000000, 10000000
    if (no_ZI_AICs == True):
        aic_zip, aic_zinb = 10000000, 10000000

    # Compare AICs and determine best fit; each branch requires a strict win
    # over every other candidate, and Poisson is the fallback.
    if (aic_nb < aic_pois ) & (aic_nb < aic_gauss) & (aic_nb < aic_exp) & (aic_nb < aic_zip) & (aic_nb < aic_zinb):
        return 0
    elif (aic_gauss < aic_nb) & (aic_gauss < aic_pois) & (aic_gauss < aic_exp) & (aic_gauss < aic_zip) & (aic_gauss < aic_zinb):
        return (aic_nb - aic_gauss)/aic_gauss
    elif (aic_exp < aic_nb) & (aic_exp < aic_pois) & (aic_exp < aic_gauss) & (aic_exp < aic_zip) & (aic_exp < aic_zinb):
        return (aic_nb - aic_exp)/aic_exp
    elif (aic_zinb < aic_nb) & (aic_zinb < aic_pois) & (aic_zinb < aic_gauss) & (aic_zinb < aic_zip) & (aic_zinb < aic_exp):
        return (aic_nb - aic_zinb)/aic_zinb
    elif (aic_zip < aic_nb) & (aic_zip < aic_pois) & (aic_zip < aic_gauss) & (aic_zip < aic_zinb) & (aic_zip < aic_exp):
        return (aic_nb - aic_zip)/aic_zip
    else:
        return (aic_nb - aic_pois)/aic_pois

In [None]:
# GSE167977: load raw counts, filter genes, print summary statistics.
# this is a dataset with 528 FFPE breast cancer samples, sequenced from a HiSeq

data = pd.read_csv('/path/to/third_party/GSE167977_third_party_ffpe/GSE167977_Raw_Counts.txt',
                  delimiter='\t')

# filter and compute dispersion
# dispersion of tumours - All Data
tumours_counts = pd.DataFrame(data)

# genes become the row index; the last five columns are dropped
# NOTE(review): presumably those trailing columns are annotation rather than samples -- confirm
tumours_counts.set_index('ensembl_gene_id', inplace=True)
#tumours_counts = tumours_counts.drop(tumours_counts.columns[0], axis=1) # column 1
tumours_counts = tumours_counts.drop(tumours_counts.columns[-5:], axis=1) # last 5 columns

# per-gene means of the unfiltered table (overwritten inside the loop below)
row_means = tumours_counts.mean(axis=1)


# adjust for library size (fraction method)
# should come before the gene filter
tumours_counts_lib_adjust = library_adjust(tumours_counts)

# keep genes whose fraction of zero samples is below 1 - express_percent_limit
fraction_of_zeroes = (tumours_counts_lib_adjust == 0).mean(axis=1)
filtered_df = tumours_counts_lib_adjust[fraction_of_zeroes < (1 - express_percent_limit)] # must be expressed to this percentage of patients

print(filtered_df.shape)

# summary statistics plus diagnostic histograms for the filtered table
dataset_stats = dataset_stats_generator(filtered_df, draw_zero_distribution=True)
print("Average Library Size: ", dataset_stats[0])
print("Fraction of Zeroes: ", dataset_stats[1])
print("Average Mean Expression: ", dataset_stats[2])
print("Average Stdev Expression: ", dataset_stats[3])

# per-gene summary columns, computed from filtered_df so they do not feed into each other
new_tumours_counts = filtered_df.copy()
new_tumours_counts['Mean'] = filtered_df.mean(axis=1)
new_tumours_counts['StdDev'] = filtered_df.std(axis=1)
new_tumours_counts['Min'] = filtered_df.min(axis=1)
new_tumours_counts['Max'] = filtered_df.max(axis=1)

top_10_means = new_tumours_counts.nlargest(10, 'Mean')

# Print mean, standard deviation, min and max for the ten genes with the highest mean
print("\nTop 10 expressed genes")
print(top_10_means[['Mean', 'StdDev', 'Min', 'Max']])


# Standard deviation of the per-gene means, recomputed after successively
# removing the gene with the highest mean (up to ten removals).
GSE167977_stdev = [] 
new_tumours_counts = filtered_df.copy()

for i in range(min(10, len(filtered_df))):
    # Calculate the mean for each row
    row_means = new_tumours_counts.mean(axis=1)
    
    # Calculate the standard deviation of these means
    std_dev_of_means = row_means.std(ddof=0)

    # Append the standard deviation to the list
    GSE167977_stdev.append(std_dev_of_means)
    
    # Remove the row with the highest mean
    max_mean_index = row_means.idxmax()
    new_tumours_counts.drop(index=max_mean_index, inplace=True)

# print(dataset_stats)

print(GSE167977_stdev)

In [None]:
# Optional (calc_AIC_dist): relative gap between the NB AIC and the winning AIC, per gene.
if(calc_AIC_dist):

    print("GSE167977 - How off NB is to the winning distribution")
    aic_off = filtered_df.apply(manual_aic_distfromNB, axis=1)
    print(aic_off)

    # Summarise only genes where NB did not win (value != 0) and that were
    # actually fitted (-9999 marks rows that sum to zero after trimming).
    mask = (aic_off != 0) & (aic_off > -9999)

    median_off = np.median(aic_off[mask])
    print("NB AIC is usually: ", median_off)
    # the threshold 0.01 is a relative difference of 1% (variable names say "10")
    count_less_than_10 = np.sum(aic_off[mask] < 0.01)
    count_greater_equal_10 = np.sum(aic_off[mask] >= 0.01)

    print(f"Count of values < 1%: {count_less_than_10}")
    print(f"Count of values >= 1%: {count_greater_equal_10}")


In [None]:
# for testing
#filtered_df = filtered_df.iloc[1:100]

# Fit every gene (row) of GSE167977 and tally which distribution wins.
print("GSE167977 - Lowest AIC across all genes")

aic_values = filtered_df.apply(manual_aic, axis=1)
# split the per-gene results into parallel sequences: winner label, NB theta, ZINB theta, ZINB pi
aic_values, nb_theta_GSE167977, zinb_theta_GSE167977, zinb_pi_GSE167977 = zip(*aic_values)

# print(aic_values)

# number of genes won by each distribution label
AIC_top_rank_GSE167977 = pd.Series(aic_values).value_counts()
print(AIC_top_rank_GSE167977)

In [None]:
# GSE181466: load the RSEM gene matrix, filter genes, print summary statistics.
data = pd.read_csv('/path/to/third_party/GSE181466_third_party_ffpe/GSE181466_rsem_genes_matrix-97.txt',
                  delimiter='\t')

# patient information splitting is unnecessary, this appears to all be both FFPE and from tumours
# there is subtype and age information in the series matrix file, if we're interested

# dispersion of tumours - All Data
tumours_counts = pd.DataFrame(data)
# use the first (unnamed) column, which holds the gene identifiers, as the row index
tumours_counts.set_index('Unnamed: 0', inplace=True)
#tumours_counts = tumours_counts.drop(tumours_counts.columns[0], axis=1)
# skip genes that are all zeroes, or just one spurious read somewhere

# adjust for library size (fraction method)
tumours_counts_libadjust = library_adjust(tumours_counts)

# keep genes whose fraction of zero samples is below 1 - express_percent_limit
fraction_of_zeroes = (tumours_counts_libadjust == 0).mean(axis=1)
filtered_df = tumours_counts_libadjust[fraction_of_zeroes < (1 - express_percent_limit)] # must be expressed to this percentage of patients
print(filtered_df.shape)

# summary statistics plus diagnostic histograms for the filtered table
dataset_stats = dataset_stats_generator(filtered_df, draw_zero_distribution=True)
print("Average Library Size: ", dataset_stats[0])
print("Fraction of Zeroes: ", dataset_stats[1])
print("Average Mean Expression: ", dataset_stats[2])
print("Average Stdev Expression: ", dataset_stats[3])
print(dataset_stats)



# per-gene summary columns, computed from filtered_df so they do not feed into each other
new_tumours_counts = filtered_df.copy()
new_tumours_counts['Mean'] = filtered_df.mean(axis=1)
new_tumours_counts['StdDev'] = filtered_df.std(axis=1)
new_tumours_counts['Min'] = filtered_df.min(axis=1)
new_tumours_counts['Max'] = filtered_df.max(axis=1)

top_10_means = new_tumours_counts.nlargest(10, 'Mean')

# Print mean, standard deviation, min and max for the ten genes with the highest mean
print("\nTop 10 expressed genes")
print(top_10_means[['Mean', 'StdDev', 'Min', 'Max']])


# Standard deviation of the per-gene means, recomputed after successively
# removing the gene with the highest mean (up to ten removals).
GSE181466_stdev = [] 
new_tumours_counts = filtered_df.copy()
for i in range(min(10, len(new_tumours_counts))):
    # Calculate the mean for each row
    row_means = new_tumours_counts.mean(axis=1)
    
    # Calculate the standard deviation of these means
    std_dev_of_means = row_means.std(ddof=0)
    
    # Append the standard deviation to the list
    GSE181466_stdev.append(std_dev_of_means)
    
    # Remove the row with the highest mean
    max_mean_index = row_means.idxmax()
    new_tumours_counts.drop(index=max_mean_index, inplace=True)

# print(dataset_stats)

print(GSE181466_stdev)

In [None]:
import pandas as pd

# Fraction (0-1) of values removed from EACH end of a gene's sorted counts.
# This deliberately uses its own name: the global `trim_percent` is read (in
# percent units) by manual_aic / manual_aic_distfromNB, and overwriting it
# here with 0.015 silently disabled their outlier trimming for every dataset
# analysed afterwards.
trimmed_mean_proportion = 0.015  # 1.5% trimmed from each tail

# Function to calculate trimmed mean for a row
def trimmed_mean(row, proportion):
    """Mean of ``row`` after dropping ``int(n * proportion)`` values from each end.

    Returns NaN when nothing is left after trimming.
    """
    sorted_row = sorted(row)
    n = len(sorted_row)
    trim_count = int(n * proportion)
    trimmed_row = sorted_row[trim_count:n-trim_count]
    return sum(trimmed_row) / len(trimmed_row) if trimmed_row else float('nan')

# Apply the trimmed mean function to each row (gene) of the current filtered_df
trimmed_means = filtered_df.apply(lambda row: trimmed_mean(row, trimmed_mean_proportion), axis=1)

# Calculate the mean of the trimmed means
overall_mean = trimmed_means.mean()

# Look up genes whose row label contains the search term
search_term = 'NEAT'

matching_rows = [name for name in trimmed_means.index if search_term in name]

# Show each matching row label next to its trimmed mean (previously only the
# values were printed under this heading).
print("Matching Row Names:")
for name in matching_rows:
    print(name, trimmed_means.loc[name])


In [None]:
# Optional (calc_AIC_dist): relative gap between the NB AIC and the winning AIC, per gene.
if (calc_AIC_dist):
    print("GSE181466 - How off NB is to the winning distribution")
    aic_off = filtered_df.apply(manual_aic_distfromNB, axis=1)
    print(aic_off)

    # Summarise only genes where NB did not win (value != 0) and that were
    # actually fitted (-9999 marks rows that sum to zero after trimming).
    mask = (aic_off != 0) & (aic_off > -9999)

    median_off = np.median(aic_off[mask])
    print("NB AIC is usually: ", median_off)
    # the threshold 0.01 is a relative difference of 1% (variable names say "10")
    count_less_than_10 = np.sum(aic_off[mask] < 0.01)
    count_greater_equal_10 = np.sum(aic_off[mask] >= 0.01)

    print(f"Count of values < 1%: {count_less_than_10}")
    # label fixed: the threshold is 1%, matching the line above
    print(f"Count of values >= 1%: {count_greater_equal_10}")

In [None]:
#filtered_df = filtered_df.iloc[1:200]

# Fit every gene (row) of GSE181466 and tally which distribution wins.
print("GSE181466")

aic_values = filtered_df.apply(manual_aic, axis=1)
# split the per-gene results into parallel sequences: winner label, NB theta, ZINB theta, ZINB pi
aic_values, nb_theta_GSE181466, zinb_theta_GSE181466, zinb_pi_GSE181466 = zip(*aic_values)
# number of genes won by each distribution label
AIC_top_rank_GSE181466 = pd.Series(aic_values).value_counts()

print(AIC_top_rank_GSE181466)

In [None]:
## here, we will repeat our plots but for a different data set (GSE146889)
all_counts = pyreadr.read_r('/path/to/third_party/GSE146889_third_party_ffpe/GSE146889_GeneCount.rds')
df = all_counts[None] # load all_counts into a pandas data frame


# we need to split the tumors and normals by name:
# columns whose label contains 'tumor' / 'normal' respectively
count_TUMOR = df.filter(like='tumor')
count_NORMAL = df.filter(like='normal')

#filtered_tumour = count_TUMOR[count_TUMOR.sum(axis=1) > 1]
#filtered_normal = count_NORMAL[count_NORMAL.sum(axis=1) > 1]

# adjust for library size (fraction method)
count_TUMOR_libadjust = library_adjust(count_TUMOR)

# keep genes whose fraction of zero samples is below 1 - express_percent_limit
fraction_of_zeroes = (count_TUMOR_libadjust == 0).mean(axis=1)
filtered_tumour = count_TUMOR_libadjust[fraction_of_zeroes < (1 - express_percent_limit)] # must be expressed to this percentage of patients

# summary statistics plus diagnostic histograms for the tumour samples
print("Tumours (GSE146889)")
dataset_stats = dataset_stats_generator(filtered_tumour, draw_zero_distribution=True, dataset_name="GSE146889 Tumours")
print("Average Library Size: ", dataset_stats[0])
print("Fraction of Zeroes: ", dataset_stats[1])
print("Average Mean Expression: ", dataset_stats[2])
print("Average Stdev Expression: ", dataset_stats[3])
print(count_TUMOR_libadjust.shape)
print(filtered_tumour.shape)


# per-gene summary columns, computed from filtered_tumour so they do not feed into each other
new_tumours_counts = filtered_tumour.copy()
new_tumours_counts['Mean'] = filtered_tumour.mean(axis=1)
new_tumours_counts['StdDev'] = filtered_tumour.std(axis=1)
new_tumours_counts['Min'] = filtered_tumour.min(axis=1)
new_tumours_counts['Max'] = filtered_tumour.max(axis=1)

top_10_means = new_tumours_counts.nlargest(10, 'Mean')

# Print mean, standard deviation, min and max for the ten genes with the highest mean
print("\nTop 10 expressed genes")
print(top_10_means[['Mean', 'StdDev', 'Min', 'Max']])


# Standard deviation of the per-gene means, recomputed after successively
# removing the gene with the highest mean (up to ten removals).
GSE146889_tumour_stdev = [] 
new_tumours_counts = filtered_tumour.copy()
for i in range(min(10, len(new_tumours_counts))):
    # Calculate the mean for each row
    row_means = new_tumours_counts.mean(axis=1)
    
    # Calculate the standard deviation of these means
    std_dev_of_means = row_means.std(ddof=0)
    
    # Append the standard deviation to the list
    GSE146889_tumour_stdev.append(std_dev_of_means)
    
    # Remove the row with the highest mean
    max_mean_index = row_means.idxmax()
    new_tumours_counts.drop(index=max_mean_index, inplace=True)

# print(dataset_stats)

print(GSE146889_tumour_stdev)


#print(dataset_stats)
# GSE146889 normal samples: same pipeline as for the tumour samples.
# adjust for library size (fraction method)
count_NORMAL_libadjust = library_adjust(count_NORMAL)

# keep genes whose fraction of zero samples is below 1 - express_percent_limit
fraction_of_zeroes = (count_NORMAL_libadjust == 0).mean(axis=1)
filtered_normal = count_NORMAL_libadjust[fraction_of_zeroes < (1 - express_percent_limit)] # must be expressed to this percentage of patients

# summary statistics plus diagnostic histograms for the normal samples
print("Normals")
dataset_stats = dataset_stats_generator(filtered_normal, draw_zero_distribution=True, dataset_name="GSE146889 Normals")
print("Average Library Size: ", dataset_stats[0])
print("Fraction of Zeroes: ", dataset_stats[1])
print("Average Mean Expression: ", dataset_stats[2])
print("Average Stdev Expression: ", dataset_stats[3])
#print(dataset_stats)

# per-gene summary columns (the variable name is reused from the tumour
# analysis, but the data here are the normal samples)
new_tumours_counts = filtered_normal.copy()
new_tumours_counts['Mean'] = filtered_normal.mean(axis=1)
new_tumours_counts['StdDev'] = filtered_normal.std(axis=1)
new_tumours_counts['Min'] = filtered_normal.min(axis=1)
new_tumours_counts['Max'] = filtered_normal.max(axis=1)

top_10_means = new_tumours_counts.nlargest(10, 'Mean')

# Print mean, standard deviation, min and max for the ten genes with the highest mean
print("\nTop 10 expressed genes")
print(top_10_means[['Mean', 'StdDev', 'Min', 'Max']])


# Standard deviation of the per-gene means, recomputed after successively
# removing the gene with the highest mean (up to ten removals).
GSE146889_normal_stdev = [] 
new_tumours_counts = filtered_normal.copy()
for i in range(min(10, len(new_tumours_counts))):
    # Calculate the mean for each row
    row_means = new_tumours_counts.mean(axis=1)
    
    # Calculate the standard deviation of these means
    std_dev_of_means = row_means.std(ddof=0)
    
    # Append the standard deviation to the list
    GSE146889_normal_stdev.append(std_dev_of_means)
    
    # Remove the row with the highest mean
    max_mean_index = row_means.idxmax()
    new_tumours_counts.drop(index=max_mean_index, inplace=True)

# print(dataset_stats)

print(GSE146889_normal_stdev)

In [None]:
#filtered_tumour = filtered_tumour.iloc[1:200]

# Fit every gene (row) of the GSE146889 tumour samples and tally which distribution wins.
print("GSE146889 - Tumours")

aic_values = filtered_tumour.apply(manual_aic, axis=1)
# split the per-gene results into parallel sequences: winner label, NB theta, ZINB theta, ZINB pi
aic_values, nb_theta_GSE146889_tumours, zinb_theta_GSE146889_tumours, zinb_pi_GSE146889_tumours = zip(*aic_values)
# number of genes won by each distribution label
AIC_top_rank_GSE146889_tumours = pd.Series(aic_values).value_counts()
print(AIC_top_rank_GSE146889_tumours)

In [None]:
#filtered_normal = filtered_normal.iloc[1:200]

# Fit every gene (row) of the GSE146889 normal samples and tally which distribution wins.
print("GSE146889 - Normal")

aic_values = filtered_normal.apply(manual_aic, axis=1)

# split the per-gene results into parallel sequences: winner label, NB theta, ZINB theta, ZINB pi
aic_values, nb_theta_GSE146889_normals, zinb_theta_GSE146889_normals, zinb_pi_GSE146889_normals = zip(*aic_values)

# number of genes won by each distribution label
AIC_top_rank_GSE146889_normals = pd.Series(aic_values).value_counts()
print(AIC_top_rank_GSE146889_normals)

In [None]:
# GSE209998: load counts and sample metadata, then split the samples by metadata rows.
all_counts = pyreadr.read_r('/path/to/third_party/GSE209998_third_party_ffpe/GSE209998_GeneCount.rds')
sample_information = pyreadr.read_r('/path/to/third_party/GSE209998_third_party_ffpe/GSE209998_Sample_Data.rds')

# now we want to isolate just the expression from a particular type of tissue
df_counts = all_counts[None] # load all_counts into a pandas data frame
df_sample = sample_information[None] # load the sample metadata into a pandas data frame

# here, we need to match if a sample is normal or tumour by !Sample_source_name_ch1 row

# so I need to: 1) match columns between sample_information and all_counts 
# are they in the same order
columns_df1 = df_counts.columns
columns_df2 = df_sample.columns

# Now we find what samples were tumours and what were normal
samples_row = df_sample.loc["!Sample_source_name_ch1"]

# one counts sub-table per distinct value of the "!Sample_source_name_ch1" row;
# only columns present in both tables are considered
split_dfs = {}
for sample_type in samples_row.unique():
    matching_columns = [col for col in df_counts.columns if col in df_sample.columns and samples_row[col] == sample_type]
    split_dfs[sample_type] = df_counts[matching_columns]

# same split, keyed by the "!Sample_source" row (preservation method)
sample_source = df_sample.loc["!Sample_source"]

split_source = {}
for sample_type in sample_source.unique():
    matching_columns = [col for col in df_counts.columns if col in df_sample.columns and sample_source[col] == sample_type]
    split_source[sample_type] = df_counts[matching_columns]


# the two preservation groups analysed below
count_FRESH = split_source["Fresh frozen"]
count_FFPE = split_source["FFPE"]

# GSE209998 FFPE samples: filter genes, print summary statistics.
#filtered_ffpe = count_FFPE[count_FFPE.sum(axis=1) > 1]
# adjust for library size (fraction method)
count_FFPE_libadjust = library_adjust(count_FFPE)
# keep genes whose fraction of zero samples is below 1 - express_percent_limit;
# values are rounded to whole numbers
fraction_of_zeroes = (count_FFPE_libadjust == 0).mean(axis=1)
filtered_ffpe = np.round(count_FFPE_libadjust[fraction_of_zeroes < (1 - express_percent_limit)]) # must be expressed to this percentage of patients

#print(np.average(filtered_ffpe[0:1]), np.std(filtered_ffpe[0:1]))

# summary statistics plus diagnostic histograms for the FFPE samples
print("FFPE")
dataset_stats = dataset_stats_generator(filtered_ffpe, draw_zero_distribution=True, dataset_name = "GSE209998 FFPE")
print("Average Library Size: ", dataset_stats[0])
print("Fraction of Zeroes: ", dataset_stats[1])
print("Average Mean Expression: ", dataset_stats[2])
print("Average Stdev Expression: ", dataset_stats[3])
# print(dataset_stats)

print("FFPE", filtered_ffpe.shape)


# per-gene summary columns, computed from filtered_ffpe so they do not feed into each other
new_tumours_counts = filtered_ffpe.copy()
new_tumours_counts['Mean'] = filtered_ffpe.mean(axis=1)
new_tumours_counts['StdDev'] = filtered_ffpe.std(axis=1)
new_tumours_counts['Min'] = filtered_ffpe.min(axis=1)
new_tumours_counts['Max'] = filtered_ffpe.max(axis=1)

top_10_means = new_tumours_counts.nlargest(10, 'Mean')

# Print mean, standard deviation, min and max for the ten genes with the highest mean
print("\nTop 10 expressed genes")
print(top_10_means[['Mean', 'StdDev', 'Min', 'Max']])


# Standard deviation of the per-gene means, recomputed after successively
# removing the gene with the highest mean (up to ten removals).
GSE209998_ffpe_stdev = [] 
new_tumours_counts = filtered_ffpe.copy()
for i in range(min(10, len(new_tumours_counts))):
    # Calculate the mean for each row
    row_means = new_tumours_counts.mean(axis=1)
    
    # Calculate the standard deviation of these means
    std_dev_of_means = row_means.std(ddof=0)
    
    # Append the standard deviation to the list
    GSE209998_ffpe_stdev.append(std_dev_of_means)
    
    # Remove the row with the highest mean
    max_mean_index = row_means.idxmax()
    new_tumours_counts.drop(index=max_mean_index, inplace=True)

# print(dataset_stats)

print(GSE209998_ffpe_stdev)


# GSE209998 fresh-frozen samples: filter genes, print summary statistics.
# adjust for library size (fraction method)
count_FRESH_libadjust = library_adjust(count_FRESH)
# keep genes whose fraction of zero samples is below 1 - express_percent_limit;
# values are rounded to whole numbers
fraction_of_zeroes = (count_FRESH_libadjust == 0).mean(axis=1)
filtered_fresh = np.round(count_FRESH_libadjust[fraction_of_zeroes < (1 - express_percent_limit)]) # must be expressed to this percentage of patients
print("Fresh", filtered_fresh.shape[1])

# The plot titles previously said "Formalin-Fixed", but this table holds the
# "Fresh frozen" samples (the formalin-fixed ones are in filtered_ffpe).
dataset_stats = dataset_stats_generator(filtered_fresh, draw_zero_distribution=True, dataset_name = "GSE209998 Fresh Frozen")
print("Average Library Size: ", dataset_stats[0])
print("Fraction of Zeroes: ", dataset_stats[1])
print("Average Mean Expression: ", dataset_stats[2])
print("Average Stdev Expression: ", dataset_stats[3])



# per-gene summary columns, computed from filtered_fresh so they do not feed into each other
new_tumours_counts = filtered_fresh.copy()
new_tumours_counts['Mean'] = filtered_fresh.mean(axis=1)
new_tumours_counts['StdDev'] = filtered_fresh.std(axis=1)
new_tumours_counts['Min'] = filtered_fresh.min(axis=1)
new_tumours_counts['Max'] = filtered_fresh.max(axis=1)

top_10_means = new_tumours_counts.nlargest(10, 'Mean')

# Print mean, standard deviation, min and max for the ten genes with the highest mean
print("\nTop 10 expressed genes")
print(top_10_means[['Mean', 'StdDev', 'Min', 'Max']])


# Standard deviation of the per-gene means, recomputed after successively
# removing the gene with the highest mean (up to ten removals).
GSE209998_ff_stdev = []
new_tumours_counts = filtered_fresh.copy()
for i in range(min(10, len(new_tumours_counts))):
    # Calculate the mean for each row
    row_means = new_tumours_counts.mean(axis=1)

    # Calculate the standard deviation of these means
    std_dev_of_means = row_means.std(ddof=0)

    # Append the standard deviation to the list
    GSE209998_ff_stdev.append(std_dev_of_means)

    # Remove the row with the highest mean
    max_mean_index = row_means.idxmax()
    new_tumours_counts.drop(index=max_mean_index, inplace=True)

# print(dataset_stats)

print(GSE209998_ff_stdev)

In [None]:
# Define the trimming percentage
trim_percent = 0.00  # 1% trimming is 0.01

# Right-tailed trimmed mean of a single row
def right_tailed_trimmed_mean(row, proportion):
    """Return the mean of `row` after discarding its largest values.

    The values are sorted ascending and the top ``int(len(row) * proportion)``
    entries are removed before averaging.

    Parameters
    ----------
    row : iterable of numbers
    proportion : float
        Fraction of the values to drop from the high end.

    Returns
    -------
    float
        The trimmed mean, or NaN when no values remain.
    """
    ascending = sorted(row)
    total = len(ascending)
    n_dropped = int(total * proportion)
    # only the right (high) tail is cut
    kept = ascending[:total - n_dropped]
    if not kept:
        return float('nan')
    return sum(kept) / len(kept)




# Apply the trimmed mean function to each row
# (`filtered_ffpe` comes from an earlier cell; one trimmed mean per row)
trimmed_means = filtered_ffpe.apply(lambda row: right_tailed_trimmed_mean(row, trim_percent), axis=1)

# Bin range: small positive offset on the low end so log10 is defined when the minimum is 0
min_data = np.min(trimmed_means + 0.0000001)
max_data = np.max(trimmed_means + 1)
print(min_data, max_data)

# Generate log-spaced bins
# NOTE(review): the first bin edge is min + 1e-7, so rows whose trimmed mean equals the
# minimum fall just below the first bin — confirm that is acceptable.
bins = np.logspace(np.log10(min_data), np.log10(max_data), 500)

plt.figure(figsize=(4, 4))
plt.hist(trimmed_means, bins=bins, color='blue', alpha=0.7, log=True)
plt.xlabel('Mean of Transcript Counts')
plt.ylabel('Counts (log-scaled)')
# x axis is left linear even though the bins are log-spaced (xscale call is disabled)
#plt.xscale('log')
plt.yscale('log')
#plt.xlim(left=1)
dataset_name = "GSE209998 FFPE"
plt.title("Distribution of Mean Counts Per Transcript: " + dataset_name, size=8)

# Save the figure as a PDF before showing it
outpath = "/path/to/6.0.1_Third_Party_Data.Best_AIC_and_Stat_Generator.No_GLM/"
filename = 'GSE209998_FFPE_Log_Distribution.pdf'
plt.savefig(outpath + filename, format='pdf', dpi=300, bbox_inches='tight') 

plt.show()

# Calculate the mean of the trimmed means
overall_mean = trimmed_means.mean()

#print("Trimmed Means for Each Row:")
#print(trimmed_means)
#print("\nOverall Mean of Trimmed Means:")
#print(overall_mean)

# Look up rows whose index label contains the search term (substring match)
search_term = 'MALAT1'

matching_rows = [name for name in trimmed_means.index if search_term in name]

# Despite the header text, the loop prints the trimmed-mean value of each match, not its name
print("Matching Row Names:")
for name in matching_rows:
    print(trimmed_means.loc[name])

In [None]:
# Try out making a plot. Log mean vs log variance.

# NOTE(review): rebinds the notebook-level `trim_percent` (parameters cell: 1 meaning 1%)
# as a fraction; later cells that read `trim_percent` see this value — confirm intended.
trim_percent = 0.00  # 1% trimming is 0.01

def right_tailed_trimmed_mean_and_variance(row, proportion):
    """Mean and sample variance of `row` after dropping its largest values.

    The values are sorted ascending and the top ``int(len(row) * proportion)``
    entries are discarded.

    Parameters
    ----------
    row : iterable of numbers
    proportion : float
        Fraction of the values to drop from the high end.

    Returns
    -------
    (mean, variance) : tuple of float
        ``(nan, nan)`` when nothing remains; the variance uses an ``n - 1``
        denominator and is ``0.0`` when exactly one value remains.
    """
    ascending = sorted(row)
    size = len(ascending)
    # cut the right (highest) tail only
    kept = ascending[:size - int(size * proportion)]
    count = len(kept)
    if count == 0:
        return float('nan'), float('nan')

    mean = sum(kept) / count
    if count == 1:
        # a single remaining point has no spread
        return mean, 0.0

    squared_deviations = [(value - mean) ** 2 for value in kept]
    return mean, sum(squared_deviations) / (count - 1)


# Collect (mean, variance) per row of `filtered_ffpe`, keeping only rows where both are
# strictly positive so the logs below are defined.
trimmed_means = []
trimmed_variances = []
for index, row in filtered_ffpe.iterrows():
    mean, variance = right_tailed_trimmed_mean_and_variance(row, trim_percent)
    # Only include valid results
    if not np.isnan(mean) and mean > 0 and variance > 0:
        trimmed_means.append(mean)
        trimmed_variances.append(variance)


# Natural log of both quantities (epsilon is a safety offset; inputs are already > 0)
epsilon = 1e-10
log_trimmed_means = np.log(np.array(trimmed_means) + epsilon)
log_trimmed_variances = np.log(np.array(trimmed_variances) + epsilon)

# Figure 1: scatter plot (displayed only, not saved)
plt.figure(figsize=(4, 4))
plt.scatter(log_trimmed_means, log_trimmed_variances, alpha=0.6, edgecolors='w', s=50)

# Plot the green line where variance equals mean (slope = 1)
x_values = np.linspace(np.min(log_trimmed_means), np.max(log_trimmed_means), 100)
y_values_poisson = x_values  # Since log(var) = log(mean)

plt.plot(x_values, y_values_poisson, color='green', linestyle='--', label='Variance = Mean (Poisson)')

# Plot the red line where variance equals mean squared (slope = 2)
y_values_quadratic = 2 * x_values  # Since log(var) = 2 * log(mean)

plt.plot(x_values, y_values_quadratic, color='red', linestyle='--', label='Variance = Mean²')

# Add labels and title
plt.xlabel('Log (Mean)')
plt.ylabel('Log (Variance)')
dataset_name = "GSE209998 FFPE"
plt.title('Log(Mean) vs. Log(Variance): ' + dataset_name)

# Add a legend
plt.legend()

# Add grid
plt.grid(True)

# Show the plot
plt.show()

# Figure 2: same data as a hexbin density plot (this one is saved to PDF)
plt.figure(figsize=(4, 4))
hb = plt.hexbin(log_trimmed_means, log_trimmed_variances, gridsize=50, cmap='viridis', mincnt=1)

# Add a colorbar
cb = plt.colorbar(hb)
cb.set_label('Counts')

# Plot the theoretical lines
x_values = np.linspace(np.min(log_trimmed_means), np.max(log_trimmed_means), 100)

# Variance equals mean (Poisson)
y_values_poisson = x_values
plt.plot(x_values, y_values_poisson, color='green', linestyle='--', label='Variance = Mean (Poisson)')

# Variance equals mean squared
y_values_quadratic = 2 * x_values
plt.plot(x_values, y_values_quadratic, color='red', linestyle='--', label='Variance = Mean²')

# Labels and title
plt.xlabel('Log (Mean)')
plt.ylabel('Log (Variance)')
plt.title('Log(Mean) vs. Log(Variance): ' + dataset_name, size=8)

# Legend and grid
plt.legend(fontsize=8)
plt.grid(True)

# Save before plt.show() so the current figure is still the hexbin plot
outpath = "/path/to/6.0.1_Third_Party_Data.Best_AIC_and_Stat_Generator.No_GLM/"
filename = 'GSE209998_FFPE_Log_Mean_vs_Log_Var.pdf'
plt.savefig(outpath + filename, format='pdf', dpi=300, bbox_inches='tight') 

# Show the plot
plt.show()

In [None]:
#filtered_ffpe = filtered_ffpe.iloc[1:200]

print("GSE209998 - FFPE Tumours")

# One result per gene (row) from manual_aic, which is defined earlier in the notebook.
# Each result unpacks into four fields; the first is tallied below, the remaining three
# are kept under the nb_theta / zinb_theta / zinb_pi names.
per_gene_fits = filtered_ffpe.apply(manual_aic, axis=1)
(aic_values,
 nb_theta_GSE209998_ffpe,
 zinb_theta_GSE209998_ffpe,
 zinb_pi_GSE209998_ffpe) = zip(*per_gene_fits)

# How often each value of the first field occurs across genes
AIC_top_rank_GSE209998_ffpe = pd.Series(aic_values).value_counts()
print(AIC_top_rank_GSE209998_ffpe)

In [None]:
print("GSE209998 - Fresh/Frozen Tumours")

# One manual_aic result per gene (row); each unpacks into four fields.
per_gene_fits = filtered_fresh.apply(manual_aic, axis=1)
(aic_values,
 nb_theta_GSE209998_fresh,
 zinb_theta_GSE209998_fresh,
 zinb_pi_GSE209998_fresh) = zip(*per_gene_fits)

# Tally of the first field across genes
AIC_top_rank_GSE209998_fresh = pd.Series(aic_values).value_counts()
print(AIC_top_rank_GSE209998_fresh)

In [None]:
# Load the GSE47462 raw RefSeq gene counts (tab-separated)
data = pd.read_csv('/path/to/third_party/GSE47462_third_party_ffpe/GSE47462_Raw_counts_Refseq_genes.txt',
                   sep='\t')

# Column subsets, selected by a substring of the column name
normal_data, EN_data, DCIS_data, IDC_data = (
    data.filter(like=tag) for tag in ('_normal', '_EN', '_DCIS', '_IDC')
)

# There isn't a lot of data, so every column that is not '_normal' is pooled as
# "tumour" and indexed by the 'symbol' column.
is_normal_column = data.columns.str.contains('_normal')
tumours_data = data.loc[:, ~is_normal_column].set_index('symbol')

# give the normal table the same row labels
normal_data.index = tumours_data.index

# adjust for library size (fraction method), then keep genes whose fraction of
# zero-count samples is below (1 - express_percent_limit)
tumours_data_libadjust = library_adjust(tumours_data)
fraction_of_zeroes = tumours_data_libadjust.eq(0).mean(axis=1)
filtered_tumour = tumours_data_libadjust[fraction_of_zeroes < (1 - express_percent_limit)]

normal_data_libadjust = library_adjust(normal_data)
fraction_of_zeroes = normal_data_libadjust.eq(0).mean(axis=1)
filtered_normal = normal_data_libadjust[fraction_of_zeroes < (1 - express_percent_limit)]

print(filtered_tumour.shape)
print(filtered_normal.shape)

print("Tumour")
dataset_stats = dataset_stats_generator(filtered_tumour, draw_zero_distribution=True, dataset_name="GSE47462 Tumours")
# report the first four entries of the stats result
stat_labels = ("Average Library Size: ", "Fraction of Zeroes: ",
               "Average Mean Expression: ", "Average Stdev Expression: ")
for pos, label in enumerate(stat_labels):
    print(label, dataset_stats[pos])

# Per-gene summary columns appended to a copy of the filtered counts
new_tumours_counts = filtered_tumour.assign(
    Mean=filtered_tumour.mean(axis=1),
    StdDev=filtered_tumour.std(axis=1),
    Median=filtered_tumour.median(axis=1),
    Min=filtered_tumour.min(axis=1),
    Max=filtered_tumour.max(axis=1),
)
top_10_means = new_tumours_counts.nlargest(10, 'Mean')

# Summary of the ten genes with the highest mean
print("\nTop 10 expressed genes")
print(top_10_means[['Mean', 'StdDev', 'Median', 'Min', 'Max']])


# Std-dev (ddof=0) of the per-gene means, recomputed after removing the current
# highest-mean gene each round (at most 10 rounds)
GSE47462_tumour_stdev = []
new_tumours_counts = filtered_tumour.copy()
for i in range(min(10, len(new_tumours_counts))):
    row_means = new_tumours_counts.mean(axis=1)
    std_dev_of_means = row_means.std(ddof=0)
    GSE47462_tumour_stdev.append(std_dev_of_means)
    max_mean_index = row_means.idxmax()
    new_tumours_counts = new_tumours_counts.drop(index=max_mean_index)

print(GSE47462_tumour_stdev)

print("Normal")
dataset_stats = dataset_stats_generator(filtered_normal, draw_zero_distribution=True, dataset_name="GSE47462 Normals")
# report the first four entries of the stats result
stat_labels = ("Average Library Size: ", "Fraction of Zeroes: ",
               "Average Mean Expression: ", "Average Stdev Expression: ")
for pos, label in enumerate(stat_labels):
    print(label, dataset_stats[pos])

# Per-gene summary columns appended to a copy of the filtered counts
new_tumours_counts = filtered_normal.assign(
    Mean=filtered_normal.mean(axis=1),
    StdDev=filtered_normal.std(axis=1),
    Min=filtered_normal.min(axis=1),
    Max=filtered_normal.max(axis=1),
)
top_10_means = new_tumours_counts.nlargest(10, 'Mean')

# Summary of the ten genes with the highest mean
print("\nTop 10 expressed genes")
print(top_10_means[['Mean', 'StdDev', 'Min', 'Max']])


# Std-dev (ddof=0) of the per-gene means, recomputed after removing the current
# highest-mean gene each round (at most 10 rounds)
GSE47462_normal_stdev = []
new_tumours_counts = filtered_normal.copy()
for i in range(min(10, len(new_tumours_counts))):
    row_means = new_tumours_counts.mean(axis=1)
    std_dev_of_means = row_means.std(ddof=0)
    GSE47462_normal_stdev.append(std_dev_of_means)
    max_mean_index = row_means.idxmax()
    new_tumours_counts = new_tumours_counts.drop(index=max_mean_index)

print(GSE47462_normal_stdev)


In [None]:
#filtered_tumour = filtered_tumour.iloc[1:200]

print("GSE47462 - Tumours")

# One manual_aic result per gene (row); each unpacks into four fields.
per_gene_fits = filtered_tumour.apply(manual_aic, axis=1)
(aic_values,
 nb_theta_GSE47462_tumours,
 zinb_theta_GSE47462_tumours,
 zinb_pi_GSE47462_tumours) = zip(*per_gene_fits)

# Tally of the first field across genes
AIC_top_rank_GSE47462_tumours = pd.Series(aic_values).value_counts()
print(AIC_top_rank_GSE47462_tumours)

In [None]:
#filtered_normal = filtered_normal.iloc[1:200]

print("GSE47462 - Normal")

# One manual_aic result per gene (row); each unpacks into four fields.
per_gene_fits = filtered_normal.apply(manual_aic, axis=1)
(aic_values,
 nb_theta_GSE47462_normals,
 zinb_theta_GSE47462_normals,
 zinb_pi_GSE47462_normals) = zip(*per_gene_fits)

# Tally of the first field across genes
AIC_top_rank_GSE47462_normals = pd.Series(aic_values).value_counts()
print(AIC_top_rank_GSE47462_normals)

In [None]:
# Read the tab-separated raw-count table into a DataFrame
data = pd.read_csv('/path/to/third_party/GSE120795_third_party_ffpe/GSE120795_total_norms_raw_counts.tsv',
                  delimiter='\t')

# Sample annotation table; its first row is compared against "healthy" below
# (per the original note, this corresponds to "disease: healthy" in the series matrix)
patient_info = pd.read_csv('/path/to/third_party/GSE120795_third_party_ffpe/GSE120795_cell_info.txt',
                  delimiter='\t')

# this filter is present because those filtered out were not FFPE (blood and bone marrow)
mask = patient_info.iloc[0] == "healthy"

# Keep the annotation columns flagged by the mask and turn their names into count-table
# column names by appending ".fastq.gz"
filtered_data = patient_info.loc[:, mask]
patient_names = filtered_data.columns
column_names_with_extension = [name + ".fastq.gz" for name in patient_names]
# NOTE(review): drops the first selected name — presumably a non-sample column;
# verify against the annotation file that no real sample is lost here.
column_names_with_extension = column_names_with_extension[1:]

# Select the matching sample columns from the count table
filtered_data = data[column_names_with_extension]
ffpe_counts = pd.DataFrame(filtered_data)
#filtered_data = ffpe_counts[ffpe_counts.sum(axis=1) > 1]
print(ffpe_counts.shape)


# Library-size adjustment, then keep genes whose fraction of zero-count samples is
# below (1 - express_percent_limit)
ffpe_counts_libadjust = library_adjust(ffpe_counts)
fraction_of_zeroes = (ffpe_counts_libadjust == 0).mean(axis=1)
filtered_data = ffpe_counts_libadjust[fraction_of_zeroes < (1 - express_percent_limit)] # must be expressed to this percentage of patients
print(filtered_data.shape)

# --- GSE120795: summary statistics for the filtered counts ---
dataset_stats = dataset_stats_generator(filtered_data, draw_zero_distribution=True, dataset_name="GSE120795")
print("Average Library Size: ", dataset_stats[0])
print("Fraction of Zeroes: ", dataset_stats[1])
print("Average Mean Expression: ", dataset_stats[2])
print("Average Stdev Expression: ", dataset_stats[3])
print(dataset_stats)


# Per-gene summary columns, computed from the filtered counts only
new_tumours_counts = filtered_data.copy()
new_tumours_counts['Mean'] = filtered_data.mean(axis=1)
new_tumours_counts['StdDev'] = filtered_data.std(axis=1)
# was filtered_data.max(axis=1), which made the 'Median' column a copy of 'Max'
new_tumours_counts['Median'] = filtered_data.median(axis=1)
new_tumours_counts['Min'] = filtered_data.min(axis=1)
new_tumours_counts['Max'] = filtered_data.max(axis=1)

top_10_means = new_tumours_counts.nlargest(10, 'Mean')

# Print the summary columns for the ten genes with the highest mean
print("\nTop 10 expressed genes")
print(top_10_means[['Mean', 'StdDev', 'Median', 'Min', 'Max']])


# Population std-dev (ddof=0) of the per-gene means, recomputed after removing the
# current highest-mean gene each round (at most 10 rounds).
GSE120795_stdev = []
new_tumours_counts = filtered_data.copy()
for i in range(min(10, len(new_tumours_counts))):
    # Calculate the mean for each row
    row_means = new_tumours_counts.mean(axis=1)

    # Calculate the standard deviation of these means
    std_dev_of_means = row_means.std(ddof=0)

    # Append the standard deviation to the list
    GSE120795_stdev.append(std_dev_of_means)

    # Remove the row with the highest mean
    max_mean_index = row_means.idxmax()
    new_tumours_counts.drop(index=max_mean_index, inplace=True)

# print(dataset_stats)

print(GSE120795_stdev)



In [None]:
#filtered_data = filtered_data.iloc[1:200]

print("GSE120795 - Normal")

# One manual_aic result per gene (row); each unpacks into four fields.
per_gene_fits = filtered_data.apply(manual_aic, axis=1)
(aic_values,
 nb_theta_GSE120795_normals,
 zinb_theta_GSE120795_normals,
 zinb_pi_GSE120795_normals) = zip(*per_gene_fits)

# Tally of the first field across genes
AIC_top_rank_GSE120795_normals = pd.Series(aic_values).value_counts()
print(AIC_top_rank_GSE120795_normals)

In [None]:
# the GDC Count-Me-In Data
data = pd.read_csv('/path/to/third_party/CountMeIn_BConly_third_party_ffpe/MBC_CMI_Compiled_Counts.tsv',
                   sep=' ')  # space delimited

tumours_counts = pd.DataFrame(data)

# keep rows that have a gene_type, and index them by gene_name
has_gene_type = tumours_counts['gene_type'].notna()
tumours_counts = tumours_counts[has_gene_type].set_index('gene_name')

# with gene_name moved to the index, drop the first two remaining columns
# (presumably the other annotation columns — verify against the file header)
leading_columns = tumours_counts.columns[:2]
tumours_counts = tumours_counts.drop(columns=leading_columns)

# library adjust; remove genes expressed < express_percent_limit
tumours_counts_libadjust = library_adjust(tumours_counts)
fraction_of_zeroes = tumours_counts_libadjust.eq(0).mean(axis=1)
filtered_df = tumours_counts_libadjust[fraction_of_zeroes < (1 - express_percent_limit)]
print(filtered_df.shape)

dataset_stats = dataset_stats_generator(filtered_df, draw_zero_distribution=True, dataset_name="TMBC Project")
# report the first four entries of the stats result, then the whole object
stat_labels = ("Average Library Size: ", "Fraction of Zeroes: ",
               "Average Mean Expression: ", "Average Stdev Expression: ")
for pos, label in enumerate(stat_labels):
    print(label, dataset_stats[pos])
print(dataset_stats)

# Per-gene summary columns appended to a copy of the filtered counts
new_tumours_counts = filtered_df.assign(
    Mean=filtered_df.mean(axis=1),
    StdDev=filtered_df.std(axis=1),
    Min=filtered_df.min(axis=1),
    Max=filtered_df.max(axis=1),
)
top_10_means = new_tumours_counts.nlargest(10, 'Mean')

# Summary of the ten genes with the highest mean
print("\nTop 10 expressed genes")
print(top_10_means[['Mean', 'StdDev', 'Min', 'Max']])


# Std-dev (ddof=0) of the per-gene means, recomputed after removing the current
# highest-mean gene each round (at most 10 rounds)
TMBC_stdev = []
new_tumours_counts = filtered_df.copy()
for i in range(min(10, len(new_tumours_counts))):
    row_means = new_tumours_counts.mean(axis=1)
    std_dev_of_means = row_means.std(ddof=0)
    TMBC_stdev.append(std_dev_of_means)
    max_mean_index = row_means.idxmax()
    new_tumours_counts = new_tumours_counts.drop(index=max_mean_index)

print(TMBC_stdev)

In [None]:
# Define the trimming percentage
# NOTE(review): rebinds the notebook-level `trim_percent` (parameters cell: 1 meaning 1%)
# as a fraction; later cells that read `trim_percent` see this value — confirm intended.
trim_percent = 0.01  # 1% trimming

# Symmetric trimmed mean of a single row
def trimmed_mean(row, proportion):
    """Return the mean of `row` after cutting both tails.

    The values are sorted and ``int(len(row) * proportion)`` entries are
    removed from each end before averaging.

    Parameters
    ----------
    row : iterable of numbers
    proportion : float
        Fraction of the values to drop from each end.

    Returns
    -------
    float
        The trimmed mean, or NaN when no values remain.
    """
    ordered = sorted(row)
    size = len(ordered)
    cut = int(size * proportion)
    middle = ordered[cut:size - cut]
    if not middle:
        return float('nan')
    return sum(middle) / len(middle)

# One trimmed mean per row of the filtered Count-Me-In table
trimmed_means = filtered_df.apply(trimmed_mean, axis=1, args=(trim_percent,))

# Mean over all per-row trimmed means
overall_mean = trimmed_means.mean()

# Rows whose index label contains the search term (substring match)
search_term = 'XIST'
matching_rows = []
for name in trimmed_means.index:
    if search_term in name:
        matching_rows.append(name)

# Prints the trimmed-mean value of each matching row under the header below
print("Matching Row Names:")
for name in matching_rows:
    print(trimmed_means.loc[name])


In [None]:
#filtered_df = filtered_df.iloc[1:200]

print("Count Me In - Breast Cancer Only")

# One manual_aic result per gene (row); each unpacks into four fields.
per_gene_fits = filtered_df.apply(manual_aic, axis=1)
(aic_values,
 nb_theta_CMI,
 zinb_theta_CMI,
 zinb_pi_CMI) = zip(*per_gene_fits)

# Tally of the first field across genes
AIC_top_rank_CMI = pd.Series(aic_values).value_counts()
print(AIC_top_rank_CMI)

In [None]:
# Our dataset!
# pyreadr.read_r returns a mapping; the tables are pulled out below with the key None
all_counts = pyreadr.read_r('/path/to/dcis/expression_counts.Jan2023_1_2_and_2_2.rds')
vst_norm = pyreadr.read_r('/path/to/dcis/expression_VST_Normalized.Jan2023_1_2_and_2_2.rds')

# this data is loading without issue
ship_data = pyreadr.read_r('/path/to/dcis/ship1_2_full_tbl.Jan2023.With_Stroma_Assignment.rds')
# I wish that we could've simply used the RDA, but the counts-only RDS works and loads faster so what can you do
# in the future, could try the package 'rpy2' instead, it's an alternative that requires R but that's okay for us

# now we want to isolate just the expression from a particular type of tissue
df = all_counts[None] # load all_counts into a pandas data frame

# Eliminate any samples in the blacklist
ship_df = ship_data[None]
#print(ship_df['blacklist'].value_counts()) # they're all false

# since ship_data already has patients filtered out, lets filter out any patient who isn't on the list
# match by 'sample_name'
df_blacklist_filtered = df[ship_df['sample_name']]

# split the patients by tissue
# NOTE(review): filter(like=...) matches the substring anywhere in the column name;
# presumably sample names carry exactly one of _D / _S / _N — verify.
count_DCIS = df_blacklist_filtered.filter(like='_D')
count_STROMA = df_blacklist_filtered.filter(like='_S')
count_NORMAL = df_blacklist_filtered.filter(like='_N')

# if we want consistency between the 3 sample types
# (the VST-based row filter is disabled; vst_table is not used in this cell after this line)
vst_table = vst_norm[None] # we don't apply this anymore because it blocks any gene with >80% frac_zero
filtered_norm_count = count_NORMAL#[count_NORMAL.index.isin(vst_table.index)]
filtered_tumour_count = count_DCIS#[count_DCIS.index.isin(vst_table.index)]
filtered_stroma_count = count_STROMA#[count_STROMA.index.isin(vst_table.index)]

# For each tissue: library-adjust, then keep genes whose fraction of zero-count samples is
# below (1 - express_percent_limit). The filtered_* names are rebound to the filtered tables.
filtered_norm_count_libadjust = library_adjust(filtered_norm_count)
fraction_of_zeroes = (filtered_norm_count_libadjust == 0).mean(axis=1)
filtered_norm_count = filtered_norm_count_libadjust[fraction_of_zeroes < (1 - express_percent_limit)] # must be expressed to this percentage of patients

filtered_tumour_count_libadjust = library_adjust(filtered_tumour_count)
fraction_of_zeroes = (filtered_tumour_count_libadjust == 0).mean(axis=1)
filtered_tumour_count = filtered_tumour_count_libadjust[fraction_of_zeroes < (1 - express_percent_limit)] # must be expressed to this percentage of patients

filtered_stroma_count_libadjust = library_adjust(filtered_stroma_count)
fraction_of_zeroes = (filtered_stroma_count_libadjust == 0).mean(axis=1)
filtered_stroma_count = filtered_stroma_count_libadjust[fraction_of_zeroes < (1 - express_percent_limit)] # must be expressed to this percentage of patients



print("DCIS")
print(filtered_tumour_count.shape)
dataset_stats = dataset_stats_generator(filtered_tumour_count, draw_zero_distribution=True, dataset_name="Sunnybrook DCIS")
# report the first four entries of the stats result
stat_labels = ("Average Library Size: ", "Fraction of Zeroes: ",
               "Average Mean Expression: ", "Average Stdev Expression: ")
for pos, label in enumerate(stat_labels):
    print(label, dataset_stats[pos])

# Per-gene summary columns appended to a copy of the filtered counts
new_tumours_counts = filtered_tumour_count.assign(
    Mean=filtered_tumour_count.mean(axis=1),
    StdDev=filtered_tumour_count.std(axis=1),
    Median=filtered_tumour_count.median(axis=1),
    Min=filtered_tumour_count.min(axis=1),
    Max=filtered_tumour_count.max(axis=1),
)
top_10_means = new_tumours_counts.nlargest(10, 'Mean')

# Summary of the ten genes with the highest mean
print("\nTop 10 expressed genes")
print(top_10_means[['Mean', 'StdDev', 'Median', 'Min', 'Max']])


# Std-dev (ddof=0) of the per-gene means, recomputed after removing the current
# highest-mean gene each round (at most 10 rounds)
DCIS_tumour_stdev = []
new_tumours_counts = filtered_tumour_count.copy()
for i in range(min(10, len(new_tumours_counts))):
    row_means = new_tumours_counts.mean(axis=1)
    std_dev_of_means = row_means.std(ddof=0)
    DCIS_tumour_stdev.append(std_dev_of_means)
    max_mean_index = row_means.idxmax()
    new_tumours_counts = new_tumours_counts.drop(index=max_mean_index)

print(DCIS_tumour_stdev)

print("Stroma")
print(filtered_stroma_count.shape)
dataset_stats = dataset_stats_generator(filtered_stroma_count, draw_zero_distribution=True, dataset_name="Sunnybrook Stroma")
# report the first four entries of the stats result
stat_labels = ("Average Library Size: ", "Fraction of Zeroes: ",
               "Average Mean Expression: ", "Average Stdev Expression: ")
for pos, label in enumerate(stat_labels):
    print(label, dataset_stats[pos])

# Per-gene summary columns appended to a copy of the filtered counts
new_tumours_counts = filtered_stroma_count.assign(
    Mean=filtered_stroma_count.mean(axis=1),
    StdDev=filtered_stroma_count.std(axis=1),
    Min=filtered_stroma_count.min(axis=1),
    Max=filtered_stroma_count.max(axis=1),
)
top_10_means = new_tumours_counts.nlargest(10, 'Mean')

# Summary of the ten genes with the highest mean
print("\nTop 10 expressed genes")
print(top_10_means[['Mean', 'StdDev', 'Min', 'Max']])


# Std-dev (ddof=0) of the per-gene means, recomputed after removing the current
# highest-mean gene each round (at most 10 rounds)
DCIS_stroma_stdev = []
new_tumours_counts = filtered_stroma_count.copy()
for i in range(min(10, len(new_tumours_counts))):
    row_means = new_tumours_counts.mean(axis=1)
    std_dev_of_means = row_means.std(ddof=0)
    DCIS_stroma_stdev.append(std_dev_of_means)
    max_mean_index = row_means.idxmax()
    new_tumours_counts = new_tumours_counts.drop(index=max_mean_index)

print(DCIS_stroma_stdev)



print("Normal")
print(filtered_norm_count.shape)
dataset_stats = dataset_stats_generator(filtered_norm_count, draw_zero_distribution=True, dataset_name="Sunnybrook Normals")
# report the first four entries of the stats result
stat_labels = ("Average Library Size: ", "Fraction of Zeroes: ",
               "Average Mean Expression: ", "Average Stdev Expression: ")
for pos, label in enumerate(stat_labels):
    print(label, dataset_stats[pos])

# Per-gene summary columns appended to a copy of the filtered counts
new_tumours_counts = filtered_norm_count.assign(
    Mean=filtered_norm_count.mean(axis=1),
    StdDev=filtered_norm_count.std(axis=1),
    Min=filtered_norm_count.min(axis=1),
    Max=filtered_norm_count.max(axis=1),
)
top_10_means = new_tumours_counts.nlargest(10, 'Mean')

# Summary of the ten genes with the highest mean
print("\nTop 10 expressed genes")
print(top_10_means[['Mean', 'StdDev', 'Min', 'Max']])


# Std-dev (ddof=0) of the per-gene means, recomputed after removing the current
# highest-mean gene each round (at most 10 rounds)
DCIS_normal_stdev = []
new_tumours_counts = filtered_norm_count.copy()
for i in range(min(10, len(new_tumours_counts))):
    row_means = new_tumours_counts.mean(axis=1)
    std_dev_of_means = row_means.std(ddof=0)
    DCIS_normal_stdev.append(std_dev_of_means)
    max_mean_index = row_means.idxmax()
    new_tumours_counts = new_tumours_counts.drop(index=max_mean_index)

print(DCIS_normal_stdev)



In [None]:
# Define the trimming percentage
# NOTE(review): rebinds the notebook-level `trim_percent` (parameters cell: 1 meaning 1%)
# as a fraction; later cells that read `trim_percent` see this value — confirm intended.
trim_percent = 0.01  # 1% trimming

# Symmetric trimmed mean of a single row
def trimmed_mean(row, proportion):
    """Average of `row` with the same number of values cut from each end.

    After sorting, ``int(len(row) * proportion)`` values are removed from
    the low end and the same number from the high end.

    Parameters
    ----------
    row : iterable of numbers
    proportion : float
        Fraction of the values to drop from each end.

    Returns
    -------
    float
        The trimmed mean, or NaN if trimming leaves nothing.
    """
    values = sorted(row)
    count = len(values)
    per_side = int(count * proportion)
    core = values[per_side:count - per_side]
    if len(core) == 0:
        return float('nan')
    return sum(core) / len(core)

# One trimmed mean per row of the filtered DCIS table
trimmed_means = filtered_tumour_count.apply(trimmed_mean, axis=1, args=(trim_percent,))

# Mean over all per-row trimmed means
overall_mean = trimmed_means.mean()

# Rows whose index label contains the search term (substring match)
search_term = 'ENSG00000245532'
matching_rows = []
for name in trimmed_means.index:
    if search_term in name:
        matching_rows.append(name)

# Prints the trimmed-mean value of each matching row under the header below
print("Matching Row Names:")
for name in matching_rows:
    print(trimmed_means.loc[name])


In [None]:
# Try out making a plot. Log mean vs log variance.

# NOTE(review): rebinds the notebook-level `trim_percent` (parameters cell: 1 meaning 1%)
# as a fraction. This is the last assignment before the AIC-table file name is built from
# `trim_percent`, so that name will carry this value — confirm intended.
trim_percent = 0.00  # 1% trimming is 0.01

def right_tailed_trimmed_mean_and_variance(row, proportion):
    """Mean and sample variance of `row` once its highest values are removed.

    Sorting ascending, the top ``int(len(row) * proportion)`` values are
    discarded; statistics are computed on what is left.

    Parameters
    ----------
    row : iterable of numbers
    proportion : float
        Fraction of the values to drop from the high end.

    Returns
    -------
    (mean, variance) : tuple of float
        ``(nan, nan)`` if nothing is left. The variance divides by ``n - 1``
        and is reported as ``0.0`` when a single value is left.
    """
    values = sorted(row)
    total = len(values)
    # trim the right (highest) tail only
    remaining = values[:total - int(total * proportion)]
    n_remaining = len(remaining)
    if n_remaining == 0:
        return float('nan'), float('nan')

    mean = sum(remaining) / n_remaining
    if n_remaining == 1:
        # one point: no spread
        return mean, 0.0

    sum_sq = sum([(item - mean) ** 2 for item in remaining])
    return mean, sum_sq / (n_remaining - 1)


# Collect (mean, variance) per row of `filtered_tumour_count`, keeping only rows where
# both are strictly positive so the logs below are defined.
trimmed_means = []
trimmed_variances = []
for index, row in filtered_tumour_count.iterrows():
    mean, variance = right_tailed_trimmed_mean_and_variance(row, trim_percent)
    # Only include valid results
    if not np.isnan(mean) and mean > 0 and variance > 0:
        trimmed_means.append(mean)
        trimmed_variances.append(variance)


# Natural log of both quantities (epsilon is a safety offset; inputs are already > 0)
epsilon = 1e-10
log_trimmed_means = np.log(np.array(trimmed_means) + epsilon)
log_trimmed_variances = np.log(np.array(trimmed_variances) + epsilon)

# Hexbin density plot of log(mean) vs log(variance); displayed only, not saved
plt.figure(figsize=(8, 6))
hb = plt.hexbin(log_trimmed_means, log_trimmed_variances, gridsize=50, cmap='viridis', mincnt=1)

# Add a colorbar
cb = plt.colorbar(hb)
cb.set_label('Counts')

# Plot the theoretical lines
x_values = np.linspace(np.min(log_trimmed_means), np.max(log_trimmed_means), 100)

# Variance equals mean (Poisson)
y_values_poisson = x_values
plt.plot(x_values, y_values_poisson, color='green', linestyle='--', label='Variance = Mean (Poisson)')

# Variance equals mean squared
y_values_quadratic = 2 * x_values
plt.plot(x_values, y_values_quadratic, color='red', linestyle='--', label='Variance = Mean²')

# Labels and title
plt.xlabel('Log (Mean)')
plt.ylabel('Log (Variance)')
dataset_name = "Sunnybrook (DCIS)"
plt.title('Log(Mean) vs. Log(Variance): ' + dataset_name)
# Legend and grid
plt.legend()
plt.grid(True)

# Show the plot
plt.show()

In [None]:
# NOTE(review): unlike the other AIC cells (where the subsetting line is commented out),
# this cell fits only the rows at positions 1..4999 — iloc[1:5000] also skips the first row.
# AIC_top_rank_DCIS feeds the combined table/heatmap below, so "DCIS (Tumour)" reflects this
# subset only — confirm whether that is intended or a leftover from a quick test run.
filt_filtered_tumour_count = filtered_tumour_count.iloc[1:5000]

print("Our Data: Tumours")
#print(filtered_tumour_count.head(10))

# One manual_aic result per gene (row); each result unpacks into four fields
aic_values = filt_filtered_tumour_count.apply(manual_aic, axis=1)
aic_values, nb_theta_DCIS, zinb_theta_DCIS, zinb_pi_DCIS = zip(*aic_values)

# Tally of the first field across genes
AIC_top_rank_DCIS = pd.Series(aic_values).value_counts()      

print(AIC_top_rank_DCIS)

In [None]:
#filtered_norm_count = filtered_norm_count.iloc[1:200]

print("Our Data: Normal")

# One manual_aic result per gene (row); each unpacks into four fields.
per_gene_fits = filtered_norm_count.apply(manual_aic, axis=1)
(aic_values,
 nb_theta_DCISNorm,
 zinb_theta_DCISNorm,
 zinb_pi_DCISNorm) = zip(*per_gene_fits)

# Tally of the first field across genes
AIC_top_rank_DCISNorm = pd.Series(aic_values).value_counts()
print(AIC_top_rank_DCISNorm)

In [None]:
#filtered_stroma_count = filtered_stroma_count.iloc[1:200]

print("Our Data: Stroma")

# One manual_aic result per gene (row); each unpacks into four fields.
per_gene_fits = filtered_stroma_count.apply(manual_aic, axis=1)
(aic_values,
 nb_theta_DCISStrom,
 zinb_theta_DCISStrom,
 zinb_pi_DCISStrom) = zip(*per_gene_fits)

# Tally of the first field across genes
AIC_top_rank_DCISStrom = pd.Series(aic_values).value_counts()
print(AIC_top_rank_DCISStrom)

In [None]:
# Build the combined "fraction of genes per top-ranked distribution" table
# (one row per dataset after the transpose) and write it to CSV.

# Row labels for the table/heatmap: each value_counts Series is named after its dataset
AIC_top_rank_GSE167977.name = "GSE167977 (BC)"
AIC_top_rank_GSE181466.name = "GSE181466 (TNBC)"
AIC_top_rank_GSE146889_tumours.name = "GSE146889 (Colo./Endo.)"
AIC_top_rank_GSE146889_normals.name = "GSE146889 (Normal)"
AIC_top_rank_GSE209998_ffpe.name = "GSE209998 (BC|FFPE)"
AIC_top_rank_GSE209998_fresh.name = "GSE209998 (BC|FRESH)"
AIC_top_rank_GSE47462_tumours.name = "GSE47462 (BC)"
AIC_top_rank_GSE47462_normals.name = "GSE47462 (Normal)"
AIC_top_rank_GSE120795_normals.name = "GSE120795 (Normal)"
AIC_top_rank_CMI.name = "TMBC Project"
AIC_top_rank_DCIS.name = "DCIS (Tumour)"
AIC_top_rank_DCISNorm.name = "DCIS (Normal)"
AIC_top_rank_DCISStrom.name = "DCIS (Stroma)"


# lets combine the results in one large table
# concat aligns on the distribution labels; a label missing from one dataset becomes NaN
combined_table = pd.concat([
    AIC_top_rank_GSE47462_tumours, AIC_top_rank_GSE47462_normals,
    AIC_top_rank_GSE120795_normals,
    AIC_top_rank_GSE146889_tumours, AIC_top_rank_GSE146889_normals,
    AIC_top_rank_GSE167977, AIC_top_rank_GSE181466,
    AIC_top_rank_GSE209998_ffpe, AIC_top_rank_GSE209998_fresh,
    AIC_top_rank_CMI, AIC_top_rank_DCIS, AIC_top_rank_DCISNorm, AIC_top_rank_DCISStrom
    ], axis=1)
# print(combined_table)

# change table to be based on percentages (each dataset's counts divided by its total;
# the total still includes any 'ZEROES' entries, which are only dropped afterwards)
df_filled = combined_table.fillna(0)
df_percent = df_filled.div(df_filled.sum(axis=0), axis=1)
df_transposed = df_percent.T

if 'ZEROES' in df_transposed.columns:
    df_transposed = df_transposed.drop('ZEROES', axis=1)

# Column order (and file-name tag) depends on which comparison mode is active
extra = "All"

if NB_ZINB_only:
    new_order = ['NB', 'ZINB']
    extra = "NB_ZINB"
elif no_ZI_AICs:
    new_order = ['NB', 'Exponential', 'Gaussian', 'Poisson']
    extra = "No_ZI"
else:
    new_order = ['NB', 'Exponential', 'Gaussian', 'Poisson', 'ZINB', 'ZIP']

# reindex instead of plain column selection: a distribution that never ranked first in
# ANY dataset has no column at all, and df_transposed[new_order] would raise KeyError.
# Such a distribution is reported as 0.0 for every dataset.
df_transposed = df_transposed.reindex(columns=new_order, fill_value=0.0)

print(df_transposed.round(3))

lib_adj = "No"
if (adjust_for_lib):
    lib_adj = "Yes"

# NOTE(review): `trim_percent` is rebound by several plotting cells above, so the value in
# this file name is whatever was assigned last, not necessarily the parameters-cell value.
df_transposed.to_csv('/path/to/6.0.1_Third_Party_Data.Best_AIC_and_Stat_Generator.No_GLM/Percent_Best_AIC_Table.ZeroFract_' +
                     str(express_percent_limit) + ".Trim_" + str(trim_percent) + ".Lib_Adj_" + str(lib_adj) + "." + str(extra) + ".ZI_start_0.9.csv" , index=True)


In [None]:
# lets use the table to create a heatmap
# (seaborn is first imported here; matplotlib.pyplot is already imported at the top of the notebook)
import seaborn as sns
import matplotlib.pyplot as plt


# Create the heatmap from percentage values above
if (NB_ZINB_only == True):
        # the NB/ZINB plot has just two columns so we should make it narrower
        plt.figure(figsize=(3, 3.7))
elif (no_ZI_AICs == True):
        plt.figure(figsize=(4, 4))
else: 
        plt.figure(figsize=(6, 4))



heatmap = sns.heatmap(df_transposed, annot=True, fmt=".2f", cmap='crest_r', linewidths=.5,
                      annot_kws={"size": 8, "color": 'w'},  # Set annotation text color to white
                      cbar_kws={'shrink': 0.5, 'ticks': [0, 0.5, 1], 'format': '%.2f'})



# make X/Y labels smaller
plt.yticks(fontsize=7)
plt.xticks(fontsize=7, rotation=60)

# draw a horizontal line between certain rows
# (the positions are hard-coded to the dataset order used when the combined table was
# concatenated: they fall on the boundaries between different source datasets)
gap_size = 3
plt.axhline(y=2, color='honeydew', linewidth=gap_size)
plt.axhline(y=3, color='honeydew', linewidth=gap_size)
plt.axhline(y=5, color='honeydew', linewidth=gap_size)
plt.axhline(y=6, color='honeydew', linewidth=gap_size)
plt.axhline(y=7, color='honeydew', linewidth=gap_size)
plt.axhline(y=9, color='honeydew', linewidth=gap_size)
plt.axhline(y=10, color='honeydew', linewidth=gap_size)

# wanted the color bar to display less digits
# NOTE(review): this replaces the [0, 0.5, 1] ticks requested in cbar_kws with
# [vmin, 0, vmax]; for fractions in [0, 1] the middle tick at 0 coincides with or lies
# below vmin — confirm a midpoint tick was not intended.
cbar = heatmap.collections[0].colorbar
cbar.ax.tick_params(labelsize=8)  
cbar.set_ticks([cbar.vmin, 0, cbar.vmax])
cbar.set_ticklabels([f'{cbar.vmin:.1f}', '0.0', f'{cbar.vmax:.1f}'])

# Save and display the plot
plt.tight_layout()
plt.savefig('/path/to/6.0.1_Third_Party_Data.Best_AIC_and_Stat_Generator.No_GLM/Percent_Best_AIC_Table.ZeroFract_' + 
                     str(express_percent_limit) + ".Trim_" + str(trim_percent) + ".Lib_Adj_" + str(lib_adj) + "." + str(extra) +  ".ZI_start_0.9.pdf",
             dpi=300, bbox_inches='tight')  # dpi is dots per inch, for resolution

plt.show()

# Optional: Clear the figure after saving, so that future plt calls don't reuse the same figure
plt.clf()

In [None]:
def _save_fit_parameters(namespace, out_dir, prefixes, datasets):
    """Write every ``<prefix>_<dataset>`` parameter collection to a one-row CSV.

    Parameters
    ----------
    namespace : dict
        Mapping that holds the variables (pass ``globals()`` from the notebook).
    out_dir : str
        Output directory, including the trailing separator.
    prefixes : iterable of str
        Parameter families, e.g. ``"nb_theta"``.
    datasets : iterable of str
        Dataset suffixes, e.g. ``"GSE167977"``.

    For each name the values are wrapped as a single-row DataFrame and written to
    ``<out_dir><name>.csv`` without index or header. The name is then rebound to that
    DataFrame, exactly as the original hand-written cell did, so later cells see the
    same objects as before. Values that are already a DataFrame (i.e. the cell is being
    re-run) are written as they are instead of being wrapped a second time.

    Raises
    ------
    NameError
        If any expected variable is missing; nothing is written in that case.
    """
    names = [f"{prefix}_{dataset}" for prefix in prefixes for dataset in datasets]

    # Fail before touching the disk rather than leaving a half-written set of files.
    missing = [name for name in names if name not in namespace]
    if missing:
        raise NameError("fit parameters not defined: " + ", ".join(missing))

    for name in names:
        values = namespace[name]
        frame = values if isinstance(values, pd.DataFrame) else pd.DataFrame([values])
        frame.to_csv(f'{out_dir}{name}.csv', index=False, header=False)
        namespace[name] = frame


# Only export when all distributions were fitted (neither restricted mode is active),
# i.e. the same condition under which the table cell labels the run "All".
if not NB_ZINB_only and not no_ZI_AICs:

    # saving the dispersion from NB and dispersion/pi from ZINB
    PATH = "/path/to/6.0.1_Third_Party_Data.Best_AIC_and_Stat_Generator.No_GLM/"

    _save_fit_parameters(
        globals(),
        PATH,
        # NB Theta, then ZINB Theta, then ZINB Pi
        prefixes=("nb_theta", "zinb_theta", "zinb_pi"),
        datasets=(
            "GSE120795_normals",
            "GSE47462_tumours", "GSE47462_normals",
            "GSE146889_tumours", "GSE146889_normals",
            "GSE167977", "GSE181466",
            "GSE209998_ffpe", "GSE209998_fresh",
            "CMI",
            "DCIS", "DCISNorm", "DCISStrom",
        ),
    )



In [None]:
# Std plot
# One line per dataset: the stdev values stored in each *_stdev list, plotted against
# list position (labelled below as the number of transcripts removed).

# amalgamate
# Legend label and data are kept side by side so they cannot drift out of order.
stdev_series = [
    ('GSE120795', GSE120795_stdev),
    ('GSE146889 (Tumor)', GSE146889_tumour_stdev),
    ('GSE146889 (Normal)', GSE146889_normal_stdev),
    ('GSE167977', GSE167977_stdev),
    ('GSE181466', GSE181466_stdev),
    ('GSE209998 (FFPE)', GSE209998_ffpe_stdev),
    ('GSE209998 (FF)', GSE209998_ff_stdev),
    ('GSE47462 (Tumor)', GSE47462_tumour_stdev),
    ('GSE47462 (Normal)', GSE47462_normal_stdev),
    ('TMBC', TMBC_stdev),
    ('Sunnybrook (DCIS)', DCIS_tumour_stdev),
    ('Sunnybrook (Stromal)', DCIS_stroma_stdev),
    ('Sunnybrook (Normal)', DCIS_normal_stdev),
]
stdev_labels, stdev_values = zip(*stdev_series)

# zip() stops at the shortest list, so every row has a value for every dataset.
combined_data = list(zip(*stdev_values))

df = pd.DataFrame(combined_data, columns=list(stdev_labels))

# Create the axes first and hand them to pandas: DataFrame.plot() without ax= opens a
# figure of its own, which left the (6, 4) figure empty and the size unused.
fig, ax = plt.subplots(figsize=(6, 4))
df.plot(ax=ax, linewidth=2.5)  # Increase line width

# Add labels and title
ax.set_xlabel('Number of transcripts removed from ranked list')
ax.set_ylabel('standard deviation of average counts per transcript (log)')
ax.set_yscale('log')
# tick_params (rather than plt.yticks(fontsize=...)) so the size also applies to ticks
# that are regenerated when the y-limit changes below.
ax.tick_params(axis='y', labelsize=14)  # Adjust the font size as needed
ax.set_xticks(range(0, 10))  # assumes 10 removal steps (0-9) per dataset -- TODO confirm
ax.set_ylim(top=100000)

# Legend sits outside the axes, to the right.
ax.legend(fontsize='small', loc='upper left', bbox_to_anchor=(1, 1), handlelength=3)

outpath = "/path/to/6.0.1_Third_Party_Data.Best_AIC_and_Stat_Generator.No_GLM/"
filename = "suppl_fig.stdev_reduction_by_removal.pdf"
fig.savefig(outpath + filename, format='pdf', dpi=600, bbox_inches='tight')
# Show plot
plt.show()
