In [None]:
# This program computes the AIC of each gene's expression under several candidate distributions.
# It then creates line plots showing how the AIC "winners" (the distributions with the lowest AIC)
# are spread across genes segmented by total read counts.

In [None]:
# first lets read in scipy, as I'll need the "curve_fit" function in optimize
from __future__ import print_function

import numpy as np
from scipy.special import gammaln
from scipy.special import psi
from scipy.special import factorial
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from scipy.special import comb
import math
import sys
import os
import pandas as pd
import scipy.stats as stats
from scipy.optimize import minimize
import statsmodels.api as sm
from statsmodels.genmod.families import NegativeBinomial, Gamma
from statsmodels.discrete.count_model import ZeroInflatedNegativeBinomialP

from scipy.stats import expon, nbinom, norm, poisson

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this presumably also hides numerical warnings (divide-by-zero,
# optimizer convergence) from the model fits further down — consider narrowing it.
warnings.filterwarnings("ignore")


# now the DCIS count data is found in an RDA file, which we apparently read using 'pyreadr'
import pyreadr

# thread issues: cap the thread pools of OpenMP / MKL / NumExpr at 10 threads each
# NOTE(review): these variables are assigned after numpy/scipy were imported above;
# such settings are typically read when the library loads — confirm they still take effect here.
os.environ['OMP_NUM_THREADS'] = '10'  # cap OpenMP at 10 threads
os.environ['MKL_NUM_THREADS'] = '10'  # cap MKL at 10 threads (if used)
os.environ['NUMEXPR_NUM_THREADS'] = '10'  # cap NumExpr at 10 threads (if used)


# Lookup table used to convert Ensembl gene IDs to RefSeq gene names.
# pyreadr returns a dict of data frames; the table itself is stored under the key None.
gene_convert = pyreadr.read_r('/path/to/dcis/gene_info/ensemble_to_refseq_gene_name_table.rds')[None]
id_to_name = dict(zip(gene_convert["gene_id"], gene_convert["gene_name"]))


def qqplot_nb_vs_exp(data, path, gene_name):
    """Save a two-panel QQ plot (negative binomial vs. exponential) for one gene.

    The largest observation is dropped before fitting because it is often an
    outlier. NB parameters come from the method of moments and the exponential
    rate from the sample mean. The figure is written to
    ``<path>/<gene_name>.png`` and then closed; nothing is returned.

    Parameters
    ----------
    data : array-like
        Expression counts of a single gene across samples.
    path : str
        Directory in which the PNG is saved.
    gene_name : str
        Used in the panel titles and as the file name.

    Notes
    -----
    When the sample variance does not exceed the mean, the method-of-moments NB
    parameters are not valid and the NB theoretical quantiles come out as NaN,
    so the NB panel is left without points.
    """
    sorted_data = np.sort(data)
    sorted_data = sorted_data[:-1]  # remove top sample as it is often an outlier
    n_obs = len(sorted_data)
    if n_obs == 0:
        # Nothing left to plot once the top sample is removed.
        return

    # Method-of-moments estimates in scipy's nbinom(r, p) parameterisation.
    mean_gene_expression = np.mean(sorted_data)
    var_gene_expression = np.var(sorted_data)
    p = mean_gene_expression / var_gene_expression
    r = mean_gene_expression**2 / (var_gene_expression - mean_gene_expression)

    # Plotting positions (i - 0.5) / n, shared by both distributions.
    plotting_positions = (np.arange(1, n_obs + 1) - 0.5) / n_obs
    theoretical_quantiles_nb = stats.nbinom.ppf(plotting_positions, r, p)
    # The exponential with rate 1/mean has scale equal to the mean.
    theoretical_quantiles_exponential = stats.expon.ppf(plotting_positions, scale=mean_gene_expression)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(6, 3))
    panels = [
        (ax1, theoretical_quantiles_nb, 'NB QQ Plot', f'Negative Binomial QQ Plot: {gene_name}'),
        (ax2, theoretical_quantiles_exponential, 'Exponential QQ Plot', f'Exponential QQ Plot: {gene_name}'),
    ]
    for ax, theoretical, label, title in panels:
        ax.scatter(theoretical, sorted_data, label=label)

        # Reference line y = x over the range covered by both axes. A QQ plot is
        # judged against the identity line; joining (min, min) to (max, max)
        # would make every fit look good at the end points.
        finite_values = np.concatenate([
            theoretical[np.isfinite(theoretical)],
            sorted_data[np.isfinite(sorted_data)],
        ])
        if finite_values.size > 0:
            low, high = finite_values.min(), finite_values.max()
            ax.plot([low, high], [low, high], 'r--')

        ax.set_title(title, fontsize = 10)
        ax.set_xlabel('Theoretical Quantiles', fontsize = "small")
        ax.set_ylabel('Sample Quantiles', fontsize = "small")
        ax.legend()
        ax.grid(True)

    fig.tight_layout()

    image_name = f'{gene_name}.png'
    full_path = os.path.join(path, image_name)
    fig.savefig(full_path, dpi=100)
    # Close explicitly: this is called once per gene and figures would pile up.
    plt.close(fig)

In [None]:
### Parameters ###

# Outlier removal: when enabled, manual_aic drops `trim_percent` percent of the
# samples from each end of a gene's sorted counts before fitting.
trim_means_flag = True
trim_percent = 10  # note: 1% usually gets rid of most extreme outliers

# Genes must be expressed in at least this fraction of patients (between 0-1).
# Set to 0 for patient stats (all genes with at least 1 read);
# set to 0.2 for AIC stats of genes with >20% expression.
express_percent_limit = 0.2

# Library-size adjustment (fractional method), applied by library_adjust.
adjust_for_lib = False

# Whether to calculate the AIC distance; False saves time when running the full program.
calc_AIC_dist = False

# Compare NB against ZINB only [trim_percent should be low, maybe even zero].
NB_ZINB_only = False

# Trimming removes zeroes, so it is switched off for the NB/ZINB comparison.
if NB_ZINB_only:
    trim_percent = 0

# When True, manual_aic takes the zero-inflated models (ZIP, ZINB) out of the comparison.
no_ZI_AICs = False

In [None]:
# non-AIC related functions used in this program    

# computation of gene average, fraction of zeroes, and library size
def dataset_stats_generator(df, draw_zero_distribution = True):
    """Summarise a genes-by-samples count matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Count matrix; rows are treated as genes and columns as samples.
    draw_zero_distribution : bool
        When True, show three histograms: fraction of zeroes per gene,
        fraction of zeroes per sample, and per-gene mean expression.

    Returns
    -------
    tuple
        ``(avg_library_size, avg_zeroes, avg_mean_expression)``: the mean
        column total (rounded to 0 decimals), the mean per-sample fraction of
        zero entries (3 decimals) and the mean of the per-gene means (3 decimals).
    """
    n_genes, n_samples = df.shape[0], df.shape[1]
    zero_mask = (df == 0)

    library_sizes = df.sum(axis = 0)                       # one total per sample (column)
    zero_fraction_per_sample = zero_mask.sum(axis=0) / n_genes
    zero_fraction_per_gene = zero_mask.sum(axis=1) / n_samples
    gene_means = df.mean(axis=1)

    if draw_zero_distribution:
        histograms = (
            (zero_fraction_per_gene, 'Fraction of Zeroes (Genes)', "Fraction of Zeroes per Gene"),
            (zero_fraction_per_sample, 'Fraction of Zeroes (Samples)', "Fraction of Zeroes per Sample"),
            (gene_means, 'Means of Gene Expression', "Distribution of Means of Genes in Dataset"),
        )
        for values, x_label, title in histograms:
            plt.hist(values, bins=100, color='blue', alpha=0.7)
            plt.xlabel(x_label)
            plt.ylabel('Count')
            plt.title(title)
            plt.show()

    # Averages across samples / genes, rounded for reporting.
    avg_library_size = np.round(np.sum(library_sizes) / n_samples, decimals = 0)
    avg_zeroes = np.round(np.sum(zero_fraction_per_sample) / n_samples, decimals = 3)
    avg_mean_expression = np.round(np.mean(gene_means), decimals = 3)

    return avg_library_size, avg_zeroes, avg_mean_expression

# Simulating some data for illustration
#data = np.random.negative_binomial(10, 0.5, 1000)

def fit_to_nb_plot(data, plotrange = 30):
    """Plot a histogram of `data` with a method-of-moments negative binomial fit.

    Parameters
    ----------
    data : array-like
        Observed counts.
    plotrange : int
        Counts 0 .. plotrange-1 are used as histogram bin edges.

    Notes
    -----
    The method-of-moments estimates are only valid when the sample variance
    exceeds the sample mean (overdispersion).
    """
    # Estimating parameters directly from data, in scipy's nbinom(n, p)
    # parameterisation where p is the success probability:
    #   mean = n (1 - p) / p,   var = n (1 - p) / p**2
    #   =>  p = mean / var,     n = mean * p / (1 - p)
    mean = np.mean(data)
    var = np.var(data)
    p = mean / var
    n = mean * p / (1 - p)

    # Bin edges are defined here so the fitted PMF is evaluated on the same grid.
    bins = np.arange(plotrange)

    # Plotting
    plt.hist(data, bins=bins, align='left', density=True, alpha=0.6, color='g')
    plt.plot(bins[:-1], nbinom.pmf(bins[:-1], n, p), 'ro-', lw=2)
    plt.title("Negative Binomial Fit")
    plt.show()

# adjust for library sizes
def library_adjust(data):
    """Optionally rescale counts by library size (fractional method).

    When the module-level flag `adjust_for_lib` is set, every column is divided
    by its column total, multiplied by 10,000,000 and rounded. Otherwise `data`
    is returned unchanged.
    """
    if not adjust_for_lib:
        return data

    column_totals = data.sum(axis=0)
    scaled = (data /column_totals)*10000000
    return np.round(scaled)

In [None]:
# functions to compute ZINB

def zinb_loglike(params, counts):
    """Negative log-likelihood of a zero-inflated negative binomial.

    Parameters
    ----------
    params : sequence of 3 floats
        ``(mu, theta, pi)``: NB mean, NB dispersion and zero-inflation probability.
    counts : numpy.ndarray
        Observed counts.

    Returns
    -------
    float
        Minus the summed log-likelihood (suitable for a minimiser).
    """
    mu, theta, pi = params

    # Convert (mu, theta) to scipy's nbinom(n, p) parameterisation.
    success_prob = 1 / (1 + mu/theta)
    size = mu * success_prob / (1 - success_prob)

    nb_log_pmf = nbinom.logpmf(counts, size, success_prob)
    # A zero arises either from the inflation component or from the NB itself.
    zero_log_prob = np.log(pi + (1 - pi) * np.exp(nbinom.logpmf(0, size, success_prob)))
    nonzero_log_prob = np.log(1 - pi) + nb_log_pmf

    per_observation = np.where(counts == 0, zero_log_prob, nonzero_log_prob)
    return -np.sum(per_observation)

def calculate_aic(loglik, k):
    """Akaike information criterion for log-likelihood `loglik` and `k` parameters."""
    parameter_penalty = 2*k
    return parameter_penalty - 2*loglik

def fit_zinb_and_calculate_aic(counts):
    """Fit a zero-inflated negative binomial by maximum likelihood and return its AIC.

    Parameters
    ----------
    counts : array-like
        Observed counts for one gene.

    Returns
    -------
    tuple
        ``(mu, theta, pi, aic)``: fitted NB mean, NB dispersion, zero-inflation
        probability and the AIC (3 parameters).
    """
    # Keep the optimiser strictly inside the region where the likelihood is
    # defined: zinb_loglike divides by theta, needs mu > 0 for a valid NB and
    # takes log(1 - pi), so theta = 0, mu = 0 and pi = 1 must be unreachable.
    eps = 1e-8
    bounds = [(eps, None), (eps, None), (0, 1 - eps)]

    # Start from the sample moments, nudged inside the bounds if necessary
    # (e.g. constant counts have zero variance).
    initial_params = np.array([
        max(np.mean(counts), eps),
        max(np.var(counts), eps),
        0.5,
    ])

    # `args` must be a tuple; `(counts)` without the comma is just `counts`.
    result = minimize(zinb_loglike, initial_params, args=(counts,), bounds=bounds)
    mu, theta, pi = result.x
    loglik = -result.fun  # the objective is the negative log-likelihood

    k = 3  # Number of parameters
    aic = calculate_aic(loglik, k)
    return mu, theta, pi, aic

In [None]:
# functions to compute ZIP
def zip_loglike(params, counts):
    """Negative log-likelihood of a zero-inflated Poisson.

    Parameters
    ----------
    params : sequence of 2 floats
        ``(mu, pi)``: Poisson mean and zero-inflation probability.
    counts : numpy.ndarray
        Observed counts.

    Returns
    -------
    float
        Minus the summed log-likelihood (suitable for a minimiser).
    """
    mu, pi = params

    poisson_log_pmf = poisson.logpmf(counts, mu)
    # A zero arises either from the inflation component or from the Poisson itself.
    zero_log_prob = np.log(pi + (1 - pi) * np.exp(poisson.logpmf(0, mu)))
    nonzero_log_prob = np.log(1 - pi) + poisson_log_pmf

    per_observation = np.where(counts == 0, zero_log_prob, nonzero_log_prob)
    return -np.sum(per_observation)

def calculate_aic(loglik, k):
    """AIC = 2k - 2*loglik.

    NOTE(review): this redefines the identical `calculate_aic` from the ZINB
    cell above; one of the two definitions is redundant.
    """
    aic = 2*k - 2*loglik
    return aic

def fit_zip_and_calculate_aic(counts):
    """Fit a zero-inflated Poisson by maximum likelihood and return its AIC.

    Parameters
    ----------
    counts : array-like
        Observed counts for one gene.

    Returns
    -------
    tuple
        ``(mu, pi, aic)``: fitted Poisson mean, zero-inflation probability and
        the AIC (2 parameters).
    """
    # Keep the optimiser away from mu = 0 and pi = 1, where zip_loglike
    # evaluates log(0) for any non-zero count.
    eps = 1e-8
    bounds = [(eps, None), (0, 1 - eps)]

    initial_params = np.array([max(np.mean(counts), eps), 0.5])

    # `args` must be a tuple; `(counts)` without the comma is just `counts`.
    result = minimize(zip_loglike, initial_params, args=(counts,), bounds=bounds)
    mu, pi = result.x
    loglik = -result.fun  # the objective is the negative log-likelihood

    k = 2  # Number of parameters for ZIP model
    aic = calculate_aic(loglik, k)
    return mu, pi, aic

In [None]:
def compare_distributions(aic_values):
    """Pick the distribution with the smallest AIC.

    Parameters
    ----------
    aic_values : dict
        Must contain the keys "NB", "Gaussian", "Poisson", "Exponential",
        "Zero-Inflated Poisson" and "Zero-Inflated Negative Binomial".

    Returns
    -------
    tuple
        ``(label, aic)`` where label is one of "NB", "Gaussian", "Poisson",
        "Exponential", "ZIP", "ZINB". On ties the earliest label in that
        order wins.
    """
    # (short label, key in aic_values), in tie-breaking order.
    label_and_key = (
        ("NB", "NB"),
        ("Gaussian", "Gaussian"),
        ("Poisson", "Poisson"),
        ("Exponential", "Exponential"),
        ("ZIP", "Zero-Inflated Poisson"),
        ("ZINB", "Zero-Inflated Negative Binomial"),
    )
    candidates = [(label, aic_values[key]) for label, key in label_and_key]

    min_aic_type, min_aic_value = candidates[0]
    for label, score in candidates[1:]:
        if score < min_aic_value:
            min_aic_type, min_aic_value = label, score

    return min_aic_type, min_aic_value



# this is the function that computes AIC for all distributions (Gaussian, Exponential, Negative Binomial, ZIP, ZINB)
# and reports which distribution is lowest
# row - a vector of expressions
def manual_aic(row, path):
    """Return the name of the distribution with the lowest AIC for one gene.

    Candidate distributions: Exponential, Negative Binomial, Zero-Inflated
    Negative Binomial, Gaussian, Poisson and Zero-Inflated Poisson. A QQ plot
    (NB vs. exponential) is also written to `path` for every gene fitted.

    Behaviour depends on the module-level settings `trim_means_flag`,
    `trim_percent`, `NB_ZINB_only` and `no_ZI_AICs`.

    Parameters
    ----------
    row : pandas.Series
        Expression values of one gene across samples; `row.name` is used as
        the gene name for the QQ plot file.
    path : str
        Directory in which the QQ plot is saved.

    Returns
    -------
    str
        "ZEROES" when the (trimmed) counts sum to zero, otherwise one of
        "NB", "Gaussian", "Poisson", "Exponential", "ZIP", "ZINB".
    """
    gene_name = row.name

    row = np.round(row)  # it must be count data

    # Trim the same number of samples from both ends of the sorted counts to
    # remove outliers.
    if trim_means_flag:
        n = len(row)
        elements_to_trim = int(np.floor(trim_percent / 100.0 * n))
        sorted_data = np.sort(row)
        if elements_to_trim > 0:
            row = sorted_data[elements_to_trim:-elements_to_trim]
        else:
            row = sorted_data

    if sum(row) <= 0:
        return "ZEROES"

    # Intercept-only design matrix for the statsmodels fits.
    X = sm.add_constant(np.ones(len(row)))

    # Exponential: closed-form MLE (rate = 1 / mean), 1 parameter.
    lambda_exp = 1 / np.mean(row)
    log_likelihood_exp = np.sum(expon.logpdf(row, scale=1/lambda_exp))
    aic_exp = 2*1 - 2*log_likelihood_exp

    # Negative binomial via statsmodels maximum likelihood.
    aic_nb = sm.NegativeBinomial(row, X).fit(start_params=[1,1], disp=0).aic

    # Zero-inflated negative binomial (custom likelihood + scipy minimiser).
    _, _, _, aic_zinb = fit_zinb_and_calculate_aic(row)

    # Gaussian via GLM (IRLS).
    aic_gauss = sm.GLM(row, X, family=sm.families.Gaussian()).fit(disp=0).aic

    # Poisson via statsmodels maximum likelihood.
    aic_pois = sm.Poisson(row, X).fit(disp=0).aic

    # Zero-inflated Poisson (custom likelihood + scipy minimiser).
    _, _, aic_zip = fit_zip_and_calculate_aic(row)

    aic_scores = {
        "NB": aic_nb,
        "Gaussian": aic_gauss,
        "Poisson": aic_pois,
        "Exponential": aic_exp,
        "Zero-Inflated Poisson": aic_zip,
        "Zero-Inflated Negative Binomial": aic_zinb,
    }

    # A fit can produce NaN or inf (e.g. ZIP/ZINB without zeroes); give such
    # fits a large finite score so they lose against any valid fit.
    invalid_fit_aic = 100000000
    for name, score in aic_scores.items():
        if not np.isfinite(score):
            aic_scores[name] = invalid_fit_aic

    # In certain analyses only some distributions are of interest. Excluded
    # ones get +inf, which is strictly worse than `invalid_fit_aic`, so an
    # excluded distribution can never win even if every allowed fit failed.
    excluded = []
    if NB_ZINB_only:
        excluded += ["Zero-Inflated Poisson", "Poisson", "Gaussian", "Exponential"]
    if no_ZI_AICs:
        excluded += ["Zero-Inflated Poisson", "Zero-Inflated Negative Binomial"]
    for name in excluded:
        aic_scores[name] = np.inf

    qqplot_nb_vs_exp(row, path, gene_name)

    best_distribution, best_aic = compare_distributions(aic_scores)

    return best_distribution


In [None]:
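# NEW AIC CALCULATION PROGRAM -- currently disabled: everything below is wrapped in a
# triple-quoted string, so this alternative manual_aic is never defined or run.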
"""
import numpy as np
from scipy.stats import expon, nbinom, norm, poisson

# lets turn this into a function
def manual_aic(row, path):
    row = np.round(row) # make it count data
    
    # what if we adjust the row with elimination of outliers
    # IQR is distribution agnostic which is why I chose it
    Q1 = np.percentile(row, 25)
    Q3 = np.percentile(row, 75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 2 * IQR
    upper_bound = Q3 + 2 * IQR
   
    # row = row[(row >= lower_bound) & (row <= upper_bound)]
    #row = row[row <= upper_bound]
    
    
    if (sum(row) <= 0):
        return "ZEROES"
    
    # Exponential parameters
    lambda_exp = 1 / np.mean(row)
    log_likelihood_exp = np.sum(expon.logpdf(row, scale=1/lambda_exp))
    aic_exp = 2*1 - 2*log_likelihood_exp  # 1 parameter for exponential

    # NB parameters
    mu_sample = np.mean(row)
    var_sample = np.var(row)
    if (mu_sample == var_sample):
        var_sample = var_sample + 0.0000000000001
    r_estimated = mu_sample**2 / (var_sample - mu_sample)
    
    if (mu_sample + r_estimated) == 0:
        r_estimated = r_estimated + 0.0000000000001
    p_estimated = r_estimated / (mu_sample + r_estimated)
    log_likelihood_nb = np.sum(nbinom.logpmf(row, r_estimated, p_estimated))
    aic_nb = 2*2 - 2*log_likelihood_nb  # 2 parameters for NB

    # ZINB parameters
    pi_zinb = np.mean(row == 0)
    log_likelihood_zeros = np.sum(np.log(pi_zinb) * (row == 0))
    log_likelihood_non_zeros = np.sum(np.log(1 - pi_zinb) + nbinom.logpmf(row[row != 0], r_estimated, p_estimated))
    log_likelihood_zinb = log_likelihood_zeros + log_likelihood_non_zeros
    aic_zinb = 2*3 - 2*log_likelihood_zinb  # 3 parameters for ZINB: pi, r, and p

    # Gaussian parameters
    mu_gauss = np.mean(row)
    sigma_gauss = np.std(row)
    log_likelihood_gauss = np.sum(norm.logpdf(row, mu_gauss, sigma_gauss))
    aic_gauss = 2*2 - 2*log_likelihood_gauss  # 2 parameters for Gaussian: mu and sigma

    # Poisson parameters
    lambda_pois = np.mean(row)
    log_likelihood_pois = np.sum(poisson.logpmf(row, lambda_pois))
    aic_pois = 2*1 - 2*log_likelihood_pois  # 1 parameter for Poisson: lambda

    # ZIP parameters
    pi_zip = np.mean(row == 0)
    lambda_zip = np.mean(row[row != 0])
    log_likelihood_zeros_zip = np.sum(np.log(pi_zip) * (row == 0))
    log_likelihood_non_zeros_zip = np.sum(np.log(1 - pi_zip) + poisson.logpmf(row[row != 0], lambda_zip))
    log_likelihood_zip = log_likelihood_zeros_zip + log_likelihood_non_zeros_zip
    aic_zip = 2*2 - 2*log_likelihood_zip  # 2 parameters for ZIP: pi and lambda

    # sometimes ZIP and ZINB can be NaN if there are no zeroes
    # NB can also become NaN if Mean > Variance (I think)
    # just in case, lets add this check for all of them
    if np.isnan(aic_zip):
        aic_zip = 1000000000000
    if np.isnan(aic_zinb):
        aic_zinb = 1000000000000
    if np.isnan(aic_nb):
        aic_nb = 1000000000000
    if np.isnan(aic_pois):
        aic_pois = 1000000000000
    if np.isnan(aic_gauss):
        aic_gauss = 1000000000000
    if np.isnan(aic_exp):
        aic_exp = 1000000000000
        
   
    # Compare AICs and determine best fit
    if (aic_nb < aic_pois ) & (aic_nb < aic_gauss) & (aic_nb < aic_exp) & (aic_nb < aic_zip) & (aic_nb < aic_zinb): 
        #qqplot_nb_vs_exp(row, path)  
        return "NB"
    elif (aic_gauss < aic_nb) & (aic_gauss < aic_pois) & (aic_gauss < aic_exp) & (aic_gauss < aic_zip) & (aic_gauss < aic_zinb):
        return "Gaussian"
    elif (aic_exp < aic_nb) & (aic_exp < aic_pois) & (aic_exp < aic_gauss) & (aic_exp < aic_zip) & (aic_exp < aic_zinb):
        #qqplot_nb_vs_exp(row, path)  
        return "Exponential"
    elif (aic_zinb < aic_nb) & (aic_zinb < aic_pois) & (aic_zinb < aic_gauss) & (aic_zinb < aic_zip) & (aic_zinb < aic_exp):
        return "ZINB"
    elif (aic_zip < aic_nb) & (aic_zip < aic_pois) & (aic_zip < aic_gauss) & (aic_zip < aic_zinb) & (aic_zip < aic_exp):
        return "ZIP"
    else:
        return "Poisson"
"""

In [None]:
def segmental_aic_find(counts, path=""):
    """Count AIC winners per distribution within expression quartiles of genes.

    Genes (rows) are ranked by total read count, genes with a zero total are
    dropped, and the remainder is split into four consecutive segments
    (top 25%, 25-50%, 50-75%, 75-100%). `manual_aic` is run on every gene of
    each segment and the winners are tallied.

    The input frame is not modified.

    Parameters
    ----------
    counts : pandas.DataFrame
        Genes-by-samples count matrix.
    path : str
        Directory passed on to `manual_aic` for the per-gene QQ plots.

    Returns
    -------
    tuple
        ``(nb, zinb, zip, poisson, gaussian, exponential, sample_total)``;
        each element is a list of four values, one per segment. The first six
        hold the number of genes won by that distribution, `sample_total` the
        number of genes in the segment. Genes for which `manual_aic` returns
        "ZEROES" are included in `sample_total` but in none of the tallies.
    """
    # Rank once, on a copy: `assign` leaves the caller's frame untouched, and
    # doing the sort a single time keeps tied genes in the same segment for
    # the whole run.
    ranked = counts.assign(RowSum=counts.sum(axis=1)).sort_values(by='RowSum', ascending=False)
    ranked = ranked[ranked['RowSum'] != 0]   # ditch any row with no reads
    ranked = ranked.drop(columns=['RowSum'])

    total_rows = len(ranked)
    segment_edges = [
        0,
        int(total_rows * 0.25),
        int(total_rows * 0.5),
        int(total_rows * 0.75),
        total_rows,
    ]

    labels = ("NB", "ZINB", "ZIP", "Poisson", "Gaussian", "Exponential")
    tallies = {label: [] for label in labels}
    sample_total = []

    for start, stop in zip(segment_edges[:-1], segment_edges[1:]):
        segment = ranked.iloc[start:stop]
        sample_total.append(segment.shape[0])

        if segment.shape[0] == 0:
            # Fewer than four genes overall: nothing to fit in this segment.
            for label in labels:
                tallies[label].append(0)
            continue

        winners = segment.apply(lambda row: manual_aic(row, path), axis=1)
        winner_counts = winners.value_counts()

        # Distributions that never won in this segment get an explicit zero.
        for label in labels:
            tallies[label].append(int(winner_counts.get(label, 0)))

    return (tallies["NB"], tallies["ZINB"], tallies["ZIP"], tallies["Poisson"],
            tallies["Gaussian"], tallies["Exponential"], sample_total)
    

In [None]:
# don't forget we want to take the average value

import matplotlib.pyplot as plt

# Create a vector of x-values (input values)
def gradual_aic_plot(data, title_add=""):
    """Line plot of the fraction of genes won by each distribution per quartile.

    Parameters
    ----------
    data : sequence
        Output of `segmental_aic_find`, in the order: NB, ZINB, ZIP, Poisson,
        Gaussian, Exponential winner counts, then the per-segment gene totals.
    title_add : str
        Plot title; also embedded in the name of the saved PDF.

    The figure is saved as a PDF (the file name also encodes the module-level
    `express_percent_limit` and `trim_percent`) and then shown.
    """
    x_values = ["Top 25%", "25-50%", "50-75%", "75-100%"]
    sample_total = data[6]

    def _as_fraction(winner_counts):
        # Share of each segment's genes that this distribution won.
        return [won / total for won, total in zip(winner_counts, sample_total)]

    # (legend label, per-segment fractions), listed in plotting order.
    curves = [
        ('NB', _as_fraction(data[0])),
        ('ZINB', _as_fraction(data[1])),
        ('Exponential', _as_fraction(data[5])),
        ('Poisson', _as_fraction(data[3])),
        ('ZIP', _as_fraction(data[2])),
        ('Gaussian', _as_fraction(data[4])),
    ]

    fig = plt.figure(figsize=(2.8, 2.2))
    for legend_label, fractions in curves:
        plt.plot(x_values, fractions, marker='o', linestyle='-', label=legend_label)

    title_text = f"{title_add}"
    plt.title(title_text, fontsize=10)

    # saving plot (dpi is dots per inch, for resolution)
    output_file = ('/path/to/6.2_Third_Party_Data.Best_AIC_vs_Expression_Level_Plot_Generator/By_Expression_AIC_Plot.' +
                   str(title_add) + ".ZeroFract_" + str(express_percent_limit) + ".Trim_" + str(trim_percent) + ".pdf")
    plt.savefig(output_file, dpi=300, bbox_inches='tight')

    plt.show()

In [None]:
# this is a dataset with 528 FFPE breast cancer samples, sequenced from a HiSeq

data = pd.read_csv('/path/to/GSE167977_third_party_ffpe/GSE167977_Raw_Counts.txt',
                  delimiter='\t')

# Index by Ensembl gene id, translate the ids to gene names with the lookup
# table built at the top of the notebook, and drop the last 5 columns.
GSE167977_tumours_counts = pd.DataFrame(data)
GSE167977_tumours_counts = GSE167977_tumours_counts.set_index('ensembl_gene_id')
GSE167977_tumours_counts = GSE167977_tumours_counts.rename(index=id_to_name)
GSE167977_tumours_counts = GSE167977_tumours_counts.drop(GSE167977_tumours_counts.columns[-5:], axis=1) # last 5 columns

# adjust for library size (fraction method)
# should come before the gene filter
tumours_counts_lib_adjust = library_adjust(GSE167977_tumours_counts)

fraction_of_zeroes = (tumours_counts_lib_adjust == 0).mean(axis=1)
filtered_df = tumours_counts_lib_adjust[fraction_of_zeroes < (1 - express_percent_limit)] # must be expressed to this percentage of patients


print("GSE167977 - Lowest AIC across all genes")

# Fit on the library-adjusted, expression-filtered matrix, as the other dataset
# cells do; the saved plot names advertise `express_percent_limit`, so the raw
# unfiltered counts must not be used here.
GSE167977_tumours_data = segmental_aic_find(filtered_df, "/path/to/6.2_Third_Party_Data.Best_AIC_vs_Expression_Level_Plot_Generator/qq_plots/GSE167977/")


In [None]:
# Line plot of the per-quartile winner fractions for GSE167977 (also saved as a PDF).
gradual_aic_plot(GSE167977_tumours_data, "GSE167977")

In [None]:
def top50_bar_plot(data, title_add=""):
    """Bar plot of AIC winners among the top 50% most expressed genes.

    For every distribution the winner counts of the two highest expression
    quartiles ("Top 25%" and "25-50%") are added together, so all six bars
    cover the same set of genes.

    Parameters
    ----------
    data : sequence
        Output of `segmental_aic_find`, in the order: NB, ZINB, ZIP, Poisson,
        Gaussian, Exponential winner counts, then the per-segment gene totals.
    title_add : str
        Plot title; also embedded in the name of the saved PDF.

    The six sums are printed, and the figure is saved as a PDF (the file name
    also encodes the module-level `express_percent_limit` and `trim_percent`)
    and then shown.
    """
    def _top_half(winner_counts):
        # Top quartile plus second quartile.
        return winner_counts[0] + winner_counts[1]

    sum_nb = _top_half(data[0])
    sum_zinb = _top_half(data[1])
    sum_zip = _top_half(data[2])
    sum_poi = _top_half(data[3])
    sum_gau = _top_half(data[4])
    sum_exp = _top_half(data[5])

    print(sum_nb, sum_zinb, sum_exp, sum_poi, sum_zip, sum_gau)
    pos = 6
    bar_width = 0.1

    # Plotting the bars
    fig, ax = plt.subplots()

    # One bar per distribution, placed side by side starting at `pos`.
    bars = [
        ('NB', sum_nb),
        ('ZINB', sum_zinb),
        ('Exponential', sum_exp),
        ('Poisson', sum_poi),
        ('ZIP', sum_zip),
        ('Gaussian', sum_gau),
    ]
    for slot, (legend_label, height) in enumerate(bars):
        plt.bar(pos + bar_width*slot, height, bar_width, label=legend_label)

    # Adding and formatting title and labels
    plt.xlabel('Distribution Types')
    plt.ylabel('Frequency')
    plt.title(title_add)

    plt.legend()

    plt.savefig('/path/to/6.2_Third_Party_Data.Best_AIC_vs_Expression_Level_Plot_Generator/By_Expression_AIC_BarPlot.' +
        str(title_add) + ".ZeroFract_" + str(express_percent_limit) + ".Trim_" + str(trim_percent) + ".pdf",
        dpi=300, bbox_inches='tight')  # dpi is dots per inch, for resolution


    plt.show()


# Bar plot of the AIC winners for GSE167977 (also saved as a PDF).
top50_bar_plot(GSE167977_tumours_data, "GSE167977")



In [None]:
# GSE181466: load the tab-separated RSEM gene matrix.
data = pd.read_csv('/path/to/GSE181466_third_party_ffpe/GSE181466_rsem_genes_matrix-97.txt',
                  delimiter='\t')

# patient information splitting is unnecessary, this appears to all be both FFPE and from tumours
# there is subtype and age information in the series matrix file, if we're interested

# dispersion of tumours - All Data
tumours_counts = pd.DataFrame(data)
# removing gene column at position 0
# NOTE(review): the rows keep the default index from read_csv, so manual_aic's
# row.name (used as the QQ plot file name) is a row number, not a gene name — confirm intended.
tumours_counts = tumours_counts.drop(tumours_counts.columns[0], axis=1)
# skip genes that are all zeroes, or just one spurrious read somewhere

# adjust for library size (fraction method)
tumours_counts_libadjust = library_adjust(tumours_counts)

# Keep only genes whose fraction of zero-count samples is below 1 - express_percent_limit.
fraction_of_zeroes = (tumours_counts_libadjust == 0).mean(axis=1)
filtered_df = tumours_counts_libadjust[fraction_of_zeroes < (1 - express_percent_limit)] # must be expressed to this percentage of patients

# Tally the AIC winners per expression quartile, then draw and save the line plot.
print("GSE181466")
GSE181466_data = segmental_aic_find(filtered_df, "/path/to/6.2_Third_Party_Data.Best_AIC_vs_Expression_Level_Plot_Generator/qq_plots/GSE181466/")
gradual_aic_plot(GSE181466_data, "GSE181466") 


In [None]:
## here, we will repeat our plots but for a different data set
# GSE146889: pyreadr returns a dict of data frames; the count table is stored under the key None.
all_counts = pyreadr.read_r('/path/to/GSE146889_third_party_ffpe/GSE146889_GeneCount.rds')
df = all_counts[None] 

# we need to split the tumors and normals by name
# (columns whose label contains 'tumor' / 'normal')
GSE146889_count_TUMOR = df.filter(like='tumor')
GSE146889_count_NORMAL = df.filter(like='normal')

# Tumours: adjust for library size (fraction method), then keep only genes whose
# fraction of zero-count samples is below 1 - express_percent_limit.
count_TUMOR_libadjust = library_adjust(GSE146889_count_TUMOR)
fraction_of_zeroes = (count_TUMOR_libadjust == 0).mean(axis=1)
filtered_tumour = count_TUMOR_libadjust[fraction_of_zeroes < (1 - express_percent_limit)] # must be expressed to this percentage of patients

# Normals: same adjustment and filter.
# adjust for library size (fraction method)
count_NORMAL_libadjust = library_adjust(GSE146889_count_NORMAL)
fraction_of_zeroes = (count_NORMAL_libadjust == 0).mean(axis=1)
filtered_normal = count_NORMAL_libadjust[fraction_of_zeroes < (1 - express_percent_limit)] # must be expressed to this percentage of patients


# Tally the AIC winners per expression quartile and plot, separately for tumours and normals.
print("GSE146889 - Tumours")
GSE146889_tum_data = segmental_aic_find(filtered_tumour, "/path/to/6.2_Third_Party_Data.Best_AIC_vs_Expression_Level_Plot_Generator/qq_plots/GSE146889_tumour/")
gradual_aic_plot(GSE146889_tum_data, "GSE146889_Tumours") 

GSE146889_norm_data = segmental_aic_find(filtered_normal, "/path/to/6.2_Third_Party_Data.Best_AIC_vs_Expression_Level_Plot_Generator/qq_plots/GSE146889_normal/")
gradual_aic_plot(GSE146889_norm_data, "GSE146889_Normals") 

In [None]:
all_counts = pyreadr.read_r('/path/to/GSE209998_third_party_ffpe/GSE209998_GeneCount.rds')
sample_information = pyreadr.read_r('/path/to/GSE209998_third_party_ffpe/GSE209998_Sample_Data.rds')

# now we want to isolate just the expression from a particular type of tissue
df_counts = all_counts[None] # load all_counts into a pandas data frame
df_sample = sample_information[None] # load sample_information into a pandas data frame

# here, we need to match if a sample is normal or tumour by !Sample_source_name_ch1 row

# so I need to: 1) match columns between sample_information and all_counts
# are they in the same order
columns_df1 = df_counts.columns
columns_df2 = df_sample.columns

# Now we find what samples were tumours and what were normal
samples_row = df_sample.loc["!Sample_source_name_ch1"]

# One count frame per distinct value of the "!Sample_source_name_ch1" row.
split_dfs = {}
for sample_type in samples_row.unique():
    matching_columns = [col for col in df_counts.columns if col in df_sample.columns and samples_row[col] == sample_type]
    split_dfs[sample_type] = df_counts[matching_columns]

sample_source = df_sample.loc["!Sample_source"]

# One count frame per distinct value of the "!Sample_source" row (preservation method).
split_source = {}
for sample_type in sample_source.unique():
    matching_columns = [col for col in df_counts.columns if col in df_sample.columns and sample_source[col] == sample_type]
    split_source[sample_type] = df_counts[matching_columns]


GSE209998_count_FRESH = split_source["Fresh frozen"]
GSE209998_count_FFPE = split_source["FFPE"]

# Each preservation type is adjusted and filtered from its OWN counts, so the
# "FFPE" results really come from FFPE samples and the "fresh frozen" results
# from fresh-frozen samples.
count_FFPE_libadjust = library_adjust(GSE209998_count_FFPE)
fraction_of_zeroes = (count_FFPE_libadjust == 0).mean(axis=1)
filtered_ffpe = np.round(count_FFPE_libadjust[fraction_of_zeroes < (1 - express_percent_limit)]) # must be expressed to this percentage of patients

count_FRESH_libadjust = library_adjust(GSE209998_count_FRESH)
fraction_of_zeroes = (count_FRESH_libadjust == 0).mean(axis=1)
filtered_fresh = np.round(count_FRESH_libadjust[fraction_of_zeroes < (1 - express_percent_limit)]) # must be expressed to this percentage of patients


print("GSE209998")

GSE209998_ffpe_data = segmental_aic_find(filtered_ffpe, "/path/to/6.2_Third_Party_Data.Best_AIC_vs_Expression_Level_Plot_Generator/qq_plots/GSE209998_FFPE/")
gradual_aic_plot(GSE209998_ffpe_data, "GSE209998_FFPE")

GSE209998_fresh_data = segmental_aic_find(filtered_fresh, "/path/to/6.2_Third_Party_Data.Best_AIC_vs_Expression_Level_Plot_Generator/qq_plots/GSE209998_FF/")
gradual_aic_plot(GSE209998_fresh_data, "GSE209998_FreshFrozen")



In [None]:
# GSE47462: tab-separated raw RefSeq gene counts
data = pd.read_csv('/path/to/GSE47462_third_party_ffpe/GSE47462_Raw_counts_Refseq_genes.txt', sep='\t')

# The sample type is encoded in the column name, so select columns by substring
GSE47462_normal_data = data.filter(like='_normal')
EN_data = data.filter(like='_EN')
DCIS_data = data.filter(like='_DCIS')
IDC_data = data.filter(like='_IDC')

# There is not much data per tumour type, so pool every non-normal column into one table.
# The first remaining column is then dropped
# (presumably a gene identifier column rather than a sample -- TODO confirm).
GSE47462_tumours_data = data.loc[:, ~data.columns.str.contains('_normal')].iloc[:, 1:]


# Tumours: library adjustment, then keep rows whose fraction of zero counts is low enough
tumours_data_libadjust = library_adjust(GSE47462_tumours_data)
fraction_of_zeroes = (tumours_data_libadjust == 0).mean(axis=1)
filtered_tumour = tumours_data_libadjust[fraction_of_zeroes < 1 - express_percent_limit] # must be expressed to this percentage of patients

# Normals: same two steps
normal_data_libadjust = library_adjust(GSE47462_normal_data)
fraction_of_zeroes = (normal_data_libadjust == 0).mean(axis=1)
filtered_normal = normal_data_libadjust[fraction_of_zeroes < 1 - express_percent_limit] # must be expressed to this percentage of patients


print("GSE47462")
GSE47462_tum_data = segmental_aic_find(
    filtered_tumour,
    "/path/to/6.2_Third_Party_Data.Best_AIC_vs_Expression_Level_Plot_Generator/qq_plots/GSE47462_tumour/",
)
gradual_aic_plot(GSE47462_tum_data, "GSE47462_Tumours")

GSE47462_norm_data = segmental_aic_find(
    filtered_normal,
    "/path/to/6.2_Third_Party_Data.Best_AIC_vs_Expression_Level_Plot_Generator/qq_plots/GSE47462_normal/",
)
gradual_aic_plot(GSE47462_norm_data, "GSE47462_Normals")


In [None]:
# GSE120795: tab-separated raw counts
data = pd.read_csv('/path/to/GSE120795_third_party_ffpe/GSE120795_total_norms_raw_counts.tsv', sep='\t')

# Per-patient annotations; the first row is compared against "healthy"
# (the series matrix records this as "disease: healthy")
patient_info = pd.read_csv('/path/to/GSE120795_third_party_ffpe/GSE120795_cell_info.txt', sep='\t')

mask = patient_info.iloc[0] == "healthy"

# Names of the annotation columns flagged healthy
filtered_data = patient_info.loc[:, mask]
patient_names = filtered_data.columns

# Count columns carry a ".fastq.gz" suffix on top of the patient name.
# NOTE(review): the first healthy name is discarded here; it is not visible from this cell
# why -- confirm it is not a real sample.
column_names_with_extension = [f"{name}.fastq.gz" for name in patient_names][1:]

# Keep only the count columns of the selected patients
GSE120795_filtered_data = data[column_names_with_extension]
GSE120795_ffpe_counts = pd.DataFrame(GSE120795_filtered_data)

# Library adjustment, then keep rows whose fraction of zero counts is low enough
ffpe_counts_libadjust = library_adjust(GSE120795_ffpe_counts)
fraction_of_zeroes = (ffpe_counts_libadjust == 0).mean(axis=1)
filtered_data = ffpe_counts_libadjust[fraction_of_zeroes < 1 - express_percent_limit] # must be expressed to this percentage of patients


print("GSE120795")
# NOTE(review): the variable is called "tum" but the samples selected above are the healthy ones
GSE120795_tum_data = segmental_aic_find(
    filtered_data,
    "/path/to/6.2_Third_Party_Data.Best_AIC_vs_Expression_Level_Plot_Generator/qq_plots/GSE120795/",
)
gradual_aic_plot(GSE120795_tum_data, "GSE120795")


In [None]:
# GDC Count-Me-In (metastatic breast cancer) compiled counts
# NOTE(review): the file is named .tsv but is parsed with a single-space separator -- confirm
data = pd.read_csv('/path/to/CountMeIn_BConly_third_party_ffpe/MBC_CMI_Compiled_Counts.tsv', sep=' ')

# The first three columns are not sample counts and are ignored
TMBC_tumours_counts = pd.DataFrame(data)
TMBC_tumours_counts = TMBC_tumours_counts.drop(columns=TMBC_tumours_counts.columns[:3])

# Library adjustment, then keep rows whose fraction of zero counts is low enough
tumours_counts_libadjust = library_adjust(TMBC_tumours_counts)
fraction_of_zeroes = (tumours_counts_libadjust == 0).mean(axis=1)
filtered_df = tumours_counts_libadjust[fraction_of_zeroes < 1 - express_percent_limit] # must be expressed to this percentage of patients


print("Count Me In")
TMBC_tum_data = segmental_aic_find(
    filtered_df,
    "/path/to/6.2_Third_Party_Data.Best_AIC_vs_Expression_Level_Plot_Generator/qq_plots/TMBC/",
)
gradual_aic_plot(TMBC_tum_data, "TMBC_Project")



In [None]:
# Our own DCIS dataset: raw counts, VST-normalised expression, and the sample sheet
all_counts = pyreadr.read_r('/path/to/dcis/expression_counts.Jan2023_1_2_and_2_2.rds')
vst_norm = pyreadr.read_r('/path/to/dcis/expression_VST_Normalized.Jan2023_1_2_and_2_2.rds')
ship_data = pyreadr.read_r('/path/to/dcis/ship1_2_full_tbl.Jan2023.With_Stroma_Assignment.rds')
# The counts-only RDS is used instead of the RDA because it works and loads faster.
# A possible future alternative is 'rpy2', which requires R.

# the data frame inside each read_r result is retrieved with the key None
df = all_counts[None]
ship_df = ship_data[None]

# ship_data already excludes unwanted patients (its 'blacklist' flags are all False),
# so keep only the count columns whose name appears in its 'sample_name' column
df_blacklist_filtered = df[ship_df['sample_name']]

# Split the samples by tissue using the marker in the column name
count_DCIS = df_blacklist_filtered.filter(like='_D')
count_STROMA = df_blacklist_filtered.filter(like='_S')
count_NORMAL = df_blacklist_filtered.filter(like='_N')

# The VST table is loaded but no longer used to restrict the genes, because doing so
# blocks any gene with >80% frac_zero; the tables below are therefore the full per-tissue counts.
vst_table = vst_norm[None]
DCIS_filtered_norm_count = count_NORMAL
DCIS_filtered_tumour_count = count_DCIS
DCIS_filtered_stroma_count = count_STROMA

# Relabel the row index of the per-tissue tables from gene_id to gene_name.
# rename() returns new frames, so the DCIS_filtered_* tables above keep the gene_id index.
count_DCIS, count_STROMA, count_NORMAL = (
    tissue_counts.rename(index=dict(zip(gene_convert["gene_id"], gene_convert["gene_name"])))
    for tissue_counts in (count_DCIS, count_STROMA, count_NORMAL)
)


# Library adjustment and zero-fraction filter, one tissue at a time.
# NOTE(review): the cells that follow pass count_DCIS / count_NORMAL / count_STROMA to
# segmental_aic_find, not these filtered tables -- confirm which input is intended.
filtered_norm_count_libadjust = library_adjust(DCIS_filtered_norm_count)
fraction_of_zeroes = (filtered_norm_count_libadjust == 0).mean(axis=1)
filtered_norm_count = filtered_norm_count_libadjust[fraction_of_zeroes < 1 - express_percent_limit] # must be expressed to this percentage of patients

filtered_tumour_count_libadjust = library_adjust(DCIS_filtered_tumour_count)
fraction_of_zeroes = (filtered_tumour_count_libadjust == 0).mean(axis=1)
filtered_tumour_count = filtered_tumour_count_libadjust[fraction_of_zeroes < 1 - express_percent_limit] # must be expressed to this percentage of patients

filtered_stroma_count_libadjust = library_adjust(DCIS_filtered_stroma_count)
fraction_of_zeroes = (filtered_stroma_count_libadjust == 0).mean(axis=1)
filtered_stroma_count = filtered_stroma_count_libadjust[fraction_of_zeroes < 1 - express_percent_limit] # must be expressed to this percentage of patients


In [None]:
# DCIS tumour samples: collect the per-segment best-AIC tallies, then draw the per-dataset plot.
# NOTE(review): count_DCIS is the gene-name-indexed count table; the library-adjusted,
# zero-filtered filtered_tumour_count built in the previous cell is not used -- confirm intent.
print("Our Data: Tumours")
DCIS_tum_data = segmental_aic_find(
    count_DCIS,
    "/path/to/6.2_Third_Party_Data.Best_AIC_vs_Expression_Level_Plot_Generator/qq_plots/DCIS_tumour/",
)
gradual_aic_plot(DCIS_tum_data, "DCIS_Precise_Tumours")

In [None]:
# DCIS normal samples: collect the per-segment best-AIC tallies, then draw the per-dataset plot.
# NOTE(review): count_NORMAL is the gene-name-indexed count table; the library-adjusted,
# zero-filtered filtered_norm_count built two cells earlier is not used -- confirm intent.
print("Our Data: Normal")
DCIS_norm_data = segmental_aic_find(
    count_NORMAL,
    "/path/to/6.2_Third_Party_Data.Best_AIC_vs_Expression_Level_Plot_Generator/qq_plots/DCIS_normal/",
)
gradual_aic_plot(DCIS_norm_data, "DCIS_Precise_Normal")

In [None]:
# DCIS stroma samples: collect the per-segment best-AIC tallies, then draw the per-dataset plot.
# The call takes (count table, QQ-plot output folder), matching the tumour and normal cells;
# the stray extra comma that made this cell a SyntaxError has been removed.
# NOTE(review): count_STROMA is the gene-name-indexed count table; the library-adjusted,
# zero-filtered filtered_stroma_count built earlier is not used -- confirm intent.
print("Our Data: Stroma")
DCIS_stroma_data = segmental_aic_find(count_STROMA, "/path/to/6.2_Third_Party_Data.Best_AIC_vs_Expression_Level_Plot_Generator/qq_plots/DCIS_stroma/")
gradual_aic_plot(DCIS_stroma_data, "DCIS_Precise_Stroma")

In [None]:
# Combined line-plot figure: one panel per dataset. Each panel shows, for every expression
# segment, the percentage of genes tallied as best fit for each distribution.
# (matplotlib is already imported as plt at the top of the notebook.)

# (result tuple, panel title), listed row by row for the 5x2 grid
line_plot_panels = [
    (DCIS_tum_data, "DCIS-Precise: Tumour"),
    (DCIS_norm_data, "DCIS-Precise: Normal"),
    (DCIS_stroma_data, "DCIS-Precise: Stroma"),
    (GSE47462_tum_data, "GSE47462: Tumour"),
    (GSE120795_tum_data, "GSE120795: Normal"),
    (GSE146889_tum_data, "GSE146889: Tumour"),
    (GSE167977_tumours_data, "GSE167977: Tumour"),
    (GSE181466_data, "GSE181466: Tumour"),
    (GSE209998_ffpe_data, "GSE209998: Tumour"),
    (TMBC_tum_data, "TMBC"),
]

# Result tuple order is: add_vector_nb, add_vector_zinb, add_vector_zip, add_vector_poisson,
# add_vector_gaussian, add_vector_exp, sample_total.
# (legend label, index into the result tuple), in the order the lines are drawn.
line_plot_series = [
    ("NB", 0),
    ("ZINB", 1),
    ("Exponential", 5),
    ("Poisson", 3),
    ("ZIP", 2),
    ("Gaussian", 4),
]

x_values = ["Top 25%", "25-50%", "50-75%", "75-100%"]

# Create a 5x2 grid of subplots
fig, axs = plt.subplots(nrows=5, ncols=2, figsize=(8, 12))  # Adjust figsize as needed

# axs.flat walks the grid row by row, matching the order of line_plot_panels
for ax, (panel_result, title_text) in zip(axs.flat, line_plot_panels):
    sample_total = panel_result[6]

    for series_label, series_index in line_plot_series:
        # Scale to percent so the values agree with the "%" in the y-axis label.
        # A segment with a zero total is plotted as NaN (a gap) instead of dividing by zero.
        percent_of_genes = [
            100 * wins / total if total else float("nan")
            for wins, total in zip(panel_result[series_index], sample_total)
        ]
        ax.plot(x_values, percent_of_genes, marker='o', linestyle='-', label=series_label)

    ax.set_title(title_text, fontsize=10)
    ax.legend(fontsize=6)
    ax.set_xlabel('Genes Grouped by Overall Counts', fontsize=6)
    ax.set_ylabel('% of Genes Best Fitting Distribution', fontsize=6)

plt.tight_layout()

# The output name records the zero-fraction and trim settings used for this run
plt.savefig('/path/to/6.2_Third_Party_Data.Best_AIC_vs_Expression_Level_Plot_Generator/By_Expression_AIC_Plot.All.' +
    ".ZeroFract_" + str(express_percent_limit) + ".Trim_" + str(trim_percent) + ".pdf",
        dpi=300, bbox_inches='tight')  # dpi is dots per inch, for resolution

plt.show()

In [None]:
# Combined bar-plot figure: one panel per dataset, one bar per distribution.

# (result tuple, panel title), listed row by row for the 5x2 grid
bar_plot_panels = [
    (DCIS_tum_data, "DCIS-Precise: Tumour"),
    (DCIS_norm_data, "DCIS-Precise: Normal"),
    (DCIS_stroma_data, "DCIS-Precise: Stroma"),
    (GSE47462_tum_data, "GSE47462: Tumour"),
    (GSE120795_tum_data, "GSE120795: Normal"),
    (GSE146889_tum_data, "GSE146889: Tumour"),
    (GSE167977_tumours_data, "GSE167977: Tumour"),
    (GSE181466_data, "GSE181466: Tumour"),
    (GSE209998_ffpe_data, "GSE209998: Tumour"),
    (TMBC_tum_data, "The Metastatic Breast Cancer Project"),
]

# Result tuple order is: add_vector_nb, add_vector_zinb, add_vector_zip, add_vector_poisson,
# add_vector_gaussian, add_vector_exp, sample_total.
# bar_source_index[k] is the result-tuple position of the distribution named categories[k].
categories = ('NB', 'ZINB', 'Exponential', 'Poisson', 'ZIP', 'Gaussian')
bar_source_index = (0, 1, 5, 3, 2, 4)

pos = 6           # x position of the first bar
bar_width = 0.1   # bars sit side by side, one width apart
tick_positions = [pos + bar_width * n for n in range(len(categories))]

fig, axs = plt.subplots(nrows=5, ncols=2, figsize=(8, 12))  # Adjust figsize as needed

# axs.flat walks the grid row by row, matching the order of bar_plot_panels
for ax, (data, title_text) in zip(axs.flat, bar_plot_panels):
    # Each bar is the sum of the first two entries of that distribution's vector.
    # NOTE(review): only entries 0 and 1 are added (not all segments) -- confirm this is intended.
    bar_heights = [data[source][0] + data[source][1] for source in bar_source_index]

    print(title_text, *bar_heights)

    # Create a bar for each set of values
    for slot, (category, height) in enumerate(zip(categories, bar_heights)):
        ax.bar(pos + bar_width * slot, height, bar_width, label=category)

    # Adding and formatting title and labels
    ax.set_title(title_text, fontsize=10)
    ax.legend(fontsize=6)
    ax.set_xlabel('Distribution Types', fontsize=6)
    ax.set_ylabel('No. Genes Best Fitting Distribution', fontsize=6)
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(categories, fontsize=6)

plt.tight_layout()

# saving plot; the output name records the zero-fraction and trim settings used for this run
plt.savefig('/path/to/6.2_Third_Party_Data.Best_AIC_vs_Expression_Level_Plot_Generator/By_Expression_AIC_BarPlot.All.' +
    ".ZeroFract_" + str(express_percent_limit) + ".Trim_" + str(trim_percent) + ".pdf",
        dpi=300, bbox_inches='tight')  # dpi is dots per inch, for resolution

plt.show()

    
       