In [None]:
# EJM Script
# I am rusty with Python, so the first thing I want to do is to import the data we need

# first lets read in scipy, as I'll need the "curve_fit" function in optimize
from __future__ import print_function

import numpy as np
from scipy.special import gammaln
from scipy.special import psi
from scipy.special import factorial
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from scipy.stats import poisson
from scipy.special import comb
import math
import sys
import pandas as pd

# now the DCIS count data is found in an RDA file, which we apparently read using 'pyreadr'

import pyreadr


# and a function I use in the document
def variance_mean(row):
    """Summarise one gene's counts and its expected zero fraction under a negative binomial.

    Parameters
    ----------
    row : array-like (e.g. pandas.Series)
        Counts of a single gene across samples.

    Returns
    -------
    pandas.Series
        ``variance``      sample variance (ddof=1)
        ``mean``          arithmetic mean
        ``StDev``         square root of ``variance``
        ``Dispersion``    mean**2 / (variance - mean); 0 when the mean is 0
                          or variance == mean
        ``zero_fraction`` observed share of entries that are exactly 0
        ``prob_frac``     expected share of zeros under the fitted NB; 1 when
                          no positive ``Dispersion`` estimate exists

    Notes
    -----
    With Var = mean + alpha * mean**2, the quantity mean**2 / (variance - mean)
    is the NB *size* parameter (1 / alpha), so the ``Dispersion`` column holds
    the size parameter rather than alpha.
    """
    variance = np.var(row, ddof=1)  # ddof=1 -> unbiased sample variance
    mean = np.mean(row)
    std_dev = np.sqrt(variance)

    # Method-of-moments estimate; left at 0 when it would divide by zero.
    dispersion = 0
    if (mean != 0) & (variance != mean):
        dispersion = (mean**2) / (variance - mean)

    # NB probability of a zero: (size / (size + mean)) ** size.
    # `dispersion` already is the size parameter, so it is used directly;
    # its reciprocal (alpha) would only be correct in the
    # (1 / (1 + alpha * mean)) ** (1 / alpha) form.
    prob_frac = 1  # no NB estimate (mean == 0 or variance <= mean)
    if dispersion > 0:
        prob_frac = (dispersion / (mean + dispersion)) ** dispersion

    # Observed fraction of samples with a zero count for this gene.
    zero_frac = (row == 0).sum() / len(row)
    return pd.Series({'variance': variance, 'mean': mean, "StDev": std_dev, "Dispersion": dispersion, 'zero_fraction': zero_frac, 'prob_frac': prob_frac})

In [None]:
def variance_mean(row):
    """Per-gene summary statistics plus the NB-expected fraction of zeros.

    Parameters
    ----------
    row : array-like (e.g. pandas.Series)
        Counts of one gene across samples.

    Returns
    -------
    pandas.Series
        Keys ``variance`` (ddof=1), ``mean``, ``StDev``, ``Dispersion``
        (mean**2 / (variance - mean), or 0 when undefined),
        ``zero_fraction`` (observed share of zeros) and ``prob_frac``
        (expected share of zeros; 1 when ``Dispersion`` is not positive).

    Notes
    -----
    Under Var = mean + alpha * mean**2, mean**2 / (variance - mean) equals
    1 / alpha, the NB size parameter. That is the value stored in
    ``Dispersion`` and the one the zero-probability formula needs.
    """
    variance = np.var(row, ddof=1)
    mean = np.mean(row)
    std_dev = np.sqrt(variance)

    dispersion = 0  # fallback when mean == 0 or variance == mean
    if (mean != 0) & (variance != mean):
        dispersion = (mean**2) / (variance - mean)

    # P(X = 0) for a negative binomial is (size / (size + mean)) ** size;
    # `dispersion` is that size parameter, so it goes in unchanged.
    prob_frac = 1  # kept as-is when no positive estimate is available
    if dispersion > 0:
        prob_frac = (dispersion / (mean + dispersion)) ** dispersion

    zero_frac = (row == 0).sum() / len(row)  # observed zero fraction
    return pd.Series({'variance': variance, 'mean': mean, "StDev": std_dev, "Dispersion": dispersion, 'zero_fraction': zero_frac, 'prob_frac': prob_frac})


In [None]:
# --- Read the RDS inputs -------------------------------------------------
# pyreadr.read_r returns a dict-like of data frames; the frames are pulled
# out below with the key None.
all_counts = pyreadr.read_r('/path/to/expression_counts.Jan2023_1_2_and_2_2.rds')
vst_norm = pyreadr.read_r('/path/to/expression_VST_Normalized.Jan2023_1_2_and_2_2.rds')

# Sample sheet. The counts-only RDS is used rather than the RDA because it
# loads faster; 'rpy2' (requires R) is a possible alternative later.
ship_data = pyreadr.read_r('/path/to/ship1_2_full_tbl.Jan2023.With_Stroma_Assignment.rds')

# Ensembl-ID -> gene-name conversion table, unwrapped straight to a DataFrame.
gene_convert = pyreadr.read_r('/path/to/ensemble_to_refseq_gene_name_table.rds')[None]

# --- Unwrap to pandas ----------------------------------------------------
df = all_counts[None]      # raw count table
ship_df = ship_data[None]  # sample sheet
#print(ship_df['blacklist'].value_counts()) # they're all false

# The sample sheet is already filtered, so keep only the count columns whose
# name appears in its 'sample_name' column.
kept_samples = ship_df['sample_name']
df_blacklist_filtered = df[kept_samples]

# --- Split the samples by tissue -----------------------------------------
# Column-name substring decides the tissue: '_D', '_S', '_N'.
count_DCIS, count_STROMA, count_NORMAL = (
    df_blacklist_filtered.filter(like=tissue_tag) for tissue_tag in ('_D', '_S', '_N')
)

print(count_DCIS.shape)




In [None]:
# Library-size adjustment: divide every count by the total count of its
# sample (column), once per tissue table.
column_sums = count_NORMAL.sum(axis=0)
count_NORMAL_libsizeadjust = count_NORMAL.div(column_sums, axis="columns")

column_sums = count_STROMA.sum(axis=0)
count_STROMA_libsizeadjust = count_STROMA.div(column_sums, axis="columns")

column_sums = count_DCIS.sum(axis=0)
count_DCIS_libsizeadjust = count_DCIS.div(column_sums, axis="columns")

In [None]:
# Genes that survived VST normalisation. This subset is computed but not used
# further: the VST step removed every gene with >80% zeros, so it is not a
# fair representation of the data.
vst_table = vst_norm[None]
filtered_count = count_NORMAL.loc[count_NORMAL.index.isin(vst_table.index)]
#print(filtered_count) # looks like it worked, rows reduced to 23203 as expected

# Ensembl ID -> gene name lookup, built once and shared by every table below.
gene_id_to_name = dict(zip(gene_convert["gene_id"], gene_convert["gene_name"]))

# Row labels still starting with this prefix had no gene name and are dropped.
pattern = '^ENSG0'


def _named_genes_only(table):
    """Rename the index from Ensembl IDs to gene names and drop rows still labelled ENSG0*."""
    renamed = table.rename(index=gene_id_to_name)
    still_ensembl = renamed.index.str.match(pattern)
    return renamed[~still_ensembl]


# Raw count tables.
# (Rounding was tried and shifts the plots slightly right, so it stays disabled.)
count_NORMAL = _named_genes_only(count_NORMAL)
#count_NORMAL = count_NORMAL.round(0)
count_STROMA = _named_genes_only(count_STROMA)
count_DCIS = _named_genes_only(count_DCIS)

# Library-size-adjusted tables.
count_NORMAL_libsizeadjust = _named_genes_only(count_NORMAL_libsizeadjust)
count_STROMA_libsizeadjust = _named_genes_only(count_STROMA_libsizeadjust)
count_DCIS_libsizeadjust = _named_genes_only(count_DCIS_libsizeadjust)


In [None]:
# now we have a table of just normal counts, with proper gene names, and Ensemble-only genes removed
# now we should be able to actually start computing things!

# lets make a function
# Later information - this turns out to be for the Poisson distribution!
# need to figure out how to do expected NB!
def variance_mean(row):
    """Summarise one gene's counts and its expected zero fraction under a negative binomial.

    Parameters
    ----------
    row : array-like (e.g. pandas.Series)
        Counts of a single gene across samples.

    Returns
    -------
    pandas.Series
        ``variance``      sample variance (ddof=1)
        ``mean``          arithmetic mean
        ``StDev``         square root of ``variance``
        ``Dispersion``    mean**2 / (variance - mean); 0 when the mean is 0
                          or variance == mean
        ``Invert_Disp``   1 / ``Dispersion``; 0 when ``Dispersion`` was not
                          estimated
        ``zero_fraction`` observed share of entries that are exactly 0
        ``prob_frac``     expected share of zeros under the fitted NB; 1 when
                          ``Dispersion`` is not positive

    Notes
    -----
    With Var = mean + alpha * mean**2, ``Dispersion`` as computed here equals
    1 / alpha (the NB size parameter) and ``Invert_Disp`` equals alpha.
    """
    variance = np.var(row, ddof=1)
    mean = np.mean(row)
    std_dev = np.sqrt(variance)

    # Method-of-moments estimates; both stay 0 when they would divide by zero.
    dispersion = 0
    inverse_dispersion = 0
    if (mean != 0) & (variance != mean):
        dispersion = (mean**2) / (variance - mean)
        inverse_dispersion = dispersion**(-1)

    # NB probability of a zero, written with size = `dispersion`:
    #     (size / (size + mean)) ** size
    # which is the same value as
    #     (1 / (1 + mean * alpha)) ** (1 / alpha)   with alpha = `inverse_dispersion`.
    prob_frac = 1  # no NB estimate (mean == 0 or variance <= mean)
    if dispersion > 0:
        prob_frac = (dispersion / (mean + dispersion)) ** dispersion

    # Observed fraction of samples with a zero count for this gene.
    zero_frac = (row == 0).sum() / len(row)
    return pd.Series({'variance': variance, 'mean': mean, "StDev": std_dev, "Dispersion": dispersion, "Invert_Disp": inverse_dispersion, 'zero_fraction': zero_frac, 'prob_frac': prob_frac})

# Run variance_mean once per gene (row) for every tissue, on both the raw
# and the library-size-adjusted count tables.
NORMAL_dispersion = count_NORMAL.apply(variance_mean, axis="columns")
NORMAL_dispersion_libadj = count_NORMAL_libsizeadjust.apply(variance_mean, axis="columns")

STROMA_dispersion = count_STROMA.apply(variance_mean, axis="columns")
STROMA_dispersion_libadj = count_STROMA_libsizeadjust.apply(variance_mean, axis="columns")

DCIS_dispersion = count_DCIS.apply(variance_mean, axis="columns")
DCIS_dispersion_libadj = count_DCIS_libsizeadjust.apply(variance_mean, axis="columns")



In [None]:
# Print the per-gene summary table computed for the DCIS counts.
print(DCIS_dispersion)

In [None]:
# how many genes had 0 counts across the board for normals: 2383
#print(NORMAL_dispersion[NORMAL_dispersion['zero_fraction'] > 0.99])

#info = NORMAL_dispersion[NORMAL_dispersion['zero_fraction'] > 0.99]

#if (info['mean'] > 100).any():
#    print(info.loc[info['mean'] > 100])

# Keep only the genes whose expected zero fraction is below 1, then list the
# 100 of them with the largest expected zero fraction.
prob_over_zero = NORMAL_dispersion['prob_frac'].lt(1)
probe_over_zero_filt = NORMAL_dispersion.loc[prob_over_zero]
print(probe_over_zero_filt.nlargest(100, 'prob_frac'))


In [None]:
# they then used curve_fit in scipy.optimize to make the curves in Figure 1, lets try it

# we need a curve function for the plot; this is from an example, I'm not sure if it's the right one yet
def poisson_func(x, mu):
    """Poisson probability mass at ``x`` for rate ``mu``.

    Shaped as a model function for ``scipy.optimize.curve_fit``: the first
    argument is the independent variable, the second the parameter to fit.

    Parameters
    ----------
    x : array-like
        Points at which to evaluate the mass function.
    mu : float
        Poisson rate.

    Returns
    -------
    Same shape as ``x``: ``poisson.pmf(x, mu)``.
    """
    # Evaluate at the argument that was passed in, not at a notebook-level
    # variable, so the function works for any input.
    return poisson.pmf(x, mu)

# Observed points: zero fraction of each gene against its standard deviation.
# (The 'Dispersion' column looked nothing like expected, so 'StDev' is used as
# the dispersion measure on the x axis here.)
ydata = NORMAL_dispersion['zero_fraction']
xdata = NORMAL_dispersion['StDev']

# Expected points: the 'prob_frac' column against the mean.
mean = NORMAL_dispersion['mean']
prob = NORMAL_dispersion['prob_frac']

# Least-squares fit of poisson_func with prob as x and mean as y.
popt, pcov = curve_fit(poisson_func, prob, mean)

# Both point clouds on one log-x axis; the expected cloud is nearly
# transparent so the observed data stays visible underneath.
fig, ax = plt.subplots()
ax.scatter(xdata, ydata)
ax.scatter(mean, prob, alpha=0.01)
#ax.plot(xdata, poisson_func(xdata, *popt), 'r-', label='fit: mu=%5.3f' % popt[0])
ax.set_xscale('log')
ax.set_ylabel('ZeroFraction')
ax.set_xlabel('Dispersion')
#ax.legend()
plt.show()

# Open question: why do the left-most "expected" points sit further left
# than the "observed" ones?



In [None]:
# Mean expression (y) against the 'Dispersion' column (x, log scale),
# one figure per tissue.

# --- Normal ---
ydata = NORMAL_dispersion['mean']
xdata = NORMAL_dispersion['Dispersion']

# Only needed by the disabled overlay below.
prob = NORMAL_dispersion['prob_frac']
mean = NORMAL_dispersion['Dispersion']  # NOTE: holds the 'Dispersion' column despite its name

fig, ax = plt.subplots()
ax.scatter(xdata, ydata)
#ax.scatter(mean, prob, alpha=0.02)
ax.set_xscale('log')
ax.set_ylabel('Mean')
ax.set_xlabel('Dispersion - Normal')
#ax.legend()
ax.set_xlim(0.01, 100)
ax.set_ylim(0, 200000)
ax.set_title("DCIS Normal - Mean vs Log(Dispersion)", loc='center')
plt.show()

# --- Stroma ---
ydata = STROMA_dispersion['mean']
xdata = STROMA_dispersion['Dispersion']

prob = STROMA_dispersion['prob_frac']
mean = STROMA_dispersion['Dispersion']  # NOTE: holds the 'Dispersion' column despite its name

fig, ax = plt.subplots()
ax.scatter(xdata, ydata)
#ax.scatter(mean, prob, alpha=0.02)
ax.set_xscale('log')
ax.set_ylabel('Mean')
ax.set_xlabel('Dispersion - Stroma')
#ax.legend()
ax.set_xlim(0.01, 100)
ax.set_ylim(0, 200000)
ax.set_title("DCIS Stroma - Mean vs Log(Dispersion)", loc='center')
plt.show()

# --- DCIS ---
ydata = DCIS_dispersion['mean']
xdata = DCIS_dispersion['Dispersion']

prob = DCIS_dispersion['prob_frac']
mean = DCIS_dispersion['Dispersion']  # NOTE: holds the 'Dispersion' column despite its name

fig, ax = plt.subplots()
ax.scatter(xdata, ydata)
#ax.scatter(mean, prob, alpha=0.02)
ax.set_xscale('log')
ax.set_ylabel('Mean')
ax.set_xlabel('Dispersion - DCIS')
#ax.legend()
ax.set_xlim(0.01, 100)
ax.set_ylim(0, 200000)
ax.set_title("DCIS Tumour - Mean vs Log(Dispersion)", loc='center')
plt.show()

In [None]:
# Observed zero fraction (first scatter) and the 'prob_frac' column (second,
# nearly transparent scatter) against mean expression on a log x axis,
# one figure per tissue.
#
# Only the upper x limit is pinned: a lower limit of 0 cannot be drawn on a
# log axis, so matplotlib ignores it with a warning and keeps the autoscaled
# lower bound. Passing right= alone gives the same view without the warning.

# --- Normal ---
ydata = NORMAL_dispersion['zero_fraction']
xdata = NORMAL_dispersion['mean']

mean = NORMAL_dispersion['mean']
prob = NORMAL_dispersion['prob_frac']

plt.scatter(xdata, ydata)
plt.scatter(mean, prob, alpha=0.02)
plt.xscale('log')
plt.ylabel('Fraction Zeroes')
plt.xlabel('Mean - Normal')
#plt.legend()
plt.xlim(right=1000000)
plt.title("DCIS Normal - %Zeros vs Log(Mean)", fontdict=None, loc='center', pad=None)
plt.show()

# --- Stroma ---
ydata = STROMA_dispersion['zero_fraction']
xdata = STROMA_dispersion['mean']

prob = STROMA_dispersion['prob_frac']
mean = STROMA_dispersion['mean']

plt.scatter(xdata, ydata)
plt.scatter(mean, prob, alpha=0.02)
plt.xscale('log')
plt.ylabel('Fraction Zeroes')
plt.xlabel('Mean - Stroma')
#plt.legend()
plt.xlim(right=1000000)
plt.title("DCIS Stroma - %Zeros vs Log(Mean)", fontdict=None, loc='center', pad=None)

plt.show()

# --- DCIS ---
ydata = DCIS_dispersion['zero_fraction']
xdata = DCIS_dispersion['mean']

# How many genes have a mean above 6183.
count_changes = (xdata > 6183).sum()
print(count_changes)

prob = DCIS_dispersion['prob_frac']
mean = DCIS_dispersion['mean']

plt.scatter(xdata, ydata)
plt.scatter(mean, prob, alpha=0.02)
plt.xscale('log')
plt.ylabel('Fraction Zeroes')
plt.xlabel('Mean - DCIS')
#plt.legend()
plt.xlim(right=1000000)
plt.title("DCIS Tumour - % Zeros vs Log(Mean)", fontdict=None, loc='center', pad=None)
plt.show()


In [None]:
# Define the negative binomial PDF
def neg_binom_pdf(k, r, p):
    """Negative binomial probability mass: comb(k + r - 1, k) * p**r * (1 - p)**k.

    Parameters
    ----------
    k : number or array-like
        Point(s) at which to evaluate the mass function.
    r : number
        Exponent applied to ``p``.
    p : number
        Probability raised to ``r``; ``1 - p`` is raised to ``k``.
    """
    n_arrangements = comb(k + r - 1, k)
    p_term = p**r
    q_term = (1 - p)**k
    return n_arrangements * p_term * q_term

# Data for the fit: zero fraction (y) against the 'Dispersion' column (x).
ydata = NORMAL_dispersion['zero_fraction']
xdata = NORMAL_dispersion['Dispersion']

# Least-squares estimate of (r, p) for neg_binom_pdf.
popt, pcov = curve_fit(neg_binom_pdf, xdata, ydata)

# Data as blue dots, fitted curve as a faint red line.
fig, ax = plt.subplots()
ax.plot(xdata, ydata, 'bo', label='data')
ax.plot(xdata, neg_binom_pdf(xdata, *popt), 'r-', label='fit', alpha=0.02)
ax.set_xlabel('k')
ax.set_ylabel('P(k)')
ax.legend()
plt.show()

# Report the fitted parameters.
print('r =', popt[0])
print('p =', popt[1])



#ydata = NORMAL_dispersion['zero_fraction']
# the "Dispersion" calculation based on the formula given in Svensson looks nothing like expected
# the StDev (also called the dispersion)
#xdata = NORMAL_dispersion['Dispersion'] 

#mean = NORMAL_dispersion['mean']
#prob = NORMAL_dispersion['prob_frac']

# plot data points and fitted curve
##plt.scatter(xdata, ydata)
#plt.scatter(mean, prob)
#plt.xscale('log')
#plt.ylabel('ZeroFraction')
#plt.xlabel('Dispersion')
#plt.show()

In [None]:
# Distribution of per-gene counts within a single Normal sample.
# The sample ID is kept in one variable so the title always names the column
# that is actually plotted.
patient1_id = 'DCRT_348_Obs_305_N'
patient1 = count_NORMAL[patient1_id]

# One bin per count value from 0 to 2000, log-scaled frequencies.
plt.hist(patient1, range=(0, 2000), bins=2000)
plt.yscale('log')
plt.xlabel('#Counts')
plt.ylabel('Frequency (log scale)')
plt.title('Histogram of Counts from ' + patient1_id)

plt.show()

In [None]:

# Same histogram for a second Normal sample.
patient2_id = 'DCRT_116_Obs_338_N'
patient2 = count_NORMAL[patient2_id]

fig, ax = plt.subplots()
ax.hist(patient2, range=(0, 2000), bins=2000)
ax.set_yscale('log')
ax.set_xlabel('#Counts')
ax.set_ylabel('Frequency (log scale)')
ax.set_title('Histogram of Counts from ' + patient2_id)

plt.show()


In [None]:

# Zoom on the low end (counts 0-10) of the same sample, linear y axis.
patient2_id = 'DCRT_116_Obs_338_N'
patient2 = count_NORMAL[patient2_id]

fig, ax = plt.subplots()
ax.hist(patient2, range=(0, 10), bins=10)
#ax.set_yscale('log')
ax.set_xlabel('#Counts')
ax.set_ylabel('Frequency (no scale)')
ax.set_title('Histogram of Counts from ' + patient2_id)

plt.show()

In [None]:
# Print every entry of patient2 that is >= -1.
# NOTE(review): presumably all counts are non-negative, so this shows the whole column — confirm.
print(patient2[patient2 >= -1])

In [None]:
# Zero fraction of each gene against its mean library-size-adjusted
# expression (log x axis), one figure per tissue.

# --- Normal ---
ydata = NORMAL_dispersion_libadj['zero_fraction']
xdata = NORMAL_dispersion_libadj['mean']

#mean = NORMAL_dispersion['mean']
#prob = NORMAL_dispersion['prob_frac']

fig, ax = plt.subplots()
ax.scatter(xdata, ydata)
#ax.scatter(mean, prob, alpha=0.01)
#ax.plot(xdata, poisson_func(xdata, *popt), 'r-', label='fit: mu=%5.3f' % popt[0])
ax.set_xscale('log')
ax.set_ylabel('ZeroFraction of Gene')
ax.set_xlabel('Mean of Library-Adjusted Gene Expr - NORMAL')
#ax.legend()
plt.show()

# --- Stroma ---
ydata = STROMA_dispersion_libadj['zero_fraction']
xdata = STROMA_dispersion_libadj['mean']

fig, ax = plt.subplots()
ax.scatter(xdata, ydata)
#ax.scatter(mean, prob, alpha=0.01)
ax.set_xscale('log')
ax.set_ylabel('ZeroFraction of Gene')
ax.set_xlabel('Mean of Library-Adjusted Gene Expr - STROMA')
#ax.legend()
plt.show()

# --- DCIS ---
ydata = DCIS_dispersion_libadj['zero_fraction']
xdata = DCIS_dispersion_libadj['mean']

fig, ax = plt.subplots()
ax.scatter(xdata, ydata)
#ax.scatter(mean, prob, alpha=0.01)
ax.set_xscale('log')
ax.set_ylabel('ZeroFraction of Gene')
ax.set_xlabel('Mean of Library-Adjusted Gene Expr - DCIS')
#ax.legend()
plt.show()


In [None]:
##### THIRD PARTY DATASET ###### GSE146889

In [None]:
## Repeat the plots on a different data set (GSE146889).
# NOTE: this rebinds all_counts, gene_convert, df and count_NORMAL.
all_counts = pyreadr.read_r('/path/to/Third_Party_FFPE/GSE146889_GeneCount.rds')

# Kept in case an Ensembl -> RefSeq conversion is needed (not unwrapped here).
gene_convert = pyreadr.read_r('/path/to/metadata/ensemble_to_refseq_gene_name_table.rds')

# Count table as a pandas DataFrame.
df = all_counts[None]

# Tumours and normals are told apart by a substring of the column name.
count_TUMOR, count_NORMAL = (df.filter(like=tissue_tag) for tissue_tag in ('tumor', 'normal'))

#print(df_tumor)


In [None]:
# Library-size adjustment: divide every count by the total count of its
# sample (column).
column_sums = count_NORMAL.sum(axis=0)
count_NORMAL_libsizeadjust = count_NORMAL.div(column_sums, axis="columns")

column_sums = count_TUMOR.sum(axis=0)
count_TUMOR_libsizeadjust = count_TUMOR.div(column_sums, axis="columns")

print(count_NORMAL)

In [None]:
def variance_mean(row):
    """Summarise one gene's counts and its expected zero fraction under a negative binomial.

    Parameters
    ----------
    row : array-like (e.g. pandas.Series)
        Counts of a single gene across samples.

    Returns
    -------
    pandas.Series
        ``variance``      sample variance (ddof=1)
        ``mean``          arithmetic mean
        ``StDev``         square root of ``variance``
        ``Dispersion``    mean**2 / (variance - mean); 0 when the mean is 0
                          or variance == mean
        ``zero_fraction`` observed share of entries that are exactly 0
        ``prob_frac``     expected share of zeros under the fitted NB; 1 when
                          ``Dispersion`` is not positive

    Notes
    -----
    With Var = mean + alpha * mean**2, mean**2 / (variance - mean) is 1 / alpha,
    i.e. the NB size parameter; that is what the ``Dispersion`` column holds.
    """
    variance = np.var(row, ddof=1)
    mean = np.mean(row)
    std_dev = np.sqrt(variance)

    # Method-of-moments estimate; left at 0 when it would divide by zero.
    dispersion = 0
    if (mean != 0) & (variance != mean):
        dispersion = (mean**2) / (variance - mean)

    # NB probability of a zero is (size / (size + mean)) ** size, and
    # `dispersion` is the size parameter, so it is used as-is.
    prob_frac = 1  # no NB estimate (mean == 0 or variance <= mean)
    if dispersion > 0:
        prob_frac = (dispersion / (mean + dispersion)) ** dispersion

    # Observed fraction of samples with a zero count for this gene.
    zero_frac = (row == 0).sum() / len(row)
    return pd.Series({'variance': variance, 'mean': mean, "StDev": std_dev, "Dispersion": dispersion, 'zero_fraction': zero_frac, 'prob_frac': prob_frac})

# Run variance_mean once per gene (row), on the raw and the
# library-size-adjusted tables of both tissues.
NORMAL_dispersion = count_NORMAL.apply(variance_mean, axis="columns")
NORMAL_dispersion_libadj = count_NORMAL_libsizeadjust.apply(variance_mean, axis="columns")

TUMOR_dispersion = count_TUMOR.apply(variance_mean, axis="columns")
TUMOR_dispersion_libadj = count_TUMOR_libsizeadjust.apply(variance_mean, axis="columns")

In [None]:
# Normals: keep genes whose expected zero fraction is below 1 and list the
# 100 with the smallest 'Dispersion' value.
prob_over_zero = NORMAL_dispersion['prob_frac'].lt(1)
probe_over_zero_filt = NORMAL_dispersion.loc[prob_over_zero]
print(probe_over_zero_filt.nsmallest(100, 'Dispersion'))


In [None]:
# Observed zero fraction and the 'prob_frac' column, both against the mean
# on a log x axis. The second cloud is nearly transparent.

# --- Normal ---
ydata = NORMAL_dispersion['zero_fraction']
xdata = NORMAL_dispersion['mean']

mean = NORMAL_dispersion['mean']
prob = NORMAL_dispersion['prob_frac']

fig, ax = plt.subplots()
ax.scatter(xdata, ydata)
ax.scatter(mean, prob, alpha=0.01)
ax.set_xscale('log')
ax.set_ylabel('ZeroFraction')
ax.set_xlabel('Log[mean]')
ax.set_title("GSE146889 Normal - Zero Frac. vs Log(Mean) [Blue]; Expected Mean [Orange]", loc='center')
plt.show()

# --- Tumour ---
ydata = TUMOR_dispersion['zero_fraction']
xdata = TUMOR_dispersion['mean']

mean = TUMOR_dispersion['mean']
prob = TUMOR_dispersion['prob_frac']

fig, ax = plt.subplots()
ax.scatter(xdata, ydata)
ax.scatter(mean, prob, alpha=0.01)
ax.set_xscale('log')
ax.set_ylabel('ZeroFraction')
ax.set_xlabel('Log[mean]')
ax.set_title("GSE146889 Tumor - Zero Frac. vs Log(Mean) [Blue]; Expected Mean [Orange]", loc='center')
plt.show()


In [None]:
# Zero fraction against the 'Dispersion' column on a log x axis.
#
# Only the upper x limit is pinned: a lower limit of 0 cannot be drawn on a
# log axis, so matplotlib ignores it with a warning and keeps the autoscaled
# lower bound. Passing right= alone gives the same view without the warning.

# --- Normal ---
ydata = NORMAL_dispersion['zero_fraction']
xdata = NORMAL_dispersion['Dispersion']

prob = NORMAL_dispersion['prob_frac']
mean = NORMAL_dispersion['Dispersion']  # NOTE: holds the 'Dispersion' column despite its name

plt.scatter(xdata, ydata)
#plt.scatter(mean, prob, alpha=0.02)
plt.xscale('log')
plt.ylabel('ZeroFraction')
plt.xlabel('Dispersion - Normal')
plt.title("GSE146889 Normal - Zero Frac. vs Log(Dispersion); Expected Disp. [Orange]", fontdict=None, loc='center', pad=None)
plt.xlim(right=50)
plt.show()

# --- Tumour ---
ydata = TUMOR_dispersion['zero_fraction']
xdata = TUMOR_dispersion['Dispersion']

prob = TUMOR_dispersion['prob_frac']
mean = TUMOR_dispersion['Dispersion']  # NOTE: holds the 'Dispersion' column despite its name

# Second scatter overlays the 'prob_frac' column against the same x values.
plt.scatter(xdata, ydata)
plt.scatter(mean, prob, alpha=0.02)
plt.xscale('log')
plt.ylabel('ZeroFraction')
plt.xlabel('Dispersion - Tumor')
plt.title("GSE146889 Tumor - Zero Frac. vs Log(Dispersion); Expected Disp. [Orange]", fontdict=None, loc='center', pad=None)
plt.xlim(right=50)
plt.show()


In [None]:
# Mean expression against the 'Dispersion' column on a log x axis.
#
# Only the upper x limit is pinned: a lower limit of 0 cannot be drawn on a
# log axis, so matplotlib ignores it with a warning and keeps the autoscaled
# lower bound. Passing right= alone gives the same view without the warning.

# --- Normal ---
ydata = NORMAL_dispersion['mean']
xdata = NORMAL_dispersion['Dispersion']

# Only needed by the disabled overlay below.
prob = NORMAL_dispersion['prob_frac']
mean = NORMAL_dispersion['Dispersion']  # NOTE: holds the 'Dispersion' column despite its name

plt.scatter(xdata, ydata)
#plt.scatter(mean, prob, alpha=0.02)
plt.xscale('log')
plt.ylabel('Mean')
plt.xlabel('Dispersion - Normal')
# The y axis is the mean, so the title says "Mean", matching the tumour figure.
plt.title("GSE146889 Normal - Mean vs Log(Dispersion)", fontdict=None, loc='center', pad=None)
plt.xlim(right=50)
plt.show()

# --- Tumour ---
ydata = TUMOR_dispersion['mean']
xdata = TUMOR_dispersion['Dispersion']

prob = TUMOR_dispersion['prob_frac']
mean = TUMOR_dispersion['Dispersion']  # NOTE: holds the 'Dispersion' column despite its name

plt.scatter(xdata, ydata)
#plt.scatter(mean, prob, alpha=0.02)
plt.xscale('log')
plt.ylabel('Mean')
plt.xlabel('Dispersion - Tumor')
plt.title("GSE146889 Tumor - Mean vs Log(Dispersion)", fontdict=None, loc='center', pad=None)
plt.xlim(right=50)
plt.ylim(0, 200000)
plt.show()


In [None]:
# Distribution of per-gene counts within one GSE146889 normal sample.
patient2_id = 'MSI_MLH1G_normal_10_count'
patient2 = count_NORMAL[patient2_id]

# One bin per count value from 0 to 2000, log-scaled frequencies.
fig, ax = plt.subplots()
ax.hist(patient2, range=(0, 2000), bins=2000)
ax.set_yscale('log')
ax.set_xlabel('#Counts')
ax.set_ylabel('Frequency (log scale)')
ax.set_title('Histogram of Counts from ' + patient2_id)

plt.show()

In [None]:
### Third Party Dataset - GSE209998 ### 

In [None]:
## Repeat the plots for a different data set (GSE209998).
# NOTE: this cell rebinds all_counts, count_NORMAL and count_TUMOR.
all_counts = pyreadr.read_r('/path/to/Third_Party_FFPE/GSE209998_GeneCount.rds')


sample_information = pyreadr.read_r('/path/to/Third_Party_FFPE/GSE209998_Sample_Data.rds')

# Unwrap both results to pandas DataFrames (key None).

df_counts = all_counts[None] # count table
df_sample = sample_information[None] # sample metadata table

# Samples are classified as normal or tumour using the "!Sample_source_name_ch1" row.

# Column labels of both tables, kept for the order check below.
# (columns_df1 / columns_df2 are only used by the commented-out check.)
columns_df1 = df_counts.columns
columns_df2 = df_sample.columns

#if columns_df1.equals(columns_df2):
#    print("The columns are in the same order.")
# this check showed the columns are indeed in the same order

# Per-sample tissue label, indexed by sample (column) name.
samples_row = df_sample.loc["!Sample_source_name_ch1"]

# One count table per distinct tissue label: keep the count columns that also
# exist in the metadata and carry that label.
split_dfs = {}
for sample_type in samples_row.unique():
    matching_columns = [col for col in df_counts.columns if col in df_sample.columns and samples_row[col] == sample_type]
    split_dfs[sample_type] = df_counts[matching_columns]

# Pull out the three tissue groups by label (all samples, any preservation).
count_NORMAL = split_dfs["Normal tissue"]
count_TUMOR = split_dfs["Primary tumor"]
count_META = split_dfs["Metastatic tumor"]

print("NORMAL", count_NORMAL.shape[1])
print("PRIMARY", count_TUMOR.shape[1])
print("METASTAT.", count_META.shape[1])
# there are only 6 normal samples so they might not be good for further analysis
# there are 44 primary tumour and 79 metastatic

# Per-sample preservation label ("!Sample_source" row), indexed by sample name.
sample_source = df_sample.loc["!Sample_source"]

# Same splitting as above, this time by preservation label.
split_source = {}
for sample_type in sample_source.unique():
    matching_columns = [col for col in df_counts.columns if col in df_sample.columns and sample_source[col] == sample_type]
    split_source[sample_type] = df_counts[matching_columns]


count_FRESH = split_source["Fresh frozen"]
count_FFPE = split_source["FFPE"]

print("FRESH", count_FRESH.shape[1])
print("FFPE", count_FFPE.shape[1])
# note that each of these is a mix of primary, metastatic and normal samples

# How many of each tissue type are in the fresh-frozen subset?
# split_dfs is rebuilt from count_FRESH, so from here on count_NORMAL,
# count_TUMOR and count_META hold fresh-frozen samples only.

split_dfs = {}
for sample_type in samples_row.unique():
    matching_columns = [col for col in count_FRESH.columns if col in df_sample.columns and samples_row[col] == sample_type]
    split_dfs[sample_type] = count_FRESH[matching_columns]

# Tissue groups restricted to the fresh-frozen samples.
count_NORMAL = split_dfs["Normal tissue"]
count_TUMOR = split_dfs["Primary tumor"]
count_META = split_dfs["Metastatic tumor"]

print("NORMAL", count_NORMAL.shape[1])
print("PRIMARY", count_TUMOR.shape[1])
print("METASTAT.", count_META.shape[1])

In [None]:
# Library-size adjustment: divide every count by the total count of its
# sample (column), for the fresh-frozen and the FFPE tables.
column_sums = count_FRESH.sum(axis=0)
count_FRESH_libsizeadjust = count_FRESH.div(column_sums, axis="columns")

column_sums = count_FFPE.sum(axis=0)
count_FFPE_libsizeadjust = count_FFPE.div(column_sums, axis="columns")

#print(count_FFPE_libsizeadjust)


In [None]:
def variance_mean(row):
    """Summarise one row of counts for the mean-vs-zero-fraction plots.

    Parameters
    ----------
    row : pandas.Series (or array-like supporting ``row == 0``)
        Counts of a single row across samples.

    Returns
    -------
    pandas.Series with the keys
        'variance'      sample variance (ddof=1)
        'mean'          arithmetic mean
        'StDev'         square root of the variance
        'Dispersion'    method-of-moments NB dispersion, from
                        Var = mean + dispersion * mean^2, i.e.
                        dispersion = (variance - mean) / mean^2; 0 when the mean is 0
        'zero_fraction' observed fraction of entries equal to zero
        'prob_frac'     expected fraction of zeros: the NB zero probability when the
                        row is over-dispersed, otherwise the Poisson limit exp(-mean)
    """
    variance = np.var(row, ddof=1)  # ddof=1 -> unbiased sample variance
    mean = np.mean(row)
    std_dev = np.sqrt(variance)

    # Var = mean + dispersion * mean^2  =>  dispersion = (variance - mean) / mean^2.
    # Left at 0 when the mean is 0 (the ratio is undefined there).
    dispersion = 0
    if mean != 0:
        dispersion = (variance - mean) / (mean ** 2)

    if dispersion > 0:
        # NB zero probability: (r / (r + mean)) ** r, with size r = 1 / dispersion
        inverse_dispersion = 1 / dispersion
        prob_frac = (inverse_dispersion / (mean + inverse_dispersion)) ** inverse_dispersion
    else:
        # No over-dispersion: the NB collapses to a Poisson, whose zero probability is
        # exp(-mean). This is 1 for an all-zero row.
        prob_frac = np.exp(-mean)

    # observed fraction of the row that equals zero
    zero_frac = (row == 0).sum() / len(row)
    return pd.Series({'variance': variance, 'mean': mean, "StDev": std_dev, "Dispersion": dispersion, 'zero_fraction': zero_frac, 'prob_frac': prob_frac})

# Row-wise summaries (variance_mean applied with axis=1) for the raw and the
# library-size-adjusted tables of each preservation type
FRESH_dispersion, FRESH_dispersion_libadj = (
    table.apply(variance_mean, axis=1) for table in (count_FRESH, count_FRESH_libsizeadjust)
)

FFPE_dispersion, FFPE_dispersion_libadj = (
    table.apply(variance_mean, axis=1) for table in (count_FFPE, count_FFPE_libsizeadjust)
)

In [None]:
# For each preservation type: observed zero fraction (first scatter) and the
# 'prob_frac' expectation from variance_mean (second, faint scatter) against the
# row mean on a log-scaled x axis. Fresh-frozen first, then FFPE.
for summary, plot_title in (
    (FRESH_dispersion, "GSE209998 FreshFrozen - ZeroFrac vs Log(Mean) [Blue]; Exp. Mean [Orange]"),
    (FFPE_dispersion, "GSE209998 FFPE - ZeroFrac vs Log(Mean) [Blue]; Exp. Mean [Orange]"),
):
    xdata = summary['mean']
    ydata = summary['zero_fraction']
    mean, prob = summary['mean'], summary['prob_frac']

    plt.scatter(xdata, ydata)
    plt.scatter(mean, prob, alpha=0.01)
    plt.xscale('log')
    plt.xlim(0.001, 1000000)
    plt.xlabel('Log[mean]')
    plt.ylabel('ZeroFraction')
    plt.title(plot_title, loc='center')
    plt.show()


In [None]:
### Fourth Party Dataset - GSE47462 ### 

In [None]:
# GSE47462 is pre-processed here from the raw count table instead of from RDS files.
# Column names carry the sample type: normal, EN (early neoplasia), DCIS, IDC.
data = pd.read_csv(
    '/path/to/Third_Party_FFPE/Additional_Sets/GSE47462_Raw_counts_Refseq_genes.txt',
    sep='\t',
)

# One sub-table per sample type, picked by substring match on the column name
normal_data = data.filter(like='_normal')
EN_data = data.filter(like='_EN')
DCIS_data = data.filter(like='_DCIS')
IDC_data = data.filter(like='_IDC')

# There isn't a lot of data, so also pool every non-normal column into one "tumours"
# table; the first remaining column is dropped by position
tumours_data = data.loc[:, ~data.columns.str.contains('_normal')].iloc[:, 1:]

# plain numpy count matrices
normal_counts, en_counts, dcis_counts, idc_counts, tumours_counts = (
    frame.to_numpy() for frame in (normal_data, EN_data, DCIS_data, IDC_data, tumours_data)
)


In [None]:
# number of samples (columns) in each GSE47462 group
for label, counts in (
    ("NORMAL", normal_counts),
    ("IDC", idc_counts),
    ("DCIS", dcis_counts),
    ("Early Neoplastia", en_counts),
    ("All Tumours", tumours_counts),
):
    print(label, counts.shape[1])

In [None]:
# Tumour samples: wrap the count matrix in a DataFrame, then summarise the raw and
# the library-size-adjusted (count / column total) tables row by row
tumours_counts = pd.DataFrame(tumours_counts)
count_TUMOR_libsizeadjust = tumours_counts / tumours_counts.sum(axis=0)

TUMOR_dispersion, TUMOR_dispersion_libadj = (
    table.apply(variance_mean, axis=1) for table in (tumours_counts, count_TUMOR_libsizeadjust)
)

# Normal samples: same treatment
normal_counts = pd.DataFrame(normal_counts)
column_sums = normal_counts.sum(axis=0)
count_NORMAL_libsizeadjust = normal_counts / column_sums

NORMAL_dispersion, NORMAL_dispersion_libadj = (
    table.apply(variance_mean, axis=1) for table in (normal_counts, count_NORMAL_libsizeadjust)
)

In [None]:
# Observed zero fraction and the 'prob_frac' expectation against the row mean
# (log-scaled x axis): normals first, then the pooled tumours.
for summary, plot_title in (
    (NORMAL_dispersion, "GSE47462 Normal - ZeroFrac vs Log(Mean) [Blue]; Exp. Mean [Orange]"),
    (TUMOR_dispersion, "GSE47462 Tumours - ZeroFrac vs Log(Mean) [Blue]; Exp. Mean [Orange]"),
):
    xdata = summary['mean']
    ydata = summary['zero_fraction']
    mean, prob = summary['mean'], summary['prob_frac']

    plt.scatter(xdata, ydata)
    plt.scatter(mean, prob, alpha=0.01)
    plt.xscale('log')
    plt.xlim(0.001, 1000000)
    plt.xlabel('Log[mean]')
    plt.ylabel('ZeroFraction')
    plt.title(plot_title, loc='center')
    plt.show()

In [None]:
# Next Dataset - GSE120795_total_norms_raw_counts.tsv

In [None]:
# GSE120795: all samples come from normal patients, but not all of them are FFPE
# (some are fresh), so they need to be separated later.

# raw count table
data = pd.read_csv(
    '/path/to/Third_Party_FFPE/Additional_Sets/GSE120795_total_norms_raw_counts.tsv',
    sep='\t',
)

# per-sample annotation; the series matrix marks samples with "disease: healthy"
patient_info = pd.read_csv(
    '/path/to/Third_Party_FFPE/Additional_Sets/GSE120795_cell_info.txt',
    sep='\t',
)
# NOTE: the two files do NOT list their columns in the same order




In [None]:
# keep the annotation columns whose first-row entry is exactly "healthy"
mask = patient_info.iloc[0].eq("healthy")

filtered_data = patient_info.loc[:, mask]
patient_names = filtered_data.columns


In [None]:
# Select the count columns of the healthy patients. The annotation names lack the
# ".fastq.gz" suffix that the count-table columns carry, so append it; the first
# name is then discarded.
# NOTE(review): confirm the first selected name is not a real sample before dropping it.
column_names_with_extension = [name + ".fastq.gz" for name in patient_names][1:]

filtered_data = data[column_names_with_extension]

for table in (data, filtered_data):
    print(table.shape)


In [None]:
# GSE120795: row-wise summaries for the healthy samples selected above
ffpe_counts = pd.DataFrame(filtered_data)

# library-size adjustment: each count divided by its sample's (column's) total
column_sums = np.sum(ffpe_counts, axis=0)
count_FFPE_libsizeadjust = ffpe_counts / column_sums

FFPE_dispersion = ffpe_counts.apply(variance_mean, axis=1)
# Summarise THIS dataset's adjusted table. Using count_TUMOR_libsizeadjust here would
# silently report the GSE47462 tumour samples under the GSE120795 name.
FFPE_dispersion_libadj = count_FFPE_libsizeadjust.apply(variance_mean, axis=1)


In [None]:
# Observed zero fraction and the 'prob_frac' expectation against the row mean
xdata = FFPE_dispersion['mean']
ydata = FFPE_dispersion['zero_fraction']
mean, prob = FFPE_dispersion['mean'], FFPE_dispersion['prob_frac']

plt.scatter(xdata, ydata)              # observed
plt.scatter(mean, prob, alpha=0.01)    # expected, drawn faintly
plt.xscale('log')
plt.xlim(0.001, 1000000)
plt.xlabel('Log[mean]')
plt.ylabel('ZeroFraction')
plt.title("GSE120795 FFPE - ZeroFrac vs Log(Mean) [Blue]; Exp. Mean [Orange]", loc='center')
plt.show()

In [None]:
# ONE MORE Dataset - GSE193103

In [None]:
# GSE193103 count matrix
data = pd.read_csv(
    '/path/to/Third_Party_FFPE/Additional_Sets/GSE193103_salmon_gene.matrix_RAP101_plus_Normals24.txt',
    sep='\t',
)

# The set mixes fresh and FFPE samples, so they have to be separated.
# It also mixes breast tumours with metastatic breast samples, so there are
# differences there too (not 100% sure whether a metastasis is attributed to the
# tissue it was found in, or to the breast).
patient_info = pd.read_csv(
    '/path/to/Third_Party_FFPE/Additional_Sets/GSE193103_Patient_Data.txt',
    sep='\t',
)






In [None]:
# patient_info rows: [0] metastasis location, [1] cancer type (breast or metastasis),
# [2] preservation: "Fresh frozen" or "Formalin-Fixed Paraffin-Embedded"
preservation = patient_info.iloc[2]

mask = preservation == "Formalin-Fixed Paraffin-Embedded"
FFPE_data = patient_info.loc[:, mask]
FFPE_patient_names = FFPE_data.columns

mask = preservation == "Fresh frozen"
Fresh_data = patient_info.loc[:, mask]
Fresh_patient_names = Fresh_data.columns

# count columns of each preservation type, selected by patient name
filtered_FFPE_data = data[FFPE_patient_names]
filtered_Fresh_data = data[Fresh_patient_names]

# shapes: full table, then FFPE (20 columns), then fresh (105 columns)
for table in (data, filtered_FFPE_data, filtered_Fresh_data):
    print(table.shape)



In [None]:
# The first column of the full table holds the gene identifiers, not counts
gene_column = data.columns[0]

# dispersion of tumours - All Data
tumours_counts = data.drop(columns=[gene_column])
TUMOR_dispersion = tumours_counts.apply(variance_mean, axis=1)

# dispersion of tumours - FFPE Only
# filtered_FFPE_data was selected by patient name, so it normally has no gene column.
# Dropping "the first column" by position would throw away a real sample, so the gene
# column is removed by name and only if it is actually present.
FFPE_counts = filtered_FFPE_data.drop(columns=[gene_column], errors="ignore")
FFPE_dispersion = FFPE_counts.apply(variance_mean, axis=1)

# dispersion of tumours - Fresh Only (same reasoning as for FFPE)
Fresh_counts = filtered_Fresh_data.drop(columns=[gene_column], errors="ignore")
Fresh_dispersion = Fresh_counts.apply(variance_mean, axis=1)



In [None]:
# All samples (fresh and FFPE together): observed zero fraction and the 'prob_frac'
# expectation against the row mean
xdata = TUMOR_dispersion['mean']
ydata = TUMOR_dispersion['zero_fraction']
mean, prob = TUMOR_dispersion['mean'], TUMOR_dispersion['prob_frac']

plt.scatter(xdata, ydata)              # observed
plt.scatter(mean, prob, alpha=0.01)    # expected, drawn faintly
plt.xscale('log')
plt.xlim(0.001, 1000000)
plt.xlabel('Log[mean]')
plt.ylabel('ZeroFraction')
plt.title("GSE193103 All - ZeroFrac vs Log(Mean) [Blue]; Exp. Mean [Orange]", loc='center')
plt.show()

In [None]:
# Same plot, split by preservation type: FFPE first, then fresh/frozen
print(FFPE_dispersion)

for summary, plot_title in (
    (FFPE_dispersion, "GSE193103 FFPE - ZeroFrac vs Log(Mean) [Blue]; Exp. Mean [Orange]"),
    (Fresh_dispersion, "GSE193103 Fresh/Froz. - ZeroFrac vs Log(Mean) [Blue]; Exp. Mean [Orange]"),
):
    xdata = summary['mean']
    ydata = summary['zero_fraction']
    mean, prob = summary['mean'], summary['prob_frac']

    plt.scatter(xdata, ydata)              # observed
    plt.scatter(mean, prob, alpha=0.01)    # expected, drawn faintly
    plt.xscale('log')
    plt.xlim(0.001, 1000000)
    plt.xlabel('Log[mean]')
    plt.ylabel('ZeroFraction')
    plt.title(plot_title, loc='center')
    plt.show()



In [None]:
# new dataset - GSE181466

In [None]:
# GSE181466: all FFPE breast tumours, different treatments (not split up here)
data = pd.read_csv(
    '/path/to/Third_Party_FFPE/Additional_Sets/GSE181466_rsem_genes_matrix-97.txt',
    sep='\t',
)

# No split by patient information is needed: everything appears to be both FFPE and
# tumour. Subtype and age are available in the series matrix file if ever required.


In [None]:
# Row-wise summaries over all samples; the column at position 0 (genes) is dropped first
tumours_counts = data.drop(columns=data.columns[0])
column_sums = tumours_counts.sum(axis=0)
TUMOR_dispersion = tumours_counts.apply(variance_mean, axis=1)


In [None]:
print(data.shape)

# observed zero fraction and the 'prob_frac' expectation against the row mean
xdata = TUMOR_dispersion['mean']
ydata = TUMOR_dispersion['zero_fraction']
mean, prob = TUMOR_dispersion['mean'], TUMOR_dispersion['prob_frac']

plt.scatter(xdata, ydata)              # observed
plt.scatter(mean, prob, alpha=0.01)    # expected, drawn faintly
plt.xscale('log')
plt.xlim(0.001, 1000000)
plt.xlabel('Log[mean]')
plt.ylabel('ZeroFraction')
plt.title("GSE181466 FFPE - ZeroFrac vs Log(Mean) [Blue]; Exp. Mean [Orange]", loc='center')
plt.show()

In [None]:
# Space-delimited compiled count table; a data line looks like
# "5" "ENSG00000000003.15" "TSPAN6" "protein_coding" 350 247 ...
data = pd.read_csv(
    '/data/lab_vm/raw/preffect/7_datasets/third_party/CountMeIn_BConly_third_party_ffpe/MBC_CMI_Compiled_Counts.tsv',
    sep=' ',
)

# Row-wise summaries over all samples; the first four columns are gene descriptors
tumours_counts = data.iloc[:, 4:]

column_sums = tumours_counts.sum(axis=0)
TUMOR_dispersion = tumours_counts.apply(variance_mean, axis=1)


In [None]:
print(data.shape)

# observed zero fraction and the 'prob_frac' expectation against the row mean
xdata = TUMOR_dispersion['mean']
ydata = TUMOR_dispersion['zero_fraction']
mean, prob = TUMOR_dispersion['mean'], TUMOR_dispersion['prob_frac']

plt.scatter(xdata, ydata)              # observed
plt.scatter(mean, prob, alpha=0.01)    # expected, drawn faintly
plt.xscale('log')
plt.xlim(0.001, 1000000)
plt.xlabel('Log[mean]')
plt.ylabel('ZeroFraction')
plt.title("TMBC FFPE - ZeroFrac vs Log(Mean) [Blue]; Exp. Mean [Orange]", loc='center')
plt.show()

In [None]:
# new dataset - GSE167977

In [None]:
# all FFPE breast tumours, different treatment (not split up)
import pandas as pd

data = pd.read_csv(
    '/path/to/Third_Party_FFPE/Additional_Sets/GSE167977_Raw_Counts.txt',
    sep='\t',
)

# The series matrix is not loaded: samples only really differ by treatment, and
# there is no outcome information.


In [None]:
# Row-wise summaries over all samples.
# Non-count columns are stripped first: the gene column at position 0, then the
# last five columns, which hold gene annotation rather than counts.
counts_without_gene = data.drop(columns=data.columns[0])
tumours_counts = counts_without_gene.drop(columns=counts_without_gene.columns[-5:])

column_sums = tumours_counts.sum(axis=0)
TUMOR_dispersion = tumours_counts.apply(variance_mean, axis=1)

print(tumours_counts)



In [None]:
print(data.shape)

# observed zero fraction and the 'prob_frac' expectation against the row mean
xdata = TUMOR_dispersion['mean']
ydata = TUMOR_dispersion['zero_fraction']
mean, prob = TUMOR_dispersion['mean'], TUMOR_dispersion['prob_frac']

plt.scatter(xdata, ydata)              # observed
plt.scatter(mean, prob, alpha=0.01)    # expected, drawn faintly
plt.xscale('log')
plt.xlim(0.001, 1000000)
plt.xlabel('Log[mean]')
plt.ylabel('ZeroFraction')
plt.title("GSE167977 FFPE - ZeroFrac vs Log(Mean) [Blue]; Exp. Mean [Orange]", loc='center')
plt.show()

In [None]:
# lets make QQ Plots, comparing gene expression with an NB
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats

# Gene used for the Q-Q plot: row 15 of tumours_counts (the table with non-count
# columns already removed)
observed_random_gene = tumours_counts.iloc[15, :]

# Method-of-moments NB fit:  p = mean / var,  r = mean^2 / (var - mean)
mean_gene_expression = np.mean(observed_random_gene)
var_gene_expression = np.var(observed_random_gene)
if not var_gene_expression > mean_gene_expression:
    # r would be <= 0 (or undefined) and nbinom.ppf would silently return NaN
    raise ValueError(
        "Selected gene is not over-dispersed (variance <= mean); "
        "the method-of-moments NB fit is undefined"
    )
p = mean_gene_expression / var_gene_expression
r = mean_gene_expression**2 / (var_gene_expression - mean_gene_expression)

# Sort the observed data
sorted_data = np.sort(observed_random_gene)

# Plotting positions (i - 0.5) / n for i = 1..n
observed_quantiles = (np.arange(len(sorted_data)) + 0.5) / len(sorted_data)

# NB quantiles at the same probabilities
theoretical_quantiles = stats.nbinom.ppf(observed_quantiles, r, p)

# Q-Q plot with the y = x reference line. A line drawn between the extremes of the
# two samples would pass through the end points by construction and always look
# like a good fit.
plt.scatter(theoretical_quantiles, sorted_data)
line_limits = [
    min(theoretical_quantiles.min(), sorted_data.min()),
    max(theoretical_quantiles.max(), sorted_data.max()),
]
plt.plot(line_limits, line_limits, 'r--')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('Observed Quantiles')
plt.title('Q-Q Plot for NB Data')
plt.show()



In [None]:
# now again, using a ZINB
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats

# Gene used for the Q-Q plot: row 15 of tumours_counts (the table with non-count
# columns already removed)
observed_random_gene = tumours_counts.iloc[15, :]

# Crude ZINB fit: pi is the observed zero fraction; the NB parameters come from the
# method of moments on the non-zero counts only
pi = sum(observed_random_gene == 0) / len(observed_random_gene)
non_zero_data = observed_random_gene[observed_random_gene > 0]
if len(non_zero_data) == 0:
    raise ValueError("Selected gene has no non-zero counts; nothing to compare")
mean_gene_expression = np.mean(non_zero_data)
var_gene_expression = np.var(non_zero_data)
if not var_gene_expression > mean_gene_expression:
    # r would be <= 0 (or undefined) and nbinom.rvs would fail or return NaN
    raise ValueError(
        "Non-zero counts are not over-dispersed (variance <= mean); "
        "the method-of-moments NB fit is undefined"
    )
p = mean_gene_expression / var_gene_expression
r = mean_gene_expression**2 / (var_gene_expression - mean_gene_expression)

# Sort the observed data, zeros removed
sorted_data = np.sort(non_zero_data)

# Plotting positions (i - 0.5) / n for i = 1..n
observed_quantiles = (np.arange(len(sorted_data)) + 0.5) / len(sorted_data)

# Theoretical ZINB quantiles from a large simulated sample. The generator is seeded
# so that re-running the cell reproduces the same figure.
n_samples = 10000
sim_rng = np.random.RandomState(0)
nb_samples = stats.nbinom.rvs(r, p, size=n_samples, random_state=sim_rng)
zero_inflated = sim_rng.rand(n_samples) < pi
zinb_samples = np.where(zero_inflated, 0, nb_samples)
zinb_samples_no_zero = zinb_samples[zinb_samples > 0]
if zinb_samples_no_zero.size == 0:
    raise ValueError("Simulated ZINB sample contains only zeros; cannot compute quantiles")

theoretical_quantiles = np.percentile(zinb_samples_no_zero, observed_quantiles*100)

# Q-Q plot with the y = x reference line
plt.scatter(theoretical_quantiles, sorted_data)
line_limits = [
    min(theoretical_quantiles.min(), sorted_data.min()),
    max(theoretical_quantiles.max(), sorted_data.max()),
]
plt.plot(line_limits, line_limits, 'r--')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('Observed Quantiles')
plt.title('QQ-Plot for ZINB Data')
plt.show()


In [None]:
# using Maximum Likelihood Estimation (MLE) to compute ZINB

from scipy.optimize import minimize
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

def nll_zinb(params, data):
    """Negative log-likelihood of a zero-inflated negative binomial (ZINB).

    Model:  P(0) = pi + (1 - pi) * NB(0; r, p)
            P(k) = (1 - pi) * NB(k; r, p)        for k > 0

    Parameters
    ----------
    params : sequence of three floats ``(r, p, pi)``
        ``r`` and ``p`` are passed to ``scipy.stats.nbinom`` as its ``n`` and ``p``;
        ``pi`` is the zero-inflation probability.
    data : array-like of non-negative counts

    Returns
    -------
    float
        Minus the summed log-probability of every observation in ``data``.
    """
    r, p, pi = params
    counts = np.asarray(data)

    positive_counts = counts[counts > 0]
    n_zeros = np.sum(counts == 0)

    # every observed zero contributes the same mixture term, so weight it by the count
    log_p_zero = np.log(pi + (1 - pi) * stats.nbinom.pmf(0, r, p))
    # positive counts can only come from the NB component, which has weight (1 - pi)
    log_p_positive = np.log1p(-pi) + stats.nbinom.logpmf(positive_counts, r, p)

    return -(n_zeros * log_p_zero + np.sum(log_p_positive))

# Observed gene expression: row 15 of tumours_counts
observed_random_gene = tumours_counts.iloc[15, :]

# Initial guesses for r, p, and pi
initial_guess = [1, 0.5, 0.1]

# Bounds for r, p, and pi
bounds = [(0.01, 20), (0.01, 0.99), (0.01, 0.99)]

result = minimize(nll_zinb, initial_guess, args=(observed_random_gene,), bounds=bounds)
if not result.success:
    # the plot is still drawn, but it should not be trusted if the fit failed
    print("WARNING: ZINB optimisation did not converge:", result.message)
r_mle, p_mle, pi_mle = result.x

sorted_data = np.sort(observed_random_gene)

# Plotting positions (i - 0.5) / n for i = 1..n
observed_quantiles = (np.arange(len(sorted_data)) + 0.5) / len(sorted_data)

# Theoretical ZINB quantiles, taking pi_mle as the zero-inflation probability.
# The ZINB CDF is F(x) = pi + (1 - pi) * F_NB(x), so the q-quantile is 0 for q <= pi
# and otherwise the NB quantile at level (q - pi) / (1 - pi).
nb_levels = np.clip((observed_quantiles - pi_mle) / (1 - pi_mle), 0, 1)
theoretical_quantiles = np.where(
    observed_quantiles <= pi_mle,
    0,
    stats.nbinom.ppf(nb_levels, r_mle, p_mle),
)

# Q-Q plot with the y = x reference line
plt.scatter(theoretical_quantiles, sorted_data)
plt.plot([min(theoretical_quantiles), max(theoretical_quantiles)], [min(theoretical_quantiles), max(theoretical_quantiles)], 'r--')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('Observed Quantiles')
plt.title('QQ plot using Maximum Likelihood Estimation (ZINB)')
plt.show()


In [None]:
from PIL import Image, ImageOps
import os

image_path = "/path/to/1_estimating_common_dispersion/mean_vs_zerofract_plots/"

# The combined figure is saved back into this same directory as "combined_plot.png".
# Leave it out of the input list, otherwise a second run finds 14 images and fails
# the check below.
output_name = "combined_plot.png"

# Sorted paths of the plot images in the directory
image_files = sorted(
    os.path.join(image_path, file)
    for file in os.listdir(image_path)
    if file.endswith(('.png', '.jpg', '.jpeg')) and file != output_name
)

# Check if there are exactly 13 images
if len(image_files) != 13:
    print(len(image_files))
    raise ValueError("There must be exactly 13 image files in the directory")

# Trim the margins around a plot image
def crop_image(image):
    """Return `image` cropped to the bounding box of its non-white content.

    The image is converted to grayscale and inverted, so pure-white pixels become 0.
    `getbbox()` on the inverted copy then gives the box around everything that is
    not pure white, and the ORIGINAL image is cropped to that box.
    """
    content_box = ImageOps.invert(image.convert('L')).getbbox()
    return image.crop(content_box)

# Load and crop every plot. crop() returns an independent image, so each source
# file can be closed as soon as it has been cropped.
images = []
for img in image_files:
    with Image.open(img) as source_image:
        images.append(crop_image(source_image))

# Cropping can leave the plots with slightly different sizes, so every grid cell is
# sized for the largest plot. This is the same as the first image's size when all
# plots are equal.
img_width = max(image.width for image in images)
img_height = max(image.height for image in images)

# Define grid size (5 columns x 3 rows)
grid_width = 5
grid_height = 3
if len(images) > grid_width * grid_height:
    raise ValueError("More images than grid cells; enlarge grid_width / grid_height")

# Create a blank image with the appropriate size
combined_image = Image.new('RGB', (grid_width * img_width, grid_height * img_height))

# Paste images into the combined image, filling the grid row by row
for idx, image in enumerate(images):
    x = (idx % grid_width) * img_width
    y = (idx // grid_width) * img_height
    combined_image.paste(image, (x, y))

# Save the combined image
final_impage = image_path + "combined_plot.png"

combined_image.save(final_impage)