In [132]:
import pandas as pd
import numpy as np
import scipy.stats as stats

In [153]:
#infile = "~/Desktop/Research/Datasets/GSE206932_merged.counts.bulk.csv"
infile = "/Users/jolivie1/Desktop/Research/equivalence_testing_output/scripts/output/deseq_normalization/GSE206932_deseq2_normalized_counts.csv"
meta = "~/Desktop/Research/Datasets/GSE206932_meta.csv"
condition = "age"

delta = 1

df = pd.read_csv(infile, index_col = 0)

meta = pd.read_csv(meta, index_col = 0)

# only include samples that are in the metadata
df = df[meta.index]

# find which samples are in each group
groups = []
for name, group in meta.groupby(condition):
    groups.append(list(group.index))

# Remove columns that have <= 1 nonzero entry
# do in R script?
#nnz = (~(df == 0).all())
#df = df[nnz[nnz].index]
    
nnz_plus = ((df != 0).sum(axis=1) > 1)
df = df.loc[nnz_plus[nnz_plus].index,]


# log 2 scale all entries
df = np.log2(df + 1)

In [185]:
def degrees_freedom_welch(df):
    # see methods for formula
    # deviation from simple nf + ng - 2 because we're not assuming equal variances
    #df["nu"] = (df["sf2"]/df["nf"] + df["sg2"]/df["ng"])**2/(df["sf2"]**2/(df["nf"]**2*(df["nf"] - 1)) + df["sg2"]**2/(df["nf"]**2*(df["ng"] - 1)))
    df["nu"] = df["nf"]  + df["ng"]  - 2
    return df

def calc_t(df, delta):
    #df["num"] = (df["muf"] - df["mug"]) + delta
    #df["denom"] = np.sqrt((df["sf2"]/df["nf"]) + (df["sg2"]/df["ng"]))
    #df["t{}".format(delta)] = df["num"]/df["denom"]

    df["t{}".format(delta)] = ((df["muf"] - df["mug"]) + delta)/(np.sqrt((df["sf2"]/df["nf"]) + (df["sg2"]/df["ng"])))
    return df

def calc_diff_pval(df):
    df["cdf_t0"] = stats.t.cdf(df["t0"],df["nu"])
    df["diff_pval"] = 2*pd.concat([df["cdf_t0"], 1 - df["cdf_t0"]],axis=1).min(axis=1)
    return df

def calc_equiv_pval(df):
    df["cdf_t1"] = stats.t.cdf(df["t{}".format(-delta)], df["nu"])
    df["cdf_t2"] = 1 - stats.t.cdf(df["t{}".format(delta)], df["nu"])

    df["equiv_pval"] = pd.concat([calc_df["cdf_t1"], calc_df["cdf_t2"]],axis=1).max(axis=1)
    return df

In [186]:
calc_df = pd.concat([df[groups[0]].mean(axis=1), # mean group 1
           df[groups[1]].mean(axis=1), # mean group 2
           df[groups[0]].var(axis=1), # sample variance group 1
           df[groups[1]].var(axis=1)
          ],axis=1).rename({0 : "muf", 1 : "mug", 2 : "sf2", 3 : "sg2"},axis=1)
calc_df["nf"] = len(groups[0])
calc_df["ng"] = len(groups[1])


degrees_freedom_welch(calc_df)
calc_t(calc_df, 0)
calc_t(calc_df, delta)
calc_t(calc_df, -delta)

# adjust p values
# add fold change
# add num nnz
# add log vs non-log values
# add true/false based on fold change
calc_diff_pval(calc_df)

calc_equiv_pval(calc_df)

calc_df.sort_values("diff_pval")

Unnamed: 0,muf,mug,sf2,sg2,nf,ng,nu,t0,t1,t-1,cdf_t0,diff_pval,cdf_t1,cdf_t2,equiv_pval
Gm43094,0.917133,0.000000,0.005431,0.000000,6,6,10,30.483920,63.722196,-2.754356,1.000000e+00,3.381317e-11,1.016286e-02,1.099121e-14,1.016286e-02
Scgn,6.825182,3.747797,0.022352,0.065920,6,6,10,25.371537,33.616050,17.127024,1.000000e+00,2.074652e-10,1.000000e+00,6.413092e-12,1.000000e+00
Prrxl1,6.083772,3.325149,0.059716,0.064999,6,6,10,19.134090,26.070192,12.197989,1.000000e+00,3.306543e-09,9.999999e-01,7.935097e-11,9.999999e-01
Gm7609,3.463566,0.000000,0.222662,0.000000,6,6,10,17.979452,23.170474,12.788429,1.000000e+00,6.062924e-09,9.999999e-01,2.535024e-10,9.999999e-01
Rfng,9.092809,9.510107,0.001388,0.002080,6,6,10,-17.357115,24.236901,-58.951130,4.268590e-09,8.537181e-09,2.395685e-14,1.628156e-10,1.628156e-10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zc3h8,6.294303,6.294354,0.042274,0.032637,6,6,10,-0.000457,8.949134,-8.950048,4.998222e-01,9.996444e-01,2.175137e-06,2.177132e-06,2.177132e-06
n.R5s197,1.892780,1.892667,0.217510,0.207470,6,6,10,0.000421,3.757853,-3.757010,5.001639e-01,9.996722e-01,1.870239e-03,1.867653e-03,1.870239e-03
Gm25821,0.685335,0.685499,0.326975,0.621555,6,6,10,-0.000411,2.514659,-2.515481,4.998399e-01,9.996799e-01,1.531179e-02,1.533340e-02,1.533340e-02
Olfr1301,0.449902,0.450075,0.574330,0.506197,6,6,10,-0.000408,2.356039,-2.356856,4.998411e-01,9.996823e-01,2.008408e-02,2.011207e-02,2.011207e-02


In [183]:
calc_df.sort_values("equiv_pval")

Unnamed: 0,muf,mug,sf2,sg2,nf,ng,nu,t0,t1,t-1,cdf_t0,diff_pval,cdf_t1,cdf_t2,equiv_pval
Dcaf8,11.589439,11.572316,0.000565,0.000380,6,6,9.629062,1.364517,81.051236,-78.322202,0.898279,2.034417e-01,3.926680e-15,2.886580e-15,3.926680e-15
Tpr,10.817219,10.802622,0.000302,0.000524,6,6,9.327765,1.243758,86.452445,-83.964929,0.878031,2.439374e-01,4.757270e-15,3.663736e-15,4.757270e-15
Zmpste24,9.963893,9.897418,0.000567,0.000700,6,6,9.890803,4.574185,73.385170,-64.236800,0.999476,1.048996e-03,1.345894e-14,3.552714e-15,1.345894e-14
Usp12,9.779509,9.738158,0.000797,0.000593,6,6,9.790465,2.716423,68.408556,-62.975710,0.988958,2.208390e-02,2.114036e-14,9.436896e-15,2.114036e-14
Tor1b,9.844967,9.907562,0.000738,0.000760,6,6,9.997708,-3.961225,59.322357,-67.244808,0.001341,2.682324e-03,6.481572e-15,2.264855e-14,2.264855e-14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Gm16028,3.764315,0.318782,0.192730,0.246203,6,6,9.853755,12.738929,16.436160,9.041698,1.000000,1.924201e-07,9.999978e-01,8.670792e-09,9.999978e-01
Trank1,8.777007,5.422982,0.154560,0.208148,6,6,9.786386,13.641541,17.708756,9.574327,1.000000,1.088781e-07,9.999986e-01,4.643080e-09,9.999986e-01
Igha,12.196837,8.581238,0.117026,0.200597,6,6,9.352536,15.714473,20.060772,11.368174,1.000000,4.874335e-08,9.999996e-01,2.638620e-09,9.999996e-01
Prrxl1,6.083772,3.325149,0.059716,0.064999,6,6,9.982087,19.134090,26.070192,12.197989,1.000000,3.388911e-09,9.999999e-01,8.176249e-11,9.999999e-01


In [184]:
calc_df.loc["Smndc1"]

muf           9.011582e+00
mug           8.962767e+00
sf2           5.096687e-04
sg2           3.321802e-03
nf            6.000000e+00
ng            6.000000e+00
nu            6.499025e+00
t0            1.931726e+00
t1            4.150417e+01
t-1          -3.764072e+01
cdf_t0        9.510580e-01
diff_pval     9.788408e-02
cdf_t1        3.743394e-09
cdf_t2        1.988368e-09
equiv_pval    3.743394e-09
Name: Smndc1, dtype: float64

In [168]:
stats.t.cdf(4.5,9.4)

0.9993325540212846

In [170]:
(12.23 - 12.11)/np.sqrt(0.0015/6 + 0.0025/6)

4.647580015448939

In [158]:
calc_df.loc["Scgn"]

muf           6.825182e+00
mug           3.747797e+00
sf2           2.235188e-02
sg2           6.591977e-02
nf            6.000000e+00
ng            6.000000e+00
nu            8.041123e+00
t0            2.091760e+02
t1            2.771479e+02
t-1           1.412040e+02
cdf_t0        1.000000e+00
diff_pval     2.220446e-16
cdf_t1        1.000000e+00
cdf_t2        0.000000e+00
equiv_pval    1.000000e+00
Name: Scgn, dtype: float64

In [129]:
np.mean([141,134,104,136, 131,119])

127.5

In [130]:
np.mean([136.8749404,112.4771911,104.1948542,101.8498493,110.5323942,111.5936238])




112.92047550000001

In [90]:
calc_df = degrees_freedom_welch(calc_df)

In [109]:
stats.t.cdf(calc_df["t_0"],calc_df["nu"])

array([0.50383531, 0.6123588 , 0.4995195 , ..., 0.96737294, 0.7587263 ,
       0.99812505])