## 1. Write a Python function to run an external edgeR analysis

In [2]:
import subprocess
import sys

import numpy as np
import pandas as pd


def merge_input_files(file_cond_1,file_cond_2):
    """
    Merge two single-condition count tables into one tab-delimited file.

    Both inputs are read with pandas.read_table and joined on their shared
    '#gene' column. pd.merge defaults to an inner join, so only genes present
    in both files are kept. The result is written, without the pandas index,
    to 'merged_file' in the current working directory, overwriting any
    existing file of that name.

    Parameters
    ----------
    file_cond_1 : path to the first condition's table (its columns come first).
    file_cond_2 : path to the second condition's table.

    Returns
    -------
    None. The merged table is only written to disk.

    Raises
    ------
    KeyError if either table lacks a '#gene' column (raised by pd.merge).
    """
    # Each file is parsed exactly once; the result of an extra, discarded
    # read of file_cond_1 used to be thrown away here.
    file_1 = pd.read_table(file_cond_1)
    file_2 = pd.read_table(file_cond_2)
    file_with_two_conditions = pd.merge(file_1,file_2,on='#gene')
    file_with_two_conditions.to_csv('merged_file', sep="\t",index=False)

def run_edgeR(infile_1,infile_2):
    """
    Run an edgeR exact test on two conditions and return the result table.

    Steps:
      1. merge the two inputs into 'merged_file' via merge_input_files,
      2. write an R script to 'edgeR.r',
      3. execute it with Rscript,
      4. load the table the script writes to 'analysis.out'.

    All three files live in the current working directory and are overwritten
    on every call, so only the returned DataFrame preserves a given result.

    Parameters
    ----------
    infile_1 : path to the count table for the first condition (group 1).
    infile_2 : path to the count table for the second condition (group 2).
        The R script hard-codes the grouping c(1,1,1,2,2,2), i.e. it expects
        three sample columns from each file.

    Returns
    -------
    pandas.DataFrame parsed from the space-delimited 'analysis.out' (the
    topTags table for every row of the input).

    Raises
    ------
    FileNotFoundError if the Rscript executable cannot be found.
    subprocess.CalledProcessError if Rscript exits with a non-zero status;
        without this check a failed run would silently load a stale
        'analysis.out' left behind by an earlier call.
    """

    merge_input_files(infile_1,infile_2)

    # Lines of the generated .R script, in execution order.
    r_script_lines = [
        "library(edgeR)",
        "infile <- 'merged_file'",
        "group <- factor(c(1,1,1,2,2,2))",
        "outfile <-'analysis.out'",
        # NOTE(review): no header=TRUE here; read.table's default comment.char
        # is '#', so the '#gene' header row is presumably skipped as a comment
        # rather than parsed as counts. Confirm if the header format changes.
        "x <-read.table(infile, row.names=1)",
        "y <-DGEList(counts=x,group=group)",
        "y <-estimateDisp(y)",
        "et <- exactTest(y)",
        "tab <- topTags(et, nrow(x))",
        "write.table(tab, file=outfile, quote=FALSE)",
    ]

    with open('edgeR.r','w') as f:
        f.writelines("%s\n" % line for line in r_script_lines)

    # Plain subprocess call (argument list, no shell) instead of the
    # IPython-only "! Rscript" syntax, so the function also works outside a
    # notebook. Output is captured and relayed so R's messages stay visible.
    completed = subprocess.run(['Rscript', 'edgeR.r'], capture_output=True, text=True)
    sys.stdout.write(completed.stdout)
    sys.stderr.write(completed.stderr)
    completed.check_returncode()

    output_df = pd.read_table('analysis.out',sep=" ")

    return output_df



### Testing the function out.

In [3]:
# Smoke test: compare data files 1 and 2; the returned DataFrame is shown as the cell output.
run_edgeR('w07-data.1','w07-data.2')

Loading required package: limma
Using classic mode.


Unnamed: 0,logFC,logCPM,PValue,FDR
huckleberry,7.707685,10.829353,4.465235e-34,8.944312e-30
watercress,7.312414,9.782148,1.896153e-32,1.899092e-28
chestnut,7.506817,7.867870,3.059913e-31,1.828872e-27
rosemary,7.375350,9.955355,3.652083e-31,1.828872e-27
cauliflower,6.924342,10.785685,7.079227e-30,2.836080e-26
...,...,...,...,...
PRRG2,0.000034,6.777514,1.000000e+00,1.000000e+00
HS3ST2,-0.000005,3.980425,1.000000e+00,1.000000e+00
EXT2,0.000000,-4.895947,1.000000e+00,1.000000e+00
CCDC153,0.000000,-4.895947,1.000000e+00,1.000000e+00


## 2. Reproduce Wiggins' data, assign the missing labels

### Comparing data files 1 and 2:

In [8]:
# edgeR comparison of data files 1 and 2. run_edgeR reuses fixed file names
# (merged_file, edgeR.r, analysis.out), so the result is kept in its own variable.
results_1_2 = run_edgeR('w07-data.1','w07-data.2')
# Preview the first five rows of the result table.
results_1_2.head()

Loading required package: limma
Using classic mode.


Unnamed: 0,logFC,logCPM,PValue,FDR
huckleberry,7.707685,10.829353,4.465235e-34,8.944311999999999e-30
watercress,7.312414,9.782148,1.896153e-32,1.899092e-28
chestnut,7.506817,7.86787,3.059913e-31,1.8288720000000003e-27
rosemary,7.37535,9.955355,3.6520830000000004e-31,1.8288720000000003e-27
cauliflower,6.924342,10.785685,7.079227e-30,2.83608e-26


### Comparing data files 1 and 3:

In [70]:
# edgeR comparison of data files 1 and 3. run_edgeR reuses fixed file names
# (merged_file, edgeR.r, analysis.out), so the result is kept in its own variable.
results_1_3 = run_edgeR('w07-data.1','w07-data.3')
# Preview the first five rows of the result table.
results_1_3.head()

Loading required package: limma
Using classic mode.


Unnamed: 0,logFC,logCPM,PValue,FDR
MRPL49,2.045484,6.441372,3.8e-05,0.620922
ZNF623,1.934356,4.972154,8.3e-05,0.620922
ZNF718,-1.929874,6.817988,9.3e-05,0.620922
TUBA8,1.806545,2.40552,0.000274,0.958774
spinach,-1.761288,5.314717,0.000329,0.958774


### Comparing data files 2 and 3:

In [71]:
# edgeR comparison of data files 2 and 3. run_edgeR reuses fixed file names
# (merged_file, edgeR.r, analysis.out), so the result is kept in its own variable.
results_2_3 = run_edgeR('w07-data.2','w07-data.3')
# Preview the first five rows of the result table.
results_2_3.head()

Loading required package: limma
Using classic mode.


Unnamed: 0,logFC,logCPM,PValue,FDR
rosemary,-7.326192,9.955655,6.217532e-31,7.840068e-27
arugula,-7.348702,10.237827,7.827934000000001e-31,7.840068e-27
cauliflower,-7.165466,10.783871,5.99683e-30,4.004083e-26
huckleberry,-7.090442,10.833016,1.2542280000000001e-29,6.280861e-26
parsley,-6.964984,9.287336,1.221178e-28,4.892284e-25


In [74]:
# For each pairwise comparison, count the genes whose unadjusted p-value falls below the 0.05 threshold

labelled_results = [
    ('(1,2) smaller than 0.05:', results_1_2),
    ('(1,3) smaller than 0.05:', results_1_3),
    ('(2,3) smaller than 0.05:', results_2_3),
]
for heading, table in labelled_results:
    print(heading)
    print(sum(table.PValue < 0.05))

(1,2) smaller than 0.05:
2147
(1,3) smaller than 0.05:
986
(2,3) smaller than 0.05:
2136


### At the significance level of 0.05, Wiggins obtained his result of 2147 differentially expressed genes from data files 1 and 2. Counting the unadjusted p-values below the 0.05 threshold in each comparison, we observe that the (1,2) and (2,3) comparisons both have around 2140 significant p-values, whereas the (1,3) comparison has only 986. Thus, I conclude that data files 1 and 3 are wild type and data file 2 is the mutant type.

## 3. Wiggins doesn't understand p-values


### I do not agree with Wiggins' conclusion that 2147 genes are differentially expressed, because he used raw p-values that have not been adjusted for the false discovery rate. For a given gene, testing at the significance level of 0.05 means we accept a false positive rate of 0.05. There are 20031 genes in the dataset, so we would expect about 20031 x 0.05 = 1002 false positives. According to this calculation, nearly half of the 2147 genes deemed differentially expressed by Wiggins are expected to be false positives. He also did not filter out genes with low expression.

### I decided to control the false discovery rate at the 0.05 level, meaning that out of all the genes I deem differentially expressed, a fraction of 0.05 are expected to be false positives (FDR = FP / (FP + TP)). The FDR has been conveniently calculated for us by the edgeR algorithm; now we just need to rank the genes by FDR from smallest to largest and find all the genes that have FDR < 0.05.

In [21]:
# Low-expression floor: a gene is kept only if its logCPM exceeds log2(100)
log_cpm_threshold = np.log2(100)

# Select once so the preview and the count are taken from the same filtered table
significant_1_2 = results_1_2.query("(FDR < 0.05) & (logCPM > @log_cpm_threshold)")

print("last five genes with FDR < 0.05:")
print(significant_1_2.tail())
print("Number of differentially expressed gene with FDR < 0.05:")
print(significant_1_2.shape[0])

last five genes with FDR < 0.05:
              logFC     logCPM        PValue           FDR
raspberry  5.041524   8.298227  1.094444e-18  4.567252e-16
eggplant   4.746436  10.314267  6.561074e-18  2.682140e-15
parsnip    4.724994   9.074043  6.169461e-17  2.471609e-14
FAM179A   -2.221777   6.879760  7.785914e-06  2.942635e-03
ACO2      -1.958138   7.071671  7.943618e-05  2.552192e-02
Number of differentially expressed gene with FDR < 0.05:
50


### According to my calculations, there are only 50 statistically significant genes at an FDR threshold of 0.05 once genes with low expression (logCPM at or below log2(100)) are filtered out.

## 4. Wiggins missed something else too

### Wiggins missed the assumption that we are measuring the relative abundance of each gene in each sample, not total RNA output on a per-cell basis. In other words, we are only interested in measuring statistically significant "direct" effects on relative RNA abundance. Therefore, if a small number of highly expressed genes consume a large proportion of the total library size for a particular sample, the remaining genes will be undersampled in that sample. We need to add the trimmed mean of M-values (TMM) normalization step to the R script.

In [15]:

def run_edgeR_TMM(infile_1,infile_2):
    """
    Run an edgeR exact test with TMM normalization and return the result table.

    Same pipeline as the plain edgeR run, except that the generated R script
    calls calcNormFactors on the DGEList before estimating dispersion:
      1. merge the two inputs into 'merged_file' via merge_input_files,
      2. write an R script to 'edgeR.r',
      3. execute it with Rscript,
      4. load the table the script writes to 'analysis_tmm.out'.

    All three files live in the current working directory and are overwritten
    on every call, so only the returned DataFrame preserves a given result.

    Parameters
    ----------
    infile_1 : path to the count table for the first condition (group 1).
    infile_2 : path to the count table for the second condition (group 2).
        The R script hard-codes the grouping c(1,1,1,2,2,2), i.e. it expects
        three sample columns from each file.

    Returns
    -------
    pandas.DataFrame parsed from the space-delimited 'analysis_tmm.out' (the
    topTags table for every row of the input).

    Raises
    ------
    FileNotFoundError if the Rscript executable cannot be found.
    subprocess.CalledProcessError if Rscript exits with a non-zero status;
        without this check a failed run would silently load a stale
        'analysis_tmm.out' left behind by an earlier call.
    """

    merge_input_files(infile_1,infile_2)

    # Lines of the generated .R script, in execution order.
    r_script_lines = [
        "library(edgeR)",
        "infile <- 'merged_file'",
        "group <- factor(c(1,1,1,2,2,2))",
        "outfile <-'analysis_tmm.out'",
        # NOTE(review): no header=TRUE here; read.table's default comment.char
        # is '#', so the '#gene' header row is presumably skipped as a comment
        # rather than parsed as counts. Confirm if the header format changes.
        "x <-read.table(infile, row.names=1)",
        "y <-DGEList(counts=x,group=group)",
        # The normalization step: the only difference from the unnormalized script.
        "y_adjusted <- calcNormFactors(y)",
        "y <-estimateDisp(y_adjusted)",
        "et <- exactTest(y)",
        "tab <- topTags(et, nrow(x))",
        "write.table(tab, file=outfile, quote=FALSE)",
    ]

    with open('edgeR.r','w') as f:
        f.writelines("%s\n" % line for line in r_script_lines)

    # Plain subprocess call (argument list, no shell) instead of the
    # IPython-only "! Rscript" syntax, so the function also works outside a
    # notebook. Output is captured and relayed so R's messages stay visible.
    completed = subprocess.run(['Rscript', 'edgeR.r'], capture_output=True, text=True)
    sys.stdout.write(completed.stdout)
    sys.stderr.write(completed.stderr)
    completed.check_returncode()

    output_df = pd.read_table('analysis_tmm.out',sep=" ")

    return output_df

In [23]:
# Re-run the files 1 vs 2 comparison with the TMM normalization step included.
results_1_2_adjusted = run_edgeR_TMM('w07-data.1','w07-data.2')

Loading required package: limma
Using classic mode.


In [27]:
# NOTE(review): only the FDR cut-off is applied here; the pre-TMM count also
# required logCPM > log_cpm_threshold, so the two counts are not directly comparable.
print("Post-adjustment number of statistically significant genes given FDR threshold of 0.05")
significant_after_tmm = results_1_2_adjusted.query("(FDR < 0.05)")
print(len(significant_after_tmm))

Post-adjustment number of statistically significant genes given FDR threshold of 0.05
54


In [26]:
# Set difference: genes that pass the FDR and expression filters before TMM normalization but not after it
significance_filter = "(FDR < 0.05) & (logCPM > @log_cpm_threshold)"
genes_before_tmm = set(results_1_2.query(significance_filter).index)
genes_after_tmm = set(results_1_2_adjusted.query(significance_filter).index)
len(genes_before_tmm - genes_after_tmm)

1

### After the TMM normalization step, 54 genes are statistically significant at the FDR threshold of 0.05 (counted without the low-expression filter). Applying the same FDR and logCPM filters before and after normalization, 1 gene that was previously determined to be statistically significant is no longer statistically significant. This is because the TMM normalization step minimized the log-fold changes between the samples for most genes, so genes that were previously deemed statistically significant due to "indirect" effects from overexpressed genes in certain samples are eliminated.