# Data Filtering
This notebook filters test data against reference data.
1. Load the reference data.
2. Load the test data.
3. Filter the test data based on the reference data.
4. Store the filtered test data in a file.

In [33]:
import numpy as np
import math
import matplotlib.pylab as plt

%matplotlib inline
%load_ext autoreload
%autoreload 2

# Import from the public `IPython.display` module: importing `display` from
# `IPython.core.display` is deprecated.
from IPython.display import display, HTML
# Inject a CSS rule that sets the `.container` element to 100% width.
display(HTML("<style>.container { width:100% !important; }</style>"))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### The following functions are tools to pull the data out from each text file. 
1. extract_data_fromM0: returns the list of gene symbols/names, which is used as the "Reference".
2. extract_data_fromM1toM2_test: returns three lists: "Fold-change (log2[FC])", "gene symbols", and "gene description".

In [34]:
def extract_data_fromM0(filename):
    """Read the gene symbols (first column) from a tab-delimited reference file.

    The first line of the file is treated as a header and skipped, and lines
    that contain only whitespace are ignored.

    Parameters
    ----------
    filename : str
        Path of the reference text file (e.g. 'yourfilename.txt').

    Returns
    -------
    list of str
        The first tab-separated field of every non-blank data line, in file
        order. Duplicates are kept.
    """
    symbols = []
    # `with` guarantees the handle is closed even if reading raises part-way.
    with open(filename, 'r') as infile:
        infile.readline()  # skip the header line
        for line in infile:
            if not line.strip():
                continue  # nothing but whitespace on this line
            # Trim newlines, spaces and apostrophes from both ends of the line.
            # This is the same character set the original strip call used.
            # NOTE(review): stripping apostrophes looks accidental -- confirm
            # before narrowing the set.
            fields = line.strip("\n '").split("\t")
            symbols.append(fields[0])
    return symbols

def extract_data_fromM1toM2_test(filename):
    """Read fold-change, gene symbol and description columns from a test file.

    The file is tab-delimited. Its first line is treated as a header and
    skipped, and lines that contain only whitespace are ignored. From every
    remaining line, column 0 is converted to float, and columns 2 and 5 are
    kept as strings.

    Parameters
    ----------
    filename : str
        Path of the test text file.

    Returns
    -------
    tuple of (list of float, list of str, list of str)
        Column 0 values, column 2 values (gene symbols) and column 5 values
        (gene descriptions), each in file order and of equal length.

    Raises
    ------
    ValueError
        If column 0 of a data line cannot be converted to float.
    IndexError
        If a data line has fewer than six tab-separated columns.
    """
    numbers1 = []
    symbols = []
    descriptions = []
    # `with` closes the file even when a malformed row raises below.
    with open(filename, 'r') as infile:
        infile.readline()  # skip the header line
        for line in infile:
            if not line.strip():
                continue  # nothing but whitespace on this line
            # Trim newlines, spaces and apostrophes from both ends of the line.
            # This is the same character set the original strip call used.
            # NOTE(review): stripping apostrophes looks accidental -- confirm
            # before narrowing the set.
            fields = line.strip("\n '").split("\t")
            # Extract all three values first so a bad row adds nothing partial.
            number1 = float(fields[0])
            symbol = fields[2]
            description = fields[5]
            numbers1.append(number1)
            symbols.append(symbol)
            descriptions.append(description)
    return numbers1, symbols, descriptions


### Comparison
1. Match the test data against the reference data to filter out irrelevant genes.
2. Write the matched genes to output files.

In [35]:
def _write_matched_genes(reference_symbols, test_symbols, test_descriptions,
                         output_path, title):
    """Write every test gene whose symbol occurs in the reference list to a file.

    Rows are ordered by the reference list first and by the test data second.
    A symbol repeated in either list produces one row per pairing.
    """
    # Index the test descriptions by symbol so each reference gene is a single
    # dictionary lookup instead of a scan over the whole test list.
    descriptions_by_symbol = {}
    for symbol, description in zip(test_symbols, test_descriptions):
        descriptions_by_symbol.setdefault(symbol, []).append(description)

    # `with` closes the output file even if a write fails.
    with open(output_path, 'w') as outfile:
        outfile.write(title)
        outfile.write('mRNA_Symbol, description \n')
        for symbol in reference_symbols:
            for description in descriptions_by_symbol.get(symbol, ()):
                outfile.write('{}, {} \n'.format(symbol, description))


def comparison(testUpfile, testDownfile,
               m1_reference_file='orecchioni-supp2-M0toM1-up.txt',
               m2_reference_file='orecchioni-supp2-M0toM2-up.txt',
               m1_output_file='M1specificDownM1toM2test.txt',
               m2_output_file='M2specificUpM1toM2test.txt'):
    """Filter the M1->M2 test data against the M1 and M2 reference gene lists.

    Two output files are written:

    * Case 1: genes down-regulated in the M1->M2 test data whose symbol also
      appears in the M0->M1 up-regulated reference list ("M1 specific").
    * Case 2: genes up-regulated in the M1->M2 test data whose symbol also
      appears in the M0->M2 up-regulated reference list ("M2 specific").

    Parameters
    ----------
    testUpfile : str
        Path of the up-regulated test file, e.g. 'yourtestfile1.txt'.
    testDownfile : str
        Path of the down-regulated test file, e.g. 'yourtestfile2.txt'.
    m1_reference_file : str, optional
        Reference file of M0->M1 up-regulated genes.
    m2_reference_file : str, optional
        Reference file of M0->M2 up-regulated genes.
    m1_output_file : str, optional
        Destination of the Case 1 result.
    m2_output_file : str, optional
        Destination of the Case 2 result.

    Returns
    -------
    None
        The results are only written to the two output files.
    """
    # Reference gene symbols.
    M0toM1up = extract_data_fromM0(m1_reference_file)   ## M1 specific upregulated genes
    M0toM2up = extract_data_fromM0(m2_reference_file)   ## M2 specific upregulated genes

    # Test data. The fold-change column is read but not needed for matching.
    _, symbolsdowntest, dscrptdowntest = extract_data_fromM1toM2_test(testDownfile)
    _, symbolsuptest, dscrptuptest = extract_data_fromM1toM2_test(testUpfile)

    ## Case 1: during M1 -> M2, M1 specific genes should be down-regulated, so the
    ## down-regulated test genes are matched against the M0 -> M1 up-regulated list.
    _write_matched_genes(
        M0toM1up, symbolsdowntest, dscrptdowntest, m1_output_file,
        'This data shows "M1 specific genes" appearing in the process of M0 to M1 being "down-regulated" in the process of M1 to M2 in test \n')

    ## Case 2: during M1 -> M2, M2 specific genes should be up-regulated, so the
    ## up-regulated test genes are matched against the M0 -> M2 up-regulated list.
    ## The header names the M0 to M2 process because that is the reference list
    ## these genes are matched against.
    _write_matched_genes(
        M0toM2up, symbolsuptest, dscrptuptest, m2_output_file,
        'This data shows "M2 specific genes" appearing in the process of M0 to M2 being "up-regulated" in the process of M1 to M2 in test \n')

In [36]:
# Run the filter: the first argument is the up-regulated test file, the second
# the down-regulated test file. The results are written to text files.
comparison('m1tom2beyond2FCup.txt','m1tom2beyond2FCdown.txt')