## Group 1

In [None]:
import csv
import io
from typing import Dict, List

import pandas as pd
import requests
import xml2dict

class Group1:
    """Fetch Expression Atlas results for a list of experiments.

    On construction, the analytics table and the configuration XML of every
    experiment are requested. Each analytics table that downloads successfully
    is stored in ``dataset`` under its experiment id, and the ids of all
    contrasts whose name contains ``'Parkinson's disease' vs 'normal'`` are
    appended to ``park_cols``.

    Parameters
    ----------
    experiment_id : List[str]
        Experiment ids to download.
    array_design : int
        Inserted verbatim into the analytics file name
        (``{experiment}_{array_design}-analytics.tsv``).
        NOTE(review): file names elsewhere in this notebook embed identifiers
        such as ``A-AFFY-44``, so a string is probably expected - confirm.
    p_thred : float
        P-value threshold; stored on the instance.
    fc_thred : float
        Fold-change threshold; stored on the instance.

    Raises
    ------
    Exception
        Whatever ``requests.get`` raises (connection errors, timeouts)
        propagates out of the constructor. Non-OK HTTP responses are skipped.
    """

    # Root of the per-experiment directories on the Expression Atlas server.
    _ATLAS_BASE_URL = "http://ftp.ebi.ac.uk/pub/databases/microarray/data/atlas/experiments"
    # Contrast names containing this text are recorded in ``park_cols``.
    _PARKINSON_MARKER = "'Parkinson's disease' vs 'normal'"
    # Seconds to wait on the server so a stalled connection cannot hang forever.
    _REQUEST_TIMEOUT = 60

    def __init__(self, experiment_id: List[str], array_design: int, p_thred: float, fc_thred: float):
        self.p_thred = p_thred
        self.fc_thred = fc_thred
        self.experiment_id = experiment_id
        self.array_design = array_design
        self.dataset = dict()      # experiment id -> analytics DataFrame
        self.park_cols = list()    # contrast ids of Parkinson's-vs-normal comparisons

        self.__atlas_api_request()

    def __atlas_api_request(self):
        """Download analytics and configuration files for every experiment.

        An experiment whose analytics request fails is left out of
        ``dataset``; an experiment whose configuration request fails
        contributes nothing to ``park_cols``.
        """
        for eid in self.experiment_id:
            experiment_url = f"{self._ATLAS_BASE_URL}/{eid}/{eid}"

            # request experiment file
            url_analytics = f"{experiment_url}_{self.array_design}-analytics.tsv"
            response = requests.get(url_analytics, timeout=self._REQUEST_TIMEOUT)
            if response.ok and response.text.strip():
                # read_csv infers numeric dtypes, so p-values and fold changes
                # can be compared against the thresholds directly.
                self.dataset[eid] = pd.read_csv(io.StringIO(response.text), sep='\t')

            # request configuration file
            url = f"{experiment_url}-configuration.xml"
            response = requests.get(url, timeout=self._REQUEST_TIMEOUT)
            if not response.ok:
                # Without a configuration there are no contrasts to inspect;
                # never fall through to a previous experiment's contrasts.
                continue
            config = xml2dict.parse(response.text)
            contrast = config['configuration']['analytics']['contrasts']['contrast']
            # A single contrast is parsed as one mapping rather than a list.
            contrasts = contrast if isinstance(contrast, list) else [contrast]
            contrast_names = {entry['@id']: entry['name'] for entry in contrasts}

            # check the variables
            for contrast_id, name in contrast_names.items():
                if self._PARKINSON_MARKER in name:
                    self.park_cols.append(contrast_id)

    def get_regulation(self, experiment_id: str, group_id: str, p_thred: float, fc_thred: float) -> Dict:
        """Get all up/down regulated gene ids.

        Placeholder implementation: the arguments are ignored and a fixed
        example result is returned.

        Returns
        ------
        hgnc_ids : Dict
            Mapping with the keys ``"up"`` and ``"down"``, each holding a list
            of gene ids.
        """
        return {"up": ["HGNC687", "HGNC6383"], "down": ["HGNC6295", "HGNC6387"]}

## Real-world API example

In [5]:
import csv
import pandas as pd
import requests

# Download the E-GEOD-7307 analytics table and split it by hand: every cell,
# including the numeric ones, is kept as the string sent by the server.
url = "http://ftp.ebi.ac.uk/pub/databases/microarray/data/atlas/experiments/E-GEOD-7307/E-GEOD-7307_A-AFFY-44-analytics.tsv"
response = requests.get(url)
# Fail loudly on an HTTP error instead of parsing an error page into a frame.
response.raise_for_status()
text = response.text.splitlines()
mat = [row.split('\t') for row in text]
# First row holds the column names, the remaining rows the data.
data = pd.DataFrame(mat[1:], columns=mat[0])
data

Unnamed: 0,Gene ID,Gene Name,Design Element,g12_g5.p-value,g12_g5.t-statistic,g12_g5.log2foldchange,g12_g6.p-value,g12_g6.t-statistic,g12_g6.log2foldchange,g3_g2.p-value,...,g78_g21.log2foldchange,g82_g16.p-value,g82_g16.t-statistic,g82_g16.log2foldchange,g83_g17.p-value,g83_g17.t-statistic,g83_g17.log2foldchange,g88_g101.p-value,g88_g101.t-statistic,g88_g101.log2foldchange
0,ENSG00000000003,TSPAN6,209108_at,0.971106244269371,0.318607647973256,0,0.00106612873335634,-5.15081981220813,-0.5,,...,0.1,0.116127200247363,-2.07013131330731,-0.6,0.0953924620078502,-2.26220610329014,-0.5,0.128540734039497,-2.11556970337408,-0.7
1,ENSG00000000005,TNMD,220065_at,0.980500058238262,-0.200143495565014,0,,0.105346907849106,0,,...,-2.1,,1.17392229243968,0.3,0.937076512739623,0.109329045952407,0,0.0792927841067375,-2.51770646142493,-2.2
2,ENSG00000000419,DPM1,202673_at,0.845530339233909,-1.25233107190212,-0.2,0.513320474823469,0.785593624492703,0.1,,...,0.3,,-2.31045182420838,-0.5,,-2.47624291369624,-0.4,0.757479402226486,-0.429640553391134,-0.1
3,ENSG00000000457,SCYL3,205607_s_at,0.958495374162834,0.430481005001034,0,2.03102768857493e-05,8.71449856775735,0.9,,...,0.2,,1.83799221867179,0.3,,0.078980529600348,0,0.902772570755807,0.17837139579673,0
4,ENSG00000000460,C1orf112,220840_s_at,0.850385948099399,1.21591876144656,0.1,7.4255984543235e-08,-17.3880688358432,-1.4,,...,1.3,,0.748838583232621,0.2,,-0.0914719386152586,0,,0.878720752201448,0.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21211,ENSG00000288611,NPBWR1,231752_at,0.997468751180273,0.0450682270813076,0,0.389538024376504,1.03639133909172,0.1,,...,-0.1,,2.30528650105385,0.4,0.00415374018321716,5.62585830258156,0.9,0.751257514820387,-0.440696394754004,-0.2
21212,ENSG00000288642,CDR1,207276_at,0.975080774636055,0.24603313615322,0,,1.61746017918381,0.1,,...,-0.1,,-0.900017712163131,-0.2,,1.13180738886777,0.3,0.230259814185829,-1.6322544945266,-0.6
21213,ENSG00000288649,ACTL10,232619_at,0.997468751180273,0.0388311664129007,0,0.121593614772263,1.92438433962541,0.3,,...,0.1,,0.263725821996618,0,,2.12021478529583,0.4,0.13163266338067,-2.09618962588659,-0.5
21214,ENSG00000288658,,238727_at,0.886110554359145,-0.979169791657373,-0.1,0.026920510304836,-2.93538886549338,-0.2,,...,0.3,0.0329222089722274,-3.34735188493016,-0.8,0.0700222547608516,-2.50972341772677,-0.5,,-1.35552543119059,-0.2


In [2]:
import json
import xml2dict
import requests

# Download the E-GEOD-7307 configuration and map each contrast id to its
# human-readable name.
url = "http://ftp.ebi.ac.uk/pub/databases/microarray/data/atlas/experiments/E-GEOD-7307/E-GEOD-7307-configuration.xml"
response = requests.get(url)
# Fail loudly on an HTTP error instead of parsing an error page.
response.raise_for_status()
j = xml2dict.parse(response.text)
contrast = j['configuration']['analytics']['contrasts']['contrast']
# A single contrast is parsed as one mapping rather than a list; wrapping it
# keeps the result from silently ending up empty. The comprehension also
# avoids leaking loop variables (notably one named `id`, which would shadow
# the builtin) into the kernel namespace.
new_var_name = {
    entry['@id']: entry['name']
    for entry in (contrast if isinstance(contrast, list) else [contrast])
}
new_var_name

{'g3_g2': "'6 hour; SDF' vs '0 hour; none' in 'HepG2; hepatocellular carcinoma; liver'",
 'g12_g5': "'CXCL12, 10 nanomolar; 6 hour' vs 'none; 6 hour' in 'MDAMB231; adenocarcinoma; mammary gland'",
 'g12_g6': "'CXCL12, 75 nanomolar; 6 hour' vs 'none; 6 hour' in 'MDAMB231; adenocarcinoma; mammary gland'",
 'g48_g13': "'Parkinson's disease' vs 'normal' in 'globus pallidus'",
 'g63_g14': "'Parkinson's disease' vs 'normal' in 'nucleus accumbens'",
 'g72_g15': "'Parkinson's disease' vs 'normal' in 'putamen'",
 'g82_g16': "'Parkinson's disease' vs 'normal' in 'substantia nigra pars compacta'",
 'g83_g17': "'Parkinson's disease' vs 'normal' in 'substantia nigra pars reticulata'",
 'g71_g18': "'benign prostatic hyperplasia' vs 'normal' in 'prostate gland'",
 'g71_g100': "'prostate cancer' vs 'normal' in 'prostate gland'",
 'g78_g21': "'melanoma' vs 'normal' in 'skin'",
 'g88_g101': "'rheumatoid arthritis' vs 'normal' in 'synovial membrane of synovial joint'",
 'g61_g1': "'Adenomyosis' vs 'norma

## Real-world analysis example

In [29]:
import os
import pandas as pd

In [30]:
# Read the locally saved analytics table and give its six columns short,
# attribute-friendly names.
df = pd.read_csv(
    "./E-GEOD-20168-A-AFFY-33-analytics.tsv",
    sep="\t",
)
df.columns = [
    "ensembl_id",
    "gene_name",
    "design_element",
    "p_value",
    "t-statistic",
    "log2foldchange",
]
df

Unnamed: 0,ensembl_id,gene_name,design_element,p_value,t-statistic,log2foldchange
0,ENSG00000000003,TSPAN6,209108_at,0.623090,0.785727,0.2
1,ENSG00000000005,TNMD,220065_at,,2.179824,0.2
2,ENSG00000000419,DPM1,202673_at,0.033769,-3.595145,-1.0
3,ENSG00000000457,SCYL3,205607_s_at,,-0.899866,-0.1
4,ENSG00000000460,C1orf112,220840_s_at,,2.572624,0.1
...,...,...,...,...,...,...
12051,ENSG00000287080,H3C3,208577_at,,1.609577,0.1
12052,ENSG00000288569,LINC01949,217342_x_at,,2.409303,0.2
12053,ENSG00000288596,C8orf44,220216_at,,0.950915,0.1
12054,ENSG00000288642,CDR1,207276_at,0.132890,-2.375270,-0.3


In [6]:
# Significance cut-off for the p-value.
p_thred = 0.05
# Symmetric log2 fold-change cut-offs for up- and down-regulation.
fc_thred_pos = 1
fc_thred_neg = -fc_thred_pos

In [18]:
# Keep only the rows whose p-value is below the significance threshold
# (rows with a missing p-value never satisfy the comparison).
sigf_genes = df.loc[df["p_value"] < p_thred]
print("Number of significant genes:", len(sigf_genes))
sigf_genes.head()

Number of significant genes: 370


Unnamed: 0,ensembl_id,gene_name,design_element,p_value,t-statistic,log2foldchange
2,ENSG00000000419,DPM1,202673_at,0.033769,-3.595145,-1.0
24,ENSG00000002933,TMEM176A,218345_at,0.032012,3.717852,0.4
30,ENSG00000003402,CFLAR,211316_x_at,0.014177,4.456038,0.5
43,ENSG00000004534,RBM6,201967_at,0.020602,4.193546,0.6
48,ENSG00000004779,NDUFAB1,202077_at,0.032129,-3.708727,-0.9


In [16]:
# Up-regulated: significant and log2 fold change above the positive cut-off.
upreg_genes = df.loc[
    (df["p_value"] < p_thred) & (df["log2foldchange"] > fc_thred_pos)
]
print("Number of up-regulated genes:", len(upreg_genes))
upreg_genes.head()

Number of up-regulated genes: 1


Unnamed: 0,ensembl_id,gene_name,design_element,p_value,t-statistic,log2foldchange
1781,ENSG00000099875,MKNK2,218205_s_at,0.026579,3.914997,1.1


In [17]:
# Down-regulated: significant and log2 fold change below the negative cut-off.
downreg_genes = df.loc[
    (df["p_value"] < p_thred) & (df["log2foldchange"] < fc_thred_neg)
]
print("Number of down-regulated genes:", len(downreg_genes))
downreg_genes.head()

Number of down-regulated genes: 44


Unnamed: 0,ensembl_id,gene_name,design_element,p_value,t-statistic,log2foldchange
102,ENSG00000006128,TAC1,206552_s_at,0.021427,-4.120842,-1.7
303,ENSG00000014641,MDH1,200978_at,0.049699,-3.297273,-1.3
359,ENSG00000022355,GABRA1,206678_at,0.039589,-3.472968,-1.4
366,ENSG00000023228,NDUFS1,203039_s_at,0.000892,-6.65865,-1.2
371,ENSG00000023516,AKAP11,203156_at,0.031107,-3.74497,-1.3


In [3]:
# Example payload: one key mapped to a list of chemical-interaction records.
chemical_data = {
    "AHD": [
        {
            'chemical_id': 7467,
            'cas_id': 8487,
            'interaction': 'increase in expresion',
        },
    ],
}

In [15]:
# Print the CAS id of every interaction record.
# The loop variables are deliberately not called `data`, so the `data`
# DataFrame built in the API example is not overwritten in the kernel.
for interaction_records in chemical_data.values():
    for record in interaction_records:
        print(record['cas_id'])

8487


# 

In [None]:
class Group1:
    """Draft interface for querying regulated genes (backed by dummy data)."""

    @staticmethod
    def get_up_and_down_regulated_hgnc_symbols(experiment_id: str, 
                       group_id: str,
                       threshold_p_value: float = 0.05,
                       threshold_log2fold_change: float = 1) -> Dict[str, List[str]]:
        """Get all up/down regulated HGNC gene symbols.

        The result is currently computed from a small hard-coded table;
        ``experiment_id`` and ``group_id`` are accepted but not used yet.

        Parameters
        ----------
        experiment_id : str
            Experiment id (unused by this draft).
        group_id : str
            Contrast/group id (unused by this draft).
        threshold_p_value : float
            A gene counts as significant when its p-value is strictly below
            this value.
        threshold_log2fold_change : float
            Positive cut-off; genes above it are "up", genes below its
            negative are "down".

        Returns
        ------ 
        hgnc_symbols : Dict
            ``{"up": [...], "down": [...]}`` with gene symbols in table order.

        Raises
        ------
        AssertionError
            If ``threshold_log2fold_change`` is not positive.
        """
        # Validate before doing any work; the cut-off is mirrored around zero.
        assert threshold_log2fold_change > 0, "threshold_log2fold_change must be positive"

        dummy_dict = {"p_value":[0.026579, 0.021427, 0.032012, 0.049699, 0.039589, 0.020602, 0.032129], 
                      "log2foldchange": [1.1, -1.7, 0.4, -1.3, -1.4, 0.6, -0.9], 
                      "gene_name": ["MKNK2", "TAC1", "TMEM176A", "MDH1", "GABRA1", "RBM6", "NDUFAB1"]}
        df = pd.DataFrame(dummy_dict)

        significant = df["p_value"] < threshold_p_value
        upregulated = significant & (df["log2foldchange"] > threshold_log2fold_change)
        downregulated = significant & (df["log2foldchange"] < -threshold_log2fold_change)

        return {"up": df.loc[upregulated, "gene_name"].tolist(),
                "down": df.loc[downregulated, "gene_name"].tolist()}
    
    
# NOTE: the p-value column is 'p_value' after renaming the columns, or '{group_id}.p-value' in the raw Atlas analytics table.

In [9]:
import pandas as pd

In [23]:
# Analytics-table URL of each supported experiment. Every URL follows the same
# pattern and differs only in the experiment id and its array design.
parkinson_exp_dict = {
    experiment: (
        "http://ftp.ebi.ac.uk/pub/databases/microarray/data/atlas/experiments/"
        f"{experiment}/{experiment}_{array_design}-analytics.tsv"
    )
    for experiment, array_design in (
        ('E-GEOD-7307', 'A-AFFY-44'),
        ('E-MEXP-1416', 'A-AFFY-54'),
        ('E-GEOD-7621', 'A-AFFY-44'),
        ('E-GEOD-20168', 'A-AFFY-33'),
        ('E-GEOD-20333', 'A-AFFY-41'),
    )
}

def data_reader(experiment_id:str):
    """Load the analytics table of a supported experiment.

    Parameters
    ----------
    experiment_id : str
        Must be one of the keys of ``parkinson_exp_dict``.

    Returns
    -------
    pandas.DataFrame
        The tab-separated table found at the experiment's registered location.

    Raises
    ------
    ValueError
        If ``experiment_id`` is not a key of ``parkinson_exp_dict``.
    """
    # Guard clause: reject unknown ids before touching the network.
    if experiment_id not in parkinson_exp_dict:
        raise ValueError ("Experiment ID does not belong to Parkinson's disease")
    return pd.read_csv(parkinson_exp_dict[experiment_id], sep='\t')

In [24]:
# Load the E-GEOD-7307 analytics table and preview its first rows.
df1 = data_reader('E-GEOD-7307')
df1.head()

Unnamed: 0,Gene ID,Gene Name,Design Element,g12_g5.p-value,g12_g5.t-statistic,g12_g5.log2foldchange,g12_g6.p-value,g12_g6.t-statistic,g12_g6.log2foldchange,g3_g2.p-value,...,g78_g21.log2foldchange,g82_g16.p-value,g82_g16.t-statistic,g82_g16.log2foldchange,g83_g17.p-value,g83_g17.t-statistic,g83_g17.log2foldchange,g88_g101.p-value,g88_g101.t-statistic,g88_g101.log2foldchange
0,ENSG00000000003,TSPAN6,209108_at,0.971106,0.318608,0.0,0.001066129,-5.15082,-0.5,,...,0.1,0.116127,-2.070131,-0.6,0.095392,-2.262206,-0.5,0.128541,-2.11557,-0.7
1,ENSG00000000005,TNMD,220065_at,0.9805,-0.200143,0.0,,0.105347,0.0,,...,-2.1,,1.173922,0.3,0.937077,0.109329,0.0,0.079293,-2.517706,-2.2
2,ENSG00000000419,DPM1,202673_at,0.84553,-1.252331,-0.2,0.5133205,0.785594,0.1,,...,0.3,,-2.310452,-0.5,,-2.476243,-0.4,0.757479,-0.429641,-0.1
3,ENSG00000000457,SCYL3,205607_s_at,0.958495,0.430481,0.0,2.031028e-05,8.714499,0.9,,...,0.2,,1.837992,0.3,,0.078981,0.0,0.902773,0.178371,0.0
4,ENSG00000000460,C1orf112,220840_s_at,0.850386,1.215919,0.1,7.425598e-08,-17.388069,-1.4,,...,1.3,,0.748839,0.2,,-0.091472,0.0,,0.878721,0.1


In [25]:
# Load the E-MEXP-1416 analytics table and display it.
df2 = data_reader('E-MEXP-1416')
df2

Unnamed: 0,Gene ID,Gene Name,Design Element,g2_g1.p-value,g2_g1.t-statistic,g2_g1.log2foldchange,g4_g3.p-value,g4_g3.t-statistic,g4_g3.log2foldchange
0,ENSG00000000003,TSPAN6,g2995860_3p_at,,1.703524,1.0,0.522520,1.241137,0.9
1,ENSG00000000005,TNMD,g11545882_3p_at,,-0.329159,-0.1,,0.575565,0.1
2,ENSG00000000419,DPM1,g4503362_3p_at,,2.286994,2.0,0.997115,-0.006181,0.0
3,ENSG00000000457,SCYL3,g9967093_3p_s_at,,1.762578,1.6,0.723872,0.678925,0.4
4,ENSG00000000460,C1orf112,g8922604_3p_a_at,,-1.018646,-0.1,,-0.285732,-0.1
...,...,...,...,...,...,...,...,...,...
20978,ENSG00000288611,NPBWR1,Hs.248117.0.S1_3p_at,,-3.143602,-0.6,,-1.394599,-0.4
20979,ENSG00000288642,CDR1,g4757963_3p_at,,-0.590396,-0.1,,1.053423,0.2
20980,ENSG00000288649,ACTL10,Hs.191063.0.S1_3p_at,,-2.066269,-0.4,,0.415094,0.1
20981,ENSG00000288658,,Hs.146225.0.A1_3p_at,,4.611951,1.1,,1.125041,0.3


In [26]:
# Load the E-GEOD-7621 analytics table and preview its first rows.
df3 = data_reader('E-GEOD-7621')
df3.head()

Unnamed: 0,Gene ID,Gene Name,Design Element,g1_g2.p-value,g1_g2.t-statistic,g1_g2.log2foldchange
0,ENSG00000000003,TSPAN6,209108_at,0.834705,-0.444655,-0.1
1,ENSG00000000005,TNMD,220065_at,,-4.040212,-0.3
2,ENSG00000000419,DPM1,202673_at,0.906982,-0.270559,0.0
3,ENSG00000000457,SCYL3,205607_s_at,0.422994,-1.460597,-0.2
4,ENSG00000000460,C1orf112,220840_s_at,,-0.67755,0.0


In [27]:
# Load the E-GEOD-20168 analytics table and display it.
df4 = data_reader('E-GEOD-20168')
df4

Unnamed: 0,Gene ID,Gene Name,Design Element,g2_g1.p-value,g2_g1.t-statistic,g2_g1.log2foldchange
0,ENSG00000000003,TSPAN6,209108_at,0.623090,0.785727,0.2
1,ENSG00000000005,TNMD,220065_at,,2.179824,0.2
2,ENSG00000000419,DPM1,202673_at,0.033769,-3.595145,-1.0
3,ENSG00000000457,SCYL3,205607_s_at,,-0.899866,-0.1
4,ENSG00000000460,C1orf112,220840_s_at,,2.572624,0.1
...,...,...,...,...,...,...
12051,ENSG00000287080,H3C3,208577_at,,1.609577,0.1
12052,ENSG00000288569,LINC01949,217342_x_at,,2.409303,0.2
12053,ENSG00000288596,C8orf44,220216_at,,0.950915,0.1
12054,ENSG00000288642,CDR1,207276_at,0.132890,-2.375270,-0.3


In [28]:
# Load the E-GEOD-20333 analytics table and display it.
df5 = data_reader('E-GEOD-20333')
df5

Unnamed: 0,Gene ID,Gene Name,Design Element,g2_g1.p-value,g2_g1.t-statistic,g2_g1.log2foldchange
0,ENSG00000000003,TSPAN6,209108_at,,-2.336661,-0.5
1,ENSG00000000005,TNMD,220065_at,,1.738032,0.2
2,ENSG00000000419,DPM1,202673_at,0.129577,-3.063216,-0.9
3,ENSG00000000971,CFH,213800_at,0.772056,-0.368670,-0.1
4,ENSG00000001084,GCLC,202922_at,,-1.117266,-0.2
...,...,...,...,...,...,...
7601,ENSG00000281708,ERC2-IT1,208247_at,,2.783343,0.4
7602,ENSG00000282608,ADORA3,206171_at,,1.918752,0.3
7603,ENSG00000286522,H3C2,208576_s_at,,-0.680612,-0.1
7604,ENSG00000287080,H3C3,208577_at,,1.063646,0.1


In [6]:

class Group1:
    """Up/down-regulated genes of one contrast in one supported experiment.

    Parameters
    ----------
    experiment_id : str
        Must be one of the keys of ``parkinson_exp_dict``.
    group_id : str
        Contrast id used as the column prefix in the analytics table,
        e.g. ``"g2_g1"`` for the column ``"g2_g1.p-value"``.
    threshold_p_value : float
        A gene counts as significant when its p-value is strictly below this.
    threshold_log2fold_change : float
        Positive cut-off; mirrored around zero for down-regulation.

    Raises
    ------
    ValueError
        If ``experiment_id`` is not a supported experiment.
    """
    
    def __init__(self,experiment_id:str,
                 group_id:str,
                 threshold_p_value : float = 0.05,
                 threshold_log2fold_change:float=1):
        
        self.experiment_id = experiment_id
        self.group_id = group_id
        self.threshold_p_value = threshold_p_value
        self.threshold_log2fold_change = threshold_log2fold_change
        # Analytics-table URL of each supported experiment.
        self.parkinson_exp_dict =  {'E-GEOD-7307':'http://ftp.ebi.ac.uk/pub/databases/microarray/data/atlas/experiments/E-GEOD-7307/E-GEOD-7307_A-AFFY-44-analytics.tsv',
                                                    'E-MEXP-1416':'http://ftp.ebi.ac.uk/pub/databases/microarray/data/atlas/experiments/E-MEXP-1416/E-MEXP-1416_A-AFFY-54-analytics.tsv',
                                                     'E-GEOD-7621':'http://ftp.ebi.ac.uk/pub/databases/microarray/data/atlas/experiments/E-GEOD-7621/E-GEOD-7621_A-AFFY-44-analytics.tsv',
                                                     'E-GEOD-20168':'http://ftp.ebi.ac.uk/pub/databases/microarray/data/atlas/experiments/E-GEOD-20168/E-GEOD-20168_A-AFFY-33-analytics.tsv',
                                                     'E-GEOD-20333':'http://ftp.ebi.ac.uk/pub/databases/microarray/data/atlas/experiments/E-GEOD-20333/E-GEOD-20333_A-AFFY-41-analytics.tsv'}
        self.df = self.data_reader()
        
    def data_reader(self):
        """Load the analytics table of ``self.experiment_id``.

        Returns
        -------
        pandas.DataFrame

        Raises
        ------
        ValueError
            If the experiment id is not a key of ``self.parkinson_exp_dict``.
        """
        if self.experiment_id not in self.parkinson_exp_dict:
            raise ValueError ("Experiment ID does not belong to Parkinson's disease")
        return pd.read_csv(self.parkinson_exp_dict[self.experiment_id], sep='\t')

    def _find_column(self, *candidates):
        """Return the first of ``candidates`` that is a column of ``self.df``."""
        for column in candidates:
            if column in self.df.columns:
                return column
        raise KeyError(f"None of the columns {candidates} found in the analytics table")
            
    def get_up_and_down_regulated_hgnc_symbols(self):
        """Get the up- and down-regulated gene names of the selected contrast.

        Columns are looked up under their raw Atlas names
        (``"{group_id}.p-value"``, ``"{group_id}.log2foldchange"``,
        ``"Gene Name"``) first, then under the short names ``p_value``,
        ``log2foldchange`` and ``gene_name``. Rows with a missing p-value
        never pass the filter, and rows without a gene name are left out.

        Returns
        -------
        dict
            ``{"up": [...], "down": [...]}`` with gene names in table order.

        Raises
        ------
        AssertionError
            If ``threshold_log2fold_change`` is not positive.
        KeyError
            If a required column is missing from the table.
        """
        fc_threshold = self.threshold_log2fold_change
        assert fc_threshold > 0, "threshold_log2fold_change must be positive"

        df = self.df
        p_column = self._find_column(f"{self.group_id}.p-value", "p_value")
        fc_column = self._find_column(f"{self.group_id}.log2foldchange", "log2foldchange")
        name_column = self._find_column("Gene Name", "gene_name")

        significant = df[p_column] < self.threshold_p_value
        upreg_genes = df.loc[significant & (df[fc_column] > fc_threshold), name_column]
        downreg_genes = df.loc[significant & (df[fc_column] < -fc_threshold), name_column]

        regulated_genes = {"up": upreg_genes.dropna().tolist(),
                           "down": downreg_genes.dropna().tolist()}
                    
        return regulated_genes