In [68]:
import os
import pymysql
import pandas as pd
from typing import Dict,List
from sqlalchemy import create_engine
from sqlalchemy.orm import Session

import warnings
warnings.simplefilter(action='ignore')

In [None]:
# Download URL of the Expression Atlas analytics TSV for each Parkinson's
# disease experiment. Every URL follows the same layout, so it is assembled
# from the experiment accession and the array-design accession.
parkinson_exp_dict = {
    accession: (
        'http://ftp.ebi.ac.uk/pub/databases/microarray/data/atlas/experiments/'
        f'{accession}/{accession}_{array_design}-analytics.tsv'
    )
    for accession, array_design in (
        ('E-GEOD-7307', 'A-AFFY-44'),
        ('E-MEXP-1416', 'A-AFFY-54'),
        ('E-GEOD-7621', 'A-AFFY-44'),
        ('E-GEOD-20168', 'A-AFFY-33'),
        ('E-GEOD-20333', 'A-AFFY-41'),
    )
}

In [71]:
class Group1:
    '''Class for extracting upregulated and downregulated genes from Expression Atlas.

    Instantiating the class reads the analytics TSV files of the Parkinson's
    disease experiments from disk (``data_reader``) and writes them, together
    with an experiment/group lookup table, to the ``expression_atlas`` MySQL
    database (``import_data_to_database``). The static method
    ``get_up_and_down_regulated_hgnc_symbols`` then queries that database.

    Attributes
    ----------
    exp_metadata : pd.DataFrame or None
        One row per (experiment_id, group_id) pair; the index (named ``id``)
        is the key stored in the ``experiment_group`` column of the
        per-experiment tables.
    exp_dfs : dict
        Maps experiment ID to a DataFrame with the columns ``gene_name``,
        ``p_value``, ``log2foldchange`` and ``experiment_group``.
    '''

    # Analytics TSV file name of each experiment, looked up inside the data directory.
    ANALYTICS_FILES = {
        'E-GEOD-7307': 'E-GEOD-7307_A-AFFY-44-analytics.tsv',
        'E-MEXP-1416': 'E-MEXP-1416_A-AFFY-54-analytics.tsv',
        'E-GEOD-7621': 'E-GEOD-7621_A-AFFY-44-analytics.tsv',
        'E-GEOD-20168': 'E-GEOD-20168_A-AFFY-33-analytics.tsv',
        'E-GEOD-20333': 'E-GEOD-20333_A-AFFY-41-analytics.tsv',
    }

    # Group IDs that are loaded for each experiment. The insertion order
    # determines the integer IDs assigned in ``exp_metadata``.
    EXPERIMENT_GROUPS = {
        'E-MEXP-1416': ['g2_g1', 'g4_g3'],
        'E-GEOD-20333': ['g2_g1'],
        'E-GEOD-7307': ['g83_g17', 'g82_g16', 'g72_g15', 'g63_g14', 'g48_g13'],
        'E-GEOD-7621': ['g1_g2'],
        'E-GEOD-20168': ['g2_g1'],
    }

    # Fallback connection URL. NOTE(review): this embeds credentials in the
    # source; set the EXPRESSION_ATLAS_DB_URL environment variable to override.
    DEFAULT_CON_STR = 'mysql+pymysql://ea_user:ea_password@localhost/expression_atlas'

    def __init__(self):
        self.exp_metadata = None
        self.exp_dfs = dict()

        # Both steps run eagerly: constructing an instance touches the file
        # system and the database.
        self.data_reader()
        self.import_data_to_database()

    @staticmethod
    def _con_str() -> str:
        '''Returns the database URL, preferring the EXPRESSION_ATLAS_DB_URL environment variable.'''
        return os.environ.get('EXPRESSION_ATLAS_DB_URL', Group1.DEFAULT_CON_STR)

    def data_reader(self, data_dir: str = "./data/") -> None:
        '''Reads the analytics TSV files and fills ``exp_metadata`` and ``exp_dfs``.

        Parameters
        ----------
        data_dir : str
            Directory that contains the analytics TSV files.

        Returns
        -------
        None
            The results are stored on the instance: ``exp_metadata`` holds one
            row per (experiment, group) pair and ``exp_dfs`` holds one long
            DataFrame per experiment.
        '''
        self.exp_metadata = (
            pd.DataFrame(list(self.EXPERIMENT_GROUPS.items()),
                         columns=["experiment_id", "group_id"])
            .explode('group_id')
            .reset_index(drop=True)
        )
        self.exp_metadata.index.name = "id"

        # Start from an empty mapping so that calling this method again does
        # not append a second copy of every row.
        self.exp_dfs = dict()

        for exp_id, groups in self.EXPERIMENT_GROUPS.items():
            raw = pd.read_csv(os.path.join(data_dir, self.ANALYTICS_FILES[exp_id]), sep='\t')

            frames = []
            for group in groups:
                is_group = ((self.exp_metadata.experiment_id == exp_id)
                            & (self.exp_metadata.group_id == group))
                group_pk = self.exp_metadata.index[is_group][0]

                colnames = {'Gene Name': 'gene_name',
                            f'{group}.p-value': 'p_value',
                            f'{group}.log2foldchange': 'log2foldchange'}
                # Select + rename + assign all return new frames, so the raw
                # data is never modified and no chained assignment is needed.
                frames.append(
                    raw[list(colnames)]
                    .rename(columns=colnames)
                    .assign(experiment_group=group_pk)
                )

            # ignore_index gives every row a unique index, which becomes the
            # ``id`` column when the frame is written to the database.
            self.exp_dfs[exp_id] = pd.concat(frames, ignore_index=True)

    def import_data_to_database(self):
        '''Writes the loaded data to the expression_atlas database.

        One table per experiment (named after the experiment ID) plus the
        lookup table ``parkinson_experiment`` are written; existing tables
        with the same names are replaced.
        '''
        engine = create_engine(self._con_str())
        try:
            for exp_id, df in self.exp_dfs.items():
                df.to_sql(exp_id, engine, if_exists='replace', index_label='id')

            self.exp_metadata.to_sql('parkinson_experiment', engine,
                                     if_exists='replace', index_label='id')
        finally:
            # Release pooled connections even if a write fails.
            engine.dispose()

    @staticmethod
    def get_up_and_down_regulated_hgnc_symbols(
                 experiment_id: str,
                 group_id: str,
                 threshold_p_value : float = 0.05,
                 threshold_log2fold_change: float=1) -> Dict[str, List[str]]:
        '''Queries the database according to the input values

        Parameters
        ----------
        experiment_id : str
            Experiment ID; must be a key of ``EXPERIMENT_GROUPS``.
        group_id : str
            Group ID; must be listed for ``experiment_id``.
        threshold_p_value : float
            Only rows with a p-value below this value are returned.
        threshold_log2fold_change : float
            Rows with log2 fold change above this value count as "up", rows
            below its negative count as "down".

        Returns
        -------
        Dict[str, List[str]]
            ``{'up': [...], 'down': [...]}`` with the gene names of the
            requested group. Rows without a gene name are skipped.

        Raises
        ------
        ValueError
            If the experiment ID or the group ID is unknown.
        '''
        # check whether experiment id and group id are correct
        if experiment_id not in Group1.EXPERIMENT_GROUPS:
            raise ValueError ("Incorrect experiment ID for Parkinson's disease")
        elif group_id not in Group1.EXPERIMENT_GROUPS[experiment_id]:
            raise ValueError (f"Incorrect group ID for experiment {experiment_id}")

        # Imported here because the notebook's import cell only brings in
        # create_engine and Session from SQLAlchemy.
        from sqlalchemy import text

        def regulated_genes(connection, comparison: str, log2fc_bound: float) -> List[str]:
            # The table name cannot be a bound parameter; it is safe to
            # interpolate because experiment_id was checked against the
            # whitelist above. Backticks are required for the hyphens.
            sql = text(f"""
                SELECT ex.gene_name
                FROM parkinson_experiment AS pe
                INNER JOIN `{experiment_id}` AS ex
                    ON pe.id = ex.experiment_group
                WHERE pe.experiment_id = :experiment_id
                  AND pe.group_id = :group_id
                  AND ex.gene_name IS NOT NULL
                  AND ex.p_value < :p_value
                  AND ex.log2foldchange {comparison} :log2fc
            """)
            params = {'experiment_id': experiment_id,
                      'group_id': group_id,
                      'p_value': float(threshold_p_value),
                      'log2fc': float(log2fc_bound)}
            return pd.read_sql(sql, connection, params=params)['gene_name'].tolist()

        engine = create_engine(Group1._con_str())
        try:
            with engine.connect() as connection:
                genes_up = regulated_genes(connection, '>', threshold_log2fold_change)
                genes_down = regulated_genes(connection, '<', -threshold_log2fold_change)
        finally:
            engine.dispose()

        return {'up':genes_up,'down':genes_down}

In [None]:
class Group1:
    '''Class for extracting upregulated and downregulated genes from Expression Atlas.

    NOTE(review): this cell defines ``Group1`` a second time. Running it
    rebinds the name, so the earlier definition (with ``__init__``,
    ``data_reader`` and ``import_data_to_database``) is no longer reachable
    through ``Group1``. This version only queries tables that already exist.
    '''

    @staticmethod
    def get_up_and_down_regulated_hgnc_symbols(
                 experiment_id: str,
                 group_id: str,
                 threshold_p_value : float = 0.05,
                 threshold_log2fold_change: float=1) -> Dict[str, List[str]]:
        '''Queries the database according to the input values

        Parameters
        ----------
        experiment_id : str
            Experiment ID; must be one of the Parkinson's disease experiments.
        group_id : str
            Group ID; must be listed for ``experiment_id``.
        threshold_p_value : float
            Only rows with a p-value below this value are returned.
        threshold_log2fold_change : float
            Rows with log2 fold change above this value count as "up", rows
            below its negative count as "down".

        Returns
        -------
        Dict[str, List[str]]
            ``{'up': [...], 'down': [...]}`` with the gene names of the
            requested group. Rows without a gene name are skipped.

        Raises
        ------
        ValueError
            If the experiment ID or the group ID is unknown.
        '''

        # check whether experiment id and group id are correct
        experiment_groups = {'E-MEXP-1416' : ['g2_g1', 'g4_g3'],
             'E-GEOD-20333' : ['g2_g1'],
             'E-GEOD-7307' : ['g83_g17','g82_g16', 'g72_g15', 'g63_g14', 'g48_g13'],
             'E-GEOD-7621' : ['g1_g2'],
             'E-GEOD-20168' : ['g2_g1']}
        if experiment_id not in experiment_groups:
            raise ValueError ("Incorrect experiment ID for Parkinson's disease")
        elif group_id not in experiment_groups[experiment_id]:
            raise ValueError (f"Incorrect group ID for experiment {experiment_id}")

        # Imported here because the notebook's import cell only brings in
        # create_engine and Session from SQLAlchemy.
        from sqlalchemy import text

        # SQL query
        # NOTE(review): the fallback URL embeds credentials in the source; set
        # the EXPRESSION_ATLAS_DB_URL environment variable to override it.
        con_str = os.environ.get(
            'EXPRESSION_ATLAS_DB_URL',
            'mysql+pymysql://ea_user:ea_password@localhost/expression_atlas')

        def build_query(comparison: str):
            # The table name cannot be a bound parameter; it is safe to
            # interpolate because experiment_id was checked against the
            # whitelist above. Backticks are required for the hyphens.
            return text(f"""
                SELECT ex.gene_name
                FROM parkinson_experiment AS pe
                INNER JOIN `{experiment_id}` AS ex
                    ON pe.id = ex.experiment_group
                WHERE pe.experiment_id = :experiment_id
                  AND pe.group_id = :group_id
                  AND ex.gene_name IS NOT NULL
                  AND ex.p_value < :p_value
                  AND ex.log2foldchange {comparison} :log2fc
            """)

        shared_params = {'experiment_id': experiment_id,
                         'group_id': group_id,
                         'p_value': float(threshold_p_value)}
        log2fc = float(threshold_log2fold_change)

        engine = create_engine(con_str)
        try:
            with Session(engine) as session:
                genes_up = session.execute(
                    build_query('>'), {**shared_params, 'log2fc': log2fc}
                ).scalars().all()
                genes_down = session.execute(
                    build_query('<'), {**shared_params, 'log2fc': -log2fc}
                ).scalars().all()
        finally:
            # Release pooled connections even if a query fails.
            engine.dispose()

        return {'up':list(genes_up),'down':list(genes_down)}


In [73]:
# Build an instance first; with the Group1 definition that has __init__,
# this loads the TSV files and writes them to the database.
Group1()

# NOTE: despite its name, `df` receives the dict returned by the query
# ({'up': [...], 'down': [...]}), not a DataFrame.
df = Group1.get_up_and_down_regulated_hgnc_symbols(
    experiment_id='E-GEOD-7621',
    group_id='g1_g2',
    threshold_p_value=0.05,
    threshold_log2fold_change=1,
)

{'E-MEXP-1416':       gene_name  p_value  log2foldchange  experiment_group
0        TSPAN6      NaN             1.0                 0
1          TNMD      NaN            -0.1                 0
2          DPM1      NaN             2.0                 0
3         SCYL3      NaN             1.6                 0
4      C1orf112      NaN            -0.1                 0
...         ...      ...             ...               ...
20978    NPBWR1      NaN            -0.4                 1
20979      CDR1      NaN             0.2                 1
20980    ACTL10      NaN             0.1                 1
20981       NaN      NaN             0.3                 1
20982    PRRC2B      NaN             0.1                 1

[41966 rows x 4 columns], 'E-GEOD-20333':      gene_name   p_value  log2foldchange  experiment_group
0       TSPAN6       NaN            -0.5                 2
1         TNMD       NaN             0.2                 2
2         DPM1  0.129577            -0.9                

OperationalError: (pymysql.err.OperationalError) (1045, "Access denied for user 'ea_user'@'localhost' (using password: YES)")
(Background on this error at: https://sqlalche.me/e/14/e3q8)