In [8]:
import os
import pandas as pd
from typing import Dict,List
from sqlalchemy import create_engine

In [None]:
# Expression Atlas experiment accession -> URL of that experiment's
# "analytics" TSV on the EBI FTP mirror. The class defined later in this
# notebook, as visible here, reads local copies from ./data/ and does not
# reference this mapping.
parkinson_exp_dict = dict((
    ('E-GEOD-7307',
     'http://ftp.ebi.ac.uk/pub/databases/microarray/data/atlas/experiments/E-GEOD-7307/E-GEOD-7307_A-AFFY-44-analytics.tsv'),
    ('E-MEXP-1416',
     'http://ftp.ebi.ac.uk/pub/databases/microarray/data/atlas/experiments/E-MEXP-1416/E-MEXP-1416_A-AFFY-54-analytics.tsv'),
    ('E-GEOD-7621',
     'http://ftp.ebi.ac.uk/pub/databases/microarray/data/atlas/experiments/E-GEOD-7621/E-GEOD-7621_A-AFFY-44-analytics.tsv'),
    ('E-GEOD-20168',
     'http://ftp.ebi.ac.uk/pub/databases/microarray/data/atlas/experiments/E-GEOD-20168/E-GEOD-20168_A-AFFY-33-analytics.tsv'),
    ('E-GEOD-20333',
     'http://ftp.ebi.ac.uk/pub/databases/microarray/data/atlas/experiments/E-GEOD-20333/E-GEOD-20333_A-AFFY-41-analytics.tsv'),
))

In [13]:
class Group1:
    '''Class for extracting upregulated and downregulated genes from Expression Atlas.

    Attributes
    ----------
    experiment_id, group_id
        Experiment accession and comparison group selected at construction
        time; read by ``check_id`` and used as fall-backs by
        ``get_up_and_down_regulated_hgnc_symbols``.
    exp_dfs : Dict[str, pd.DataFrame]
        Long-format frames per experiment, as returned by ``data_reader``.
    df, sorted_df
        NOT assigned anywhere in this class. ``filter_dataframe`` reads
        ``self.df`` (a wide analytics frame with ``<group>.p-value`` /
        ``<group>.log2foldchange`` columns) and ``create_database`` writes
        ``self.sorted_df``; the caller has to provide them.
    engine
        SQLAlchemy engine, set by ``create_database``.
    '''

    # Local analytics file name for every supported experiment.
    EXPERIMENT_FILES = {
        'E-GEOD-7307': 'E-GEOD-7307_A-AFFY-44-analytics.tsv',
        'E-MEXP-1416': 'E-MEXP-1416_A-AFFY-54-analytics.tsv',
        'E-GEOD-7621': 'E-GEOD-7621_A-AFFY-44-analytics.tsv',
        'E-GEOD-20168': 'E-GEOD-20168_A-AFFY-33-analytics.tsv',
        'E-GEOD-20333': 'E-GEOD-20333_A-AFFY-41-analytics.tsv',
    }

    # Comparison-group column prefixes that are extracted per experiment.
    # Single source of truth (previously duplicated in two methods).
    EXPERIMENT_GROUPS = {
        'E-MEXP-1416': ['g2_g1', 'g4_g3'],
        'E-GEOD-20333': ['g2_g1'],
        'E-GEOD-7307': ['g83_g17', 'g82_g16', 'g72_g15', 'g63_g14', 'g48_g13'],
        'E-GEOD-7621': ['g1_g2'],
        'E-GEOD-20168': ['g2_g1'],
    }

    # Fallback connection URL (unchanged from the original hard-coded value).
    # Set the environment variable below to avoid keeping real credentials
    # in the notebook.
    DEFAULT_CONNECTION = 'mysql+pymysql://ea_user:ea_password@localhost/expression_atlas'
    CONNECTION_ENV_VAR = 'EXPRESSION_ATLAS_DB_URL'

    def __init__(self, experiment_id=None, group_id=None, data_dir='./data/'):
        '''Store the selected ids and load the local analytics files.

        Parameters
        ----------
        experiment_id : str, optional
            Experiment accession, e.g. ``'E-GEOD-7621'``.
        group_id : str, optional
            Comparison group of that experiment, e.g. ``'g1_g2'``.
        data_dir : str
            Directory containing the analytics TSV files.
        '''
        self.experiment_id = experiment_id
        self.group_id = group_id
        self.exp_dfs = self.data_reader(data_dir)

    @staticmethod
    def data_reader(data_dir: str = './data/') -> Dict[str, pd.DataFrame]:
        '''Reads the local analytics tsv files and reshapes them to long format.

        Nothing is downloaded here: every file listed in ``EXPERIMENT_FILES``
        must already exist inside ``data_dir``.

        Parameters
        ----------
        data_dir : str
            Directory containing the analytics TSV files.

        Returns
        -------
        Dict[str, pd.DataFrame]
            One dataframe per experiment id with the columns ``Gene Name``,
            ``p-value``, ``log2foldchange``, ``group`` and ``experiment``;
            the rows of all comparison groups of the experiment are stacked
            in the order given by ``EXPERIMENT_GROUPS``.
        '''
        exp_dfs = {}
        for exp_id, groups in Group1.EXPERIMENT_GROUPS.items():
            raw = pd.read_csv(os.path.join(data_dir, Group1.EXPERIMENT_FILES[exp_id]), sep='\t')
            per_group = []
            for group in groups:
                p_col = f'{group}.p-value'
                lfc_col = f'{group}.log2foldchange'
                per_group.append(
                    raw[['Gene Name', p_col, lfc_col]]
                    .rename(columns={p_col: 'p-value', lfc_col: 'log2foldchange'})
                    .assign(group=group, experiment=exp_id)
                )
            # Single concat per experiment; original row labels are kept.
            exp_dfs[exp_id] = pd.concat(per_group)
        return exp_dfs

    def check_id(self) -> bool:
        '''Checks whether experiment_id and group_id are relevant for the database

        Returns
        -------
        bool
            True when ``self.group_id`` is one of the known comparison groups
            of ``self.experiment_id``, otherwise False.

        Raises
        ------
        ValueError
            If ``self.experiment_id`` is not a known experiment.
        '''
        groups = self.EXPERIMENT_GROUPS.get(self.experiment_id)
        if groups is None:
            raise ValueError("Incorrect experiment ID for Parkinson's disease")
        return self.group_id in groups

    def filter_dataframe(self) -> pd.DataFrame:
        """Reshape ``self.df`` into one long table covering every comparison group.

        ``self.df`` must be assigned by the caller beforehand. Every column
        ending in ``.p-value`` defines a comparison group; for each of them
        the gene name, p-value and log2 fold change are extracted and the
        pieces are stacked. (Previously each group overwrote the former one,
        so only the last group was returned, and a frame without any
        ``p-value`` column raised ``UnboundLocalError``.)

        Returns
        -------
        pd.DataFrame
            Columns ``hgnc_symbol``, ``p_value``, ``log2foldchange`` and
            ``group_comparison``; the index is named ``id`` and counts from 1.
            Empty (with the same columns) when no comparison group is found.
        """
        out_columns = ['hgnc_symbol', 'p_value', 'log2foldchange', 'group_comparison']
        suffix = '.p-value'
        comparison_groups = [col[:-len(suffix)] for col in self.df.columns
                             if isinstance(col, str) and col.endswith(suffix)]
        if not comparison_groups:
            return pd.DataFrame(columns=out_columns).rename_axis('id')

        parts = []
        for comparison_group in comparison_groups:
            part = self.df[['Gene Name', f'{comparison_group}.p-value',
                            f'{comparison_group}.log2foldchange']].copy()
            part.columns = out_columns[:3]
            part['group_comparison'] = comparison_group
            parts.append(part)

        # Renumber so the 1-based id stays unique across the stacked groups.
        stacked = pd.concat(parts, ignore_index=True)
        stacked.index = stacked.index + 1
        stacked.index.name = 'id'
        return stacked

    def create_database(self):
        '''Writes the filtered dataframe to the table ``atlas`` (replacing it).

        The connection URL is taken from the environment variable named by
        ``CONNECTION_ENV_VAR`` and falls back to ``DEFAULT_CONNECTION``. The
        engine is stored in ``self.engine``. If ``self.sorted_df`` has not
        been assigned, it is computed with ``filter_dataframe``.
        '''
        con_str = os.environ.get(self.CONNECTION_ENV_VAR, self.DEFAULT_CONNECTION)
        self.engine = create_engine(con_str)
        sorted_df = getattr(self, 'sorted_df', None)
        if sorted_df is None:
            sorted_df = self.sorted_df = self.filter_dataframe()
        sorted_df.to_sql('atlas', self.engine, if_exists='replace')

    def get_up_and_down_regulated_hgnc_symbols(self,
                 experiment_id=None,
                 group_id=None,
                 threshold_p_value: float = 0.05,
                 threshold_log2fold_change: float = 1,
                 con=None) -> Dict[str, List[str]]:
        '''Queries the database according to the input values

        This used to be a ``@staticmethod`` that nevertheless declared
        ``self``, which shifted every argument by one when called on an
        instance. It is now a regular instance method.

        Parameters
        ----------
        experiment_id : str, optional
            Experiment accession; defaults to ``self.experiment_id``. The
            ``atlas`` table queried here has no experiment column, so the
            value is only validated.
            NOTE(review): assumes the table holds a single experiment - confirm.
        group_id : str, optional
            Comparison group; defaults to ``self.group_id``. Rows are
            restricted to ``group_comparison = group_id``, i.e. the table is
            expected to have the layout produced by ``filter_dataframe``.
        threshold_p_value : float
            Only rows with ``p_value`` below this value are returned.
        threshold_log2fold_change : float
            Up: ``log2foldchange`` above this value; down: below its negative.
        con : optional
            Connection accepted by ``pd.read_sql``. Defaults to the URL from
            ``CONNECTION_ENV_VAR`` / ``DEFAULT_CONNECTION``.

        Returns
        -------
        Dict[str, List[str]]
            ``{'up': [...], 'down': [...]}`` with the matching hgnc symbols.

        Raises
        ------
        ValueError
            If the experiment id is unknown or the group id does not belong
            to that experiment.
        '''
        if experiment_id is None:
            experiment_id = self.experiment_id
        if group_id is None:
            group_id = self.group_id

        known_groups = Group1.EXPERIMENT_GROUPS.get(experiment_id)
        if known_groups is None:
            raise ValueError("Incorrect experiment ID for Parkinson's disease")
        if group_id not in known_groups:
            raise ValueError(
                f"Incorrect group ID {group_id!r} for experiment {experiment_id}")

        # The statement is assembled by string formatting, so only values
        # that cannot carry SQL are interpolated: a group id from the fixed
        # table above and thresholds coerced to float.
        p_max = float(threshold_p_value)
        lfc_min = float(threshold_log2fold_change)

        if con is None:
            con = os.environ.get(Group1.CONNECTION_ENV_VAR, Group1.DEFAULT_CONNECTION)

        sql_base = (f"SELECT hgnc_symbol FROM atlas "
                    f"WHERE group_comparison = '{group_id}' AND p_value < {p_max}")
        sql_up = f"{sql_base} AND log2foldchange > {lfc_min}"
        sql_down = f"{sql_base} AND log2foldchange < {-lfc_min}"

        genes_up = pd.read_sql(sql_up, con)['hgnc_symbol'].tolist()
        genes_down = pd.read_sql(sql_down, con)['hgnc_symbol'].tolist()

        return {'up': genes_up, 'down': genes_down}

In [69]:
#Group1(experiment_id='E-GEOD-7621',group_id='g1_g2',threshold_p_value=0.05,threshold_log2fold_change=0.8).get_up_and_down_regulated_hgnc_symbols()