In [73]:
import pandas as pd
from typing import Dict,List
from sqlalchemy import create_engine

In [75]:
class Group1:
    """Extract up- and down-regulated genes from Expression Atlas analytics files.

    On construction the instance validates the experiment/group pair,
    downloads the experiment's analytics TSV, keeps the p-value and
    log2 fold change columns of the requested comparison group, and writes
    them to the ``atlas`` table of the ``expression_atlas`` MySQL database.

    Parameters
    ----------
    experiment_id : str
        Expression Atlas accession; must be one of the keys of
        ``parkinson_exp_dict``.
    group_id : str
        Comparison group identifier (e.g. ``'g2_g1'``) of that experiment.
    threshold_p_value : float
        Rows with a p-value below this threshold are considered.
    threshold_log2fold_change : float
        Absolute log2 fold change a gene must exceed to be reported.

    Raises
    ------
    ValueError
        If the experiment ID is unknown or the group ID does not belong to
        the experiment.
    """

    def __init__(self, experiment_id: str,
                 group_id: str,
                 threshold_p_value: float = 0.05,
                 threshold_log2fold_change: float = 1):

        self.experiment_id = experiment_id
        self.group_id = group_id
        if not self.check_id():
            raise ValueError('Group ID does not belong to this experiment')
        self.threshold_p_value = threshold_p_value
        self.threshold_log2fold_change = threshold_log2fold_change
        # Experiment accession -> URL of its analytics TSV.
        self.parkinson_exp_dict = {
            'E-GEOD-7307': 'http://ftp.ebi.ac.uk/pub/databases/microarray/data/atlas/experiments/E-GEOD-7307/E-GEOD-7307_A-AFFY-44-analytics.tsv',
            'E-MEXP-1416': 'http://ftp.ebi.ac.uk/pub/databases/microarray/data/atlas/experiments/E-MEXP-1416/E-MEXP-1416_A-AFFY-54-analytics.tsv',
            'E-GEOD-7621': 'http://ftp.ebi.ac.uk/pub/databases/microarray/data/atlas/experiments/E-GEOD-7621/E-GEOD-7621_A-AFFY-44-analytics.tsv',
            'E-GEOD-20168': 'http://ftp.ebi.ac.uk/pub/databases/microarray/data/atlas/experiments/E-GEOD-20168/E-GEOD-20168_A-AFFY-33-analytics.tsv',
            'E-GEOD-20333': 'http://ftp.ebi.ac.uk/pub/databases/microarray/data/atlas/experiments/E-GEOD-20333/E-GEOD-20333_A-AFFY-41-analytics.tsv',
        }
        self.df = self.data_reader()
        self.sorted_df = self.filter_dataframe()
        self.engine = None
        self.create_database()

    def data_reader(self) -> pd.DataFrame:
        """Download the experiment's analytics TSV and return it as a dataframe.

        Rows without a value in the ``Gene Name`` column are dropped.

        Returns
        -------
        pd.DataFrame
            Dataframe read from the TSV file of ``self.experiment_id``.

        Raises
        ------
        ValueError
            If ``self.experiment_id`` has no entry in ``parkinson_exp_dict``.
        """
        if self.experiment_id not in self.parkinson_exp_dict:
            raise ValueError("Experiment ID does not belong to Parkinson's disease")
        url = self.parkinson_exp_dict[self.experiment_id]
        df = pd.read_csv(url, sep='\t')
        return df.dropna(subset=['Gene Name'])

    def check_id(self) -> bool:
        """Check that ``group_id`` is a known comparison of ``experiment_id``.

        Returns
        -------
        bool
            True if the group belongs to the experiment, False otherwise.

        Raises
        ------
        ValueError
            If ``self.experiment_id`` is not one of the known experiments.
        """
        experiment_groups = {
            'E-MEXP-1416': ['g2_g1', 'g4_g3'],
            'E-GEOD-20333': ['g2_g1'],
            'E-GEOD-7307': ['g83_g17', 'g82_g16', 'g72_g15', 'g63_g14', 'g48_g13'],
            'E-GEOD-7621': ['g1_g2'],
            'E-GEOD-20168': ['g2_g1'],
        }
        if self.experiment_id not in experiment_groups:
            raise ValueError("Incorrect experiment ID for Parkinson's disease")
        return self.group_id in experiment_groups[self.experiment_id]

    def filter_dataframe(self) -> pd.DataFrame:
        """Reduce ``self.df`` to the columns of the requested comparison group.

        Only the columns of ``self.group_id`` are selected; the other
        comparison groups present in the file are ignored.

        Returns
        -------
        pd.DataFrame
            Dataframe with columns ``hgnc_symbol``, ``p_value``,
            ``log2foldchange`` and ``group_comparison``, indexed by ``id``
            (the original row index shifted by one).

        Raises
        ------
        ValueError
            If the dataframe lacks the p-value or log2 fold change column of
            ``self.group_id``.
        """
        p_value_column = f'{self.group_id}.p-value'
        fold_change_column = f'{self.group_id}.log2foldchange'
        wanted = ['Gene Name', p_value_column, fold_change_column]
        missing = [column for column in wanted if column not in self.df.columns]
        if missing:
            raise ValueError(
                f'Columns {missing} not found in the data of experiment '
                f'{self.experiment_id}'
            )

        # Select first, then copy: avoids duplicating the unused columns.
        filtered = self.df[wanted].copy()
        filtered.columns = ['hgnc_symbol', 'p_value', 'log2foldchange']
        # Shift the row index by one so a default 0-based index starts at 1.
        filtered.index = filtered.index + 1
        filtered.index.name = 'id'
        filtered['group_comparison'] = self.group_id
        return filtered

    def create_database(self):
        """Write ``self.sorted_df`` to the ``atlas`` table.

        Creates the SQLAlchemy engine, stores it in ``self.engine`` and
        replaces any existing ``atlas`` table, so the table always holds the
        data of the most recently constructed instance.
        """
        # NOTE(review): credentials are hard-coded in the connection string;
        # consider reading them from configuration instead.
        con_str = 'mysql+pymysql://ea_user:ea_password@localhost/expression_atlas'
        self.engine = create_engine(con_str)
        self.sorted_df.to_sql('atlas', self.engine, if_exists='replace')

    def get_up_and_down_regulated_hgnc_symbols(self) -> Dict[str, List[str]]:
        """Query the ``atlas`` table for up- and down-regulated genes.

        A gene is reported as up-regulated when its p-value is below
        ``threshold_p_value`` and its log2 fold change is above
        ``threshold_log2fold_change``; as down-regulated when the log2 fold
        change is below the negated threshold.

        Returns
        -------
        Dict[str, List[str]]
            ``{'up': [...], 'down': [...]}`` with the matching HGNC symbols.
        """
        # Coerce to float so that only numeric literals are interpolated
        # into the SQL text.
        p_value_limit = float(self.threshold_p_value)
        fold_change_limit = float(self.threshold_log2fold_change)
        sql_up = (
            f"Select hgnc_symbol from atlas where p_value < {p_value_limit} "
            f"and log2foldchange > {fold_change_limit}"
        )
        sql_down = (
            f"Select hgnc_symbol from atlas where p_value < {p_value_limit} "
            f"and log2foldchange < {-fold_change_limit}"
        )
        genes_up = pd.read_sql(sql_up, self.engine)['hgnc_symbol'].tolist()
        genes_down = pd.read_sql(sql_down, self.engine)['hgnc_symbol'].tolist()
        return {'up': genes_up, 'down': genes_down}

In [69]:
#Group1(experiment_id='E-GEOD-7621',group_id='g1_g2',threshold_p_value=0.05,threshold_log2fold_change=0.8).get_up_and_down_regulated_hgnc_symbols()