In [11]:
import os
import requests
import pymysql
from pathlib import Path
from getpass import getpass

## Change folder name
# All project artefacts live under a hidden folder in the user's home directory.
home_dir = Path.home()
PROJECT_DIR = home_dir.joinpath(".Biodb_expression_atlas")
DATA_DIR = PROJECT_DIR.joinpath("data")
DB_PATH = PROJECT_DIR.joinpath("group1.db")
# NOTE(review): Windows-style relative path that the cells shown here never use;
# kept unchanged in case another cell still relies on it.
Data_folder = '..\\pd_data_files\\'

# create data folder if not exists
# exist_ok=True also creates PROJECT_DIR and avoids the check-then-create race.
os.makedirs(DATA_DIR, exist_ok=True)

# download data files
# Base URL of the Expression Atlas experiment folders; each entry of `experiments`
# is "<experiment accession>/<analytics file name>" relative to it.
atlas_ftp = "http://ftp.ebi.ac.uk/pub/databases/microarray/data/atlas/experiments/"
experiments = [
    "E-GEOD-7307/E-GEOD-7307_A-AFFY-44-analytics.tsv",
    "E-MEXP-1416/E-MEXP-1416_A-AFFY-54-analytics.tsv",
    "E-GEOD-7621/E-GEOD-7621_A-AFFY-44-analytics.tsv",
    "E-GEOD-20168/E-GEOD-20168_A-AFFY-33-analytics.tsv",
    "E-GEOD-20333/E-GEOD-20333_A-AFFY-41-analytics.tsv"]

datafile_paths = [] # store data files path

for exp in experiments:
    filename = exp.split('/')[1]
    path = os.path.join(DATA_DIR, filename)
    datafile_paths.append(path)
    # Files already on disk are treated as a cache and not downloaded again.
    if not os.path.exists(path):
        url = atlas_ftp + exp
        req = requests.get(url, timeout=60)
        # Fail loudly on 4xx/5xx instead of caching an error page as a data file.
        req.raise_for_status()
        # Write to a temporary name first so an interrupted download never leaves
        # a truncated file that later runs would mistake for a complete one.
        partial_path = path + '.part'
        with open(partial_path, 'wb') as data_file:
            data_file.write(req.content)
        os.replace(partial_path, path)


root_password = getpass(prompt='MySQL root password: ')

# create MySQL database and user
# WARNING: this drops any existing `pd_atlas` database before recreating it.
# NOTE(review): the application account uses a hard-coded password that must stay
# in sync with the SQLAlchemy connection strings used by the other cells.
connection_root = pymysql.connect(host='localhost',
                          user='root',
                          password=root_password,
                          charset='utf8mb4')
try:
    # The cursor context manager closes the cursor even if a statement fails.
    with connection_root.cursor() as cursor_root:
        cursor_root.execute("drop database if exists pd_atlas")
        cursor_root.execute("create database if not exists pd_atlas")

        cursor_root.execute("CREATE USER IF NOT EXISTS 'pd_user'@'localhost' IDENTIFIED BY 'pd_password'")
        cursor_root.execute("GRANT ALL ON `pd_atlas`.* TO 'pd_user'@'localhost'")
        cursor_root.execute("flush privileges")
finally:
    # Always release the root connection, including when a statement raised.
    connection_root.close()

MySQL root password: ········


In [12]:
import os
import pandas as pd

from sqlalchemy.orm import Session
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, ForeignKey, Float, create_engine


# Declarative base class that every ORM table class in this notebook inherits from.
Base = declarative_base()

# Connection URL for the `pd_atlas` MySQL database on localhost, using the PyMySQL driver.
# NOTE(review): user name and password are hard-coded in this URL.
con_str ='mysql+pymysql://pd_user:pd_password@localhost/pd_atlas'
# Module-level engine; `create_database()` passes it to `PD_db`.
engine = create_engine(con_str)
# Module-level session bound to `engine`.
session = Session(engine)


class Experiments(Base):
    """ORM mapping of the `parkinson_experiment` table.

    Each row pairs an experiment accession (`experiment_id`) with one
    comparison group (`group_id`); `exp_id` is the integer primary key that
    the per-experiment tables reference.
    """

    __tablename__ = 'parkinson_experiment'

    exp_id = Column(Integer, primary_key=True)
    experiment_id = Column(String(30), nullable=False)
    group_id = Column(String(30), nullable=False)

class E_MEXP_1416(Base):
    """ORM mapping of the `E_MEXP_1416` table (one row per gene and group)."""

    __tablename__ = 'E_MEXP_1416'

    id = Column(Integer, primary_key=True)
    gene_name = Column(String(30), nullable=False)
    p_value = Column(Float)
    log2foldchange = Column(Float, nullable=False)
    # Points at the (experiment, group) row in `parkinson_experiment`.
    experiment_group = Column(
        Integer,
        ForeignKey('parkinson_experiment.exp_id'),
        nullable=False,
    )

class E_GEOD_20333(Base):
    """ORM mapping of the `E_GEOD_20333` table (one row per gene and group)."""

    __tablename__ = 'E_GEOD_20333'

    id = Column(Integer, primary_key=True)
    gene_name = Column(String(30), nullable=False)
    p_value = Column(Float)
    log2foldchange = Column(Float, nullable=False)
    # Points at the (experiment, group) row in `parkinson_experiment`.
    experiment_group = Column(
        Integer,
        ForeignKey('parkinson_experiment.exp_id'),
        nullable=False,
    )

class E_GEOD_7307(Base):
    """ORM mapping of the `E_GEOD_7307` table (one row per gene and group)."""

    __tablename__ = 'E_GEOD_7307'

    id = Column(Integer, primary_key=True)
    gene_name = Column(String(30), nullable=False)
    p_value = Column(Float)
    log2foldchange = Column(Float, nullable=False)
    # Points at the (experiment, group) row in `parkinson_experiment`.
    experiment_group = Column(
        Integer,
        ForeignKey('parkinson_experiment.exp_id'),
        nullable=False,
    )
    
class E_GEOD_7621(Base):
    """ORM mapping of the `E_GEOD_7621` table (one row per gene and group)."""

    __tablename__ = 'E_GEOD_7621'

    id = Column(Integer, primary_key=True)
    gene_name = Column(String(30), nullable=False)
    p_value = Column(Float)
    log2foldchange = Column(Float, nullable=False)
    # Points at the (experiment, group) row in `parkinson_experiment`.
    experiment_group = Column(
        Integer,
        ForeignKey('parkinson_experiment.exp_id'),
        nullable=False,
    )
    
class E_GEOD_20168(Base):
    """ORM mapping of the `E_GEOD_20168` table (one row per gene and group)."""

    __tablename__ = 'E_GEOD_20168'

    id = Column(Integer, primary_key=True)
    gene_name = Column(String(30), nullable=False)
    p_value = Column(Float)
    log2foldchange = Column(Float, nullable=False)
    # Points at the (experiment, group) row in `parkinson_experiment`.
    experiment_group = Column(
        Integer,
        ForeignKey('parkinson_experiment.exp_id'),
        nullable=False,
    )


class PD_db:
    """Create the database schema and import the experiment data into it.

    Parameters
    ----------
    engine
        SQLAlchemy engine connected to the target database.
    Base
        Declarative base whose metadata describes the tables to (re)create.

    Attributes
    ----------
    parkinson_exp : pandas.DataFrame or None
        Experiment/group lookup table, filled by `_experiment_groups`.
    exp_tables : dict or None
        Maps experiment accession to its long-format table, filled by
        `_experiment_tables`.
    """

    def __init__(self, engine, Base):
        self.Base = Base
        self.engine = engine
        self.parkinson_exp = None
        self.exp_tables = None

    def create_database(self):
        """Drop every table known to `Base`, recreate them and load the data."""
        self.Base.metadata.drop_all(self.engine)
        self.Base.metadata.create_all(self.engine)
        self._import_data()

    def _experiment_groups(self):
        """Build the experiment/group lookup table and store it on `self.parkinson_exp`.

        The frame has one row per (experiment, group) pair, columns
        `experiment_id` and `group_id`, and a 1-based index named `exp_id`.
        """
        exp_group = {'E-MEXP-1416' : ['g2_g1', 'g4_g3'],
                     'E-GEOD-20333' : ['g2_g1'],
                     'E-GEOD-7307' : ['g83_g17','g82_g16', 'g72_g15', 'g63_g14', 'g48_g13'],
                     'E-GEOD-7621' : ['g1_g2'],
                     'E-GEOD-20168' : ['g2_g1']}
        # table with all experiments and groups: one record per pair, in dict order
        records = [(experiment, group)
                   for experiment, groups in exp_group.items()
                   for group in groups]
        parkinson_exp = pd.DataFrame(records, columns=['experiment_id', 'group_id'])
        # Assign the index directly: `set_axis(..., inplace=True)` no longer exists
        # in pandas 2.x.
        parkinson_exp.index = pd.RangeIndex(1, len(parkinson_exp) + 1, name='exp_id')
        self.parkinson_exp = parkinson_exp

    def _experiment_tables(self, paths=None):
        """Read the analytics files and build one long table per experiment.

        Each experiment file holds, for the same genes, a p-value and a
        log2 fold change column per comparison group. For every group a small
        table (gene_name, p_value, log2foldchange, experiment_group) is cut out
        and the group tables of one experiment are stacked into a single table.

        Parameters
        ----------
        paths : iterable of path-like, optional
            Tab-separated analytics files to read. Defaults to the module-level
            `datafile_paths` list.

        The result is stored on `self.exp_tables`, keyed by experiment accession.
        """
        if paths is None:
            paths = datafile_paths
        if self.parkinson_exp is None:
            self._experiment_groups()

        # experiment accession -> list of per-group frames (concatenated once below)
        group_frames = {}

        for path in paths:
            # read data files
            data = pd.read_csv(path, sep='\t')
            # rows without a gene name cannot be stored (gene_name is NOT NULL)
            data = data.dropna(subset=['Gene Name'])
            # file names look like "<accession>_<array>-analytics.tsv"
            exp_name = os.path.basename(path).split('_')[0]
            # find the groups that belong to this experiment
            groups = self.parkinson_exp[self.parkinson_exp['experiment_id'] == exp_name]
            for exp_id, group in groups['group_id'].items():
                colnames = {'Gene Name' : 'gene_name',
                            f'{group}.p-value' : 'p_value',
                            f'{group}.log2foldchange' : 'log2foldchange'}
                group_df = (data[list(colnames)]
                            .rename(columns=colnames)
                            .assign(experiment_group=exp_id))
                group_frames.setdefault(exp_name, []).append(group_df)

        self.exp_tables = {exp_name: pd.concat(frames)
                           for exp_name, frames in group_frames.items()}

    def _import_data(self):
        """Build the lookup and experiment tables and append them to the database."""
        # Local import keeps this class usable without touching the import cell.
        from sqlalchemy import inspect as sa_inspect

        self._experiment_groups()
        self._experiment_tables()
        self.parkinson_exp.to_sql('parkinson_experiment', self.engine, if_exists='append', index=True)

        # MySQL table names are case-sensitive on some platforms. Resolve the
        # target against the tables that actually exist so rows land in the
        # ORM-created table instead of a new lower-case twin.
        existing = {table_name.lower(): table_name
                    for table_name in sa_inspect(self.engine).get_table_names()}

        for name, table in self.exp_tables.items():
            # 1-based running id over all groups of the experiment (primary key)
            table.index = pd.RangeIndex(1, len(table) + 1, name='id')
            wanted = name.lower().replace('-', '_')
            table.to_sql(existing.get(wanted, wanted), self.engine, if_exists='append', index=True)


### wrapper function to create the database
def create_database():
    """Build a `PD_db` from the module-level `engine` and `Base` and run its `create_database`."""
    PD_db(engine=engine, Base=Base).create_database()



In [15]:
# Drops and recreates every mapped table, then imports the experiment data.
create_database()

In [50]:
from sqlalchemy import create_engine
from sqlalchemy.orm import Session
from typing import Dict

class Group1:
    '''Class for extracting upregulated and downregulated genes from Expression Atlas.'''

    @staticmethod
    def get_up_and_down_regulated_hgnc_symbols(
                 experiment_id: str,
                 group_id: str,
                 threshold_p_value : float = 0.05,
                 threshold_log2fold_change: float = 1) -> Dict[str, list]:
        '''Query the database for significantly up- and down-regulated genes.

        Parameters
        ----------
        experiment_id : str
            Experiment accession, e.g. 'E-MEXP-1416'.
        group_id : str
            Comparison group of that experiment, e.g. 'g2_g1'.
        threshold_p_value : float
            Only genes with a p-value strictly below this value are returned.
        threshold_log2fold_change : float
            Genes with log2 fold change above this value count as "up", genes
            below its negative count as "down".

        Returns
        -------
        Dict[str, list]
            {'up': [...], 'down': [...]} with the matching gene names.

        Raises
        ------
        ValueError
            If the experiment ID is unknown or the group ID does not belong to
            the experiment.

        Notes
        -----
        The experiment/group lookup uses `Query.one()`, which raises if the
        lookup row is missing (e.g. when the database has not been populated).
        '''

        # check whether experiment id and group id are correct
        experiment_groups = {'E-MEXP-1416' : ['g2_g1', 'g4_g3'],
             'E-GEOD-20333' : ['g2_g1'],
             'E-GEOD-7307' : ['g83_g17','g82_g16', 'g72_g15', 'g63_g14', 'g48_g13'],
             'E-GEOD-7621' : ['g1_g2'],
             'E-GEOD-20168' : ['g2_g1']}
        if experiment_id not in experiment_groups:
            raise ValueError ("Incorrect experiment ID for Parkinson's disease")
        if group_id not in experiment_groups[experiment_id]:
            raise ValueError (f"Incorrect group ID for experiment {experiment_id}")

        # experiment accession -> ORM class of its table
        map_dict = {'E-MEXP-1416' : E_MEXP_1416,
             'E-GEOD-20333' : E_GEOD_20333,
             'E-GEOD-7307' : E_GEOD_7307,
             'E-GEOD-7621' : E_GEOD_7621,
             'E-GEOD-20168' : E_GEOD_20168}
        table = map_dict[experiment_id]

        con_str ='mysql+pymysql://pd_user:pd_password@localhost/pd_atlas'
        engine = create_engine(con_str)
        session = Session(engine)
        try:
            # resolve the (experiment, group) pair to its numeric key
            experiment_group = session.query(Experiments).filter(
                Experiments.experiment_id == experiment_id,
                Experiments.group_id == group_id).one()
            exp_id = experiment_group.exp_id

            # shared part of both queries: right group, significant p-value
            significant = session.query(table.gene_name).filter(
                table.experiment_group == exp_id,
                table.p_value < threshold_p_value)

            genes_up = [row[0] for row in significant.filter(
                table.log2foldchange > threshold_log2fold_change).all()]
            genes_down = [row[0] for row in significant.filter(
                table.log2foldchange < - threshold_log2fold_change).all()]
        finally:
            # A new engine is created on every call, so release its connections
            # here; otherwise each call leaks a connection pool.
            session.close()
            engine.dispose()

        return {'up':genes_up,'down':genes_down}