# Day5 exercise 2

- 18.03.2022
- Kriti Amin

# SQLAlchemy

SQLAlchemy is the Python SQL toolkit and Object Relational Mapper

* [website](https://www.sqlalchemy.org/)
* [documentation](https://docs.sqlalchemy.org/en/14/)
* [Introduction as Jupyter Notebook](sqlalchemy_into.ipynb)

If needed execute following SQLs in MySQL CLI

```sql
DROP DATABASE IF EXISTS biodb;
CREATE DATABASE biodb;
SHOW DATABASES like 'biodb';
CREATE USER IF NOT EXISTS 'biodb_user'@'localhost' IDENTIFIED BY 'biodb_password';
SELECT User FROM mysql.user WHERE User LIKE 'biodb_user';
GRANT ALL ON `biodb`.* TO 'biodb_user'@'localhost';
FLUSH PRIVILEGES;
```

In [1]:
from sqlalchemy import create_engine
from sqlalchemy.orm import Session
engine = create_engine('mysql+pymysql://biodb_user:biodb_password@localhost/biodb')
session = Session(engine)

## Import SQL dump

If link to dump file is not provided until now by Christian, please complain!

1. Download zip file
2. Unzip file

execute on CMD/terminal:
```bash
mysql -u biodb_user -pbiodb_password biodb < C:\Users\kriti\Dropbox\PC\Desktop\BioDB\biodb.sql
```

"C:\Users\kriti\Dropbox\PC\Desktop\BioDB\biodb.sql"

Should be finished < 1 min.

## Model definition

In [3]:
"""UniProt RDBMS model definition."""
from sqlalchemy.orm import relationship
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, Table, Text, ForeignKey

from collections import defaultdict

Base = declarative_base()

uniprot__uniprot_keyword = Table('uniprot__uniprot_keyword', Base.metadata,
                                 Column('uniprot_id', Integer, ForeignKey('uniprot.id')),
                                 Column('uniprot_keyword_id', Integer,
                                        ForeignKey('uniprot_keyword.keywordid'))
                                 )

uniprot__uniprot_host = Table('uniprot__uniprot_host', Base.metadata,
                              Column('uniprot_id', Integer, ForeignKey('uniprot.id')),
                              Column('uniprot_organism_id', Integer, ForeignKey('uniprot_organism.taxid'))
                              )

uniprot__uniprot_xref = Table('uniprot__uniprot_xref', Base.metadata,
                              Column('uniprot_id', Integer, ForeignKey('uniprot.id')),
                              Column('uniprot_xref_id', Integer, ForeignKey('uniprot_xref.id'))
                              )

uniprot__uniprot_subcellular_location = Table('uniprot__uniprot_subcellular_location', Base.metadata,
                                              Column('uniprot_id', Integer, ForeignKey('uniprot.id')),
                                              Column('uniprot_subcellular_location_id',
                                                     Integer, ForeignKey('uniprot_subcellular_location.id'))
                                              )


class Uniprot(Base):
    """Class definition for the UniProt table."""

    __tablename__ = 'uniprot'

    id = Column(Integer, primary_key=True)

    accession = Column(String(20), unique=True)
    name = Column(String(100), nullable=False, unique=True)
    recommended_name = Column(String(255), nullable=True)

    taxid = Column(Integer, ForeignKey('uniprot_organism.taxid'), nullable=False, index=True)
    organism = relationship("Organism")

    function_id = Column(Integer, ForeignKey('uniprot_function.id'), nullable=True)
    function = relationship("Function")

    gene_names = relationship("Gene", back_populates="uniprot")

    gene_symbol = relationship("GeneSymbol", uselist=False, back_populates="uniprot")

    keywords = relationship(
        "Keyword",
        secondary=uniprot__uniprot_keyword,
        back_populates="uniprots")

    xrefs = relationship(
        "Xref",
        secondary=uniprot__uniprot_xref,
        back_populates="uniprots"
    )

    subcellular_locations = relationship(
        "SubcellularLocation",
        secondary=uniprot__uniprot_subcellular_location,
        back_populates="uniprots"
    )

    def __repr__(self):
        return self.name

    def as_dict(self):
        """Convert object values to dictionary."""
        xrefs_grouped = defaultdict(list)
        for xref in self.xrefs:
            xrefs_grouped[xref.db].append(xref.identifier)
        xrefs_grouped = {k: sorted(v) for k, v in xrefs_grouped.items()}

        return {
            'name': self.name,
            'accession': self.accession,
            'recommended_name': self.recommended_name,
            'taxid': self.taxid,
            'function_description': self.function.description if self.function else self.function,
            'gene_names': [x.name for x in self.gene_names],
            'gene_symbol': self.gene_symbol.symbol if self.gene_symbol else self.gene_symbol,
            'keywords': [{'keyword': x.keyword_name, 'id': x.keywordid} for x in self.keywords],
            'xrefs': xrefs_grouped,
            'subcellular_locations': [x.name for x in self.subcellular_locations],
            'organism': self.organism.scientific_name
        }


class GeneSymbol(Base):
    """Class definition for the uniprot_gene_symbol table."""
    __tablename__ = 'uniprot_gene_symbol'
    id = Column(Integer, primary_key=True)
    symbol = Column(String(100), nullable=False, index=True)
    uniprot_id = Column(Integer, ForeignKey('uniprot.id'))
    uniprot = relationship("Uniprot", back_populates="gene_symbol")

    def __repr__(self):
        """Define repr."""
        return self.symbol


class Gene(Base):
    """Class definition for the uniprot_gene table."""
    __tablename__ = 'uniprot_gene'
    id = Column(Integer, primary_key=True)
    name = Column(String(100), nullable=False, index=True)
    uniprot_id = Column(Integer, ForeignKey('uniprot.id'))
    uniprot = relationship("Uniprot", back_populates="gene_names")


class Keyword(Base):
    """Class definition for the uniprot_keyword table."""
    __tablename__ = 'uniprot_keyword'
    keywordid = Column(Integer, primary_key=True)
    keyword_name = Column(String(100), index=True)

    uniprots = relationship(
        "Uniprot",
        secondary=uniprot__uniprot_keyword,
        back_populates="keywords")

    def __repr__(self):
        """Define repr."""
        return f"{self.keyword_name}[{self.keywordid}]"


class Organism(Base):
    """Class definition for the uniprot_organism table."""
    __tablename__ = 'uniprot_organism'
    taxid = Column(Integer, primary_key=True)
    scientific_name = Column(String(255))


class SubcellularLocation(Base):
    """Class definition for the uniprot_subcellular_location table."""
    __tablename__ = 'uniprot_subcellular_location'
    id = Column(Integer, primary_key=True)
    name = Column(String(100), index=True)
    uniprots = relationship(
        "Uniprot",
        secondary=uniprot__uniprot_subcellular_location,
        back_populates="subcellular_locations")


class Xref(Base):
    """Class definition for the uniprot_xref table."""
    __tablename__ = 'uniprot_xref'
    id = Column(Integer, primary_key=True)
    db = Column(String(50), index=True)
    identifier = Column(String(100), index=True)
    uniprots = relationship(
        "Uniprot",
        secondary=uniprot__uniprot_xref,
        back_populates="xrefs")


class Function(Base):
    """Class definition for the uniprot_function table."""
    __tablename__ = 'uniprot_function'
    id = Column(Integer, primary_key=True)
    description = Column(Text)
    uniprots = relationship("Uniprot", back_populates="function")

## Query

In [4]:
session.query(Uniprot).limit(10).all()

[2A5D_HUMAN,
 2A5E_HUMAN,
 2A5G_HUMAN,
 5HT1B_HUMAN,
 5HT1D_HUMAN,
 A16L1_HUMAN,
 A16L2_HUMAN,
 A26L1_HUMAN,
 AA2AR_HUMAN,
 AA2BR_HUMAN]

In [10]:
query = session.query(Uniprot).filter_by(accession="P20138").limit(2)

print the SQL statement

In [11]:
from sqlalchemy.dialects import mysql
print(str(query.statement.compile(dialect=mysql.dialect())))

SELECT uniprot.id, uniprot.accession, uniprot.name, uniprot.recommended_name, uniprot.taxid, uniprot.function_id 
FROM uniprot 
WHERE uniprot.accession = %s 
 LIMIT %s


In [40]:
query.all()

[CD33_HUMAN]

In [41]:
cd33 = query.one()

In [42]:
cd33.as_dict()

{'name': 'CD33_HUMAN',
 'accession': 'P20138',
 'recommended_name': 'Myeloid cell surface antigen CD33',
 'taxid': 9606,
 'function_description': 'Sialic-acid-binding immunoglobulin-like lectin (Siglec) that plays a role in mediating cell-cell interactions and in maintaining immune cells in a resting state (PubMed:10611343, PubMed:15597323, PubMed:11320212). Preferentially recognizes and binds alpha-2,3- and more avidly alpha-2,6-linked sialic acid-bearing glycans (PubMed:7718872). Upon engagement of ligands such as C1q or syalylated glycoproteins, two immunoreceptor tyrosine-based inhibitory motifs (ITIMs) located in CD33 cytoplasmic tail are phosphorylated by Src-like kinases such as LCK (PubMed:28325905, PubMed:10887109). These phosphorylations provide docking sites for the recruitment and activation of protein-tyrosine phosphatases PTPN6/SHP-1 and PTPN11/SHP-2 (PubMed:10556798, PubMed:10206955, PubMed:10887109). In turn, these phosphatases regulate downstream pathways through depho

In [19]:
cd33.function.description

'Sialic-acid-binding immunoglobulin-like lectin (Siglec) that plays a role in mediating cell-cell interactions and in maintaining immune cells in a resting state (PubMed:10611343, PubMed:15597323, PubMed:11320212). Preferentially recognizes and binds alpha-2,3- and more avidly alpha-2,6-linked sialic acid-bearing glycans (PubMed:7718872). Upon engagement of ligands such as C1q or syalylated glycoproteins, two immunoreceptor tyrosine-based inhibitory motifs (ITIMs) located in CD33 cytoplasmic tail are phosphorylated by Src-like kinases such as LCK (PubMed:28325905, PubMed:10887109). These phosphorylations provide docking sites for the recruitment and activation of protein-tyrosine phosphatases PTPN6/SHP-1 and PTPN11/SHP-2 (PubMed:10556798, PubMed:10206955, PubMed:10887109). In turn, these phosphatases regulate downstream pathways through dephosphorylation of signaling molecules (PubMed:10206955, PubMed:10887109). One of the repressive effect of CD33 on monocyte activation requires phosp

In [22]:
[x.name for x in cd33.gene_names]

['CD33', 'SIGLEC3']

## Exercises

#### 1. Print the first 3 uniprot entries ordered by uniprot.id and print accession and name

In [19]:
query = session.query(Uniprot.accession, Uniprot.name).order_by(Uniprot.id).limit(3).all()
for i in query:
    print(i)

('Q14738', '2A5D_HUMAN')
('Q16537', '2A5E_HUMAN')
('Q13362', '2A5G_HUMAN')


#### 2. Get entries 100-103 from uniprot

https://docs.sqlalchemy.org/en/14/orm/tutorial.html#querying

In [28]:
session.query(Uniprot)[100:103]

[ARIP4_HUMAN, ARMX3_HUMAN, ARMX4_HUMAN]

#### 3. Get UniProt function description for name='TREM2_HUMAN'

In [59]:
trem2 = session.query(Uniprot).filter_by(name='TREM2_HUMAN').one()

In [60]:
trem2.function.description

'Forms a receptor signaling complex with TYROBP which mediates signaling and cell activation following ligand binding (PubMed:10799849). Acts as a receptor for amyloid-beta protein 42, a cleavage product of the amyloid-beta precursor protein APP, and mediates its uptake and degradation by microglia (PubMed:27477018, PubMed:29518356). Binding to amyloid-beta 42 mediates microglial activation, proliferation, migration, apoptosis and expression of pro-inflammatory cytokines, such as IL6R and CCL3, and the anti-inflammatory cytokine ARG1 (By similarity). Acts as a receptor for lipoprotein particles such as LDL, VLDL, and HDL and for apolipoproteins such as APOA1, APOA2, APOB, APOE, APOE2, APOE3, APOE4, and CLU and enhances their uptake in microglia (PubMed:27477018). Binds phospholipids (preferably anionic lipids) such as phosphatidylserine, phosphatidylethanolamine, phosphatidylglycerol and sphingomyelin (PubMed:29794134). Regulates microglial proliferation by acting as an upstream regula

#### 4. Print accession and gene_symbol for all uniprot entries where ***CD33*** in name

In [71]:
cd33 = session.query(Uniprot).filter(Uniprot.name.like('%CD33%'))
for i in cd33:
    print('Accession : ',i.accession, 'Gene Symbol : ', i.gene_symbol)

Accession :  Q8N5R6 Gene Symbol :  CCDC33
Accession :  P20138 Gene Symbol :  CD33


#### 5. Get all uniprot entries where name in ('CCD33_HUMAN', 'TREM2_HUMAN')

In [74]:
session.query(Uniprot).filter(Uniprot.name.in_(['CCD33_HUMAN', 'TREM2_HUMAN'])).all()

[CCD33_HUMAN, TREM2_HUMAN]

#### 6. Get all uniprot entries where name is *CD33_HUMAN* and accession *P20138*

In [75]:
session.query(Uniprot).filter(Uniprot.name == 'CD33_HUMAN', Uniprot.accession == 'P20138').all()

[CD33_HUMAN]

#### 7. Get all uniprot entries where name is 'CCD33_HUMAN' or 'TREM2_HUMAN'

In [76]:
from sqlalchemy import or_
session.query(Uniprot).filter(or_(Uniprot.name == 'CD33_HUMAN', Uniprot.name == 'TREM2_HUMAN')).all()

[CD33_HUMAN, TREM2_HUMAN]

#### 8. How many functions contains the word Alzheimer

In [77]:
session.query(Function).filter(Function.description.like('%Alzheimer%')).count()

14

#### 9. Get all uniprot entries where keyword name is 'Alzheimer disease'

In [78]:
session.query(Uniprot).join(uniprot__uniprot_keyword).join(Keyword).filter(Keyword.keyword_name=='Alzheimer disease').all()

[ABCA7_HUMAN,
 A4_HUMAN,
 TAU_HUMAN,
 CATD_HUMAN,
 APOE_HUMAN,
 GSK3A_HUMAN,
 GSK3B_HUMAN,
 SYUA_HUMAN,
 DREB_HUMAN,
 NU2M_HUMAN,
 PPP5_HUMAN,
 RGPS2_HUMAN,
 PSN1_HUMAN,
 PSN2_HUMAN,
 ADA10_HUMAN,
 NU1M_HUMAN,
 UNC5C_HUMAN,
 SORL_HUMAN]