In [1]:
import pandas as pd
from Bio import Entrez
from mysql.connector import connection
from sshtunnel import SSHTunnelForwarder

Entrez.email = "l.singh@intbio.org"

import re

from Bio import SeqIO

In [2]:
# Load connection settings from a plain KEY=VALUE text file.
# Blank lines and lines starting with "#" are ignored.
with open("db_curated_server_info.txt", "r") as file:
    lines = file.readlines()

config = {}

for line_number, line in enumerate(lines, start=1):
    line = line.strip()
    if not line or line.startswith("#"):
        continue
    key, separator, value = line.partition("=")
    if not separator:
        # Fail with the offending line instead of an opaque unpacking error.
        raise ValueError(
            f"db_curated_server_info.txt:{line_number}: expected KEY=VALUE, got {line!r}"
        )
    # Strip both sides so "key = value" and "key=value" are equivalent.
    config[key.strip()] = value.strip()

# NOTE(review): "srever_port" and "db_adress" are misspelled, but the variable
# names are used by the tunnel cell and the key names must match the settings
# file — rename them everywhere together, not here alone.
server_name = config.get("server_name")
# Ports are mandatory: index directly so a missing key is reported by name
# rather than as "int() argument must be ... not 'NoneType'".
srever_port = int(config["srever_port"])
ssh_password = config.get("ssh_password")
ssh_username = config.get("ssh_username")
db_adress = config.get("db_adress")
db_port = int(config["db_port"])

In [3]:
# Forward a local port through the SSH host to the database address.
ssh_endpoint = (server_name, srever_port)
db_endpoint = (db_adress, db_port)

tunnel = SSHTunnelForwarder(
    ssh_endpoint,
    ssh_username=ssh_username,
    ssh_password=ssh_password,
    remote_bind_address=db_endpoint,
)
tunnel.start()

# The MySQL connection cell connects to this local port.
print(tunnel.local_bind_port)

37745


In [4]:
# Connect to MySQL through the local end of the SSH tunnel.
# Credentials and database name are taken from the settings dict loaded
# earlier (keys: db_user, db_password, db_name) so they do not have to be
# written into the notebook; the literal fallbacks keep the cell working with
# settings files that do not define those keys.
conn = connection.MySQLConnection(
    user=config.get("db_user", "db_user"),
    password=config.get("db_password", "db_password"),
    host="localhost",
    port=tunnel.local_bind_port,
    database=config.get("db_name", "db_name"),
)
cursor = conn.cursor()

In [5]:
# List the tables of the connected database as a quick connectivity check.
query = "SHOW TABLES;"
cursor.execute(query)
cursor.fetchall()

[('alternative_name',),
 ('histone',),
 ('histone_description',),
 ('histone_has_publication',),
 ('publication',),
 ('sequence',),
 ('sequence_has_publication',)]

In [6]:
# Only the column names are needed here, so request zero rows instead of
# transferring the whole `sequence` table just to throw it away.
query = "SELECT * FROM sequence LIMIT 0"
cursor.execute(query)
cursor.fetchall()  # consume the (empty) result set before the next execute()
", ".join([column[0] for column in cursor.description])

'accession, variant, gi, ncbi_gene_id, hgnc_gene_name, taxonomy_id, organism, phylum, class, taxonomy_group, info, sequence, variant_under_consideration'

In [21]:
def _named_insert(table, columns):
    """Return an INSERT statement for ``table`` with one ``%(column)s`` placeholder per column."""
    column_list = ", ".join(columns)
    placeholders = ", ".join(f"%({column})s" for column in columns)
    return f"INSERT INTO {table} ({column_list}) VALUES ({placeholders})"


# Executed with a dict whose keys are the column names.
add_publication = _named_insert(
    "publication",
    ["id", "title", "doi", "author", "year"],
)

# Executed with a (sequence_accession, publication_id) tuple.
add_sequence_has_publication = (
    "INSERT INTO sequence_has_publication (sequence_accession, publication_id) VALUES (%s, %s)"
)

# Executed with a dict whose keys are the column names.
add_sequence = _named_insert(
    "sequence",
    [
        "accession",
        "variant",
        "gi",
        "ncbi_gene_id",
        "hgnc_gene_name",
        "taxonomy_id",
        "organism",
        "phylum",
        "class",
        "taxonomy_group",
        "info",
        "sequence",
        "variant_under_consideration",
    ],
)

In [8]:
def get_taxonomy_data(record):
    """Collect organism, NCBI taxonomy id, phylum and class for a sequence record.

    Parameters
    ----------
    record
        Object exposing ``annotations["organism"]`` and
        ``features[0].qualifiers["db_xref"]`` (a list of ``"<db>:<id>"``
        strings). ``record.id``, when present, is used only in log messages.

    Returns
    -------
    dict
        ``organism``: taken from the record annotations.
        ``taxonomy_id``: int parsed from the ``taxon:<id>`` cross-reference,
        or ``1`` when none could be read.
        ``phylum`` / ``class``: ``str`` names from the NCBI taxonomy lineage,
        or ``None`` when the lineage lacks that rank or could not be fetched
        after 10 attempts.

    Raises
    ------
    KeyError
        If ``record.annotations`` has no ``"organism"`` entry.
    """
    record_id = getattr(record, "id", None)
    taxonomy_data = {"organism": record.annotations["organism"]}

    try:
        for xref in record.features[0].qualifiers["db_xref"]:
            match = re.search(r"(\S+):(\S+)", xref)
            if match is None or match.group(1) != "taxon":
                continue
            taxonomy_data["taxonomy_id"] = int(match.group(2))
            print("Fetched taxid from NCBI {}".format(record_id))
    except (AttributeError, IndexError, KeyError, TypeError, ValueError):
        # No features, no db_xref qualifier, or an unparsable entry: handled by
        # the fallback just below.
        pass

    if "taxonomy_id" not in taxonomy_data:
        print("!!!!!!Unable to get TAXID for \n {} setting it to 1".format(record_id))
        taxonomy_data["taxonomy_id"] = 1  # unable to identify

    lineage = {}
    max_attempts = 10
    for attempt in range(max_attempts):
        try:
            with Entrez.efetch(
                id=taxonomy_data["taxonomy_id"], db="taxonomy", retmode="xml"
            ) as handle:
                tax_data = Entrez.read(handle)
            # A record without lineage information is a definitive answer, not
            # a transient failure, so it is not retried.
            lineage = {
                d["Rank"]: d["ScientificName"]
                for d in tax_data[0].get("LineageEx", [])
                if d["Rank"] in ["class", "phylum"]
            }
            break
        except Exception as exc:
            # Deliberately broad: any failure of the remote lookup is retried,
            # and the function degrades to None values instead of raising.
            print("Unexpected error: {}, Retrying, attempt {}".format(type(exc), attempt))
            if attempt == max_attempts - 1:
                print(
                    f"FATAL ERROR could not get class and phylum from NCBI after 10 attempts for taxid:{taxonomy_data['taxonomy_id']}. Will add None for class and phylum!"
                )

    # Convert to plain str so the values are ordinary strings rather than
    # whatever string subclass the Entrez parser returned.
    for rank in ("phylum", "class"):
        name = lineage.get(rank)
        taxonomy_data[rank] = None if name is None else str(name)
    return taxonomy_data

# Add cH2B.15_(Homo_sapiens)

https://github.com/intbio/histonedb/blob/master/CURATED_SET/human_histones.csv

> **H2BC12L histone gene.** There is a human-specific duplication of the H2BC12 gene from the chromosome 6 replication-dependent cluster gene on chromosome 21. CAGE tag data [37] supports expression of this gene and as there are no frameshifts or deletions within the open reading frame, it is annotated as coding. Although the gene appears to be expressed there is no direct evidence that a protein is produced. The encoded protein does not represent a new histone variant—it only has one nonsynonymous amino acid difference from the H2B protein encoded by the parent gene H2BC12, and is classified as a “canonical” histone when analyzing the sequence via the Histone DB2.0 database. Therefore, this gene has been named as H2BC12L for “H2B clustered histone 12 like”.

**From article:** https://doi.org/10.1186/s13072-022-00467-2

**Protein accession:** NP_059141.1

In [9]:
# Protein accession of the sequence being added; used for the NCBI fetch and
# as the `accession` value in the inserts and lookups that follow.
ACCESSION = "NP_059141.1"

## Get sequence NP_059141.1 from NCBI

In [10]:
# Download the protein entry in GenBank text format and parse the single
# record it contains; the handle is closed as soon as parsing is done.
handle = Entrez.efetch(db="protein", id=ACCESSION, rettype="gb", retmode="text")
with handle:
    record = SeqIO.read(handle, "genbank")

print(record)

ID: NP_059141.1
Name: NP_059141
Description: histone H2B type F-S [Homo sapiens]
Number of features: 8
/topology=linear
/data_file_division=PRI
/date=25-OCT-2021
/accessions=['NP_059141']
/sequence_version=1
/db_source=REFSEQ: accession NM_017445.3
/keywords=['RefSeq', 'MANE Select']
/source=Homo sapiens (human)
/organism=Homo sapiens
/taxonomy=['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Primates', 'Haplorrhini', 'Catarrhini', 'Hominidae', 'Homo']
/references=[Reference(title='Histone H2B monoubiquitination functions cooperatively with FACT to regulate elongation by RNA polymerase II', ...), Reference(title='Characterization of histone H2A and H2B variants and their post-translational modifications by mass spectrometry', ...), Reference(title='Monoubiquitination of human histone H2B: the factors involved and their roles in HOX gene regulation', ...), Reference(title='Apoptotic phosphorylation of histone H2B

In [11]:
# Show the amino-acid sequence as a plain string (the form stored in the database row).
str(record.seq)

'MPEPAKSAPAPKKGSKKAVTKAQKKDGRKRKRSRKESYSVYVYKVLKQVHPDTGISSKAMGIMNSFVNDIFERIAGEASRLPHYNKRSTITSREIQTAVRLLLPGELAKHAVSEGTKAVTKYTSAK'

In [12]:
# Organism name as annotated on the fetched record.
record.annotations["organism"]

'Homo sapiens'

In [13]:
# Resolve organism, taxonomy id, phylum and class for the fetched record.
taxonomy_data = get_taxonomy_data(record)
taxonomy_data

Fetched taxid from NCBI <built-in function id>


{'organism': 'Homo sapiens',
 'taxonomy_id': 9606,
 'phylum': 'Chordata',
 'class': 'Mammalia'}

## Adding

In [14]:
# Values for the new `sequence` row, keyed by column name. The taxonomy-related
# entries are placeholders here; whatever get_taxonomy_data() returned
# overrides them in the merge below.
sequence_row_base = {
    "accession": ACCESSION,
    "variant": "cH2B.15_(Homo_sapiens)",
    "gi": None,
    "ncbi_gene_id": 54145,
    "hgnc_gene_name": "H2BC12L",
    "taxonomy_id": 9606,
    "organism": None,
    "phylum": None,
    "class": None,
    "taxonomy_group": None,
    "info": None,
    "sequence": str(record.seq),
    "variant_under_consideration": None,
}

data_sequence = {**sequence_row_base, **taxonomy_data}
data_sequence

{'accession': 'NP_059141.1',
 'variant': 'cH2B.15_(Homo_sapiens)',
 'gi': None,
 'ncbi_gene_id': 54145,
 'hgnc_gene_name': 'H2BC12L',
 'taxonomy_id': 9606,
 'organism': 'Homo sapiens',
 'phylum': 'Chordata',
 'class': 'Mammalia',
 'taxonomy_group': None,
 'info': None,
 'sequence': 'MPEPAKSAPAPKKGSKKAVTKAQKKDGRKRKRSRKESYSVYVYKVLKQVHPDTGISSKAMGIMNSFVNDIFERIAGEASRLPHYNKRSTITSREIQTAVRLLLPGELAKHAVSEGTKAVTKYTSAK',
 'variant_under_consideration': None}

In [15]:
# Print the Python type of every value before it is handed to the DB driver.
for column in data_sequence:
    print(column, type(data_sequence[column]))

accession <class 'str'>
variant <class 'str'>
gi <class 'NoneType'>
ncbi_gene_id <class 'int'>
hgnc_gene_name <class 'str'>
taxonomy_id <class 'int'>
organism <class 'str'>
phylum <class 'str'>
class <class 'str'>
taxonomy_group <class 'NoneType'>
info <class 'NoneType'>
sequence <class 'str'>
variant_under_consideration <class 'NoneType'>


In [16]:
# Insert the new row; the dict keys fill the named placeholders of add_sequence.
cursor.execute(add_sequence, data_sequence)

In [17]:
# Show the inserted sequence row together with any linked publication.
# The accession is passed as a bound parameter instead of being formatted into
# the SQL text, matching how the INSERT statements are executed.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,NP_059141.1,cH2B.15_(Homo_sapiens),,54145,H2BC12L,9606,Homo sapiens,Chordata,Mammalia,,,MPEPAKSAPAPKKGSKKAVTKAQKKDGRKRKRSRKESYSVYVYKVL...,,,


## Add publication

In [20]:
# Check whether the publication is already registered before inserting it.
# The id is passed as a bound parameter instead of being formatted into the SQL.
pid = "36180920"
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year


In [22]:
# Register the publication by id only; every descriptive column is passed as None.
pub_data = {"id": pid}
pub_data.update(dict.fromkeys(("title", "doi", "author", "year")))

cursor.execute(add_publication, pub_data)

In [23]:
# Confirm the publication row now exists.
# The id is passed as a bound parameter instead of being formatted into the SQL.
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,36180920,,,,


In [24]:
# Link the sequence to the publication (positional placeholders: accession, publication id).
cursor.execute(add_sequence_has_publication, (ACCESSION, pid))

In [25]:
# Re-run the joined lookup: the publication columns should now be filled in.
# The accession is passed as a bound parameter instead of being formatted into
# the SQL text.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,NP_059141.1,cH2B.15_(Homo_sapiens),,54145,H2BC12L,9606,Homo sapiens,Chordata,Mammalia,,,MPEPAKSAPAPKKGSKKAVTKAQKKDGRKRKRSRKESYSVYVYKVL...,,NP_059141.1,36180920


In [26]:
# List every sequence stored under this variant name, with linked publications.
# The variant is passed as a bound parameter; the original used an f-string
# that had nothing to interpolate.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.variant = %s"
)
cursor.execute(query, ("cH2B.15_(Homo_sapiens)",))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,NP_059141.1,cH2B.15_(Homo_sapiens),,54145,H2BC12L,9606,Homo sapiens,Chordata,Mammalia,,,MPEPAKSAPAPKKGSKKAVTKAQKKDGRKRKRSRKESYSVYVYKVL...,,NP_059141.1,36180920


In [27]:
# Make sure data is committed to the database.
# No earlier cell calls commit(), so this is the only point where the inserts
# above are made permanent (assumes the connection is not in autocommit mode —
# TODO confirm).
conn.commit()

# Close connections

In [28]:
# Release resources innermost-first. The nested finally blocks guarantee the
# SSH tunnel is stopped even if closing the cursor or the connection raises.
try:
    cursor.close()
finally:
    try:
        conn.close()
    finally:
        tunnel.stop()