In [1]:
import pandas as pd
from Bio import Entrez, SeqIO
from mysql.connector import connection
from sshtunnel import SSHTunnelForwarder

# Contact e-mail that Biopython attaches to Entrez (NCBI E-utilities) requests.
Entrez.email = "l.singh@intbio.org"

In [2]:
# Load the SSH/database connection settings from a plain "key=value" text file.
# Blank lines and lines starting with "#" are ignored.
with open("db_curated_server_info.txt", "r") as file:
    lines = file.readlines()

config = {}

for line_number, line in enumerate(lines, start=1):
    line = line.strip()
    if not line or line.startswith("#"):
        continue
    key, separator, value = line.partition("=")
    if not separator:
        # The line content is deliberately not echoed: it may hold a secret.
        raise ValueError(
            f"db_curated_server_info.txt line {line_number}: expected 'key=value'"
        )
    # Strip both sides so that "key = value" and "key=value" are equivalent.
    config[key.strip()] = value.strip()

# The ports are converted with int(), so fail early with a readable message
# instead of an opaque "int() argument must be ... not 'NoneType'".
# NOTE: "srever_port" / "db_adress" are the (misspelled) key names looked up
# in the settings file; they are kept as-is to stay compatible with it.
missing_ports = [name for name in ("srever_port", "db_port") if name not in config]
if missing_ports:
    raise KeyError(
        "Missing required setting(s) in db_curated_server_info.txt: "
        + ", ".join(missing_ports)
    )

server_name = config.get("server_name")
srever_port = int(config["srever_port"])
ssh_password = config.get("ssh_password")
ssh_username = config.get("ssh_username")
db_adress = config.get("db_adress")
db_port = int(config["db_port"])

In [3]:
# Open an SSH tunnel from this machine to the database address/port
# reachable from the SSH server.
ssh_gateway = (server_name, srever_port)
database_endpoint = (db_adress, db_port)

tunnel = SSHTunnelForwarder(
    ssh_gateway,
    ssh_username=ssh_username,
    ssh_password=ssh_password,
    remote_bind_address=database_endpoint,
)
tunnel.start()
# Print the tunnel's local bind port; the MySQL connection is made through it.
print(tunnel.local_bind_port)

43087


In [4]:
# Connect to MySQL through the local end of the SSH tunnel.
# Credentials and database name are taken from the settings file when it
# provides "db_user" / "db_password" / "db_name"; otherwise the previous
# literal values are used, so existing setups keep working unchanged.
conn = connection.MySQLConnection(
    user=config.get("db_user", "db_user"),
    password=config.get("db_password", "db_password"),
    host="localhost",
    port=tunnel.local_bind_port,
    database=config.get("db_name", "db_name"),
)
cursor = conn.cursor()

In [5]:
# List the tables of the connected database as a quick sanity check.
query = "SHOW TABLES;"
cursor.execute(query)
table_names = cursor.fetchall()
table_names

[('alternative_name',),
 ('histone',),
 ('histone_description',),
 ('histone_has_publication',),
 ('publication',),
 ('sequence',),
 ('sequence_has_publication',)]

In [6]:
# INSERT statement templates for the curated database.
# Templates that are currently not used are kept below as comments.


def _named_insert(table, columns):
    """Return an INSERT statement with one ``%(column)s`` placeholder per column."""
    column_list = ", ".join(columns)
    placeholder_list = ", ".join(f"%({column})s" for column in columns)
    return f"INSERT INTO {table} ({column_list}) VALUES ({placeholder_list})"


# add_histone = (
#     "INSERT INTO histone "
#     "(id, level, taxonomic_span, taxonomic_span_id, description, parent) "
#     "VALUES (%(id)s, %(level)s, %(taxonomic_span)s, %(taxonomic_span_id)s, %(description)s, %(parent)s)"
# )
# add_histone_description = (
#     "INSERT INTO histone_description "
#     "(summary, taxonomy, genes, evolution, expression, knock_out, function, sequence, localization, deposition, structure, interactions, disease, caveats) "
#     "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
# )

# Named-parameter template: execute with a dict keyed by column name.
add_publication = _named_insert(
    "publication",
    ["id", "title", "doi", "author", "year"],
)

# Named-parameter template: execute with a dict keyed by column name.
add_sequence = _named_insert(
    "sequence",
    [
        "accession",
        "variant",
        "gi",
        "ncbi_gene_id",
        "hgnc_gene_name",
        "taxonomy_id",
        "organism",
        "phylum",
        "class",
        "taxonomy_group",
        "info",
        "sequence",
        "variant_under_consideration",
    ],
)

# Positional template: execute with a (sequence_accession, publication_id) pair.
add_sequence_has_publication = "INSERT INTO sequence_has_publication (sequence_accession, publication_id) VALUES (%s, %s)"

# add_alternate_names = (
#     "INSERT INTO alternative_name "
#     "(name, taxonomy, gene, splice, histone) "
#     "VALUES (%(name)s, %(taxonomy)s, %(gene)s, %(splice)s, %(histone)s)"
# )
# add_histone_has_publication = (
#     "INSERT INTO histone_has_publication "
#     "(histone_id, publication_id) "
#     "VALUES (%s, %s)"
# )

In [7]:
def get_taxonomy_data(record, max_attempts=10):
    """Collect organism, taxonomy id, phylum and class for a sequence record.

    The taxonomy id is read from the ``taxon:<id>`` entry of
    ``record.features[0].qualifiers["db_xref"]``; if several such entries
    exist the last one wins.  When no usable entry is found the id falls back
    to 1.  Phylum and class are then looked up in the NCBI ``taxonomy``
    database via ``Entrez.efetch`` / ``Entrez.read``, retrying on any error.

    Args:
        record: Object exposing ``annotations["organism"]`` and
            ``features[0].qualifiers["db_xref"]`` (presumably a Biopython
            ``SeqRecord`` -- confirm against the caller).
        max_attempts: Number of times the NCBI lookup is tried before giving
            up (default 10).

    Returns:
        dict with keys ``"organism"``, ``"taxonomy_id"`` (int), ``"phylum"``
        and ``"class"`` (``str`` or ``None`` when the lookup failed or the
        rank is absent from the lineage).

    Raises:
        KeyError: if ``record.annotations`` has no ``"organism"`` entry.
    """
    import re

    # Raw string: "\S" in a plain literal is an invalid escape sequence.
    xref_pattern = re.compile(r"(\S+):(\S+)")

    taxonomy_data = {}
    taxonomy_data["organism"] = record.annotations["organism"]
    try:
        for xref in record.features[0].qualifiers["db_xref"]:
            match = xref_pattern.search(xref)
            # Skip entries that are malformed or belong to another database,
            # so that one odd xref cannot discard a valid taxon id.
            if match is None or match.group(1) != "taxon":
                continue
            taxid = match.group(2)
            print(f"Fetched taxid from NCBI {taxid}")
            taxonomy_data["taxonomy_id"] = int(taxid)
    except Exception:
        # Missing features/qualifiers or a non-numeric id: use the fallback.
        taxonomy_data.pop("taxonomy_id", None)
    if "taxonomy_id" not in taxonomy_data:
        # Also covers records whose db_xref list has no "taxon:" entry at all.
        print("!!!!!!Unable to get TAXID for this record setting it to 1")
        taxonomy_data["taxonomy_id"] = 1  # unable to identify

    lineage = {}
    for attempt in range(max_attempts):
        try:
            handle = Entrez.efetch(
                id=taxonomy_data["taxonomy_id"], db="taxonomy", retmode="xml"
            )
            try:
                tax_data = Entrez.read(handle)
            finally:
                # Release the network handle even when parsing fails.
                handle.close()
            lineage = {
                entry["Rank"]: entry["ScientificName"]
                for entry in tax_data[0]["LineageEx"]
                if entry["Rank"] in ["class", "phylum"]
            }
            break
        except Exception as error:
            # Exception (not a bare except) so Ctrl-C still interrupts the loop.
            print(
                "Unexpected error: {}, Retrying, attempt {}".format(
                    type(error), attempt
                )
            )
            if attempt == max_attempts - 1:
                print(
                    f"FATAL ERROR could not get class and phylum from NCBI after {max_attempts} attempts for taxid:{taxonomy_data['taxonomy_id']}. Will add None for class and phylum!"
                )

    for rank in ("phylum", "class"):
        name = lineage.get(rank, None)
        # Convert to a plain str for storage; keep None when the rank is unknown.
        taxonomy_data[rank] = str(name) if name is not None else None
    return taxonomy_data

# Update name for Bacterial dimers

In [8]:
# Load the full histone table and show the row whose id is "Bacterial dimers".
query = "SELECT * FROM histone"
cursor.execute(query)
histone_rows = cursor.fetchall()
histone_columns = [column[0] for column in cursor.description]
histone_df = pd.DataFrame(histone_rows, columns=histone_columns)
histone_df.loc[histone_df["id"] == "Bacterial dimers"]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
0,Bacterial dimers,variant_group,Bacteria,2,215.0,Singlet


In [9]:
# Rename the histone entry "Bacterial dimers" to "Filament_forming".
# The values are bound as parameters instead of being written into the SQL
# text (the original f-string had no placeholders to interpolate anyway).
query = "UPDATE histone SET id=%s WHERE id=%s"
cursor.execute(query, ("Filament_forming", "Bacterial dimers"))

In [10]:
# Reload the histone table; the old id "Bacterial dimers" should no longer match.
query = "SELECT * FROM histone"
cursor.execute(query)
histone_rows = cursor.fetchall()
histone_columns = [column[0] for column in cursor.description]
histone_df = pd.DataFrame(histone_rows, columns=histone_columns)
histone_df.loc[histone_df["id"] == "Bacterial dimers"]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent


In [11]:
# Show the histone row stored under the new id.
histone_df.loc[histone_df["id"] == "Filament_forming"]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
82,Filament_forming,variant_group,Bacteria,2,215.0,Singlet


In [12]:
# List the sequences whose variant is "Filament_forming".
query = "SELECT * FROM sequence WHERE variant='Filament_forming'"
cursor.execute(query)
sequence_rows = cursor.fetchall()
sequence_columns = [column[0] for column in cursor.description]
pd.DataFrame(sequence_rows, columns=sequence_columns)

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
0,ADI38622.1,Filament_forming,,,,716544,Waddlia chondrophila WSU 86-1044,Chlamydiota,Chlamydiia,,,MNENLVVVSKVKKYIKSKAGMNTSANVMDQLSKIVEKEIEKAVQNA...,
1,AFX99764.1,Filament_forming,,,,1069642,Bdellovibrio bacteriovorus str. Tiberius,Bdellovibrionota,Bdellovibrionia,,,MAEVLVVTSKVKKLIKEKGQMNTSAETIDVLSKAIEQLCLKGVESA...,
2,AGH94288.1,Filament_forming,,,,1184267,Pseudobdellovibrio exovorus JSS,Bdellovibrionota,Bdellovibrionia,,,MSEEVVLVVTSKVKKFIKEKGEMNTSAETIDMLSKAIERLCLKGIE...,
3,ASD65320.1,Filament_forming,,,,959,Bdellovibrio bacteriovorus,Bdellovibrionota,Bdellovibrionia,,,MAEVLVVTSKVKKLIKEKGQMNTSAETIDVLSKAIEQLCLKGVESA...,
4,AZZ35344.1,Filament_forming,,,,1916293,Bdellovibrio sp. qaytius,Bdellovibrionota,Bdellovibrionia,,,MSDDILVVTSKVKKYIKEKGQMNTSAETIDMLTKAVERLCAKGIES...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
206,TVQ38025.1,Filament_forming,,,,1898206,Spirochaetaceae bacterium,Spirochaetota,Spirochaetia,,,MSDKESLVIASKVKSYIKNTGDLKCSAAVMDVLSDKIRAICDEAIR...,
207,TVR55163.1,Filament_forming,,,,1898206,Spirochaetaceae bacterium,Spirochaetota,Spirochaetia,,,MRYIMSQHFFEGVSMGEKEVLVIASKVKSYIKSKGDLKCSAAVADV...,
208,TWU31271.1,Filament_forming,,,,2528033,Candidatus Brocadiaceae bacterium S225,Planctomycetota,Candidatus Brocadiia,,,MSDSNSEKEVLVVTSKLKKYIRESSGMSTSANVAPALSDTIRNLCN...,
209,TXI77600.1,Filament_forming,,,,2291710,Dokdonella sp.,Pseudomonadota,Gammaproteobacteria,,,MAETLVVVSKIKKMVKDKGLRTGGDYIEGLSKKVEDIVNAAVAKVQ...,


In [13]:
# Commit the current transaction so the "Bacterial dimers" -> "Filament_forming"
# rename executed on this connection is written to the database.
conn.commit()

# Update name for Bridge

In [14]:
# Load the full histone table and show the row whose id is "Bridge".
query = "SELECT * FROM histone"
cursor.execute(query)
histone_rows = cursor.fetchall()
histone_columns = [column[0] for column in cursor.description]
histone_df = pd.DataFrame(histone_rows, columns=histone_columns)
histone_df.loc[histone_df["id"] == "Bridge"]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
0,Bridge,variant_group,"Archaea, Bacteria, Viruses","2157, 2, 10239",214.0,Singlet


In [22]:
# Collect the ids of all histone entries whose parent is "Bridge".
query = "SELECT * FROM histone WHERE parent='Bridge'"
cursor.execute(query)
child_rows = cursor.fetchall()
child_columns = [column[0] for column in cursor.description]
children = pd.DataFrame(child_rows, columns=child_columns)["id"].values
children

array(['Bridge_(Methanococcales)', 'Coiled-coil', 'Phage-histones',
       'RdgC-histones'], dtype=object)

In [29]:
# Detach every child from "Bridge" by clearing its parent.
# NOTE(review): presumably required so the parent id can be renamed without
# violating a reference from histone.parent -- confirm against the schema.
# The id is bound as a parameter so ids containing quotes cannot break the SQL;
# str() turns a possible NumPy string scalar into a plain str for the connector.
query = "UPDATE histone SET parent=NULL WHERE id=%s"
for ch in children:
    cursor.execute(query, (str(ch),))

In [30]:
# Verify that no histone entry still has "Bridge" as its parent.
query = "SELECT * FROM histone WHERE parent='Bridge'"
cursor.execute(query)
remaining_rows = cursor.fetchall()
remaining_columns = [column[0] for column in cursor.description]
pd.DataFrame(remaining_rows, columns=remaining_columns)["id"].values

array([], dtype=object)

In [31]:
# Rename the histone entry "Bridge" to "DNA_bridging".
# The values are bound as parameters instead of being written into the SQL
# text (the original f-string had no placeholders to interpolate anyway).
query = "UPDATE histone SET id=%s WHERE id=%s"
cursor.execute(query, ("DNA_bridging", "Bridge"))

In [32]:
# Reload the histone table; the old id "Bridge" should no longer match.
query = "SELECT * FROM histone"
cursor.execute(query)
histone_rows = cursor.fetchall()
histone_columns = [column[0] for column in cursor.description]
histone_df = pd.DataFrame(histone_rows, columns=histone_columns)
histone_df.loc[histone_df["id"] == "Bridge"]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent


In [33]:
# Show the histone row stored under the new id.
histone_df.loc[histone_df["id"] == "DNA_bridging"]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
79,DNA_bridging,variant_group,"Archaea, Bacteria, Viruses","2157, 2, 10239",214.0,Singlet


In [34]:
# Re-attach the previously collected children to the renamed parent.
# The values are bound as parameters so ids containing quotes cannot break the
# SQL; str() turns a possible NumPy string scalar into a plain str.
query = "UPDATE histone SET parent=%s WHERE id=%s"
for ch in children:
    cursor.execute(query, ("DNA_bridging", str(ch)))

In [35]:
# Verify which histone entries now have "DNA_bridging" as their parent.
query = "SELECT * FROM histone WHERE parent='DNA_bridging'"
cursor.execute(query)
reattached_rows = cursor.fetchall()
reattached_columns = [column[0] for column in cursor.description]
pd.DataFrame(reattached_rows, columns=reattached_columns)["id"].values

array(['Bridge_(Methanococcales)', 'Coiled-coil', 'Phage-histones',
       'RdgC-histones'], dtype=object)

In [36]:
# List the sequences whose variant is "DNA_bridging".
query = "SELECT * FROM sequence WHERE variant='DNA_bridging'"
cursor.execute(query)
sequence_rows = cursor.fetchall()
sequence_columns = [column[0] for column in cursor.description]
pd.DataFrame(sequence_rows, columns=sequence_columns)

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration


In [37]:
# Commit the current transaction so the "Bridge" -> "DNA_bridging" rename and
# the parent updates executed on this connection are written to the database.
conn.commit()

# Close connections

In [38]:
# Release resources in reverse order of creation.  The nested try/finally
# guarantees that the connection is closed and the SSH tunnel is stopped even
# if an earlier close call raises.
try:
    try:
        cursor.close()
    finally:
        conn.close()
finally:
    tunnel.stop()