In [1]:
import io

import pandas as pd
from Bio import Entrez, SeqIO
from mysql.connector import connection
from sshtunnel import SSHTunnelForwarder

# Provide your email address (required for using Entrez)
Entrez.email = "your.email@example.com"

In [2]:
# Read the SSH / database connection settings from a local key=value file.
# Blank lines and lines starting with "#" are ignored.
with open("db_curated_server_info.txt", "r") as file:
    lines = file.readlines()

config = {}

for line_number, line in enumerate(lines, start=1):
    line = line.strip()
    if not line or line.startswith("#"):
        continue
    if "=" not in line:
        # Report only the line number: the file holds secrets, so its content
        # must not end up in the notebook output.
        raise ValueError(
            f"db_curated_server_info.txt, line {line_number}: expected 'key=value'"
        )
    key, value = line.split("=", 1)
    # Strip the key as well, so "db_port = 3306" is stored under "db_port"
    # and not under "db_port " (which config.get("db_port") would miss).
    config[key.strip()] = value.strip()

# Both ports go through int(); fail with a readable message instead of
# "int() argument must be ... not 'NoneType'" when one of them is absent.
missing_ports = [name for name in ("srever_port", "db_port") if name not in config]
if missing_ports:
    raise KeyError(
        "db_curated_server_info.txt is missing required setting(s): "
        + ", ".join(missing_ports)
    )

# Spelling of "srever_port" / "db_adress" is kept as-is: these are the keys this
# notebook looks up in the file and the variable names the following cells use.
server_name = config.get("server_name")
srever_port = int(config["srever_port"])
ssh_password = config.get("ssh_password")
ssh_username = config.get("ssh_username")
db_adress = config.get("db_adress")
db_port = int(config["db_port"])

In [3]:
# Open an SSH tunnel to the server and forward a local port to the database
# address/port read from the settings file.
ssh_endpoint = (server_name, srever_port)
db_endpoint = (db_adress, db_port)

tunnel = SSHTunnelForwarder(
    ssh_endpoint,
    ssh_username=ssh_username,
    ssh_password=ssh_password,
    remote_bind_address=db_endpoint,
)
tunnel.start()

# Show the local port of the tunnel; the database connection is made to it.
print(tunnel.local_bind_port)

38721


In [4]:
# Connect to MySQL through the local end of the SSH tunnel.
# NOTE(review): user / password / database look like placeholder literals —
# confirm the real values are supplied from outside the notebook.
connection_settings = {
    "user": "db_user",
    "password": "db_password",
    "host": "localhost",
    "port": tunnel.local_bind_port,
    "database": "db_name",
}
conn = connection.MySQLConnection(**connection_settings)
cursor = conn.cursor()

In [5]:
# Sanity check of the connection: list the tables of the selected database.
query = "SHOW TABLES;"
cursor.execute(query)
tables = cursor.fetchall()
tables

[('alternative_name',),
 ('histone',),
 ('histone_description',),
 ('histone_has_publication',),
 ('publication',),
 ('sequence',),
 ('sequence_has_publication',)]

In [9]:
# add_histone = (
#     "INSERT INTO histone "
#     "(id, level, taxonomic_span, taxonomic_span_id, description, parent) "
#     "VALUES (%(id)s, %(level)s, %(taxonomic_span)s, %(taxonomic_span_id)s, %(description)s, %(parent)s)"
# )
# add_histone_description = (
#     "INSERT INTO histone_description "
#     "(summary, taxonomy, genes, evolution, expression, knock_out, function, sequence, localization, deposition, structure, interactions, disease, caveats) "
#     "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
# )
# add_publication = (
#     "INSERT INTO publication "
#     "(id, title, doi, author, year) "
#     "VALUES (%(id)s, %(title)s, %(doi)s, %(author)s, %(year)s)"
# )
# add_sequence = (
#     "INSERT INTO sequence "
#     "(accession, variant, gi, ncbi_gene_id, hgnc_gene_name, taxonomy_id, organism, phylum, class, taxonomy_group, info, sequence, variant_under_consideration) "
#     "VALUES (%(accession)s, %(variant)s, %(gi)s, %(ncbi_gene_id)s, %(hgnc_gene_name)s, %(taxonomy_id)s, %(organism)s, %(phylum)s, %(class)s, %(taxonomy_group)s, %(info)s, %(sequence)s, %(variant_under_consideration)s)"
# )
# add_sequence_has_publication = (
#     "INSERT INTO sequence_has_publication "
#     "(sequence_accession, publication_id) "
#     "VALUES (%s, %s)"
# )
# add_alternate_names = (
#     "INSERT INTO alternative_name "
#     "(name, taxonomy, gene, splice, histone) "
#     "VALUES (%(name)s, %(taxonomy)s, %(gene)s, %(splice)s, %(histone)s)"
# )
# Parameterised INSERT that links one histone node to one publication.
# It is executed with a (histone_id, publication_id) tuple.
add_histone_has_publication = " ".join(
    [
        "INSERT INTO histone_has_publication",
        "(histone_id, publication_id)",
        "VALUES (%s, %s)",
    ]
)

In [None]:
def get_taxonomy_data(record):
    """Collect organism, taxonomy id, phylum and class for a sequence record.

    The taxonomy id is read from the ``taxon:<id>`` entry among the ``db_xref``
    qualifiers of the record's first feature. Phylum and class are then looked
    up in the NCBI taxonomy database through ``Entrez`` (up to 10 attempts).

    Parameters
    ----------
    record
        Object exposing ``annotations["organism"]`` and
        ``features[0].qualifiers["db_xref"]`` (an iterable of ``"<db>:<id>"``
        strings). Presumably a Biopython ``SeqRecord`` — confirm with callers.

    Returns
    -------
    dict
        ``organism``: value of ``record.annotations["organism"]``.
        ``taxonomy_id``: int, or ``1`` when no usable ``taxon`` xref was found.
        ``phylum`` / ``class``: ``str``, or ``None`` when the lookup gave no
        value or failed on every attempt.

    Raises
    ------
    KeyError
        If the record has no ``"organism"`` annotation.
    """
    # Kept local: the notebook's import cell does not import ``re``.
    import re

    taxonomy_data = {"organism": record.annotations["organism"]}

    taxid = None
    try:
        for xref in record.features[0].qualifiers["db_xref"]:
            match = re.search(r"(\S+):(\S+)", xref)
            # Entries that are not "<db>:<id>" or belong to another database are
            # skipped instead of discarding a taxid that was already found.
            if match is None or match.group(1) != "taxon":
                continue
            print(f"Fetched taxid from NCBI {match.group(2)}")
            taxid = int(match.group(2))
    except Exception:
        # Missing features / qualifiers or a non-numeric id: use the fallback.
        taxid = None

    if taxid is None:
        # Also covers the case where the loop finished without seeing any
        # "taxon" entry, which previously left "taxonomy_id" undefined.
        print("!!!!!!Unable to get TAXID for this record setting it to 1")
        taxid = 1  # unable to identify
    taxonomy_data["taxonomy_id"] = taxid

    lineage = dict()
    for i in range(10):
        try:
            handle = Entrez.efetch(id=taxid, db="taxonomy", retmode="xml")
            try:
                tax_data = Entrez.read(handle)
            finally:
                # Do not leave the response handle open, whether or not
                # parsing succeeded.
                handle.close()
            lineage = {
                d["Rank"]: d["ScientificName"]
                for d in tax_data[0]["LineageEx"]
                if d["Rank"] in ["class", "phylum"]
            }
            break
        except Exception as exc:
            print(
                "Unexpected error: {}, Retrying, attempt {}".format(type(exc), i)
            )
            if i == 9:
                print(
                    f"FATAL ERROR could not get class and phylum from NCBI after 10 attempts for taxid:{taxid}. Will add None for class and phylum!"
                )

    # Convert to plain ``str`` (the parsed values may be str subclasses);
    # missing ranks stay ``None``.
    for rank in ("phylum", "class"):
        value = lineage.get(rank, None)
        taxonomy_data[rank] = str(value) if value is not None else None
    return taxonomy_data

# To Do
    
## <span style="color:black">Add publications for added histone nodes</span>

# <span style="color:black">Add publications for added histone nodes</span>

**New *Drosophilidae* nodes:** cenH3.1_(Drosophilidae), cenH3.2_(Drosophila_eugracilis), cenH3.3_(Montium), cenH3.4_(Montium), cenH3.5_(Drosophila), cenH3.6_(Repleta)

**Publications:** kursel_recurrent_2017, kursel_gametic_2021, teixeira_concurrent_2018

**New *Culicidae* nodes:** cenH3.1_(Culicidae), cenH3.2_(Culicidae), cenH3.3_(Aedes)

**Publications:** kursel_ancient_2020

### Add Drosophilidae histone publications

**New *Drosophilidae* nodes:** cenH3.1_(Drosophilidae), cenH3.2_(Drosophila_eugracilis), cenH3.3_(Montium), cenH3.4_(Montium), cenH3.5_(Drosophila), cenH3.6_(Repleta)

**Publications:** kursel_recurrent_2017, kursel_gametic_2021, teixeira_concurrent_2018

In [8]:
# Publications to attach to the new Drosophilidae nodes; check that each of
# them already has a row in the publication table.
pids = [
    "kursel_recurrent_2017",
    "kursel_gametic_2021",
    "teixeira_concurrent_2018",
]
query = "SELECT * FROM publication"
cursor.execute(query)
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(cursor.fetchall(), columns=column_names)
df.loc[df["id"].isin(pids)]

Unnamed: 0,id,title,doi,author,year,pubmed_id
118,kursel_gametic_2021,,,,,
119,kursel_recurrent_2017,,,,,
173,teixeira_concurrent_2018,,,,,


In [10]:
# Ids of the new Drosophilidae cenH3 variant nodes; the cells below filter the
# histone table by these ids and link each of them to every publication in pids.
variants = [
    "cenH3.1_(Drosophilidae)",
    "cenH3.2_(Drosophila_eugracilis)",
    "cenH3.3_(Montium)",
    "cenH3.4_(Montium)",
    "cenH3.5_(Drosophila)",
    "cenH3.6_(Repleta)",
]

In [12]:
# State before inserting: the LEFT JOIN keeps histones that have no row in
# histone_has_publication, leaving histone_id / publication_id empty for them.
query = (
    "SELECT * "
    "FROM histone h "
    "LEFT JOIN histone_has_publication hp ON h.id = hp.histone_id "
)
cursor.execute(query)
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(cursor.fetchall(), columns=column_names)
df.loc[df["id"].isin(variants)]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
6,cenH3.1_(Drosophilidae),variant,Drosophilidae,7214,240.0,cenH3_(Drosophilidae),,
10,cenH3.2_(Drosophila_eugracilis),variant,Drosophila eugracilis,29029,241.0,cenH3_(Drosophilidae),,
14,cenH3.3_(Montium),variant,montium subgroup,32352,242.0,cenH3_(Drosophilidae),,
15,cenH3.4_(Montium),variant,montium subgroup,32352,243.0,cenH3_(Drosophilidae),,
16,cenH3.5_(Drosophila),variant,Drosophila,32281,244.0,cenH3_(Drosophilidae),,
17,cenH3.6_(Repleta),variant,repleta group,32321,245.0,cenH3_(Drosophilidae),,


In [13]:
# Link every variant to every publication. All (histone_id, publication_id)
# pairs go to the server as one executemany batch instead of one execute call
# per pair, so a failure cannot leave only part of the links pending.
# The rows are persisted by the conn.commit() cell further down.
histone_publication_rows = [(v, pid) for v in variants for pid in pids]
cursor.executemany(add_histone_has_publication, histone_publication_rows)

In [14]:
# Run the same join again to check the links created by the insert cell:
# each variant should now appear once per linked publication.
query = (
    "SELECT * "
    "FROM histone h "
    "LEFT JOIN histone_has_publication hp ON h.id = hp.histone_id "
)
cursor.execute(query)
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(cursor.fetchall(), columns=column_names)
df.loc[df["id"].isin(variants)]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
6,cenH3.1_(Drosophilidae),variant,Drosophilidae,7214,240.0,cenH3_(Drosophilidae),cenH3.1_(Drosophilidae),kursel_gametic_2021
7,cenH3.1_(Drosophilidae),variant,Drosophilidae,7214,240.0,cenH3_(Drosophilidae),cenH3.1_(Drosophilidae),kursel_recurrent_2017
8,cenH3.1_(Drosophilidae),variant,Drosophilidae,7214,240.0,cenH3_(Drosophilidae),cenH3.1_(Drosophilidae),teixeira_concurrent_2018
12,cenH3.2_(Drosophila_eugracilis),variant,Drosophila eugracilis,29029,241.0,cenH3_(Drosophilidae),cenH3.2_(Drosophila_eugracilis),kursel_gametic_2021
13,cenH3.2_(Drosophila_eugracilis),variant,Drosophila eugracilis,29029,241.0,cenH3_(Drosophilidae),cenH3.2_(Drosophila_eugracilis),kursel_recurrent_2017
14,cenH3.2_(Drosophila_eugracilis),variant,Drosophila eugracilis,29029,241.0,cenH3_(Drosophilidae),cenH3.2_(Drosophila_eugracilis),teixeira_concurrent_2018
18,cenH3.3_(Montium),variant,montium subgroup,32352,242.0,cenH3_(Drosophilidae),cenH3.3_(Montium),kursel_gametic_2021
19,cenH3.3_(Montium),variant,montium subgroup,32352,242.0,cenH3_(Drosophilidae),cenH3.3_(Montium),kursel_recurrent_2017
20,cenH3.3_(Montium),variant,montium subgroup,32352,242.0,cenH3_(Drosophilidae),cenH3.3_(Montium),teixeira_concurrent_2018
21,cenH3.4_(Montium),variant,montium subgroup,32352,243.0,cenH3_(Drosophilidae),cenH3.4_(Montium),kursel_gametic_2021


In [15]:
# Number of selected variants linked to each publication.
df.loc[df["id"].isin(variants), "publication_id"].value_counts()

publication_id
kursel_gametic_2021         6
kursel_recurrent_2017       6
teixeira_concurrent_2018    6
Name: count, dtype: int64

In [16]:
# Commit the current transaction on this connection so the rows inserted
# through the cursor above are stored in the database.
conn.commit()

### Add Culicidae histone publications

**New *Culicidae* nodes:** cenH3.1_(Culicidae), cenH3.2_(Culicidae), cenH3.3_(Aedes)

**Publications:** kursel_ancient_2020

In [17]:
# Publication to attach to the new Culicidae nodes; check that it already has
# a row in the publication table.
pids = ["kursel_ancient_2020"]
query = "SELECT * FROM publication"
cursor.execute(query)
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(cursor.fetchall(), columns=column_names)
df.loc[df["id"].isin(pids)]

Unnamed: 0,id,title,doi,author,year,pubmed_id
117,kursel_ancient_2020,,,,,


In [18]:
# Ids of the new Culicidae cenH3 variant nodes.
variants = [
    "cenH3.1_(Culicidae)",
    "cenH3.2_(Culicidae)",
    "cenH3.3_(Aedes)",
]

In [19]:
# State before inserting: the LEFT JOIN keeps histones that have no row in
# histone_has_publication, leaving histone_id / publication_id empty for them.
query = (
    "SELECT * "
    "FROM histone h "
    "LEFT JOIN histone_has_publication hp ON h.id = hp.histone_id "
)
cursor.execute(query)
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(cursor.fetchall(), columns=column_names)
df.loc[df["id"].isin(variants)]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
5,cenH3.1_(Culicidae),variant,Culicidae,7157,246.0,cenH3_(Culicidae),,
11,cenH3.2_(Culicidae),variant,Culicidae,7157,247.0,cenH3_(Culicidae),,
17,cenH3.3_(Aedes),variant,Aedes,7158,248.0,cenH3_(Culicidae),,


In [20]:
# Link every variant to every publication. All (histone_id, publication_id)
# pairs go to the server as one executemany batch instead of one execute call
# per pair, so a failure cannot leave only part of the links pending.
# The rows are persisted by the conn.commit() cell further down.
histone_publication_rows = [(v, pid) for v in variants for pid in pids]
cursor.executemany(add_histone_has_publication, histone_publication_rows)

In [21]:
# Run the same join again to check the links created by the insert cell:
# each variant should now appear once per linked publication.
query = (
    "SELECT * "
    "FROM histone h "
    "LEFT JOIN histone_has_publication hp ON h.id = hp.histone_id "
)
cursor.execute(query)
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(cursor.fetchall(), columns=column_names)
df.loc[df["id"].isin(variants)]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
5,cenH3.1_(Culicidae),variant,Culicidae,7157,246.0,cenH3_(Culicidae),cenH3.1_(Culicidae),kursel_ancient_2020
11,cenH3.2_(Culicidae),variant,Culicidae,7157,247.0,cenH3_(Culicidae),cenH3.2_(Culicidae),kursel_ancient_2020
17,cenH3.3_(Aedes),variant,Aedes,7158,248.0,cenH3_(Culicidae),cenH3.3_(Aedes),kursel_ancient_2020


In [22]:
# Number of selected variants linked to each publication.
df.loc[df["id"].isin(variants), "publication_id"].value_counts()

publication_id
kursel_ancient_2020    3
Name: count, dtype: int64

In [23]:
# Commit the current transaction on this connection so the rows inserted
# through the cursor above are stored in the database.
conn.commit()

# Close connections

In [24]:
# Release resources innermost-first. The nested try/finally blocks make sure the
# connection is still closed and the SSH tunnel still stopped when an earlier
# step raises.
try:
    cursor.close()
finally:
    try:
        conn.close()
    finally:
        tunnel.stop()