In [1]:
import pandas as pd
from Bio import Entrez, SeqIO
from mysql.connector import connection
from sshtunnel import SSHTunnelForwarder

Entrez.email = "l.singh@intbio.org"

In [2]:
with open("db_curated_server_info.txt", "r") as file:
    lines = file.readlines()

config = {}

for line in lines:
    line = line.strip()
    if line and not line.startswith("#"):
        key, value = line.split("=", 1)
        config[key] = value.strip()

server_name = config.get("server_name")
srever_port = int(config.get("srever_port"))
ssh_password = config.get("ssh_password")
ssh_username = config.get("ssh_username")
db_adress = config.get("db_adress")
db_port = int(config.get("db_port"))

In [3]:
tunnel = SSHTunnelForwarder(
    (server_name, srever_port),
    ssh_password=ssh_password,
    ssh_username=ssh_username,
    remote_bind_address=(db_adress, db_port),
)
tunnel.start()
print(tunnel.local_bind_port)

38827


In [4]:
conn = connection.MySQLConnection(
    user="db_user",
    password="db_password",
    host="localhost",
    port=tunnel.local_bind_port,
    database="db_name",
)
cursor = conn.cursor()

In [5]:
query = "SHOW TABLES;"
cursor.execute(query)
cursor.fetchall()

[('alternative_name',),
 ('histone',),
 ('histone_description',),
 ('histone_has_publication',),
 ('publication',),
 ('sequence',),
 ('sequence_has_publication',)]

In [6]:
# add_histone = (
#     "INSERT INTO histone "
#     "(id, level, taxonomic_span, taxonomic_span_id, description, parent) "
#     "VALUES (%(id)s, %(level)s, %(taxonomic_span)s, %(taxonomic_span_id)s, %(description)s, %(parent)s)"
# )
# add_histone_description = (
#     "INSERT INTO histone_description "
#     "(summary, taxonomy, genes, evolution, expression, knock_out, function, sequence, localization, deposition, structure, interactions, disease, caveats) "
#     "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
# )
# add_publication = (
#     "INSERT INTO publication "
#     "(id, title, doi, author, year) "
#     "VALUES (%(id)s, %(title)s, %(doi)s, %(author)s, %(year)s)"
# )
# add_sequence_has_publication = (
#     "INSERT INTO sequence_has_publication "
#     "(sequence_accession, publication_id) "
#     "VALUES (%s, %s)"
# )
# add_alternate_names = (
#     "INSERT INTO alternative_name "
#     "(name, taxonomy, gene, splice, histone) "
#     "VALUES (%(name)s, %(taxonomy)s, %(gene)s, %(splice)s, %(histone)s)"
# )
# add_histone_has_publication = (
#     "INSERT INTO histone_has_publication "
#     "(histone_id, publication_id) "
#     "VALUES (%s, %s)"
# )

# Add description for cenH3_(Homo_sapiens)

In [10]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    "WHERE h.id='cenH3_(Homo_sapiens)'"
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cenH3_(Homo_sapiens),variant,Homo sapiens,9606,158,cenH3_(Mammalia),158,,,,...,,,,,,,,,,


In [11]:
df["summary"].values[0]

'null'

In [12]:
# desc_dict = {
#     "summary": None,
#     "taxonomy": None,
#     "genes": None,
#     "evolution": None,
#     "expression": None,
#     "knock_out": None,
#     "function": None,
#     "sequence": None,
#     "localization": None,
#     "deposition": None,
#     "structure": None,
#     "interactions": None,
#     "disease": None,
#     "caveats": None,
# }

In [12]:
desc_dict = {
    "summary": "cenH3(Homo_sapiens) is a centromere-specific histone variant in human, often called CENP-A (Centromere Protein A), and an important component of active centromere required for chromosome segregation. For a general description see cenH3(Animals) class.",
    "knock_out": "CENP-A depletion at different stages of the cell cycle showed that the earlier CENP-A is removed, the more errors accumulate during later cell divisions [hoffmann_cenp-is_2016].",
    "function": "CENP-A nucleosomes are essential for chromosome segregation. A functional kinetochore interacts with active centromeric chromatin reached with CENP-A to form the mitotic spindle. Human CENP-A plays an important role in stabilization and retention inner kinetochore during G1 phase of cell cycle [pesenti_structure_2022, hoffmann_cenp-is_2016]. However, mitosis can proceed without CENP-A as long as the CENP-B protein remains stably bound to centromeric sequences, facilitating the assembly of the inner kinetochore [hoffmann_cenp-is_2016].",
    "localization": "CENP-A in human chromosomes usually localized in active centromere region within αSat arrays and associated with reduced CpG methylation [altemose_complete_2022, gershman_epigenetic_2022]. Although the centromere position is defined as a window with high CENP-A enrichment (totals 190 to 570 kb on each chromosome), CENP-A was also found near CDRs (centromere dip regions) and no all dips in CpG methylation was associated with CENP-A [altemose_complete_2022]. CENP-A is typically observed in young HOR-haps (although there are a few notable exceptions) [altemose_complete_2022]. Additionally, an analysis of CENP-A enrichment patterns on human chromosome X demonstrated that these patterns exhibit considerable inter-individual variability [altemose_complete_2022].",
    "deposition": "Deposition of CENP-A into centromeric nucleosomes during the late telophase/early G1 phase of the cell cycle mediated by four major and some minor factors and regulators, which are important for identifying centromeric localization and limit the process to a single round [pan_mechanism_2019, xu_gross_2023]. The specific CENP-A chaperone HJURP (Holliday Junction Recognition Protein) plays a key role in stabilizing the binding of CENP-A to histone H4 [foltz_centromere_2009, dunleavy_hjurp_2009, shuaib_hjurp_2010]. To target centromeres and load a new CENP-A HJURP form a stoichiometric complex with the two-subunit Mis18 complex (Mis18α and Mis18β) and Mis18-binding protein 1 (M18BP1) [hayashi_mis16_2004, pan_mechanism_2019]. Among minor proteins required for CENP-A loading there are RSF1, MgcRacGAP, Condensin II, and KAT7 [pan_mechanism_2019]. Furthermore, loading of CENP-A into centromeric nucleosomes contributed by CENP-I and CENP-B proteins that are specifically binds to the centromeric DNA. CENP-I, a subunit of the inner kinetochore complex, has been demonstrated to stabilise CENP-A nucleosomes in vitro [hu_cenp-i_2023]. The loss of CENP-B led to a slightly decrease in CENP-A incorporation, though the effect was less pronounced than with HJURP deletion, and a reduction in M18BP1 binding at centromeres assembled with a CENP-A variant without its CENP-C binding domain [fachinetti_dna_2015].",
    "interactions": "CENP-A nucleosomes stabilize the inner kinetochore known as the CCAN complex (Constitutive Centromere-Associated Network) through direct interaction with CENP-C and CENP-N proteins of this complex [cao_constitutive_2018, pesenti_structure_2022, sridhar_kinetochore_2022, xu_gross_2023]. Human CENP-C has been demonstrated to bind with the CENP-A C-terminal hydrophobic tail, the acidic patch of H2A-H2B and histone H4, thereby facilitating its engagement with all four histone subunits present on the nucleosome surface [allu_structure_2019]. The binding of CENP-C to CENP-A is stabilized by the process of phosphorylation of CDK1, which strengthens the complex's structure through the occurrence of an intramolecular interaction, and is critical for long-term viability in human RPE-1 cells [watanabe_cdk1-mediated_2019, walstein_assembly_2021]. This process is predominantly observed during the M-phase of the cell cycle [ariyoshi_cryoem_2021]. Human CENP-N has been observed to bind to CENP-A nucleosomes through a direct recognition of the L1 loop and RG loop of CENP-A [chittori_structural_2018, tian_molecular_2018, ariyoshi_cryoem_2021]. However, cryo-EM studies have revealed that CENP-C and CENP-N bind to a single CENP-A nucleosome in a non-simultaneous manner, thereby demonstrating an asymmetric structure where CENP-C and CENP-N bind to opposite sides of the nucleosome [ariyoshi_cryoem_2021]. Furthermore, in 94% of cases, phosphorylated CENP-C exhibits exclusive binding to the RG-loop of CENP-A, suggesting that this particular interaction is predominant [ariyoshi_cryoem_2021]. In addition, in vitro reconstruction of the human CENP-C protein showed that CENP-C can bind to two centromeric nucleosomes simultaneously [walstein_assembly_2021].",
}
desc_dict

{'summary': 'cenH3(Homo_sapiens) is a centromere-specific histone variant in human, often called CENP-A (Centromere Protein A), and an important component of active centromere required for chromosome segregation. For a general description see cenH3(Animals) class.',
 'knock_out': 'CENP-A depletion at different stages of the cell cycle showed that the earlier CENP-A is removed, the more errors accumulate during later cell divisions [hoffmann_cenp-is_2016].',
 'function': 'CENP-A nucleosomes are essential for chromosome segregation. A functional kinetochore interacts with active centromeric chromatin reached with CENP-A to form the mitotic spindle. Human CENP-A plays an important role in stabilization and retention inner kinetochore during G1 phase of cell cycle [pesenti_structure_2022, hoffmann_cenp-is_2016]. However, mitosis can proceed without CENP-A as long as the CENP-B protein remains stably bound to centromeric sequences, facilitating the assembly of the inner kinetochore [hoffman

In [14]:
desk_str = ', '.join([f'{k}="{v}"' for k, v in desc_dict.items()])
query = (
    f"UPDATE histone_description SET {desk_str} "
    "WHERE id = 158"
)
print(query)
cursor.execute(query)

UPDATE histone_description SET summary="cenH3(Homo_sapiens) is a centromere-specific histone variant in human, often called CENP-A (Centromere Protein A), and an important component of active centromere required for chromosome segregation. For a general description see cenH3(Animals) class.", knock_out="CENP-A depletion at different stages of the cell cycle showed that the earlier CENP-A is removed, the more errors accumulate during later cell divisions [hoffmann_cenp-is_2016].", function="CENP-A nucleosomes are essential for chromosome segregation. A functional kinetochore interacts with active centromeric chromatin reached with CENP-A to form the mitotic spindle. Human CENP-A plays an important role in stabilization and retention inner kinetochore during G1 phase of cell cycle [pesenti_structure_2022, hoffmann_cenp-is_2016]. However, mitosis can proceed without CENP-A as long as the CENP-B protein remains stably bound to centromeric sequences, facilitating the assembly of the inner k

In [15]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    "WHERE h.id = 'cenH3_(Homo_sapiens)' "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cenH3_(Homo_sapiens),variant,Homo sapiens,9606,158,cenH3_(Mammalia),158,cenH3(Homo_sapiens) is a centromere-specific h...,,,...,CENP-A depletion at different stages of the ce...,CENP-A nucleosomes are essential for chromosom...,,CENP-A in human chromosomes usually localized ...,Deposition of CENP-A into centromeric nucleoso...,,CENP-A nucleosomes stabilize the inner kinetoc...,,,


In [16]:
df["summary"].values[0]

'cenH3(Homo_sapiens) is a centromere-specific histone variant in human, often called CENP-A (Centromere Protein A), and an important component of active centromere required for chromosome segregation. For a general description see cenH3(Animals) class.'

In [17]:
# Make sure data is committed to the database
conn.commit()

# Close connections

In [18]:
cursor.close()
conn.close()
tunnel.stop()