In [2]:
import io

import pandas as pd
from Bio import Entrez, SeqIO
from mysql.connector import connection
from sshtunnel import SSHTunnelForwarder

# Provide your email address (required for using Entrez)
Entrez.email = "your.email@example.com"

In [5]:
# Load SSH / database connection settings from a simple `key=value` text file.
# Blank lines and lines starting with "#" are ignored.
with open("db_curated_server_info.txt", "r") as file:
    lines = file.readlines()

config = {}

for line_number, line in enumerate(lines, start=1):
    line = line.strip()
    if not line or line.startswith("#"):
        continue
    if "=" not in line:
        # Fail with a message that points at the offending line instead of an
        # opaque "not enough values to unpack" error.
        raise ValueError(
            f"db_curated_server_info.txt, line {line_number}: expected 'key=value', got {line!r}"
        )
    key, value = line.split("=", 1)
    # Strip the key as well as the value so that `key = value` (with spaces
    # around "=") is stored under `key`, not `key `.
    config[key.strip()] = value.strip()

# The two port settings are converted with int(), so they must be present.
# NOTE: the misspelled key names ("srever_port", "db_adress") are the names
# this notebook looks up, so the settings file has to use the same spelling.
missing_keys = [name for name in ("srever_port", "db_port") if name not in config]
if missing_keys:
    raise KeyError(
        "db_curated_server_info.txt is missing required key(s): "
        + ", ".join(missing_keys)
    )

server_name = config.get("server_name")
srever_port = int(config.get("srever_port"))
ssh_password = config.get("ssh_password")
ssh_username = config.get("ssh_username")
db_adress = config.get("db_adress")
db_port = int(config.get("db_port"))

In [6]:
# Open an SSH tunnel from a local port to the database address/port, using the
# settings read from the configuration file.
ssh_endpoint = (server_name, srever_port)
db_endpoint = (db_adress, db_port)

tunnel = SSHTunnelForwarder(
    ssh_endpoint,
    ssh_username=ssh_username,
    ssh_password=ssh_password,
    remote_bind_address=db_endpoint,
)
tunnel.start()

# Show which local port the tunnel was bound to.
print(tunnel.local_bind_port)

37591


In [7]:
# Connect to MySQL through the local end of the SSH tunnel.
# NOTE(review): user / password / database are literal values here — consider
# loading them from the same settings file as the SSH parameters.
mysql_settings = {
    "user": "db_user",
    "password": "db_password",
    "host": "localhost",
    "port": tunnel.local_bind_port,
    "database": "db_name",
}
conn = connection.MySQLConnection(**mysql_settings)
cursor = conn.cursor()

In [8]:
# List the tables available in the connected database.
query = "SHOW TABLES;"
cursor.execute(query)
table_rows = cursor.fetchall()
table_rows

[('alternative_name',),
 ('histone',),
 ('histone_description',),
 ('histone_has_publication',),
 ('publication',),
 ('sequence',),
 ('sequence_has_publication',)]

In [9]:
# INSERT statement templates for cursor.execute().
# Templates with %(name)s placeholders take a dict of values; templates with
# %s placeholders take a positional sequence.

add_publication = " ".join(
    (
        "INSERT INTO publication",
        "(id, title, doi, author, year)",
        "VALUES (%(id)s, %(title)s, %(doi)s, %(author)s, %(year)s)",
    )
)

add_alternate_names = " ".join(
    (
        "INSERT INTO alternative_name",
        "(name, taxonomy, gene, splice, histone)",
        "VALUES (%(name)s, %(taxonomy)s, %(gene)s, %(splice)s, %(histone)s)",
    )
)

add_histone_has_publication = " ".join(
    (
        "INSERT INTO histone_has_publication",
        "(histone_id, publication_id)",
        "VALUES (%s, %s)",
    )
)

# Currently disabled templates, kept for reference.
# add_histone = (
#     "INSERT INTO histone "
#     "(id, level, taxonomic_span, taxonomic_span_id, description, parent) "
#     "VALUES (%(id)s, %(level)s, %(taxonomic_span)s, %(taxonomic_span_id)s, %(description)s, %(parent)s)"
# )
# add_histone_description = (
#     "INSERT INTO histone_description "
#     "(summary, taxonomy, genes, evolution, expression, knock_out, function, sequence, localization, deposition, structure, interactions, disease, caveats) "
#     "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
# )
# add_sequence = (
#     "INSERT INTO sequence "
#     "(accession, variant, gi, ncbi_gene_id, hgnc_gene_name, taxonomy_id, organism, phylum, class, taxonomy_group, info, sequence, variant_under_consideration) "
#     "VALUES (%(accession)s, %(variant)s, %(gi)s, %(ncbi_gene_id)s, %(hgnc_gene_name)s, %(taxonomy_id)s, %(organism)s, %(phylum)s, %(class)s, %(taxonomy_group)s, %(info)s, %(sequence)s, %(variant_under_consideration)s)"
# )
# add_sequence_has_publication = (
#     "INSERT INTO sequence_has_publication "
#     "(sequence_accession, publication_id) "
#     "VALUES (%s, %s)"
# )

In [None]:
# Empty histone_description record: every field starts out as None.
_HISTONE_DESCRIPTION_FIELDS = (
    "summary",
    "taxonomy",
    "genes",
    "evolution",
    "expression",
    "knock_out",
    "function",
    "sequence",
    "localization",
    "deposition",
    "structure",
    "interactions",
    "disease",
    "caveats",
)
data_histone_description = dict.fromkeys(_HISTONE_DESCRIPTION_FIELDS, None)

In [10]:
def get_taxonomy_data(record):
    """Collect organism, taxonomy id, phylum and class for a sequence record.

    The taxonomy id is read from the ``taxon:<id>`` entry among the
    ``db_xref`` qualifiers of the record's first feature. When no usable
    entry is found the id falls back to 1. Phylum and class are then looked
    up in the NCBI ``taxonomy`` database via ``Entrez.efetch``; the lookup is
    attempted up to 10 times, and both are left as ``None`` if every attempt
    fails.

    Args:
        record: object exposing ``annotations["organism"]`` and
            ``features[0].qualifiers["db_xref"]``.

    Returns:
        dict with the keys ``organism``, ``taxonomy_id`` (int), ``phylum``
        and ``class`` (``str`` or ``None``).

    Raises:
        KeyError: if ``record.annotations`` has no ``"organism"`` entry.
    """
    import re

    taxonomy_data = {"organism": record.annotations["organism"]}

    # Raw string: "\S" in a plain literal is an invalid escape sequence.
    xref_pattern = re.compile(r"(\S+):(\S+)")
    taxid = None
    try:
        for xref in record.features[0].qualifiers["db_xref"]:
            match = xref_pattern.search(xref)
            if match is None:
                # A malformed cross-reference must not discard a valid taxon entry.
                continue
            source, value = match.groups()
            if source == "taxon":
                print(f"Fetched taxid from NCBI {value}")
                taxid = int(value)
    except Exception:
        # Missing features / qualifiers or a non-numeric id: keep whatever
        # was found so far and fall through to the default below.
        pass

    if taxid is None:
        # Also covers the case where db_xref exists but has no "taxon:" entry;
        # without this the lookup below would fail on a missing key.
        print("!!!!!!Unable to get TAXID for this record setting it to 1")
        taxid = 1  # unable to identify
    taxonomy_data["taxonomy_id"] = taxid

    lineage = {}
    max_attempts = 10
    for attempt in range(max_attempts):
        try:
            handle = Entrez.efetch(id=taxid, db="taxonomy", retmode="xml")
            try:
                tax_data = Entrez.read(handle)
            finally:
                # Release the response explicitly, including when parsing fails.
                handle.close()
            lineage = {
                entry["Rank"]: entry["ScientificName"]
                for entry in tax_data[0]["LineageEx"]
                if entry["Rank"] in ("class", "phylum")
            }
            break
        except Exception as exc:
            print(
                "Unexpected error: {}, Retrying, attempt {}".format(
                    type(exc), attempt
                )
            )
            if attempt == max_attempts - 1:
                print(
                    f"FATAL ERROR could not get class and phylum from NCBI after 10 attempts for taxid:{taxid}. Will add None for class and phylum!"
                )

    # Store plain str values (or None when the rank is absent).
    for rank in ("phylum", "class"):
        name = lineage.get(rank)
        taxonomy_data[rank] = str(name) if name is not None else None
    return taxonomy_data

In [8]:
# Look up the alternative-name rows whose name is 'cid'.
query = "SELECT * FROM alternative_name WHERE name='cid'"
cursor.execute(query)
rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
pd.DataFrame(rows, columns=column_names)

Unnamed: 0,id,name,taxonomy,gene,splice,histone
0,65,cid,,,,cenH3_(Plants)


In [14]:
# Load the whole publication table and show the rows whose id starts with "hara".
query = "SELECT * FROM publication "
cursor.execute(query)
rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(rows, columns=column_names)
id_starts_with_hara = df["id"].str.startswith("hara")
df[id_starts_with_hara]

Unnamed: 0,id,title,doi,author,year,pubmed_id


In [85]:
# Join every histone with its description row (LEFT JOIN keeps histones that
# have no description).
query = (
    "SELECT * FROM histone as h LEFT JOIN histone_description as hd "
    "ON h.description = hd.id "
    # "WHERE h.id = 'H3.6_(Homo_sapiens)' "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
# The joined result has two `id` columns, so the histone id is addressed by
# position. regex=False makes "H3.6" a literal substring; by default the "."
# would be a regex wildcard and also match ids such as "H3x6".
df[df.iloc[:, 0].str.contains("H3.6", regex=False)]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
17,cenH3.6_(Repleta),variant,repleta group,32321,245.0,cenH3_(Drosophilidae),245.0,cenH3.6_(Repleta) is a recently evolved centr...,,,...,,,,,,,,,,


In [28]:
# Show the full `genes` text of the first row of the current `df`.
genes_column = df["genes"]
genes_column.values[0]

'CENP-A nucleosomes are essential for chromosome segregation. A functional kinetochore interacts with active centromeric chromatin reached with CENP-A to form the mitotic spindle. Notably, in certain holocentric organisms, CENP-A appears dispensable for meiotic chromosome segregation while remaining essential for mitosis. As demonstrated in Caenorhabditis elegans, CENP-A loading is specifically eliminated following meiosis I, and RNAi-mediated depletion of CENP-A during meiosis fails to disrupt proper chromosome segregation [monen_differential_2005]. This stands in striking contrast to mitotic divisions, where CENP-A is absolutely required for kinetochore assembly and faithful chromosome segregation. Moreover, C. elegans and C. remanei possess a second cenH3 histone gene that likely performs specialized functions distinct from canonical CENP-A roles [monen_separase_2015]. For more detailed information see the "gene" section.'

In [70]:
# Print every histone_description row that mentions the citation key
# "henikoff_heterochromatic_2000" in any of its columns.
query = "SELECT * FROM histone_description "
cursor.execute(query)
rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(rows, columns=column_names)

# Concatenate all fields of a row into one string so a single substring test
# covers every column.
row_texts = df.apply(lambda row: "".join(row.astype(str)), axis=1)
for row_text in row_texts:
    if "henikoff_heterochromatic_2000" not in row_text:
        continue
    print(row_text)
    print("---------------")

In [41]:
# List the database tables again, this time rendered as a DataFrame.
query = "SHOW TABLES; "
cursor.execute(query)
rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
pd.DataFrame(rows, columns=column_names)

Unnamed: 0,Tables_in_db_name
0,alternative_name
1,histone
2,histone_description
3,histone_has_publication
4,publication
5,sequence
6,sequence_has_publication


# To Do H4

## <span style="color:black">Update description of H4</span>

### <span style="color:black">Update summary</span>

```H4 is one of the core histones. Two H4s and two H3s form H3-H4 tetramer via "hand shake" and "four helix bundle" motifs. Two H2A-H2B dimers then associate with H3-H4 tetramer to form complete nucleosome core. H4 is the most conserved histone type and has very few known variants. Structure of H4 has a histone fold domain and a flexible N-terminal tail. H4 provides sites for H2B interaction via "four-helix bundle" and forms a small β-sheet with H2Aa. The histone H4 tail is crucial for nucleosome integrity and the stability of DNA wrapping. Nonetheless, the loss of the H4 tail can be functionally compensated through internucleosomal interactions mediated by the remaining histone tails, namely H3, H2A, and H2B [stormberg_effect_2021]. Histone H4 plays a critical role in kinetochore assembly and prevents kinetochore formation at ectopic genomic sites. It also acts as a key regulator of the conformational state of the centromeric histone H3 variant (cenH3/CENP-A), ensuring its stability, proper localization at the centromere, and degradation in case of misincorporation [malik_conformational_2018].```

### <span style="color:black">Update sequence</span>

```The basic hydrophilic patch (R17, H18, R19) within the N-terminal tail is a key regulator of the ISWI remodeling complex [clapier_critical_2002]. The interaction mechanism of R17 and R19 with ISWI is likely conserved throughout evolution, highlighting its fundamental importance [corcoran_systematic_2022]. Additionally, the positively charged arginine residues R17 and R19 are critical for growth development [zhang_cis-_2022, corcoran_systematic_2022]. Monomethylation of lysine 20 on histone H4 (H4K20me1) promotes active transcription of genes, particularly housekeeping genes [fatemiyan_broad_2023, shoaib_histone_2021]. Shoaib et al. demonstrated that H4K20me1 increases the conformational dynamics of the histone H4 tail and facilitates chromatin openness and accessibility by disrupting chromatin compaction. Acetylation of the H4 tail within the basic patch region (K5, K8, K12, K16, K20) modulates its conformational ensemble through electrostatic effects, alterations in contact patterns, and subtle changes in the protonation of key residues, thereby contributing to the regulation of chromatin architecture [dewing_acetylation-dependent_2024]. Acetylation of the histone H4 tail modulates histone H3 tail dynamics, resulting in enhanced DNA binding of the H3 tail and increased susceptibility of H3K14 to acetylation [furukawa_acetylated_2020]. Histone H4 acetylation plays a dual role by mediating IκBα recruitment to chromatin in stem cells and conferring protection against proteolytic cleavage [marruecos_dynamic_2021]. This mechanism ensures precise spatiotemporal control of IκBα-chromatin binding, a prerequisite for launching the cell differentiation program. 
Of particular interest is the dual histone H4 modification (K5acme), which combines lysine acetylation and methylation; this modification is dynamically regulated at transcriptional start sites, suggesting its key role in gene regulation and stress response, and possesses a unique biochemical feature—resistance to histone deacetylase activity [fletcher_acetyl-methyllysine_2023]. The K5 and K8 residues are subject to not only acetylation but also butyrylation, which has been confirmed in humans, mice, and Tetrahymena [goudarzi_dynamic_2016]. It is important to note that the competition between these different modifications on the histone H4 tail constitutes a significant epigenetic mechanism that dynamically regulates gene expression and chromatin reorganization in spermatogenic cells. Given that butyrylation at the H4K5 position, unlike acetylation, completely prevents the binding of the first bromodomain (BD1) of the testis-specific protein Brdt, the specific modification state (acetyl vs. butyryl) of histone H4 can determine the timing of histone removal and, consequently, influence the final genome organization in mature sperm [goudarzi_dynamic_2016]. Interestingly, H4K20 acetylation represents an atypical histone modification associated with gene repression rather than activation, and likely functions by recruiting specific repressors, such as NRSF/REST [kaimori_histone_2016]. N-α-terminal acetylation of histone H4 suppresses asymmetric dimethylation of arginine 3, thereby regulating ribosomal DNA silencing and cell growth, particularly under stress conditions [schiza_n-alpha-terminal_2013]. The enrichment of histone H4 lysine 16 biotinylation (H4K16bio) in repressed genomic regions, including telomeres and repetitive elements, contributes to nucleosome condensation [singh_biotinylation_2013]. This contrasts with the acetylation of the same residue, which has a decondensing effect [shia_histone_2006].```

### <span style="color:black">Update disease</span>

```Acetylation of histone H4 at lysine 16 (H4K16ac), facilitated by MSL1, enhances the transcription of key cell cycle regulators, including Cyclin A2, Cyclin B1, and Cyclin D1, promoting hepatocyte proliferation and is essential for successful liver regeneration following injury [he_msl1_2023]. Direct binding of histone H4 to prothrombin induces its autoactivation, leading to thrombin generation independent of the classical coagulation cascade; this mechanism underlies the procoagulant activity of histone H4 in pathological states [barranco-medina_histone_2013].```

## <span style="color:black">Update description of H4_(Animals)</span>

### <span style="color:black">Update sequence</span>

```The threonine 71 position of histone H4 is highly conserved across numerous animal species and can be subjected to O-GlcNAcylation (H4T71Gc). This modification plays a critical role, in concert with H3K9me3, in the progression and maintenance of condensed chromatin [hayakawa_novel_2025]. The positively charged arginine residues R17 and R19 in histone H4, which are conserved among eukaryotes, are critical for growth development. Mutation of R17 to a negatively charged glutamic acid (R17E) results in embryonic lethality, while a neutral substitution (R17A) causes growth retardation, a severe reduction in H4K16 acetylation, and impaired X-chromosome dosage compensation in males [zhang_cis-_2022]. The R19A mutation, in turn, drastically reduces H3K79me3 levels by disrupting the binding of the methyltransferase Gpp to the H4 N-terminal tail [zhang_cis-_2022]. Histone H4 lysine acetylation (K5, K8, K12 and K16) plays a complex and species-specific role in chromatin remodeling during spermatogenesis, with H4K16ac likely acting as a key signal for initiating histone replacement [ketchum_early_2018, shirakata_histone_2014].```

### <span style="color:black">Update interactions</span>

```The histone chaperones p55 (Drosophila melanogaster) and its human ortholog RbAp48 serve as core subunits in diverse chromatin-modifying complexes. Their molecular function involves the recognition of histone H4 helix 1 within a unique lateral binding pocket of their β-propeller domain [song_structural_2008, murzina_structural_2008]. Competition between the chaperone CIA/ASF1 and histone H2A for the H4 C-terminus drives H3–H4 tetramer disruption, enabling chromatin dynamics without occluding sites for enzymatic histone modification [natsume_structure_2007].```

## <span style="color:black">Update description of H4_(Plants)</span>

### <span style="color:black">Update summary</span>

``` cH4_(Plants) canonical H4 histones of green plants (Viridiplantae), which include green algae and land plants. Mass spectrometry identified two histone H4 isoforms in soybean, H4.1 and H4.2, which vary at amino acid residue 60 (Ile/Val); the functional role of these variants is yet to be determined [wu_mass_2009].```

### <span style="color:black">Update sequence</span>

```The R17, R36, R39, and K44 residues of histone H4 in Arabidopsis thaliana regulate the floral transition and other aspects of plant development. The H4R17A mutation disrupts the interaction with the ISWI chromatin-remodeling complex, leading to reduced regularity of nucleosome positioning and subsequent activation of key flowering-time regulatory genes [corcoran_systematic_2022].```

## <span style="color:black">Add description of cH4_(Mammalia)</span>

### <span style="color:black">Add summary</span>

```cH4_(Mammalia) clustered H4 histones in mammals, often called canonical, replication-dependent, replication-coupled or "bulk" H4 histones. This is a subclass of cH4_(Vertebrata) and cH4_(Animals) (see them for a detailed description). Citrullination of histone H4 plays a protective role by attenuating excessive NET formation via reduced calcium influx and subsequent suppression of PAD4 activation in neutrophils [shi_neutrophil_2021].```

## <span style="color:black">Add description of cH4_(Protists)</span>

### <span style="color:black">Add summary</span>

```cH4_(Protists) canonical replication-dependent H4 histones in various protists.```

### <span style="color:black">Add sequence</span>

```The N-terminal domain of histone H4 in trypanosomatids is highly divergent, particularly in post-translational modification sites. Although the acetylated lysines (K4, K10, K14) are homologous to those in other eukaryotes, their impairment disrupts chromatin dynamics, leading to free histone accumulation and cell cycle arrest [ramos_expression_2015].```

>Ацетилирование гистона H4 в различных лизинах (K5, K8, K12, K16) динамично и пространственно-специфично изменяется в процессе заживления кожной раны, регулируя транскрипцию генов на разных стадиях репарации, включая удаленные от раны участки эпителия [nascimento-filho_skin_2020].

>This chromatin decompaction was suppressed by depletion of  histone acetyltransferase Mst1 or by arginine substitution of specific lysine residues (K8 and K12) of histone H4. These results suggest that acetylation of histone H4 residues K8 and K12 plays a critical role in loosening chromatin structures during DNA replication. [ruan_histone_2015]

# Close connections

In [35]:
# Release resources in reverse order of acquisition. The nested try/finally
# ensures the SSH tunnel is stopped even if closing the cursor or the
# connection raises.
try:
    cursor.close()
finally:
    try:
        conn.close()
    finally:
        tunnel.stop()