In [2]:
import io
import re

import pandas as pd
from Bio import Entrez, SeqIO
from mysql.connector import connection
from sshtunnel import SSHTunnelForwarder

# Set your email address (required for using Entrez)
Entrez.email = "your.email@example.com"

In [5]:
# Load the SSH / database connection settings from a local KEY=VALUE file.
# Blank lines and lines starting with "#" are ignored.
with open("db_curated_server_info.txt", "r") as file:
    lines = file.readlines()

config = {}

for line_number, line in enumerate(lines, start=1):
    line = line.strip()
    if not line or line.startswith("#"):
        continue
    key, separator, value = line.partition("=")
    if not separator:
        # Report only the line number: the line itself may contain a secret.
        raise ValueError(
            f"db_curated_server_info.txt, line {line_number}: expected KEY=VALUE"
        )
    # Strip the key as well as the value so that "key = value" is stored under
    # "key" and not under "key " (which the lookups below would never find).
    config[key.strip()] = value.strip()

# Fail early with a readable message instead of an int(None) TypeError below.
# The misspelled names ("srever_port", "db_adress") are the actual keys of the
# settings file, so they are kept as they are.
missing = [
    name
    for name in ("server_name", "srever_port", "db_adress", "db_port")
    if not config.get(name)
]
if missing:
    raise ValueError(
        "Missing settings in db_curated_server_info.txt: " + ", ".join(missing)
    )

server_name = config.get("server_name")
srever_port = int(config.get("srever_port"))
ssh_password = config.get("ssh_password")
ssh_username = config.get("ssh_username")
db_adress = config.get("db_adress")
db_port = int(config.get("db_port"))

In [6]:
# Open an SSH tunnel to the server and forward a local port to the database
# address/port as seen from that server.
tunnel = SSHTunnelForwarder(
    (server_name, srever_port),
    ssh_username=ssh_username,
    ssh_password=ssh_password,
    remote_bind_address=(db_adress, db_port),
)
tunnel.start()

# Local end of the tunnel; the MySQL connection is made to this port.
print(tunnel.local_bind_port)

37591


In [7]:
# Connect to MySQL through the local end of the SSH tunnel.
# User, password and schema name are looked up in `config` (the settings
# loaded from db_curated_server_info.txt) so that real credentials do not have
# to be written into the notebook; the previous literals are kept as fallbacks
# for settings files that do not define these keys.
conn = connection.MySQLConnection(
    user=config.get("db_user", "db_user"),
    password=config.get("db_password", "db_password"),
    host="localhost",
    port=tunnel.local_bind_port,
    database=config.get("db_name", "db_name"),
)
cursor = conn.cursor()

In [8]:
# List the tables of the connected database; the fetched rows are the cell output.
query = "SHOW TABLES;"
cursor.execute(query)
cursor.fetchall()

[('alternative_name',),
 ('histone',),
 ('histone_description',),
 ('histone_has_publication',),
 ('publication',),
 ('sequence',),
 ('sequence_has_publication',)]

In [9]:
# INSERT statement templates for cursor.execute().
# Templates with %(name)s placeholders take a dict of values; templates with
# positional %s placeholders take a tuple.

# --- Active templates --------------------------------------------------------
add_publication = (
    "INSERT INTO publication (id, title, doi, author, year) "
    "VALUES (%(id)s, %(title)s, %(doi)s, %(author)s, %(year)s)"
)
add_alternate_names = (
    "INSERT INTO alternative_name (name, taxonomy, gene, splice, histone) "
    "VALUES (%(name)s, %(taxonomy)s, %(gene)s, %(splice)s, %(histone)s)"
)
add_histone_has_publication = (
    "INSERT INTO histone_has_publication (histone_id, publication_id) "
    "VALUES (%s, %s)"
)

# --- Disabled templates (kept for reference) ---------------------------------
# add_histone = (
#     "INSERT INTO histone "
#     "(id, level, taxonomic_span, taxonomic_span_id, description, parent) "
#     "VALUES (%(id)s, %(level)s, %(taxonomic_span)s, %(taxonomic_span_id)s, %(description)s, %(parent)s)"
# )
# add_histone_description = (
#     "INSERT INTO histone_description "
#     "(summary, taxonomy, genes, evolution, expression, knock_out, function, sequence, localization, deposition, structure, interactions, disease, caveats) "
#     "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
# )
# add_sequence = (
#     "INSERT INTO sequence "
#     "(accession, variant, gi, ncbi_gene_id, hgnc_gene_name, taxonomy_id, organism, phylum, class, taxonomy_group, info, sequence, variant_under_consideration) "
#     "VALUES (%(accession)s, %(variant)s, %(gi)s, %(ncbi_gene_id)s, %(hgnc_gene_name)s, %(taxonomy_id)s, %(organism)s, %(phylum)s, %(class)s, %(taxonomy_group)s, %(info)s, %(sequence)s, %(variant_under_consideration)s)"
# )
# add_sequence_has_publication = (
#     "INSERT INTO sequence_has_publication "
#     "(sequence_accession, publication_id) "
#     "VALUES (%s, %s)"
# )

In [10]:
def _extract_taxid(record):
    """Return the taxon id found in the first feature's db_xref qualifiers, or None."""
    taxid = None
    for xref in record.features[0].qualifiers["db_xref"]:
        # Search once per entry; entries that are not "<db>:<id>" are skipped.
        match = re.search(r"(\S+):(\S+)", xref)
        if match is None or match.group(1) != "taxon":
            continue
        print(f"Fetched taxid from NCBI {match.group(2)}")
        taxid = int(match.group(2))
    return taxid


def _fetch_lineage(taxid, max_attempts):
    """Return {"phylum": ..., "class": ...} (keys present only if found) for a taxid.

    Every failed attempt is reported and retried; after `max_attempts`
    failures an empty dict is returned.
    """
    for attempt in range(max_attempts):
        try:
            handle = Entrez.efetch(id=taxid, db="taxonomy", retmode="xml")
            try:
                tax_data = Entrez.read(handle)
            finally:
                # Release the connection even when parsing fails.
                handle.close()
            return {
                entry["Rank"]: entry["ScientificName"]
                for entry in tax_data[0]["LineageEx"]
                if entry["Rank"] in ("class", "phylum")
            }
        except Exception as exc:
            print(
                "Unexpected error: {}, Retrying, attempt {}".format(
                    type(exc), attempt
                )
            )
    print(
        f"FATAL ERROR could not get class and phylum from NCBI after {max_attempts} attempts for taxid:{taxid}. Will add None for class and phylum!"
    )
    return {}


def get_taxonomy_data(record, max_attempts=10):
    """Collect organism, taxonomy id, phylum and class for a sequence record.

    Args:
        record: object exposing ``annotations["organism"]`` and ``features``;
            the taxon id is read from ``features[0].qualifiers["db_xref"]``
            entries of the form ``taxon:<id>``.
        max_attempts: how many times the taxonomy lookup is tried before
            giving up (default 10).

    Returns:
        dict with the keys ``organism``, ``taxonomy_id``, ``phylum`` and
        ``class``. ``taxonomy_id`` is 1 when no taxon id can be determined;
        ``phylum`` / ``class`` are None when the lineage lacks that rank or
        the lookup failed on every attempt.
    """
    taxonomy_data = {"organism": record.annotations["organism"]}

    try:
        taxid = _extract_taxid(record)
    except Exception:
        # Best effort: a record without features / db_xref or with a
        # non-numeric id is treated as "unknown", not as a fatal error.
        taxid = None
    if taxid is None:
        # Also reached when db_xref exists but holds no "taxon:" entry; this
        # case previously left "taxonomy_id" unset and ended in a KeyError.
        print("!!!!!!Unable to get TAXID for this record setting it to 1")
        taxid = 1  # unable to identify
    taxonomy_data["taxonomy_id"] = taxid

    lineage = _fetch_lineage(taxid, max_attempts)
    for rank in ("phylum", "class"):
        name = lineage.get(rank)
        # Store plain str values rather than the parser's own string type.
        taxonomy_data[rank] = str(name) if name is not None else None
    return taxonomy_data

In [8]:
# Show the alternative_name row(s) whose name is 'cid' as a DataFrame,
# using the cursor's column metadata for the header.
query = "SELECT * FROM alternative_name WHERE name='cid'"
cursor.execute(query)
pd.DataFrame(
    cursor.fetchall(),
    columns=[column[0] for column in cursor.description],
)

Unnamed: 0,id,name,taxonomy,gene,splice,histone
0,65,cid,,,,cenH3_(Plants)


In [14]:
# Load the whole publication table, then display the rows whose id starts
# with "hara".
query = "SELECT * FROM publication "
cursor.execute(query)
df = pd.DataFrame(
    cursor.fetchall(),
    columns=[column[0] for column in cursor.description],
)
df.loc[df["id"].str.startswith("hara")]

Unnamed: 0,id,title,doi,author,year,pubmed_id


In [23]:
# Fetch the cenH3_(Animals) histone row together with its description row.
# SELECT * over the join returns the columns of both tables, including both
# h.id and hd.id.
query = (
    "SELECT * FROM histone as h "
    "LEFT JOIN histone_description as hd ON h.description = hd.id "
    "WHERE h.id = 'cenH3_(Animals)' "
)
cursor.execute(query)
df = pd.DataFrame(
    cursor.fetchall(),
    columns=[column[0] for column in cursor.description],
)
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cenH3_(Animals),variant,Homo sapiens,9606,94,cenH3,94,cenH3_(Animals) is a centromere-specific histo...,,CENP-A nucleosomes are essential for chromosom...,...,,The nematodes Caenorhabditis elegans and C. re...,,,,,,,,


In [28]:
# Display the "genes" value of the first row of `df` (relies on `df` as left by an earlier cell).
df["genes"].values[0]

'CENP-A nucleosomes are essential for chromosome segregation. A functional kinetochore interacts with active centromeric chromatin reached with CENP-A to form the mitotic spindle. Notably, in certain holocentric organisms, CENP-A appears dispensable for meiotic chromosome segregation while remaining essential for mitosis. As demonstrated in Caenorhabditis elegans, CENP-A loading is specifically eliminated following meiosis I, and RNAi-mediated depletion of CENP-A during meiosis fails to disrupt proper chromosome segregation [monen_differential_2005]. This stands in striking contrast to mitotic divisions, where CENP-A is absolutely required for kinetochore assembly and faithful chromosome segregation. Moreover, C. elegans and C. remanei possess a second cenH3 histone gene that likely performs specialized functions distinct from canonical CENP-A roles [monen_separase_2015]. For more detailed information see the "gene" section.'

In [70]:
# Load every histone_description row and print the ones that mention the
# citation key "henikoff_heterochromatic_2000" in any column.
query = "SELECT * FROM histone_description "
cursor.execute(query)
df = pd.DataFrame(
    cursor.fetchall(),
    columns=[column[0] for column in cursor.description],
)
# Each row is flattened into one string (all columns cast to str and joined)
# so a single substring test covers every field.
for s in df.apply(lambda row: "".join(row.astype(str)), axis=1):
    if "henikoff_heterochromatic_2000" not in s:
        continue
    print(s)
    print("---------------")

In [41]:
# List the database tables again, this time rendered as a DataFrame.
query = "SHOW TABLES; "
cursor.execute(query)
pd.DataFrame(
    cursor.fetchall(),
    columns=[column[0] for column in cursor.description],
)

Unnamed: 0,Tables_in_db_name
0,alternative_name
1,histone
2,histone_description
3,histone_has_publication
4,publication
5,sequence
6,sequence_has_publication


# To Do H3-like

## <span style="color:black">Add H3-like_(Plants)</span>

## <span style="color:black">Add description to H3-like_(Plants)</span>

### <span style="color:black">Add summary</span>

```H3-like_(Plants) is a group of poorly characterized histone variants that exhibit amino acid substitutions resembling those of either H3.3 variants (termed H3.3-like) or H3.1 variants (termed H3.1-like) [alvarez-venegas_canonical_2019]. This group includes three Arabidopsis genes (HTR6, HTR11, and HTR15), four rice genes (HTR701, HTR709, HTR714, and HTR715) and five lily genes (gcH3, gH3, leH3, soH3-1, and soH3-2), which are expected to encode functional proteins [alvarez-venegas_canonical_2019, probst_similar_2020].```

### <span style="color:black">Add function</span>

```The functions of most H3-like genes are still unknown. Nevertheless, the lily isoforms are presumed to participate in specialized chromatin remodeling during male germline development due to their specific expression in the pollen generative cell and association with the male gametophyte [alvarez-venegas_canonical_2019]. Interestingly, phylogenetic analysis clustered gH3 with centromeric histones, suggesting a potential role in centromere function [alvarez-venegas_canonical_2019].```

### <span style="color:black">Add sequence</span>

```The amino acid sequences of the Arabidopsis and rice H3-like variants show no significant homology and lack conserved patterns of amino acid substitutions, suggesting they may have emerged as specialized, divergent variants following lineage-specific duplication events [alvarez-venegas_canonical_2019]. In lily, soH3-1 and soH3-2 exhibit similarity to the conserved H3.3 variant, whereas gcH3, gH3, and leH3 are highly divergent.```

### <span style="color:black">Add caveats</span>

```Alvarez-Venegas et al. classified HTR10 and HTR14 as H3-like histone genes [alvarez-venegas_canonical_2019]. However, recent studies have characterized the corresponding histone variants H3.10 and H3.14 in detail, leading to their reclassification as distinct functional variants (see more on histone variants H3.10_(Arabidopsis) and H3.14_(Arabidopsis)) [borg_targeted_2020, nunez-vazquez_histone_2025].```

## <span style="color:black">Add H3.10_(Arabidopsis) to H3</span>

## <span style="color:black">Add description to H3.10_(Arabidopsis)</span>

### <span style="color:black">Add summary</span>

```H3.10_(Arabidopsis) is a poorly characterized sperm-specific histone variant in Arabidopsis, encoded by the HTR10 gene [borg_targeted_2020, alvarez-venegas_canonical_2019].```

### <span style="color:black">Add function</span>

```H3.10 plays a critical role in reprogramming parental epigenetic memory. The deposition of H3.10 constitutes one of the key mechanisms leading to the loss of H3K27me3 in plant sperm cells [borg_targeted_2020].```

### <span style="color:black">Add sequence</span>

```The amino acid substitutions (Y31, H87, L90) in H3.10 are similar to those in H3.3 [alvarez-venegas_canonical_2019]. The absence of alanine at position 31 in the amino acid sequence around lysine 27 (K27) makes H3.10 immune to methylation at K27 [borg_targeted_2020].```

## <span style="color:black">Add H3.14_(Arabidopsis) to H3</span>

## <span style="color:black">Add description to H3.14_(Arabidopsis)</span>

### <span style="color:black">Add summary</span>

```H3.14_(Arabidopsis) is a specialized histone variant in Arabidopsis (encoded by the HTR14 gene) dedicated to managing the trade-off between growth and stress survival at the chromatin level. Rapid expression of HTR14 is induced by various abiotic stresses (e.g., salt, osmotic stress, oxidative stress, ABA) in a subset of responsive cells of the root transition zone [nunez-vazquez_histone_2025]. Its accumulation peaks during the growth arrest and quiescent phases of the stress response and declines before recovery.```

### <span style="color:black">Add function</span>

```H3.14 plays a dual role in the transcriptional reprogramming during the early stress response. It activates stress-responsive genes and represses growth- and development-related genes [nunez-vazquez_histone_2025].```

### <span style="color:black">Add expression</span>

```H3.14 is expressed in both vegetative tissues and reproductive structures, including the vegetative nucleus of mature pollen, the central cell, and the antipodal cells of the female gametophyte [nunez-vazquez_histone_2025, probst_similar_2020]. However, under normal non-stress conditions, its expression in vegetative cells remains very low but is rapidly and strongly induced in response to abiotic stress [nunez-vazquez_histone_2025].```

### <span style="color:black">Add localization</span>

```H3.14 is exclusively localized in euchromatin (colocalizes with H3K4me3) and is highly dynamic, with a fast turnover rate dependent on active transcription [nunez-vazquez_histone_2025].```

### <span style="color:black">Add deposition</span>

```H3.14 deposition is independent of the major histone chaperones CAF-1 (for H3.1) and HIRA (for H3.3), suggesting a unique incorporation pathway [nunez-vazquez_histone_2025].```

## <span style="color:black">Add description to H3.Y_(Primates)</span>

### <span style="color:black">Add summary</span>

```H3.Y_(Primates) is a primate-specific histone H3 variant induced by the transcription factor DUX4, which incorporates into the chromatin of DUX4 target genes and promotes their enhanced reactivation and transcriptional persistence [talbert_histone_2021, resnick_dux4-induced_2019].```

### <span style="color:black">Add function</span>

```The functional role of H3.Y is to ensure the persistence of expression and enhanced reactivation of DUX4 target genes [talbert_histone_2021, resnick_dux4-induced_2019].```

### <span style="color:black">Add expression</span>

```H3.Y is expressed in early embryos at the cleavage stage and in testicular tissue, consistently co-expressing with DUX4 [talbert_histone_2021, resnick_dux4-induced_2019].```

### <span style="color:black">Add localization</span>

```H3.Y is incorporated into highly expressed genes, particularly those induced by DUX4, where it is enriched throughout the gene body, while in constitutively expressed genes, it is primarily localized to the transcription start site (TSS) region [resnick_dux4-induced_2019].```

## <span style="color:black">IMPORTANT!!!</span>

### Previous description of H3.Y:

**Summary:** H3.Y is involved in memory formation due to its presence in neurons in human hippocampus. One splice isoform has an extended C-terminal alpha tail that may interacts with the H2A.Z acidic patch
**Taxonomic span:** Primates, 9443
**Alternate names:** H3.X

expression

# Close connections

In [35]:
# Release the resources in reverse order of creation. The nested try/finally
# blocks make sure the connection is closed and the SSH tunnel is stopped even
# if an earlier close() raises; the exception still propagates afterwards.
try:
    cursor.close()
finally:
    try:
        conn.close()
    finally:
        tunnel.stop()