In [2]:
import io

import pandas as pd
from Bio import Entrez, SeqIO
from mysql.connector import connection
from sshtunnel import SSHTunnelForwarder

# Set your email address (required for using Entrez)
Entrez.email = "your.email@example.com"

In [5]:
# Load the SSH / database connection settings from a plain "key=value" text
# file. Blank lines and lines starting with "#" are ignored.
with open("db_curated_server_info.txt", "r") as file:
    lines = file.readlines()

config = {}

for line_number, line in enumerate(lines, start=1):
    line = line.strip()
    if not line or line.startswith("#"):
        continue
    key, separator, value = line.partition("=")
    if not separator:
        # A setting line must contain "="; report it by number only (the line
        # may hold a secret) instead of crashing on tuple unpacking.
        print(f"Skipping malformed config line {line_number}: no '=' found")
        continue
    # Strip both sides so "key = value" is parsed the same as "key=value".
    config[key.strip()] = value.strip()

# NOTE(review): the key spellings "srever_port" and "db_adress" presumably
# mirror the settings file, so they are intentionally left unchanged here.
server_name = config.get("server_name")
srever_port = int(config.get("srever_port"))
ssh_password = config.get("ssh_password")
ssh_username = config.get("ssh_username")
db_adress = config.get("db_adress")
db_port = int(config.get("db_port"))

In [6]:
# Open an SSH tunnel to the server given by (server_name, srever_port),
# authenticating with the username/password read from the settings file, and
# bind it to the database address (db_adress, db_port) on the remote side.
tunnel = SSHTunnelForwarder(
    (server_name, srever_port),
    ssh_password=ssh_password,
    ssh_username=ssh_username,
    remote_bind_address=(db_adress, db_port),
)
tunnel.start()
# The local port of the tunnel is what the MySQL connection below connects to.
print(tunnel.local_bind_port)

37591


In [7]:
# Connect to MySQL through the local end of the SSH tunnel
# (localhost:<tunnel.local_bind_port>) and create a cursor for the queries below.
# NOTE(review): user, password and database are hardcoded literals here —
# presumably placeholders; consider loading them from the settings file.
conn = connection.MySQLConnection(
    user="db_user",
    password="db_password",
    host="localhost",
    port=tunnel.local_bind_port,
    database="db_name",
)
cursor = conn.cursor()

In [8]:
# Sanity check of the connection: list the tables of the connected database
# (the result is displayed as a list of one-element tuples).
query = "SHOW TABLES;"
cursor.execute(query)
cursor.fetchall()

[('alternative_name',),
 ('histone',),
 ('histone_description',),
 ('histone_has_publication',),
 ('publication',),
 ('sequence',),
 ('sequence_has_publication',)]

In [9]:
# INSERT statement templates passed to cursor.execute().
# Templates using %(name)s placeholders are filled from a dict of parameters;
# templates using %s placeholders are filled from a sequence.
# The commented-out templates are currently unused and kept for reference.

# add_histone = (
#     "INSERT INTO histone "
#     "(id, level, taxonomic_span, taxonomic_span_id, description, parent) "
#     "VALUES (%(id)s, %(level)s, %(taxonomic_span)s, %(taxonomic_span_id)s, %(description)s, %(parent)s)"
# )
# add_histone_description = (
#     "INSERT INTO histone_description "
#     "(summary, taxonomy, genes, evolution, expression, knock_out, function, sequence, localization, deposition, structure, interactions, disease, caveats) "
#     "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
# )

# Row in `publication`, keyed by the publication id.
add_publication = (
    "INSERT INTO publication (id, title, doi, author, year) "
    "VALUES (%(id)s, %(title)s, %(doi)s, %(author)s, %(year)s)"
)

# add_sequence = (
#     "INSERT INTO sequence "
#     "(accession, variant, gi, ncbi_gene_id, hgnc_gene_name, taxonomy_id, organism, phylum, class, taxonomy_group, info, sequence, variant_under_consideration) "
#     "VALUES (%(accession)s, %(variant)s, %(gi)s, %(ncbi_gene_id)s, %(hgnc_gene_name)s, %(taxonomy_id)s, %(organism)s, %(phylum)s, %(class)s, %(taxonomy_group)s, %(info)s, %(sequence)s, %(variant_under_consideration)s)"
# )
# add_sequence_has_publication = (
#     "INSERT INTO sequence_has_publication "
#     "(sequence_accession, publication_id) "
#     "VALUES (%s, %s)"
# )

# Row in `alternative_name`, linking an alternative name to a histone.
add_alternate_names = (
    "INSERT INTO alternative_name (name, taxonomy, gene, splice, histone) "
    "VALUES (%(name)s, %(taxonomy)s, %(gene)s, %(splice)s, %(histone)s)"
)

# Link row between a histone and a publication (positional parameters).
add_histone_has_publication = (
    "INSERT INTO histone_has_publication (histone_id, publication_id) "
    "VALUES (%s, %s)"
)

In [10]:
def get_taxonomy_data(record):
    """Collect organism, NCBI taxonomy id, phylum and class for a record.

    The taxonomy id is taken from the ``db_xref`` qualifiers of the record's
    first feature (entries of the form ``taxon:<id>``; the last such entry
    wins). If it cannot be determined for any reason, the id falls back to 1.
    Phylum and class are then looked up in the NCBI ``taxonomy`` database via
    ``Entrez.efetch``, retrying up to 10 times on errors.

    Parameters
    ----------
    record
        Object exposing ``annotations["organism"]`` and ``features`` whose
        first element has a ``qualifiers`` mapping. Presumably a Biopython
        ``SeqRecord`` parsed from a GenBank entry — confirm against callers.

    Returns
    -------
    dict
        Keys ``organism``, ``taxonomy_id`` (int), ``phylum`` and ``class``
        (``str``, or ``None`` when the lineage could not be fetched or does
        not contain that rank).

    Raises
    ------
    KeyError
        If ``record.annotations`` has no ``"organism"`` entry.
    """
    import re  # function-scope import: `re` is not imported at file level

    taxonomy_data = {"organism": record.annotations["organism"]}

    taxonomy_id = None
    try:
        for xref in record.features[0].qualifiers["db_xref"]:
            match = re.search(r"(\S+):(\S+)", xref)
            if match is not None and match.group(1) == "taxon":
                taxid = match.group(2)
                print(f"Fetched taxid from NCBI {taxid}")
                taxonomy_id = int(taxid)
    except Exception:
        # Best effort: missing features/qualifiers or a non-numeric id all
        # mean "unknown". KeyboardInterrupt/SystemExit are not swallowed.
        taxonomy_id = None

    if taxonomy_id is None:
        # Also reached when the xrefs simply contain no "taxon:" entry, which
        # previously left the key unset and broke the lookup below.
        print("!!!!!!Unable to get TAXID for this record setting it to 1")
        taxonomy_id = 1  # unable to identify
    taxonomy_data["taxonomy_id"] = taxonomy_id

    lineage = {}
    max_attempts = 10
    for attempt in range(max_attempts):
        try:
            handle = Entrez.efetch(id=taxonomy_id, db="taxonomy", retmode="xml")
            try:
                tax_data = Entrez.read(handle)
            finally:
                # Release the connection even if parsing fails.
                handle.close()
            lineage = {
                entry["Rank"]: entry["ScientificName"]
                for entry in tax_data[0]["LineageEx"]
                if entry["Rank"] in ("class", "phylum")
            }
            break
        except Exception as exc:
            print(
                "Unexpected error: {}, Retrying, attempt {}".format(
                    type(exc), attempt
                )
            )
            if attempt == max_attempts - 1:
                print(
                    f"FATAL ERROR could not get class and phylum from NCBI after {max_attempts} attempts for taxid:{taxonomy_id}. Will add None for class and phylum!"
                )

    # Store plain `str` values (or None) rather than the parser's own types.
    for rank in ("phylum", "class"):
        name = lineage.get(rank)
        taxonomy_data[rank] = None if name is None else str(name)
    return taxonomy_data

In [8]:
# Look up the alternative-name entry "cid" and display the result as a table
# whose column labels come from the cursor's result description.
query = "SELECT * FROM alternative_name WHERE name='cid'"
cursor.execute(query)
pd.DataFrame(
    cursor.fetchall(),
    columns=[column[0] for column in cursor.description],
)

Unnamed: 0,id,name,taxonomy,gene,splice,histone
0,65,cid,,,,cenH3_(Plants)


In [14]:
# Load the whole `publication` table, then show only the rows whose id
# starts with "hara".
query = "SELECT * FROM publication "
cursor.execute(query)
df = pd.DataFrame(
    cursor.fetchall(),
    columns=[column[0] for column in cursor.description],
)
df.loc[df["id"].str.startswith("hara")]

Unnamed: 0,id,title,doi,author,year,pubmed_id


In [23]:
# Fetch the histone row 'cenH3_(Animals)' together with its description row
# (joined on histone.description = histone_description.id).
query = (
    "SELECT * FROM histone as h "
    "LEFT JOIN histone_description as hd ON h.description = hd.id "
    "WHERE h.id = 'cenH3_(Animals)' "
)
cursor.execute(query)
df = pd.DataFrame(
    cursor.fetchall(),
    columns=[column[0] for column in cursor.description],
)
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cenH3_(Animals),variant,Homo sapiens,9606,94,cenH3,94,cenH3_(Animals) is a centromere-specific histo...,,CENP-A nucleosomes are essential for chromosom...,...,,The nematodes Caenorhabditis elegans and C. re...,,,,,,,,


In [28]:
# Show the full, untruncated "genes" text of the first row of `df`.
# NOTE(review): relies on `df` still holding the result of the
# histone / histone_description join cell — re-run that cell first.
df["genes"].values[0]

'CENP-A nucleosomes are essential for chromosome segregation. A functional kinetochore interacts with active centromeric chromatin reached with CENP-A to form the mitotic spindle. Notably, in certain holocentric organisms, CENP-A appears dispensable for meiotic chromosome segregation while remaining essential for mitosis. As demonstrated in Caenorhabditis elegans, CENP-A loading is specifically eliminated following meiosis I, and RNAi-mediated depletion of CENP-A during meiosis fails to disrupt proper chromosome segregation [monen_differential_2005]. This stands in striking contrast to mitotic divisions, where CENP-A is absolutely required for kinetochore assembly and faithful chromosome segregation. Moreover, C. elegans and C. remanei possess a second cenH3 histone gene that likely performs specialized functions distinct from canonical CENP-A roles [monen_separase_2015]. For more detailed information see the "gene" section.'

In [70]:
# Scan every row of `histone_description` for a citation key: each row is
# flattened into one string (all cells cast to str and concatenated) and
# printed in full when it contains the key.
query = "SELECT * FROM histone_description "
cursor.execute(query)
df = pd.DataFrame(
    cursor.fetchall(),
    columns=[column[0] for column in cursor.description],
)
for s in df.apply(lambda row: "".join(row.astype(str)), axis=1):
    if "henikoff_heterochromatic_2000" not in s:
        continue
    print(s)
    print("---------------")

In [41]:
# List the tables of the connected database as a one-column table.
query = "SHOW TABLES; "
cursor.execute(query)
pd.DataFrame(
    cursor.fetchall(),
    columns=[column[0] for column in cursor.description],
)

Unnamed: 0,Tables_in_db_name
0,alternative_name
1,histone
2,histone_description
3,histone_has_publication
4,publication
5,sequence
6,sequence_has_publication


# To Do H3-like

## <span style="color:green">Add description to H3.Y.1_(Primates)</span>

### <span style="color:green">Add summary</span>

```H3.Y.1_(Primates) is a primate-specific histone H3 variant encoded by the gene homologous to human H3Y1 [wiedemann_identification_2010].```

## <span style="color:black">Update description to H3.Y.2_(Primates)</span>

### <span style="color:black">Update summary</span>

```H3.Y.2_(Primates) is a primate-specific histone H3 variant (also known as H3.X) encoded by the gene homologous to human H3Y2 [wiedemann_identification_2010].```

## <span style="color:black">Add description to H3.Y.1_(Homo_sapiens)</span>

### <span style="color:black">Add summary</span>

```H3.Y.1_(Homo_sapiens) is a human histone H3 variant encoded by the H3Y1 gene. Human H3.Y.1, together with H3.Y.2, promotes cell growth, regulates cell cycle genes, and is implicated in primate-specific brain function [wiedemann_identification_2010]. It also facilitates sustained expression of DUX4-target genes [talbert_histone_2021, resnick_dux4-induced_2019].```

### <span style="color:black">Add function</span>

```Although the functions of H3.Y.1 remain poorly understood, it is known to promote cell growth and regulate the expression of genes involved in cell cycle control and mitosis [wiedemann_identification_2010]. The presence of H3.Y.1 (along with H3.Y.2) in hippocampal neurons suggests a potential role in primate-specific brain functions [wiedemann_identification_2010]. Furthermore, H3.Y.1 and H3.Y.2 are induced by the transcription factor DUX4 to facilitate the persistence and reactivation of DUX4 target genes following its transient expression [talbert_histone_2021, resnick_dux4-induced_2019]. Interestingly, H3.Y has been identified as a specific marker of 8-cell-like cells (8CLCs) and is also detected in vivo within the nuclei of human 8-cell embryos at the peak of zygotic genome activation (ZGA) [taubenschmid-stowers_8c-like_2022]. During blastomere division, H3.Y shows strong association with condensed chromosomes at prophase and metaphase stages [taubenschmid-stowers_8c-like_2022]. Taubenschmid-Stowers et al. suggest that H3.Y may be necessary for large-scale genome activation during early human embryogenesis [taubenschmid-stowers_8c-like_2022].```

### <span style="color:black">Add sequence</span>

```The H3.Y.1 protein consists of 135 amino acids and shares high similarity with H3.Y.2 (89.7% identity), primarily differing by a shorter C-terminal tail [wiedemann_identification_2010]. Although its sequence resembles that of H3.3, H3.Y.1 contains specific amino acid substitutions at known post-translational modification sites of canonical H3 variants: S10A, S28R, K14Q, and K79S [wiedemann_identification_2010]. Mass spectrometry analysis confirmed that H3.Y.1 undergoes acetylation at lysines 18, 23, and 27 [wiedemann_identification_2010]. The H3.Y-specific residues, such as Lys42, Leu46, Lys53, and Gln59, are located at the nucleosomal DNA entry/exit sites and may potentially influence DNA-histone interactions and nucleosome stability [kujirai_structure_2016].```

### <span style="color:black">Add expression</span>

```H3.Y.1 and H3.Y.2 are detected at low levels in certain cell lines (e.g., osteosarcoma U2OS), as well as in a range of normal (brain, testis) and malignant (bone, breast, lung, and ovarian tumors) human tissues [wiedemann_identification_2010]. In addition, H3.Y.1 and H3.Y.2 are expressed in early embryos at the cleavage stage and in testicular tissue, consistently co-expressing with DUX4 [talbert_histone_2021, resnick_dux4-induced_2019].```

### <span style="color:black">Add localization</span>

```Endogenous H3.Y.1 is predominantly localized outside dense DAPI regions (heterochromatin), associating with less condensed, transcriptionally active euchromatin enriched in H3K4me3 [wiedemann_identification_2010]. H3.Y.1 and H3.Y.2 are incorporated into highly expressed genes, particularly those induced by DUX4, where they are enriched throughout the gene body, while in constitutively expressed genes, they are primarily localized to the transcription start site (TSS) region [resnick_dux4-induced_2019].```

### <span style="color:black">Add deposition</span>

```H3.Y.1 is deposited into chromatin via the HIRA chaperone complex, which facilitates its replication-independent incorporation into actively transcribed genomic regions [resnick_dux4-induced_2019]. Despite its high similarity to H3.3, H3.Y.1 is incapable of interacting with the DAXX/ATRX complex responsible for H3.3 deposition into heterochromatin [zink_h3y_2017].```

### <span style="color:black">Add structure</span>

```The crystal structure of the H3.Y.1 nucleosome reveals that its specific amino acid residues located at the DNA entry/exit sites result in increased DNA end flexibility compared to H3.3-containing nucleosomes, as well as reduced binding of linker histone H1 [kujirai_structure_2016]. This facilitates transcription factor access to DNA and may promote transcription activation. Kujirai et al. suggest that the heterotypic H3.Y/H3.3 nucleosome, which retains the same biochemical properties as its homotypic counterpart, is likely the predominant form in cells [kujirai_structure_2016].```

### <span style="color:black">Add knock_out</span>

```Knockdown of both genes encoding H3.Y (H3Y1 and H3Y2) using siRNA suppresses the super-induction of DUX4 target genes upon reactivation and reduces the persistence of their expression, but does not affect constitutively expressed genes [resnick_dux4-induced_2019].```

## <span style="color:black">Add description to H3.Y.2_(Homo_sapiens)</span>

### <span style="color:black">Add summary</span>

```H3.Y.2_(Homo_sapiens) is a human histone H3 variant (also known as H3.X) encoded by the H3Y2 gene. The protein sequence of H3Y2 differs from that of H3Y1 by the presence of an additional 11 amino acid residues at the C-terminal tail [ding_primate-specific_2021]. Human H3.Y.2, together with H3.Y.1, promotes cell growth, regulates cell cycle genes, and is implicated in primate-specific brain function [wiedemann_identification_2010]. It also facilitates sustained expression of DUX4-target genes [talbert_histone_2021, resnick_dux4-induced_2019].```

### <span style="color:black">Add function</span>

```The functions of H3.Y.2 remain poorly understood. However, the presence of H3.Y.2 (along with H3.Y.1) in hippocampal neurons suggests a potential role in primate-specific brain functions [wiedemann_identification_2010]. In addition, H3.Y.1 and H3.Y.2 are induced by the transcription factor DUX4 to facilitate the persistence and reactivation of DUX4 target genes following its transient expression [talbert_histone_2021, resnick_dux4-induced_2019].```

### <span style="color:black">Add sequence</span>

```The H3.Y.2 protein consists of 146 amino acids and shares high similarity with H3.Y.1 (89.7% identity), primarily differing by an additional 11 amino acid residues at the C-terminal tail with no sequence homology to other proteins [wiedemann_identification_2010]. Although its sequence resembles that of H3.3, H3.Y.2 contains specific amino acid substitutions at known post-translational modification sites of canonical H3 variants: S10A, S28R, K14Q, and K79S [wiedemann_identification_2010]. The H3.Y-specific residues, such as Lys42, Leu46, Lys53, and Gln59, are located at the nucleosomal DNA entry/exit sites and may potentially influence DNA-histone interactions and nucleosome stability [kujirai_structure_2016].```

### <span style="color:black">Add expression</span>

```H3.Y.1 and H3.Y.2 are detected at low levels in certain cell lines (e.g., osteosarcoma U2OS), as well as in a range of normal (brain, testis) and malignant (bone, breast, lung, and ovarian tumors) human tissues [wiedemann_identification_2010]. In addition, H3.Y.1 and H3.Y.2 are expressed in early embryos at the cleavage stage and in testicular tissue, consistently co-expressing with DUX4 [talbert_histone_2021, resnick_dux4-induced_2019].```

### <span style="color:black">Add localization</span>

```H3.Y.1 and H3.Y.2 are incorporated into highly expressed genes, particularly those induced by DUX4, where they are enriched throughout the gene body, while in constitutively expressed genes, they are primarily localized to the transcription start site (TSS) region [resnick_dux4-induced_2019].```

### <span style="color:black">Add deposition</span>

```H3.Y.2 is deposited into chromatin via the HIRA chaperone complex, which facilitates its replication-independent incorporation into actively transcribed genomic regions [resnick_dux4-induced_2019]. Despite its high similarity to H3.3, H3.Y.2 is incapable of interacting with the DAXX/ATRX complex responsible for H3.3 deposition into heterochromatin [zink_h3y_2017].```

### <span style="color:black">Add structure</span>

```The crystal structure of the H3.Y.1 nucleosome reveals that its specific amino acid residues located at the DNA entry/exit sites result in increased DNA end flexibility compared to H3.3-containing nucleosomes, as well as reduced binding of linker histone H1 [talbert_histone_2021, kujirai_structure_2016]. This facilitates transcription factor access to DNA and may promote transcription activation. Kujirai et al. suggest that the heterotypic H3.Y/H3.3 nucleosome, which retains the same biochemical properties as its homotypic counterpart, is likely the predominant form in cells [kujirai_structure_2016].```

### <span style="color:black">Add knock_out</span>

```Knockdown of both genes encoding H3.Y (H3Y1 and H3Y2) using siRNA suppresses the super-induction of DUX4 target genes upon reactivation and reduces the persistence of their expression, but does not affect constitutively expressed genes [resnick_dux4-induced_2019].```

## <span style="color:black">ВАЖНО!!!</span>

### Предыдущее описание H3.Y:

**Summary:** H3.Y is involved in memory formation due to its presence in neurons in human hippocampus. One splice isoform has an extended C-terminal alpha tail that may interacts with the H2A.Z acidic patch
**Taxonomic span:** Primates, 9443
**Alternate names:** H3.X

# Close connections

In [35]:
# Release the database resources first, then tear down the SSH tunnel.
# The tunnel is stopped in `finally` so it is not left running if closing
# the cursor or the connection raises.
try:
    cursor.close()
    conn.close()
finally:
    tunnel.stop()