In [1]:
import io

import pandas as pd
from Bio import Entrez, SeqIO
from mysql.connector import connection
from sshtunnel import SSHTunnelForwarder

# Set your email address (required for using Entrez)
Entrez.email = "your.email@example.com"

In [2]:
# Load the SSH / database connection settings from a plain "key=value" file.
# Blank lines and lines starting with "#" are ignored.
with open("db_curated_server_info.txt", "r") as file:
    lines = file.readlines()

config = {}

for line_number, line in enumerate(lines, start=1):
    line = line.strip()
    if not line or line.startswith("#"):
        continue
    key, separator, value = line.partition("=")
    if not separator:
        # Report only the position: the line itself may contain a secret.
        raise ValueError(
            f"db_curated_server_info.txt, line {line_number}: expected 'key=value'"
        )
    # Strip the key as well as the value so that "key = value" is accepted.
    config[key.strip()] = value.strip()

# NOTE: "srever_port" and "db_adress" are misspelled, but they are the key names
# this notebook reads from the settings file, so they are kept as they are.
server_name = config.get("server_name")
# Ports are mandatory: index directly so a missing key is reported by name
# instead of failing inside int(None).
srever_port = int(config["srever_port"])
ssh_password = config.get("ssh_password")
ssh_username = config.get("ssh_username")
db_adress = config.get("db_adress")
db_port = int(config["db_port"])

In [3]:
# Open an SSH tunnel to the server from the settings file and forward a local
# port to the database address/port reachable from that server.
ssh_endpoint = (server_name, srever_port)
database_endpoint = (db_adress, db_port)
tunnel = SSHTunnelForwarder(
    ssh_endpoint,
    ssh_username=ssh_username,
    ssh_password=ssh_password,
    remote_bind_address=database_endpoint,
)
tunnel.start()
# The local port is what the MySQL connection below has to connect to.
print(tunnel.local_bind_port)

45073


In [4]:
# Connect to MySQL through the local end of the SSH tunnel.
# Credentials and database name are taken from the settings file when it defines
# "db_user", "db_password" and "db_name"; otherwise the previous literal values
# are used, so existing setups keep working unchanged.
conn = connection.MySQLConnection(
    user=config.get("db_user", "db_user"),
    password=config.get("db_password", "db_password"),
    host="localhost",
    port=tunnel.local_bind_port,
    database=config.get("db_name", "db_name"),
)
cursor = conn.cursor()

In [5]:
# Sanity check of the connection: list the tables of the selected database.
query = "SHOW TABLES;"
cursor.execute(query)
tables = cursor.fetchall()
tables

[('alternative_name',),
 ('histone',),
 ('histone_description',),
 ('histone_has_publication',),
 ('publication',),
 ('sequence',),
 ('sequence_has_publication',)]

In [39]:
# add_histone = (
#     "INSERT INTO histone "
#     "(id, level, taxonomic_span, taxonomic_span_id, description, parent) "
#     "VALUES (%(id)s, %(level)s, %(taxonomic_span)s, %(taxonomic_span_id)s, %(description)s, %(parent)s)"
# )
# add_histone_description = (
#     "INSERT INTO histone_description "
#     "(summary, taxonomy, genes, evolution, expression, knock_out, function, sequence, localization, deposition, structure, interactions, disease, caveats) "
#     "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
# )
def _insert_statement(table, columns, named=True):
    """Build an INSERT statement for `table` with one placeholder per column.

    With `named=True` the placeholders are `%(column)s` (for dict parameters),
    otherwise positional `%s` placeholders are emitted.
    """
    if named:
        placeholders = [f"%({column})s" for column in columns]
    else:
        placeholders = ["%s"] * len(columns)
    return (
        f"INSERT INTO {table} "
        f"({', '.join(columns)}) "
        f"VALUES ({', '.join(placeholders)})"
    )


# Statement templates used to load publications, sequences and their links.
add_publication = _insert_statement(
    "publication",
    ["id", "title", "doi", "author", "year"],
)
add_sequence = _insert_statement(
    "sequence",
    [
        "accession",
        "variant",
        "gi",
        "ncbi_gene_id",
        "hgnc_gene_name",
        "taxonomy_id",
        "organism",
        "phylum",
        "class",
        "taxonomy_group",
        "info",
        "sequence",
        "variant_under_consideration",
    ],
)
add_sequence_has_publication = _insert_statement(
    "sequence_has_publication",
    ["sequence_accession", "publication_id"],
    named=False,
)
# add_alternate_names = (
#     "INSERT INTO alternative_name "
#     "(name, taxonomy, gene, splice, histone) "
#     "VALUES (%(name)s, %(taxonomy)s, %(gene)s, %(splice)s, %(histone)s)"
# )
# add_histone_has_publication = (
#     "INSERT INTO histone_has_publication "
#     "(histone_id, publication_id) "
#     "VALUES (%s, %s)"
# )

In [38]:
def get_taxonomy_data(record, max_attempts=10):
    """Collect organism name, taxonomy id, phylum and class for a sequence record.

    Parameters
    ----------
    record
        Object exposing `annotations["organism"]` and
        `features[0].qualifiers["db_xref"]` (a list of "<db>:<id>" strings).
    max_attempts : int, optional
        How many times the taxonomy lookup through `Entrez` is tried before
        giving up (default 10).

    Returns
    -------
    dict
        Keys "organism", "taxonomy_id", "phylum" and "class". "taxonomy_id" is
        1 when no usable "taxon:<id>" cross-reference is found; "phylum" and
        "class" are None when the lineage lookup fails or lacks that rank.

    Raises
    ------
    KeyError
        If the record has no "organism" annotation.
    """
    import re  # local import: the notebook's import cell does not provide `re`

    taxonomy_data = {"organism": record.annotations["organism"]}

    # Cross-references look like "taxon:9606"; only the "taxon" one is of interest.
    xref_pattern = re.compile(r"(\S+):(\S+)")
    try:
        for xref in record.features[0].qualifiers["db_xref"]:
            match = xref_pattern.search(xref)
            if match is None or match.group(1) != "taxon":
                continue
            taxid = match.group(2)
            print(f"Fetched taxid from NCBI {taxid}")
            taxonomy_data["taxonomy_id"] = int(taxid)
    except Exception:
        # Best effort: missing features/qualifiers or a non-numeric id are
        # handled by the fallback below.
        pass
    if "taxonomy_id" not in taxonomy_data:
        # Also reached when db_xref exists but holds no "taxon" entry; without
        # this default the lookup below would fail on the missing key.
        print("!!!!!!Unable to get TAXID for this record setting it to 1")
        taxonomy_data["taxonomy_id"] = 1  # unable to identify

    lineage = {}
    for attempt in range(max_attempts):
        try:
            handle = Entrez.efetch(
                id=taxonomy_data["taxonomy_id"], db="taxonomy", retmode="xml"
            )
            try:
                tax_data = Entrez.read(handle)
            finally:
                handle.close()
            lineage = {
                entry["Rank"]: entry["ScientificName"]
                for entry in tax_data[0]["LineageEx"]
                if entry["Rank"] in ("class", "phylum")
            }
            break
        except Exception as exc:
            print(
                "Unexpected error: {}, Retrying, attempt {}".format(
                    type(exc), attempt
                )
            )
            if attempt == max_attempts - 1:
                print(
                    f"FATAL ERROR could not get class and phylum from NCBI after {max_attempts} attempts for taxid:{taxonomy_data['taxonomy_id']}. Will add None for class and phylum!"
                )

    # Store plain strings (or None) rather than the parser's own string type.
    for rank in ("phylum", "class"):
        name = lineage.get(rank)
        taxonomy_data[rank] = None if name is None else str(name)
    return taxonomy_data

In [None]:
# Empty histone_description record: every text field starts out as None.
data_histone_description = dict.fromkeys(
    (
        "summary",
        "taxonomy",
        "genes",
        "evolution",
        "expression",
        "knock_out",
        "function",
        "sequence",
        "localization",
        "deposition",
        "structure",
        "interactions",
        "disease",
        "caveats",
    )
)

# To Do H2B

## <span style="color:green">Update description of H2B.V_(Trypanosoma)</span>

### <span style="color:green">Update expression</span>

```H2B.V transcript levels peak during the S phase of the cell cycle, similarly to canonical histones, whereas protein abundance progressively accumulates as the cycle progresses. A significant increase in H2B.V levels is observed during the initial stages of metacyclogenesis, with maximum abundance detected in metacyclic trypomastigotes [roson_new_2025].```

### <span style="color:green">Update function</span>

```The role of H2B.V appears to involve the regulation of parasite differentiation and virulence processes [roson_new_2025].```

### <span style="color:green">Update localization</span>

```H2B.V, together with H4.V, marks new transcription initiation and termination sites. H2B.V localizes to divergent strand-switch regions, which demarcate transcription initiation sites, as well as near certain tRNA loci [roson_new_2025].```

### <span style="color:green">Update knock_out</span>

```Parasites with heterozygous knockout of H2B.V exhibit increased differentiation into metacyclic forms and enhanced invasion of mammalian host cells [roson_new_2025].```

### <span style="color:green">Update interactions</span>

```H2B.V dimerizes with the histone variant H2A.Z. The H2B.V–H2A.Z dimers are more unstable compared to canonical H2B–H2A dimers, thereby promoting a more accessible chromatin configuration permissive for transcription initiation [roson_new_2025].```

# To Do H3

## <span style="color:green">Update description of H3.V_(Trypanosomatidae)</span>

### <span style="color:green">Update summary</span>

```H3.V is a histone variant characterized so far in some trypanosomatids. It shares ~45-60% sequence identity with canonical H3. H3.V is not essential for viability. Although H3.V associated with the mitotic spindle and enriched at telomeres it is not essential for mini-chromosome segregation, telomere maintenance or transcriptional silencing at the telomere-proximal expression sites [lowell_variant_2004, anderson_kinetoplastid-specific_2013]. While the general structure of H3.V resembles that of CenH3, it does not possess the specific sequence features required to be classified as a CenH3 variant [lowell_variant_2004].```

### <span style="color:green">Update function</span>

```H3.V plays a critical role in the regulation of antigenic variation. Together with H4.V they are involved in the positioning and expression of variant surface glycoprotein (VSG) genes, which the parasite Trypanosoma brucei switches to avoid immune detection [muller_genome_2018, schulz_base_2016]. H3.V is important for transcription termination and expression of downstream genes. The trypanosomatid-specific hyper-modified DNA base J and H3.V collaboratively influence the termination process resulting in read-through transcription and increased expression of downstream genes [siegel_four_2009, reynolds_histone_2016, schulz_base_2016, mcdonald_localization_2022].```

### <span style="color:green">Update localization</span>

```In Trypanosoma brucei, H3.V, together with H4.V, is enriched at transcription termination sites and in telomeric regions [anderson_kinetoplastid-specific_2013, roson_new_2025].```

### <span style="color:green">Update knock_out</span>

```While H3.V is dispensable for parasite viability, its deletion impairs variant surface glycoprotein (VSG) silencing. However, simultaneous deletion of H3.V, H4.V, and DNA base J leads to critical alterations in cell growth and replication [roson_new_2025].```

# To Do H1

## <span style="color:green">Update description of cH1.5</span>

### <span style="color:green">Update summary</span>

```cH1.5 is a replication dependent linker histone located within large histone gene clusters in mammals. cH1.5 plays a critical role in chromatin organization, thereby influencing cellular development and homeostasis [behrends_linker_2020].```

### <span style="color:green">Update expression</span>

```cH1.5 is present across various cell types. As demonstrated on human cells, its expression level changes during differentiation: for instance, it decreases during the differentiation of NT2 cells into neural cells but increases during the dedifferentiation of keratinocytes into induced pluripotent stem cells [behrends_linker_2020].```

## <span style="color:green">Update description of cH1.5_(Homo_sapiens)</span>

### <span style="color:green">Update summary</span>

```cH1.5_(Homo_sapiens) is a replication dependent linker histone in humans. cH1.5 plays a critical role in maintaining centromere integrity and is essential for sustaining appropriate transcriptional programs in differentiated cell types in human cells [saha_linker_2025, li_dynamic_2012].```

### <span style="color:green">Update expression</span>

```cH1.5 is present across various human cell types. Notably, it serves as the predominant H1 variant in astrocyte-lineage SVGp12 cells and in glioblastoma multiforme cells [saha_linker_2025]. cH1.5 expression is downregulated during cellular differentiation. Compared to normal differentiated somatic cells, cH1.5 exhibits higher expression levels in both pluripotent cells and fibroblasts [li_dynamic_2012].```

### <span style="color:green">Update function</span>

```cH1.5 is essential for the regulation of mitotic integrity and for sustaining appropriate transcriptional programs in differentiated cell types [saha_linker_2025, li_dynamic_2012].```

### <span style="color:green">Update localization</span>

```In differentiated cells, cH1.5 forms enrichment blocks in both genic and intergenic regions, preferentially associating with genes encoding membrane and membrane-associated proteins [li_dynamic_2012]. Moreover, cH1.5 is enriched at centromeric regions and co‑localizes with the centromeric histone CENP‑A in human, as confirmed by immunofluorescence and chromatin immunoprecipitation assays [saha_linker_2025]. cH1.5 binding is also associated with gene repression, chromatin compaction and promoting the inclusion of alternatively spliced exons into the mature mRNA [li_dynamic_2012, behrends_linker_2020].```

>It is also enriched around splicing sites, particularly on genes with alternative splicing, where its binding promotes the inclusion of alternatively spliced exons into the mature mRNA (see behrends_linker_2020).

### <span style="color:green">Update knock_out</span>

```cH1.5 depletion leads to decreased cell growth, increased chromatin accessibility and deregulation of gene expression [li_dynamic_2012, behrends_linker_2020]. Additionally, knockdown of cH1.5 results in the loss of centromeric α‑satellite DNA transcription, a reduction in de novo CENP‑A loading, and the accumulation of mitotic defects [saha_linker_2025].```

### <span style="color:green">Update structure</span>

```cH1.5 binds to CENP‑A nucleosomes in a non‑canonical, dyad‑independent manner [saha_linker_2025].```

### <span style="color:green">Update interactions</span>

```cH1.5 directly interacts with CENP‑A‑containing nucleosomes both in vitro and in vivo [saha_linker_2025].```

### <span style="color:green">Update disease</span>

```cH1.5 can serve as a valuable prognostic marker in cancer patients. Its expression level is decreased in many cancer cell lines, while being highly overexpressed in glioblastoma multiforme [saha_linker_2025, behrends_linker_2020, li_dynamic_2012]. Furthermore, cH1.5 mutations are associated with follicular lymphoma and altered chromatin states. Impairment of cH1.5 function may contribute to chromosomal instability and the development of malignancies, particularly in brain cells [saha_linker_2025].```

>Human H1.5 binds right at the center of the dyad, symmetrically [wu_binding_2021, bednar_structure_2017].

## <span style="color:green">Update description of cH1.1</span>

### <span style="color:green">Update summary</span>

```cH1.1 is a replication dependent linker histone located within large histone gene clusters in mammals. cH1.1 plays an important role in the early stages of embryogenesis [funaya_involvement_2023].```

### <span style="color:green">Update function</span>

```cH1.1 plays a critical role in the early stages of embryogenesis. cH1.1 is absent in human differentiated keratinocytes and fibroblasts but is re-expressed upon reprogramming, suggesting that it is specific to pluripotent cells [terme_histone_2011]. In mice, it is essential for preimplantation development, particularly at the morula and blastocyst stages [funaya_involvement_2023].```

### <span style="color:green">Update expression</span>

```cH1.1 expression predominates in early embryonic cells compared to other replication-dependent linker histone variants. In mice, cH1.1—alongside H1.8—exhibits the highest expression levels at the one- and two-cell stages of embryonic development [funaya_involvement_2023, funaya_linker_2018]. Although its expression declines by the four-cell stage, it remains elevated through the blastocyst stage. In human cells, cH1.1 has also been shown to be highly expressed in pluripotent cells—including embryonic and induced pluripotent stem cells—and absent in differentiated keratinocytes and fibroblasts [terme_histone_2011].```

### <span style="color:green">Update localization</span>

```In mice, cH1.1 (H1a) is associated with actively transcribed genes, whereas the somatic linker histone variants cH1.2–5 are excluded from genomic regions occupied by cH1.1, indicating that cH1.1 exhibits a distinct and non-overlapping genomic localization [funaya_involvement_2023].```

### <span style="color:green">Update knock_out</span>

```Maternal knockout of the cH1.1 gene in mice results in reduced litter size and impaired embryonic development during the morula-to-blastocyst transition [funaya_involvement_2023].```



# To Do H3

## <span style="color:green">Add sequences to H3-like_(Plants)</span>

Accessions: NP_172794.1 (HTR6), NP_201338.1 (HTR11),  NP_196795.1 (HTR15) [alvarez-venegas_canonical_2019, okada_analysis_2005]

## <span style="color:green">Add sequences to H3-like_(Plants)</span>

Accessions: ABA97899.1 (HTR714), ABA97902.1 (HTR715) [alvarez-venegas_canonical_2019, hu_identification_2015]

## <span style="color:green">Add sequences to H3-like_(Plants)</span>

Accessions: BAE48427.1 (gcH3), BAA96098.1 (gH3), BAE48431.1 (leH3), BAE48433.1 (soH3-1), BAE48435.1 (soH3-2) [alvarez-venegas_canonical_2019, probst_similar_2020]

# DONE H2B

## <span style="color:black">Update description of H2B.V_(Trypanosoma)</span>

### <span style="color:black">Update expression</span>

```H2B.V transcript levels peak during the S phase of the cell cycle, similarly to canonical histones, whereas protein abundance progressively accumulates as the cycle progresses. A significant increase in H2B.V levels is observed during the initial stages of metacyclogenesis, with maximum abundance detected in metacyclic trypomastigotes [roson_new_2025].```

### <span style="color:black">Update function</span>

```The role of H2B.V appears to involve the regulation of parasite differentiation and virulence processes [roson_new_2025].```

### <span style="color:black">Update localization</span>

```H2B.V, together with H4.V, marks new transcription initiation and termination sites. H2B.V localizes to divergent strand-switch regions, which demarcate transcription initiation sites, as well as near certain tRNA loci [roson_new_2025].```

### <span style="color:black">Update knock_out</span>

```Parasites with heterozygous knockout of H2B.V exhibit increased differentiation into metacyclic forms and enhanced invasion of mammalian host cells [roson_new_2025].```

### <span style="color:black">Update interactions</span>

```H2B.V dimerizes with the histone variant H2A.Z. The H2B.V–H2A.Z dimers are more unstable compared to canonical H2B–H2A dimers, thereby promoting a more accessible chromatin configuration permissive for transcription initiation [roson_new_2025].```

In [7]:
# Fetch the H2B.V_(Trypanosoma) histone row joined with its description record.
# Both tables have an `id` column, so `to_dict()` keeps only the last of the two.
query = " ".join(
    [
        "SELECT * FROM histone h LEFT JOIN histone_description hd",
        "ON h.description = hd.id",
        "WHERE h.id='H2B.V_(Trypanosoma)'",
    ]
)
cursor.execute(query)
rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(rows, columns=column_names)
df.to_dict()

{'id': {0: 41},
 'level': {0: 'variant_group'},
 'taxonomic_span': {0: 'Trypanosoma'},
 'taxonomic_span_id': {0: '5690'},
 'description': {0: 41},
 'parent': {0: 'H2B'},
 'summary': {0: 'H2B.V_(Trypanosoma) is a histone variant characterized so far in Trypanosoma brucei. It shares ~38% sequence identity with major H2B. H2B.V is essential for viability. H2A.Z and H2B.V colocalize throughout the cell cycle and exhibit nearly identical genomic distribution. Data strongly suggest that H2A.Z and H2B.V function together within a single nucleosome [lowell_histone_2005]. H2BV possibly regulates H3 K4 and K76 trimethylation in Trypanosoma brucei [mandava_trypanosome_2008].'},
 'taxonomy': {0: 'Trypanosoma'},
 'genes': {0: 'null'},
 'evolution': {0: 'null'},
 'expression': {0: 'null'},
 'knock_out': {0: 'null'},
 'function': {0: 'null'},
 'sequence': {0: 'null'},
 'localization': {0: 'null'},
 'deposition': {0: 'null'},
 'structure': {0: 'null'},
 'interactions': {0: 'null'},
 'disease': {0: 'nu

In [9]:
# New description texts for H2B.V_(Trypanosoma) (histone_description row id=41).
expression_desc = "H2B.V transcript levels peak during the S phase of the cell cycle, similarly to canonical histones, whereas protein abundance progressively accumulates as the cycle progresses. A significant increase in H2B.V levels is observed during the initial stages of metacyclogenesis, with maximum abundance detected in metacyclic trypomastigotes [roson_new_2025]."
knock_out_desc = "Parasites with heterozygous knockout of H2B.V exhibit increased differentiation into metacyclic forms and enhanced invasion of mammalian host cells [roson_new_2025]."
function_desc = "The role of H2B.V appears to involve the regulation of parasite differentiation and virulence processes [roson_new_2025]."
localization_desc = "H2B.V, together with H4.V, marks new transcription initiation and termination sites. H2B.V localizes to divergent strand-switch regions, which demarcate transcription initiation sites, as well as near certain tRNA loci [roson_new_2025]."
interactions_desc = "H2B.V dimerizes with the histone variant H2A.Z. The H2B.V–H2A.Z dimers are more unstable compared to canonical H2B–H2A dimers, thereby promoting a more accessible chromatin configuration permissive for transcription initiation [roson_new_2025]."

# Pass the texts as parameters instead of pasting them into the SQL string, so
# quotes or backslashes inside a description cannot break the statement.
query = (
    "UPDATE histone_description SET "
    "expression=%(expression)s, knock_out=%(knock_out)s, function=%(function)s, "
    "localization=%(localization)s, interactions=%(interactions)s "
    "WHERE id=%(id)s"
)
params = {
    "expression": expression_desc,
    "knock_out": knock_out_desc,
    "function": function_desc,
    "localization": localization_desc,
    "interactions": interactions_desc,
    "id": 41,
}
cursor.execute(query, params)
# Show the statement that was actually sent to the server.
print(cursor.statement)

UPDATE histone_description SET expression='H2B.V transcript levels peak during the S phase of the cell cycle, similarly to canonical histones, whereas protein abundance progressively accumulates as the cycle progresses. A significant increase in H2B.V levels is observed during the initial stages of metacyclogenesis, with maximum abundance detected in metacyclic trypomastigotes [roson_new_2025].', knock_out='Parasites with heterozygous knockout of H2B.V exhibit increased differentiation into metacyclic forms and enhanced invasion of mammalian host cells [roson_new_2025].', function='The role of H2B.V appears to involve the regulation of parasite differentiation and virulence processes [roson_new_2025].', localization='H2B.V, together with H4.V, marks new transcription initiation and termination sites. H2B.V localizes to divergent strand-switch regions, which demarcate transcription initiation sites, as well as near certain tRNA loci [roson_new_2025].', interactions='H2B.V dimerizes with

In [10]:
# Re-read the H2B.V_(Trypanosoma) row to check the description fields after the UPDATE.
select_clause = "SELECT * FROM histone h LEFT JOIN histone_description hd "
join_clause = "ON h.description = hd.id "
where_clause = "WHERE h.id='H2B.V_(Trypanosoma)'"
query = select_clause + join_clause + where_clause
cursor.execute(query)
fetched_rows = cursor.fetchall()
df = pd.DataFrame(
    fetched_rows,
    columns=[description[0] for description in cursor.description],
)
df.to_dict()

{'id': {0: 41},
 'level': {0: 'variant_group'},
 'taxonomic_span': {0: 'Trypanosoma'},
 'taxonomic_span_id': {0: '5690'},
 'description': {0: 41},
 'parent': {0: 'H2B'},
 'summary': {0: 'H2B.V_(Trypanosoma) is a histone variant characterized so far in Trypanosoma brucei. It shares ~38% sequence identity with major H2B. H2B.V is essential for viability. H2A.Z and H2B.V colocalize throughout the cell cycle and exhibit nearly identical genomic distribution. Data strongly suggest that H2A.Z and H2B.V function together within a single nucleosome [lowell_histone_2005]. H2BV possibly regulates H3 K4 and K76 trimethylation in Trypanosoma brucei [mandava_trypanosome_2008].'},
 'taxonomy': {0: 'Trypanosoma'},
 'genes': {0: 'null'},
 'evolution': {0: 'null'},
 'expression': {0: 'H2B.V transcript levels peak during the S phase of the cell cycle, similarly to canonical histones, whereas protein abundance progressively accumulates as the cycle progresses. A significant increase in H2B.V levels is ob

In [11]:
# Commit the open transaction so the UPDATE executed above is stored in the database
conn.commit()

# DONE H3

## <span style="color:black">Update description of H3.V_(Trypanosomatidae)</span>

### <span style="color:black">Update summary</span>

```H3.V is a histone variant characterized so far in some trypanosomatids. It shares ~45-60% sequence identity with canonical H3. H3.V is not essential for viability. Although H3.V associated with the mitotic spindle and enriched at telomeres it is not essential for mini-chromosome segregation, telomere maintenance or transcriptional silencing at the telomere-proximal expression sites [lowell_variant_2004, anderson_kinetoplastid-specific_2013]. While the general structure of H3.V resembles that of CenH3, it does not possess the specific sequence features required to be classified as a CenH3 variant [lowell_variant_2004].```

### <span style="color:black">Update function</span>

```H3.V plays a critical role in the regulation of antigenic variation. Together with H4.V they are involved in the positioning and expression of variant surface glycoprotein (VSG) genes, which the parasite Trypanosoma brucei switches to avoid immune detection [muller_genome_2018, schulz_base_2016]. H3.V is important for transcription termination and expression of downstream genes. The trypanosomatid-specific hyper-modified DNA base J and H3.V collaboratively influence the termination process resulting in read-through transcription and increased expression of downstream genes [siegel_four_2009, reynolds_histone_2016, schulz_base_2016, mcdonald_localization_2022].```

### <span style="color:black">Update localization</span>

```In Trypanosoma brucei, H3.V, together with H4.V, is enriched at transcription termination sites and in telomeric regions [anderson_kinetoplastid-specific_2013, roson_new_2025].```

### <span style="color:black">Update knock_out</span>

```While H3.V is dispensable for parasite viability, its deletion impairs variant surface glycoprotein (VSG) silencing. However, simultaneous deletion of H3.V, H4.V, and DNA base J leads to critical alterations in cell growth and replication [roson_new_2025].```

In [14]:
# Fetch the H3.V_(Trypanosomatidae) histone row joined with its description record.
query = " ".join(
    [
        "SELECT * FROM histone h LEFT JOIN histone_description hd",
        "ON h.description = hd.id",
        "WHERE h.id='H3.V_(Trypanosomatidae)'",
    ]
)
cursor.execute(query)
rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(rows, columns=column_names)
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,H3.V_(Trypanosomatidae),variant_group,Trypanosomes,93954,205,H3,205,H3.V is a histone variant characterized so far...,,,...,,,,,,,,,,


In [15]:
# Show the fetched row as a dict so the long text fields are not truncated
df.to_dict()

{'id': {0: 205},
 'level': {0: 'variant_group'},
 'taxonomic_span': {0: 'Trypanosomes'},
 'taxonomic_span_id': {0: '93954'},
 'description': {0: 205},
 'parent': {0: 'H3'},
 'summary': {0: 'H3.V is a histone variant characterized so far in some trypanosomatids. It shares ~45-60% sequence identity with canonical H3. H3.V is not essential for viability. Although H3.V associated with the mitotic spindle and enriched at telomeres it is not essential for mini-chromosome segregation, telomere maintenance or transcriptional silencing at the telomere-proximal expression sites [lowell_variant_2004,  anderson_kinetoplastid-specific_2013]. While the general structure of H3.V resembles that of CenH3, it does not possess the specific sequence features required to be classified as a CenH3 variant [lowell_variant_2004]. H3.V plays a critical role in the regulation of antigenic variation. Together with H4.V they are involved in the positioning and expression of variant surface glycoprotein (VSG) genes

In [17]:
# New description texts for H3.V_(Trypanosomatidae) (histone_description row id=205).
summary_desc = "H3.V is a histone variant characterized so far in some trypanosomatids. It shares ~45-60% sequence identity with canonical H3. H3.V is not essential for viability. Although H3.V associated with the mitotic spindle and enriched at telomeres it is not essential for mini-chromosome segregation, telomere maintenance or transcriptional silencing at the telomere-proximal expression sites [lowell_variant_2004, anderson_kinetoplastid-specific_2013]. While the general structure of H3.V resembles that of CenH3, it does not possess the specific sequence features required to be classified as a CenH3 variant [lowell_variant_2004]."
knock_out_desc = "While H3.V is dispensable for parasite viability, its deletion impairs variant surface glycoprotein (VSG) silencing. However, simultaneous deletion of H3.V, H4.V, and DNA base J leads to critical alterations in cell growth and replication [roson_new_2025]."
function_desc = "H3.V plays a critical role in the regulation of antigenic variation. Together with H4.V they are involved in the positioning and expression of variant surface glycoprotein (VSG) genes, which the parasite Trypanosoma brucei switches to avoid immune detection [muller_genome_2018, schulz_base_2016]. H3.V is important for transcription termination and expression of downstream genes. The trypanosomatid-specific hyper-modified DNA base J and H3.V collaboratively influence the termination process resulting in read-through transcription and increased expression of downstream genes [siegel_four_2009, reynolds_histone_2016, schulz_base_2016, mcdonald_localization_2022]."
localization_desc = "In Trypanosoma brucei, H3.V, together with H4.V, is enriched at transcription termination sites and in telomeric regions [anderson_kinetoplastid-specific_2013, roson_new_2025]."

# Pass the texts as parameters instead of pasting them into the SQL string, so
# quotes or backslashes inside a description cannot break the statement.
query = (
    "UPDATE histone_description SET "
    "summary=%(summary)s, knock_out=%(knock_out)s, function=%(function)s, "
    "localization=%(localization)s "
    "WHERE id=%(id)s"
)
params = {
    "summary": summary_desc,
    "knock_out": knock_out_desc,
    "function": function_desc,
    "localization": localization_desc,
    "id": 205,
}
cursor.execute(query, params)
# Show the statement that was actually sent to the server.
print(cursor.statement)

UPDATE histone_description SET summary='H3.V is a histone variant characterized so far in some trypanosomatids. It shares ~45-60% sequence identity with canonical H3. H3.V is not essential for viability. Although H3.V associated with the mitotic spindle and enriched at telomeres it is not essential for mini-chromosome segregation, telomere maintenance or transcriptional silencing at the telomere-proximal expression sites [lowell_variant_2004, anderson_kinetoplastid-specific_2013]. While the general structure of H3.V resembles that of CenH3, it does not possess the specific sequence features required to be classified as a CenH3 variant [lowell_variant_2004].', knock_out='While H3.V is dispensable for parasite viability, its deletion impairs variant surface glycoprotein (VSG) silencing. However, simultaneous deletion of H3.V, H4.V, and DNA base J leads to critical alterations in cell growth and replication [roson_new_2025].', function='H3.V plays a critical role in the regulation of anti

In [18]:
# Re-read the H3.V_(Trypanosomatidae) row to check the description fields after the UPDATE.
select_clause = "SELECT * FROM histone h LEFT JOIN histone_description hd "
join_clause = "ON h.description = hd.id "
where_clause = "WHERE h.id='H3.V_(Trypanosomatidae)'"
query = select_clause + join_clause + where_clause
cursor.execute(query)
fetched_rows = cursor.fetchall()
df = pd.DataFrame(
    fetched_rows,
    columns=[description[0] for description in cursor.description],
)
df.to_dict()

{'id': {0: 205},
 'level': {0: 'variant_group'},
 'taxonomic_span': {0: 'Trypanosomes'},
 'taxonomic_span_id': {0: '93954'},
 'description': {0: 205},
 'parent': {0: 'H3'},
 'summary': {0: 'H3.V is a histone variant characterized so far in some trypanosomatids. It shares ~45-60% sequence identity with canonical H3. H3.V is not essential for viability. Although H3.V associated with the mitotic spindle and enriched at telomeres it is not essential for mini-chromosome segregation, telomere maintenance or transcriptional silencing at the telomere-proximal expression sites [lowell_variant_2004, anderson_kinetoplastid-specific_2013]. While the general structure of H3.V resembles that of CenH3, it does not possess the specific sequence features required to be classified as a CenH3 variant [lowell_variant_2004].'},
 'taxonomy': {0: None},
 'genes': {0: None},
 'evolution': {0: None},
 'expression': {0: None},
 'knock_out': {0: 'While H3.V is dispensable for parasite viability, its deletion imp

In [19]:
# Commit the open transaction so the UPDATE executed above is stored in the database
conn.commit()

# DONE H1

## <span style="color:black">Update description of cH1.5</span>

### <span style="color:black">Update summary</span>

```cH1.5 is a replication dependent linker histone located within large histone gene clusters in mammals. cH1.5 plays a critical role in chromatin organization, thereby influencing cellular development and homeostasis [behrends_linker_2020].```

### <span style="color:black">Update expression</span>

```cH1.5 is present across various cell types. As demonstrated on human cells, its expression level changes during differentiation: for instance, it decreases during the differentiation of NT2 cells into neural cells but increases during the dedifferentiation of keratinocytes into induced pluripotent stem cells [behrends_linker_2020].```

In [20]:
# Fetch the cH1.5 histone row joined with its description record.
query = " ".join(
    [
        "SELECT * FROM histone h LEFT JOIN histone_description hd",
        "ON h.description = hd.id",
        "WHERE h.id='cH1.5'",
    ]
)
cursor.execute(query)
rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(rows, columns=column_names)
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cH1.5,variant,Mammalia,40674,16,cH1,16,cH1.5 is a replication dependent linker histon...,,,...,,,,,,,,,,


In [21]:
# Show the fetched row as a dict so the long text fields are not truncated
df.to_dict()

{'id': {0: 16},
 'level': {0: 'variant'},
 'taxonomic_span': {0: 'Mammalia'},
 'taxonomic_span_id': {0: '40674'},
 'description': {0: 16},
 'parent': {0: 'cH1'},
 'summary': {0: 'cH1.5 is a replication dependent linker histone located within large histone gene clusters in mammals [happel_histone_2009, talbert_histone_2021].'},
 'taxonomy': {0: 'null'},
 'genes': {0: 'null'},
 'evolution': {0: 'null'},
 'expression': {0: 'null'},
 'knock_out': {0: 'null'},
 'function': {0: 'null'},
 'sequence': {0: 'null'},
 'localization': {0: 'null'},
 'deposition': {0: 'null'},
 'structure': {0: 'null'},
 'interactions': {0: 'null'},
 'disease': {0: 'null'},
 'caveats': {0: 'null'},
 'relations': {0: None}}

In [23]:
# New free-text description fields for cH1.5 (histone_description row id 16).
summary_desc = "cH1.5 is a replication dependent linker histone located within large histone gene clusters in mammals. cH1.5 plays a critical role in chromatin organization, thereby influencing cellular development and homeostasis [behrends_linker_2020]."
expression_desc = "cH1.5 is present across various cell types. As demonstrated on human cells, its expression level changes during differentiation: for instance, it decreases during the differentiation of NT2 cells into neural cells but increases during the dedifferentiation of keratinocytes into induced pluripotent stem cells [behrends_linker_2020]."
# Bind the texts as parameters instead of pasting them into the SQL string:
# an apostrophe or backslash inside a description would otherwise break
# (or silently alter) the statement.
query = "UPDATE histone_description SET summary=%s, expression=%s WHERE id=%s"
cursor.execute(query, (summary_desc, expression_desc, 16))
# Show the statement exactly as it was sent to the server.
print(cursor.statement)

UPDATE histone_description SET summary='cH1.5 is a replication dependent linker histone located within large histone gene clusters in mammals. cH1.5 plays a critical role in chromatin organization, thereby influencing cellular development and homeostasis [behrends_linker_2020].', expression='cH1.5 is present across various cell types. As demonstrated on human cells, its expression level changes during differentiation: for instance, it decreases during the differentiation of NT2 cells into neural cells but increases during the dedifferentiation of keratinocytes into induced pluripotent stem cells [behrends_linker_2020].' WHERE id=16


In [24]:
# Re-read the cH1.5 row with its description to verify the update.
query = (
    "SELECT * FROM histone h "
    "LEFT JOIN histone_description hd ON h.description = hd.id "
    "WHERE h.id='cH1.5'"
)
cursor.execute(query)
rows = cursor.fetchall()
# cursor.description holds one tuple per result column; item 0 is the column name.
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(rows, columns=column_names)
df.to_dict()

{'id': {0: 16},
 'level': {0: 'variant'},
 'taxonomic_span': {0: 'Mammalia'},
 'taxonomic_span_id': {0: '40674'},
 'description': {0: 16},
 'parent': {0: 'cH1'},
 'summary': {0: 'cH1.5 is a replication dependent linker histone located within large histone gene clusters in mammals. cH1.5 plays a critical role in chromatin organization, thereby influencing cellular development and homeostasis [behrends_linker_2020].'},
 'taxonomy': {0: 'null'},
 'genes': {0: 'null'},
 'evolution': {0: 'null'},
 'expression': {0: 'cH1.5 is present across various cell types. As demonstrated on human cells, its expression level changes during differentiation: for instance, it decreases during the differentiation of NT2 cells into neural cells but increases during the dedifferentiation of keratinocytes into induced pluripotent stem cells [behrends_linker_2020].'},
 'knock_out': {0: 'null'},
 'function': {0: 'null'},
 'sequence': {0: 'null'},
 'localization': {0: 'null'},
 'deposition': {0: 'null'},
 'structu

In [25]:
# Commit the open transaction so the UPDATE of the cH1.5 description issued
# above is stored permanently in the database.
conn.commit()

## <span style="color:black">Update description of cH1.5_(Homo_sapiens)</span>

### <span style="color:black">Update summary</span>

```cH1.5_(Homo_sapiens) is a replication dependent linker histone in human. cH1.5 plays a critical role in maintaining centromere integrity and is essential for sustaining appropriate transcriptional programs in differentiated cell types in human cells [saha_linker_2025, li_dynamic_2012].```

### <span style="color:black">Update expression</span>

```cH1.5 is present across various human cell types. Notably, it serves as the predominant H1 variant in astrocyte-lineage SVGp12 cells and in glioblastoma multiforme cells [saha_linker_2025]. cH1.5 expression is downregulated during cellular differentiation. Compared to normal differentiated somatic cells, H1.5 exhibits higher expression levels in both pluripotent cells and fibroblasts [li_dynamic_2012].```

### <span style="color:black">Update function</span>

```cH1.5 is essential for the regulation of mitotic integrity and for sustaining appropriate transcriptional programs in differentiated cell types [saha_linker_2025, li_dynamic_2012].```

### <span style="color:black">Update localization</span>

```In differentiated cells, cH1.5 forms enrichment blocks in both genic and intergenic regions, preferentially associating with genes encoding membrane and membrane-associated proteins [li_dynamic_2012]. Moreover, cH1.5 is enriched at centromeric regions and co‑localizes with the centromeric histone CENP‑A in human, as confirmed by immunofluorescence and chromatin immunoprecipitation assays [saha_linker_2025]. cH1.5 binding is also associated with gene repression, chromatin compaction and promoting the inclusion of alternatively spliced exons into the mature mRNA [li_dynamic_2012, behrends_linker_2020].```

>It is also enriched around splicing sites, particularly on genes with alternative splicing, where its binding promotes the inclusion of alternatively spliced exons into the mature mRNA (see behrends_linker_2020).

### <span style="color:black">Update knock_out</span>

```cH1.5 depletion leads to decreased cell growth, increased chromatin accessibility and deregulation of gene expression [li_dynamic_2012, behrends_linker_2020]. Additionally, knockdown of cH1.5 results in the loss of centromeric α‑satellite DNA transcription, a reduction in de novo CENP‑A loading, and the accumulation of mitotic defects [saha_linker_2025].```

### <span style="color:black">Update structure</span>

```cH1.5 binds to CENP‑A nucleosomes in a non‑canonical, dyad‑independent manner [saha_linker_2025].```

### <span style="color:black">Update interactions</span>

```cH1.5 directly interacts with CENP‑A‑containing nucleosomes both in vitro and in vivo [saha_linker_2025].```

### <span style="color:black">Update disease</span>

```cH1.5 can serve as a valuable prognostic marker in cancer patients. Its expression level is decreased in many cancer cell lines, while being highly overexpressed in glioblastoma multiforme [saha_linker_2025, behrends_linker_2020, li_dynamic_2012]. Furthermore, cH1.5 mutations are associated with follicular lymphoma and altered chromatin states. Impairment of cH1.5 function may contribute to chromosomal instability and the development of malignancies, particularly in brain cells [saha_linker_2025].```

>Human H1.5 binds right at the center of the dyad, symmetrically [wu_binding_2021, bednar_structure_2017].

In [26]:
# Look up the cH1.5_(Homo_sapiens) histone row together with its linked
# description record to see the current state before editing.
query = (
    "SELECT * FROM histone h "
    "LEFT JOIN histone_description hd ON h.description = hd.id "
    "WHERE h.id='cH1.5_(Homo_sapiens)'"
)
cursor.execute(query)
rows = cursor.fetchall()
# cursor.description holds one tuple per result column; item 0 is the column name.
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(rows, columns=column_names)
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cH1.5_(Homo_sapiens),variant,Homo sapiens,9606,66,cH1.5,66,,,,...,,,,,,,,,,


In [27]:
# Dump the single-row frame as a dict so the text fields are shown untruncated.
# NOTE(review): the joined result carries two columns named `id` (one from
# histone, one from histone_description), so only one `id` entry appears here.
df.to_dict()

{'id': {0: 66},
 'level': {0: 'variant'},
 'taxonomic_span': {0: 'Homo sapiens'},
 'taxonomic_span_id': {0: '9606'},
 'description': {0: 66},
 'parent': {0: 'cH1.5'},
 'summary': {0: 'null'},
 'taxonomy': {0: 'null'},
 'genes': {0: 'null'},
 'evolution': {0: 'null'},
 'expression': {0: 'null'},
 'knock_out': {0: 'null'},
 'function': {0: 'null'},
 'sequence': {0: 'null'},
 'localization': {0: 'null'},
 'deposition': {0: 'null'},
 'structure': {0: 'null'},
 'interactions': {0: 'null'},
 'disease': {0: 'null'},
 'caveats': {0: 'null'},
 'relations': {0: None}}

In [29]:
# New free-text description fields for cH1.5_(Homo_sapiens)
# (histone_description row id 66).
# Two corrections to the texts themselves: a Cyrillic "с" that had slipped into
# "cH1.5" in the summary is replaced by the Latin letter (so the name stays
# searchable), and the missing space before the last citation in the disease
# text is added.
summary_desc = "cH1.5_(Homo_sapiens) is a replication dependent linker histone in human. cH1.5 plays a critical role in maintaining centromere integrity and is essential for sustaining appropriate transcriptional programs in differentiated cell types in human cells [saha_linker_2025, li_dynamic_2012]."
expression_desc = "cH1.5 is present across various human cell types. Notably, it serves as the predominant H1 variant in astrocyte-lineage SVGp12 cells and in glioblastoma multiforme cells [saha_linker_2025]. cH1.5 expression is downregulated during cellular differentiation. Compared to normal differentiated somatic cells, H1.5 exhibits higher expression levels in both pluripotent cells and fibroblasts [li_dynamic_2012]."
knock_out_desc = "cH1.5 depletion leads to decreased cell growth, increased chromatin accessibility and deregulation of gene expression [li_dynamic_2012, behrends_linker_2020]. Additionally, knockdown of cH1.5 results in the loss of centromeric α‑satellite DNA transcription, a reduction in de novo CENP‑A loading, and the accumulation of mitotic defects [saha_linker_2025]."
function_desc = "cH1.5 is essential for the regulation of mitotic integrity and for sustaining appropriate transcriptional programs in differentiated cell types [saha_linker_2025, li_dynamic_2012]."
localization_desc = "In differentiated cells, cH1.5 forms enrichment blocks in both genic and intergenic regions, preferentially associating with genes encoding membrane and membrane-associated proteins [li_dynamic_2012]. Moreover, cH1.5 is enriched at centromeric regions and co‑localizes with the centromeric histone CENP‑A in human, as confirmed by immunofluorescence and chromatin immunoprecipitation assays [saha_linker_2025]. cH1.5 binding is also associated with gene repression, chromatin compaction and promoting the inclusion of alternatively spliced exons into the mature mRNA [li_dynamic_2012, behrends_linker_2020]."
structure_desc = "cH1.5 binds to CENP‑A nucleosomes in a non‑canonical, dyad‑independent manner [saha_linker_2025]."
interactions_desc = "cH1.5 directly interacts with CENP‑A‑containing nucleosomes both in vitro and in vivo [saha_linker_2025]."
disease_desc = "cH1.5 can serve as a valuable prognostic marker in cancer patients. Its expression level is decreased in many cancer cell lines, while being highly overexpressed in glioblastoma multiforme [saha_linker_2025, behrends_linker_2020, li_dynamic_2012]. Furthermore, cH1.5 mutations are associated with follicular lymphoma and altered chromatin states. Impairment of cH1.5 function may contribute to chromosomal instability and the development of malignancies, particularly in brain cells [saha_linker_2025]."
# Bind the texts as parameters instead of pasting them into the SQL string:
# an apostrophe or backslash inside a description would otherwise break
# (or silently alter) the statement.
query = (
    "UPDATE histone_description SET "
    "summary=%s, expression=%s, knock_out=%s, function=%s, "
    "localization=%s, structure=%s, interactions=%s, disease=%s "
    "WHERE id=%s"
)
cursor.execute(
    query,
    (
        summary_desc,
        expression_desc,
        knock_out_desc,
        function_desc,
        localization_desc,
        structure_desc,
        interactions_desc,
        disease_desc,
        66,
    ),
)
# Show the statement exactly as it was sent to the server.
print(cursor.statement)

UPDATE histone_description SET summary='cH1.5_(Homo_sapiens) is a replication dependent linker histone in human. сH1.5 plays a critical role in maintaining centromere integrity and is essential for sustaining appropriate transcriptional programs in differentiated cell types in human cells [saha_linker_2025, li_dynamic_2012].', expression='cH1.5 is present across various human cell types. Notably, it serves as the predominant H1 variant in astrocyte-lineage SVGp12 cells and in glioblastoma multiforme cells [saha_linker_2025]. cH1.5 expression is downregulated during cellular differentiation. Compared to normal differentiated somatic cells, H1.5 exhibits higher expression levels in both pluripotent cells and fibroblasts [li_dynamic_2012].', knock_out='cH1.5 depletion leads to decreased cell growth, increased chromatin accessibility and deregulation of gene expression [li_dynamic_2012, behrends_linker_2020]. Additionally, knockdown of cH1.5 results in the loss of centromeric α‑satellite D

In [30]:
# Re-read the cH1.5_(Homo_sapiens) row with its description to verify the update.
query = (
    "SELECT * FROM histone h "
    "LEFT JOIN histone_description hd ON h.description = hd.id "
    "WHERE h.id='cH1.5_(Homo_sapiens)'"
)
cursor.execute(query)
rows = cursor.fetchall()
# cursor.description holds one tuple per result column; item 0 is the column name.
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(rows, columns=column_names)
df.to_dict()

{'id': {0: 66},
 'level': {0: 'variant'},
 'taxonomic_span': {0: 'Homo sapiens'},
 'taxonomic_span_id': {0: '9606'},
 'description': {0: 66},
 'parent': {0: 'cH1.5'},
 'summary': {0: 'cH1.5_(Homo_sapiens) is a replication dependent linker histone in human. сH1.5 plays a critical role in maintaining centromere integrity and is essential for sustaining appropriate transcriptional programs in differentiated cell types in human cells [saha_linker_2025, li_dynamic_2012].'},
 'taxonomy': {0: 'null'},
 'genes': {0: 'null'},
 'evolution': {0: 'null'},
 'expression': {0: 'cH1.5 is present across various human cell types. Notably, it serves as the predominant H1 variant in astrocyte-lineage SVGp12 cells and in glioblastoma multiforme cells [saha_linker_2025]. cH1.5 expression is downregulated during cellular differentiation. Compared to normal differentiated somatic cells, H1.5 exhibits higher expression levels in both pluripotent cells and fibroblasts [li_dynamic_2012].'},
 'knock_out': {0: 'cH

In [31]:
# Commit the open transaction so the UPDATE of the cH1.5_(Homo_sapiens)
# description issued above is stored permanently in the database.
conn.commit()

## <span style="color:black">Update description of cH1.1</span>

### <span style="color:black">Update summary</span>

```cH1.1 is a replication dependent linker histone located within large histone gene clusters in mammals. cH1.1 plays an important role in the early stages of embryogenesis [funaya_involvement_2023].```

### <span style="color:black">Update function</span>

```cH1.1 plays a critical role in the early stages of embryogenesis. cH1.1 is absent in human differentiated keratinocytes and fibroblasts but is re-expressed upon reprogramming, suggesting that it is specific to pluripotent cells [terme_histone_2011]. In mice, it is essential for preimplantation development, particularly at the morula and blastocyst stages [funaya_involvement_2023].```

### <span style="color:black">Update expression</span>

```cH1.1 expression predominates in early embryonic cells compared to other replication-dependent linker histone variants. In mice, cH1.1—alongside H1.8—exhibits the highest expression levels at the one- and two-cell stages of embryonic development [funaya_involvement_2023, funaya_linker_2018]. Although its expression declines by the four-cell stage, it remains elevated through the blastocyst stage. In human cells, cH1.1 has also been shown to be highly expressed in pluripotent cells—including embryonic and induced pluripotent stem cells—and absent in differentiated keratinocytes and fibroblasts [terme_histone_2011].```

### <span style="color:black">Update localization</span>

```In mice, cH1.1 is associated with actively transcribed genes, whereas the somatic linker histone variants cH1.2–5 are excluded from genomic regions occupied by H1a, indicating that H1a exhibits a distinct and non-overlapping genomic localization [funaya_involvement_2023].```

### <span style="color:black">Update knock_out</span>

```Maternal knockout of the cH1.1 gene in mice results in reduced litter size and impaired embryonic development during the morula-to-blastocyst transition [funaya_involvement_2023].```

In [32]:
# Look up the cH1.1 histone row together with its linked description record
# to see the current state before editing.
query = (
    "SELECT * FROM histone h "
    "LEFT JOIN histone_description hd ON h.description = hd.id "
    "WHERE h.id='cH1.1'"
)
cursor.execute(query)
rows = cursor.fetchall()
# cursor.description holds one tuple per result column; item 0 is the column name.
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(rows, columns=column_names)
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cH1.1,variant,Mammalia,40674,11,cH1,11,cH1.1 is a replication dependent linker histon...,,,...,,,,,,,,,,


In [33]:
# Dump the single-row frame as a dict so the text fields are shown untruncated.
# NOTE(review): the joined result carries two columns named `id` (one from
# histone, one from histone_description), so only one `id` entry appears here.
df.to_dict()

{'id': {0: 11},
 'level': {0: 'variant'},
 'taxonomic_span': {0: 'Mammalia'},
 'taxonomic_span_id': {0: '40674'},
 'description': {0: 11},
 'parent': {0: 'cH1'},
 'summary': {0: 'cH1.1 is a replication dependent linker histone located within large histone gene clusters in mammals [happel_histone_2009, talbert_histone_2021].'},
 'taxonomy': {0: 'null'},
 'genes': {0: 'null'},
 'evolution': {0: 'null'},
 'expression': {0: 'null'},
 'knock_out': {0: 'null'},
 'function': {0: 'null'},
 'sequence': {0: 'null'},
 'localization': {0: 'null'},
 'deposition': {0: 'null'},
 'structure': {0: 'null'},
 'interactions': {0: 'null'},
 'disease': {0: 'null'},
 'caveats': {0: 'null'},
 'relations': {0: None}}

In [35]:
# New free-text description fields for cH1.1 (histone_description row id 11).
# A Cyrillic "с" that had slipped into "cH1.1" (expression text) and "cH1.2–5"
# (localization text) is replaced by the Latin letter so the names stay
# searchable.
summary_desc = "cH1.1 is a replication dependent linker histone located within large histone gene clusters in mammals. cH1.1 plays an important role in the early stages of embryogenesis [funaya_involvement_2023]."
expression_desc = "cH1.1 expression predominates in early embryonic cells compared to other replication-dependent linker histone variants. In mice, cH1.1—alongside H1.8—exhibits the highest expression levels at the one- and two-cell stages of embryonic development [funaya_involvement_2023, funaya_linker_2018]. Although its expression declines by the four-cell stage, it remains elevated through the blastocyst stage. In human cells, cH1.1 has also been shown to be highly expressed in pluripotent cells—including embryonic and induced pluripotent stem cells—and absent in differentiated keratinocytes and fibroblasts [terme_histone_2011]."
knock_out_desc = "Maternal knockout of the cH1.1 gene in mice results in reduced litter size and impaired embryonic development during the morula-to-blastocyst transition [funaya_involvement_2023]."
function_desc = "cH1.1 plays a critical role in the early stages of embryogenesis. cH1.1 is absent in human differentiated keratinocytes and fibroblasts but is re-expressed upon reprogramming, suggesting that it is specific to pluripotent cells [terme_histone_2011]. In mice, it is essential for preimplantation development, particularly at the morula and blastocyst stages [funaya_involvement_2023]."
localization_desc = "In mice, cH1.1 is associated with actively transcribed genes, whereas the somatic linker histone variants cH1.2–5 are excluded from genomic regions occupied by H1a, indicating that H1a exhibits a distinct and non-overlapping genomic localization [funaya_involvement_2023]."
# Bind the texts as parameters instead of pasting them into the SQL string:
# an apostrophe or backslash inside a description would otherwise break
# (or silently alter) the statement.
query = (
    "UPDATE histone_description SET "
    "summary=%s, expression=%s, knock_out=%s, function=%s, localization=%s "
    "WHERE id=%s"
)
cursor.execute(
    query,
    (
        summary_desc,
        expression_desc,
        knock_out_desc,
        function_desc,
        localization_desc,
        11,
    ),
)
# Show the statement exactly as it was sent to the server.
print(cursor.statement)

UPDATE histone_description SET summary='cH1.1 is a replication dependent linker histone located within large histone gene clusters in mammals. cH1.1 plays an important role in the early stages of embryogenesis [funaya_involvement_2023].', expression='cH1.1 expression predominates in early embryonic cells compared to other replication-dependent linker histone variants. In mice, cH1.1—alongside H1.8—exhibits the highest expression levels at the one- and two-cell stages of embryonic development [funaya_involvement_2023, funaya_linker_2018]. Although its expression declines by the four-cell stage, it remains elevated through the blastocyst stage. In human cells, сH1.1 has also been shown to be highly expressed in pluripotent cells—including embryonic and induced pluripotent stem cells—and absent in differentiated keratinocytes and fibroblasts [terme_histone_2011].', knock_out='Maternal knockout of the cH1.1 gene in mice results in reduced litter size and impaired embryonic development duri

In [36]:
# Re-read the cH1.1 row with its description to verify the update.
query = (
    "SELECT * FROM histone h "
    "LEFT JOIN histone_description hd ON h.description = hd.id "
    "WHERE h.id='cH1.1'"
)
cursor.execute(query)
rows = cursor.fetchall()
# cursor.description holds one tuple per result column; item 0 is the column name.
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(rows, columns=column_names)
df.to_dict()

{'id': {0: 11},
 'level': {0: 'variant'},
 'taxonomic_span': {0: 'Mammalia'},
 'taxonomic_span_id': {0: '40674'},
 'description': {0: 11},
 'parent': {0: 'cH1'},
 'summary': {0: 'cH1.1 is a replication dependent linker histone located within large histone gene clusters in mammals. cH1.1 plays an important role in the early stages of embryogenesis [funaya_involvement_2023].'},
 'taxonomy': {0: 'null'},
 'genes': {0: 'null'},
 'evolution': {0: 'null'},
 'expression': {0: 'cH1.1 expression predominates in early embryonic cells compared to other replication-dependent linker histone variants. In mice, cH1.1—alongside H1.8—exhibits the highest expression levels at the one- and two-cell stages of embryonic development [funaya_involvement_2023, funaya_linker_2018]. Although its expression declines by the four-cell stage, it remains elevated through the blastocyst stage. In human cells, сH1.1 has also been shown to be highly expressed in pluripotent cells—including embryonic and induced pluripo

In [37]:
# Commit the open transaction so the UPDATE of the cH1.1 description issued
# above is stored permanently in the database.
conn.commit()

# DONE H3

## Add sequences to H3-like_(Plants)

Accessions: NP_172794.1 (HTR6), NP_201338.1 (HTR11),  NP_196795.1 (HTR15) [alvarez-venegas_canonical_2019, okada_analysis_2005]

In [40]:
# Protein accessions to be added to the H3-like_(Plants) variant.
accessions = [
    "NP_172794.1",  # HTR6
    "NP_201338.1",  # HTR11
    "NP_196795.1",  # HTR15
]

## Add sequences to curatedDB

In [41]:
# Load the whole sequence table and show the rows (if any) that already use
# one of the accessions about to be added.
query = "SELECT * FROM sequence "
cursor.execute(query)
sequence_rows = cursor.fetchall()
df = pd.DataFrame(sequence_rows, columns=[column[0] for column in cursor.description])
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration


In [42]:
# Download each GenBank protein record and turn it into a row for the
# `sequence` table.
sequence_columns = (
    "accession",
    "variant",
    "gi",
    "ncbi_gene_id",
    "hgnc_gene_name",
    "taxonomy_id",
    "organism",
    "phylum",
    "class",
    "taxonomy_group",
    "info",
    "sequence",
    "variant_under_consideration",
)
data_sequence_list = []
for a in accessions:
    with Entrez.efetch(db="protein", id=a, rettype="gb", retmode="text") as handle:
        record = SeqIO.read(handle, "genbank")
    taxonomy_data = get_taxonomy_data(record)
    # Start with every column set to None, then fill in what the record provides.
    data_sequence = dict.fromkeys(sequence_columns)
    data_sequence["accession"] = record.id
    data_sequence["variant"] = "H3-like_(Plants)"
    data_sequence["sequence"] = str(record.seq)
    # Entries returned by get_taxonomy_data overwrite the None placeholders.
    data_sequence.update(taxonomy_data)
    data_sequence_list.append(data_sequence)
# Print the last row with its value types as a quick check before inserting.
for field, value in data_sequence_list[-1].items():
    print(field, value, type(value))

Fetched taxid from NCBI 3702
Fetched taxid from NCBI 3702
Fetched taxid from NCBI 3702
accession NP_196795.1 <class 'str'>
variant H3-like_(Plants) <class 'str'>
gi None <class 'NoneType'>
ncbi_gene_id None <class 'NoneType'>
hgnc_gene_name None <class 'NoneType'>
taxonomy_id 3702 <class 'int'>
organism Arabidopsis thaliana <class 'str'>
phylum Streptophyta <class 'str'>
class Magnoliopsida <class 'str'>
taxonomy_group None <class 'NoneType'>
info None <class 'NoneType'>
sequence MARSNQTARKATGGKAPHFAMRVWQHSTPPLKKPYRYKPGTVALREIRKYQKTTDLVIRKLPFQRLVKEIAQSLKADLRFQTGAVSALQEAAEAFMVGMFEDTNLCAMHAKRSTIMPKDIQLAKRLRGDRV <class 'str'>
variant_under_consideration None <class 'NoneType'>


In [43]:
# Insert the prepared rows one at a time using the add_sequence statement.
for sequence_row in data_sequence_list:
    cursor.execute(add_sequence, sequence_row)

In [44]:
# Reload the sequence table and show the rows for the accessions just inserted.
query = "SELECT * FROM sequence "
cursor.execute(query)
sequence_rows = cursor.fetchall()
df = pd.DataFrame(sequence_rows, columns=[column[0] for column in cursor.description])
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
3557,NP_172794.1,H3-like_(Plants),,,,3702.0,Arabidopsis thaliana,Streptophyta,Magnoliopsida,,,MARTKQSARKSHGGKAPTKQLATKAARKSAPTTGGVKKPHRFRPGT...,
3568,NP_196795.1,H3-like_(Plants),,,,3702.0,Arabidopsis thaliana,Streptophyta,Magnoliopsida,,,MARSNQTARKATGGKAPHFAMRVWQHSTPPLKKPYRYKPGTVALRE...,
3572,NP_201338.1,H3-like_(Plants),,,,3702.0,Arabidopsis thaliana,Streptophyta,Magnoliopsida,,,MARTKQTARISTGGKAPRKQLAPKAARQSAPATGGVKKPHRFRPGT...,


In [45]:
# Commit the open transaction so the sequence rows inserted above are stored
# permanently in the database.
conn.commit()

## Add sequence publication

In [46]:
# Citation keys to be linked to the sequences added above.
pids = [
    "alvarez-venegas_canonical_2019",
    "okada_analysis_2005",
]
# Load the publication table and show which of these keys already exist.
query = "SELECT * FROM publication"
cursor.execute(query)
publication_rows = cursor.fetchall()
df = pd.DataFrame(publication_rows, columns=[column[0] for column in cursor.description])
df.loc[df["id"].isin(pids)]

Unnamed: 0,id,title,doi,author,year,pubmed_id
74,alvarez-venegas_canonical_2019,,,,,


In [47]:
# Register the publication that is not in the table yet; only the citation key
# is known, so the remaining fields are passed as None.
data_publication = {
    "id": "okada_analysis_2005",
    **dict.fromkeys(("title", "doi", "author", "year")),
}
cursor.execute(add_publication, data_publication)

In [48]:
# Citation keys to be linked to the sequences added above.
pids = [
    "alvarez-venegas_canonical_2019",
    "okada_analysis_2005",
]
# Reload the publication table to confirm that both keys are now present.
query = "SELECT * FROM publication"
cursor.execute(query)
publication_rows = cursor.fetchall()
df = pd.DataFrame(publication_rows, columns=[column[0] for column in cursor.description])
df.loc[df["id"].isin(pids)]

Unnamed: 0,id,title,doi,author,year,pubmed_id
74,alvarez-venegas_canonical_2019,,,,,
144,okada_analysis_2005,,,,,


In [49]:
# Link every accession to every publication key (all accession/key pairs).
accession_publication_pairs = ((a, p) for a in accessions for p in pids)
for acc, pid in accession_publication_pairs:
    cursor.execute(add_sequence_has_publication, (acc, pid))

In [50]:
# Join sequences with their publication links and show the rows for the
# accessions handled in this section.
query = (
    "SELECT * FROM sequence s "
    "LEFT JOIN sequence_has_publication sp ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
joined_rows = cursor.fetchall()
df = pd.DataFrame(joined_rows, columns=[column[0] for column in cursor.description])
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
3979,NP_172794.1,H3-like_(Plants),,,,3702.0,Arabidopsis thaliana,Streptophyta,Magnoliopsida,,,MARTKQSARKSHGGKAPTKQLATKAARKSAPTTGGVKKPHRFRPGT...,,NP_172794.1,alvarez-venegas_canonical_2019
3980,NP_172794.1,H3-like_(Plants),,,,3702.0,Arabidopsis thaliana,Streptophyta,Magnoliopsida,,,MARTKQSARKSHGGKAPTKQLATKAARKSAPTTGGVKKPHRFRPGT...,,NP_172794.1,okada_analysis_2005
3998,NP_196795.1,H3-like_(Plants),,,,3702.0,Arabidopsis thaliana,Streptophyta,Magnoliopsida,,,MARSNQTARKATGGKAPHFAMRVWQHSTPPLKKPYRYKPGTVALRE...,,NP_196795.1,alvarez-venegas_canonical_2019
3999,NP_196795.1,H3-like_(Plants),,,,3702.0,Arabidopsis thaliana,Streptophyta,Magnoliopsida,,,MARSNQTARKATGGKAPHFAMRVWQHSTPPLKKPYRYKPGTVALRE...,,NP_196795.1,okada_analysis_2005
4006,NP_201338.1,H3-like_(Plants),,,,3702.0,Arabidopsis thaliana,Streptophyta,Magnoliopsida,,,MARTKQTARISTGGKAPRKQLAPKAARQSAPATGGVKKPHRFRPGT...,,NP_201338.1,alvarez-venegas_canonical_2019
4007,NP_201338.1,H3-like_(Plants),,,,3702.0,Arabidopsis thaliana,Streptophyta,Magnoliopsida,,,MARTKQTARISTGGKAPRKQLAPKAARQSAPATGGVKKPHRFRPGT...,,NP_201338.1,okada_analysis_2005


In [51]:
# Commit the open transaction so the publication row and the
# sequence-publication links inserted above are stored permanently.
conn.commit()

## Add sequences to H3-like_(Plants)

Accessions: ABA97899.1 (HTR714), ABA97902.1 (HTR715) [alvarez-venegas_canonical_2019, hu_identification_2015]

In [52]:
# Protein accessions to be added to the H3-like_(Plants) variant.
accessions = [
    "ABA97899.1",  # HTR714
    "ABA97902.1",  # HTR715
]

## Add sequences to curatedDB

In [53]:
# Load the whole sequence table and show the rows (if any) that already use
# one of the accessions about to be added.
query = "SELECT * FROM sequence "
cursor.execute(query)
sequence_rows = cursor.fetchall()
df = pd.DataFrame(sequence_rows, columns=[column[0] for column in cursor.description])
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration


In [54]:
# Download each GenBank protein record and turn it into a row for the
# `sequence` table.
sequence_columns = (
    "accession",
    "variant",
    "gi",
    "ncbi_gene_id",
    "hgnc_gene_name",
    "taxonomy_id",
    "organism",
    "phylum",
    "class",
    "taxonomy_group",
    "info",
    "sequence",
    "variant_under_consideration",
)
data_sequence_list = []
for a in accessions:
    with Entrez.efetch(db="protein", id=a, rettype="gb", retmode="text") as handle:
        record = SeqIO.read(handle, "genbank")
    taxonomy_data = get_taxonomy_data(record)
    # Start with every column set to None, then fill in what the record provides.
    data_sequence = dict.fromkeys(sequence_columns)
    data_sequence["accession"] = record.id
    data_sequence["variant"] = "H3-like_(Plants)"
    data_sequence["sequence"] = str(record.seq)
    # Entries returned by get_taxonomy_data overwrite the None placeholders.
    data_sequence.update(taxonomy_data)
    data_sequence_list.append(data_sequence)
# Print the last row with its value types as a quick check before inserting.
for field, value in data_sequence_list[-1].items():
    print(field, value, type(value))

Fetched taxid from NCBI 39947
Fetched taxid from NCBI 39947
accession ABA97902.1 <class 'str'>
variant H3-like_(Plants) <class 'str'>
gi None <class 'NoneType'>
ncbi_gene_id None <class 'NoneType'>
hgnc_gene_name None <class 'NoneType'>
taxonomy_id 39947 <class 'int'>
organism Oryza sativa Japonica Group <class 'str'>
phylum Streptophyta <class 'str'>
class Magnoliopsida <class 'str'>
taxonomy_group None <class 'NoneType'>
info None <class 'NoneType'>
sequence MARTKQTAKKSTASNVPRKLLVMKVARKSAPTMAGLKKPHRFKPGTVALREIRTYQKSTELLIRKLPFQRLVQEIAQDVRSYVRFQSSAVVALQEAAETYLVGLFKDTNLCVIHAKRVTIMPKDIQLARRIRGEKA <class 'str'>
variant_under_consideration None <class 'NoneType'>


In [55]:
# Insert the prepared rows one at a time using the add_sequence statement.
for sequence_row in data_sequence_list:
    cursor.execute(add_sequence, sequence_row)

In [56]:
# Reload the sequence table and show the rows for the accessions just inserted.
query = "SELECT * FROM sequence "
cursor.execute(query)
sequence_rows = cursor.fetchall()
df = pd.DataFrame(sequence_rows, columns=[column[0] for column in cursor.description])
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
50,ABA97899.1,H3-like_(Plants),,,,39947.0,Oryza sativa Japonica Group,Streptophyta,Magnoliopsida,,,MARTKQTAKKSTAGNVPRKLLVMKVARKSAPMMAGLKKPHRFNPWI...,
51,ABA97902.1,H3-like_(Plants),,,,39947.0,Oryza sativa Japonica Group,Streptophyta,Magnoliopsida,,,MARTKQTAKKSTASNVPRKLLVMKVARKSAPTMAGLKKPHRFKPGT...,


In [57]:
# Commit the open transaction so the sequence rows inserted above are stored
# permanently in the database.
conn.commit()

## Add sequence publication

In [58]:
# Citation keys to be linked to the sequences added above.
pids = [
    "alvarez-venegas_canonical_2019",
    "hu_identification_2015",
]
# Load the publication table and show which of these keys already exist.
query = "SELECT * FROM publication"
cursor.execute(query)
publication_rows = cursor.fetchall()
df = pd.DataFrame(publication_rows, columns=[column[0] for column in cursor.description])
df.loc[df["id"].isin(pids)]

Unnamed: 0,id,title,doi,author,year,pubmed_id
74,alvarez-venegas_canonical_2019,,,,,


In [59]:
# Register the publication that is not in the table yet; only the citation key
# is known, so the remaining fields are passed as None.
data_publication = {
    "id": "hu_identification_2015",
    **dict.fromkeys(("title", "doi", "author", "year")),
}
cursor.execute(add_publication, data_publication)

In [60]:
# Citation keys to be linked to the sequences added above.
pids = [
    "alvarez-venegas_canonical_2019",
    "hu_identification_2015",
]
# Reload the publication table to confirm that both keys are now present.
query = "SELECT * FROM publication"
cursor.execute(query)
publication_rows = cursor.fetchall()
df = pd.DataFrame(publication_rows, columns=[column[0] for column in cursor.description])
df.loc[df["id"].isin(pids)]

Unnamed: 0,id,title,doi,author,year,pubmed_id
74,alvarez-venegas_canonical_2019,,,,,
106,hu_identification_2015,,,,,


In [61]:
# Link every accession to every publication key (all accession/key pairs).
accession_publication_pairs = ((a, p) for a in accessions for p in pids)
for acc, pid in accession_publication_pairs:
    cursor.execute(add_sequence_has_publication, (acc, pid))

In [62]:
# Join sequences with their publication links and show the rows for the
# accessions handled in this section.
query = (
    "SELECT * FROM sequence s "
    "LEFT JOIN sequence_has_publication sp ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
joined_rows = cursor.fetchall()
df = pd.DataFrame(joined_rows, columns=[column[0] for column in cursor.description])
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
62,ABA97899.1,H3-like_(Plants),,,,39947.0,Oryza sativa Japonica Group,Streptophyta,Magnoliopsida,,,MARTKQTAKKSTAGNVPRKLLVMKVARKSAPMMAGLKKPHRFNPWI...,,ABA97899.1,alvarez-venegas_canonical_2019
63,ABA97899.1,H3-like_(Plants),,,,39947.0,Oryza sativa Japonica Group,Streptophyta,Magnoliopsida,,,MARTKQTAKKSTAGNVPRKLLVMKVARKSAPMMAGLKKPHRFNPWI...,,ABA97899.1,hu_identification_2015
64,ABA97902.1,H3-like_(Plants),,,,39947.0,Oryza sativa Japonica Group,Streptophyta,Magnoliopsida,,,MARTKQTAKKSTASNVPRKLLVMKVARKSAPTMAGLKKPHRFKPGT...,,ABA97902.1,alvarez-venegas_canonical_2019
65,ABA97902.1,H3-like_(Plants),,,,39947.0,Oryza sativa Japonica Group,Streptophyta,Magnoliopsida,,,MARTKQTAKKSTASNVPRKLLVMKVARKSAPTMAGLKKPHRFKPGT...,,ABA97902.1,hu_identification_2015


In [63]:
# Commit the open transaction so the publication row and the
# sequence-publication links inserted above are stored permanently.
conn.commit()

## Add sequences to H3-like_(Plants)

Accessions: BAE48427.1 (gcH3), BAA96098.1 (gH3), BAE48431.1 (leH3), BAE48433.1 (soH3-1), BAE48435.1 (soH3-2) [alvarez-venegas_canonical_2019, probst_similar_2020]

In [64]:
# Protein accessions to be added to the H3-like_(Plants) variant.
accessions = [
    "BAE48427.1",  # gcH3
    "BAA96098.1",  # gH3
    "BAE48431.1",  # leH3
    "BAE48433.1",  # soH3-1
    "BAE48435.1",  # soH3-2
]

## Add sequences to curatedDB

In [65]:
# Load the whole sequence table and show the rows (if any) that already use
# one of the accessions about to be added.
query = "SELECT * FROM sequence "
cursor.execute(query)
sequence_rows = cursor.fetchall()
df = pd.DataFrame(sequence_rows, columns=[column[0] for column in cursor.description])
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration


In [66]:
# Download each GenBank protein record and turn it into a row for the
# `sequence` table.
sequence_columns = (
    "accession",
    "variant",
    "gi",
    "ncbi_gene_id",
    "hgnc_gene_name",
    "taxonomy_id",
    "organism",
    "phylum",
    "class",
    "taxonomy_group",
    "info",
    "sequence",
    "variant_under_consideration",
)
data_sequence_list = []
for a in accessions:
    with Entrez.efetch(db="protein", id=a, rettype="gb", retmode="text") as handle:
        record = SeqIO.read(handle, "genbank")
    taxonomy_data = get_taxonomy_data(record)
    # Start with every column set to None, then fill in what the record provides.
    data_sequence = dict.fromkeys(sequence_columns)
    data_sequence["accession"] = record.id
    data_sequence["variant"] = "H3-like_(Plants)"
    data_sequence["sequence"] = str(record.seq)
    # Entries returned by get_taxonomy_data overwrite the None placeholders.
    data_sequence.update(taxonomy_data)
    data_sequence_list.append(data_sequence)
# Print the last row with its value types as a quick check before inserting.
for field, value in data_sequence_list[-1].items():
    print(field, value, type(value))

Fetched taxid from NCBI 4690
Fetched taxid from NCBI 4690
Fetched taxid from NCBI 4690
Fetched taxid from NCBI 4690
Fetched taxid from NCBI 4690
accession BAE48435.1 <class 'str'>
variant H3-like_(Plants) <class 'str'>
gi None <class 'NoneType'>
ncbi_gene_id None <class 'NoneType'>
hgnc_gene_name None <class 'NoneType'>
taxonomy_id 4690 <class 'int'>
organism Lilium longiflorum <class 'str'>
phylum Streptophyta <class 'str'>
class Magnoliopsida <class 'str'>
taxonomy_group None <class 'NoneType'>
info None <class 'NoneType'>
sequence MARTKQTARKSTGGKAPRKQLATKAARKSAPTTGGVKKPHRYRPGTVALREIRKYQKSTDLLIRKLPFQRLVREIAQDYKADLRFQSHAVLALQEAAEAYLVGLFEDTNLCAIHAKRVTIMPKDIQLARRIRGERA <class 'str'>
variant_under_consideration None <class 'NoneType'>


In [67]:
# Run the `add_sequence` statement once per prepared row dict.
for sequence_row in data_sequence_list:
    cursor.execute(add_sequence, sequence_row)

In [68]:
# Re-read the `sequence` table and show the rows for `accessions` to confirm
# that the inserts are visible on this connection.
query = "SELECT * FROM sequence "
cursor.execute(query)
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(cursor.fetchall(), columns=column_names)
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
868,BAA96098.1,H3-like_(Plants),,,,4690.0,Lilium longiflorum,Streptophyta,Magnoliopsida,,,MARPRKEAPQRNLDRDENARQQPTEEPQDEAPRNQGRQQQQQRPPA...,
880,BAE48427.1,H3-like_(Plants),,,,4690.0,Lilium longiflorum,Streptophyta,Magnoliopsida,,,MARMKHTARMSTGGKAPRKQLASKALRKAPPPPTKGVKQPHHYHLR...,
881,BAE48431.1,H3-like_(Plants),,,,4690.0,Lilium longiflorum,Streptophyta,Magnoliopsida,,,MARMKLNARMSTGGKAPRKQLAYKAVRKAAPPTIGVKLPNSYRPGD...,
882,BAE48433.1,H3-like_(Plants),,,,4690.0,Lilium longiflorum,Streptophyta,Magnoliopsida,,,MARTKQTARKSTGGKAPRKQLATKAARKSIPTGMGGMKRPRRYRPG...,
883,BAE48435.1,H3-like_(Plants),,,,4690.0,Lilium longiflorum,Streptophyta,Magnoliopsida,,,MARTKQTARKSTGGKAPRKQLATKAARKSAPTTGGVKKPHRYRPGT...,


In [69]:
# Commit the transaction so the sequence rows inserted above for `accessions`
# are persisted in the database.
conn.commit()

## Add sequence publication

In [70]:
# Publication ids cited for the sequences added above. Show which of them
# already exist in the `publication` table (the whole table is fetched and
# filtered in pandas).
pids = ["alvarez-venegas_canonical_2019", "probst_similar_2020"]
# Plain string literal: the query has no placeholders, so no f-prefix is needed.
query = "SELECT * FROM publication"
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["id"].isin(pids)]

Unnamed: 0,id,title,doi,author,year,pubmed_id
74,alvarez-venegas_canonical_2019,,,,,


In [71]:
# Insert a publication row for "probst_similar_2020": only the id is set,
# the other fields passed to `add_publication` are left as None.
data_publication = dict.fromkeys(("id", "title", "doi", "author", "year"))
data_publication["id"] = "probst_similar_2020"
cursor.execute(add_publication, data_publication)

In [72]:
# Re-check the `publication` table: both ids in `pids` should now be listed.
pids = ["alvarez-venegas_canonical_2019", "probst_similar_2020"]
# Plain string literal: the query has no placeholders, so no f-prefix is needed.
query = "SELECT * FROM publication"
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["id"].isin(pids)]

Unnamed: 0,id,title,doi,author,year,pubmed_id
74,alvarez-venegas_canonical_2019,,,,,
150,probst_similar_2020,,,,,


In [73]:
# Link every accession to every publication id: one (accession, publication)
# pair per execution, accessions in the outer order and pids in the inner.
sequence_publication_pairs = [(acc, pid) for acc in accessions for pid in pids]
for pair in sequence_publication_pairs:
    cursor.execute(add_sequence_has_publication, pair)

In [74]:
# Join sequences with their publication links and show the rows for
# `accessions`; a sequence appears once per linked publication.
query = (
    "SELECT * FROM sequence s "
    "LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(cursor.fetchall(), columns=column_names)
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
1137,BAA96098.1,H3-like_(Plants),,,,4690.0,Lilium longiflorum,Streptophyta,Magnoliopsida,,,MARPRKEAPQRNLDRDENARQQPTEEPQDEAPRNQGRQQQQQRPPA...,,BAA96098.1,alvarez-venegas_canonical_2019
1138,BAA96098.1,H3-like_(Plants),,,,4690.0,Lilium longiflorum,Streptophyta,Magnoliopsida,,,MARPRKEAPQRNLDRDENARQQPTEEPQDEAPRNQGRQQQQQRPPA...,,BAA96098.1,probst_similar_2020
1153,BAE48427.1,H3-like_(Plants),,,,4690.0,Lilium longiflorum,Streptophyta,Magnoliopsida,,,MARMKHTARMSTGGKAPRKQLASKALRKAPPPPTKGVKQPHHYHLR...,,BAE48427.1,alvarez-venegas_canonical_2019
1154,BAE48427.1,H3-like_(Plants),,,,4690.0,Lilium longiflorum,Streptophyta,Magnoliopsida,,,MARMKHTARMSTGGKAPRKQLASKALRKAPPPPTKGVKQPHHYHLR...,,BAE48427.1,probst_similar_2020
1155,BAE48431.1,H3-like_(Plants),,,,4690.0,Lilium longiflorum,Streptophyta,Magnoliopsida,,,MARMKLNARMSTGGKAPRKQLAYKAVRKAAPPTIGVKLPNSYRPGD...,,BAE48431.1,alvarez-venegas_canonical_2019
1156,BAE48431.1,H3-like_(Plants),,,,4690.0,Lilium longiflorum,Streptophyta,Magnoliopsida,,,MARMKLNARMSTGGKAPRKQLAYKAVRKAAPPTIGVKLPNSYRPGD...,,BAE48431.1,probst_similar_2020
1157,BAE48433.1,H3-like_(Plants),,,,4690.0,Lilium longiflorum,Streptophyta,Magnoliopsida,,,MARTKQTARKSTGGKAPRKQLATKAARKSIPTGMGGMKRPRRYRPG...,,BAE48433.1,alvarez-venegas_canonical_2019
1158,BAE48433.1,H3-like_(Plants),,,,4690.0,Lilium longiflorum,Streptophyta,Magnoliopsida,,,MARTKQTARKSTGGKAPRKQLATKAARKSIPTGMGGMKRPRRYRPG...,,BAE48433.1,probst_similar_2020
1159,BAE48435.1,H3-like_(Plants),,,,4690.0,Lilium longiflorum,Streptophyta,Magnoliopsida,,,MARTKQTARKSTGGKAPRKQLATKAARKSAPTTGGVKKPHRYRPGT...,,BAE48435.1,alvarez-venegas_canonical_2019
1160,BAE48435.1,H3-like_(Plants),,,,4690.0,Lilium longiflorum,Streptophyta,Magnoliopsida,,,MARTKQTARKSTGGKAPRKQLATKAARKSAPTTGGVKKPHRYRPGT...,,BAE48435.1,probst_similar_2020


In [75]:
# Commit the transaction so the publication row and the sequence-publication
# links inserted above are persisted in the database.
conn.commit()

# Close connections

In [76]:
# Release resources in reverse order of acquisition: cursor, then the MySQL
# connection, then the SSH tunnel. The nested try/finally ensures that a
# failure while closing the cursor or the connection does not leave the
# connection or the SSH tunnel open.
try:
    try:
        cursor.close()
    finally:
        conn.close()
finally:
    tunnel.stop()