In [1]:
import io
import re
import time

import pandas as pd
from Bio import Entrez, SeqIO
from mysql.connector import connection
from sshtunnel import SSHTunnelForwarder

# Provide your email address (required for using Entrez)
Entrez.email = "your.email@example.com"

In [2]:
# Parse the KEY=VALUE connection settings file. Blank lines and lines starting
# with "#" are ignored; whitespace around both key and value is dropped so
# that "key = value" and "key=value" are treated the same.
config = {}

with open("db_curated_server_info.txt", "r") as file:
    for raw_line in file:
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue
        key, separator, value = line.partition("=")
        if not separator:
            # A line without "=" cannot be a setting; skip it instead of
            # failing with an unpacking error.
            continue
        config[key.strip()] = value.strip()

# NOTE: "srever_port" and "db_adress" are the key spellings used by the
# settings file and by later cells, so they are kept as-is.
server_name = config.get("server_name")
ssh_password = config.get("ssh_password")
ssh_username = config.get("ssh_username")
db_adress = config.get("db_adress")
try:
    srever_port = int(config["srever_port"])
    db_port = int(config["db_port"])
except KeyError as exc:
    # Both ports are converted to int, so report a missing one by name rather
    # than letting int(None) raise an opaque TypeError.
    raise KeyError(
        f"Missing required key {exc.args[0]!r} in db_curated_server_info.txt"
    ) from None

In [3]:
# Open an SSH tunnel to the database address/port read from the settings
# file, then print the local port the tunnel is bound to.
ssh_endpoint = (server_name, srever_port)
db_endpoint = (db_adress, db_port)
tunnel = SSHTunnelForwarder(
    ssh_endpoint,
    ssh_username=ssh_username,
    ssh_password=ssh_password,
    remote_bind_address=db_endpoint,
)
tunnel.start()
print(tunnel.local_bind_port)

44121


In [4]:
# Connect to MySQL through the local end of the SSH tunnel. The user,
# password and schema name are taken from the parsed settings when the keys
# "db_user", "db_password" and "db_name" are present; the previous hard-coded
# values remain as fallbacks so existing setups keep working.
conn = connection.MySQLConnection(
    user=config.get("db_user", "db_user"),
    password=config.get("db_password", "db_password"),
    host="localhost",
    port=tunnel.local_bind_port,
    database=config.get("db_name", "db_name"),
)
cursor = conn.cursor()

In [5]:
# List the tables of the connected schema.
query = "SHOW TABLES;"
cursor.execute(query)
table_rows = cursor.fetchall()
table_rows

[('alternative_name',),
 ('histone',),
 ('histone_description',),
 ('histone_has_publication',),
 ('publication',),
 ('sequence',),
 ('sequence_has_publication',)]

In [7]:
def _insert_sql(table, columns, named):
    """Build an INSERT statement for ``table`` with one placeholder per column.

    With ``named=True`` the placeholders are ``%(column)s`` (for dict
    parameters); with ``named=False`` they are positional ``%s`` (for tuples).
    """
    if named:
        marks = [f"%({column})s" for column in columns]
    else:
        marks = ["%s"] * len(columns)
    return f"INSERT INTO {table} ({', '.join(columns)}) VALUES ({', '.join(marks)})"


# INSERT templates, one per table; the column order fixes the parameter order
# for the positional (tuple) variants.
add_histone = _insert_sql(
    "histone",
    ["id", "level", "taxonomic_span", "taxonomic_span_id", "description", "parent"],
    named=True,
)
add_histone_description = _insert_sql(
    "histone_description",
    [
        "summary",
        "taxonomy",
        "genes",
        "evolution",
        "expression",
        "knock_out",
        "function",
        "sequence",
        "localization",
        "deposition",
        "structure",
        "interactions",
        "disease",
        "caveats",
    ],
    named=False,
)
add_publication = _insert_sql(
    "publication",
    ["id", "title", "doi", "author", "year"],
    named=True,
)
add_sequence = _insert_sql(
    "sequence",
    [
        "accession",
        "variant",
        "gi",
        "ncbi_gene_id",
        "hgnc_gene_name",
        "taxonomy_id",
        "organism",
        "phylum",
        "class",
        "taxonomy_group",
        "info",
        "sequence",
        "variant_under_consideration",
    ],
    named=True,
)
add_sequence_has_publication = _insert_sql(
    "sequence_has_publication",
    ["sequence_accession", "publication_id"],
    named=False,
)
add_alternate_names = _insert_sql(
    "alternative_name",
    ["name", "taxonomy", "gene", "splice", "histone"],
    named=True,
)
add_histone_has_publication = _insert_sql(
    "histone_has_publication",
    ["histone_id", "publication_id"],
    named=False,
)

In [8]:
def get_taxonomy_data(record, max_attempts=10, retry_delay=1.0):
    """Collect organism, taxonomy id, phylum and class for a sequence record.

    The taxonomy id is read from the ``taxon:<id>`` entry of the first
    feature's ``db_xref`` qualifiers; when no such entry can be read it falls
    back to 1. Phylum and class are then looked up in the NCBI ``taxonomy``
    database via ``Entrez.efetch``.

    Args:
        record: object exposing ``annotations["organism"]`` and
            ``features[0].qualifiers["db_xref"]`` (presumably a Biopython
            SeqRecord parsed from GenBank - confirm against callers).
        max_attempts: how many times the Entrez lookup is tried.
        retry_delay: seconds to wait between failed attempts.

    Returns:
        dict with keys ``organism``, ``taxonomy_id`` (int), ``phylum`` and
        ``class`` (``str`` or ``None`` when the lineage could not be fetched
        or does not contain that rank).

    Raises:
        KeyError: if ``record.annotations`` has no ``"organism"`` entry.
    """
    taxonomy_data = {"organism": record.annotations["organism"]}

    taxonomy_id = None
    try:
        for xref in record.features[0].qualifiers["db_xref"]:
            match = re.search(r"(\S+):(\S+)", xref)
            # Entries that are not "<db>:<id>" or belong to another database
            # are ignored; if several taxon entries exist the last one wins.
            if match is not None and match.group(1) == "taxon":
                print(f"Fetched taxid from NCBI {match.group(2)}")
                taxonomy_id = int(match.group(2))
    except Exception as exc:
        # Best effort: a record without features/qualifiers is still usable.
        print(f"Could not read db_xref qualifiers: {exc!r}")
    if taxonomy_id is None:
        # Also covers db_xref lists that simply contain no taxon entry, which
        # previously left "taxonomy_id" unset and crashed further down.
        print("!!!!!!Unable to get TAXID for this record setting it to 1")
        taxonomy_id = 1  # unable to identify
    taxonomy_data["taxonomy_id"] = taxonomy_id

    lineage = {}
    for attempt in range(1, max_attempts + 1):
        try:
            handle = Entrez.efetch(id=taxonomy_id, db="taxonomy", retmode="xml")
            try:
                tax_data = Entrez.read(handle)
            finally:
                handle.close()
            lineage = {
                entry["Rank"]: entry["ScientificName"]
                for entry in tax_data[0]["LineageEx"]
                if entry["Rank"] in ("class", "phylum")
            }
            break
        except Exception as exc:
            print(f"Unexpected error: {exc!r}, Retrying, attempt {attempt}")
            if attempt == max_attempts:
                print(
                    f"FATAL ERROR could not get class and phylum from NCBI after {max_attempts} attempts for taxid:{taxonomy_id}. Will add None for class and phylum!"
                )
            else:
                # Give a transient failure time to clear before retrying.
                time.sleep(retry_delay)

    for rank in ("phylum", "class"):
        name = lineage.get(rank)
        # Convert parser string elements to plain str; keep None as None.
        taxonomy_data[rank] = None if name is None else str(name)
    return taxonomy_data

In [9]:
# Show the alternative_name rows whose name is 'cid'.
query = "SELECT * FROM alternative_name WHERE name='cid'"
cursor.execute(query)
fetched_rows = cursor.fetchall()
pd.DataFrame(fetched_rows, columns=[column[0] for column in cursor.description])

Unnamed: 0,id,name,taxonomy,gene,splice,histone
0,65,cid,,,,cenH3
1,93,cid,,,,cenH3_(Animals)


# To Do

## <span style="color:green">Add a new node cenH3_(Insecta) to cenH3_(Animals)</span>

## <span style="color:green">Add a summary description to node cenH3_(Insecta)</span>

```cenH3_(Insecta) is a centromere-specific histone variant in insects, functionally analogous to CENP-A (Centromere Protein A) in other eukaryotes. It is essential for centromere specification and proper chromosome segregation during cell division. Notably, cenH3 is absent in four holocentric insect clades, suggesting alternative centromere organization mechanisms in these lineages [talbert_histone_2021, senaratne_formation_2021, sridhar_kinetochore_2022]. Some insect families, such as Drosophilidae and Culicidae, possess multiple cenH3 paralogs with evidence of functional specialization. For more overview of these families, see the cenH3_(Drosophilidae) and cenH3_(Culicidae) classes.```

## <span style="color:green">Add a gene description to node cenH3_(Insecta)</span>

```In some insect lineages, cenH3 has undergone gene duplication, leading to paralogs with distinct functional roles. For detailed lineage-specific information, see the cenH3_(Drosophilidae) and cenH3_(Culicidae) classes.```


## <span style="color:green">Add new nodes cenH3_(Drosophilidae), cenH3_(Culicidae) to cenH3_(Insecta)</span>

## <span style="color:green">Add a summary description to node cenH3_(Drosophilidae)</span>

```cenH3_(Drosophilidae) is a centromere-specific histone variant in Drosophilidae family, functionally analogous to CENP-A in mammals.```

## <span style="color:green">Add a gene description to node cenH3_(Drosophilidae)</span>

```In Drosophila, six cenH3 paralogs (Cid1–Cid6) have been identified: Cid1, Cid4, and Cid6 (and likely Cid2) are essential for mitotic chromosome segregation and are ubiquitously expressed in somatic cells, while the remaining paralogs are germline-specific and may play a role in suppressing centromere drive [kursel_gametic_2021, kursel_recurrent_2017].```

## <span style="color:green">Add a summary description to node cenH3_(Culicidae)</span>

```cenH3_(Culicidae) is a centromere-specific histone variant in Culicidae family (mosquitos), functionally analogous to CENP-A in mammals.```

## <span style="color:green">Add a gene description to node cenH3_(Culicidae)</span>

```In mosquitoes (Culicidae), three cenH3 paralogs exist: the evolutionarily ancient mosqCid1 and mosqCid2, which have been conserved for over 150 million years, and mosqCid3, a more recent paralog resulting from an independent duplication of mosqCid1 in Aedes aegypti and Aedes albopictus [kursel_ancient_2020].```

## <span style="color:green">Add alternate names cid, cid1, cid2, cid3, cid4, cid5, cid6, mosqCid1, mosqCid2, mosqCid3 to cenH3_(Insecta)</span>

## <span style="color:green">Add alternate names cid, cid1, cid2, cid3, cid4, cid5, cid6 to cenH3_(Drosophilidae)</span>

## <span style="color:green">Add alternate names mosqCid1, mosqCid2, mosqCid3 to cenH3_(Culicidae)</span>

## <span style="color:green">Add new nodes cenH3.1_(Drosophilidae), cenH3.2_(Drosophila_eugracilis), cenH3.3_(Montium), cenH3.4_(Montium), cenH3.5_(Drosophila), cenH3.6_(Repleta) to cenH3_(Drosophilidae)</span>

## <span style="color:green">Add a summary description to node cenH3.1_(Drosophilidae)</span>

```CenH3.1_(Drosophilidae) is a centromere-specific histone variant encoded by the Cid1 (also known as Cid) gene found in most Drosophila species except D. eugracilis, where it was pseudogenized and replaced by Cid2, and D. buzzatii and D. seriema, where Cid1 degenerated due to transposon insertions and was replaced by a new copy - Cid6 [teixeira_concurrent_2018, kursel_recurrent_2017]. As the ancestral centromeric histone in Drosophilidae, Cid1  represents the canonical centromeric histone in this family. It is essential for chromosome segregation during mitosis and meiosis, maintaining conserved histone-fold domains (HFD) critical for nucleosome assembly at centromeres. Unlike its paralogs (Cid3-Cid5), Cid1 is ubiquitously expressed in both somatic and germline tissues, though its expression is highest in gonads [kursel_recurrent_2017]. The N-terminal tail of Cid1 contains four conserved motifs. Remarkably, Cid1 persists throughout oogenesis in females but is specifically degraded during male meiosis I, disappearing from post-meiotic spermatids and sperm [kursel_gametic_2021].```

## <span style="color:green">Add a summary description to node cenH3.2_(Drosophila_eugracilis)</span>

```cenH3.2_(Drosophila_eugracilis) is a centromere-specific histone variant encoded by the Cid2 gene, which has been identified exclusively in Drosophila eugracilis as a rare replacement for the ancestral Cid1 gene [kursel_recurrent_2017]. Unlike other Cid paralogs that coexist with Cid1, Cid2 uniquely serves as the sole centromeric histone in this species following the pseudogenization of Cid1 due to a frameshift-causing 2-bp deletion. While Cid2's precise functional specialization remains unstudied, its singular presence in D. eugracilis suggests it has assumed the core centromeric functions of the lost Cid1, though potential neofunctionalization cannot be ruled out given the lack of comparative data with other Cid paralogs [kursel_recurrent_2017].```

## <span style="color:green">Add a summary description to node cenH3.3_(Montium)</span>

```cenH3.3_(Montium) is a centromere-specific histone variant encoded by the Cid3 gene, found exclusively in the montium subgroup of Drosophila (e.g., D. kikkawai, D. auraria) [kursel_recurrent_2017]. Unlike the ubiquitously expressed Cid1, Cid3 exhibits male germline-specific expression, suggesting specialized meiotic functions, potentially related to suppressing centromere drive during spermatogenesis [kursel_recurrent_2017]. The histone variant retains conserved histone-fold domains through gene conversion with Cid1 but has a divergent N-terminal tail. Evolving under positive selection, Cid3 shows adaptive changes in DNA-contact regions, suggesting conflict-driven evolution. This paralog persists alongside Cid1 and Cid4 in a triplicate system, maintaining non-redundant meiotic functions for 15 million years [kursel_recurrent_2017].```

## <span style="color:green">Add a summary description to node cenH3.4_(Montium)</span>

```cenH3.4_(Montium) is a centromere-specific histone variant encoded by the Cid4 gene, uniquely present in the montium subgroup of Drosophila. Unlike its paralogs, Cid4 shows broad somatic expression and has effectively replaced Cid1 as the dominant centromeric histone in mitotic cells [kursel_recurrent_2017]. The histone variant shows positive selection in its DNA-binding Loop 1 region and retains all four conserved N-terminal motifs found in Cid1, while acquiring a novel motif that may enable specialized interactions [kursel_recurrent_2017]. As part of the montium's three-gene system with Cid1 and Cid3, Cid4 specializes in maintaining mitotic centromere function through 15 million years of evolution.```

## <span style="color:green">Add a summary description to node cenH3.5_(Drosophila)</span>

```cenH3.5_(Drosophila) is a divergent centromeric histone paralog found exclusively in the Drosophila subgenus, originating from an ancient duplication of Cid1 [kursel_gametic_2021, kursel_recurrent_2017]. Unlike the ubiquitously expressed Cid1, Cid5 exhibits germline-restricted expression and has functionally specialized for male gametogenesis. It is absent in somatic cells but co-expressed with Cid1 in early germ cells of both sexes [kursel_gametic_2021].  Cid5 replaces Cid1 during male meiosis I, becoming the sole centromeric histone in sperm, but is lost during late oogenesis in females [kursel_gametic_2021]. After fertilization, paternally inherited Cid5 is rapidly replaced by maternal Cid1 in the embryo. The histone variant lacks conserved N-terminal motifs present in Cid1 and features a divergent histone-fold domain, potentially adapting it for sperm chromatin [kursel_gametic_2021]. Notably, Cid5 co-exists with CENP-C2 (male-biased CENP-C paralog) in the Drosophila subgenus, potentially forming a distinct kinetochore configuration during spermatogenesis [teixeira_concurrent_2018]. While Cid5 maintains key centromere localization motifs, its rapid evolution and testis-specific expression pattern indicate subfunctionalization or neofunctionalization relative to Cid1.```

## <span style="color:green">Add a summary description to node cenH3.6_(Repleta)</span>

```cenH3.6_(Repleta)  is a recently evolved centromeric histone variant found specifically in Drosophila buzzatii and D. seriema (repleta group), resulting from an interchromosomal duplication of Cid1 that occurred 4.6-11.3 million years ago [teixeira_concurrent_2018]. This paralog shows ~80% amino acid identity with ancestral Cid1 from closely related species like D. mojavensis, compared to only ~40% identity with Cid5. Unlike the male germline-restricted Cid5, Cid6 displays constitutive expression across all developmental stages, suggesting it functionally replaced the degenerated Cid1 copy in these species [teixeira_concurrent_2018]. While Cid6 maintains core centromere functions, it shows no evidence of positive selection observed in Cid5, indicating preservation of ancestral Cid1 roles rather than neofunctionalization [teixeira_concurrent_2018].```

## <span style="color:green">Add new nodes cenH3.1_(Culicidae), cenH3.2_(Culicidae), cenH3.3_(Aedes) to cenH3_(Culicidae)</span>

## <span style="color:green">Add a summary description to node cenH3.1_(Culicidae)</span>

```cenH3.1_(Culicidae) is a centromeric histone variant encoded by the mosqCid1 gene, found in most studied mosquito species (Anopheles, Aedes, Culex), with the exception of An. albimanus and An. darlingi where it has been lost [kursel_ancient_2020]. This gene emerged over 150 million years ago and has been retained in most species, acquiring specialized functions. Unlike its paralog mosqCid2, which shows high expression in ovaries and early embryos, mosqCid1 exhibits low expression levels in both somatic and germline tissues [kursel_ancient_2020]. Notably, mosqCid1 evolves under positive selection, suggesting its potential role in suppressing genetic conflicts such as centromere drive [kursel_ancient_2020]. Its N-terminal tail contains unique motifs absent in mosqCid2, likely reflecting differences in protein interactions.```

## <span style="color:green">Add a summary description to node cenH3.2_(Culicidae)</span>

```cenH3.2_(Culicidae) is the ancestral (emerged over 150 My) centromeric histone variant in mosquitoes, conserved across Anopheles, Aedes, and Culex species [kursel_ancient_2020]. Unlike its paralog mosqCid1, mosqCid2 specializes in germline processes, showing dramatic upregulation (6-10 fold) in ovaries after blood feeding and during early embryogenesis [kursel_ancient_2020]. It contains four unique N-terminal motifs absent in mosqCid1, reflecting functional divergence. Remarkably, mosqCid2 serves as the sole centromeric histone in An. albimanus and An. darlingi following mosqCid1 loss, demonstrating its fundamental role in chromosome segregation [kursel_ancient_2020].```

## <span style="color:green">Add a summary description to node cenH3.3_(Aedes)</span>

```cenH3.3_(Aedes) is a recently evolved centromeric histone variant unique to Aedes mosquitoes, arising from a lineage-specific duplication of mosqCid1 [kursel_ancient_2020]. This paralog demonstrates specialized temporal expression during embryogenesis, suggesting a developmental-stage specific role in early development. Unlike the germline-focused mosqCid2 or the ubiquitously expressed mosqCid1, mosqCid3 shows no significant expression in adult tissues or ovaries, indicating its exclusive embryonic function [kursel_ancient_2020].```

# <span style="color:black">Add a new node cenH3_(Insecta) to cenH3_(Animals)</span>

In [14]:
# Insert the cenH3_(Insecta) node under cenH3_(Animals). The description is
# left empty here and linked in a later cell.
data_histone = {
    "id": "cenH3_(Insecta)",
    "level": "variant",
    "taxonomic_span": "Insecta",
    # Stray trailing space removed from the taxonomy id.
    "taxonomic_span_id": "50557",
    "description": None,
    "parent": "cenH3_(Animals)",
}
cursor.execute(add_histone, data_histone)

In [13]:
# Reload the histone table and display only the cenH3_(Insecta) row.
query = "SELECT * FROM histone"
cursor.execute(query)
fetched_rows = cursor.fetchall()
histone_df = pd.DataFrame(
    fetched_rows, columns=[column[0] for column in cursor.description]
)
histone_df.loc[histone_df["id"].isin(["cenH3_(Insecta)"])]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
12,cenH3_(Insecta),variant,Insecta,50557,,cenH3_(Animals)


In [14]:
# Make sure data is committed to the database
conn.commit()

# <span style="color:black">Add a summary description to node cenH3_(Insecta)</span>

```cenH3_(Insecta) is a centromere-specific histone variant in insects, functionally analogous to CENP-A (Centromere Protein A) in other eukaryotes. It is essential for centromere specification and proper chromosome segregation during cell division. Notably, cenH3 is absent in four holocentric insect clades, suggesting alternative centromere organization mechanisms in these lineages [talbert_histone_2021, senaratne_formation_2021, sridhar_kinetochore_2022]. Some insect families, such as Drosophilidae and Culicidae, possess multiple cenH3 paralogs with evidence of functional specialization. For more overview of these families, see the cenH3_(Drosophilidae) and cenH3_(Culicidae) classes.```

# <span style="color:black">Add a gene description to node cenH3_(Insecta)</span>

```In some insect lineages, cenH3 has undergone gene duplication, leading to paralogs with distinct functional roles. For detailed lineage-specific information, see the cenH3_(Drosophilidae) and cenH3_(Culicidae) classes.```

In [15]:
# Show cenH3_(Insecta) joined with its description row (if any) before the
# description is inserted.
query = " ".join(
    [
        "SELECT * FROM histone h LEFT JOIN histone_description hd",
        "ON h.description = hd.id",
        "WHERE h.id='cenH3_(Insecta)'",
    ]
)
cursor.execute(query)
fetched_rows = cursor.fetchall()
df = pd.DataFrame(fetched_rows, columns=[column[0] for column in cursor.description])
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cenH3_(Insecta),variant,Insecta,50557,,cenH3_(Animals),,,,,...,,,,,,,,,,


In [16]:
# Summary and gene texts stored for the cenH3_(Insecta) node.
histone_description_summary = "cenH3_(Insecta) is a centromere-specific histone variant in insects, functionally analogous to CENP-A (Centromere Protein A) in other eukaryotes. It is essential for centromere specification and proper chromosome segregation during cell division. Notably, cenH3 is absent in four holocentric insect clades, suggesting alternative centromere organization mechanisms in these lineages [talbert_histone_2021, senaratne_formation_2021, sridhar_kinetochore_2022]. Some insect families, such as Drosophilidae and Culicidae, possess multiple cenH3 paralogs with evidence of functional specialization. For more overview of these families, see the cenH3_(Drosophilidae) and cenH3_(Culicidae) classes."
histone_description_genes = "In some insect lineages, cenH3 has undergone gene duplication, leading to paralogs with distinct functional roles. For detailed lineage-specific information, see the cenH3_(Drosophilidae) and cenH3_(Culicidae) classes."
# Values follow the column order of add_histone_description: summary,
# taxonomy, genes, then the 11 remaining columns, all passed as None.
data_histone_description = (
    histone_description_summary,
    None,
    histone_description_genes,
) + (None,) * 11
cursor.execute(add_histone_description, data_histone_description)

# Link the freshly inserted description row to the histone node. Values are
# bound as parameters, like every other statement in this notebook, instead
# of being formatted into the SQL text.
histone_description_id = cursor.lastrowid
query = "UPDATE histone SET description=%s WHERE id=%s"
cursor.execute(query, (histone_description_id, "cenH3_(Insecta)"))

In [17]:
# Show cenH3_(Insecta) joined with its description row after the insert.
query = " ".join(
    [
        "SELECT * FROM histone h LEFT JOIN histone_description hd",
        "ON h.description = hd.id",
        "WHERE h.id='cenH3_(Insecta)'",
    ]
)
cursor.execute(query)
fetched_rows = cursor.fetchall()
pd.DataFrame(fetched_rows, columns=[column[0] for column in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cenH3_(Insecta),variant,Insecta,50557,237,cenH3_(Animals),237,cenH3_(Insecta) is a centromere-specific histo...,,"In some insect lineages, cenH3 has undergone g...",...,,,,,,,,,,


In [18]:
# Make sure data is committed to the database
conn.commit()

# <span style="color:black">Add new nodes cenH3_(Drosophilidae), cenH3_(Culicidae) to cenH3_(Insecta)</span>

In [19]:
# Insert the two family-level nodes under cenH3_(Insecta), in this order.
for node_id, span_name, span_taxid in (
    ("cenH3_(Drosophilidae)", "Drosophilidae", "7214"),
    ("cenH3_(Culicidae)", "Culicidae", "7157"),
):
    data_histone = dict(
        id=node_id,
        level="variant",
        taxonomic_span=span_name,
        taxonomic_span_id=span_taxid,
        description=None,
        parent="cenH3_(Insecta)",
    )
    cursor.execute(add_histone, data_histone)

In [20]:
# Reload the histone table and display the two family-level rows.
query = "SELECT * FROM histone"
cursor.execute(query)
fetched_rows = cursor.fetchall()
histone_df = pd.DataFrame(
    fetched_rows, columns=[column[0] for column in cursor.description]
)
family_nodes = ["cenH3_(Drosophilidae)", "cenH3_(Culicidae)"]
histone_df.loc[histone_df["id"].isin(family_nodes)]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
10,cenH3_(Culicidae),variant,Culicidae,7157,,cenH3_(Insecta)
11,cenH3_(Drosophilidae),variant,Drosophilidae,7214,,cenH3_(Insecta)


In [21]:
# Make sure data is committed to the database
conn.commit()

# <span style="color:black">Add a summary description to node cenH3_(Drosophilidae)</span>

```cenH3_(Drosophilidae) is a centromere-specific histone variant in Drosophilidae family, functionally analogous to CENP-A in mammals.```

# <span style="color:black">Add a gene description to node cenH3_(Drosophilidae)</span>

```In Drosophila, six cenH3 paralogs (Cid1–Cid6) have been identified: Cid1, Cid4, and Cid6 (and likely Cid2) are essential for mitotic chromosome segregation and are ubiquitously expressed in somatic cells, while the remaining paralogs are germline-specific and may play a role in suppressing centromere drive [kursel_gametic_2021, kursel_recurrent_2017].```

In [22]:
# Show cenH3_(Drosophilidae) joined with its description row (if any) before
# the description is inserted.
query = " ".join(
    [
        "SELECT * FROM histone h LEFT JOIN histone_description hd",
        "ON h.description = hd.id",
        "WHERE h.id='cenH3_(Drosophilidae)'",
    ]
)
cursor.execute(query)
fetched_rows = cursor.fetchall()
df = pd.DataFrame(fetched_rows, columns=[column[0] for column in cursor.description])
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cenH3_(Drosophilidae),variant,Drosophilidae,7214,,cenH3_(Insecta),,,,,...,,,,,,,,,,


In [23]:
# Summary and gene texts stored for the cenH3_(Drosophilidae) node.
histone_description_summary = "cenH3_(Drosophilidae) is a centromere-specific histone variant in Drosophilidae family, functionally analogous to CENP-A in mammals."
histone_description_genes = "In Drosophila, six cenH3 paralogs (Cid1–Cid6) have been identified: Cid1, Cid4, and Cid6 (and likely Cid2) are essential for mitotic chromosome segregation and are ubiquitously expressed in somatic cells, while the remaining paralogs are germline-specific and may play a role in suppressing centromere drive [kursel_gametic_2021, kursel_recurrent_2017]."
# Values follow the column order of add_histone_description: summary,
# taxonomy, genes, then the 11 remaining columns, all passed as None.
data_histone_description = (
    histone_description_summary,
    None,
    histone_description_genes,
) + (None,) * 11
cursor.execute(add_histone_description, data_histone_description)

# Link the freshly inserted description row to the histone node. Values are
# bound as parameters, like every other statement in this notebook, instead
# of being formatted into the SQL text.
histone_description_id = cursor.lastrowid
query = "UPDATE histone SET description=%s WHERE id=%s"
cursor.execute(query, (histone_description_id, "cenH3_(Drosophilidae)"))

In [24]:
# Show cenH3_(Drosophilidae) joined with its description row after the insert.
query = " ".join(
    [
        "SELECT * FROM histone h LEFT JOIN histone_description hd",
        "ON h.description = hd.id",
        "WHERE h.id='cenH3_(Drosophilidae)'",
    ]
)
cursor.execute(query)
fetched_rows = cursor.fetchall()
pd.DataFrame(fetched_rows, columns=[column[0] for column in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cenH3_(Drosophilidae),variant,Drosophilidae,7214,238,cenH3_(Insecta),238,cenH3_(Drosophilidae) is a centromere-specific...,,"In Drosophila, six cenH3 paralogs (Cid1–Cid6) ...",...,,,,,,,,,,


In [25]:
# Make sure data is committed to the database
conn.commit()

# <span style="color:black">Add a summary description to node cenH3_(Culicidae)</span>

```cenH3_(Culicidae) is a centromere-specific histone variant in Culicidae family (mosquitos), functionally analogous to CENP-A in mammals.```

# <span style="color:black">Add a gene description to node cenH3_(Culicidae)</span>

```In mosquitoes (Culicidae), three cenH3 paralogs exist: the evolutionarily ancient mosqCid1 and mosqCid2, which have been conserved for over 150 million years, and mosqCid3, a more recent paralog resulting from an independent duplication of mosqCid1 in Aedes aegypti and Aedes albopictus [kursel_ancient_2020].```

In [26]:
# Show cenH3_(Culicidae) joined with its description row (if any) before the
# description is inserted.
query = " ".join(
    [
        "SELECT * FROM histone h LEFT JOIN histone_description hd",
        "ON h.description = hd.id",
        "WHERE h.id='cenH3_(Culicidae)'",
    ]
)
cursor.execute(query)
fetched_rows = cursor.fetchall()
df = pd.DataFrame(fetched_rows, columns=[column[0] for column in cursor.description])
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cenH3_(Culicidae),variant,Culicidae,7157,,cenH3_(Insecta),,,,,...,,,,,,,,,,


In [27]:
# Summary and gene texts stored for the cenH3_(Culicidae) node.
histone_description_summary = "cenH3_(Culicidae) is a centromere-specific histone variant in Culicidae family (mosquitos), functionally analogous to CENP-A in mammals."
histone_description_genes = "In mosquitoes (Culicidae), three cenH3 paralogs exist: the evolutionarily ancient mosqCid1 and mosqCid2, which have been conserved for over 150 million years, and mosqCid3, a more recent paralog resulting from an independent duplication of mosqCid1 in Aedes aegypti and Aedes albopictus [kursel_ancient_2020]."
# Values follow the column order of add_histone_description: summary,
# taxonomy, genes, then the 11 remaining columns, all passed as None.
data_histone_description = (
    histone_description_summary,
    None,
    histone_description_genes,
) + (None,) * 11
cursor.execute(add_histone_description, data_histone_description)

# Link the freshly inserted description row to the histone node. Values are
# bound as parameters, like every other statement in this notebook, instead
# of being formatted into the SQL text.
histone_description_id = cursor.lastrowid
query = "UPDATE histone SET description=%s WHERE id=%s"
cursor.execute(query, (histone_description_id, "cenH3_(Culicidae)"))

In [28]:
# Show cenH3_(Culicidae) joined with its description row after the insert.
query = " ".join(
    [
        "SELECT * FROM histone h LEFT JOIN histone_description hd",
        "ON h.description = hd.id",
        "WHERE h.id='cenH3_(Culicidae)'",
    ]
)
cursor.execute(query)
fetched_rows = cursor.fetchall()
pd.DataFrame(fetched_rows, columns=[column[0] for column in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cenH3_(Culicidae),variant,Culicidae,7157,239,cenH3_(Insecta),239,cenH3_(Culicidae) is a centromere-specific his...,,"In mosquitoes (Culicidae), three cenH3 paralog...",...,,,,,,,,,,


In [29]:
# Make sure data is committed to the database
conn.commit()

# <span style="color:black">Add alternate names cid, cid1, cid2, cid3, cid4, cid5, cid6, mosqCid1, mosqCid2, mosqCid3 to cenH3_(Insecta)</span>

In [30]:
# Show the alternative names currently attached to cenH3_(Insecta).
query = "SELECT * FROM alternative_name WHERE histone='cenH3_(Insecta)'"
cursor.execute(query)
fetched_rows = cursor.fetchall()
pd.DataFrame(fetched_rows, columns=[column[0] for column in cursor.description])

Unnamed: 0,id,name,taxonomy,gene,splice,histone


In [31]:
# Bare "cid", then cid1..cid6, then mosqCid1..mosqCid3.
cid_names = ["cid", *(f"cid{number}" for number in range(1, 7))]
mosq_cid_names = [f"mosqCid{number}" for number in range(1, 4)]
alternate_names = cid_names + mosq_cid_names
alternate_names

['cid',
 'cid1',
 'cid2',
 'cid3',
 'cid4',
 'cid5',
 'cid6',
 'mosqCid1',
 'mosqCid2',
 'mosqCid3']

In [32]:
# Attach every collected alias to cenH3_(Insecta); only the name and the
# owning histone are set, the remaining fields are passed as None.
for alias in alternate_names:
    data_alternate_name = dict(
        name=alias,
        taxonomy=None,
        gene=None,
        splice=None,
        histone="cenH3_(Insecta)",
    )
    cursor.execute(add_alternate_names, data_alternate_name)

In [33]:
# Show the alternative names attached to cenH3_(Insecta) after the inserts.
query = "SELECT * FROM alternative_name WHERE histone='cenH3_(Insecta)'"
cursor.execute(query)
fetched_rows = cursor.fetchall()
pd.DataFrame(fetched_rows, columns=[column[0] for column in cursor.description])

Unnamed: 0,id,name,taxonomy,gene,splice,histone
0,99,cid,,,,cenH3_(Insecta)
1,100,cid1,,,,cenH3_(Insecta)
2,101,cid2,,,,cenH3_(Insecta)
3,102,cid3,,,,cenH3_(Insecta)
4,103,cid4,,,,cenH3_(Insecta)
5,104,cid5,,,,cenH3_(Insecta)
6,105,cid6,,,,cenH3_(Insecta)
7,106,mosqCid1,,,,cenH3_(Insecta)
8,107,mosqCid2,,,,cenH3_(Insecta)
9,108,mosqCid3,,,,cenH3_(Insecta)


In [34]:
# Make sure data is committed to the database
conn.commit()

# <span style="color:black">Add alternate names cid, cid1, cid2, cid3, cid4, cid5, cid6 to cenH3_(Drosophilidae)</span>

In [36]:
# Show the alternative names currently attached to cenH3_(Drosophilidae).
query = "SELECT * FROM alternative_name WHERE histone='cenH3_(Drosophilidae)'"
cursor.execute(query)
fetched_rows = cursor.fetchall()
pd.DataFrame(fetched_rows, columns=[column[0] for column in cursor.description])

Unnamed: 0,id,name,taxonomy,gene,splice,histone


In [35]:
# Bare "cid" followed by cid1..cid6.
alternate_names = ["cid", *(f"cid{number}" for number in range(1, 7))]
alternate_names

['cid', 'cid1', 'cid2', 'cid3', 'cid4', 'cid5', 'cid6']

In [37]:
# Attach every collected alias to cenH3_(Drosophilidae); only the name and
# the owning histone are set, the remaining fields are passed as None.
for alias in alternate_names:
    data_alternate_name = dict(
        name=alias,
        taxonomy=None,
        gene=None,
        splice=None,
        histone="cenH3_(Drosophilidae)",
    )
    cursor.execute(add_alternate_names, data_alternate_name)

In [38]:
# Show the alternative names attached to cenH3_(Drosophilidae) after the
# inserts.
query = "SELECT * FROM alternative_name WHERE histone='cenH3_(Drosophilidae)'"
cursor.execute(query)
fetched_rows = cursor.fetchall()
pd.DataFrame(fetched_rows, columns=[column[0] for column in cursor.description])

Unnamed: 0,id,name,taxonomy,gene,splice,histone
0,109,cid,,,,cenH3_(Drosophilidae)
1,110,cid1,,,,cenH3_(Drosophilidae)
2,111,cid2,,,,cenH3_(Drosophilidae)
3,112,cid3,,,,cenH3_(Drosophilidae)
4,113,cid4,,,,cenH3_(Drosophilidae)
5,114,cid5,,,,cenH3_(Drosophilidae)
6,115,cid6,,,,cenH3_(Drosophilidae)


In [39]:
# Make sure data is committed to the database
conn.commit()

# <span style="color:black">Add alternate names mosqCid1, mosqCid2, mosqCid3 to cenH3_(Culicidae)</span>

In [40]:
# Show the alternative names currently attached to cenH3_(Culicidae).
query = "SELECT * FROM alternative_name WHERE histone='cenH3_(Culicidae)'"
cursor.execute(query)
fetched_rows = cursor.fetchall()
pd.DataFrame(fetched_rows, columns=[column[0] for column in cursor.description])

Unnamed: 0,id,name,taxonomy,gene,splice,histone


In [41]:
# mosqCid1..mosqCid3
alternate_names = ["mosqCid" + str(number) for number in (1, 2, 3)]
alternate_names

['mosqCid1', 'mosqCid2', 'mosqCid3']

In [42]:
# Attach every collected alias to cenH3_(Culicidae); only the name and the
# owning histone are set, the remaining fields are passed as None.
for alias in alternate_names:
    data_alternate_name = dict(
        name=alias,
        taxonomy=None,
        gene=None,
        splice=None,
        histone="cenH3_(Culicidae)",
    )
    cursor.execute(add_alternate_names, data_alternate_name)

In [43]:
# Show the alternative names attached to cenH3_(Culicidae) after the inserts.
query = "SELECT * FROM alternative_name WHERE histone='cenH3_(Culicidae)'"
cursor.execute(query)
fetched_rows = cursor.fetchall()
pd.DataFrame(fetched_rows, columns=[column[0] for column in cursor.description])

Unnamed: 0,id,name,taxonomy,gene,splice,histone
0,116,mosqCid1,,,,cenH3_(Culicidae)
1,117,mosqCid2,,,,cenH3_(Culicidae)
2,118,mosqCid3,,,,cenH3_(Culicidae)


In [44]:
# Commit the open transaction so the cenH3_(Culicidae) alternative names
# inserted above are saved to the database.
conn.commit()

# <span style="color:black">Add new nodes cenH3.1_(Drosophilidae), cenH3.2_(Drosophila_eugracilis), cenH3.3_(Montium), cenH3.4_(Montium), cenH3.5_(Drosophila), cenH3.6_(Repleta) to cenH3_(Drosophilidae)</span>

In [45]:
# Ids of the variant nodes added in this section, numbered cenH3.1 to cenH3.6
# and suffixed with the clade each one is named after.
histone_nodes = [
    f"cenH3.{number}_({clade})"
    for number, clade in enumerate(
        [
            "Drosophilidae",
            "Drosophila_eugracilis",
            "Montium",
            "Montium",
            "Drosophila",
            "Repleta",
        ],
        start=1,
    )
]

In [46]:
# One histone row per new variant node. Each entry is
# (node id, taxonomic_span, taxonomic_span_id); level, description and parent
# are the same for all of them.
for node_id, span_name, span_taxid in (
    ("cenH3.1_(Drosophilidae)", "Drosophilidae", "7214"),
    ("cenH3.2_(Drosophila_eugracilis)", "Drosophila eugracilis", "29029"),
    ("cenH3.3_(Montium)", "montium subgroup", "32352"),
    ("cenH3.4_(Montium)", "montium subgroup", "32352"),
    ("cenH3.5_(Drosophila)", "Drosophila", "32281"),
    ("cenH3.6_(Repleta)", "repleta group", "32321"),
):
    data_histone = {
        "id": node_id,
        "level": "variant",
        "taxonomic_span": span_name,
        "taxonomic_span_id": span_taxid,
        "description": None,
        "parent": "cenH3_(Drosophilidae)",
    }
    cursor.execute(add_histone, data_histone)

In [47]:
# Load the whole histone table, then show only the rows whose id is in histone_nodes.
query = "SELECT * FROM histone"
cursor.execute(query)
histone_df = pd.DataFrame(
    data=cursor.fetchall(),
    columns=[name for name, *_ in cursor.description],
)
is_new_node = histone_df["id"].isin(histone_nodes)
histone_df[is_new_node]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
5,cenH3.1_(Drosophilidae),variant,Drosophilidae,7214,,cenH3_(Drosophilidae)
8,cenH3.2_(Drosophila_eugracilis),variant,Drosophila eugracilis,29029,,cenH3_(Drosophilidae)
11,cenH3.3_(Montium),variant,montium subgroup,32352,,cenH3_(Drosophilidae)
12,cenH3.4_(Montium),variant,montium subgroup,32352,,cenH3_(Drosophilidae)
13,cenH3.5_(Drosophila),variant,Drosophila,32281,,cenH3_(Drosophilidae)
14,cenH3.6_(Repleta),variant,repleta group,32321,,cenH3_(Drosophilidae)


In [48]:
# Commit the open transaction so the new variant nodes inserted above under
# cenH3_(Drosophilidae) are saved to the database.
conn.commit()

# <span style="color:black">Add a summary description to node cenH3.1</span>

```CenH3.1_(Drosophilidae) is a centromere-specific histone variant encoded by the Cid1 (also known as Cid) gene found in most Drosophila species except D. eugracilis, where it was pseudogenized and replaced by Cid2, and D. buzzatii and D. seriema, where Cid1 degenerated due to transposon insertions and was replaced by a new copy - Cid6 [teixeira_concurrent_2018, kursel_recurrent_2017]. As the ancestral centromeric histone in Drosophilidae, Cid1  represents the canonical centromeric histone in this family. It is essential for chromosome segregation during mitosis and meiosis, maintaining conserved histone-fold domains (HFD) critical for nucleosome assembly at centromeres. Unlike its paralogs (Cid3-Cid5), Cid1 is ubiquitously expressed in both somatic and germline tissues, though its expression is highest in gonads [kursel_recurrent_2017]. The N-terminal tail of Cid1 contains four conserved motifs. Remarkably, Cid1 persists throughout oogenesis in females but is specifically degraded during male meiosis I, disappearing from post-meiotic spermatids and sperm [kursel_gametic_2021].```

In [49]:
# Show the node joined to its description row; the LEFT JOIN keeps the node
# even while no description is linked yet.
query = " ".join(
    [
        "SELECT * FROM histone h",
        "LEFT JOIN histone_description hd ON h.description = hd.id",
        "WHERE h.id='cenH3.1_(Drosophilidae)'",
    ]
)
cursor.execute(query)
df = pd.DataFrame(
    data=cursor.fetchall(),
    columns=[name for name, *_ in cursor.description],
)
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cenH3.1_(Drosophilidae),variant,Drosophilidae,7214,,cenH3_(Drosophilidae),,,,,...,,,,,,,,,,


In [50]:
# Summary text for this node; it is passed as the first value of the
# add_histone_description INSERT (the summary column).
histone_description_summary = "CenH3.1_(Drosophilidae) is a centromere-specific histone variant encoded by the Cid1 (also known as Cid) gene found in most Drosophila species except D. eugracilis, where it was pseudogenized and replaced by Cid2, and D. buzzatii and D. seriema, where Cid1 degenerated due to transposon insertions and was replaced by a new copy - Cid6 [teixeira_concurrent_2018, kursel_recurrent_2017]. As the ancestral centromeric histone in Drosophilidae, Cid1  represents the canonical centromeric histone in this family. It is essential for chromosome segregation during mitosis and meiosis, maintaining conserved histone-fold domains (HFD) critical for nucleosome assembly at centromeres. Unlike its paralogs (Cid3-Cid5), Cid1 is ubiquitously expressed in both somatic and germline tissues, though its expression is highest in gonads [kursel_recurrent_2017]. The N-terminal tail of Cid1 contains four conserved motifs. Remarkably, Cid1 persists throughout oogenesis in females but is specifically degraded during male meiosis I, disappearing from post-meiotic spermatids and sperm [kursel_gametic_2021]."
# The INSERT takes 14 positional values; only the summary is filled in, the
# other 13 description fields are passed as None.
data_histone_description = (histone_description_summary,) + (None,) * 13
cursor.execute(add_histone_description, data_histone_description)

# Link the new description row to its histone node. Both values are bound as
# query parameters instead of being formatted into the SQL text.
histone_description_id = cursor.lastrowid
target_histone_id = "cenH3.1_(Drosophilidae)"
query = "UPDATE histone SET description=%s WHERE id=%s"
cursor.execute(query, (histone_description_id, target_histone_id))

# Fail loudly if no histone row was updated, rather than leaving an unlinked
# description row waiting to be committed.
if cursor.rowcount != 1:
    conn.rollback()
    raise RuntimeError(f"UPDATE did not match histone {target_histone_id!r}; transaction rolled back")

In [51]:
# Re-read the node joined to its description row to confirm the link.
query = " ".join(
    [
        "SELECT * FROM histone h",
        "LEFT JOIN histone_description hd ON h.description = hd.id",
        "WHERE h.id='cenH3.1_(Drosophilidae)'",
    ]
)
cursor.execute(query)
pd.DataFrame(
    data=cursor.fetchall(),
    columns=[name for name, *_ in cursor.description],
)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cenH3.1_(Drosophilidae),variant,Drosophilidae,7214,240,cenH3_(Drosophilidae),240,CenH3.1_(Drosophilidae) is a centromere-specif...,,,...,,,,,,,,,,


In [52]:
# Commit the open transaction so the description added above for
# cenH3.1_(Drosophilidae) is saved to the database.
conn.commit()

# <span style="color:black">Add a summary description to node cenH3.2_(Drosophila_eugracilis)</span>

```cenH3.2_(Drosophila_eugracilis) is a centromere-specific histone variant encoded by the Cid2 gene, which has been identified exclusively in Drosophila eugracilis as a rare replacement for the ancestral Cid1 gene [kursel_recurrent_2017]. Unlike other Cid paralogs that coexist with Cid1, Cid2 uniquely serves as the sole centromeric histone in this species following the pseudogenization of Cid1 due to a frameshift-causing 2-bp deletion. While Cid2's precise functional specialization remains unstudied, its singular presence in D. eugracilis suggests it has assumed the core centromeric functions of the lost Cid1, though potential neofunctionalization cannot be ruled out given the lack of comparative data with other Cid paralogs [kursel_recurrent_2017].```

In [53]:
# Show the node joined to its description row; the LEFT JOIN keeps the node
# even while no description is linked yet.
query = " ".join(
    [
        "SELECT * FROM histone h",
        "LEFT JOIN histone_description hd ON h.description = hd.id",
        "WHERE h.id='cenH3.2_(Drosophila_eugracilis)'",
    ]
)
cursor.execute(query)
df = pd.DataFrame(
    data=cursor.fetchall(),
    columns=[name for name, *_ in cursor.description],
)
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cenH3.2_(Drosophila_eugracilis),variant,Drosophila eugracilis,29029,,cenH3_(Drosophilidae),,,,,...,,,,,,,,,,


In [54]:
# Summary text for this node; it is passed as the first value of the
# add_histone_description INSERT (the summary column).
histone_description_summary = "cenH3.2_(Drosophila_eugracilis) is a centromere-specific histone variant encoded by the Cid2 gene, which has been identified exclusively in Drosophila eugracilis as a rare replacement for the ancestral Cid1 gene [kursel_recurrent_2017]. Unlike other Cid paralogs that coexist with Cid1, Cid2 uniquely serves as the sole centromeric histone in this species following the pseudogenization of Cid1 due to a frameshift-causing 2-bp deletion. While Cid2's precise functional specialization remains unstudied, its singular presence in D. eugracilis suggests it has assumed the core centromeric functions of the lost Cid1, though potential neofunctionalization cannot be ruled out given the lack of comparative data with other Cid paralogs [kursel_recurrent_2017]."
# The INSERT takes 14 positional values; only the summary is filled in, the
# other 13 description fields are passed as None.
data_histone_description = (histone_description_summary,) + (None,) * 13
cursor.execute(add_histone_description, data_histone_description)

# Link the new description row to its histone node. Both values are bound as
# query parameters instead of being formatted into the SQL text.
histone_description_id = cursor.lastrowid
target_histone_id = "cenH3.2_(Drosophila_eugracilis)"
query = "UPDATE histone SET description=%s WHERE id=%s"
cursor.execute(query, (histone_description_id, target_histone_id))

# Fail loudly if no histone row was updated, rather than leaving an unlinked
# description row waiting to be committed.
if cursor.rowcount != 1:
    conn.rollback()
    raise RuntimeError(f"UPDATE did not match histone {target_histone_id!r}; transaction rolled back")

In [55]:
# Re-read the node joined to its description row to confirm the link.
query = " ".join(
    [
        "SELECT * FROM histone h",
        "LEFT JOIN histone_description hd ON h.description = hd.id",
        "WHERE h.id='cenH3.2_(Drosophila_eugracilis)'",
    ]
)
cursor.execute(query)
pd.DataFrame(
    data=cursor.fetchall(),
    columns=[name for name, *_ in cursor.description],
)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cenH3.2_(Drosophila_eugracilis),variant,Drosophila eugracilis,29029,241,cenH3_(Drosophilidae),241,cenH3.2_(Drosophila_eugracilis) is a centromer...,,,...,,,,,,,,,,


In [56]:
# Commit the open transaction so the description added above for
# cenH3.2_(Drosophila_eugracilis) is saved to the database.
conn.commit()

# <span style="color:black">Add a summary description to node cenH3.3_(Montium)</span>

```cenH3.3_(Montium) is a centromere-specific histone variant encoded by the Cid3 gene, found exclusively in the montium subgroup of Drosophila (e.g., D. kikkawai, D. auraria) [kursel_recurrent_2017]. Unlike the ubiquitously expressed Cid1, Cid3 exhibits male germline-specific expression, suggesting specialized meiotic functions, potentially related to suppressing centromere drive during spermatogenesis [kursel_recurrent_2017]. The histone variant retains conserved histone-fold domains through gene conversion with Cid1 but has a divergent N-terminal tail. Evolving under positive selection, Cid3 shows adaptive changes in DNA-contact regions, suggesting conflict-driven evolution. This paralog persists alongside Cid1 and Cid4 in a triplicate system, maintaining non-redundant meiotic functions for 15 million years [kursel_recurrent_2017].```

In [57]:
# Show the node joined to its description row; the LEFT JOIN keeps the node
# even while no description is linked yet.
query = " ".join(
    [
        "SELECT * FROM histone h",
        "LEFT JOIN histone_description hd ON h.description = hd.id",
        "WHERE h.id='cenH3.3_(Montium)'",
    ]
)
cursor.execute(query)
df = pd.DataFrame(
    data=cursor.fetchall(),
    columns=[name for name, *_ in cursor.description],
)
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cenH3.3_(Montium),variant,montium subgroup,32352,,cenH3_(Drosophilidae),,,,,...,,,,,,,,,,


In [58]:
# Summary text for this node; it is passed as the first value of the
# add_histone_description INSERT (the summary column).
histone_description_summary = "cenH3.3_(Montium) is a centromere-specific histone variant encoded by the Cid3 gene, found exclusively in the montium subgroup of Drosophila (e.g., D. kikkawai, D. auraria) [kursel_recurrent_2017]. Unlike the ubiquitously expressed Cid1, Cid3 exhibits male germline-specific expression, suggesting specialized meiotic functions, potentially related to suppressing centromere drive during spermatogenesis [kursel_recurrent_2017]. The histone variant retains conserved histone-fold domains through gene conversion with Cid1 but has a divergent N-terminal tail. Evolving under positive selection, Cid3 shows adaptive changes in DNA-contact regions, suggesting conflict-driven evolution. This paralog persists alongside Cid1 and Cid4 in a triplicate system, maintaining non-redundant meiotic functions for 15 million years [kursel_recurrent_2017]."
# The INSERT takes 14 positional values; only the summary is filled in, the
# other 13 description fields are passed as None.
data_histone_description = (histone_description_summary,) + (None,) * 13
cursor.execute(add_histone_description, data_histone_description)

# Link the new description row to its histone node. Both values are bound as
# query parameters instead of being formatted into the SQL text.
histone_description_id = cursor.lastrowid
target_histone_id = "cenH3.3_(Montium)"
query = "UPDATE histone SET description=%s WHERE id=%s"
cursor.execute(query, (histone_description_id, target_histone_id))

# Fail loudly if no histone row was updated, rather than leaving an unlinked
# description row waiting to be committed.
if cursor.rowcount != 1:
    conn.rollback()
    raise RuntimeError(f"UPDATE did not match histone {target_histone_id!r}; transaction rolled back")

In [59]:
# Re-read the node joined to its description row to confirm the link.
query = " ".join(
    [
        "SELECT * FROM histone h",
        "LEFT JOIN histone_description hd ON h.description = hd.id",
        "WHERE h.id='cenH3.3_(Montium)'",
    ]
)
cursor.execute(query)
pd.DataFrame(
    data=cursor.fetchall(),
    columns=[name for name, *_ in cursor.description],
)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cenH3.3_(Montium),variant,montium subgroup,32352,242,cenH3_(Drosophilidae),242,cenH3.3_(Montium) is a centromere-specific his...,,,...,,,,,,,,,,


In [60]:
# Commit the open transaction so the description added above for
# cenH3.3_(Montium) is saved to the database.
conn.commit()

# <span style="color:black">Add a summary description to node cenH3.4_(Montium)</span>

```cenH3.4_(Montium) is a centromere-specific histone variant encoded by the Cid4 gene, uniquely present in the montium subgroup of Drosophila. Unlike its paralogs, Cid4 shows broad somatic expression and has effectively replaced Cid1 as the dominant centromeric histone in mitotic cells [kursel_recurrent_2017]. The histone variant shows positive selection in its DNA-binding Loop 1 region and retains all four conserved N-terminal motifs found in Cid1, while acquiring a novel motif that may enable specialized interactions [kursel_recurrent_2017]. As part of the montium's three-gene system with Cid1 and Cid3, Cid4 specializes in maintaining mitotic centromere function through 15 million years of evolution.```

In [61]:
# Show the node joined to its description row; the LEFT JOIN keeps the node
# even while no description is linked yet.
query = " ".join(
    [
        "SELECT * FROM histone h",
        "LEFT JOIN histone_description hd ON h.description = hd.id",
        "WHERE h.id='cenH3.4_(Montium)'",
    ]
)
cursor.execute(query)
df = pd.DataFrame(
    data=cursor.fetchall(),
    columns=[name for name, *_ in cursor.description],
)
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cenH3.4_(Montium),variant,montium subgroup,32352,,cenH3_(Drosophilidae),,,,,...,,,,,,,,,,


In [62]:
# Summary text for this node; it is passed as the first value of the
# add_histone_description INSERT (the summary column).
histone_description_summary = "cenH3.4_(Montium) is a centromere-specific histone variant encoded by the Cid4 gene, uniquely present in the montium subgroup of Drosophila. Unlike its paralogs, Cid4 shows broad somatic expression and has effectively replaced Cid1 as the dominant centromeric histone in mitotic cells [kursel_recurrent_2017]. The histone variant shows positive selection in its DNA-binding Loop 1 region and retains all four conserved N-terminal motifs found in Cid1, while acquiring a novel motif that may enable specialized interactions [kursel_recurrent_2017]. As part of the montium's three-gene system with Cid1 and Cid3, Cid4 specializes in maintaining mitotic centromere function through 15 million years of evolution."
# The INSERT takes 14 positional values; only the summary is filled in, the
# other 13 description fields are passed as None.
data_histone_description = (histone_description_summary,) + (None,) * 13
cursor.execute(add_histone_description, data_histone_description)

# Link the new description row to its histone node. Both values are bound as
# query parameters instead of being formatted into the SQL text.
histone_description_id = cursor.lastrowid
target_histone_id = "cenH3.4_(Montium)"
query = "UPDATE histone SET description=%s WHERE id=%s"
cursor.execute(query, (histone_description_id, target_histone_id))

# Fail loudly if no histone row was updated, rather than leaving an unlinked
# description row waiting to be committed.
if cursor.rowcount != 1:
    conn.rollback()
    raise RuntimeError(f"UPDATE did not match histone {target_histone_id!r}; transaction rolled back")

In [63]:
# Re-read the node joined to its description row to confirm the link.
query = " ".join(
    [
        "SELECT * FROM histone h",
        "LEFT JOIN histone_description hd ON h.description = hd.id",
        "WHERE h.id='cenH3.4_(Montium)'",
    ]
)
cursor.execute(query)
pd.DataFrame(
    data=cursor.fetchall(),
    columns=[name for name, *_ in cursor.description],
)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cenH3.4_(Montium),variant,montium subgroup,32352,243,cenH3_(Drosophilidae),243,cenH3.4_(Montium) is a centromere-specific his...,,,...,,,,,,,,,,


In [64]:
# Commit the open transaction so the description added above for
# cenH3.4_(Montium) is saved to the database.
conn.commit()

# <span style="color:black">Add a summary description to node cenH3.5_(Drosophila)</span>

```cenH3.5_(Drosophila) is a divergent centromeric histone paralog found exclusively in the Drosophila subgenus, originating from an ancient duplication of Cid1 [kursel_gametic_2021, kursel_recurrent_2017]. Unlike the ubiquitously expressed Cid1, Cid5 exhibits germline-restricted expression and has functionally specialized for male gametogenesis. It is absent in somatic cells but co-expressed with Cid1 in early germ cells of both sexes [kursel_gametic_2021].  Cid5 replaces Cid1 during male meiosis I, becoming the sole centromeric histone in sperm, but is lost during late oogenesis in females [kursel_gametic_2021]. After fertilization, paternally inherited Cid5 is rapidly replaced by maternal Cid1 in the embryo. The histone variant lacks conserved N-terminal motifs present in Cid1 and features a divergent histone-fold domain, potentially adapting it for sperm chromatin [kursel_gametic_2021]. Notably, Cid5 co-exists with CENP-C2 (male-biased CENP-C paralog) in the Drosophila subgenus, potentially forming a distinct kinetochore configuration during spermatogenesis [teixeira_concurrent_2018]. While Cid5 maintains key centromere localization motifs, its rapid evolution and testis-specific expression pattern indicate subfunctionalization or neofunctionalization relative to Cid1.```

In [65]:
# Show the node joined to its description row; the LEFT JOIN keeps the node
# even while no description is linked yet.
query = " ".join(
    [
        "SELECT * FROM histone h",
        "LEFT JOIN histone_description hd ON h.description = hd.id",
        "WHERE h.id='cenH3.5_(Drosophila)'",
    ]
)
cursor.execute(query)
df = pd.DataFrame(
    data=cursor.fetchall(),
    columns=[name for name, *_ in cursor.description],
)
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cenH3.5_(Drosophila),variant,Drosophila,32281,,cenH3_(Drosophilidae),,,,,...,,,,,,,,,,


In [66]:
# Summary text for this node; it is passed as the first value of the
# add_histone_description INSERT (the summary column).
histone_description_summary = "cenH3.5_(Drosophila) is a divergent centromeric histone paralog found exclusively in the Drosophila subgenus, originating from an ancient duplication of Cid1 [kursel_gametic_2021, kursel_recurrent_2017]. Unlike the ubiquitously expressed Cid1, Cid5 exhibits germline-restricted expression and has functionally specialized for male gametogenesis. It is absent in somatic cells but co-expressed with Cid1 in early germ cells of both sexes [kursel_gametic_2021].  Cid5 replaces Cid1 during male meiosis I, becoming the sole centromeric histone in sperm, but is lost during late oogenesis in females [kursel_gametic_2021]. After fertilization, paternally inherited Cid5 is rapidly replaced by maternal Cid1 in the embryo. The histone variant lacks conserved N-terminal motifs present in Cid1 and features a divergent histone-fold domain, potentially adapting it for sperm chromatin [kursel_gametic_2021]. Notably, Cid5 co-exists with CENP-C2 (male-biased CENP-C paralog) in the Drosophila subgenus, potentially forming a distinct kinetochore configuration during spermatogenesis [teixeira_concurrent_2018]. While Cid5 maintains key centromere localization motifs, its rapid evolution and testis-specific expression pattern indicate subfunctionalization or neofunctionalization relative to Cid1."
# The INSERT takes 14 positional values; only the summary is filled in, the
# other 13 description fields are passed as None.
data_histone_description = (histone_description_summary,) + (None,) * 13
cursor.execute(add_histone_description, data_histone_description)

# Link the new description row to its histone node. Both values are bound as
# query parameters instead of being formatted into the SQL text.
histone_description_id = cursor.lastrowid
target_histone_id = "cenH3.5_(Drosophila)"
query = "UPDATE histone SET description=%s WHERE id=%s"
cursor.execute(query, (histone_description_id, target_histone_id))

# Fail loudly if no histone row was updated, rather than leaving an unlinked
# description row waiting to be committed.
if cursor.rowcount != 1:
    conn.rollback()
    raise RuntimeError(f"UPDATE did not match histone {target_histone_id!r}; transaction rolled back")

In [67]:
# Re-read the node joined to its description row to confirm the link.
query = " ".join(
    [
        "SELECT * FROM histone h",
        "LEFT JOIN histone_description hd ON h.description = hd.id",
        "WHERE h.id='cenH3.5_(Drosophila)'",
    ]
)
cursor.execute(query)
pd.DataFrame(
    data=cursor.fetchall(),
    columns=[name for name, *_ in cursor.description],
)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cenH3.5_(Drosophila),variant,Drosophila,32281,244,cenH3_(Drosophilidae),244,cenH3.5_(Drosophila) is a divergent centromeri...,,,...,,,,,,,,,,


In [68]:
# Commit the open transaction so the description added above for
# cenH3.5_(Drosophila) is saved to the database.
conn.commit()

# <span style="color:black">Add a summary description to node cenH3.6_(Repleta)</span>

```cenH3.6_(Repleta)  is a recently evolved centromeric histone variant found specifically in Drosophila buzzatii and D. seriema (repleta group), resulting from an interchromosomal duplication of Cid1 that occurred 4.6-11.3 million years ago [teixeira_concurrent_2018]. This paralog shows ~80% amino acid identity with ancestral Cid1 from closely related species like D. mojavensis, compared to only ~40% identity with Cid5. Unlike the male germline-restricted Cid5, Cid6 displays constitutive expression across all developmental stages, suggesting it functionally replaced the degenerated Cid1 copy in these species [teixeira_concurrent_2018]. While Cid6 maintains core centromere functions, it shows no evidence of positive selection observed in Cid5, indicating preservation of ancestral Cid1 roles rather than neofunctionalization [teixeira_concurrent_2018].```

In [69]:
# Show the node joined to its description row; the LEFT JOIN keeps the node
# even while no description is linked yet.
query = " ".join(
    [
        "SELECT * FROM histone h",
        "LEFT JOIN histone_description hd ON h.description = hd.id",
        "WHERE h.id='cenH3.6_(Repleta)'",
    ]
)
cursor.execute(query)
df = pd.DataFrame(
    data=cursor.fetchall(),
    columns=[name for name, *_ in cursor.description],
)
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cenH3.6_(Repleta),variant,repleta group,32321,,cenH3_(Drosophilidae),,,,,...,,,,,,,,,,


In [70]:
# Summary text for this node; it is passed as the first value of the
# add_histone_description INSERT (the summary column).
histone_description_summary = "cenH3.6_(Repleta)  is a recently evolved centromeric histone variant found specifically in Drosophila buzzatii and D. seriema (repleta group), resulting from an interchromosomal duplication of Cid1 that occurred 4.6-11.3 million years ago [teixeira_concurrent_2018]. This paralog shows ~80% amino acid identity with ancestral Cid1 from closely related species like D. mojavensis, compared to only ~40% identity with Cid5. Unlike the male germline-restricted Cid5, Cid6 displays constitutive expression across all developmental stages, suggesting it functionally replaced the degenerated Cid1 copy in these species [teixeira_concurrent_2018]. While Cid6 maintains core centromere functions, it shows no evidence of positive selection observed in Cid5, indicating preservation of ancestral Cid1 roles rather than neofunctionalization [teixeira_concurrent_2018]."
# The INSERT takes 14 positional values; only the summary is filled in, the
# other 13 description fields are passed as None.
data_histone_description = (histone_description_summary,) + (None,) * 13
cursor.execute(add_histone_description, data_histone_description)

# Link the new description row to its histone node. Both values are bound as
# query parameters instead of being formatted into the SQL text.
histone_description_id = cursor.lastrowid
target_histone_id = "cenH3.6_(Repleta)"
query = "UPDATE histone SET description=%s WHERE id=%s"
cursor.execute(query, (histone_description_id, target_histone_id))

# Fail loudly if no histone row was updated, rather than leaving an unlinked
# description row waiting to be committed.
if cursor.rowcount != 1:
    conn.rollback()
    raise RuntimeError(f"UPDATE did not match histone {target_histone_id!r}; transaction rolled back")

In [71]:
# Re-read the node joined to its description row to confirm the link.
query = " ".join(
    [
        "SELECT * FROM histone h",
        "LEFT JOIN histone_description hd ON h.description = hd.id",
        "WHERE h.id='cenH3.6_(Repleta)'",
    ]
)
cursor.execute(query)
pd.DataFrame(
    data=cursor.fetchall(),
    columns=[name for name, *_ in cursor.description],
)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cenH3.6_(Repleta),variant,repleta group,32321,245,cenH3_(Drosophilidae),245,cenH3.6_(Repleta) is a recently evolved centr...,,,...,,,,,,,,,,


In [72]:
# Commit the open transaction so the description added above for
# cenH3.6_(Repleta) is saved to the database.
conn.commit()

# <span style="color:black">Add new nodes cenH3.1_(Culicidae), cenH3.2_(Culicidae), cenH3.3_(Aedes) to cenH3_(Culicidae)</span>

In [73]:
# Ids of the variant nodes added in this section, numbered cenH3.1 to cenH3.3
# and suffixed with the clade each one is named after.
histone_nodes = [
    f"cenH3.{number}_({clade})"
    for number, clade in enumerate(["Culicidae", "Culicidae", "Aedes"], start=1)
]

In [74]:
# One histone row per new variant node. Each entry is
# (node id, taxonomic_span, taxonomic_span_id); level, description and parent
# are the same for all of them.
for node_id, span_name, span_taxid in (
    ("cenH3.1_(Culicidae)", "Culicidae", "7157"),
    ("cenH3.2_(Culicidae)", "Culicidae", "7157"),
    ("cenH3.3_(Aedes)", "Aedes", "7158"),
):
    data_histone = {
        "id": node_id,
        "level": "variant",
        "taxonomic_span": span_name,
        "taxonomic_span_id": span_taxid,
        "description": None,
        "parent": "cenH3_(Culicidae)",
    }
    cursor.execute(add_histone, data_histone)

In [75]:
# Load the whole histone table, then show only the rows whose id is in histone_nodes.
query = "SELECT * FROM histone"
cursor.execute(query)
histone_df = pd.DataFrame(
    data=cursor.fetchall(),
    columns=[name for name, *_ in cursor.description],
)
is_new_node = histone_df["id"].isin(histone_nodes)
histone_df[is_new_node]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
5,cenH3.1_(Culicidae),variant,Culicidae,7157,,cenH3_(Culicidae)
9,cenH3.2_(Culicidae),variant,Culicidae,7157,,cenH3_(Culicidae)
13,cenH3.3_(Aedes),variant,Aedes,7158,,cenH3_(Culicidae)


In [76]:
# Commit the open transaction so the new variant nodes inserted above under
# cenH3_(Culicidae) are saved to the database.
conn.commit()

# <span style="color:black">Add a summary description to node cenH3.1_(Culicidae)</span>

```cenH3.1_(Culicidae) is a centromeric histone variant encoded by the mosqCid1 gene, found in most studied mosquito species (Anopheles, Aedes, Culex), with the exception of An. albimanus and An. darlingi where it has been lost [kursel_ancient_2020]. This gene emerged over 150 million years ago and has been retained in most species, acquiring specialized functions. Unlike its paralog mosqCid2, which shows high expression in ovaries and early embryos, mosqCid1 exhibits low expression levels in both somatic and germline tissues [kursel_ancient_2020]. Notably, mosqCid1 evolves under positive selection, suggesting its potential role in suppressing genetic conflicts such as centromere drive [kursel_ancient_2020]. Its N-terminal tail contains unique motifs absent in mosqCid2, likely reflecting differences in protein interactions.```

In [77]:
# Show the node joined to its description row; the LEFT JOIN keeps the node
# even while no description is linked yet.
query = " ".join(
    [
        "SELECT * FROM histone h",
        "LEFT JOIN histone_description hd ON h.description = hd.id",
        "WHERE h.id='cenH3.1_(Culicidae)'",
    ]
)
cursor.execute(query)
df = pd.DataFrame(
    data=cursor.fetchall(),
    columns=[name for name, *_ in cursor.description],
)
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cenH3.1_(Culicidae),variant,Culicidae,7157,,cenH3_(Culicidae),,,,,...,,,,,,,,,,


In [78]:
# Summary text for this node; it is passed as the first value of the
# add_histone_description INSERT (the summary column).
histone_description_summary = "cenH3.1_(Culicidae) is a centromeric histone variant encoded by the mosqCid1 gene, found in most studied mosquito species (Anopheles, Aedes, Culex), with the exception of An. albimanus and An. darlingi where it has been lost [kursel_ancient_2020]. This gene emerged over 150 million years ago and has been retained in most species, acquiring specialized functions. Unlike its paralog mosqCid2, which shows high expression in ovaries and early embryos, mosqCid1 exhibits low expression levels in both somatic and germline tissues [kursel_ancient_2020]. Notably, mosqCid1 evolves under positive selection, suggesting its potential role in suppressing genetic conflicts such as centromere drive [kursel_ancient_2020]. Its N-terminal tail contains unique motifs absent in mosqCid2, likely reflecting differences in protein interactions."
# The INSERT takes 14 positional values; only the summary is filled in, the
# other 13 description fields are passed as None.
data_histone_description = (histone_description_summary,) + (None,) * 13
cursor.execute(add_histone_description, data_histone_description)

# Link the new description row to its histone node. Both values are bound as
# query parameters instead of being formatted into the SQL text.
histone_description_id = cursor.lastrowid
target_histone_id = "cenH3.1_(Culicidae)"
query = "UPDATE histone SET description=%s WHERE id=%s"
cursor.execute(query, (histone_description_id, target_histone_id))

# Fail loudly if no histone row was updated, rather than leaving an unlinked
# description row waiting to be committed.
if cursor.rowcount != 1:
    conn.rollback()
    raise RuntimeError(f"UPDATE did not match histone {target_histone_id!r}; transaction rolled back")

In [79]:
# Re-read the node joined to its description row to confirm the link.
query = " ".join(
    [
        "SELECT * FROM histone h",
        "LEFT JOIN histone_description hd ON h.description = hd.id",
        "WHERE h.id='cenH3.1_(Culicidae)'",
    ]
)
cursor.execute(query)
pd.DataFrame(
    data=cursor.fetchall(),
    columns=[name for name, *_ in cursor.description],
)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cenH3.1_(Culicidae),variant,Culicidae,7157,246,cenH3_(Culicidae),246,cenH3.1_(Culicidae) is a centromeric histone v...,,,...,,,,,,,,,,


In [80]:
# Commit the open transaction so the description added above for
# cenH3.1_(Culicidae) is saved to the database.
conn.commit()

# <span style="color:black">Add a summary description to node cenH3.2_(Culicidae)</span>

```cenH3.2_(Culicidae) is the ancestral (emerged over 150 My) centromeric histone variant in mosquitoes, conserved across Anopheles, Aedes, and Culex species [kursel_ancient_2020]. Unlike its paralog mosqCid1, mosqCid2 specializes in germline processes, showing dramatic upregulation (6-10 fold) in ovaries after blood feeding and during early embryogenesis [kursel_ancient_2020]. It contains four unique N-terminal motifs absent in mosqCid1, reflecting functional divergence. Remarkably, mosqCid2 serves as the sole centromeric histone in An. albimanus and An. darlingi following mosqCid1 loss, demonstrating its fundamental role in chromosome segregation [kursel_ancient_2020].```

In [81]:
# Show the node joined to its description row; the LEFT JOIN keeps the node
# even while no description is linked yet.
query = " ".join(
    [
        "SELECT * FROM histone h",
        "LEFT JOIN histone_description hd ON h.description = hd.id",
        "WHERE h.id='cenH3.2_(Culicidae)'",
    ]
)
cursor.execute(query)
df = pd.DataFrame(
    data=cursor.fetchall(),
    columns=[name for name, *_ in cursor.description],
)
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cenH3.2_(Culicidae),variant,Culicidae,7157,,cenH3_(Culicidae),,,,,...,,,,,,,,,,


In [82]:
# Summary text for this node; it is passed as the first value of the
# add_histone_description INSERT (the summary column).
histone_description_summary = "cenH3.2_(Culicidae) is the ancestral (emerged over 150 My) centromeric histone variant in mosquitoes, conserved across Anopheles, Aedes, and Culex species [kursel_ancient_2020]. Unlike its paralog mosqCid1, mosqCid2 specializes in germline processes, showing dramatic upregulation (6-10 fold) in ovaries after blood feeding and during early embryogenesis [kursel_ancient_2020]. It contains four unique N-terminal motifs absent in mosqCid1, reflecting functional divergence. Remarkably, mosqCid2 serves as the sole centromeric histone in An. albimanus and An. darlingi following mosqCid1 loss, demonstrating its fundamental role in chromosome segregation [kursel_ancient_2020]."
# The INSERT takes 14 positional values; only the summary is filled in, the
# other 13 description fields are passed as None.
data_histone_description = (histone_description_summary,) + (None,) * 13
cursor.execute(add_histone_description, data_histone_description)

# Link the new description row to its histone node. Both values are bound as
# query parameters instead of being formatted into the SQL text.
histone_description_id = cursor.lastrowid
target_histone_id = "cenH3.2_(Culicidae)"
query = "UPDATE histone SET description=%s WHERE id=%s"
cursor.execute(query, (histone_description_id, target_histone_id))

# Fail loudly if no histone row was updated, rather than leaving an unlinked
# description row waiting to be committed.
if cursor.rowcount != 1:
    conn.rollback()
    raise RuntimeError(f"UPDATE did not match histone {target_histone_id!r}; transaction rolled back")

In [83]:
# Re-read the node joined to its description row to confirm the link.
query = " ".join(
    [
        "SELECT * FROM histone h",
        "LEFT JOIN histone_description hd ON h.description = hd.id",
        "WHERE h.id='cenH3.2_(Culicidae)'",
    ]
)
cursor.execute(query)
pd.DataFrame(
    data=cursor.fetchall(),
    columns=[name for name, *_ in cursor.description],
)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cenH3.2_(Culicidae),variant,Culicidae,7157,247,cenH3_(Culicidae),247,cenH3.2_(Culicidae) is the ancestral (emerged ...,,,...,,,,,,,,,,


In [84]:
# Commit the open transaction so the description added above for
# cenH3.2_(Culicidae) is saved to the database.
conn.commit()

# <span style="color:black">Add a summary description to node cenH3.3_(Aedes)</span>

```cenH3.3_(Aedes) is a recently evolved centromeric histone variant unique to Aedes mosquitoes, arising from a lineage-specific duplication of mosqCid1 [kursel_ancient_2020]. This paralog demonstrates specialized temporal expression during embryogenesis, suggesting a developmental-stage specific role in early development. Unlike the germline-focused mosqCid2 or the ubiquitously expressed mosqCid1, mosqCid3 shows no significant expression in adult tissues or ovaries, indicating its exclusive embryonic function [kursel_ancient_2020].```

In [85]:
# Show the current state of the cenH3.3_(Aedes) node together with its
# description row (LEFT JOIN keeps the node even when no description is linked).
query = " ".join(
    [
        "SELECT * FROM histone h LEFT JOIN histone_description hd",
        "ON h.description = hd.id",
        "WHERE h.id='cenH3.3_(Aedes)'",
    ]
)
cursor.execute(query)
rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(rows, columns=column_names)
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cenH3.3_(Aedes),variant,Aedes,7158,,cenH3_(Culicidae),,,,,...,,,,,,,,,,


In [86]:
# Insert a new histone_description row holding only the summary text and link
# it to the cenH3.3_(Aedes) node.
histone_description_summary = "cenH3.3_(Aedes) is a recently evolved centromeric histone variant unique to Aedes mosquitoes, arising from a lineage-specific duplication of mosqCid1 [kursel_ancient_2020]. This paralog demonstrates specialized temporal expression during embryogenesis, suggesting a developmental-stage specific role in early development. Unlike the germline-focused mosqCid2 or the ubiquitously expressed mosqCid1, mosqCid3 shows no significant expression in adult tissues or ovaries, indicating its exclusive embryonic function [kursel_ancient_2020]."
# add_histone_description takes 14 values: the summary followed by 13 NULLs
# for the remaining description columns.
data_histone_description = (histone_description_summary,) + (None,) * 13
cursor.execute(add_histone_description, data_histone_description)

# Auto-generated primary key of the description row that was just inserted.
histone_description_id = cursor.lastrowid
# Let the driver bind the values instead of formatting them into the SQL text;
# this matches the placeholder style of the other statements in this notebook
# and avoids quoting problems with ids containing special characters.
query = "UPDATE histone SET description=%s WHERE id=%s"
cursor.execute(query, (histone_description_id, "cenH3.3_(Aedes)"))

In [87]:
# Re-read the cenH3.3_(Aedes) node joined with its description row to confirm
# that the link created above is in place.
query = " ".join(
    [
        "SELECT * FROM histone h LEFT JOIN histone_description hd",
        "ON h.description = hd.id",
        "WHERE h.id='cenH3.3_(Aedes)'",
    ]
)
cursor.execute(query)
rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
pd.DataFrame(rows, columns=column_names)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cenH3.3_(Aedes),variant,Aedes,7158,248,cenH3_(Culicidae),248,cenH3.3_(Aedes) is a recently evolved centrome...,,,...,,,,,,,,,,


In [88]:
# Commit the pending INSERT/UPDATE statements executed above so that the new
# description and its link to the histone node persist in the database.
conn.commit()

# Close connections

In [89]:
# Release resources in reverse order of acquisition: cursor, then the MySQL
# connection, then the SSH tunnel. The nested try/finally guarantees that a
# failure in an earlier step does not leave the connection or tunnel open.
try:
    cursor.close()
finally:
    try:
        conn.close()
    finally:
        tunnel.stop()