In [1]:
import io

import pandas as pd
from Bio import Entrez, SeqIO
from mysql.connector import connection
from sshtunnel import SSHTunnelForwarder

# Укажите ваш email (обязательно для использования Entrez)
Entrez.email = "your.email@example.com"

In [2]:
with open("db_curated_server_info.txt", "r") as file:
    lines = file.readlines()

config = {}

for line in lines:
    line = line.strip()
    if line and not line.startswith("#"):
        key, value = line.split("=", 1)
        config[key] = value.strip()

server_name = config.get("server_name")
srever_port = int(config.get("srever_port"))
ssh_password = config.get("ssh_password")
ssh_username = config.get("ssh_username")
db_adress = config.get("db_adress")
db_port = int(config.get("db_port"))

In [3]:
tunnel = SSHTunnelForwarder(
    (server_name, srever_port),
    ssh_password=ssh_password,
    ssh_username=ssh_username,
    remote_bind_address=(db_adress, db_port),
)
tunnel.start()
print(tunnel.local_bind_port)

38519


In [4]:
conn = connection.MySQLConnection(
    user="db_user",
    password="db_password",
    host="localhost",
    port=tunnel.local_bind_port,
    database="db_name",
)
cursor = conn.cursor()

In [5]:
query = "SHOW TABLES;"
cursor.execute(query)
cursor.fetchall()

[('alternative_name',),
 ('histone',),
 ('histone_description',),
 ('histone_has_publication',),
 ('publication',),
 ('sequence',),
 ('sequence_has_publication',)]

In [13]:
add_histone = (
    "INSERT INTO histone "
    "(id, level, taxonomic_span, taxonomic_span_id, description, parent) "
    "VALUES (%(id)s, %(level)s, %(taxonomic_span)s, %(taxonomic_span_id)s, %(description)s, %(parent)s)"
)
add_histone_description = (
    "INSERT INTO histone_description "
    "(summary, taxonomy, genes, evolution, expression, knock_out, function, sequence, localization, deposition, structure, interactions, disease, caveats) "
    "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
)
# add_publication = (
#     "INSERT INTO publication "
#     "(id, title, doi, author, year) "
#     "VALUES (%(id)s, %(title)s, %(doi)s, %(author)s, %(year)s)"
# )
# add_sequence = (
#     "INSERT INTO sequence "
#     "(accession, variant, gi, ncbi_gene_id, hgnc_gene_name, taxonomy_id, organism, phylum, class, taxonomy_group, info, sequence, variant_under_consideration) "
#     "VALUES (%(accession)s, %(variant)s, %(gi)s, %(ncbi_gene_id)s, %(hgnc_gene_name)s, %(taxonomy_id)s, %(organism)s, %(phylum)s, %(class)s, %(taxonomy_group)s, %(info)s, %(sequence)s, %(variant_under_consideration)s)"
# )
# add_sequence_has_publication = (
#     "INSERT INTO sequence_has_publication "
#     "(sequence_accession, publication_id) "
#     "VALUES (%s, %s)"
# )
# add_alternate_names = (
#     "INSERT INTO alternative_name "
#     "(name, taxonomy, gene, splice, histone) "
#     "VALUES (%(name)s, %(taxonomy)s, %(gene)s, %(splice)s, %(histone)s)"
# )
# add_histone_has_publication = (
#     "INSERT INTO histone_has_publication "
#     "(histone_id, publication_id) "
#     "VALUES (%s, %s)"
# )

# To Do H3

## <span style="color:green">Add description to H3.B_(Giardia)</span>

### <span style="color:green">Add summary</span>

```H3.B_(Giardia) is a poorly studied replication-independent histone H3 variant identified in the genome of Giardia intestinalis, which, unlike canonical H3, exhibits a distinct subnuclear localization and is hypothesized to mark non-centromeric heterochromatin [dawson_cenh3_2007].```

### <span style="color:green">Add sequence</span>

```H3.B is characterized by an extended N-terminal tail (21 amino acids longer than canonical H3) and lacks the conserved lysine residue H3K9, which is a known site for post-translational modifications. In addition, the protein sequence of H3.B is rapidly evolving [dawson_cenh3_2007].```

### <span style="color:green">Add localization</span>

```H3.B localizes as numerous small foci on chromosomes throughout interphase and mitosis, and does not colocalize with centromeric markers (cenH3) or active transcriptional sites marked by H3K4 methylation [dawson_cenh3_2007].```

## <span style="color:green">Change name from to H3.P_(Moneuplotes) to H3.P_(Euplotes_crassus)</span>

## <span style="color:green">Add description to H3.P_(Euplotes_crassus)</span>

### <span style="color:green">Add summary</span>

```H3.P_(Euplotes_crassus) is a poorly studied histone H3 variant specifically expressed during sexual reproduction and macronuclear development in the ciliate Euplotes crassus [ghosh_development-specific_2000, jahn_unusual_1997].```

### <span style="color:green">Add sequence</span>

```The amino acid sequence of H3.P differs from the canonical histone H3 of Euplotes crassus by the presence of two additional short amino acid blocks in the N-terminal region, as well as numerous amino acid substitutions in the C-terminal histone-fold domain [ghosh_development-specific_2000, jahn_unusual_1997].```

### <span style="color:green">Add expression</span>

```H3.P is expressed exclusively during macronuclear development, starting at 20 hours after the initiation of conjugation, peaking during the polytene chromosome stage (20–45 hours), and subsequently declining during the vesicle stage. It is not detected in vegetative cells, the micronucleus, or the old degenerating macronucleus [ghosh_development-specific_2000, jahn_unusual_1997].```

# To Do cenH3

## <span style="color:green">Update description to cenH3</span>

### <span style="color:green">Update sequence</span>

```The cenH3 variant contains a distinctive CENP-A targeting domain (CATD), which encompasses extended L1-loop and α2‑helix within the histone‑fold domain, that mediates specific chaperone recognition and facilitates kinetochore protein assembly [wong_epigenetic_2020, fachinetti_two-step_2013]. The N- or C-terminal tails of CENP-A are essential for the long-term centromere integrity [fachinetti_two-step_2013]. cenH3s typically have only about 50-60% amino acid identity to canonical H3 in the histone fold domain and no conservation of the N-terminus that is also very different from other H3 variants. cenH3s ususally lack a conserved glutamine in the α1-helix of the histone fold. cenH3s have an extended L1-loop and usually replace Phe84 in canonical H3 with Trp, and Thr 107 with Ala, Cys, or Ser.```

### <span style="color:green">Update deposition</span>

```Unlike canonical histones, cenH3 deposition is replication-independent and mediated by specialized chaperones (HJURP in humans, Scm3 in budding and fission yeast, and CAL1 in Drosophila). The timing of cenH3 deposition is species-specific, occurring in telophase/G1 in humans, G2 phase in Arabidopsis, anaphase in Drosophila embryos, with budding yeast utilizing dual deposition windows in S-phase and anaphase [wong_epigenetic_2020, shivaraju_cell_2012, lermontova_loading_2006, schuh_incorporation_2007]. Notably, both budding yeast and nematodes (C. elegans) exhibit complete turnover of cenH3 each cell cycle [wong_epigenetic_2020]. The assembly and replenishment of cenH3 nucleosomes involves chromatin remodeling, resulting in the displacement of H3.3 histones and active stabilization by specific proteins (e.g., HJURP, CENP-C, CENP-B in humans) [dunleavy_h33_2011, wong_epigenetic_2020, black_epigenetic_2011]. It is possible that new cenH3 deposition doesn't always occur at the same centromeric position during every cell cycle [dunleavy_h33_2011]. Deposition of cenH3 into centromeric nucleosomes depends not only on a specific chaperone but also on numerous other factors. The study using DT40 (chicken) and HeLa S3 (human) cell lines demonstrated that chromatin-remodeling complexes FACT and CHD1 play important roles in the proper recruitment of cenH3 [okada_cenp-hcontaining_2009]. However, this observation is likely not universal across all organisms. For example, in Drosophila, cenH3 incorporation occurs independently of CHD1, yet still involves direct participation of FACT [podhraski_cenh3cid_2010, chen_establishment_2015, chen_cal1_2014].```

## <span style="color:green">Update description to cenH3_(Plants)</span>

### <span style="color:green">Add deposition</span>

```Deposition of CENH3 in Arabidopsis occurs mainly in G2 phase [lermontova_loading_2006].```

### <span style="color:green">Add sequence</span>

```In maize, the phosphorylation of serine 50 displays kinetics analogous to human Ser7, a known regulator of centromere function [wong_epigenetic_2020, zhang_phosphoserines_2005]. ```

## <span style="color:green">Update description to cenH3_(Animals)</span>

### <span style="color:green">Add deposition</span>

```In humans and most animals, cenH3 is expressed in G2 phase, and but loading occurs during late mitosis and G1 phase of the cell cycle, mediated by specialized chaperones (HJURP in humans and CAL1 in Drosophila).```

## <span style="color:green">Update description to cenH3_(Drosophilidae)</span>

### <span style="color:green">Update deposition</span>

```In Drosophila melanogaster, the deposition of cenH3 into nucleosomes is mediated by the chaperone CAL1 in a DNA sequence-independent manner. CAL1 facilitates nucleosome formation through direct interaction with cenH3-H4 histones and additionally recruits the FACT complex (comprising Dre4 and SSRP1 subunits) along with RNA polymerase II (RNAPII) [chen_establishment_2015, chen_cal1_2014]. FACT destabilizes H3-containing nucleosomes, enabling RNAPII-mediated transcription of DNA, a prerequisite for the replacement of H3 with CENP-A/H4 [chen_establishment_2015]. In the absence of FACT, transcription is abolished, preventing CENP-A incorporation. Importantly, CAL1 mediates the assembly of octameric nucleosomes with left-handed DNA wrapping, analogous to canonical nucleosomes [chen_cal1_2014]. In Drosophila embryos, cenH3 loading occurs during anaphase [schuh_incorporation_2007].```

## <span style="color:green">Update description to cenH3_(Homo_sapiens)</span>

### <span style="color:green">Update sequence</span>

```Post-translational modifications of specific CENP-A residues regulate centromere function. In humans, CENP-A is subject to serine phosphorylation at positions 7, 16, 18, and 68, glycine methylation at position 1, as well as lysine 124 ubiquitination and acetylation [wong_epigenetic_2020, eot-houllier_aurora_2018, gattat_phosphorylation_2013]. Phosphorylation of Ser7 regulates the localization of kinetochore proteins, including the association with CENP-C, and ensures proper mitotic progression. Phosphorylation of Ser68 prevents premature CENP-A loading. During the cell cycle, the conserved residue K124 locaded in histone fold undergoes cyclic modifications [bui_internal_2017]. In G1/S phase, K124ac compacts the nucleosome, interfering with CENP-C binding (crucial for kinetochore assembly) while simultaneously promoting chromatin decondensation for centromere replication. During S phase, K124me stabilizes nucleosomes. Furthermore, K124 mutations disrupt both mitosis and replication, demonstrating its critical role in centromere epigenetic regulation [bui_internal_2017].```

### <span style="color:green">Update interactions</span>

```CENP-A nucleosomes stabilize the inner kinetochore known as the CCAN complex (Constitutive Centromere-Associated Network) through direct interaction with CENP-C and CENP-N proteins of this complex [cao_constitutive_2018, pesenti_structure_2022, sridhar_kinetochore_2022, xu_gross_2023]. The CENP-A targeting domain (CATD) along with its N- and C-terminal tails play crucial roles in this process [logsdon_both_2015]. Human CENP-C has been demonstrated to bind with the CENP-A C-terminal hydrophobic tail, the acidic patch of H2A-H2B and histone H4, thereby facilitating its engagement with all four histone subunits present on the nucleosome surface [allu_structure_2019]. The binding of CENP-C to CENP-A is critical for long-term viability in human RPE-1 cells [watanabe_cdk1-mediated_2019]. This interaction is stabilized by the process of CENP-C phosphorylation mediated by CDK1, which strengthens the complex's structure through the occurrence of an intramolecular interaction [watanabe_cdk1-mediated_2019, walstein_assembly_2021, ariyoshi_cryoem_2021]. Recruitment of CENP-C also depends on the N-terminal tail of CENP-A, albeit through an indirect mechanism [logsdon_both_2015]. The direct binding of human CENP-N to the CENP-A nucleosome is mediated by its recognition of the CATD, which encompasses the unique L1 and RG loops [chittori_structural_2018, tian_molecular_2018, ariyoshi_cryoem_2021]. However, cryo-EM studies have revealed that CENP-C and CENP-N bind to a single CENP-A nucleosome in a non-simultaneous manner, thereby demonstrating an asymmetric structure where CENP-C and CENP-N bind to opposite sides of the nucleosome [ariyoshi_cryoem_2021]. Furthermore, in 94% of cases, phosphorylated CENP-C exhibits exclusive binding to the RG-loop of CENP-A, suggesting that this particular interaction is predominant [ariyoshi_cryoem_2021]. In addition, in vitro reconstruction of the human CENP-C protein showed that CENP-C can bind to two centromeric nucleosomes simultaneously [walstein_assembly_2021]. Nevertheless, interactions with CENP-C or CENP-N do not determine the stability of CENP-A nucleosomes in chromatin [cao_constitutive_2018].```

# To Do cenH3

## <span style="color:green">Add description to cenH3_(Fungi)</span>

### <span style="color:green">Add summary</span>

```cenH3_(Fungi) is a centromere-specific histone variant in fungi, often called Cse4 budding yeast and Cnp1 fission yeast, and an important component of active centromere required for chromosome segregation [stoler_mutation_1995].```

### <span style="color:green">Add localization</span>

```In the budding yeast cenH3 is strictly localized to point centromeres, which are specified by a short (~125 bp) specific DNA sequence containing centromeric DNA elements (CDEs) [hara_critical_2017, steiner_diversity_2015]. In the fission yeast cenH3 localizes to regional centromeres that span several kilobases and contain repetitive DNA sequences [hara_critical_2017, steiner_diversity_2015].```

### <span style="color:green">Add function</span>

```Like in other eukaryotes, in fungi the active centromeric chromatin, enriched with cenH3, defines the region of kinetochore interaction for spindle formation [hara_critical_2017, stoler_mutation_1995].```

### <span style="color:green">Add sequence</span>

```Like in other eukaryotes, in fungi the cenH3 variant contains a distinctive CENP-A targeting domain (CATD) that mediates specific chaperone recognition and facilitates kinetochore protein assembly [wong_epigenetic_2020]. Cse4 (cenH3 of the budding yeast) possesses a C-terminal histone-fold domain, which is over 60% identical to that of histone H3, and a unique 135-amino acid N-terminal domain that protrudes from the nucleosome core and interacts with kinetochore proteins essential for its assembly [samel_methylation_2012]. Additionally, arginine 37 methylation plays important role in recruitment of inner and linker kinetochore proteins [wong_epigenetic_2020]. The loss of Arg37me has been shown to impair accurate chromosome segregation [samel_methylation_2012].```

### <span style="color:green">Add deposition</span>

```In fungi cenH3 loading is mediated by Scm3, the HJURP homologue [wong_epigenetic_2020, zasadzinska_dimerization_2013, shivaraju_scm3_2011]. The deposition occurs in two distinct windows: during S-phase and anaphase in budding yeast, compared to S and G2 phases in fission yeast [shivaraju_cell_2012, takayama_biphasic_2008]. Notably, budding yeast exhibit complete turnover of cenH3 each cell cycle [wong_epigenetic_2020].```

### <span style="color:green">Add structure</span>

```In yeast, cenH3-containing nucleosomes exhibit an octameric structure [dechassa_structure_2011, shivaraju_scm3_2011].```

# Done

## <span style="color:black">Add description to H3.B_(Giardia)</span>

### <span style="color:black">Add summary</span>

```H3.B_(Giardia) is a poorly studied replication-independent histone H3 variant identified in the genome of Giardia intestinalis, which, unlike canonical H3, exhibits a distinct subnuclear localization and is hypothesized to mark non-centromeric heterochromatin [dawson_cenh3_2007].```

### <span style="color:black">Add sequence</span>

```H3.B is characterized by an extended N-terminal tail (21 amino acids longer than canonical H3) and lacks the conserved lysine residue H3K9, which is a known site for post-translational modifications. In addition, the protein sequence of H3.B is rapidly evolving [dawson_cenh3_2007].```

### <span style="color:black">Add localization</span>

```H3.B localizes as numerous small foci on chromosomes throughout interphase and mitosis, and does not colocalize with centromeric markers (cenH3) or active transcriptional sites marked by H3K4 methylation [dawson_cenh3_2007].```

In [8]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    "WHERE h.id='H3.B_(Giardia?)'"
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,H3.B_(Giardia?),variant_group,Giardia,5740,56,H3,56,,,,...,,,,,,,,,,


In [10]:
query = "DELETE FROM histone WHERE id='H3.B_(Giardia?)'"
print(query)
cursor.execute(query)

DELETE FROM histone WHERE id='H3.B_(Giardia?)'


In [11]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    "WHERE h.id='H3.B_(Giardia?)'"
)
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations


In [20]:
data_histone_description = {
    "summary": "H3.B_(Giardia) is a poorly studied replication-independent histone H3 variant identified in the genome of Giardia intestinalis, which, unlike canonical H3, exhibits a distinct subnuclear localization and is hypothesized to mark non-centromeric heterochromatin [dawson_cenh3_2007].",
    "taxonomy": None,
    "genes": None,
    "evolution": None,
    "expression": None,
    "knock_out": None,
    "function": None,
    "sequence": "H3.B is characterized by an extended N-terminal tail (21 amino acids longer than canonical H3) and lacks the conserved lysine residue H3K9, which is a known site for post-translational modifications. In addition, the protein sequence of H3.B is rapidly evolving [dawson_cenh3_2007].",
    "localization": "H3.B localizes as numerous small foci on chromosomes throughout interphase and mitosis, and does not colocalize with centromeric markers (cenH3) or active transcriptional sites marked by H3K4 methylation [dawson_cenh3_2007].",
    "deposition": None,
    "structure": None,
    "interactions": None,
    "disease": None,
    "caveats": None,
}
cursor.execute(add_histone_description, tuple(data_histone_description.values()))
histone_description_id = cursor.lastrowid
print(histone_description_id)

254


In [21]:
data_histone = {
    "id": "H3.B_(Giardia)",
    "level": "variant",
    "taxonomic_span": "Giardia",
    "taxonomic_span_id": "5740",
    "description": histone_description_id,
    "parent": "H3",
}
cursor.execute(add_histone, data_histone)

In [22]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    "WHERE h.id='H3.B_(Giardia)'"
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,H3.B_(Giardia),variant,Giardia,5740,254,H3,254,H3.B_(Giardia) is a poorly studied replication...,,,...,,,H3.B is characterized by an extended N-termina...,H3.B localizes as numerous small foci on chrom...,,,,,,


In [23]:
# Make sure data is committed to the database
conn.commit()

In [28]:
query = f"UPDATE histone SET level='variant_group' WHERE id='H3.B_(Giardia)'"
print(query)
cursor.execute(query)

UPDATE histone SET level='variant_group' WHERE id='H3.B_(Giardia)'


In [29]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    "WHERE h.id='H3.B_(Giardia)'"
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,H3.B_(Giardia),variant_group,Giardia,5740,254,H3,254,H3.B_(Giardia) is a poorly studied replication...,,,...,,,H3.B is characterized by an extended N-termina...,H3.B localizes as numerous small foci on chrom...,,,,,,


In [30]:
# Make sure data is committed to the database
conn.commit()

## <span style="color:black">Change name from to H3.P_(Moneuplotes) to H3.P_(Euplotes_crassus)</span>

## <span style="color:black">Add description to H3.P_(Euplotes_crassus)</span>

### <span style="color:black">Add summary</span>

```H3.P_(Euplotes_crassus) is a poorly studied histone H3 variant specifically expressed during sexual reproduction and macronuclear development in the ciliate Euplotes crassus [ghosh_development-specific_2000, jahn_unusual_1997].```

### <span style="color:black">Add sequence</span>

```The amino acid sequence of H3.P differs from the canonical histone H3 of Euplotes crassus by the presence of two additional short amino acid blocks in the N-terminal region, as well as numerous amino acid substitutions in the C-terminal histone-fold domain [ghosh_development-specific_2000, jahn_unusual_1997].```

### <span style="color:black">Add expression</span>

```H3.P is expressed exclusively during macronuclear development, starting at 20 hours after the initiation of conjugation, peaking during the polytene chromosome stage (20–45 hours), and subsequently declining during the vesicle stage. It is not detected in vegetative cells, the micronucleus, or the old degenerating macronucleus [ghosh_development-specific_2000, jahn_unusual_1997].```

In [31]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    "WHERE h.id='H3.P_(Moneuplotes?)'"
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,H3.P_(Moneuplotes?),variant_group,Moneuplotes,152459,57,H3,57,,,,...,,,,,,,,,,


In [32]:
df.to_dict(orient="records")

[{'id': 57,
  'level': 'variant_group',
  'taxonomic_span': 'Moneuplotes',
  'taxonomic_span_id': '152459',
  'description': 57,
  'parent': 'H3',
  'summary': 'null',
  'taxonomy': 'null',
  'genes': 'null',
  'evolution': 'null',
  'expression': 'null',
  'knock_out': 'null',
  'function': 'null',
  'sequence': 'null',
  'localization': 'null',
  'deposition': 'null',
  'structure': 'null',
  'interactions': 'null',
  'disease': 'null',
  'caveats': 'null',
  'relations': None}]

In [33]:
query = "DELETE FROM histone WHERE id='H3.P_(Moneuplotes?)'"
print(query)
cursor.execute(query)

DELETE FROM histone WHERE id='H3.P_(Moneuplotes?)'


In [34]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    "WHERE h.id='H3.P_(Moneuplotes?)'"
)
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations


In [35]:
data_histone_description = {
    "summary": "H3.P_(Euplotes_crassus) is a poorly studied histone H3 variant specifically expressed during sexual reproduction and macronuclear development in the ciliate Euplotes crassus [ghosh_development-specific_2000, jahn_unusual_1997].",
    "taxonomy": None,
    "genes": None,
    "evolution": None,
    "expression": "H3.P is expressed exclusively during macronuclear development, starting at 20 hours after the initiation of conjugation, peaking during the polytene chromosome stage (20–45 hours), and subsequently declining during the vesicle stage. It is not detected in vegetative cells, the micronucleus, or the old degenerating macronucleus [ghosh_development-specific_2000, jahn_unusual_1997].",
    "knock_out": None,
    "function": None,
    "sequence": "The amino acid sequence of H3.P differs from the canonical histone H3 of Euplotes crassus by the presence of two additional short amino acid blocks in the N-terminal region, as well as numerous amino acid substitutions in the C-terminal histone-fold domain [ghosh_development-specific_2000, jahn_unusual_1997].",
    "localization": None,
    "deposition": None,
    "structure": None,
    "interactions": None,
    "disease": None,
    "caveats": None,
}
cursor.execute(add_histone_description, tuple(data_histone_description.values()))
histone_description_id = cursor.lastrowid
print(histone_description_id)

255


In [36]:
data_histone = {
    "id": "H3.P_(Euplotes_crassus)",
    "level": "variant_group",
    "taxonomic_span": "Euplotes crassus",
    "taxonomic_span_id": "5936",
    "description": histone_description_id,
    "parent": "H3",
}
cursor.execute(add_histone, data_histone)

In [37]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    "WHERE h.id='H3.P_(Euplotes_crassus)'"
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,H3.P_(Euplotes_crassus),variant_group,Euplotes crassus,5936,255,H3,255,H3.P_(Euplotes_crassus) is a poorly studied hi...,,,...,,,The amino acid sequence of H3.P differs from t...,,,,,,,


In [38]:
df.to_dict(orient="records")

[{'id': 255,
  'level': 'variant_group',
  'taxonomic_span': 'Euplotes crassus',
  'taxonomic_span_id': '5936',
  'description': 255,
  'parent': 'H3',
  'summary': 'H3.P_(Euplotes_crassus) is a poorly studied histone H3 variant specifically expressed during sexual reproduction and macronuclear development in the ciliate Euplotes crassus [ghosh_development-specific_2000, jahn_unusual_1997].',
  'taxonomy': None,
  'genes': None,
  'evolution': None,
  'expression': 'H3.P is expressed exclusively during macronuclear development, starting at 20 hours after the initiation of conjugation, peaking during the polytene chromosome stage (20–45 hours), and subsequently declining during the vesicle stage. It is not detected in vegetative cells, the micronucleus, or the old degenerating macronucleus [ghosh_development-specific_2000, jahn_unusual_1997].',
  'knock_out': None,
  'function': None,
  'sequence': 'The amino acid sequence of H3.P differs from the canonical histone H3 of Euplotes crassu

In [39]:
# Make sure data is committed to the database
conn.commit()

# Done cenH3

## <span style="color:black">Update description to cenH3</span>

### <span style="color:black">Update sequence</span>

```The cenH3 variant contains a distinctive CENP-A targeting domain (CATD), which encompasses extended L1-loop and α2‑helix within the histone‑fold domain, that mediates specific chaperone recognition and facilitates kinetochore protein assembly [wong_epigenetic_2020, fachinetti_two-step_2013]. The N- or C-terminal tails of CENP-A are essential for the long-term centromere integrity [fachinetti_two-step_2013]. cenH3s typically have only about 50-60% amino acid identity to canonical H3 in the histone fold domain and no conservation of the N-terminus that is also very different from other H3 variants. cenH3s ususally lack a conserved glutamine in the α1-helix of the histone fold. cenH3s have an extended L1-loop and usually replace Phe84 in canonical H3 with Trp, and Thr 107 with Ala, Cys, or Ser.```

### <span style="color:black">Update deposition</span>

```Unlike canonical histones, cenH3 deposition is replication-independent and mediated by specialized chaperones (HJURP in humans, Scm3 in budding and fission yeast, and CAL1 in Drosophila). The timing of cenH3 deposition is species-specific, occurring in telophase/G1 in humans, G2 phase in Arabidopsis, anaphase in Drosophila embryos, with budding yeast utilizing dual deposition windows in S-phase and anaphase [wong_epigenetic_2020, shivaraju_cell_2012, lermontova_loading_2006, schuh_incorporation_2007]. Notably, both budding yeast and nematodes (C. elegans) exhibit complete turnover of cenH3 each cell cycle [wong_epigenetic_2020]. The assembly and replenishment of cenH3 nucleosomes involves chromatin remodeling, resulting in the displacement of H3.3 histones and active stabilization by specific proteins (e.g., HJURP, CENP-C, CENP-B in humans) [dunleavy_h33_2011, wong_epigenetic_2020, black_epigenetic_2011]. It is possible that new cenH3 deposition doesn't always occur at the same centromeric position during every cell cycle [dunleavy_h33_2011]. Deposition of cenH3 into centromeric nucleosomes depends not only on a specific chaperone but also on numerous other factors. The study using DT40 (chicken) and HeLa S3 (human) cell lines demonstrated that chromatin-remodeling complexes FACT and CHD1 play important roles in the proper recruitment of cenH3 [okada_cenp-hcontaining_2009]. However, this observation is likely not universal across all organisms. For example, in Drosophila, cenH3 incorporation occurs independently of CHD1, yet still involves direct participation of FACT [podhraski_cenh3cid_2010, chen_establishment_2015, chen_cal1_2014].```

In [40]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    "WHERE h.id='cenH3'"
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cenH3,variant_group,Eukaryotes,2759,96,H3,96,cenH3 is a centromere-specific histone variant...,"cenH3 present in most eukaryotes. However, it ...",,...,,"Despite the high diversity of cenH3 proteins, ...",cenH3 has an extended L1-loop and its N-termin...,,"Unlike canonical histones, cenH3 deposition is...",,,CenH3 plays pro-viral and restriction role in ...,,


In [41]:
df.to_dict(orient="records")

[{'id': 96,
  'level': 'variant_group',
  'taxonomic_span': 'Eukaryotes',
  'taxonomic_span_id': '2759',
  'description': 96,
  'parent': 'H3',
  'summary': 'cenH3 is a centromere-specific histone variant, which replaces canonical H3 in centromeric nucleosomes. It is required for kinetochore formation, mitotic progression and chromosome segregation.',
  'taxonomy': 'cenH3 present in most eukaryotes. However, it has been lost in trypanosomes, the fungus Mucor, in four clades of holocentric insects and kinetoplastids [talbert_histone_2021, senaratne_formation_2021, sridhar_kinetochore_2022].',
  'genes': 'null',
  'evolution': 'The co-occurrence of CenH3 loss and holocentricity in insects suggests that the evolutionary development of holocentromeres in this group may have been facilitated by the loss of a CenH3-defined centromere [senaratne_formation_2021]. It has been demonstrated that CenH3 has undergone adaptive evolution at a significantly higher frequency in clades with asymmetric m

In [44]:
sequence_desc = "The cenH3 variant contains a distinctive CENP-A targeting domain (CATD), which encompasses extended L1-loop and α2‑helix within the histone‑fold domain, that mediates specific chaperone recognition and facilitates kinetochore protein assembly [wong_epigenetic_2020, fachinetti_two-step_2013]. The N- or C-terminal tails of CENP-A are essential for the long-term centromere integrity [fachinetti_two-step_2013]. cenH3s typically have only about 50-60% amino acid identity to canonical H3 in the histone fold domain and no conservation of the N-terminus that is also very different from other H3 variants. cenH3s ususally lack a conserved glutamine in the α1-helix of the histone fold. cenH3s have an extended L1-loop and usually replace Phe84 in canonical H3 with Trp, and Thr 107 with Ala, Cys, or Ser."
deposition_desc = "Unlike canonical histones, cenH3 deposition is replication-independent and mediated by specialized chaperones (HJURP in humans, Scm3 in budding and fission yeast, and CAL1 in Drosophila). The timing of cenH3 deposition is species-specific, occurring in telophase/G1 in humans, G2 phase in Arabidopsis, anaphase in Drosophila embryos, with budding yeast utilizing dual deposition windows in S-phase and anaphase [wong_epigenetic_2020, shivaraju_cell_2012, lermontova_loading_2006, schuh_incorporation_2007]. Notably, both budding yeast and nematodes (C. elegans) exhibit complete turnover of cenH3 each cell cycle [wong_epigenetic_2020]. The assembly and replenishment of cenH3 nucleosomes involves chromatin remodeling, resulting in the displacement of H3.3 histones and active stabilization by specific proteins (e.g., HJURP, CENP-C, CENP-B in humans) [dunleavy_h33_2011, wong_epigenetic_2020, black_epigenetic_2011]. It is possible that new cenH3 deposition doesn't always occur at the same centromeric position during every cell cycle [dunleavy_h33_2011]. Deposition of cenH3 into centromeric nucleosomes depends not only on a specific chaperone but also on numerous other factors. The study using DT40 (chicken) and HeLa S3 (human) cell lines demonstrated that chromatin-remodeling complexes FACT and CHD1 play important roles in the proper recruitment of cenH3 [okada_cenp-hcontaining_2009]. However, this observation is likely not universal across all organisms. For example, in Drosophila, cenH3 incorporation occurs independently of CHD1, yet still involves direct participation of FACT [podhraski_cenh3cid_2010, chen_establishment_2015, chen_cal1_2014]."
query = f'UPDATE histone_description SET sequence="{sequence_desc}", deposition="{deposition_desc}" WHERE id=96'
print(query)
cursor.execute(query)

UPDATE histone_description SET sequence="The cenH3 variant contains a distinctive CENP-A targeting domain (CATD), which encompasses extended L1-loop and α2‑helix within the histone‑fold domain, that mediates specific chaperone recognition and facilitates kinetochore protein assembly [wong_epigenetic_2020, fachinetti_two-step_2013]. The N- or C-terminal tails of CENP-A are essential for the long-term centromere integrity [fachinetti_two-step_2013]. cenH3s typically have only about 50-60% amino acid identity to canonical H3 in the histone fold domain and no conservation of the N-terminus that is also very different from other H3 variants. cenH3s ususally lack a conserved glutamine in the α1-helix of the histone fold. cenH3s have an extended L1-loop and usually replace Phe84 in canonical H3 with Trp, and Thr 107 with Ala, Cys, or Ser.", deposition="Unlike canonical histones, cenH3 deposition is replication-independent and mediated by specialized chaperones (HJURP in humans, Scm3 in buddin

In [45]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    "WHERE h.id='cenH3'"
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cenH3,variant_group,Eukaryotes,2759,96,H3,96,cenH3 is a centromere-specific histone variant...,"cenH3 present in most eukaryotes. However, it ...",,...,,"Despite the high diversity of cenH3 proteins, ...",The cenH3 variant contains a distinctive CENP-...,,"Unlike canonical histones, cenH3 deposition is...",,,CenH3 plays pro-viral and restriction role in ...,,


In [46]:
df.to_dict(orient="records")

[{'id': 96,
  'level': 'variant_group',
  'taxonomic_span': 'Eukaryotes',
  'taxonomic_span_id': '2759',
  'description': 96,
  'parent': 'H3',
  'summary': 'cenH3 is a centromere-specific histone variant, which replaces canonical H3 in centromeric nucleosomes. It is required for kinetochore formation, mitotic progression and chromosome segregation.',
  'taxonomy': 'cenH3 present in most eukaryotes. However, it has been lost in trypanosomes, the fungus Mucor, in four clades of holocentric insects and kinetoplastids [talbert_histone_2021, senaratne_formation_2021, sridhar_kinetochore_2022].',
  'genes': 'null',
  'evolution': 'The co-occurrence of CenH3 loss and holocentricity in insects suggests that the evolutionary development of holocentromeres in this group may have been facilitated by the loss of a CenH3-defined centromere [senaratne_formation_2021]. It has been demonstrated that CenH3 has undergone adaptive evolution at a significantly higher frequency in clades with asymmetric m

In [47]:
# Make sure data is committed to the database
conn.commit()

## <span style="color:black">Update description to cenH3_(Plants)</span>

### <span style="color:black">Add deposition</span>

```Deposition of CENH3 in Arabidopsis occurs mainly in G2 phase [lermontova_loading_2006].```

### <span style="color:black">Add sequence</span>

```In maize, the phosphorylation of serine 50 displays kinetics analogous to human Ser7, a known regulator of centromere function [wong_epigenetic_2020, zhang_phosphoserines_2005]. ```

In [48]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    "WHERE h.id='cenH3_(Plants)'"
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cenH3_(Plants),variant,Eukaryotes,2759,233,cenH3,233,cenH3_(Plants) is a centromere-specific histon...,,While most diploid eukaryotes and flowering pl...,...,,,,,,The L92F mutation in barley βCENH3 (and its or...,,,,


In [49]:
df.to_dict(orient="records")

[{'id': 233,
  'level': 'variant',
  'taxonomic_span': 'Eukaryotes',
  'taxonomic_span_id': '2759',
  'description': 233,
  'parent': 'cenH3',
  'summary': 'cenH3_(Plants) is a centromere-specific histone variant in Plants. This is a subclass of cenH3 (see it for a detailed description).',
  'taxonomy': None,
  'genes': 'While most diploid eukaryotes and flowering plants maintain only one CENH3 gene copy (even after whole-genome duplications), some diploid species like Arabidopsis lyrata, barley, rye, pea, and related legumes retain two functional CENH3 homologs [ishii_unequal_2020]. It was demonstrated that the duplication of HTR12 in A. lyrata may have facilitated adaptation to multiple centromeric satellite sequences, though the underlying mechanisms of this process require further investigation [kawabe_duplication_2006]. Pisum and closely related Lathyrus species also retain two CenH3 paralogs (CenH3-1 and CenH3-2) originating from a Fabeae ancestor [neumann_centromeres_2015, neuma

In [51]:
sequence_desc = "Deposition of CENH3 in Arabidopsis occurs mainly in G2 phase [lermontova_loading_2006]."
deposition_desc = "In maize, the phosphorylation of serine 50 displays kinetics analogous to human Ser7, a known regulator of centromere function [wong_epigenetic_2020, zhang_phosphoserines_2005]."
query = f'UPDATE histone_description SET sequence="{sequence_desc}", deposition="{deposition_desc}" WHERE id=233'
print(query)
cursor.execute(query)

UPDATE histone_description SET sequence="Deposition of CENH3 in Arabidopsis occurs mainly in G2 phase [lermontova_loading_2006].", deposition="In maize, the phosphorylation of serine 50 displays kinetics analogous to human Ser7, a known regulator of centromere function [wong_epigenetic_2020, zhang_phosphoserines_2005]." WHERE id=233


In [52]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    "WHERE h.id='cenH3_(Plants)'"
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cenH3_(Plants),variant,Eukaryotes,2759,233,cenH3,233,cenH3_(Plants) is a centromere-specific histon...,,While most diploid eukaryotes and flowering pl...,...,,,Deposition of CENH3 in Arabidopsis occurs main...,,"In maize, the phosphorylation of serine 50 dis...",The L92F mutation in barley βCENH3 (and its or...,,,,


In [53]:
df.to_dict(orient="records")

[{'id': 233,
  'level': 'variant',
  'taxonomic_span': 'Eukaryotes',
  'taxonomic_span_id': '2759',
  'description': 233,
  'parent': 'cenH3',
  'summary': 'cenH3_(Plants) is a centromere-specific histone variant in Plants. This is a subclass of cenH3 (see it for a detailed description).',
  'taxonomy': None,
  'genes': 'While most diploid eukaryotes and flowering plants maintain only one CENH3 gene copy (even after whole-genome duplications), some diploid species like Arabidopsis lyrata, barley, rye, pea, and related legumes retain two functional CENH3 homologs [ishii_unequal_2020]. It was demonstrated that the duplication of HTR12 in A. lyrata may have facilitated adaptation to multiple centromeric satellite sequences, though the underlying mechanisms of this process require further investigation [kawabe_duplication_2006]. Pisum and closely related Lathyrus species also retain two CenH3 paralogs (CenH3-1 and CenH3-2) originating from a Fabeae ancestor [neumann_centromeres_2015, neuma

In [54]:
# Make sure data is committed to the database
conn.commit()

## <span style="color:black">Update description to cenH3_(Animals)</span>

### <span style="color:black">Add deposition</span>

```In humans and most animals, cenH3 is expressed in G2 phase, and but loading occurs during late mitosis and G1 phase of the cell cycle, mediated by specialized chaperones (HJURP in humans and CAL1 in Drosophila).```

In [55]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    "WHERE h.id='cenH3_(Animals)'"
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cenH3_(Animals),variant,Homo sapiens,9606,94,cenH3,94,cenH3_(Animals) is a centromere-specific histo...,,The nematodes Caenorhabditis elegans and C. re...,...,,"The active centromeric chromatin, enriched wit...",,,,,cenH3 nucleosomes stabilize the inner kinetoch...,,,


In [56]:
df.to_dict(orient="records")

[{'id': 94,
  'level': 'variant',
  'taxonomic_span': 'Homo sapiens',
  'taxonomic_span_id': '9606',
  'description': 94,
  'parent': 'cenH3',
  'summary': 'cenH3_(Animals) is a centromere-specific histone variant in animals (Metazoa), often called CENP-A (Centromere Protein A) in mammals, and an important component of active centromere required for chromosome segregation.',
  'taxonomy': 'null',
  'genes': 'The nematodes Caenorhabditis elegans and C. remanei possess two genes encoding cenH3: HCP-3 (also known as CeCENP-A) and CPAR-1 [monen_separase_2015, monen_differential_2005]. While HCP-3 functions as the predominant centromeric histone - being highly expressed and essential for precise chromosome segregation during mitotic divisions - CPAR-1 displays distinct characteristics: it exhibits lower expression levels but shows specific enrichment on meiotic chromosomes. Importantly, CPAR-1 undergoes separase-mediated proteolytic cleavage at the meiosis I metaphase-to-anaphase transition

In [58]:
deposition_desc = "In humans and most animals, cenH3 is expressed in G2 phase, and but loading occurs during late mitosis and G1 phase of the cell cycle, mediated by specialized chaperones (HJURP in humans and CAL1 in Drosophila)."
query = f'UPDATE histone_description SET deposition="{deposition_desc}" WHERE id=94'
print(query)
cursor.execute(query)

UPDATE histone_description SET deposition="In humans and most animals, cenH3 is expressed in G2 phase, and but loading occurs during late mitosis and G1 phase of the cell cycle, mediated by specialized chaperones (HJURP in humans and CAL1 in Drosophila)." WHERE id=94


In [59]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    "WHERE h.id='cenH3_(Animals)'"
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cenH3_(Animals),variant,Homo sapiens,9606,94,cenH3,94,cenH3_(Animals) is a centromere-specific histo...,,The nematodes Caenorhabditis elegans and C. re...,...,,"The active centromeric chromatin, enriched wit...",,,"In humans and most animals, cenH3 is expressed...",,cenH3 nucleosomes stabilize the inner kinetoch...,,,


In [60]:
df.to_dict(orient="records")

[{'id': 94,
  'level': 'variant',
  'taxonomic_span': 'Homo sapiens',
  'taxonomic_span_id': '9606',
  'description': 94,
  'parent': 'cenH3',
  'summary': 'cenH3_(Animals) is a centromere-specific histone variant in animals (Metazoa), often called CENP-A (Centromere Protein A) in mammals, and an important component of active centromere required for chromosome segregation.',
  'taxonomy': 'null',
  'genes': 'The nematodes Caenorhabditis elegans and C. remanei possess two genes encoding cenH3: HCP-3 (also known as CeCENP-A) and CPAR-1 [monen_separase_2015, monen_differential_2005]. While HCP-3 functions as the predominant centromeric histone - being highly expressed and essential for precise chromosome segregation during mitotic divisions - CPAR-1 displays distinct characteristics: it exhibits lower expression levels but shows specific enrichment on meiotic chromosomes. Importantly, CPAR-1 undergoes separase-mediated proteolytic cleavage at the meiosis I metaphase-to-anaphase transition

In [61]:
# Make sure data is committed to the database
conn.commit()

## <span style="color:black">Update description to cenH3_(Drosophilidae)</span>

### <span style="color:black">Update deposition</span>

```In Drosophila melanogaster, the deposition of cenH3 into nucleosomes is mediated by the chaperone CAL1 in a DNA sequence-independent manner. CAL1 facilitates nucleosome formation through direct interaction with cenH3-H4 histones and additionally recruits the FACT complex (comprising Dre4 and SSRP1 subunits) along with RNA polymerase II (RNAPII) [chen_establishment_2015, chen_cal1_2014]. FACT destabilizes H3-containing nucleosomes, enabling RNAPII-mediated transcription of DNA, a prerequisite for the replacement of H3 with CENP-A/H4 [chen_establishment_2015]. In the absence of FACT, transcription is abolished, preventing CENP-A incorporation. Importantly, CAL1 mediates the assembly of octameric nucleosomes with left-handed DNA wrapping, analogous to canonical nucleosomes [chen_cal1_2014]. In Drosophila embryos, cenH3 loading occurs during anaphase [schuh_incorporation_2007].```

In [62]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    "WHERE h.id='cenH3_(Drosophilidae)'"
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cenH3_(Drosophilidae),variant,Drosophilidae,7214,238,cenH3_(Insecta),238,cenH3_(Drosophilidae) is a centromere-specific...,,"In Drosophila, six cenH3 paralogs (Cid1–Cid6) ...",...,,"Similar to other animals, Drosophila cenH3 fun...",,,"In Drosophila melanogaster, the deposition of ...",,,,,


In [63]:
df.to_dict(orient="records")

[{'id': 238,
  'level': 'variant',
  'taxonomic_span': 'Drosophilidae',
  'taxonomic_span_id': '7214',
  'description': 238,
  'parent': 'cenH3_(Insecta)',
  'summary': 'cenH3_(Drosophilidae) is a centromere-specific histone variant in Drosophilidae family, functionally analogous to CENP-A in mammals.',
  'taxonomy': None,
  'genes': 'In Drosophila, six cenH3 paralogs (Cid1–Cid6) have been identified: Cid1, Cid4, and Cid6 (and likely Cid2) are essential for mitotic chromosome segregation and are ubiquitously expressed in somatic cells, while the remaining paralogs are germline-specific and may play a role in suppressing centromere drive [kursel_gametic_2021, kursel_recurrent_2017].',
  'evolution': None,
  'expression': None,
  'knock_out': None,
  'function': 'Similar to other animals, Drosophila cenH3 functions primarily in kinetochore assembly through CENP-C recruitment. However, in contrast to vertebrates where CENP-A directly binds CENP-C, Drosophila depends on the CAL1 chaperone 

In [65]:
deposition_desc = "In Drosophila melanogaster, the deposition of cenH3 into nucleosomes is mediated by the chaperone CAL1 in a DNA sequence-independent manner. CAL1 facilitates nucleosome formation through direct interaction with cenH3-H4 histones and additionally recruits the FACT complex (comprising Dre4 and SSRP1 subunits) along with RNA polymerase II (RNAPII) [chen_establishment_2015, chen_cal1_2014]. FACT destabilizes H3-containing nucleosomes, enabling RNAPII-mediated transcription of DNA, a prerequisite for the replacement of H3 with CENP-A/H4 [chen_establishment_2015]. In the absence of FACT, transcription is abolished, preventing CENP-A incorporation. Importantly, CAL1 mediates the assembly of octameric nucleosomes with left-handed DNA wrapping, analogous to canonical nucleosomes [chen_cal1_2014]. In Drosophila embryos, cenH3 loading occurs during anaphase [schuh_incorporation_2007]."
query = f'UPDATE histone_description SET deposition="{deposition_desc}" WHERE id=238'
print(query)
cursor.execute(query)

UPDATE histone_description SET deposition="In Drosophila melanogaster, the deposition of cenH3 into nucleosomes is mediated by the chaperone CAL1 in a DNA sequence-independent manner. CAL1 facilitates nucleosome formation through direct interaction with cenH3-H4 histones and additionally recruits the FACT complex (comprising Dre4 and SSRP1 subunits) along with RNA polymerase II (RNAPII) [chen_establishment_2015, chen_cal1_2014]. FACT destabilizes H3-containing nucleosomes, enabling RNAPII-mediated transcription of DNA, a prerequisite for the replacement of H3 with CENP-A/H4 [chen_establishment_2015]. In the absence of FACT, transcription is abolished, preventing CENP-A incorporation. Importantly, CAL1 mediates the assembly of octameric nucleosomes with left-handed DNA wrapping, analogous to canonical nucleosomes [chen_cal1_2014]. In Drosophila embryos, cenH3 loading occurs during anaphase [schuh_incorporation_2007]." WHERE id=238


In [66]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    "WHERE h.id='cenH3_(Drosophilidae)'"
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cenH3_(Drosophilidae),variant,Drosophilidae,7214,238,cenH3_(Insecta),238,cenH3_(Drosophilidae) is a centromere-specific...,,"In Drosophila, six cenH3 paralogs (Cid1–Cid6) ...",...,,"Similar to other animals, Drosophila cenH3 fun...",,,"In Drosophila melanogaster, the deposition of ...",,,,,


In [67]:
df.to_dict(orient="records")

[{'id': 238,
  'level': 'variant',
  'taxonomic_span': 'Drosophilidae',
  'taxonomic_span_id': '7214',
  'description': 238,
  'parent': 'cenH3_(Insecta)',
  'summary': 'cenH3_(Drosophilidae) is a centromere-specific histone variant in Drosophilidae family, functionally analogous to CENP-A in mammals.',
  'taxonomy': None,
  'genes': 'In Drosophila, six cenH3 paralogs (Cid1–Cid6) have been identified: Cid1, Cid4, and Cid6 (and likely Cid2) are essential for mitotic chromosome segregation and are ubiquitously expressed in somatic cells, while the remaining paralogs are germline-specific and may play a role in suppressing centromere drive [kursel_gametic_2021, kursel_recurrent_2017].',
  'evolution': None,
  'expression': None,
  'knock_out': None,
  'function': 'Similar to other animals, Drosophila cenH3 functions primarily in kinetochore assembly through CENP-C recruitment. However, in contrast to vertebrates where CENP-A directly binds CENP-C, Drosophila depends on the CAL1 chaperone 

In [68]:
# Make sure data is committed to the database
conn.commit()

## <span style="color:black">Update description to cenH3_(Homo_sapiens)</span>

### <span style="color:black">Update sequence</span>

```Post-translational modifications of specific CENP-A residues regulate centromere function. In humans, CENP-A is subject to serine phosphorylation at positions 7, 16, 18, and 68, glycine methylation at position 1, as well as lysine 124 ubiquitination and acetylation [wong_epigenetic_2020, eot-houllier_aurora_2018, gattat_phosphorylation_2013]. Phosphorylation of Ser7 regulates the localization of kinetochore proteins, including the association with CENP-C, and ensures proper mitotic progression. Phosphorylation of Ser68 prevents premature CENP-A loading. During the cell cycle, the conserved residue K124 locaded in histone fold undergoes cyclic modifications [bui_internal_2017]. In G1/S phase, K124ac compacts the nucleosome, interfering with CENP-C binding (crucial for kinetochore assembly) while simultaneously promoting chromatin decondensation for centromere replication. During S phase, K124me stabilizes nucleosomes. Furthermore, K124 mutations disrupt both mitosis and replication, demonstrating its critical role in centromere epigenetic regulation [bui_internal_2017].```

### <span style="color:black">Update interactions</span>

```CENP-A nucleosomes stabilize the inner kinetochore known as the CCAN complex (Constitutive Centromere-Associated Network) through direct interaction with CENP-C and CENP-N proteins of this complex [cao_constitutive_2018, pesenti_structure_2022, sridhar_kinetochore_2022, xu_gross_2023]. The CENP-A targeting domain (CATD) along with its N- and C-terminal tails play crucial roles in this process [logsdon_both_2015]. Human CENP-C has been demonstrated to bind with the CENP-A C-terminal hydrophobic tail, the acidic patch of H2A-H2B and histone H4, thereby facilitating its engagement with all four histone subunits present on the nucleosome surface [allu_structure_2019]. The binding of CENP-C to CENP-A is critical for long-term viability in human RPE-1 cells [watanabe_cdk1-mediated_2019]. This interaction is stabilized by the process of CENP-C phosphorylation mediated by CDK1, which strengthens the complex's structure through the occurrence of an intramolecular interaction [watanabe_cdk1-mediated_2019, walstein_assembly_2021, ariyoshi_cryoem_2021]. Recruitment of CENP-C also depends on the N-terminal tail of CENP-A, albeit through an indirect mechanism [logsdon_both_2015]. The direct binding of human CENP-N to the CENP-A nucleosome is mediated by its recognition of the CATD, which encompasses the unique L1 and RG loops [chittori_structural_2018, tian_molecular_2018, ariyoshi_cryoem_2021]. However, cryo-EM studies have revealed that CENP-C and CENP-N bind to a single CENP-A nucleosome in a non-simultaneous manner, thereby demonstrating an asymmetric structure where CENP-C and CENP-N bind to opposite sides of the nucleosome [ariyoshi_cryoem_2021]. Furthermore, in 94% of cases, phosphorylated CENP-C exhibits exclusive binding to the RG-loop of CENP-A, suggesting that this particular interaction is predominant [ariyoshi_cryoem_2021]. In addition, in vitro reconstruction of the human CENP-C protein showed that CENP-C can bind to two centromeric nucleosomes simultaneously [walstein_assembly_2021]. Nevertheless, interactions with CENP-C or CENP-N do not determine the stability of CENP-A nucleosomes in chromatin [cao_constitutive_2018].```

In [69]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    "WHERE h.id='cenH3_(Homo_sapiens)'"
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cenH3_(Homo_sapiens),variant,Homo sapiens,9606,158,cenH3_(Mammalia),158,cenH3_(Homo_sapiens) is a centromere-specific ...,,,...,CENP-A depletion at different stages of the ce...,"While CENP-A is a crucial centromeric marker, ...",The conserved residue K124 locaded in histone ...,CENP-A in human chromosomes usually localized ...,Deposition of CENP-A into centromeric nucleoso...,CENP-A confers enhanced flexibility and elasti...,CENP-A nucleosomes stabilize the inner kinetoc...,,,


In [70]:
df.to_dict(orient="records")

[{'id': 158,
  'level': 'variant',
  'taxonomic_span': 'Homo sapiens',
  'taxonomic_span_id': '9606',
  'description': 158,
  'parent': 'cenH3_(Mammalia)',
  'summary': 'cenH3_(Homo_sapiens) is a centromere-specific histone variant in human, often called CENP-A (Centromere Protein A), and an important component of active centromere required for chromosome segregation. For a general description see cenH3_(Animals) class.',
  'taxonomy': 'null',
  'genes': 'null',
  'evolution': 'null',
  'expression': 'null',
  'knock_out': 'CENP-A depletion at different stages of the cell cycle showed that the earlier CENP-A is removed, the more errors accumulate during later cell divisions [hoffmann_cenp-is_2016].',
  'function': 'While CENP-A is a crucial centromeric marker, the formation of a functional kinetochore and proper chromosome segregation requires an epigenetic context, including the targeting of the chaperone HJURP and histone PTMs [hara_critical_2017, hori_histone_2014]. Human CENP-A pla

In [72]:
sequence_desc = "Post-translational modifications of specific CENP-A residues regulate centromere function. In humans, CENP-A is subject to serine phosphorylation at positions 7, 16, 18, and 68, glycine methylation at position 1, as well as lysine 124 ubiquitination and acetylation [wong_epigenetic_2020, eot-houllier_aurora_2018, gattat_phosphorylation_2013]. Phosphorylation of Ser7 regulates the localization of kinetochore proteins, including the association with CENP-C, and ensures proper mitotic progression. Phosphorylation of Ser68 prevents premature CENP-A loading. During the cell cycle, the conserved residue K124 locaded in histone fold undergoes cyclic modifications [bui_internal_2017]. In G1/S phase, K124ac compacts the nucleosome, interfering with CENP-C binding (crucial for kinetochore assembly) while simultaneously promoting chromatin decondensation for centromere replication. During S phase, K124me stabilizes nucleosomes. Furthermore, K124 mutations disrupt both mitosis and replication, demonstrating its critical role in centromere epigenetic regulation [bui_internal_2017]."
interactions_desc = "CENP-A nucleosomes stabilize the inner kinetochore known as the CCAN complex (Constitutive Centromere-Associated Network) through direct interaction with CENP-C and CENP-N proteins of this complex [cao_constitutive_2018, pesenti_structure_2022, sridhar_kinetochore_2022, xu_gross_2023]. The CENP-A targeting domain (CATD) along with its N- and C-terminal tails play crucial roles in this process [logsdon_both_2015]. Human CENP-C has been demonstrated to bind with the CENP-A C-terminal hydrophobic tail, the acidic patch of H2A-H2B and histone H4, thereby facilitating its engagement with all four histone subunits present on the nucleosome surface [allu_structure_2019]. The binding of CENP-C to CENP-A is critical for long-term viability in human RPE-1 cells [watanabe_cdk1-mediated_2019]. This interaction is stabilized by the process of CENP-C phosphorylation mediated by CDK1, which strengthens the complex's structure through the occurrence of an intramolecular interaction [watanabe_cdk1-mediated_2019, walstein_assembly_2021, ariyoshi_cryoem_2021]. Recruitment of CENP-C also depends on the N-terminal tail of CENP-A, albeit through an indirect mechanism [logsdon_both_2015]. The direct binding of human CENP-N to the CENP-A nucleosome is mediated by its recognition of the CATD, which encompasses the unique L1 and RG loops [chittori_structural_2018, tian_molecular_2018, ariyoshi_cryoem_2021]. However, cryo-EM studies have revealed that CENP-C and CENP-N bind to a single CENP-A nucleosome in a non-simultaneous manner, thereby demonstrating an asymmetric structure where CENP-C and CENP-N bind to opposite sides of the nucleosome [ariyoshi_cryoem_2021]. Furthermore, in 94% of cases, phosphorylated CENP-C exhibits exclusive binding to the RG-loop of CENP-A, suggesting that this particular interaction is predominant [ariyoshi_cryoem_2021]. In addition, in vitro reconstruction of the human CENP-C protein showed that CENP-C can bind to two centromeric nucleosomes simultaneously [walstein_assembly_2021]. Nevertheless, interactions with CENP-C or CENP-N do not determine the stability of CENP-A nucleosomes in chromatin [cao_constitutive_2018]."
query = f'UPDATE histone_description SET sequence="{sequence_desc}", interactions="{interactions_desc}" WHERE id=158'
print(query)
cursor.execute(query)

UPDATE histone_description SET sequence="Post-translational modifications of specific CENP-A residues regulate centromere function. In humans, CENP-A is subject to serine phosphorylation at positions 7, 16, 18, and 68, glycine methylation at position 1, as well as lysine 124 ubiquitination and acetylation [wong_epigenetic_2020, eot-houllier_aurora_2018, gattat_phosphorylation_2013]. Phosphorylation of Ser7 regulates the localization of kinetochore proteins, including the association with CENP-C, and ensures proper mitotic progression. Phosphorylation of Ser68 prevents premature CENP-A loading. During the cell cycle, the conserved residue K124 locaded in histone fold undergoes cyclic modifications [bui_internal_2017]. In G1/S phase, K124ac compacts the nucleosome, interfering with CENP-C binding (crucial for kinetochore assembly) while simultaneously promoting chromatin decondensation for centromere replication. During S phase, K124me stabilizes nucleosomes. Furthermore, K124 mutations 

In [73]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    "WHERE h.id='cenH3_(Homo_sapiens)'"
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cenH3_(Homo_sapiens),variant,Homo sapiens,9606,158,cenH3_(Mammalia),158,cenH3_(Homo_sapiens) is a centromere-specific ...,,,...,CENP-A depletion at different stages of the ce...,"While CENP-A is a crucial centromeric marker, ...",Post-translational modifications of specific C...,CENP-A in human chromosomes usually localized ...,Deposition of CENP-A into centromeric nucleoso...,CENP-A confers enhanced flexibility and elasti...,CENP-A nucleosomes stabilize the inner kinetoc...,,,


In [74]:
df.to_dict(orient="records")

[{'id': 158,
  'level': 'variant',
  'taxonomic_span': 'Homo sapiens',
  'taxonomic_span_id': '9606',
  'description': 158,
  'parent': 'cenH3_(Mammalia)',
  'summary': 'cenH3_(Homo_sapiens) is a centromere-specific histone variant in human, often called CENP-A (Centromere Protein A), and an important component of active centromere required for chromosome segregation. For a general description see cenH3_(Animals) class.',
  'taxonomy': 'null',
  'genes': 'null',
  'evolution': 'null',
  'expression': 'null',
  'knock_out': 'CENP-A depletion at different stages of the cell cycle showed that the earlier CENP-A is removed, the more errors accumulate during later cell divisions [hoffmann_cenp-is_2016].',
  'function': 'While CENP-A is a crucial centromeric marker, the formation of a functional kinetochore and proper chromosome segregation requires an epigenetic context, including the targeting of the chaperone HJURP and histone PTMs [hara_critical_2017, hori_histone_2014]. Human CENP-A pla

In [75]:
# Make sure data is committed to the database
conn.commit()

Like in other eukaryotes, in fungi the cenH3 variant contains a distinctive CENP-A targeting domain (CATD) that mediates specific chaperone recognition and facilitates kinetochore protein assembly [wong_epigenetic_2020]. Cse4 (cenH3 of the budding yeast) possesses a C-terminal histone-fold domain, which is over 60% identical to that of histone H3, and a unique 135-amino acid N-terminal domain that protrudes from the nucleosome core and interacts with kinetochore proteins essential for its assembly [samel_methylation_2012]. Additionally, arginine 37 methylation plays important role in recruitment of inner and linker kinetochore proteins [wong_epigenetic_2020]. The loss of Arg37me has been shown to impair accurate chromosome segregation [samel_methylation_2012].## <span style="color:black">Add description to cenH3_(Fungi)</span>

### <span style="color:black">Add summary</span>

```cenH3_(Fungi) is a centromere-specific histone variant in fungi, often called Cse4 budding yeast and Cnp1 fission yeast, and an important component of active centromere required for chromosome segregation [stoler_mutation_1995].```

### <span style="color:black">Add localization</span>

```In the budding yeast cenH3 is strictly localized to point centromeres, which are specified by a short (~125 bp) specific DNA sequence containing centromeric DNA elements (CDEs) [hara_critical_2017, steiner_diversity_2015]. In the fission yeast cenH3 localizes to regional centromeres that span several kilobases and contain repetitive DNA sequences [hara_critical_2017, steiner_diversity_2015].```

### <span style="color:black">Add function</span>

```Like in other eukaryotes, in fungi the active centromeric chromatin, enriched with cenH3, defines the region of kinetochore interaction for spindle formation [hara_critical_2017, stoler_mutation_1995].```

### <span style="color:black">Add sequence</span>

```Like in other eukaryotes, in fungi the cenH3 variant contains a distinctive CENP-A targeting domain (CATD) that mediates specific chaperone recognition and facilitates kinetochore protein assembly [wong_epigenetic_2020]. Cse4 (cenH3 of the budding yeast) possesses a C-terminal histone-fold domain, which is over 60% identical to that of histone H3, and a unique 135-amino acid N-terminal domain that protrudes from the nucleosome core and interacts with kinetochore proteins essential for its assembly [samel_methylation_2012]. Additionally, arginine 37 methylation plays important role in recruitment of inner and linker kinetochore proteins [wong_epigenetic_2020]. The loss of Arg37me has been shown to impair accurate chromosome segregation [samel_methylation_2012].```

### <span style="color:black">Add deposition</span>

```In fungi cenH3 loading is mediated by Scm3, the HJURP homologue [wong_epigenetic_2020, zasadzinska_dimerization_2013, shivaraju_scm3_2011]. The deposition occurs in two distinct windows: during S-phase and anaphase in budding yeast, compared to S and G2 phases in fission yeast [shivaraju_cell_2012, takayama_biphasic_2008]. Notably, budding yeast exhibit complete turnover of cenH3 each cell cycle [wong_epigenetic_2020].```

### <span style="color:black">Add structure</span>

```In yeast, cenH3-containing nucleosomes exhibit an octameric structure [dechassa_structure_2011, shivaraju_scm3_2011].```

In [None]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    "WHERE h.id='cenH3_(Fungi)'"
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cenH3_(Fungi),variant,Homo sapiens,9606,95,cenH3,95,,,,...,,,,,,,,,,


In [None]:
df.to_dict(orient="records")

[{'id': 95,
  'level': 'variant',
  'taxonomic_span': 'Homo sapiens',
  'taxonomic_span_id': '9606',
  'description': 95,
  'parent': 'cenH3',
  'summary': 'null',
  'taxonomy': 'null',
  'genes': 'null',
  'evolution': 'null',
  'expression': 'null',
  'knock_out': 'null',
  'function': 'null',
  'sequence': 'null',
  'localization': 'null',
  'deposition': 'null',
  'structure': 'null',
  'interactions': 'null',
  'disease': 'null',
  'caveats': 'null',
  'relations': None}]

In [79]:
summary_desc = "cenH3_(Fungi) is a centromere-specific histone variant in fungi, often called Cse4 budding yeast and Cnp1 fission yeast, and an important component of active centromere required for chromosome segregation [stoler_mutation_1995]."
localization_desc = "In the budding yeast cenH3 is strictly localized to point centromeres, which are specified by a short (~125 bp) specific DNA sequence containing centromeric DNA elements (CDEs) [hara_critical_2017, steiner_diversity_2015]. In the fission yeast cenH3 localizes to regional centromeres that span several kilobases and contain repetitive DNA sequences [hara_critical_2017, steiner_diversity_2015]."
function_desc = "Like in other eukaryotes, in fungi the active centromeric chromatin, enriched with cenH3, defines the region of kinetochore interaction for spindle formation [hara_critical_2017, stoler_mutation_1995]."
sequence_desc = "Like in other eukaryotes, in fungi the cenH3 variant contains a distinctive CENP-A targeting domain (CATD) that mediates specific chaperone recognition and facilitates kinetochore protein assembly [wong_epigenetic_2020]. Cse4 (cenH3 of the budding yeast) possesses a C-terminal histone-fold domain, which is over 60% identical to that of histone H3, and a unique 135-amino acid N-terminal domain that protrudes from the nucleosome core and interacts with kinetochore proteins essential for its assembly [samel_methylation_2012]. Additionally, arginine 37 methylation plays important role in recruitment of inner and linker kinetochore proteins [wong_epigenetic_2020]. The loss of Arg37me has been shown to impair accurate chromosome segregation [samel_methylation_2012]."
deposition_desc = "In fungi cenH3 loading is mediated by Scm3, the HJURP homologue [wong_epigenetic_2020, zasadzinska_dimerization_2013, shivaraju_scm3_2011]. The deposition occurs in two distinct windows: during S-phase and anaphase in budding yeast, compared to S and G2 phases in fission yeast [shivaraju_cell_2012, takayama_biphasic_2008]. Notably, budding yeast exhibit complete turnover of cenH3 each cell cycle [wong_epigenetic_2020]."
structure_desc = "In yeast, cenH3-containing nucleosomes exhibit an octameric structure [dechassa_structure_2011, shivaraju_scm3_2011]."
query = f'UPDATE histone_description SET summary="{summary_desc}", localization="{localization_desc}", function="{function_desc}", sequence="{sequence_desc}", deposition="{deposition_desc}", structure="{structure_desc}" WHERE id=95'
print(query)
cursor.execute(query)

UPDATE histone_description SET summary="cenH3_(Fungi) is a centromere-specific histone variant in fungi, often called Cse4 budding yeast and Cnp1 fission yeast, and an important component of active centromere required for chromosome segregation [stoler_mutation_1995].", localization="In the budding yeast cenH3 is strictly localized to point centromeres, which are specified by a short (~125 bp) specific DNA sequence containing centromeric DNA elements (CDEs) [hara_critical_2017, steiner_diversity_2015]. In the fission yeast cenH3 localizes to regional centromeres that span several kilobases and contain repetitive DNA sequences [hara_critical_2017, steiner_diversity_2015].", function="Like in other eukaryotes, in fungi the active centromeric chromatin, enriched with cenH3, defines the region of kinetochore interaction for spindle formation [hara_critical_2017, stoler_mutation_1995].", sequence="Like in other eukaryotes, in fungi the cenH3 variant contains a distinctive CENP-A targeting d

In [80]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    "WHERE h.id='cenH3_(Fungi)'"
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cenH3_(Fungi),variant,Homo sapiens,9606,95,cenH3,95,cenH3_(Fungi) is a centromere-specific histone...,,,...,,"Like in other eukaryotes, in fungi the active ...","Like in other eukaryotes, in fungi the cenH3 v...",In the budding yeast cenH3 is strictly localiz...,"In fungi cenH3 loading is mediated by Scm3, th...","In yeast, cenH3-containing nucleosomes exhibit...",,,,


In [81]:
df.to_dict(orient="records")

[{'id': 95,
  'level': 'variant',
  'taxonomic_span': 'Homo sapiens',
  'taxonomic_span_id': '9606',
  'description': 95,
  'parent': 'cenH3',
  'summary': 'cenH3_(Fungi) is a centromere-specific histone variant in fungi, often called Cse4 budding yeast and Cnp1 fission yeast, and an important component of active centromere required for chromosome segregation [stoler_mutation_1995].',
  'taxonomy': 'null',
  'genes': 'null',
  'evolution': 'null',
  'expression': 'null',
  'knock_out': 'null',
  'function': 'Like in other eukaryotes, in fungi the active centromeric chromatin, enriched with cenH3, defines the region of kinetochore interaction for spindle formation [hara_critical_2017, stoler_mutation_1995].',
  'sequence': 'Like in other eukaryotes, in fungi the cenH3 variant contains a distinctive CENP-A targeting domain (CATD) that mediates specific chaperone recognition and facilitates kinetochore protein assembly [wong_epigenetic_2020]. Cse4 (cenH3 of the budding yeast) possesses a 

In [82]:
# Make sure data is committed to the database
conn.commit()

# Close connections

In [83]:
cursor.close()
conn.close()
tunnel.stop()