In [1]:
import io

import pandas as pd
from Bio import Entrez, SeqIO
from mysql.connector import connection
from sshtunnel import SSHTunnelForwarder

# Provide your email address (required for using Entrez)
Entrez.email = "your.email@example.com"

In [2]:
# Load the SSH / database connection settings from a KEY=VALUE text file.
with open("db_curated_server_info.txt", "r") as file:
    lines = file.readlines()

config = {}

for line in lines:
    line = line.strip()
    # Skip blanks, "#" comments and malformed lines without "=" (the latter
    # used to abort the cell with an unpacking ValueError).
    if not line or line.startswith("#") or "=" not in line:
        continue
    key, value = line.split("=", 1)
    # Strip the key too, so "db_port = 3306" is stored under "db_port".
    config[key.strip()] = value.strip()

# Both ports are converted with int() below; name a missing one explicitly
# instead of failing with an opaque int(None) TypeError.
missing_ports = [name for name in ("srever_port", "db_port") if name not in config]
if missing_ports:
    raise KeyError(
        f"Missing required setting(s) in db_curated_server_info.txt: {missing_ports}"
    )

server_name = config.get("server_name")
# The "srever_port" spelling is kept: it is the key this cell has always read
# and the variable name the tunnel cell refers to.
srever_port = int(config["srever_port"])
ssh_password = config.get("ssh_password")
ssh_username = config.get("ssh_username")
db_adress = config.get("db_adress")
db_port = int(config["db_port"])

In [3]:
# Open an SSH tunnel to the database host and show the local port it is
# bound to (the MySQL connection below connects to that port).
ssh_endpoint = (server_name, srever_port)
db_endpoint = (db_adress, db_port)
tunnel = SSHTunnelForwarder(
    ssh_endpoint,
    ssh_username=ssh_username,
    ssh_password=ssh_password,
    remote_bind_address=db_endpoint,
)
tunnel.start()
print(tunnel.local_bind_port)

36687


In [4]:
# Connect to MySQL through the local end of the SSH tunnel.
# Credentials are read from the parsed settings (`config`) when it provides
# the keys db_user / db_password / db_name, so they need not be written into
# the notebook; the original literals remain as fallbacks, so the cell
# behaves exactly as before when those keys are absent.
conn = connection.MySQLConnection(
    user=config.get("db_user", "db_user"),
    password=config.get("db_password", "db_password"),
    host="localhost",
    port=tunnel.local_bind_port,
    database=config.get("db_name", "db_name"),
)
cursor = conn.cursor()

In [5]:
# List the tables of the connected schema.
query = "SHOW TABLES;"
cursor.execute(operation=query)
cursor.fetchall()

[('alternative_name',),
 ('histone',),
 ('histone_description',),
 ('histone_has_publication',),
 ('publication',),
 ('sequence',),
 ('sequence_has_publication',)]

In [84]:
def _named_insert(table, columns):
    """Build an INSERT statement with one ``%(column)s`` placeholder per column."""
    column_list = ", ".join(columns)
    placeholders = ", ".join(f"%({column})s" for column in columns)
    return f"INSERT INTO {table} ({column_list}) VALUES ({placeholders})"


def _positional_insert(table, columns):
    """Build an INSERT statement with one positional ``%s`` placeholder per column."""
    column_list = ", ".join(columns)
    placeholders = ", ".join(["%s"] * len(columns))
    return f"INSERT INTO {table} ({column_list}) VALUES ({placeholders})"


# Insert templates used throughout the notebook. Named templates take a dict
# of values, positional ones take a tuple in column order.
add_histone = _named_insert(
    "histone",
    ("id", "level", "taxonomic_span", "taxonomic_span_id", "description", "parent"),
)
add_histone_description = _positional_insert(
    "histone_description",
    (
        "summary",
        "taxonomy",
        "genes",
        "evolution",
        "expression",
        "knock_out",
        "function",
        "sequence",
        "localization",
        "deposition",
        "structure",
        "interactions",
        "disease",
        "caveats",
    ),
)
add_publication = _named_insert(
    "publication", ("id", "title", "doi", "author", "year")
)
add_sequence = _named_insert(
    "sequence",
    (
        "accession",
        "variant",
        "gi",
        "ncbi_gene_id",
        "hgnc_gene_name",
        "taxonomy_id",
        "organism",
        "phylum",
        "class",
        "taxonomy_group",
        "info",
        "sequence",
        "variant_under_consideration",
    ),
)
add_sequence_has_publication = _positional_insert(
    "sequence_has_publication", ("sequence_accession", "publication_id")
)
# Disabled: insert template for the alternative_name table.
# add_alternate_names = _named_insert(
#     "alternative_name", ("name", "taxonomy", "gene", "splice", "histone")
# )
add_histone_has_publication = _positional_insert(
    "histone_has_publication", ("histone_id", "publication_id")
)

In [55]:
def get_taxonomy_data(record):
    """Collect organism, NCBI taxonomy id, phylum and class for a sequence record.

    Args:
        record: object exposing ``annotations["organism"]`` and
            ``features[0].qualifiers["db_xref"]`` with entries such as
            ``"taxon:4565"`` (presumably a Biopython SeqRecord parsed from
            GenBank; confirm against the caller).

    Returns:
        dict with the keys ``organism``, ``taxonomy_id`` (int; 1 when no taxon
        cross-reference could be read), ``phylum`` and ``class`` (str, or None
        when the lineage lookup failed or has no such rank).
    """
    import re

    max_attempts = 10
    xref_pattern = re.compile(r"(\S+):(\S+)")

    taxonomy_data = {"organism": record.annotations["organism"]}
    try:
        for xref in record.features[0].qualifiers["db_xref"]:
            match = xref_pattern.search(xref)
            if match is None or match.group(1) != "taxon":
                continue
            taxid = match.group(2)
            print(f"Fetched taxid from NCBI {taxid}")
            taxonomy_data["taxonomy_id"] = int(taxid)
    except Exception:
        # Best effort: any problem reading the cross-references (no features,
        # no db_xref qualifier, non-numeric id) is handled by the fallback below.
        pass
    if "taxonomy_id" not in taxonomy_data:
        # Also covers records whose db_xref list simply has no "taxon:" entry;
        # previously that left the key unset and crashed with KeyError later.
        print("!!!!!!Unable to get TAXID for this record setting it to 1")
        taxonomy_data["taxonomy_id"] = 1  # unable to identify

    lineage = {}
    for attempt in range(max_attempts):
        try:
            # The context manager closes the handle even when parsing fails.
            with Entrez.efetch(
                id=taxonomy_data["taxonomy_id"], db="taxonomy", retmode="xml"
            ) as handle:
                tax_data = Entrez.read(handle)
            lineage = {
                d["Rank"]: d["ScientificName"]
                for d in tax_data[0]["LineageEx"]
                if d["Rank"] in ["class", "phylum"]
            }
            break
        except Exception as exc:
            # Exception (not a bare except) so that Ctrl-C still interrupts
            # the retry loop.
            print(
                "Unexpected error: {}, Retrying, attempt {}".format(type(exc), attempt)
            )
            if attempt == max_attempts - 1:
                print(
                    f"FATAL ERROR could not get class and phylum from NCBI after 10 attempts for taxid:{taxonomy_data['taxonomy_id']}. Will add None for class and phylum!"
                )

    # Store plain str values (or None) for both ranks.
    for rank in ("phylum", "class"):
        rank_name = lineage.get(rank)
        taxonomy_data[rank] = None if rank_name is None else str(rank_name)
    return taxonomy_data

# Done

## <span style="color:green">Add to description of cenH3_(Plants):

    ```
    While most diploid eukaryotes and flowering plants maintain only one CENH3 gene copy (even after whole-genome duplications), some diploid species like Arabidopsis lyrata, barley, rye, pea, and related legumes retain two functional CENH3 homologs [ishii_unequal_2020].
    ```
</span>

## <span style="color:green">Add node cenH3_(Vigna_unguiculata) before children nodes cenH3.1_(Vigna_unguiculata) and cenH3.2_(Vigna_unguiculata).</span>

## <span style="color:green">Add node cenH3_(Triticeae) after cenH3_(Plants).</span>

## <span style="color:green">Add two children nodes for cenH3_(Triticeae): cenH3.1_(Triticeae) and cenH3.2_(Triticeae)</span>

## <span style="color:green">Add descriptions and publication for new nodes [yuan_characterization_2015]</span>

## <span style="color:green">Add sequences for cenH3.1_(Triticeae) and cenH3.2_(Triticeae)</span>

## See more in DeepSeek "Отличия αCENH3 и βCENH3 в пшенице"

Вот структурированная выгрузка ключевых выводов из нашего обсуждения статьи **Yuan et al. (2015)** о генах *αCENH3* и *βCENH3* у пшеницы и родственных видов:

---

### **1. Основные открытия статьи [yuan_characterization_2015]**  
- **Два гена *CENH3*** (*α* и *β*) обнаружены у:  
  - Всех изученных видов *Triticum* (диплоиды, тетраплоиды, гексаплоиды).  
  - Диплоидных предков (*Aegilops speltoides* (SS), *Ae. tauschii* (DD)).  
  - Ячменя (*Hordeum vulgare*).  
- **Функциональные различия**:  
  - *αCENH3* критичен для митоза и развития растений. Подавление экспрессии *αCENH3* приводит к сильной карликовости и ослаблению корневой системы, что указывает на его критическую роль в развитии растения.
  - *βCENH3* важен для репродукции и адаптации полиплоидов. Подавление *βCENH3* вызывает менее выраженные фенотипические изменения, но снижает репродуктивную способность, что предполагает его специализированную роль в репродуктивном развитии.

**Цитата**:  
> *"Plants inoculated with BSMV:αCENH3 were extremely dwarfed [...] while BSMV:βCENH3-treated plants produced very few seeds."*  

---

### **2. Таксономические уточнения**  
- **Не все злаки имеют два гена**:  
  - Один ген: рис (*Oryza*), кукуруза (*Zea*), сорго (*Sorghum*).  
  - Два гена: пшеница (*Triticum*), ячмень (*Hordeum*), *Aegilops*.  
- **Aegilops ≠ пшеница**:  
  - *Ae. speltoides* (SS) — дикий предок B-генома пшеницы, но относится к другому роду.  

**Цитата**:  
> *"Tetraploid wheat evolved through hybridization between Triticum (AA) and Aegilops (SS)."*  

---

### **3. Методология**  
- **Экспериментальное открытие *βCENH3***:  
  - ПЦР с праймерами P1/P2 на консервативном участке CATD.  
  - Не биоинформатика (BLAST выявил только *αCENH3*).  
- **Подтверждение**:  
  - Секвенирование, FISH, VIGS, филогенетический анализ.  

**Цитата**:  
> *"Using primers P1 and P2, we identified a second CENH3, named βCENH3."*  

---

### **4. Ограничения и открытые вопросы**  
- **Не все *Triticinae* изучены**:  
  - Нет данных по родам *Amblyopyrum*, *Henrardia* и др.  
- **Причины дупликации**:  
  - Предполагается общий предок *Pooideae* (~13 млн лет назад), но механизмы не ясны.  

---

### **5. Ключевые цитаты**  
1. О роли *βCENH3*:  
   > *"βCENH3 is under positive selection in cultivated tetraploids, suggesting adaptive evolution."*  
2. О локализации:  
   > *"αCENH3 is present at all mitotic stages, while βCENH3 is detected mainly in interphase."*  


# Add to description of cenH3_(Plants)


    ```
    While most diploid eukaryotes and flowering plants maintain only one CENH3 gene copy (even after whole-genome duplications), some diploid species like Arabidopsis lyrata, barley, rye, pea, and related legumes retain two functional CENH3 homologs [ishii_unequal_2020].
    ```

In [7]:
# Show cenH3_(Plants) together with its joined histone_description row.
query = (
    "SELECT * FROM histone h "
    "LEFT JOIN histone_description hd ON h.description = hd.id "
    "WHERE h.id='cenH3_(Plants)'"
)
cursor.execute(query)
column_names = [column[0] for column in cursor.description]
pd.DataFrame(cursor.fetchall(), columns=column_names)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cenH3_(Plants),variant,Eukaryotes,2759,,cenH3,,,,,...,,,,,,,,,,


In [10]:
# Description texts for cenH3_(Plants): a summary and a note on gene copy number.
histone_desc_summary = "cenH3_(Plants) is a centromere-specific histone variant in Plants. This is a subclass of cenH3 (see it for a detailed description)."
genes_desc = "While most diploid eukaryotes and flowering plants maintain only one CENH3 gene copy (even after whole-genome duplications), some diploid species like Arabidopsis lyrata, barley, rye, pea, and related legumes retain two functional CENH3 homologs [ishii_unequal_2020]."
# Values follow the column order of add_histone_description: summary, taxonomy
# (left NULL), genes, then the remaining 11 columns as NULL (14 in total).
data_histone_description = (histone_desc_summary, None, genes_desc) + (None,) * 11
cursor.execute(add_histone_description, data_histone_description)

# Attach the description row inserted just above to the cenH3_(Plants) node.
histone_description_id = cursor.lastrowid
if not histone_description_id:
    # No auto-increment id was reported, so there is no row to link to.
    raise RuntimeError("INSERT into histone_description returned no row id")
# Bind the values as parameters (like the INSERT templates) instead of
# formatting them into the SQL text.
query = "UPDATE histone SET description=%s WHERE id=%s"
cursor.execute(query, (histone_description_id, "cenH3_(Plants)"))

In [11]:
# Re-read cenH3_(Plants) to confirm that the description is now linked.
query = (
    "SELECT * FROM histone h "
    "LEFT JOIN histone_description hd ON h.description = hd.id "
    "WHERE h.id='cenH3_(Plants)'"
)
cursor.execute(query)
column_names = [column[0] for column in cursor.description]
pd.DataFrame(cursor.fetchall(), columns=column_names)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cenH3_(Plants),variant,Eukaryotes,2759,233,cenH3,233,cenH3_(Plants) is a centromere-specific histon...,,While most diploid eukaryotes and flowering pl...,...,,,,,,,,,,


In [12]:
# Commit the new cenH3_(Plants) description row and its link from the histone table.
conn.commit()

# Add node cenH3_(Vigna_unguiculata) before children nodes cenH3.1_(Vigna_unguiculata) and cenH3.2_(Vigna_unguiculata)

In [14]:
# New intermediate node for Vigna unguiculata, placed directly under cenH3_(Plants).
data_histone = dict(
    id="cenH3_(Vigna_unguiculata)",
    level="variant",
    taxonomic_span="Vigna unguiculata",
    taxonomic_span_id="3917",
    description=None,
    parent="cenH3_(Plants)",
)
cursor.execute(add_histone, data_histone)

In [15]:
# Re-read the histone table and display the newly inserted node.
query = "SELECT * FROM histone"
cursor.execute(query)
histone_df = pd.DataFrame(
    cursor.fetchall(), columns=[column[0] for column in cursor.description]
)
is_new_node = histone_df["id"].isin(["cenH3_(Vigna_unguiculata)"])
histone_df[is_new_node]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
12,cenH3_(Vigna_unguiculata),variant,Vigna unguiculata,3917,,cenH3_(Plants)


In [16]:
# Commit the newly inserted cenH3_(Vigna_unguiculata) node.
conn.commit()

In [17]:
# Show the two existing Vigna unguiculata variants before they are re-parented.
query = "SELECT * FROM histone"
cursor.execute(query)
histone_df = pd.DataFrame(
    cursor.fetchall(), columns=[column[0] for column in cursor.description]
)
vigna_children = ["cenH3.1_(Vigna_unguiculata)", "cenH3.2_(Vigna_unguiculata)"]
histone_df[histone_df["id"].isin(vigna_children)]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
5,cenH3.1_(Vigna_unguiculata),variant,Vigna unguiculata,3917,231.0,cenH3_(Plants)
6,cenH3.2_(Vigna_unguiculata),variant,Vigna unguiculata,3917,232.0,cenH3_(Plants)


In [19]:
# Move cenH3.1_(Vigna_unguiculata) under the new cenH3_(Vigna_unguiculata) node.
query = (
    "UPDATE histone SET parent='cenH3_(Vigna_unguiculata)' "
    "WHERE id='cenH3.1_(Vigna_unguiculata)'"
)
print(query)
cursor.execute(query)

UPDATE histone SET parent='cenH3_(Vigna_unguiculata)' WHERE id='cenH3.1_(Vigna_unguiculata)'


In [21]:
# Move cenH3.2_(Vigna_unguiculata) under the new cenH3_(Vigna_unguiculata) node.
query = (
    "UPDATE histone SET parent='cenH3_(Vigna_unguiculata)' "
    "WHERE id='cenH3.2_(Vigna_unguiculata)'"
)
print(query)
cursor.execute(query)

UPDATE histone SET parent='cenH3_(Vigna_unguiculata)' WHERE id='cenH3.2_(Vigna_unguiculata)'


In [22]:
# Confirm that both Vigna unguiculata variants now point at the new parent.
query = "SELECT * FROM histone"
cursor.execute(query)
histone_df = pd.DataFrame(
    cursor.fetchall(), columns=[column[0] for column in cursor.description]
)
vigna_children = ["cenH3.1_(Vigna_unguiculata)", "cenH3.2_(Vigna_unguiculata)"]
histone_df[histone_df["id"].isin(vigna_children)]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
5,cenH3.1_(Vigna_unguiculata),variant,Vigna unguiculata,3917,231.0,cenH3_(Vigna_unguiculata)
6,cenH3.2_(Vigna_unguiculata),variant,Vigna unguiculata,3917,232.0,cenH3_(Vigna_unguiculata)


In [23]:
# Commit the two parent updates for the Vigna unguiculata variants.
conn.commit()

# Add node cenH3_(Triticeae) after cenH3_(Plants)

In [24]:
# New node for Triticeae, placed directly under cenH3_(Plants).
data_histone = dict(
    id="cenH3_(Triticeae)",
    level="variant",
    taxonomic_span="Triticeae(?)",
    taxonomic_span_id="147389",
    description=None,
    parent="cenH3_(Plants)",
)
cursor.execute(add_histone, data_histone)

In [25]:
# Re-read the histone table and display the newly inserted Triticeae node.
query = "SELECT * FROM histone"
cursor.execute(query)
histone_df = pd.DataFrame(
    cursor.fetchall(), columns=[column[0] for column in cursor.description]
)
is_new_node = histone_df["id"].isin(["cenH3_(Triticeae)"])
histone_df[is_new_node]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
12,cenH3_(Triticeae),variant,Triticeae(?),147389,,cenH3_(Plants)


In [26]:
# Commit the newly inserted cenH3_(Triticeae) node.
conn.commit()

**Таксономическое положение**  
- **Род *Triticum*** (пшеница) включает культурные виды:  
  - Диплоиды (AA): *T. urartu*, *T. monococcum*.  
  - Тетраплоиды (AABB): *T. turgidum*, *T. timopheevii*.  
  - Гексаплоиды (AABBDD): *T. aestivum*.  

- **Род *Aegilops*** (эгилопс) — **дикие предки** B- и D-геномов пшеницы:  
  - *Ae. speltoides* (SS ≈ BB) — вероятный донор B-генома.  
  - *Ae. tauschii* (DD) — донор D-генома.  

**Цитата из статьи**:  
> *"Tetraploid wheat [...] evolved through hybridization between Sect. Monococcon (AA genome) and a species (SS genome, the alias of the BB genome in wild species) that belonged to the genus Aegilops."*  

- **Изученные виды**:  
  - У всех проанализированных представителей родов *Triticum* (пшеница) и *Aegilops* (эгилопс) обнаружены **два гена *CENH3*** (*α* и *β*).  
  - Примеры:  
    - *Triticum urartu* (AA), *T. aestivum* (AABBDD).  
    - *Aegilops speltoides* (SS), *Ae. tauschii* (DD).  

- **Филогенетический контекст**:  
  - Дупликация гена произошла у общего предка **подсемейства *Pooideae*** (включая *Triticum*, *Aegilops*, *Hordeum*).  
  - У других злаков (например, риса, кукурузы) — только один ген.  

**Цитата**:  
> *"The two CENH3s of wheat species are homologous to αHvCENH3 and βHvCENH3 from barley (Hordeum vulgare)."*  


In [27]:
# Description texts for cenH3_(Triticeae): a summary and the taxonomic coverage.
histone_desc_summary = "cenH3_(Triticeae) is a centromere-specific histone variant found in wheat (Triticum) and its close relatives (Aegilops and Hordeum). Interestingly, two functionally distinct CENH3 genes (α and β) were identified for this group of variants [yuan_characterization_2015]. While other cereals (like rice, maize) possess only one CENH3 gene."
taxonomy_desc = "This group of variants described in 11 diploid wheat species (Triticum urartu, T. monococcum and Aegilops spp.), 15 tetraploid and one hexaploid species [yuan_characterization_2015]."
# Values follow the column order of add_histone_description: summary, taxonomy,
# then the remaining 12 columns as NULL (14 in total).
data_histone_description = (histone_desc_summary, taxonomy_desc) + (None,) * 12
cursor.execute(add_histone_description, data_histone_description)

# Attach the description row inserted just above to the cenH3_(Triticeae) node.
histone_description_id = cursor.lastrowid
if not histone_description_id:
    # No auto-increment id was reported, so there is no row to link to.
    raise RuntimeError("INSERT into histone_description returned no row id")
# Bind the values as parameters (like the INSERT templates) instead of
# formatting them into the SQL text.
query = "UPDATE histone SET description=%s WHERE id=%s"
cursor.execute(query, (histone_description_id, "cenH3_(Triticeae)"))

In [30]:
# Show cenH3_(Triticeae) together with its joined histone_description row.
query = (
    "SELECT * FROM histone h "
    "LEFT JOIN histone_description hd ON h.description = hd.id "
    "WHERE h.id='cenH3_(Triticeae)'"
)
cursor.execute(query)
column_names = [column[0] for column in cursor.description]
pd.DataFrame(cursor.fetchall(), columns=column_names)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cenH3_(Triticeae),variant,Triticeae(?),147389,234,cenH3_(Plants),234,cenH3_(Triticeae) is a centromere-specific his...,This group of variants described in 11 diploid...,,...,,,,,,,,,,


In [31]:
# Commit the cenH3_(Triticeae) description row and its link from the histone table.
conn.commit()

# Add two children nodes for cenH3_(Triticeae): cenH3.1_(Triticeae) and cenH3.2_(Triticeae)

In [32]:
# The two child nodes differ only in their id; all other fields are shared.
data_histone = [
    {
        "id": child_id,
        "level": "variant",
        "taxonomic_span": "Triticeae(?)",
        "taxonomic_span_id": "147389",
        "description": None,
        "parent": "cenH3_(Triticeae)",
    }
    for child_id in ("cenH3.1_(Triticeae)", "cenH3.2_(Triticeae)")
]
for dh in data_histone:
    cursor.execute(add_histone, dh)

In [33]:
# Re-read the histone table and display the two new Triticeae child nodes.
query = "SELECT * FROM histone"
cursor.execute(query)
histone_df = pd.DataFrame(
    cursor.fetchall(), columns=[column[0] for column in cursor.description]
)
triticeae_children = ["cenH3.1_(Triticeae)", "cenH3.2_(Triticeae)"]
histone_df[histone_df["id"].isin(triticeae_children)]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
5,cenH3.1_(Triticeae),variant,Triticeae(?),147389,,cenH3_(Triticeae)
7,cenH3.2_(Triticeae),variant,Triticeae(?),147389,,cenH3_(Triticeae)


In [34]:
# Commit the two new child nodes cenH3.1_(Triticeae) and cenH3.2_(Triticeae).
conn.commit()

In [35]:
# Description texts for cenH3.1_(Triticeae): a summary and the taxonomic coverage.
histone_desc_summary = "cenH3.1_(Triticeae) is a centromere-specific histone H3 variant encoded by the αCENH3 gene (one of two paralogs) in polyploid wheat (Triticum spp.) [yuan_characterization_2015]. Unlike βCENH3, it is essential for mitotic progression and vegetative growth, as its silencing causes severe dwarfism and root system defects. The αCENH3 protein contains conserved histone-fold domains but differs from βCENH3 in its N-terminal tail and loop1 region. αCENH3 is constitutively expressed and maintains centromere localization throughout the cell cycle, while βCENH3 shows cell cycle-dependent deposition and specializes in reproductive adaptation [yuan_characterization_2015]."
taxonomy_desc = "This group of variants described in 11 diploid wheat species (Triticum urartu, T. monococcum and Aegilops spp.), 15 tetraploid and one hexaploid species [yuan_characterization_2015]."
# Values follow the column order of add_histone_description: summary, taxonomy,
# then the remaining 12 columns as NULL (14 in total).
data_histone_description = (histone_desc_summary, taxonomy_desc) + (None,) * 12
cursor.execute(add_histone_description, data_histone_description)

# Attach the description row inserted just above to the cenH3.1_(Triticeae) node.
histone_description_id = cursor.lastrowid
if not histone_description_id:
    # No auto-increment id was reported, so there is no row to link to.
    raise RuntimeError("INSERT into histone_description returned no row id")
# Bind the values as parameters (like the INSERT templates) instead of
# formatting them into the SQL text.
query = "UPDATE histone SET description=%s WHERE id=%s"
cursor.execute(query, (histone_description_id, "cenH3.1_(Triticeae)"))

In [36]:
# Show cenH3.1_(Triticeae) together with its joined histone_description row.
query = (
    "SELECT * FROM histone h "
    "LEFT JOIN histone_description hd ON h.description = hd.id "
    "WHERE h.id='cenH3.1_(Triticeae)'"
)
cursor.execute(query)
column_names = [column[0] for column in cursor.description]
pd.DataFrame(cursor.fetchall(), columns=column_names)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cenH3.1_(Triticeae),variant,Triticeae(?),147389,235,cenH3_(Triticeae),235,cenH3.1_(Triticeae) is a centromere-specific h...,This group of variants described in 11 diploid...,,...,,,,,,,,,,


In [37]:
# Commit the cenH3.1_(Triticeae) description row and its link from the histone table.
conn.commit()

In [38]:
# Description texts for cenH3.2_(Triticeae): a summary and the taxonomic coverage.
histone_desc_summary = "cenH3.2_(Triticeae) is a centromeric histone H3 variant encoded by the βCENH3 paralog in polyploid wheat (Triticum spp.) [yuan_characterization_2015]. Unlike the essential αCENH3, βCENH3 shows specialized functions in reproductive development and polyploid adaptation. While dispensable for vegetative growth, its silencing leads to reduced fertility and seed set. The βCENH3 protein features distinct loop1 domain variations and undergoes cell cycle-specific centromere deposition, being predominantly present during interphase. Evolutionary analysis reveals positive selection in its histone-fold domain, particularly in cultivated wheat varieties, suggesting adaptive evolution under domestication pressures [yuan_characterization_2015]."
taxonomy_desc = "This group of variants described in 11 diploid wheat species (Triticum urartu, T. monococcum and Aegilops spp.), 15 tetraploid and one hexaploid species [yuan_characterization_2015]."
# Values follow the column order of add_histone_description: summary, taxonomy,
# then the remaining 12 columns as NULL (14 in total).
data_histone_description = (histone_desc_summary, taxonomy_desc) + (None,) * 12
cursor.execute(add_histone_description, data_histone_description)

# Attach the description row inserted just above to the cenH3.2_(Triticeae) node.
histone_description_id = cursor.lastrowid
if not histone_description_id:
    # No auto-increment id was reported, so there is no row to link to.
    raise RuntimeError("INSERT into histone_description returned no row id")
# Bind the values as parameters (like the INSERT templates) instead of
# formatting them into the SQL text.
query = "UPDATE histone SET description=%s WHERE id=%s"
cursor.execute(query, (histone_description_id, "cenH3.2_(Triticeae)"))

In [39]:
# Show cenH3.2_(Triticeae) together with its joined histone_description row.
query = (
    "SELECT * FROM histone h "
    "LEFT JOIN histone_description hd ON h.description = hd.id "
    "WHERE h.id='cenH3.2_(Triticeae)'"
)
cursor.execute(query)
column_names = [column[0] for column in cursor.description]
pd.DataFrame(cursor.fetchall(), columns=column_names)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cenH3.2_(Triticeae),variant,Triticeae(?),147389,236,cenH3_(Triticeae),236,cenH3.2_(Triticeae) is a centromeric histone H...,This group of variants described in 11 diploid...,,...,,,,,,,,,,


In [40]:
# Commit the cenH3.2_(Triticeae) description row and its link from the histone table.
conn.commit()

# Add publication for new nodes [yuan_characterization_2015]

## cenH3_(Vigna_unguiculata)

[ishii_unequal_2020]

In [42]:
# Join histones with their publication links and show the Vigna unguiculata node.
query = (
    "SELECT * FROM histone h "
    "LEFT JOIN histone_has_publication hp ON h.id = hp.histone_id "
)
cursor.execute(query)
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(cursor.fetchall(), columns=column_names)
df[df["id"].isin(["cenH3_(Vigna_unguiculata)"])]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
20,cenH3_(Vigna_unguiculata),variant,Vigna unguiculata,3917,,cenH3_(Plants),,


In [43]:
# Check whether the publication is already present in the database.
pid = "ishii_unequal_2020"
# Bind the id as a parameter instead of formatting it into the SQL text.
query = "SELECT * FROM publication WHERE id=%s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,ishii_unequal_2020,Unequal contribution of two paralogous CENH3 v...,10.1038/s42003-020-01507-x,,2020


In [44]:
# Link cenH3_(Vigna_unguiculata) to the publication id held in `pid`.
histone_publication_link = ("cenH3_(Vigna_unguiculata)", pid)
cursor.execute(add_histone_has_publication, histone_publication_link)

In [46]:
# Confirm that cenH3_(Vigna_unguiculata) now has a publication link.
query = (
    "SELECT * FROM histone h "
    "LEFT JOIN histone_has_publication hp ON h.id = hp.histone_id "
)
cursor.execute(query)
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(cursor.fetchall(), columns=column_names)
df[df["id"].isin(["cenH3_(Vigna_unguiculata)"])]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
20,cenH3_(Vigna_unguiculata),variant,Vigna unguiculata,3917,,cenH3_(Plants),cenH3_(Vigna_unguiculata),ishii_unequal_2020


In [47]:
# Commit the publication link added for cenH3_(Vigna_unguiculata).
conn.commit()

## cenH3_(Triticeae), cenH3.1_(Triticeae) and cenH3.2_(Triticeae)

[yuan_characterization_2015]

In [48]:
# Histone ids of the Triticeae parent node and its two children; reused by the cells below.
variant_names = ["cenH3_(Triticeae)", "cenH3.1_(Triticeae)", "cenH3.2_(Triticeae)"]

In [49]:
# Join histones with their publication links and show the Triticeae nodes.
query = (
    "SELECT * FROM histone h "
    "LEFT JOIN histone_has_publication hp ON h.id = hp.histone_id "
)
cursor.execute(query)
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(cursor.fetchall(), columns=column_names)
df[df["id"].isin(variant_names)]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
5,cenH3.1_(Triticeae),variant,Triticeae(?),147389,235.0,cenH3_(Triticeae),,
7,cenH3.2_(Triticeae),variant,Triticeae(?),147389,236.0,cenH3_(Triticeae),,
19,cenH3_(Triticeae),variant,Triticeae(?),147389,234.0,cenH3_(Plants),,


In [50]:
# Check whether the publication is already present in the database.
pid = "yuan_characterization_2015"
# Bind the id as a parameter instead of formatting it into the SQL text.
query = "SELECT * FROM publication WHERE id=%s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year


In [51]:
# Insert the publication record under the id held in `pid`.
data_publication = dict(
    id=pid,
    title="Characterization of two CENH3 genes and their roles in wheat evolution",
    doi="10.1111/nph.13235",
    author=None,
    year="2015",
)
cursor.execute(add_publication, data_publication)

In [52]:
# Link every Triticeae node in variant_names to the publication id in `pid`.
for vn in variant_names:
    histone_publication_link = (vn, pid)
    cursor.execute(add_histone_has_publication, histone_publication_link)

In [53]:
# Confirm that all Triticeae nodes now have a publication link.
query = (
    "SELECT * FROM histone h "
    "LEFT JOIN histone_has_publication hp ON h.id = hp.histone_id "
)
cursor.execute(query)
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(cursor.fetchall(), columns=column_names)
df[df["id"].isin(variant_names)]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
5,cenH3.1_(Triticeae),variant,Triticeae(?),147389,235.0,cenH3_(Triticeae),cenH3.1_(Triticeae),yuan_characterization_2015
7,cenH3.2_(Triticeae),variant,Triticeae(?),147389,236.0,cenH3_(Triticeae),cenH3.2_(Triticeae),yuan_characterization_2015
19,cenH3_(Triticeae),variant,Triticeae(?),147389,234.0,cenH3_(Plants),cenH3_(Triticeae),yuan_characterization_2015


In [54]:
# Commit the new publication record and the publication links of the Triticeae nodes.
conn.commit()

# Correct taxonomy description for cenH3_(Triticeae), cenH3.1_(Triticeae) and cenH3.2_(Triticeae)

In [59]:
# Show the Triticeae nodes with their joined description rows.
query = (
    "SELECT * FROM histone h "
    "LEFT JOIN histone_description hd ON h.description = hd.id "
)
cursor.execute(query)
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(cursor.fetchall(), columns=column_names)
# Select by position: the first column is the histone id (the joined result
# appears to carry a second "id" column from histone_description).
df[df.iloc[:, 0].isin(variant_names)]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
5,cenH3.1_(Triticeae),variant,Triticeae(?),147389,235.0,cenH3_(Triticeae),235.0,cenH3.1_(Triticeae) is a centromere-specific h...,This group of variants described in 11 diploid...,,...,,,,,,,,,,
7,cenH3.2_(Triticeae),variant,Triticeae(?),147389,236.0,cenH3_(Triticeae),236.0,cenH3.2_(Triticeae) is a centromeric histone H...,This group of variants described in 11 diploid...,,...,,,,,,,,,,
14,cenH3_(Triticeae),variant,Triticeae(?),147389,234.0,cenH3_(Plants),234.0,cenH3_(Triticeae) is a centromere-specific his...,This group of variants described in 11 diploid...,,...,,,,,,,,,,


In [87]:
# Replace the taxonomy text of the three Triticeae description rows.
taxonomy_desc = "This group of variants described in 11 diploid wheat species (Triticum urartu L., Triticum monococcum L. and Aegilops L.), 15 tetraploid (Triticum turgidum L. and Triticum timopheevii L.) and one hexaploid species (Triticum aestivum L.) [yuan_characterization_2015]."
# The text is bound as a parameter: formatting it between SQL quotes breaks
# the statement as soon as the description contains an apostrophe.
query = "UPDATE histone_description SET taxonomy=%s WHERE id=%s"
# NOTE(review): ids 234-236 are the description rows shown for variant_names in
# the join output above; they are specific to this database state.
for desc_id in range(234, 237):
    cursor.execute(query, (taxonomy_desc, desc_id))
    # Show the statement that was actually sent to the server.
    print(cursor.statement)

UPDATE histone_description SET taxonomy='This group of variants described in 11 diploid wheat species (Triticum urartu L., Triticum monococcum L. and Aegilops L.), 15 tetraploid (Triticum turgidum L. and Triticum timopheevii L.) and one hexaploid species (Triticum aestivum L.) [yuan_characterization_2015].' WHERE id=234
UPDATE histone_description SET taxonomy='This group of variants described in 11 diploid wheat species (Triticum urartu L., Triticum monococcum L. and Aegilops L.), 15 tetraploid (Triticum turgidum L. and Triticum timopheevii L.) and one hexaploid species (Triticum aestivum L.) [yuan_characterization_2015].' WHERE id=235
UPDATE histone_description SET taxonomy='This group of variants described in 11 diploid wheat species (Triticum urartu L., Triticum monococcum L. and Aegilops L.), 15 tetraploid (Triticum turgidum L. and Triticum timopheevii L.) and one hexaploid species (Triticum aestivum L.) [yuan_characterization_2015].' WHERE id=236


In [88]:
# Read back the taxonomy text of the first matching Triticeae row.
query = (
    "SELECT * FROM histone h "
    "LEFT JOIN histone_description hd ON h.description = hd.id "
)
cursor.execute(query)
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(cursor.fetchall(), columns=column_names)
triticeae_rows = df[df.iloc[:, 0].isin(variant_names)]
triticeae_rows["taxonomy"].values[0]

'This group of variants described in 11 diploid wheat species (Triticum urartu L., Triticum monococcum L. and Aegilops L.), 15 tetraploid (Triticum turgidum L. and Triticum timopheevii L.) and one hexaploid species (Triticum aestivum L.) [yuan_characterization_2015].'

In [89]:
# Commit the corrected taxonomy text of description rows 234-236.
conn.commit()

# Add sequences for cenH3.1_(Triticeae) and cenH3.2_(Triticeae)

Using RACE we obtained the full gene sequences of aCENH3 and bCENH3 in diploid species TMU38 (T. urartu, AA), AE739 (A. speltoides, SS) and TQ27 (A. tauschii, DD). <...> Multiple amino acid sequence alignment was conducted among <span style="color:green">aTuCENH3 (KM507181), bTuCENH3 (KM507184), aAsCENH3 (KM507182), bAsCENH3 (KM507185), aAtaCENH3 (KM507183), bAtaCENH3 (KM507186)</span> and other grass CENH3 proteins, including aHvCENH3 and bHvCENH3 from H. vulgare, OsCENH3 from rice, ZmCENH3 from maize, and SbCENH3 from Sorghum bicolor.

Список изученных в статье видов пшеницы (*Triticum*) и их диплоидных предков (*Aegilops*) с указанием геномов:

---

### **1. Диплоидные виды (2n=14)**
| Вид (латинское название)          | Геном | Роль в эволюции пшеницы          |
|-----------------------------------|-------|----------------------------------|
| *Triticum urartu*                 | AA    | Дикий предок **A-генома** культурной пшеницы |
| *Triticum monococcum*             | AA    | Однозернянка (примитивная пшеница) |
| *Aegilops speltoides*             | SS    | Ближайший аналог **B-генома** (не прямой предок) |
| *Aegilops tauschii* (син. *Ae. squarrosa*)| DD | Прямой предок **D-генома** гексаплоидной пшеницы |

---

### **2. Тетраплоидные виды (2n=28)**
| Вид (латинское название)          | Еще название | Геном | Категория        |
|-----------------------------------|--------------|-------|------------------|
| *Triticum turgidum* ssp. *dicoccoides* | TTD92 | AABB | **Дикая** эммерная пшеница |
| *Triticum turgidum* ssp. *durum*  | TTR04 | AABB | **Культурная** твердая пшеница (макаронная) |
| *Triticum turgidum* ssp. *polonicum* | Polonicum | AABB | Польская пшеница |
| *Triticum turgidum* ssp. *turgidum* | Turgidum1 | AABB | Голая пшеница    |
| *Triticum timopheevii*            | Отсутствует | AAGG | Дикий/культурный вид (отдельная ветвь) |

---

### **3. Гексаплоидные виды (2n=42)**
| Вид (латинское название)          | Геном | Примечание       |
|-----------------------------------|-------|------------------|
| *Triticum aestivum* (сорт Chinese Spring) | AABBDD | Мягкая пшеница (хлебная) |

---

### **4. Ключевые моменты**
1. **Геномные обозначения**:  
   - **A, B, D** — основные геномы пшеницы.  
   - **S (Ae. speltoides)** и **G (T. timopheevii)** — альтернативные обозначения для близких геномов.  

2. **Происхождение полиплоидов**:  
   - **AABB**: Гибридизация *T. urartu* (AA) × *Ae. speltoides* (SS).  
   - **AABBDD**: Гибридизация *T. turgidum* (AABB) × *Ae. tauschii* (DD).  

3. **Почему *Ae. speltoides* не считается "пшеницей"?**  
   - Принадлежит к роду *Aegilops*, хотя и участвовал в образовании B-генома.  


**В статье приведены данные о последовательностях только для диплоидных видов.**

## Get protein IDs

In [73]:
# Fetch the GenBank records KM507181..KM507186 and collect, per accession,
# the record id, description, organism and the first protein id of its CDS.
# Records whose organism is neither Triticum nor Aegilops go to other_records.
records_data = {}
other_records = {}
for accession_id in [f"KM50718{i}" for i in range(1, 7)]:
    # Accession of the nucleotide record to look up
    print(f"### Search for {accession_id} ...")
    # Download the record in GenBank format
    with Entrez.efetch(
        db="nucleotide", id=accession_id, rettype="gb", retmode="text"
    ) as handle:
        record = SeqIO.read(handle, "genbank")
    organism = record.annotations["organism"]
    if not organism.startswith(("Triticum", "Aegilops")):
        other_records[accession_id] = {
            "ID": record.id,
            "Description": record.description,
            "Organism": organism,
        }
        continue
    # Print basic information about the record
    print(f"ID: {record.id}")
    print(f"Описание: {record.description}")
    # Collect protein identifiers from the coding-sequence features only
    protein_ids = []
    for feature in record.features:
        if feature.type != "CDS":
            continue
        qualifiers = feature.qualifiers
        if "protein_id" in qualifiers:
            protein_ids.append(qualifiers["protein_id"][0])
        elif "db_xref" in qualifiers:
            # Fall back to cross-references (GI / UniProt); other databases
            # can be added to the prefix tuple.
            protein_ids.extend(
                xref
                for xref in qualifiers["db_xref"]
                if xref.startswith(("GI:", "UniProt:"))
            )
    # Print the result. The loop variable is deliberately not called `pid`:
    # that name holds the publication id used by the cells above.
    print("Найденные идентификаторы белков:", end=" ")
    print(*protein_ids, sep="\n")
    records_data[accession_id] = {
        "ID": record.id,
        "Description": record.description,
        "Organism": organism,
        # None instead of an IndexError when the record has no protein id.
        "Protein ID": protein_ids[0] if protein_ids else None,
    }

### Search for KM507181 ...
ID: KM507181.1
Описание: Triticum urartu centromeric histone H3 (alphaCENH3) mRNA, partial cds
Найденные идентификаторы белков: AKM28567.1
### Search for KM507182 ...
ID: KM507182.1
Описание: Aegilops speltoides centromeric histone H3 (alphaCENH3) mRNA, partial cds
Найденные идентификаторы белков: AKM28568.1
### Search for KM507183 ...
ID: KM507183.1
Описание: Aegilops tauschii centromeric histone H3 (alphaCENH3) mRNA, partial cds
Найденные идентификаторы белков: AKM28569.1
### Search for KM507184 ...
ID: KM507184.1
Описание: Triticum urartu centromeric histone H3 (betaCENH3) mRNA, partial cds
Найденные идентификаторы белков: AKM28570.1
### Search for KM507185 ...
ID: KM507185.1
Описание: Aegilops speltoides centromeric histone H3 (betaCENH3) mRNA, partial cds
Найденные идентификаторы белков: AKM28571.1
### Search for KM507186 ...
ID: KM507186.1
Описание: Aegilops tauschii centromeric histone H3 (betaCENH3) mRNA, partial cds
Найденные идентификаторы белков: 

In [74]:
# Display the metadata collected for each Triticum/Aegilops nucleotide record.
records_data

{'KM507181': {'ID': 'KM507181.1',
  'Description': 'Triticum urartu centromeric histone H3 (alphaCENH3) mRNA, partial cds',
  'Organism': 'Triticum urartu',
  'Protein ID': 'AKM28567.1'},
 'KM507182': {'ID': 'KM507182.1',
  'Description': 'Aegilops speltoides centromeric histone H3 (alphaCENH3) mRNA, partial cds',
  'Organism': 'Aegilops speltoides',
  'Protein ID': 'AKM28568.1'},
 'KM507183': {'ID': 'KM507183.1',
  'Description': 'Aegilops tauschii centromeric histone H3 (alphaCENH3) mRNA, partial cds',
  'Organism': 'Aegilops tauschii',
  'Protein ID': 'AKM28569.1'},
 'KM507184': {'ID': 'KM507184.1',
  'Description': 'Triticum urartu centromeric histone H3 (betaCENH3) mRNA, partial cds',
  'Organism': 'Triticum urartu',
  'Protein ID': 'AKM28570.1'},
 'KM507185': {'ID': 'KM507185.1',
  'Description': 'Aegilops speltoides centromeric histone H3 (betaCENH3) mRNA, partial cds',
  'Organism': 'Aegilops speltoides',
  'Protein ID': 'AKM28571.1'},
 'KM507186': {'ID': 'KM507186.1',
  'Descr

In [75]:
# Display the records whose organism is neither Triticum nor Aegilops.
other_records

{}

## Add sequences to curatedDB

In [76]:
# Load the whole `sequence` table and show only the rows whose accession
# matches one of the protein IDs collected in records_data.
query = "SELECT * FROM sequence "
cursor.execute(query)
df = pd.DataFrame(
    cursor.fetchall(),
    columns=[column[0] for column in cursor.description],
)
df.loc[df["accession"].isin([info["Protein ID"] for info in records_data.values()])]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration


In [77]:
# Fetch one protein record (AKM28567.1) in GenBank format and display its
# amino-acid sequence as a plain string.
with Entrez.efetch(db="protein", id="AKM28567.1", rettype="gb", retmode="text") as handle:
    record = SeqIO.read(handle, "genbank")
str(record.seq)

'MARTKHPAVRKTKAPPKKQLGPRPAQRRQETDGAGTSATPRRAGRAAAPGGAEGATGQPKQRKPHRFRPGTVALREIRRYQKSVDFLIPFAPFVRLIKEVTDFFCPEISRWTPQALVAIQEAAEYHLVDVFERANHCAIHAKRVTVMQKDIQLARRIGGRRLW'

In [81]:
# Build one row dict for the `sequence` table per collected protein ID.
# Maps the isoform label found in the GenBank description to the variant name.
variant_dict = {
    "alphaCENH3": "cenH3.1_(Triticeae)",
    "betaCENH3": "cenH3.2_(Triticeae)",
}
# Row keys in the order they are reported; every value defaults to None.
sequence_columns = (
    "accession",
    "variant",
    "gi",
    "ncbi_gene_id",
    "hgnc_gene_name",
    "taxonomy_id",
    "organism",
    "phylum",
    "class",
    "taxonomy_group",
    "info",
    "sequence",
    "variant_under_consideration",
)
data_sequence_list = []
for nucleotide_info in records_data.values():
    # Download the protein record in GenBank format
    with Entrez.efetch(
        db="protein",
        id=nucleotide_info["Protein ID"],
        rettype="gb",
        retmode="text",
    ) as handle:
        record = SeqIO.read(handle, "genbank")
    # The isoform label is the text between the first "(" and the next ")"
    after_open_paren = nucleotide_info["Description"].split("(")[1]
    isoform = after_open_paren.split(")")[0]
    taxonomy_data = get_taxonomy_data(record)
    # Start from an all-None row, fill in the known fields, then let the
    # taxonomy lookup overwrite/add its own keys.
    data_sequence = {
        **dict.fromkeys(sequence_columns),
        "accession": record.id,
        "variant": variant_dict[isoform],
        "sequence": str(record.seq),
        **taxonomy_data,
    }
    data_sequence_list.append(data_sequence)
# Print the last prepared row with value types as a quick sanity check
for field, value in data_sequence_list[-1].items():
    print(field, value, type(value))

Fetched taxid from NCBI 4572
Fetched taxid from NCBI 4573
Fetched taxid from NCBI 37682
Fetched taxid from NCBI 4572
Fetched taxid from NCBI 4573
Fetched taxid from NCBI 37682
accession AKM28572.1 <class 'str'>
variant cenH3.2_(Triticeae) <class 'str'>
gi None <class 'NoneType'>
ncbi_gene_id None <class 'NoneType'>
hgnc_gene_name None <class 'NoneType'>
taxonomy_id 37682 <class 'int'>
organism Aegilops tauschii <class 'str'>
phylum Streptophyta <class 'str'>
class Magnoliopsida <class 'str'>
taxonomy_group None <class 'NoneType'>
info None <class 'NoneType'>
sequence MGRTKHAVAATATTTTTETKKRLRFELSPRWRPPPPMRQVPPEPQPEKKKKRAYRFRPGTVALREVRKYQKSTGPLIPFAPFVRLVKEITNDLTKGELNHWTPQALFALQEAAEYHIVDVFEKANLCAIHAKRVTIMQKDIQLARRIGGRRLW <class 'str'>
variant_under_consideration None <class 'NoneType'>


In [85]:
# Run the add_sequence statement once for every prepared row dict.
for sequence_row in data_sequence_list:
    cursor.execute(add_sequence, sequence_row)

In [86]:
# Re-read the `sequence` table and show the rows for the collected protein
# IDs, to confirm the rows executed above are visible on this connection.
query = "SELECT * FROM sequence "
cursor.execute(query)
df = pd.DataFrame(
    cursor.fetchall(),
    columns=[column[0] for column in cursor.description],
)
df.loc[df["accession"].isin([info["Protein ID"] for info in records_data.values()])]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
327,AKM28567.1,cenH3.1_(Triticeae),,,,4572.0,Triticum urartu,Streptophyta,Magnoliopsida,,,MARTKHPAVRKTKAPPKKQLGPRPAQRRQETDGAGTSATPRRAGRA...,
328,AKM28568.1,cenH3.1_(Triticeae),,,,4573.0,Aegilops speltoides,Streptophyta,Magnoliopsida,,,MGRTKHPAVRKTKAPPKKQLGPRPAQRRQETDGAGTSATPRRAGRA...,
329,AKM28569.1,cenH3.1_(Triticeae),,,,37682.0,Aegilops tauschii,Streptophyta,Magnoliopsida,,,MARTKHPAVRKTKAPPKKQLGPRPAQRRQETGATGQPKQRKPHRFR...,
330,AKM28570.1,cenH3.2_(Triticeae),,,,4572.0,Triticum urartu,Streptophyta,Magnoliopsida,,,MGRTKHAVAATAATTETKKRLRFELSPRWRPPPALRQVPPEPQPEK...,
331,AKM28571.1,cenH3.2_(Triticeae),,,,4573.0,Aegilops speltoides,Streptophyta,Magnoliopsida,,,MGRTKHAVAATAATTETKKRLRFELSPRWRPPPPMRQVPPEPQPQP...,
332,AKM28572.1,cenH3.2_(Triticeae),,,,37682.0,Aegilops tauschii,Streptophyta,Magnoliopsida,,,MGRTKHAVAATATTTTTETKKRLRFELSPRWRPPPPMRQVPPEPQP...,


In [91]:
# Commit the open transaction so the rows written with add_sequence are
# saved to the database.
conn.commit()

## Add sequence publication

In [92]:
# Look up the source publication in the `publication` table.
# The id is bound as a query parameter instead of being formatted into the
# SQL text, so quoting/escaping is handled by the connector.
pid = "yuan_characterization_2015"
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,yuan_characterization_2015,Characterization of two CENH3 genes and their ...,10.1111/nph.13235,,2015


In [93]:
# Run add_sequence_has_publication for every collected protein accession,
# pairing it with the publication id `pid`.
for record_info in records_data.values():
    cursor.execute(add_sequence_has_publication, (record_info["Protein ID"], pid))

In [94]:
# Join sequences with their publication links and show the rows for the
# collected protein IDs, to confirm each one is linked to the publication.
query = (
    "SELECT * FROM sequence s "
    "LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
df = pd.DataFrame(
    cursor.fetchall(),
    columns=[column[0] for column in cursor.description],
)
df.loc[df["accession"].isin([info["Protein ID"] for info in records_data.values()])]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
404,AKM28567.1,cenH3.1_(Triticeae),,,,4572.0,Triticum urartu,Streptophyta,Magnoliopsida,,,MARTKHPAVRKTKAPPKKQLGPRPAQRRQETDGAGTSATPRRAGRA...,,AKM28567.1,yuan_characterization_2015
405,AKM28568.1,cenH3.1_(Triticeae),,,,4573.0,Aegilops speltoides,Streptophyta,Magnoliopsida,,,MGRTKHPAVRKTKAPPKKQLGPRPAQRRQETDGAGTSATPRRAGRA...,,AKM28568.1,yuan_characterization_2015
406,AKM28569.1,cenH3.1_(Triticeae),,,,37682.0,Aegilops tauschii,Streptophyta,Magnoliopsida,,,MARTKHPAVRKTKAPPKKQLGPRPAQRRQETGATGQPKQRKPHRFR...,,AKM28569.1,yuan_characterization_2015
407,AKM28570.1,cenH3.2_(Triticeae),,,,4572.0,Triticum urartu,Streptophyta,Magnoliopsida,,,MGRTKHAVAATAATTETKKRLRFELSPRWRPPPALRQVPPEPQPEK...,,AKM28570.1,yuan_characterization_2015
408,AKM28571.1,cenH3.2_(Triticeae),,,,4573.0,Aegilops speltoides,Streptophyta,Magnoliopsida,,,MGRTKHAVAATAATTETKKRLRFELSPRWRPPPPMRQVPPEPQPQP...,,AKM28571.1,yuan_characterization_2015
409,AKM28572.1,cenH3.2_(Triticeae),,,,37682.0,Aegilops tauschii,Streptophyta,Magnoliopsida,,,MGRTKHAVAATATTTTTETKKRLRFELSPRWRPPPPMRQVPPEPQP...,,AKM28572.1,yuan_characterization_2015


In [95]:
# Commit the open transaction so the rows written with
# add_sequence_has_publication are saved to the database.
conn.commit()

# Close connections

In [96]:
# Tear down in reverse order of creation: cursor, DB connection, SSH tunnel.
# The nested try/finally guarantees that a failure while closing the cursor
# or the connection does not leave the connection or the tunnel open.
try:
    cursor.close()
finally:
    try:
        conn.close()
    finally:
        tunnel.stop()