In [1]:
import io

import pandas as pd
from Bio import Entrez, SeqIO
from mysql.connector import connection
from sshtunnel import SSHTunnelForwarder

# Provide your email address (required for using Entrez)
Entrez.email = "your.email@example.com"

In [2]:
# Load the SSH / database connection settings from a KEY=VALUE text file.
# Blank lines and lines starting with "#" are ignored.
config = {}

with open("db_curated_server_info.txt", "r") as file:
    for line_number, raw_line in enumerate(file, start=1):
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue
        key, separator, value = line.partition("=")
        if not separator:
            # Report only the line number: this file holds credentials, so the
            # offending text must not end up in the notebook output.
            raise ValueError(
                f"db_curated_server_info.txt, line {line_number}: expected KEY=VALUE"
            )
        # Strip both sides so that "key = value" is read like "key=value".
        config[key.strip()] = value.strip()

# NOTE(review): the key spellings "srever_port" and "db_adress" presumably
# mirror the settings file — verify against the file before renaming them.
server_name = config.get("server_name")
srever_port = int(config.get("srever_port"))
ssh_password = config.get("ssh_password")
ssh_username = config.get("ssh_username")
db_adress = config.get("db_adress")
db_port = int(config.get("db_port"))

In [3]:
# Open an SSH tunnel: the SSH server is the gateway, the database address is
# the remote end that gets forwarded to a local port.
ssh_gateway = (server_name, srever_port)
database_endpoint = (db_adress, db_port)

tunnel = SSHTunnelForwarder(
    ssh_gateway,
    ssh_username=ssh_username,
    ssh_password=ssh_password,
    remote_bind_address=database_endpoint,
)
tunnel.start()
# The MySQL connection below is made against this locally bound port.
print(tunnel.local_bind_port)

36671


In [4]:
# Connect to MySQL through the local end of the SSH tunnel.
# NOTE(review): user / password / database are literal strings here and look
# like placeholders — confirm, and prefer reading real credentials from the
# settings file instead of committing them to the notebook.
conn = connection.MySQLConnection(
    user="db_user",
    password="db_password",
    host="localhost",
    port=tunnel.local_bind_port,
    database="db_name",
)
# One cursor is shared by every statement executed in this notebook.
cursor = conn.cursor()

In [5]:
# Sanity check of the connection: list the tables of the selected database.
query = "SHOW TABLES;"
cursor.execute(query)
cursor.fetchall()

[('alternative_name',),
 ('histone',),
 ('histone_description',),
 ('histone_has_publication',),
 ('publication',),
 ('sequence',),
 ('sequence_has_publication',)]

In [55]:
# INSERT templates used throughout the notebook.
# Templates with %(name)s placeholders are executed with a dict of values;
# templates with %s placeholders are executed with a tuple of values.

# One histone entry; executed with a dict keyed by the column names.
add_histone = (
    "INSERT INTO histone "
    "(id, level, taxonomic_span, taxonomic_span_id, description, parent) "
    "VALUES (%(id)s, %(level)s, %(taxonomic_span)s, %(taxonomic_span_id)s, %(description)s, %(parent)s)"
)
# One description row; takes 14 positional values in the column order below.
add_histone_description = (
    "INSERT INTO histone_description "
    "(summary, taxonomy, genes, evolution, expression, knock_out, function, sequence, localization, deposition, structure, interactions, disease, caveats) "
    "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
)
# One publication; executed with a dict keyed by the column names.
add_publication = (
    "INSERT INTO publication "
    "(id, title, doi, author, year) "
    "VALUES (%(id)s, %(title)s, %(doi)s, %(author)s, %(year)s)"
)
# One protein sequence; executed with a dict keyed by the column names.
add_sequence = (
    "INSERT INTO sequence "
    "(accession, variant, gi, ncbi_gene_id, hgnc_gene_name, taxonomy_id, organism, phylum, class, taxonomy_group, info, sequence, variant_under_consideration) "
    "VALUES (%(accession)s, %(variant)s, %(gi)s, %(ncbi_gene_id)s, %(hgnc_gene_name)s, %(taxonomy_id)s, %(organism)s, %(phylum)s, %(class)s, %(taxonomy_group)s, %(info)s, %(sequence)s, %(variant_under_consideration)s)"
)
# Link row: (sequence accession, publication id).
add_sequence_has_publication = (
    "INSERT INTO sequence_has_publication "
    "(sequence_accession, publication_id) "
    "VALUES (%s, %s)"
)
# Disabled: template for alternative_name rows (not used in this notebook).
# add_alternate_names = (
#     "INSERT INTO alternative_name "
#     "(name, taxonomy, gene, splice, histone) "
#     "VALUES (%(name)s, %(taxonomy)s, %(gene)s, %(splice)s, %(histone)s)"
# )
# Link row: (histone id, publication id).
add_histone_has_publication = (
    "INSERT INTO histone_has_publication "
    "(histone_id, publication_id) "
    "VALUES (%s, %s)"
)

In [52]:
def get_taxonomy_data(record, max_attempts=10):
    """Collect organism, NCBI taxonomy ID, phylum and class for a sequence record.

    Args:
        record: Parsed sequence record exposing ``annotations["organism"]`` and
            ``features``. The taxonomy ID is read from the ``db_xref``
            qualifiers of the first feature (entries shaped ``taxon:<id>``).
        max_attempts: Number of times the NCBI taxonomy lookup is tried before
            giving up on phylum and class.

    Returns:
        dict with the keys ``organism``, ``taxonomy_id`` (int; 1 when no taxon
        cross-reference could be read), ``phylum`` and ``class`` (str, or None
        when the lineage lookup failed or the lineage lacks that rank).
    """
    taxonomy_data = {"organism": record.annotations["organism"]}

    # --- taxonomy ID from the first feature's db_xref qualifiers -------------
    try:
        db_xrefs = record.features[0].qualifiers["db_xref"]
    except (AttributeError, IndexError, KeyError, TypeError):
        db_xrefs = []

    taxonomy_id = None
    for xref in db_xrefs:
        prefix, _, value = xref.partition(":")
        if prefix != "taxon":
            continue
        try:
            taxonomy_id = int(value)
        except ValueError:
            # Malformed taxon entry: keep looking at the remaining xrefs.
            continue
        print(f"Fetched taxid from NCBI {value}")

    if taxonomy_id is None:
        # Covers both "lookup raised" and "no taxon:<id> entry present", so the
        # key is always set before the lineage lookup below reads it.
        print("!!!!!!Unable to get TAXID for this record setting it to 1")
        taxonomy_id = 1  # unable to identify
    taxonomy_data["taxonomy_id"] = taxonomy_id

    # --- phylum / class from the NCBI taxonomy lineage -----------------------
    lineage = {}
    for attempt in range(max_attempts):
        try:
            with Entrez.efetch(
                id=taxonomy_id, db="taxonomy", retmode="xml"
            ) as handle:
                tax_data = Entrez.read(handle)
            lineage = {
                d["Rank"]: d["ScientificName"]
                for d in tax_data[0]["LineageEx"]
                if d["Rank"] in ("class", "phylum")
            }
            break
        except Exception as error:  # not a bare except: Ctrl-C must still work
            print(
                "Unexpected error: {}: {}, Retrying, attempt {}".format(
                    type(error).__name__, error, attempt
                )
            )
    else:
        # Loop ended without `break`: every attempt failed.
        print(
            f"FATAL ERROR could not get class and phylum from NCBI after {max_attempts} attempts for taxid:{taxonomy_id}. Will add None for class and phylum!"
        )

    # Convert parser-specific string objects to plain str; missing ranks stay None.
    for rank in ("phylum", "class"):
        name = lineage.get(rank)
        taxonomy_data[rank] = None if name is None else str(name)
    return taxonomy_data

# Add cenH3.1 and cenH3.2 of cowpea (Vigna unguiculata)

Source publication: `ishii_unequal_2020` (doi:10.1038/s42003-020-01507-x)

In [8]:
# Register the two cowpea cenH3 variants under the parent cenH3_(Plants).
# The two rows differ only in their id, so they are generated from one template.
data_histone = [
    {
        "id": variant_id,
        "level": "variant",
        "taxonomic_span": "Vigna unguiculata",
        "taxonomic_span_id": "3917",
        "description": None,
        "parent": "cenH3_(Plants)",
    }
    for variant_id in ("cenH3.1", "cenH3.2")
]
for histone_row in data_histone:
    cursor.execute(add_histone, histone_row)

In [9]:
# Read the histone table back and display only the two inserted variants.
query = "SELECT * FROM histone"
cursor.execute(query)
histone_columns = [column[0] for column in cursor.description]
histone_df = pd.DataFrame(cursor.fetchall(), columns=histone_columns)
inserted_ids = ["cenH3.1", "cenH3.2"]
histone_df[histone_df["id"].isin(inserted_ids)]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
5,cenH3.1,variant,Vigna unguiculata,3917,,cenH3_(Plants)
6,cenH3.2,variant,Vigna unguiculata,3917,,cenH3_(Plants)


In [10]:
# Commit the pending transaction so the inserted histone rows are persisted.
conn.commit()

# Add description for cenH3.1_(Vigna_unguiculata)

In [11]:
# Rename the inserted variants to their species-qualified IDs.
# One parameterized statement replaces the two hand-written UPDATE strings.
rename_histone = "UPDATE histone SET id=%s WHERE id=%s"
for old_id, new_id in (
    ("cenH3.1", "cenH3.1_(Vigna_unguiculata)"),
    ("cenH3.2", "cenH3.2_(Vigna_unguiculata)"),
):
    cursor.execute(rename_histone, (new_id, old_id))

# Read the table back and display the renamed rows.
query = "SELECT * FROM histone"
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
histone_df[
    histone_df["id"].isin(
        ["cenH3.1_(Vigna_unguiculata)", "cenH3.2_(Vigna_unguiculata)"]
    )
]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
5,cenH3.1_(Vigna_unguiculata),variant,Vigna unguiculata,3917,,cenH3_(Plants)
6,cenH3.2_(Vigna_unguiculata),variant,Vigna unguiculata,3917,,cenH3_(Plants)


In [12]:
histone_description_summary = "cenH3.1_(Vigna_unguiculata) is a centromere-specific histone H3 variant encoded by CENH3.1 (one of two paralogous genes) in cowpea (Vigna unguiculata). Unlike CENH3.2, it is essential for normal plant development and fertility, as its knockout causes growth defects and sterility. At the protein level, cenH3.1 shares 91% amino acid identity with its counterpart. CENH3.1 is dominantly expressed in most tissues and persists in generative cells of pollen, whereas CENH3.2 is dispensable and shows tissue-specific removal [ishii_unequal_2020]."
# Only `summary` is filled in; the other 13 description columns are left NULL.
data_histone_description = (histone_description_summary,) + (None,) * 13
cursor.execute(add_histone_description, data_histone_description)

# Point the histone row at the description row that was just inserted
# (its id is reported by cursor.lastrowid). Values are bound as parameters
# rather than interpolated into the SQL text.
histone_description_id = cursor.lastrowid
query = "UPDATE histone SET description=%s WHERE id=%s"
cursor.execute(query, (histone_description_id, "cenH3.1_(Vigna_unguiculata)"))

In [13]:
# Show the cenH3.1 histone row joined with its description row.
# `SELECT *` over both tables yields two columns named "id" (h.id and hd.id).
query = " ".join(
    [
        "SELECT * FROM histone h LEFT JOIN histone_description hd",
        "ON h.description = hd.id",
        "WHERE h.id='cenH3.1_(Vigna_unguiculata)'",
    ]
)
cursor.execute(query)
joined_columns = [column[0] for column in cursor.description]
pd.DataFrame(cursor.fetchall(), columns=joined_columns)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cenH3.1_(Vigna_unguiculata),variant,Vigna unguiculata,3917,231,cenH3_(Plants),231,cenH3.1_(Vigna_unguiculata) is a centromere-sp...,,,...,,,,,,,,,,


In [14]:
# Commit the cenH3.1 description row and the histone row that now links to it.
conn.commit()

# Add description for cenH3.2_(Vigna_unguiculata)

In [15]:
histone_description_summary = "cenH3.2_(Vigna_unguiculata) is a centromere-specific histone H3 variant encoded by CENH3.2, one of two paralogous genes in cowpea (Vigna unguiculata). Unlike CENH3.1, it is dispensable for normal plant development and fertility, as its knockout does not cause visible defects. At the protein level, cenH3.2 shares 91% amino acid identity with cenH3.1, with key differences concentrated in the N-terminal domain. While cenH3.2 is incorporated into centromeres, its expression is lower and tissue-specific, and it is selectively removed from generative cells during pollen maturation. These findings suggest that CENH3.2 may be undergoing pseudogenization rather than functional specialization [ishii_unequal_2020]."
# Only `summary` is filled in; the other 13 description columns are left NULL.
data_histone_description = (histone_description_summary,) + (None,) * 13
cursor.execute(add_histone_description, data_histone_description)

# Point the histone row at the description row that was just inserted
# (its id is reported by cursor.lastrowid). Values are bound as parameters
# rather than interpolated into the SQL text.
histone_description_id = cursor.lastrowid
query = "UPDATE histone SET description=%s WHERE id=%s"
cursor.execute(query, (histone_description_id, "cenH3.2_(Vigna_unguiculata)"))

In [16]:
# Show the cenH3.2 histone row joined with its description row.
# `SELECT *` over both tables yields two columns named "id" (h.id and hd.id).
query = " ".join(
    [
        "SELECT * FROM histone h LEFT JOIN histone_description hd",
        "ON h.description = hd.id",
        "WHERE h.id='cenH3.2_(Vigna_unguiculata)'",
    ]
)
cursor.execute(query)
joined_columns = [column[0] for column in cursor.description]
pd.DataFrame(cursor.fetchall(), columns=joined_columns)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cenH3.2_(Vigna_unguiculata),variant,Vigna unguiculata,3917,232,cenH3_(Plants),232,cenH3.2_(Vigna_unguiculata) is a centromere-sp...,,,...,,,,,,,,,,


In [18]:
# Commit the cenH3.2 description row and the histone row that now links to it.
conn.commit()

# Add sequences of cenH3.1_(Vigna_unguiculata), cenH3.2_(Vigna_unguiculata)

## Get protein IDs

In [41]:
# Fetch the nucleotide records LC490903 .. LC490940 and collect, for every
# Vigna unguiculata record, the identifier of the protein it encodes.
# Records of other organisms are kept separately in `other_records`.
records_data = {}
other_records = {}
for accession_id in [f"LC4909{i:02d}" for i in range(3, 41)]:
    print(f"### Search for {accession_id} ...")
    # Download the record in GenBank format.
    with Entrez.efetch(
        db="nucleotide", id=accession_id, rettype="gb", retmode="text"
    ) as handle:
        record = SeqIO.read(handle, "genbank")

    record_summary = {
        "ID": record.id,
        "Description": record.description,
        "Organism": record.annotations["organism"],
    }
    if not record_summary["Organism"].startswith("Vigna unguiculata"):
        other_records[accession_id] = record_summary
        continue

    # Report the record.
    print(f"ID: {record.id}")
    print(f"Описание: {record.description}")

    # Extract protein identifiers from the coding sequences (CDS features).
    protein_ids = []
    for feature in record.features:
        if feature.type != "CDS":
            continue
        qualifiers = feature.qualifiers
        if "protein_id" in qualifiers:
            protein_ids.append(qualifiers["protein_id"][0])
        elif "db_xref" in qualifiers:
            # Fall back to cross-references (GI / UniProt); other databases
            # can be added to the tuple.
            protein_ids.extend(
                xref
                for xref in qualifiers["db_xref"]
                if xref.startswith(("GI:", "UniProt:"))
            )

    # Report what was found.
    print("Найденные идентификаторы белков:", end=" ")
    for pid in protein_ids:
        print(pid)

    if not protein_ids:
        # Previously this case crashed with IndexError on protein_ids[0].
        # Later cells index "Protein ID", so the record is skipped instead.
        print()
        print(f"!!! No protein identifier found for {accession_id}, record skipped")
        continue

    records_data[accession_id] = {**record_summary, "Protein ID": protein_ids[0]}

### Search for LC490903 ...
ID: LC490903.1
Описание: Vigna unguiculata IT86D-1010 CENH3_1 gene for centromeric histone CENH3, partial cds
Найденные идентификаторы белков: BBM60641.1
### Search for LC490904 ...
ID: LC490904.1
Описание: Vigna unguiculata IT86D-1010 CENH3_2 gene for centromeric histone CENH3, partial cds
Найденные идентификаторы белков: BBM60642.1
### Search for LC490905 ...
ID: LC490905.1
Описание: Vigna unguiculata subsp. unguiculata NI147 CENH3_1 gene for centromeric histone CENH3, partial cds
Найденные идентификаторы белков: BBM60643.1
### Search for LC490906 ...
ID: LC490906.1
Описание: Vigna unguiculata subsp. unguiculata NI147 CENH3_2 gene for centromeric histone CENH3, partial cds
Найденные идентификаторы белков: BBM60644.1
### Search for LC490907 ...
ID: LC490907.1
Описание: Vigna unguiculata subsp. sesquipedalis NI126 CENH3_1 gene for centromeric histone CENH3, partial cds
Найденные идентификаторы белков: BBM60645.1
### Search for LC490908 ...
ID: LC490908.1
Опи

In [42]:
# List the descriptions of the records that are not Vigna unguiculata.
for other_record in other_records.values():
    print(other_record["Description"])

Vigna umbellata var. umbellata NI204 CENH3 gene for centromeric histone CENH3, partial cds
Vigna angularis NI615 CENH3 gene for centromeric histone CENH3, partial cds
Vigna aconitifolia NI41 CENH3 gene for centromeric histone CENH3, partial cds
Vigna radiata NI4 CENH3 gene for centromeric histone CENH3, partial cds
Vigna mungo NI515 CENH3_1 gene for centromeric histone CENH3, partial cds
Vigna mungo NI515 CENH3_2 gene for centromeric histone CENH3, partial cds
Vigna reflexopilosa var. glabra NI532 CENH3_a gene for centromeric histone CENH3, partial cds
Vigna reflexopilosa var. glabra NI532 CENH3_b gene for centromeric histone CENH3, partial cds
Vigna reflexopilosa var. reflexopilosa NI1684 CENH3_a gene for centromeric histone CENH3, partial cds
Vigna reflexopilosa var. reflexopilosa NI1684 CENH3_b gene for centromeric histone CENH3, partial cds
Vigna trilobata NI453 CENH3 gene for centromeric histone CENH3, partial cds
Vigna vexillata NI1859 CENH3 gene for centromeric histone CENH3, pa

**Do *Vigna mungo* and *Vigna reflexopilosa* also have two isoforms?**

In [44]:
# List the descriptions of the Vigna unguiculata records that were kept.
for kept_record in records_data.values():
    print(kept_record["Description"])

Vigna unguiculata IT86D-1010 CENH3_1 gene for centromeric histone CENH3, partial cds
Vigna unguiculata IT86D-1010 CENH3_2 gene for centromeric histone CENH3, partial cds
Vigna unguiculata subsp. unguiculata NI147 CENH3_1 gene for centromeric histone CENH3, partial cds
Vigna unguiculata subsp. unguiculata NI147 CENH3_2 gene for centromeric histone CENH3, partial cds
Vigna unguiculata subsp. sesquipedalis NI126 CENH3_1 gene for centromeric histone CENH3, partial cds
Vigna unguiculata subsp. sesquipedalis NI126 CENH3_2 gene for centromeric histone CENH3, partial cds
Vigna unguiculata subsp. stenophylla NI1419 CENH3_1 gene for centromeric histone CENH3, partial cds
Vigna unguiculata subsp. stenophylla NI1419 CENH3_2 gene for centromeric histone CENH3, partial cds
Vigna unguiculata subsp. alba NI1656 CENH3_1 gene for centromeric histone CENH3, partial cds
Vigna unguiculata subsp. alba NI1656 CENH3_2 gene for centromeric histone CENH3, partial cds
Vigna unguiculata subsp. pawekiae NI1638 CEN

## Add sequences to curatedDB

In [47]:
# Before inserting: display any sequence rows that already use one of the
# collected protein accessions.
query = "SELECT * FROM sequence "
cursor.execute(query)
sequence_columns = [column[0] for column in cursor.description]
df = pd.DataFrame(cursor.fetchall(), columns=sequence_columns)
protein_accessions = [entry["Protein ID"] for entry in records_data.values()]
df[df["accession"].isin(protein_accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration


In [51]:
# Spot check: fetch one protein record (BBM60641.1) in GenBank format and
# display its amino-acid sequence as a plain string.
with Entrez.efetch(
    db="protein", id="BBM60641.1", rettype="gb", retmode="text"
) as handle:
    record = SeqIO.read(handle, "genbank")
str(record.seq)

'MARVKHTPASLKVGKKKVSRASTSTPQQSPATRSRRRAQEEEPQEEAAAAAPQTHGRKKKRSKPGTAALREIRHFQKSCKLLIPAAPFIRCVKQITHQFSTEVSRWTPEAVVALQEAAEECLVHLFEDGMLCAIHARRITLMTKDIQLARRRNRKALV'

In [53]:
# Build one `sequence` row (as a dict matching add_sequence) per collected
# protein: fetch the protein record, derive the variant from the nucleotide
# record description and fill the taxonomy columns via get_taxonomy_data.
#
# NOTE(review): the variant label is the single character following "CENH3_"
# in the nucleotide record description. In the stored rows displayed further
# down, the subsp. stenophylla pair (BBM60647.1 / BBM60648.1) looks swapped
# compared with the cenH3.1 / cenH3.2 sequence prefixes of the other
# accessions — verify before relying on these labels.
data_sequence_list = []
for meta in records_data.values():
    with Entrez.efetch(
        db="protein", id=meta["Protein ID"], rettype="gb", retmode="text"
    ) as handle:
        record = SeqIO.read(handle, "genbank")

    isoform = meta["Description"].split("CENH3_")[1][0]
    data_sequence = {
        "accession": record.id,
        "variant": f"cenH3.{isoform}_(Vigna_unguiculata)",
        "gi": None,
        "ncbi_gene_id": None,
        "hgnc_gene_name": None,
        "taxonomy_id": None,
        "organism": None,
        "phylum": None,
        "class": None,
        "taxonomy_group": None,
        "info": None,
        "sequence": str(record.seq),
        "variant_under_consideration": None,
    }
    # Overwrites organism / taxonomy_id / phylum / class set to None above.
    data_sequence.update(get_taxonomy_data(record))
    data_sequence_list.append(data_sequence)

# Print the last row with value types as a quick sanity check.
for field, field_value in data_sequence_list[-1].items():
    print(field, field_value, type(field_value))

Fetched taxid from NCBI 3917
Fetched taxid from NCBI 3917
Fetched taxid from NCBI 3920
Fetched taxid from NCBI 3920
Fetched taxid from NCBI 138955
Fetched taxid from NCBI 138955
Fetched taxid from NCBI 459609
Fetched taxid from NCBI 459609
Fetched taxid from NCBI 460247
Fetched taxid from NCBI 460247
Fetched taxid from NCBI 2598452
Fetched taxid from NCBI 2598452
Fetched taxid from NCBI 460248
Fetched taxid from NCBI 460248
Fetched taxid from NCBI 3917
Fetched taxid from NCBI 3917
Fetched taxid from NCBI 3917
Fetched taxid from NCBI 3917
Fetched taxid from NCBI 3917
Fetched taxid from NCBI 3917
Fetched taxid from NCBI 3917
Fetched taxid from NCBI 3917
Fetched taxid from NCBI 3917
Fetched taxid from NCBI 3917
Fetched taxid from NCBI 459607
Fetched taxid from NCBI 459607
accession BBM60670.1 <class 'str'>
variant cenH3.2_(Vigna_unguiculata) <class 'str'>
gi None <class 'NoneType'>
ncbi_gene_id None <class 'NoneType'>
hgnc_gene_name None <class 'NoneType'>
taxonomy_id 459607 <class 'int'>

In [56]:
# Insert every prepared sequence row; each dict supplies the named
# placeholders of the add_sequence statement.
for ds in data_sequence_list:
    cursor.execute(add_sequence, ds)

In [57]:
# After inserting: display the sequence rows whose accession is one of the
# collected protein IDs.
query = "SELECT * FROM sequence "
cursor.execute(query)
sequence_columns = [column[0] for column in cursor.description]
df = pd.DataFrame(cursor.fetchall(), columns=sequence_columns)
protein_accessions = [entry["Protein ID"] for entry in records_data.values()]
df[df["accession"].isin(protein_accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
533,BBM60641.1,cenH3.1_(Vigna_unguiculata),,,,3917.0,Vigna unguiculata,Streptophyta,Magnoliopsida,,,MARVKHTPASLKVGKKKVSRASTSTPQQSPATRSRRRAQEEEPQEE...,
534,BBM60642.1,cenH3.2_(Vigna_unguiculata),,,,3917.0,Vigna unguiculata,Streptophyta,Magnoliopsida,,,MARVKHTPASLKASRASTSVPPSQQSPVTRSNRRAQEEEPQEEAAA...,
535,BBM60643.1,cenH3.1_(Vigna_unguiculata),,,,3920.0,Vigna unguiculata subsp. unguiculata,Streptophyta,Magnoliopsida,,,MARVKHTPASLKVGKKKVSRASTSTPQQSPATRSRRRAQEEEPQEE...,
536,BBM60644.1,cenH3.2_(Vigna_unguiculata),,,,3920.0,Vigna unguiculata subsp. unguiculata,Streptophyta,Magnoliopsida,,,MARVKHTPASLKASRASTSVPPSQQSPVTRSNRRAQEEEPQEEAAA...,
537,BBM60645.1,cenH3.1_(Vigna_unguiculata),,,,138955.0,Vigna unguiculata subsp. sesquipedalis,Streptophyta,Magnoliopsida,,,MARVKHTPASLKVGKKKVSRASTSTPQQSPATRSRRRAQEEEPQEE...,
538,BBM60646.1,cenH3.2_(Vigna_unguiculata),,,,138955.0,Vigna unguiculata subsp. sesquipedalis,Streptophyta,Magnoliopsida,,,MARVKHTPASLKASRASTSVPPSQQSPVTRSNRRAQEEEPQEEAAA...,
539,BBM60647.1,cenH3.1_(Vigna_unguiculata),,,,459609.0,Vigna unguiculata subsp. stenophylla,Streptophyta,Magnoliopsida,,,MARVKHTPASLKASRASTSMPPSQQSPVTRSNRRAQEEEPQEEAAA...,
540,BBM60648.1,cenH3.2_(Vigna_unguiculata),,,,459609.0,Vigna unguiculata subsp. stenophylla,Streptophyta,Magnoliopsida,,,MARVKHTPASLKVGKKKVSRASTSTPQQSPATRSRRRAQEEEPQEE...,
541,BBM60649.1,cenH3.1_(Vigna_unguiculata),,,,460247.0,Vigna unguiculata subsp. alba,Streptophyta,Magnoliopsida,,,MARVKHTPASLKVGKKKVSRASTSTPQQSPATRSRRRAQEEEPQEE...,
542,BBM60650.1,cenH3.2_(Vigna_unguiculata),,,,460247.0,Vigna unguiculata subsp. alba,Streptophyta,Magnoliopsida,,,MARVKHTPASLKASRASTSMPPSQQSPVTRSNRRAQEEEPQEEAAA...,


In [58]:
# Commit the pending transaction so the inserted sequence rows are persisted.
conn.commit()

## Add sequence publication

In [59]:
# Check whether the publication is already present in the database.
# The id is bound as a query parameter instead of being formatted into the SQL.
pid = "ishii_unequal_2020"
query = "SELECT * FROM publication WHERE id=%s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year


In [60]:
# Insert the publication row; `author` is left empty (NULL).
data_publication = dict(
    id=pid,
    title="Unequal contribution of two paralogous CENH3 variants in cowpea centromere function",
    doi="10.1038/s42003-020-01507-x",
    author=None,
    year="2020",
)
cursor.execute(add_publication, data_publication)

In [61]:
# Link every collected protein accession to the publication.
for entry in records_data.values():
    cursor.execute(add_sequence_has_publication, (entry["Protein ID"], pid))

In [62]:
# Display the collected sequences together with their publication links.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
joined_columns = [column[0] for column in cursor.description]
df = pd.DataFrame(cursor.fetchall(), columns=joined_columns)
protein_accessions = [entry["Protein ID"] for entry in records_data.values()]
df[df["accession"].isin(protein_accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
666,BBM60641.1,cenH3.1_(Vigna_unguiculata),,,,3917.0,Vigna unguiculata,Streptophyta,Magnoliopsida,,,MARVKHTPASLKVGKKKVSRASTSTPQQSPATRSRRRAQEEEPQEE...,,BBM60641.1,ishii_unequal_2020
667,BBM60642.1,cenH3.2_(Vigna_unguiculata),,,,3917.0,Vigna unguiculata,Streptophyta,Magnoliopsida,,,MARVKHTPASLKASRASTSVPPSQQSPVTRSNRRAQEEEPQEEAAA...,,BBM60642.1,ishii_unequal_2020
668,BBM60643.1,cenH3.1_(Vigna_unguiculata),,,,3920.0,Vigna unguiculata subsp. unguiculata,Streptophyta,Magnoliopsida,,,MARVKHTPASLKVGKKKVSRASTSTPQQSPATRSRRRAQEEEPQEE...,,BBM60643.1,ishii_unequal_2020
669,BBM60644.1,cenH3.2_(Vigna_unguiculata),,,,3920.0,Vigna unguiculata subsp. unguiculata,Streptophyta,Magnoliopsida,,,MARVKHTPASLKASRASTSVPPSQQSPVTRSNRRAQEEEPQEEAAA...,,BBM60644.1,ishii_unequal_2020
670,BBM60645.1,cenH3.1_(Vigna_unguiculata),,,,138955.0,Vigna unguiculata subsp. sesquipedalis,Streptophyta,Magnoliopsida,,,MARVKHTPASLKVGKKKVSRASTSTPQQSPATRSRRRAQEEEPQEE...,,BBM60645.1,ishii_unequal_2020
671,BBM60646.1,cenH3.2_(Vigna_unguiculata),,,,138955.0,Vigna unguiculata subsp. sesquipedalis,Streptophyta,Magnoliopsida,,,MARVKHTPASLKASRASTSVPPSQQSPVTRSNRRAQEEEPQEEAAA...,,BBM60646.1,ishii_unequal_2020
672,BBM60647.1,cenH3.1_(Vigna_unguiculata),,,,459609.0,Vigna unguiculata subsp. stenophylla,Streptophyta,Magnoliopsida,,,MARVKHTPASLKASRASTSMPPSQQSPVTRSNRRAQEEEPQEEAAA...,,BBM60647.1,ishii_unequal_2020
673,BBM60648.1,cenH3.2_(Vigna_unguiculata),,,,459609.0,Vigna unguiculata subsp. stenophylla,Streptophyta,Magnoliopsida,,,MARVKHTPASLKVGKKKVSRASTSTPQQSPATRSRRRAQEEEPQEE...,,BBM60648.1,ishii_unequal_2020
674,BBM60649.1,cenH3.1_(Vigna_unguiculata),,,,460247.0,Vigna unguiculata subsp. alba,Streptophyta,Magnoliopsida,,,MARVKHTPASLKVGKKKVSRASTSTPQQSPATRSRRRAQEEEPQEE...,,BBM60649.1,ishii_unequal_2020
675,BBM60650.1,cenH3.2_(Vigna_unguiculata),,,,460247.0,Vigna unguiculata subsp. alba,Streptophyta,Magnoliopsida,,,MARVKHTPASLKASRASTSMPPSQQSPVTRSNRRAQEEEPQEEAAA...,,BBM60650.1,ishii_unequal_2020


In [63]:
# Commit the publication row and the sequence-to-publication links.
conn.commit()

# Add publication for histone variants

In [64]:
# Before linking: display the two variants with any existing publication links.
query = (
    "SELECT * FROM histone h LEFT JOIN histone_has_publication hp "
    "ON h.id = hp.histone_id "
)
cursor.execute(query)
joined_columns = [column[0] for column in cursor.description]
df = pd.DataFrame(cursor.fetchall(), columns=joined_columns)
variant_ids = ["cenH3.1_(Vigna_unguiculata)", "cenH3.2_(Vigna_unguiculata)"]
df[df["id"].isin(variant_ids)]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
5,cenH3.1_(Vigna_unguiculata),variant,Vigna unguiculata,3917,231.0,cenH3_(Plants),,
6,cenH3.2_(Vigna_unguiculata),variant,Vigna unguiculata,3917,232.0,cenH3_(Plants),,


In [65]:
# Link both histone variants to the publication.
histone_variant_ids = ("cenH3.1_(Vigna_unguiculata)", "cenH3.2_(Vigna_unguiculata)")
for histone_id in histone_variant_ids:
    cursor.execute(add_histone_has_publication, (histone_id, pid))

In [66]:
# After linking: display the two variants with their publication links.
query = (
    "SELECT * FROM histone h LEFT JOIN histone_has_publication hp "
    "ON h.id = hp.histone_id "
)
cursor.execute(query)
joined_columns = [column[0] for column in cursor.description]
df = pd.DataFrame(cursor.fetchall(), columns=joined_columns)
variant_ids = ["cenH3.1_(Vigna_unguiculata)", "cenH3.2_(Vigna_unguiculata)"]
df[df["id"].isin(variant_ids)]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
5,cenH3.1_(Vigna_unguiculata),variant,Vigna unguiculata,3917,231.0,cenH3_(Plants),cenH3.1_(Vigna_unguiculata),ishii_unequal_2020
6,cenH3.2_(Vigna_unguiculata),variant,Vigna unguiculata,3917,232.0,cenH3_(Plants),cenH3.2_(Vigna_unguiculata),ishii_unequal_2020


In [67]:
# Commit the histone-to-publication links.
conn.commit()

# Close connections

In [68]:
# Release resources in reverse order of creation: the cursor, then the
# database connection, then the SSH tunnel that carried it.
cursor.close()
conn.close()
tunnel.stop()