In [1]:
import io

import pandas as pd
from Bio import Entrez, SeqIO
from mysql.connector import connection
from sshtunnel import SSHTunnelForwarder

# Укажите ваш email (обязательно для использования Entrez)
Entrez.email = "your.email@example.com"

In [2]:
with open("db_curated_server_info.txt", "r") as file:
    lines = file.readlines()

config = {}

for line in lines:
    line = line.strip()
    if line and not line.startswith("#"):
        key, value = line.split("=", 1)
        config[key] = value.strip()

server_name = config.get("server_name")
srever_port = int(config.get("srever_port"))
ssh_password = config.get("ssh_password")
ssh_username = config.get("ssh_username")
db_adress = config.get("db_adress")
db_port = int(config.get("db_port"))

In [3]:
tunnel = SSHTunnelForwarder(
    (server_name, srever_port),
    ssh_password=ssh_password,
    ssh_username=ssh_username,
    remote_bind_address=(db_adress, db_port),
)
tunnel.start()
print(tunnel.local_bind_port)

35907


In [4]:
conn = connection.MySQLConnection(
    user="db_user",
    password="db_password",
    host="localhost",
    port=tunnel.local_bind_port,
    database="db_name",
)
cursor = conn.cursor()

In [5]:
query = "SHOW TABLES;"
cursor.execute(query)
cursor.fetchall()

[('alternative_name',),
 ('histone',),
 ('histone_description',),
 ('histone_has_publication',),
 ('publication',),
 ('sequence',),
 ('sequence_has_publication',)]

In [74]:
# add_histone = (
#     "INSERT INTO histone "
#     "(id, level, taxonomic_span, taxonomic_span_id, description, parent) "
#     "VALUES (%(id)s, %(level)s, %(taxonomic_span)s, %(taxonomic_span_id)s, %(description)s, %(parent)s)"
# )
# add_histone_description = (
#     "INSERT INTO histone_description "
#     "(summary, taxonomy, genes, evolution, expression, knock_out, function, sequence, localization, deposition, structure, interactions, disease, caveats) "
#     "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
# )
add_publication = (
    "INSERT INTO publication "
    "(id, title, doi, author, year) "
    "VALUES (%(id)s, %(title)s, %(doi)s, %(author)s, %(year)s)"
)
add_sequence = (
    "INSERT INTO sequence "
    "(accession, variant, gi, ncbi_gene_id, hgnc_gene_name, taxonomy_id, organism, phylum, class, taxonomy_group, info, sequence, variant_under_consideration) "
    "VALUES (%(accession)s, %(variant)s, %(gi)s, %(ncbi_gene_id)s, %(hgnc_gene_name)s, %(taxonomy_id)s, %(organism)s, %(phylum)s, %(class)s, %(taxonomy_group)s, %(info)s, %(sequence)s, %(variant_under_consideration)s)"
)
add_sequence_has_publication = (
    "INSERT INTO sequence_has_publication "
    "(sequence_accession, publication_id) "
    "VALUES (%s, %s)"
)
# add_alternate_names = (
#     "INSERT INTO alternative_name "
#     "(name, taxonomy, gene, splice, histone) "
#     "VALUES (%(name)s, %(taxonomy)s, %(gene)s, %(splice)s, %(histone)s)"
# )
# add_histone_has_publication = (
#     "INSERT INTO histone_has_publication "
#     "(histone_id, publication_id) "
#     "VALUES (%s, %s)"
# )

In [15]:
def get_taxonomy_data(record):
    import re
    import sys

    taxonomy_data = {}
    taxonomy_data["organism"] = record.annotations["organism"]
    try:
        for a in record.features[0].qualifiers["db_xref"]:
            text = re.search("(\S+):(\S+)", a).group(1)
            taxid = re.search("(\S+):(\S+)", a).group(2)
            if text == "taxon":
                print(f"Fetched taxid from NCBI {taxid}")
                taxonomy_data["taxonomy_id"] = int(taxid)
            else:
                continue
    except:
        print("!!!!!!Unable to get TAXID for this record setting it to 1")
        taxonomy_data["taxonomy_id"] = 1  # unable to identify

    lineage = dict()
    for i in range(10):
        try:
            handle = Entrez.efetch(
                id=taxonomy_data["taxonomy_id"], db="taxonomy", retmode="xml"
            )
            tax_data = Entrez.read(handle)
            lineage = {
                d["Rank"]: d["ScientificName"]
                for d in tax_data[0]["LineageEx"]
                if d["Rank"] in ["class", "phylum"]
            }
            break
        except:
            print(
                "Unexpected error: {}, Retrying, attempt {}".format(
                    sys.exc_info()[0], i
                )
            )
            if i == 9:
                print(
                    f"FATAL ERROR could not get class and phylum from NCBI after 10 attempts for taxid:{taxonomy_data['taxonomy_id']}. Will add None for class and phylum!"
                )
            else:
                continue
    taxonomy_data["phylum"] = lineage.get("phylum", None)
    taxonomy_data["class"] = lineage.get("class", None)
    if taxonomy_data["phylum"] is not None:
        taxonomy_data["phylum"] = str(taxonomy_data["phylum"])
    if taxonomy_data["class"] is not None:
        taxonomy_data["class"] = str(taxonomy_data["class"])
    return taxonomy_data

# To Do

## <span style="color:green">Add sequences from `neumann_centromeres_2015`</span>

Accessions (CDS) from Supplementary Table S2:
AB649144.1
KM822210
XM_003637685
KM822207
KM822209
KM822208
KM822212
KM822213
KM822214
KM822215
KM822216
KM822217
KM822218
KM822211
extracted from JF739989
KM822220
KM822219
KM822221
KM822222
KM822223
KM822224
KM822225
GT621186.1 KM822227
KM822226
extracted from JF739990
KM822228
KM822229
KM822231
KM822232
KM822233
KM822234
KM822235
KM822230

## <span style="color:green">Add sequences from `sanei_loss_2011`, `karimi-ashtiyani_point_2015` and `ishii_differential_2015`</span>
Protein Accessions: AEK21394.1
AEK21393.1
AEK21392.1
ADB03182.1

## <span style="color:green">Add sequences from `neumann_stretching_2012`</span>
Protein Accessions: AEX31246.1
AEX31245.1

## <span style="color:green">Add sequences from `kawabe_duplication_2006`</span>
Protein Accessions:
ABE27630.1
ABE27629.1
ABE27628.1
ABE27627.1
ABE27626.1
ABE27625.1
ABE27624.1
ABE27623.1
ABE27622.1
ABE27621.1
ABE27620.1
ABE27619.1
ABE27618.1
ABE27617.1
ABE27616.1
ABE27615.1
ABE27614.1
ABE27613.1
ABE27612.1
ABE27611.1
ABE27610.1
ABE27609.1
ABE27608.1
ABE27607.1
ABE27606.1
ABE27605.1
ABE27604.1
ABE27603.1
ABE27602.1
ABE27601.1
ABE27600.1
ABE27662.1
ABE27661.1
ABE27660.1
ABE27659.1
ABE27658.1
ABE27657.1
ABE27656.1
ABE27655.1
ABE27654.1
ABE27653.1
ABE27652.1
ABE27651.1
ABE27650.1
ABE27649.1
ABE27648.1
ABE27647.1
ABE27646.1
ABE27645.1
ABE27644.1
ABE27643.1
ABE27642.1
ABE27641.1
ABE27640.1
ABE27639.1
ABE27638.1
ABE27637.1
ABE27636.1
ABE27635.1
ABE27634.1
ABE27633.1
ABE27632.1
ABE27631.1
BAC79432.1
BAC79431.1
BAC79430.1
BAC79429.1
BAC79428.1
BAC79427.1
    
## <span style="color:green">Add sequences from `evtushenko_conserved_2017`</span>

Protein Accessions: AUN88474.1
AUN88473.1
AUN88472.1
AUN88471.1
AUN88470.1
AUN88469.1
AUN88468.1
AUN88467.1
AUN88466.1
AUN88465.1
AUN88464.1
AUN88463.1
AUN88462.1
AUN88461.1
AUN88460.1
AUN88459.1
AUN88458.1
AUN88457.1
AUN88456.1
AUN88455.1
AUN88454.1
AUN88453.1
AUN88452.1
AUN88451.1
AUN88450.1
AUN88449.1
    
## <span style="color:black">Add sequences from `monen_differential_2005` and `monen_separase_2015`</span>

Protein Accessions: NP_499128.1 (hcp-3) NP_499073.1 (cpar-1)

# <span style="color:black">Add sequences from `neumann_centromeres_2015`</span>

Accessions (CDS) from Supplementary Table S2:
AB649144.1
KM822210
XM_003637685
KM822207
KM822209
KM822208
KM822212
KM822213
KM822214
KM822215
KM822216
KM822217
KM822218
KM822211
extracted from JF739989
KM822220
KM822219
KM822221
KM822222
KM822223
KM822224
KM822225
GT621186.1 KM822227
KM822226
extracted from JF739990
KM822228
KM822229
KM822231
KM822232
KM822233
KM822234
KM822235
KM822230

In [7]:
cds_accessions = [
    "AB649144.1",
    "KM822210",
    "XM_003637685",
    "KM822207",
    "KM822209",
    "KM822208",
    "KM822212",
    "KM822213",
    "KM822214",
    "KM822215",
    "KM822216",
    "KM822217",
    "KM822218",
    "KM822211",
    "JF739989",
    "KM822220",
    "KM822219",
    "KM822221",
    "KM822222",
    "KM822223",
    "KM822224",
    "KM822225",
    "KM822227",
    "KM822226",
    "JF739990",
    "KM822228",
    "KM822229",
    "KM822231",
    "KM822232",
    "KM822233",
    "KM822234",
    "KM822235",
    "KM822230",
]

## Get protein IDs

In [8]:
records_data = {}
other_records = {}
for accession_id in cds_accessions:
    # ID записи в NCBI nuccore
    print(f"### Search for {accession_id} ...")
    # Загружаем запись в формате GenBank
    with Entrez.efetch(
        db="nucleotide", id=accession_id, rettype="gb", retmode="text"
    ) as handle:
        record = SeqIO.read(handle, "genbank")
    if not (
        record.annotations["organism"].startswith(
            ("Pisum", "Lathyrus", "Vicia", "Lens")
        )
    ):
        other_records[accession_id] = {
            "ID": record.id,
            "Description": record.description,
            "Organism": record.annotations["organism"],
        }
        continue
    # Выводим информацию о записи
    print(f"ID: {record.id}")
    print(f"Описание: {record.description}")
    # Извлекаем идентификаторы белков
    protein_ids = []
    for feature in record.features:
        if feature.type == "CDS":  # Ищем только кодирующие последовательности (белки)
            if "protein_id" in feature.qualifiers:
                protein_ids.append(feature.qualifiers["protein_id"][0])
            elif (
                "db_xref" in feature.qualifiers
            ):  # Ищем идентификаторы в db_xref (например, UniProt)
                for xref in feature.qualifiers["db_xref"]:
                    if xref.startswith("GI:") or xref.startswith(
                        "UniProt:"
                    ):  # Можно добавить другие базы
                        protein_ids.append(xref)
    # Выводим результат
    print("Найденные идентификаторы белков:", end=" ")
    for pid in protein_ids:
        print(pid)
    records_data[accession_id] = {
        "ID": record.id,
        "Description": record.description,
        "Organism": record.annotations["organism"],
        "Protein ID": protein_ids[0],
    }

### Search for AB649144.1 ...
### Search for KM822210 ...
### Search for XM_003637685 ...
### Search for KM822207 ...
### Search for KM822209 ...
### Search for KM822208 ...
### Search for KM822212 ...
ID: KM822212.1
Описание: Lathyrus clymenum isolate CenH3-1_LAC centromere-specific variant of histone H3 mRNA, complete cds
Найденные идентификаторы белков: AKA94118.1
### Search for KM822213 ...
ID: KM822213.1
Описание: Lathyrus latifolius isolate CenH3-1_LAL centromere-specific variant of histone H3 mRNA, complete cds
Найденные идентификаторы белков: AKA94119.1
### Search for KM822214 ...
ID: KM822214.1
Описание: Lathyrus niger isolate CenH3-1_LAN centromere-specific variant of histone H3 mRNA, complete cds
Найденные идентификаторы белков: AKA94120.1
### Search for KM822215 ...
ID: KM822215.1
Описание: Lathyrus ochrus isolate CenH3-1_LAO centromere-specific variant of histone H3 mRNA, complete cds
Найденные идентификаторы белков: AKA94121.1
### Search for KM822216 ...
ID: KM822216.1
Оп

In [9]:
records_data

{'KM822212': {'ID': 'KM822212.1',
  'Description': 'Lathyrus clymenum isolate CenH3-1_LAC centromere-specific variant of histone H3 mRNA, complete cds',
  'Organism': 'Lathyrus clymenum',
  'Protein ID': 'AKA94118.1'},
 'KM822213': {'ID': 'KM822213.1',
  'Description': 'Lathyrus latifolius isolate CenH3-1_LAL centromere-specific variant of histone H3 mRNA, complete cds',
  'Organism': 'Lathyrus latifolius',
  'Protein ID': 'AKA94119.1'},
 'KM822214': {'ID': 'KM822214.1',
  'Description': 'Lathyrus niger isolate CenH3-1_LAN centromere-specific variant of histone H3 mRNA, complete cds',
  'Organism': 'Lathyrus niger',
  'Protein ID': 'AKA94120.1'},
 'KM822215': {'ID': 'KM822215.1',
  'Description': 'Lathyrus ochrus isolate CenH3-1_LAO centromere-specific variant of histone H3 mRNA, complete cds',
  'Organism': 'Lathyrus ochrus',
  'Protein ID': 'AKA94121.1'},
 'KM822216': {'ID': 'KM822216.1',
  'Description': 'Lathyrus sativus isolate CenH3-1_LAS centromere-specific variant of histone H3

In [10]:
other_records

{'AB649144.1': {'ID': 'AB649144.1',
  'Description': 'Astragalus sinicus AsCENH3 mRNA for centromere specific histone H3 variant, complete cds',
  'Organism': 'Astragalus sinicus'},
 'KM822210': {'ID': 'KM822210.1',
  'Description': 'Cicer arietinum isolate CenH3_CicA centromere-specific variant of histone H3 mRNA, complete cds',
  'Organism': 'Cicer arietinum'},
 'XM_003637685': {'ID': 'XM_003637685.1',
  'Description': 'Medicago truncatula Histone H3 (MTR_100s0023) mRNA, complete cds',
  'Organism': 'Medicago truncatula'},
 'KM822207': {'ID': 'KM822207.1',
  'Description': 'Melilotus albus isolate CenH3_MelA centromere-specific variant of histone H3 mRNA, complete cds',
  'Organism': 'Melilotus albus'},
 'KM822209': {'ID': 'KM822209.1',
  'Description': 'Trifolium pratense isolate CenH3_TriP centromere-specific variant of histone H3 mRNA, complete cds',
  'Organism': 'Trifolium pratense'},
 'KM822208': {'ID': 'KM822208.1',
  'Description': 'Trigonella foenum-graecum isolate CenH3_TFG

## Add sequences to curatedDB

In [11]:
query = "SELECT * FROM sequence "
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["accession"].isin([v["Protein ID"] for k, v in records_data.items()])]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration


In [13]:
with Entrez.efetch(
    db="protein", id="AKA94118.1", rettype="gb", retmode="text"
) as handle:
    record = SeqIO.read(handle, "genbank")
str(record.seq)

'MGRVKQFPPSKFAARNNHEKKKRRVKPGTVALREIRKFQKDVKLLIPYAPFVRCVREITTQFSSLVTRWTPEALISLQEAAEDDLIRMFEAGMLCAIHARRITLIKKDIELTRRLTGIGRLR'

In [16]:
data_sequence_list = []
for k, v in records_data.items():
    with Entrez.efetch(
        db="protein", id=v["Protein ID"], rettype="gb", retmode="text"
    ) as handle:
        record = SeqIO.read(handle, "genbank")
    taxonomy_data = get_taxonomy_data(record)
    data_sequence = {
        "accession": record.id,
        "variant": "cenH3_(Plants)",
        "gi": None,
        "ncbi_gene_id": None,
        "hgnc_gene_name": None,
        "taxonomy_id": None,
        "organism": None,
        "phylum": None,
        "class": None,
        "taxonomy_group": None,
        "info": None,
        "sequence": str(record.seq),
        "variant_under_consideration": None,
    }
    data_sequence.update(taxonomy_data)
    data_sequence_list.append(data_sequence)
for k, v in data_sequence_list[-1].items():
    print(k, v, type(v))

Fetched taxid from NCBI 3855
Fetched taxid from NCBI 154494
Fetched taxid from NCBI 313121
Fetched taxid from NCBI 3858
Fetched taxid from NCBI 3860
Fetched taxid from NCBI 313117
Fetched taxid from NCBI 313120
Fetched taxid from NCBI 51020
Fetched taxid from NCBI 3888
Fetched taxid from NCBI 3855
Fetched taxid from NCBI 154494
Fetched taxid from NCBI 313121
Fetched taxid from NCBI 3858
Fetched taxid from NCBI 3860
Fetched taxid from NCBI 313117
Fetched taxid from NCBI 313120
Fetched taxid from NCBI 3864
Fetched taxid from NCBI 51020
Fetched taxid from NCBI 3888
Fetched taxid from NCBI 3906
Fetched taxid from NCBI 347192
Fetched taxid from NCBI 29753
Fetched taxid from NCBI 233247
Fetched taxid from NCBI 3908
Fetched taxid from NCBI 347188
Fetched taxid from NCBI 3911
Fetched taxid from NCBI 3912
accession AKA94136.1 <class 'str'>
variant cenH3_(Plants) <class 'str'>
gi None <class 'NoneType'>
ncbi_gene_id None <class 'NoneType'>
hgnc_gene_name None <class 'NoneType'>
taxonomy_id 3912 

In [17]:
for ds in data_sequence_list:
    cursor.execute(add_sequence, ds)

In [18]:
query = "SELECT * FROM sequence "
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["accession"].isin([v["Protein ID"] for k, v in records_data.items()])]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
207,AEX31245.1,cenH3_(Plants),,,,3888.0,Pisum sativum,Streptophyta,Magnoliopsida,,,MGRVKHFPSPSKPAASDNLGKKKRRCKPGTKALREIRKFQKDVKLL...,
208,AEX31246.1,cenH3_(Plants),,,,3888.0,Pisum sativum,Streptophyta,Magnoliopsida,,,MARVKQTPRHARENQERKKRRNKPGTVALREIKKLQKTFQLLIPYA...,
302,AKA94117.1,cenH3_(Plants),,,,51020.0,Pisum fulvum,Streptophyta,Magnoliopsida,,,MGRVKHFPSPSKPAASDNLGKKKRRCKPGTKALREIRKFQKDVKLL...,
303,AKA94118.1,cenH3_(Plants),,,,3855.0,Lathyrus clymenum,Streptophyta,Magnoliopsida,,,MGRVKQFPPSKFAARNNHEKKKRRVKPGTVALREIRKFQKDVKLLI...,
304,AKA94119.1,cenH3_(Plants),,,,154494.0,Lathyrus latifolius,Streptophyta,Magnoliopsida,,,MGRVKHFPRPSKPAASNNHEKKKRRSKPGTKAVREIRKFQKDVKLL...,
305,AKA94120.1,cenH3_(Plants),,,,313121.0,Lathyrus niger,Streptophyta,Magnoliopsida,,,MGRVKHFPRPSKSAASDNHGKKKRRSKPGTKALREIRKFQKDVKLL...,
306,AKA94121.1,cenH3_(Plants),,,,3858.0,Lathyrus ochrus,Streptophyta,Magnoliopsida,,,MGRVKHFPRPSKSAANNTHEKKKRRFKRGTALQEIRKFQKDVKLLI...,
307,AKA94122.1,cenH3_(Plants),,,,3860.0,Lathyrus sativus,Streptophyta,Magnoliopsida,,,MGRVKHFPRPSKPAASDNQEKRKRRSKPGTKAVREIRKFQKDVKLL...,
308,AKA94123.1,cenH3_(Plants),,,,313117.0,Lathyrus sylvestris,Streptophyta,Magnoliopsida,,,MGRVKHFPRPSKPAASNNHEKKKRRSKPGTKAVREIRKFQKDVKLL...,
309,AKA94124.1,cenH3_(Plants),,,,313120.0,Lathyrus vernus,Streptophyta,Magnoliopsida,,,MGRVKHFPRPSKPAASDNHGKKKRLSKPGTKALREIRKFQNDVKLL...,


In [19]:
# Make sure data is committed to the database
conn.commit()

## Add sequence publication

In [20]:
pid = "neumann_centromeres_2015"
query = f"SELECT * FROM publication WHERE id='{pid}'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year,pubmed_id
0,neumann_centromeres_2015,,,,,


In [21]:
for acc in [v["Protein ID"] for k, v in records_data.items()]:
    cursor.execute(add_sequence_has_publication, (acc, pid))

In [22]:
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["accession"].isin([v["Protein ID"] for k, v in records_data.items()])]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
262,AEX31245.1,cenH3_(Plants),,,,3888.0,Pisum sativum,Streptophyta,Magnoliopsida,,,MGRVKHFPSPSKPAASDNLGKKKRRCKPGTKALREIRKFQKDVKLL...,,AEX31245.1,neumann_centromeres_2015
263,AEX31246.1,cenH3_(Plants),,,,3888.0,Pisum sativum,Streptophyta,Magnoliopsida,,,MARVKQTPRHARENQERKKRRNKPGTVALREIKKLQKTFQLLIPYA...,,AEX31246.1,neumann_centromeres_2015
379,AKA94117.1,cenH3_(Plants),,,,51020.0,Pisum fulvum,Streptophyta,Magnoliopsida,,,MGRVKHFPSPSKPAASDNLGKKKRRCKPGTKALREIRKFQKDVKLL...,,AKA94117.1,neumann_centromeres_2015
380,AKA94118.1,cenH3_(Plants),,,,3855.0,Lathyrus clymenum,Streptophyta,Magnoliopsida,,,MGRVKQFPPSKFAARNNHEKKKRRVKPGTVALREIRKFQKDVKLLI...,,AKA94118.1,neumann_centromeres_2015
381,AKA94119.1,cenH3_(Plants),,,,154494.0,Lathyrus latifolius,Streptophyta,Magnoliopsida,,,MGRVKHFPRPSKPAASNNHEKKKRRSKPGTKAVREIRKFQKDVKLL...,,AKA94119.1,neumann_centromeres_2015
382,AKA94120.1,cenH3_(Plants),,,,313121.0,Lathyrus niger,Streptophyta,Magnoliopsida,,,MGRVKHFPRPSKSAASDNHGKKKRRSKPGTKALREIRKFQKDVKLL...,,AKA94120.1,neumann_centromeres_2015
383,AKA94121.1,cenH3_(Plants),,,,3858.0,Lathyrus ochrus,Streptophyta,Magnoliopsida,,,MGRVKHFPRPSKSAANNTHEKKKRRFKRGTALQEIRKFQKDVKLLI...,,AKA94121.1,neumann_centromeres_2015
384,AKA94122.1,cenH3_(Plants),,,,3860.0,Lathyrus sativus,Streptophyta,Magnoliopsida,,,MGRVKHFPRPSKPAASDNQEKRKRRSKPGTKAVREIRKFQKDVKLL...,,AKA94122.1,neumann_centromeres_2015
385,AKA94123.1,cenH3_(Plants),,,,313117.0,Lathyrus sylvestris,Streptophyta,Magnoliopsida,,,MGRVKHFPRPSKPAASNNHEKKKRRSKPGTKAVREIRKFQKDVKLL...,,AKA94123.1,neumann_centromeres_2015
386,AKA94124.1,cenH3_(Plants),,,,313120.0,Lathyrus vernus,Streptophyta,Magnoliopsida,,,MGRVKHFPRPSKPAASDNHGKKKRLSKPGTKALREIRKFQNDVKLL...,,AKA94124.1,neumann_centromeres_2015


In [23]:
# Make sure data is committed to the database
conn.commit()

# <span style="color:black">Add sequences from `sanei_loss_2011`, `karimi-ashtiyani_point_2015` and `ishii_differential_2015`</span>
Protein Accessions: AEK21394.1
AEK21393.1
AEK21392.1
ADB03182.1

In [24]:
accessions = ["AEK21394.1", "AEK21393.1", "AEK21392.1", "ADB03182.1"]

## Add sequences to curatedDB

In [25]:
query = "SELECT * FROM sequence "
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration


In [26]:
data_sequence_list = []
for a in accessions:
    with Entrez.efetch(db="protein", id=a, rettype="gb", retmode="text") as handle:
        record = SeqIO.read(handle, "genbank")
    taxonomy_data = get_taxonomy_data(record)
    data_sequence = {
        "accession": record.id,
        "variant": "cenH3_(Plants)",
        "gi": None,
        "ncbi_gene_id": None,
        "hgnc_gene_name": None,
        "taxonomy_id": None,
        "organism": None,
        "phylum": None,
        "class": None,
        "taxonomy_group": None,
        "info": None,
        "sequence": str(record.seq),
        "variant_under_consideration": None,
    }
    data_sequence.update(taxonomy_data)
    data_sequence_list.append(data_sequence)
for k, v in data_sequence_list[-1].items():
    print(k, v, type(v))

Fetched taxid from NCBI 4516
Fetched taxid from NCBI 112509
Fetched taxid from NCBI 112509
Fetched taxid from NCBI 4516
accession ADB03182.1 <class 'str'>
variant cenH3_(Plants) <class 'str'>
gi None <class 'NoneType'>
ncbi_gene_id None <class 'NoneType'>
hgnc_gene_name None <class 'NoneType'>
taxonomy_id 4516 <class 'int'>
organism Hordeum bulbosum <class 'str'>
phylum Streptophyta <class 'str'>
class Magnoliopsida <class 'str'>
taxonomy_group None <class 'NoneType'>
info None <class 'NoneType'>
sequence MARTKHPAVRKSKAPPRKKVGSARAPAAAQRRHETDGAGTSETPRRGPAPAADQGAPGEPKKRKPHRYRPGTVALREIRKYQKSVDFLIPFAPFVRLVKEVTEFYCPAISRWTPQALLAVQEAAEYHLVDVFERAHLCAIHAKRVTVMQKDIQLA <class 'str'>
variant_under_consideration None <class 'NoneType'>


In [27]:
for ds in data_sequence_list:
    cursor.execute(add_sequence, ds)

In [28]:
query = "SELECT * FROM sequence "
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
126,ADB03182.1,cenH3_(Plants),,,,4516.0,Hordeum bulbosum,Streptophyta,Magnoliopsida,,,MARTKHPAVRKSKAPPRKKVGSARAPAAAQRRHETDGAGTSETPRR...,
202,AEK21392.1,cenH3_(Plants),,,,112509.0,Hordeum vulgare subsp. vulgare,Streptophyta,Magnoliopsida,,,MARTKHPAVRKSKAPPKKKIGSASSPSAAQRRQETDGAGTSETPRR...,
203,AEK21393.1,cenH3_(Plants),,,,112509.0,Hordeum vulgare subsp. vulgare,Streptophyta,Magnoliopsida,,,MARTKKTVAAKEKRPPCSKSEPQSQPKKKEKRAYRFRPGTVALREI...,
204,AEK21394.1,cenH3_(Plants),,,,4516.0,Hordeum bulbosum,Streptophyta,Magnoliopsida,,,MARTKKTVAATKRRSPRTRLEPQSQPEKKKRAHRFRPGTVALREIR...,


In [29]:
# Make sure data is committed to the database
conn.commit()

## Add sequence publication

In [30]:
pids = ["sanei_loss_2011", "karimi-ashtiyani_point_2015", "ishii_differential_2015"]
query = f"SELECT * FROM publication"
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["id"].isin(pids)]

Unnamed: 0,id,title,doi,author,year,pubmed_id
108,ishii_differential_2015,,,,,
114,karimi-ashtiyani_point_2015,,,,,
145,sanei_loss_2011,,,,,


In [31]:
for acc in accessions:
    for pid in pids:
        cursor.execute(add_sequence_has_publication, (acc, pid))

In [32]:
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
154,ADB03182.1,cenH3_(Plants),,,,4516.0,Hordeum bulbosum,Streptophyta,Magnoliopsida,,,MARTKHPAVRKSKAPPRKKVGSARAPAAAQRRHETDGAGTSETPRR...,,ADB03182.1,ishii_differential_2015
155,ADB03182.1,cenH3_(Plants),,,,4516.0,Hordeum bulbosum,Streptophyta,Magnoliopsida,,,MARTKHPAVRKSKAPPRKKVGSARAPAAAQRRHETDGAGTSETPRR...,,ADB03182.1,karimi-ashtiyani_point_2015
156,ADB03182.1,cenH3_(Plants),,,,4516.0,Hordeum bulbosum,Streptophyta,Magnoliopsida,,,MARTKHPAVRKSKAPPRKKVGSARAPAAAQRRHETDGAGTSETPRR...,,ADB03182.1,sanei_loss_2011
257,AEK21392.1,cenH3_(Plants),,,,112509.0,Hordeum vulgare subsp. vulgare,Streptophyta,Magnoliopsida,,,MARTKHPAVRKSKAPPKKKIGSASSPSAAQRRQETDGAGTSETPRR...,,AEK21392.1,ishii_differential_2015
258,AEK21392.1,cenH3_(Plants),,,,112509.0,Hordeum vulgare subsp. vulgare,Streptophyta,Magnoliopsida,,,MARTKHPAVRKSKAPPKKKIGSASSPSAAQRRQETDGAGTSETPRR...,,AEK21392.1,karimi-ashtiyani_point_2015
259,AEK21392.1,cenH3_(Plants),,,,112509.0,Hordeum vulgare subsp. vulgare,Streptophyta,Magnoliopsida,,,MARTKHPAVRKSKAPPKKKIGSASSPSAAQRRQETDGAGTSETPRR...,,AEK21392.1,sanei_loss_2011
260,AEK21393.1,cenH3_(Plants),,,,112509.0,Hordeum vulgare subsp. vulgare,Streptophyta,Magnoliopsida,,,MARTKKTVAAKEKRPPCSKSEPQSQPKKKEKRAYRFRPGTVALREI...,,AEK21393.1,ishii_differential_2015
261,AEK21393.1,cenH3_(Plants),,,,112509.0,Hordeum vulgare subsp. vulgare,Streptophyta,Magnoliopsida,,,MARTKKTVAAKEKRPPCSKSEPQSQPKKKEKRAYRFRPGTVALREI...,,AEK21393.1,karimi-ashtiyani_point_2015
262,AEK21393.1,cenH3_(Plants),,,,112509.0,Hordeum vulgare subsp. vulgare,Streptophyta,Magnoliopsida,,,MARTKKTVAAKEKRPPCSKSEPQSQPKKKEKRAYRFRPGTVALREI...,,AEK21393.1,sanei_loss_2011
263,AEK21394.1,cenH3_(Plants),,,,4516.0,Hordeum bulbosum,Streptophyta,Magnoliopsida,,,MARTKKTVAATKRRSPRTRLEPQSQPEKKKRAHRFRPGTVALREIR...,,AEK21394.1,ishii_differential_2015


In [33]:
# Make sure data is committed to the database
conn.commit()

# <span style="color:black">Add sequences from `neumann_stretching_2012`</span>
Protein Accessions: AEX31246.1
AEX31245.1

In [34]:
accessions = ["AEX31246.1", "AEX31245.1"]

## Add sequences to curatedDB

In [35]:
query = "SELECT * FROM sequence "
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
211,AEX31245.1,cenH3_(Plants),,,,3888.0,Pisum sativum,Streptophyta,Magnoliopsida,,,MGRVKHFPSPSKPAASDNLGKKKRRCKPGTKALREIRKFQKDVKLL...,
212,AEX31246.1,cenH3_(Plants),,,,3888.0,Pisum sativum,Streptophyta,Magnoliopsida,,,MARVKQTPRHARENQERKKRRNKPGTVALREIKKLQKTFQLLIPYA...,


In [36]:
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
274,AEX31245.1,cenH3_(Plants),,,,3888.0,Pisum sativum,Streptophyta,Magnoliopsida,,,MGRVKHFPSPSKPAASDNLGKKKRRCKPGTKALREIRKFQKDVKLL...,,AEX31245.1,neumann_centromeres_2015
275,AEX31246.1,cenH3_(Plants),,,,3888.0,Pisum sativum,Streptophyta,Magnoliopsida,,,MARVKQTPRHARENQERKKRRNKPGTVALREIKKLQKTFQLLIPYA...,,AEX31246.1,neumann_centromeres_2015


## Add sequence publication

In [37]:
pid = "neumann_stretching_2012"
query = f"SELECT * FROM publication WHERE id='{pid}'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year,pubmed_id
0,neumann_stretching_2012,,,,,


In [38]:
for acc in accessions:
    cursor.execute(add_sequence_has_publication, (acc, pid))

In [39]:
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
274,AEX31245.1,cenH3_(Plants),,,,3888.0,Pisum sativum,Streptophyta,Magnoliopsida,,,MGRVKHFPSPSKPAASDNLGKKKRRCKPGTKALREIRKFQKDVKLL...,,AEX31245.1,neumann_centromeres_2015
275,AEX31245.1,cenH3_(Plants),,,,3888.0,Pisum sativum,Streptophyta,Magnoliopsida,,,MGRVKHFPSPSKPAASDNLGKKKRRCKPGTKALREIRKFQKDVKLL...,,AEX31245.1,neumann_stretching_2012
276,AEX31246.1,cenH3_(Plants),,,,3888.0,Pisum sativum,Streptophyta,Magnoliopsida,,,MARVKQTPRHARENQERKKRRNKPGTVALREIKKLQKTFQLLIPYA...,,AEX31246.1,neumann_centromeres_2015
277,AEX31246.1,cenH3_(Plants),,,,3888.0,Pisum sativum,Streptophyta,Magnoliopsida,,,MARVKQTPRHARENQERKKRRNKPGTVALREIKKLQKTFQLLIPYA...,,AEX31246.1,neumann_stretching_2012


In [40]:
# Make sure data is committed to the database
conn.commit()

# <span style="color:black">Add sequences from `kawabe_duplication_2006`</span>
Protein Accessions:
ABE27630.1
ABE27629.1
ABE27628.1
ABE27627.1
ABE27626.1
ABE27625.1
ABE27624.1
ABE27623.1
ABE27622.1
ABE27621.1
ABE27620.1
ABE27619.1
ABE27618.1
ABE27617.1
ABE27616.1
ABE27615.1
ABE27614.1
ABE27613.1
ABE27612.1
ABE27611.1
ABE27610.1
ABE27609.1
ABE27608.1
ABE27607.1
ABE27606.1
ABE27605.1
ABE27604.1
ABE27603.1
ABE27602.1
ABE27601.1
ABE27600.1
ABE27662.1
ABE27661.1
ABE27660.1
ABE27659.1
ABE27658.1
ABE27657.1
ABE27656.1
ABE27655.1
ABE27654.1
ABE27653.1
ABE27652.1
ABE27651.1
ABE27650.1
ABE27649.1
ABE27648.1
ABE27647.1
ABE27646.1
ABE27645.1
ABE27644.1
ABE27643.1
ABE27642.1
ABE27641.1
ABE27640.1
ABE27639.1
ABE27638.1
ABE27637.1
ABE27636.1
ABE27635.1
ABE27634.1
ABE27633.1
ABE27632.1
ABE27631.1
BAC79432.1
BAC79431.1
BAC79430.1
BAC79429.1
BAC79428.1
BAC79427.1

In [41]:
accessions = [
    "ABE27630.1",
    "ABE27629.1",
    "ABE27628.1",
    "ABE27627.1",
    "ABE27626.1",
    "ABE27625.1",
    "ABE27624.1",
    "ABE27623.1",
    "ABE27622.1",
    "ABE27621.1",
    "ABE27620.1",
    "ABE27619.1",
    "ABE27618.1",
    "ABE27617.1",
    "ABE27616.1",
    "ABE27615.1",
    "ABE27614.1",
    "ABE27613.1",
    "ABE27612.1",
    "ABE27611.1",
    "ABE27610.1",
    "ABE27609.1",
    "ABE27608.1",
    "ABE27607.1",
    "ABE27606.1",
    "ABE27605.1",
    "ABE27604.1",
    "ABE27603.1",
    "ABE27602.1",
    "ABE27601.1",
    "ABE27600.1",
    "ABE27662.1",
    "ABE27661.1",
    "ABE27660.1",
    "ABE27659.1",
    "ABE27658.1",
    "ABE27657.1",
    "ABE27656.1",
    "ABE27655.1",
    "ABE27654.1",
    "ABE27653.1",
    "ABE27652.1",
    "ABE27651.1",
    "ABE27650.1",
    "ABE27649.1",
    "ABE27648.1",
    "ABE27647.1",
    "ABE27646.1",
    "ABE27645.1",
    "ABE27644.1",
    "ABE27643.1",
    "ABE27642.1",
    "ABE27641.1",
    "ABE27640.1",
    "ABE27639.1",
    "ABE27638.1",
    "ABE27637.1",
    "ABE27636.1",
    "ABE27635.1",
    "ABE27634.1",
    "ABE27633.1",
    "ABE27632.1",
    "ABE27631.1",
    "BAC79432.1",
    "BAC79431.1",
    "BAC79430.1",
    "BAC79429.1",
    "BAC79428.1",
    "BAC79427.1",
]

## Add sequences to curatedDB

In [42]:
query = "SELECT * FROM sequence "
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration


In [43]:
data_sequence_list = []
for a in accessions:
    with Entrez.efetch(db="protein", id=a, rettype="gb", retmode="text") as handle:
        record = SeqIO.read(handle, "genbank")
    taxonomy_data = get_taxonomy_data(record)
    data_sequence = {
        "accession": record.id,
        "variant": "cenH3_(Plants)",
        "gi": None,
        "ncbi_gene_id": None,
        "hgnc_gene_name": None,
        "taxonomy_id": None,
        "organism": None,
        "phylum": None,
        "class": None,
        "taxonomy_group": None,
        "info": None,
        "sequence": str(record.seq),
        "variant_under_consideration": None,
    }
    data_sequence.update(taxonomy_data)
    data_sequence_list.append(data_sequence)
for k, v in data_sequence_list[-1].items():
    print(k, v, type(v))

Fetched taxid from NCBI 81972
Fetched taxid from NCBI 81972
Fetched taxid from NCBI 81972
Fetched taxid from NCBI 81972
Fetched taxid from NCBI 59691
Fetched taxid from NCBI 59691
Fetched taxid from NCBI 59691
Fetched taxid from NCBI 59691
Fetched taxid from NCBI 59691
Fetched taxid from NCBI 59691
Fetched taxid from NCBI 59691
Fetched taxid from NCBI 59691
Fetched taxid from NCBI 59691
Fetched taxid from NCBI 59691
Fetched taxid from NCBI 59691
Fetched taxid from NCBI 59691
Fetched taxid from NCBI 59691
Fetched taxid from NCBI 59691
Fetched taxid from NCBI 59691
Fetched taxid from NCBI 59691
Fetched taxid from NCBI 59691
Fetched taxid from NCBI 59691
Fetched taxid from NCBI 59691
Fetched taxid from NCBI 59691
Fetched taxid from NCBI 59691
Fetched taxid from NCBI 59691
Fetched taxid from NCBI 59691
Fetched taxid from NCBI 59691
Fetched taxid from NCBI 59691
Fetched taxid from NCBI 59691
Fetched taxid from NCBI 81971
Fetched taxid from NCBI 81972
Fetched taxid from NCBI 81972
Fetched ta

In [44]:
len(accessions), len(data_sequence_list)

(69, 69)

In [45]:
for ds in data_sequence_list:
    cursor.execute(add_sequence, ds)

In [46]:
query = "SELECT * FROM sequence "
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
62,ABE27600.1,cenH3_(Plants),,,,81971.0,Arabidopsis halleri subsp. halleri,Streptophyta,Magnoliopsida,,,MARTKHFVTRKGSGNRTDFDANASSSQAAGPTKTPTTRGTEGGDNT...,
63,ABE27601.1,cenH3_(Plants),,,,59691.0,Arabidopsis lyrata subsp. petraea,Streptophyta,Magnoliopsida,,,MARTKHFATRTGSGNRTDANASSSQAAGPTTTPTTRGSEGGDNTQQ...,
64,ABE27602.1,cenH3_(Plants),,,,59691.0,Arabidopsis lyrata subsp. petraea,Streptophyta,Magnoliopsida,,,MARTKHFATRSGSGNRTDANASSSQAAGPTTTPTTRGSEGGDNTQQ...,
65,ABE27603.1,cenH3_(Plants),,,,59691.0,Arabidopsis lyrata subsp. petraea,Streptophyta,Magnoliopsida,,,MARTKHFATRTGSGNRTDANASSSQAAGPTTTPTTRGSEGGDNTQQ...,
66,ABE27604.1,cenH3_(Plants),,,,59691.0,Arabidopsis lyrata subsp. petraea,Streptophyta,Magnoliopsida,,,MARTKHFATRSGSGNRTDANASSSQAAGPTTTPTTRGSEGGDNTQQ...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
601,BAC79428.1,cenH3_(Plants),,,,63677.0,Arabidopsis halleri subsp. gemmifera,Streptophyta,Magnoliopsida,,,MARTKHFVTRKGSGNRTDFDANASSSQAAGPTKTPTTRGTEGGDNT...,
602,BAC79429.1,cenH3_(Plants),,,,63677.0,Arabidopsis halleri subsp. gemmifera,Streptophyta,Magnoliopsida,,,MARTKHFVTRKGSGNRTDFDANASSSQAAGPTKTPTTRGTEGGDNT...,
603,BAC79430.1,cenH3_(Plants),,,,63677.0,Arabidopsis halleri subsp. gemmifera,Streptophyta,Magnoliopsida,,,MARTKHFAIKSRSGNRTDANASSSQAAGPTTTPTTRGTEGGDNTQQ...,
604,BAC79431.1,cenH3_(Plants),,,,63677.0,Arabidopsis halleri subsp. gemmifera,Streptophyta,Magnoliopsida,,,MARTKHFAIKSRSGNRTDANASSSQAAGPTTTPTTRGTEGGDNTQQ...,


In [47]:
# Make sure data is committed to the database
conn.commit()

## Add sequence publication

In [48]:
pid = "kawabe_duplication_2006"
query = f"SELECT * FROM publication WHERE id='{pid}'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year,pubmed_id
0,kawabe_duplication_2006,,,,,


In [49]:
for acc in accessions:
    cursor.execute(add_sequence_has_publication, (acc, pid))

In [50]:
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
79,ABE27600.1,cenH3_(Plants),,,,81971.0,Arabidopsis halleri subsp. halleri,Streptophyta,Magnoliopsida,,,MARTKHFVTRKGSGNRTDFDANASSSQAAGPTKTPTTRGTEGGDNT...,,ABE27600.1,kawabe_duplication_2006
80,ABE27601.1,cenH3_(Plants),,,,59691.0,Arabidopsis lyrata subsp. petraea,Streptophyta,Magnoliopsida,,,MARTKHFATRTGSGNRTDANASSSQAAGPTTTPTTRGSEGGDNTQQ...,,ABE27601.1,kawabe_duplication_2006
81,ABE27602.1,cenH3_(Plants),,,,59691.0,Arabidopsis lyrata subsp. petraea,Streptophyta,Magnoliopsida,,,MARTKHFATRSGSGNRTDANASSSQAAGPTTTPTTRGSEGGDNTQQ...,,ABE27602.1,kawabe_duplication_2006
82,ABE27603.1,cenH3_(Plants),,,,59691.0,Arabidopsis lyrata subsp. petraea,Streptophyta,Magnoliopsida,,,MARTKHFATRTGSGNRTDANASSSQAAGPTTTPTTRGSEGGDNTQQ...,,ABE27603.1,kawabe_duplication_2006
83,ABE27604.1,cenH3_(Plants),,,,59691.0,Arabidopsis lyrata subsp. petraea,Streptophyta,Magnoliopsida,,,MARTKHFATRSGSGNRTDANASSSQAAGPTTTPTTRGSEGGDNTQQ...,,ABE27604.1,kawabe_duplication_2006
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
736,BAC79428.1,cenH3_(Plants),,,,63677.0,Arabidopsis halleri subsp. gemmifera,Streptophyta,Magnoliopsida,,,MARTKHFVTRKGSGNRTDFDANASSSQAAGPTKTPTTRGTEGGDNT...,,BAC79428.1,kawabe_duplication_2006
737,BAC79429.1,cenH3_(Plants),,,,63677.0,Arabidopsis halleri subsp. gemmifera,Streptophyta,Magnoliopsida,,,MARTKHFVTRKGSGNRTDFDANASSSQAAGPTKTPTTRGTEGGDNT...,,BAC79429.1,kawabe_duplication_2006
738,BAC79430.1,cenH3_(Plants),,,,63677.0,Arabidopsis halleri subsp. gemmifera,Streptophyta,Magnoliopsida,,,MARTKHFAIKSRSGNRTDANASSSQAAGPTTTPTTRGTEGGDNTQQ...,,BAC79430.1,kawabe_duplication_2006
739,BAC79431.1,cenH3_(Plants),,,,63677.0,Arabidopsis halleri subsp. gemmifera,Streptophyta,Magnoliopsida,,,MARTKHFAIKSRSGNRTDANASSSQAAGPTTTPTTRGTEGGDNTQQ...,,BAC79431.1,kawabe_duplication_2006


In [51]:
# Make sure data is committed to the database
conn.commit()

# <span style="color:black">Add sequences from `evtushenko_conserved_2017`</span>

Protein Accessions: AUN88474.1
AUN88473.1
AUN88472.1
AUN88471.1
AUN88470.1
AUN88469.1
AUN88468.1
AUN88467.1
AUN88466.1
AUN88465.1
AUN88464.1
AUN88463.1
AUN88462.1
AUN88461.1
AUN88460.1
AUN88459.1
AUN88458.1
AUN88457.1
AUN88456.1
AUN88455.1
AUN88454.1
AUN88453.1
AUN88452.1
AUN88451.1
AUN88450.1
AUN88449.1

In [52]:
accessions = [
    "AUN88474.1",
    "AUN88473.1",
    "AUN88472.1",
    "AUN88471.1",
    "AUN88470.1",
    "AUN88469.1",
    "AUN88468.1",
    "AUN88467.1",
    "AUN88466.1",
    "AUN88465.1",
    "AUN88464.1",
    "AUN88463.1",
    "AUN88462.1",
    "AUN88461.1",
    "AUN88460.1",
    "AUN88459.1",
    "AUN88458.1",
    "AUN88457.1",
    "AUN88456.1",
    "AUN88455.1",
    "AUN88454.1",
    "AUN88453.1",
    "AUN88452.1",
    "AUN88451.1",
    "AUN88450.1",
    "AUN88449.1",
]

## Add sequences to curatedDB

In [53]:
query = "SELECT * FROM sequence "
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration


In [54]:
data_sequence_list = []
for a in accessions:
    with Entrez.efetch(db="protein", id=a, rettype="gb", retmode="text") as handle:
        record = SeqIO.read(handle, "genbank")
    taxonomy_data = get_taxonomy_data(record)
    data_sequence = {
        "accession": record.id,
        "variant": "cenH3_(Plants)",
        "gi": None,
        "ncbi_gene_id": None,
        "hgnc_gene_name": None,
        "taxonomy_id": None,
        "organism": None,
        "phylum": None,
        "class": None,
        "taxonomy_group": None,
        "info": None,
        "sequence": str(record.seq),
        "variant_under_consideration": None,
    }
    data_sequence.update(taxonomy_data)
    data_sequence_list.append(data_sequence)
for k, v in data_sequence_list[-1].items():
    print(k, v, type(v))

Fetched taxid from NCBI 155090
Fetched taxid from NCBI 155101
Fetched taxid from NCBI 155093
Fetched taxid from NCBI 155089
Fetched taxid from NCBI 4553
Fetched taxid from NCBI 4552
Fetched taxid from NCBI 155095
Fetched taxid from NCBI 100817
Fetched taxid from NCBI 4550
Fetched taxid from NCBI 37731
Fetched taxid from NCBI 155091
Fetched taxid from NCBI 155095
Fetched taxid from NCBI 37731
Fetched taxid from NCBI 155091
Fetched taxid from NCBI 155091
Fetched taxid from NCBI 4553
Fetched taxid from NCBI 4550
Fetched taxid from NCBI 155090
Fetched taxid from NCBI 100817
Fetched taxid from NCBI 155093
Fetched taxid from NCBI 4552
Fetched taxid from NCBI 155095
Fetched taxid from NCBI 155089
Fetched taxid from NCBI 155101
Fetched taxid from NCBI 37731
Fetched taxid from NCBI 4550
accession AUN88449.1 <class 'str'>
variant cenH3_(Plants) <class 'str'>
gi None <class 'NoneType'>
ncbi_gene_id None <class 'NoneType'>
hgnc_gene_name None <class 'NoneType'>
taxonomy_id 4550 <class 'int'>
organ

In [55]:
len(accessions), len(data_sequence_list)

(26, 26)

In [56]:
for ds in data_sequence_list:
    cursor.execute(add_sequence, ds)

In [57]:
query = "SELECT * FROM sequence "
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
555,AUN88449.1,cenH3_(Plants),,,,4550.0,Secale cereale,Streptophyta,Magnoliopsida,,,MARTKHPAVRKTKAPPKKQLGPRPAQRRQETDGAGTSATPRRAGRA...,
556,AUN88450.1,cenH3_(Plants),,,,37731.0,Secale strictum subsp. africanum,Streptophyta,Magnoliopsida,,,MARTKHPAVRKTKVPPKKKLGTRPSGGTQRRQDTDGAGTSATPRRA...,
557,AUN88451.1,cenH3_(Plants),,,,155101.0,Secale strictum subsp. anatolicum,Streptophyta,Magnoliopsida,,,MARTKHPAVRKTKVPPKKKLGTRPSGGTQRRQDTDGAGTSATPRRA...,
558,AUN88452.1,cenH3_(Plants),,,,155089.0,Secale cereale subsp. ancestrale,Streptophyta,Magnoliopsida,,,MARTKHPAVRKTKVPPKKKLGTRPSGGTQRRQDTDGAGTSATPRRA...,
559,AUN88453.1,cenH3_(Plants),,,,155095.0,Secale strictum subsp. strictum,Streptophyta,Magnoliopsida,,,MARTKHPAVRKTKVPPKKKLGTRPSGGTQRRQDTDGAGTSATPRRA...,
560,AUN88454.1,cenH3_(Plants),,,,4552.0,Secale sylvestre,Streptophyta,Magnoliopsida,,,MARTKHPAVRKTKVPPKKKLGTRPSGGTQRRQDTDGAGTSATPRRA...,
561,AUN88455.1,cenH3_(Plants),,,,155093.0,Secale cereale subsp. dighoricum,Streptophyta,Magnoliopsida,,,MARTKHPAVRKTKVPPKKKLGTRPSGGTQRRQDTDGAGTSATPRRA...,
562,AUN88456.1,cenH3_(Plants),,,,100817.0,Secale strictum subsp. kuprijanovii,Streptophyta,Magnoliopsida,,,MARTKHPAVRKTKVPPKKKLGTRPSGGTQRRQDTDGAGTSATPRRA...,
563,AUN88457.1,cenH3_(Plants),,,,155090.0,Secale cereale subsp. segetale,Streptophyta,Magnoliopsida,,,MARTKHPAVRKTKVPPKKKLGTRPSGGTQRRQDTDGAGTSATPRRA...,
564,AUN88458.1,cenH3_(Plants),,,,4550.0,Secale cereale,Streptophyta,Magnoliopsida,,,MARTKHPAVRKTKVPPKKKLGTRPSGGTQRRQDTDGAGTSATPRRA...,


In [58]:
# Make sure data is committed to the database
conn.commit()

## Add sequence publication

In [59]:
pid = "evtushenko_conserved_2017"
query = f"SELECT * FROM publication WHERE id='{pid}'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year,pubmed_id
0,evtushenko_conserved_2017,,,,,


In [60]:
for acc in accessions:
    cursor.execute(add_sequence_has_publication, (acc, pid))

In [61]:
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
676,AUN88449.1,cenH3_(Plants),,,,4550.0,Secale cereale,Streptophyta,Magnoliopsida,,,MARTKHPAVRKTKAPPKKQLGPRPAQRRQETDGAGTSATPRRAGRA...,,AUN88449.1,evtushenko_conserved_2017
677,AUN88450.1,cenH3_(Plants),,,,37731.0,Secale strictum subsp. africanum,Streptophyta,Magnoliopsida,,,MARTKHPAVRKTKVPPKKKLGTRPSGGTQRRQDTDGAGTSATPRRA...,,AUN88450.1,evtushenko_conserved_2017
678,AUN88451.1,cenH3_(Plants),,,,155101.0,Secale strictum subsp. anatolicum,Streptophyta,Magnoliopsida,,,MARTKHPAVRKTKVPPKKKLGTRPSGGTQRRQDTDGAGTSATPRRA...,,AUN88451.1,evtushenko_conserved_2017
679,AUN88452.1,cenH3_(Plants),,,,155089.0,Secale cereale subsp. ancestrale,Streptophyta,Magnoliopsida,,,MARTKHPAVRKTKVPPKKKLGTRPSGGTQRRQDTDGAGTSATPRRA...,,AUN88452.1,evtushenko_conserved_2017
680,AUN88453.1,cenH3_(Plants),,,,155095.0,Secale strictum subsp. strictum,Streptophyta,Magnoliopsida,,,MARTKHPAVRKTKVPPKKKLGTRPSGGTQRRQDTDGAGTSATPRRA...,,AUN88453.1,evtushenko_conserved_2017
681,AUN88454.1,cenH3_(Plants),,,,4552.0,Secale sylvestre,Streptophyta,Magnoliopsida,,,MARTKHPAVRKTKVPPKKKLGTRPSGGTQRRQDTDGAGTSATPRRA...,,AUN88454.1,evtushenko_conserved_2017
682,AUN88455.1,cenH3_(Plants),,,,155093.0,Secale cereale subsp. dighoricum,Streptophyta,Magnoliopsida,,,MARTKHPAVRKTKVPPKKKLGTRPSGGTQRRQDTDGAGTSATPRRA...,,AUN88455.1,evtushenko_conserved_2017
683,AUN88456.1,cenH3_(Plants),,,,100817.0,Secale strictum subsp. kuprijanovii,Streptophyta,Magnoliopsida,,,MARTKHPAVRKTKVPPKKKLGTRPSGGTQRRQDTDGAGTSATPRRA...,,AUN88456.1,evtushenko_conserved_2017
684,AUN88457.1,cenH3_(Plants),,,,155090.0,Secale cereale subsp. segetale,Streptophyta,Magnoliopsida,,,MARTKHPAVRKTKVPPKKKLGTRPSGGTQRRQDTDGAGTSATPRRA...,,AUN88457.1,evtushenko_conserved_2017
685,AUN88458.1,cenH3_(Plants),,,,4550.0,Secale cereale,Streptophyta,Magnoliopsida,,,MARTKHPAVRKTKVPPKKKLGTRPSGGTQRRQDTDGAGTSATPRRA...,,AUN88458.1,evtushenko_conserved_2017


In [62]:
# Make sure data is committed to the database
conn.commit()

# <span style="color:black">Add sequences from `monen_differential_2005` and `monen_separase_2015`</span>

Protein Accessions: NP_499128.1 (hcp-3) NP_499073.1 (cpar-1)

In [63]:
accessions = ["NP_499128.1", "NP_499073.1"]

## Add sequences to curatedDB

In [64]:
query = "SELECT * FROM sequence "
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
3280,NP_499128.1,cenH3_(Animals),17553736,,,6239.0,Caenorhabditis elegans,Nematoda,Chromadorea,,,MADDTPIIEEIAEQNESVTRIMQRLKHDMQRVTSVPGFNTSAAGVN...,


In [65]:
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
3580,NP_499128.1,cenH3_(Animals),17553736,,,6239.0,Caenorhabditis elegans,Nematoda,Chromadorea,,,MADDTPIIEEIAEQNESVTRIMQRLKHDMQRVTSVPGFNTSAAGVN...,,,


In [66]:
data_sequence_list = []
for a in accessions[1:]:
    with Entrez.efetch(db="protein", id=a, rettype="gb", retmode="text") as handle:
        record = SeqIO.read(handle, "genbank")
    taxonomy_data = get_taxonomy_data(record)
    data_sequence = {
        "accession": record.id,
        "variant": "cenH3_(Plants)",
        "gi": None,
        "ncbi_gene_id": None,
        "hgnc_gene_name": None,
        "taxonomy_id": None,
        "organism": None,
        "phylum": None,
        "class": None,
        "taxonomy_group": None,
        "info": None,
        "sequence": str(record.seq),
        "variant_under_consideration": None,
    }
    data_sequence.update(taxonomy_data)
    data_sequence_list.append(data_sequence)
for k, v in data_sequence_list[-1].items():
    print(k, v, type(v))

Fetched taxid from NCBI 6239
accession NP_499073.1 <class 'str'>
variant cenH3_(Plants) <class 'str'>
gi None <class 'NoneType'>
ncbi_gene_id None <class 'NoneType'>
hgnc_gene_name None <class 'NoneType'>
taxonomy_id 6239 <class 'int'>
organism Caenorhabditis elegans <class 'str'>
phylum Nematoda <class 'str'>
class Chromadorea <class 'str'>
taxonomy_group None <class 'NoneType'>
info None <class 'NoneType'>
sequence MADDGPIIEEIAEKNGRVARIMQRLQHDTQRVTSVPGFNTSATGYADLIALLDQYKNDLEAVGFNDLEQARRRAPSVDITVGSNSTNLVDYSHGRHDMPSHRRHDSSDEEITAANSHHQSPINVGNRNDTDGTNGRNGSRAGSSSSDRVRMIAGRNRISKTRRYRPGQKALEEIRKYQESEDLLIPKAPFARLVREIMQTSTPFSSDLRIRSDAINALQEASEALLVQMFDGSSLISAHSKRATLTTTDVQLYRRLCLPNL <class 'str'>
variant_under_consideration None <class 'NoneType'>


In [67]:
len(accessions), len(data_sequence_list)

(2, 1)

In [68]:
for ds in data_sequence_list:
    cursor.execute(add_sequence, ds)

In [69]:
query = "SELECT * FROM sequence "
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
3280,NP_499073.1,cenH3_(Plants),,,,6239.0,Caenorhabditis elegans,Nematoda,Chromadorea,,,MADDGPIIEEIAEKNGRVARIMQRLQHDTQRVTSVPGFNTSATGYA...,
3281,NP_499128.1,cenH3_(Animals),17553736.0,,,6239.0,Caenorhabditis elegans,Nematoda,Chromadorea,,,MADDTPIIEEIAEQNESVTRIMQRLKHDMQRVTSVPGFNTSAAGVN...,


In [70]:
# Make sure data is committed to the database
conn.commit()

## Add sequence publication

In [71]:
pids = ["monen_differential_2005", "monen_separase_2015"]
query = f"SELECT * FROM publication"
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["id"].isin(pids)]

Unnamed: 0,id,title,doi,author,year,pubmed_id


In [72]:
data_publication = [
    {
        "id": pid,
        "title": None,
        "doi": None,
        "author": None,
        "year": None,
    }
    for pid in pids
]

In [75]:
for dp in data_publication:
    cursor.execute(add_publication, dp)

In [76]:
pids = ["monen_differential_2005", "monen_separase_2015"]
query = f"SELECT * FROM publication"
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["id"].isin(pids)]

Unnamed: 0,id,title,doi,author,year,pubmed_id
134,monen_differential_2005,,,,,
135,monen_separase_2015,,,,,


In [77]:
for acc in accessions:
    for pid in pids:
        cursor.execute(add_sequence_has_publication, (acc, pid))

In [78]:
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
3580,NP_499073.1,cenH3_(Plants),,,,6239.0,Caenorhabditis elegans,Nematoda,Chromadorea,,,MADDGPIIEEIAEKNGRVARIMQRLQHDTQRVTSVPGFNTSATGYA...,,NP_499073.1,monen_differential_2005
3581,NP_499073.1,cenH3_(Plants),,,,6239.0,Caenorhabditis elegans,Nematoda,Chromadorea,,,MADDGPIIEEIAEKNGRVARIMQRLQHDTQRVTSVPGFNTSATGYA...,,NP_499073.1,monen_separase_2015
3582,NP_499128.1,cenH3_(Animals),17553736.0,,,6239.0,Caenorhabditis elegans,Nematoda,Chromadorea,,,MADDTPIIEEIAEQNESVTRIMQRLKHDMQRVTSVPGFNTSAAGVN...,,NP_499128.1,monen_differential_2005
3583,NP_499128.1,cenH3_(Animals),17553736.0,,,6239.0,Caenorhabditis elegans,Nematoda,Chromadorea,,,MADDTPIIEEIAEQNESVTRIMQRLKHDMQRVTSVPGFNTSAAGVN...,,NP_499128.1,monen_separase_2015


In [79]:
# Make sure data is committed to the database
conn.commit()

# Close connections

In [80]:
cursor.close()
conn.close()
tunnel.stop()