In [1]:
import pandas as pd
from Bio import Entrez, SeqIO
from mysql.connector import connection
from sshtunnel import SSHTunnelForwarder

Entrez.email = "l.singh@intbio.org"

In [2]:
with open("db_curated_server_info.txt", "r") as file:
    lines = file.readlines()

config = {}

for line in lines:
    line = line.strip()
    if line and not line.startswith("#"):
        key, value = line.split("=", 1)
        config[key] = value.strip()

server_name = config.get("server_name")
srever_port = int(config.get("srever_port"))
ssh_password = config.get("ssh_password")
ssh_username = config.get("ssh_username")
db_adress = config.get("db_adress")
db_port = int(config.get("db_port"))

In [3]:
tunnel = SSHTunnelForwarder(
    (server_name, srever_port),
    ssh_password=ssh_password,
    ssh_username=ssh_username,
    remote_bind_address=(db_adress, db_port),
)
tunnel.start()
print(tunnel.local_bind_port)

43813


In [4]:
conn = connection.MySQLConnection(
    user="db_user",
    password="db_password",
    host="localhost",
    port=tunnel.local_bind_port,
    database="db_name",
)
cursor = conn.cursor()

In [5]:
query = "SHOW TABLES;"
cursor.execute(query)
cursor.fetchall()

[('alternative_name',),
 ('histone',),
 ('histone_description',),
 ('histone_has_publication',),
 ('publication',),
 ('sequence',),
 ('sequence_has_publication',)]

In [59]:
add_histone = (
    "INSERT INTO histone "
    "(id, level, taxonomic_span, taxonomic_span_id, description, parent) "
    "VALUES (%(id)s, %(level)s, %(taxonomic_span)s, %(taxonomic_span_id)s, %(description)s, %(parent)s)"
)
add_histone_description = (
    "INSERT INTO histone_description "
    "(summary, taxonomy, genes, evolution, expression, knock_out, function, sequence, localization, deposition, structure, interactions, disease, caveats) "
    "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
)
add_publication = (
    "INSERT INTO publication "
    "(id, title, doi, author, year) "
    "VALUES (%(id)s, %(title)s, %(doi)s, %(author)s, %(year)s)"
)
add_sequence = (
    "INSERT INTO sequence "
    "(accession, variant, gi, ncbi_gene_id, hgnc_gene_name, taxonomy_id, organism, phylum, class, taxonomy_group, info, sequence, variant_under_consideration) "
    "VALUES (%(accession)s, %(variant)s, %(gi)s, %(ncbi_gene_id)s, %(hgnc_gene_name)s, %(taxonomy_id)s, %(organism)s, %(phylum)s, %(class)s, %(taxonomy_group)s, %(info)s, %(sequence)s, %(variant_under_consideration)s)"
)
add_sequence_has_publication = (
    "INSERT INTO sequence_has_publication "
    "(sequence_accession, publication_id) "
    "VALUES (%s, %s)"
)
# add_alternate_names = (
#     "INSERT INTO alternative_name "
#     "(name, taxonomy, gene, splice, histone) "
#     "VALUES (%(name)s, %(taxonomy)s, %(gene)s, %(splice)s, %(histone)s)"
# )
# add_histone_has_publication = (
#     "INSERT INTO histone_has_publication "
#     "(histone_id, publication_id) "
#     "VALUES (%s, %s)"
# )

In [53]:
def get_taxonomy_data(record):
    import re
    import sys

    taxonomy_data = {}
    taxonomy_data["organism"] = record.annotations["organism"]
    try:
        for a in record.features[0].qualifiers["db_xref"]:
            text = re.search("(\S+):(\S+)", a).group(1)
            taxid = re.search("(\S+):(\S+)", a).group(2)
            if text == "taxon":
                print(f"Fetched taxid from NCBI {taxid}")
                taxonomy_data["taxonomy_id"] = int(taxid)
            else:
                continue
    except:
        print("!!!!!!Unable to get TAXID for this record setting it to 1")
        taxonomy_data["taxonomy_id"] = 1  # unable to identify

    lineage = dict()
    for i in range(10):
        try:
            handle = Entrez.efetch(
                id=taxonomy_data["taxonomy_id"], db="taxonomy", retmode="xml"
            )
            tax_data = Entrez.read(handle)
            lineage = {
                d["Rank"]: d["ScientificName"]
                for d in tax_data[0]["LineageEx"]
                if d["Rank"] in ["class", "phylum"]
            }
            break
        except:
            print(
                "Unexpected error: {}, Retrying, attempt {}".format(
                    sys.exc_info()[0], i
                )
            )
            if i == 9:
                print(
                    f"FATAL ERROR could not get class and phylum from NCBI after 10 attempts for taxid:{taxonomy_data['taxonomy_id']}. Will add None for class and phylum!"
                )
            else:
                continue
    taxonomy_data["phylum"] = lineage.get("phylum", None)
    taxonomy_data["class"] = lineage.get("class", None)
    if taxonomy_data["phylum"] is not None:
        taxonomy_data["phylum"] = str(taxonomy_data["phylum"])
    if taxonomy_data["class"] is not None:
        taxonomy_data["class"] = str(taxonomy_data["class"])
    return taxonomy_data

# Methanothermus fervidus

В базе уже добавлены 5 последовательностей данного вида.

Последовательности ADP77717.1 и ADP77985.1 являются HMfA и HMfB, соответственно (см. [статью](https://www.pnas.org/content/117/52/33384)).

ADP77742.1 классифицирован как Coiled-coil ???

In [6]:
query = "SELECT * FROM sequence "
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["accession"].str.contains("WP_013414263")]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration


In [7]:
"melpiapigriikdagaervsddaritlakileemgrdiaseaiklarhagrktikaedielavrrfkk".upper()

'MELPIAPIGRIIKDAGAERVSDDARITLAKILEEMGRDIASEAIKLARHAGRKTIKAEDIELAVRRFKK'

In [8]:
seq = "melpiapigriikdagaervsddaritlakileemgrdiaseaiklarhagrktikaedielavrrfkk".upper()
df[df["sequence"] == seq]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
4,AAA72080.1,Nucleosomal,,,,2180.0,Methanothermus fervidus,Methanobacteriota,Methanobacteria,,,MELPIAPIGRIIKDAGAERVSDDARITLAKILEEMGRDIASEAIKL...,
161,ADP77985.1,Nucleosomal,,,,523846.0,Methanothermus fervidus DSM 2088,Methanobacteriota,Methanobacteria,,,MELPIAPIGRIIKDAGAERVSDDARITLAKILEEMGRDIASEAIKL...,


In [9]:
df[df["organism"].str.contains("Methanothermus")]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
4,AAA72080.1,Nucleosomal,,,,2180.0,Methanothermus fervidus,Methanobacteriota,Methanobacteria,,,MELPIAPIGRIIKDAGAERVSDDARITLAKILEEMGRDIASEAIKL...,
7,AAA73366.1,Nucleosomal,,,,2180.0,Methanothermus fervidus,Methanobacteriota,Methanobacteria,,,MGELPIAPIGRIIKNAGAERVSDDARIALAKVLEEMGEEIASEAVK...,
159,ADP77717.1,Nucleosomal,,,,523846.0,Methanothermus fervidus DSM 2088,Methanobacteriota,Methanobacteria,,,MGELPIAPIGRIIKNAGAERVSDDARIALAKVLEEMGEEIASEAVK...,
160,ADP77742.1,Coiled-coil,,,,523846.0,Methanothermus fervidus DSM 2088,Methanobacteriota,Methanobacteria,,,MEEKLPFAKAEVVRLMRKYLDDDKMIRERVKIEMNKFLGEIVKNIC...,
161,ADP77985.1,Nucleosomal,,,,523846.0,Methanothermus fervidus DSM 2088,Methanobacteriota,Methanobacteria,,,MELPIAPIGRIIKDAGAERVSDDARITLAKILEEMGRDIASEAIKL...,


# Add HMfA and HMfB

## HMfA_(Methanobacteriales)

In [20]:
data_histone = {
    "id": "HMfA",
    "level": "variant",
    "taxonomic_span": "Methanothermus fervidus",
    "taxonomic_span_id": "523846",
    "description": None,
    "parent": "Nucleosomal",
}
cursor.execute(add_histone, data_histone)

# Make sure data is committed to the database
conn.commit()

In [21]:
query = "SELECT * FROM histone"
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
histone_df[histone_df["id"] == "HMfA"]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
193,HMfA,variant,Methanothermus fervidus,523846,,Nucleosomal


In [22]:
query = f"UPDATE histone SET id='HMfA_(Methanobacteriales)', taxonomic_span='Methanobacteriales', taxonomic_span_id=2158 WHERE id='HMfA'"
cursor.execute(query)

query = "SELECT * FROM histone"
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
histone_df[histone_df["id"] == "HMfA_(Methanobacteriales)"]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
193,HMfA_(Methanobacteriales),variant,Methanobacteriales,2158,,Nucleosomal


In [23]:
histone_description_summary = "HMfA_(Methanobacteriales) is a group of histones from various species of Methanobacteriales. These histones are homologs for the HMfA histone of Methanothermus fervidus [stevens_histone_2020]."
data_histone_description = (histone_description_summary,) + (None,) * 13
cursor.execute(add_histone_description, data_histone_description)

histone_description_id = cursor.lastrowid
query = f"UPDATE histone SET description={histone_description_id} WHERE id='HMfA_(Methanobacteriales)'"
cursor.execute(query)

# Make sure data is committed to the database
conn.commit()

In [24]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    "WHERE h.id='HMfA_(Methanobacteriales)'"
)
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,HMfA_(Methanobacteriales),variant,Methanobacteriales,2158,223,Nucleosomal,223,HMfA_(Methanobacteriales) is a group of histon...,,,...,,,,,,,,,,


## HMfA_(Methanothermus_fervidus)

In [27]:
data_histone = {
    "id": "HMfA_(Methanothermus_fervidus)",
    "level": "variant",
    "taxonomic_span": "Methanothermus fervidus DSM 2088",
    "taxonomic_span_id": "523846",
    "description": None,
    "parent": "HMfA_(Methanobacteriales)",
}
cursor.execute(add_histone, data_histone)

# Make sure data is committed to the database
conn.commit()

In [28]:
query = "SELECT * FROM histone"
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
histone_df[histone_df["id"] == "HMfA_(Methanothermus_fervidus)"]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
194,HMfA_(Methanothermus_fervidus),variant,Methanothermus fervidus DSM 2088,523846,,HMfA_(Methanobacteriales)


In [29]:
histone_description_summary = "HMfA_(Methanothermus_fervidus) are histones encoded by the HMfA gene of Methanothermus fervidus [stevens_histone_2020]."
data_histone_description = (histone_description_summary,) + (None,) * 13
cursor.execute(add_histone_description, data_histone_description)

histone_description_id = cursor.lastrowid
query = f"UPDATE histone SET description={histone_description_id} WHERE id='HMfA_(Methanothermus_fervidus)'"
cursor.execute(query)

# Make sure data is committed to the database
conn.commit()

In [30]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    "WHERE h.id='HMfA_(Methanothermus_fervidus)'"
)
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,HMfA_(Methanothermus_fervidus),variant,Methanothermus fervidus DSM 2088,523846,224,HMfA_(Methanobacteriales),224,HMfA_(Methanothermus_fervidus) are histones en...,,,...,,,,,,,,,,


## HMfB_(Methanobacteriales)

In [32]:
data_histone = {
    "id": "HMfB_(Methanobacteriales)",
    "level": "variant",
    "taxonomic_span": "Methanothermus fervidus",
    "taxonomic_span_id": "523846",
    "description": None,
    "parent": "Nucleosomal",
}
cursor.execute(add_histone, data_histone)

# Make sure data is committed to the database
conn.commit()

In [33]:
query = "SELECT * FROM histone"
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
histone_df[histone_df["id"] == "HMfB_(Methanobacteriales)"]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
195,HMfB_(Methanobacteriales),variant,Methanothermus fervidus,523846,,Nucleosomal


In [34]:
histone_description_summary = "HMfB_(Methanobacteriales) is a group of histones from various species of Methanobacteriales. These histones are homologs for the HMfB histone of Methanothermus fervidus [stevens_histone_2020]."
data_histone_description = (histone_description_summary,) + (None,) * 13
cursor.execute(add_histone_description, data_histone_description)

histone_description_id = cursor.lastrowid
query = f"UPDATE histone SET description={histone_description_id} WHERE id='HMfB_(Methanobacteriales)'"
cursor.execute(query)

# Make sure data is committed to the database
conn.commit()

In [35]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    "WHERE h.id='HMfB_(Methanobacteriales)'"
)
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,HMfB_(Methanobacteriales),variant,Methanothermus fervidus,523846,225,Nucleosomal,225,HMfB_(Methanobacteriales) is a group of histon...,,,...,,,,,,,,,,


## HMfB_(Methanothermus_fervidus)

In [36]:
data_histone = {
    "id": "HMfB_(Methanothermus_fervidus)",
    "level": "variant",
    "taxonomic_span": "Methanothermus fervidus DSM 2088",
    "taxonomic_span_id": "523846",
    "description": None,
    "parent": "HMfB_(Methanobacteriales)",
}
cursor.execute(add_histone, data_histone)

# Make sure data is committed to the database
conn.commit()

In [37]:
query = "SELECT * FROM histone"
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
histone_df[histone_df["id"] == "HMfB_(Methanothermus_fervidus)"]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
196,HMfB_(Methanothermus_fervidus),variant,Methanothermus fervidus DSM 2088,523846,,HMfB_(Methanobacteriales)


In [38]:
histone_description_summary = "HMfB_(Methanothermus_fervidus) are histones encoded by the HMfB gene of Methanothermus fervidus [stevens_histone_2020]."
data_histone_description = (histone_description_summary,) + (None,) * 13
cursor.execute(add_histone_description, data_histone_description)

histone_description_id = cursor.lastrowid
query = f"UPDATE histone SET description={histone_description_id} WHERE id='HMfB_(Methanothermus_fervidus)'"
cursor.execute(query)

# Make sure data is committed to the database
conn.commit()

In [39]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    "WHERE h.id='HMfB_(Methanothermus_fervidus)'"
)
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,HMfB_(Methanothermus_fervidus),variant,Methanothermus fervidus DSM 2088,523846,226,HMfB_(Methanobacteriales),226,HMfB_(Methanothermus_fervidus) are histones en...,,,...,,,,,,,,,,


## Correct variant for ADP77717.1 and ADP77985.1

Последовательности ADP77717.1 и ADP77985.1 являются HMfA и HMfB, соответственной (см. [статью](https://www.pnas.org/content/117/52/33384)).

In [40]:
query = "SELECT * FROM sequence "
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["accession"].str.contains("ADP77717.1")]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
159,ADP77717.1,Nucleosomal,,,,523846.0,Methanothermus fervidus DSM 2088,Methanobacteriota,Methanobacteria,,,MGELPIAPIGRIIKNAGAERVSDDARIALAKVLEEMGEEIASEAVK...,


In [41]:
query = f"UPDATE sequence SET variant='HMfA_(Methanothermus_fervidus)' WHERE accession='ADP77717.1'"
cursor.execute(query)

query = f"UPDATE sequence SET variant='HMfB_(Methanothermus_fervidus)' WHERE accession='ADP77985.1'"
cursor.execute(query)

In [42]:
query = "SELECT * FROM sequence "
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["accession"].isin(["ADP77717.1", "ADP77985.1"])]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
159,ADP77717.1,HMfA_(Methanothermus_fervidus),,,,523846.0,Methanothermus fervidus DSM 2088,Methanobacteriota,Methanobacteria,,,MGELPIAPIGRIIKNAGAERVSDDARIALAKVLEEMGEEIASEAVK...,
161,ADP77985.1,HMfB_(Methanothermus_fervidus),,,,523846.0,Methanothermus fervidus DSM 2088,Methanobacteriota,Methanobacteria,,,MELPIAPIGRIIKDAGAERVSDDARITLAKILEEMGRDIASEAIKL...,


In [43]:
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["accession"].isin(["ADP77717.1", "ADP77985.1"])]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
163,ADP77717.1,HMfA_(Methanothermus_fervidus),,,,523846.0,Methanothermus fervidus DSM 2088,Methanobacteriota,Methanobacteria,,,MGELPIAPIGRIIKNAGAERVSDDARIALAKVLEEMGEEIASEAVK...,,ADP77717.1,schwab_histones_2024
165,ADP77985.1,HMfB_(Methanothermus_fervidus),,,,523846.0,Methanothermus fervidus DSM 2088,Methanobacteriota,Methanobacteria,,,MELPIAPIGRIIKDAGAERVSDDARITLAKILEEMGRDIASEAIKL...,,ADP77985.1,schwab_histones_2024


In [45]:
pid = "stevens_histone_2020"
query = f"SELECT * FROM publication WHERE id='{pid}'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year


In [46]:
data_publication = {
    "id": pid,
    "title": "Histone variants in archaea and the evolution of combinatorial chromatin complexity",
    "doi": "10.1073/pnas.2007056117",
    "author": None,
    "year": "2020",
}
cursor.execute(add_publication, data_publication)

In [47]:
for acc in ["ADP77717.1", "ADP77985.1"]:
    cursor.execute(add_sequence_has_publication, (acc, pid))

In [48]:
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["accession"].isin(["ADP77717.1", "ADP77985.1"])]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
163,ADP77717.1,HMfA_(Methanothermus_fervidus),,,,523846.0,Methanothermus fervidus DSM 2088,Methanobacteriota,Methanobacteria,,,MGELPIAPIGRIIKNAGAERVSDDARIALAKVLEEMGEEIASEAVK...,,ADP77717.1,schwab_histones_2024
164,ADP77717.1,HMfA_(Methanothermus_fervidus),,,,523846.0,Methanothermus fervidus DSM 2088,Methanobacteriota,Methanobacteria,,,MGELPIAPIGRIIKNAGAERVSDDARIALAKVLEEMGEEIASEAVK...,,ADP77717.1,stevens_histone_2020
166,ADP77985.1,HMfB_(Methanothermus_fervidus),,,,523846.0,Methanothermus fervidus DSM 2088,Methanobacteriota,Methanobacteria,,,MELPIAPIGRIIKDAGAERVSDDARITLAKILEEMGRDIASEAIKL...,,ADP77985.1,schwab_histones_2024
167,ADP77985.1,HMfB_(Methanothermus_fervidus),,,,523846.0,Methanothermus fervidus DSM 2088,Methanobacteriota,Methanobacteria,,,MELPIAPIGRIIKDAGAERVSDDARITLAKILEEMGRDIASEAIKL...,,ADP77985.1,stevens_histone_2020


In [49]:
# Make sure data is committed to the database
conn.commit()

# Add WP_013413995 and WP_013414263

These seqs from [article](https://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.1007582)

In [52]:
query = "SELECT * FROM sequence "
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["accession"].isin(["WP_013413995", "WP_013414263"])]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration


## Add WP_013413995 as HMfA_(Methanothermus_fervidus)

In [54]:
accession = "WP_013413995"

In [55]:
with Entrez.efetch(db="protein", id=accession, rettype="gb", retmode="text") as handle:
    record = SeqIO.read(handle, "genbank")
    print(record)
print(record.seq)

ID: WP_013413995.1
Name: WP_013413995
Description: histone HmfA [Methanothermus fervidus]
Number of features: 4
/topology=linear
/data_file_division=BCT
/date=22-OCT-2023
/accessions=['WP_013413995']
/sequence_version=1
/keywords=['RefSeq']
/source=Methanothermus fervidus
/organism=Methanothermus fervidus
/taxonomy=['Archaea', 'Euryarchaeota', 'Methanomada group', 'Methanobacteria', 'Methanobacteriales', 'Methanothermaceae', 'Methanothermus']
/references=[Reference(title='Specific DNA binding of archaeal histones HMfA and HMfB', ...), Reference(title='Crystal structures of recombinant histones HMfA and HMfB from the hyperthermophilic archaeon Methanothermus fervidus', ...), Reference(title='HMf, a DNA-binding protein isolated from the hyperthermophilic archaeon Methanothermus fervidus, is most closely related to histones', ...)]
/comment=REFSEQ: This record represents a single, non-redundant, protein
sequence which may be annotated on many different RefSeq genomes
from the same, or dif

In [62]:
record.id

'WP_013413995.1'

In [57]:
taxonomy_data = get_taxonomy_data(record)
data_sequence = {
    "accession": accession,
    "variant": "HMfA_(Methanothermus_fervidus)",
    "gi": None,
    "ncbi_gene_id": None,
    "hgnc_gene_name": None,
    "taxonomy_id": None,
    "organism": None,
    "phylum": None,
    "class": None,
    "taxonomy_group": None,
    "info": None,
    "sequence": str(record.seq),
    "variant_under_consideration": None,
}
data_sequence.update(taxonomy_data)
for k, v in data_sequence.items():
    print(k, v, type(v))

Fetched taxid from NCBI 2180
accession WP_013413995 <class 'str'>
variant HMfA_(Methanothermus_fervidus) <class 'str'>
gi None <class 'NoneType'>
ncbi_gene_id None <class 'NoneType'>
hgnc_gene_name None <class 'NoneType'>
taxonomy_id 2180 <class 'int'>
organism Methanothermus fervidus <class 'str'>
phylum Methanobacteriota <class 'str'>
class Methanobacteria <class 'str'>
taxonomy_group None <class 'NoneType'>
info None <class 'NoneType'>
sequence MGELPIAPIGRIIKNAGAERVSDDARIALAKVLEEMGEEIASEAVKLAKHAGRKTIKAEDIELARKMFK <class 'str'>
variant_under_consideration None <class 'NoneType'>


In [60]:
cursor.execute(add_sequence, data_sequence)

In [63]:
query = f"UPDATE sequence SET accession='{record.id}' WHERE accession='WP_013413995'"
cursor.execute(query)

In [65]:
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["accession"].str.contains("WP_013413995")]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
5002,WP_013413995.1,HMfA_(Methanothermus_fervidus),,,,2180.0,Methanothermus fervidus,Methanobacteriota,Methanobacteria,,,MGELPIAPIGRIIKNAGAERVSDDARIALAKVLEEMGEEIASEAVK...,,,


In [66]:
pid = "henneman_structure_2018"
query = f"SELECT * FROM publication WHERE id='{pid}'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year


In [67]:
data_publication = {
    "id": pid,
    "title": "Structure and function of archaeal histones",
    "doi": "10.1371/journal.pgen.1007582",
    "author": None,
    "year": "2018",
}
cursor.execute(add_publication, data_publication)

In [68]:
cursor.execute(add_sequence_has_publication, ("WP_013413995.1", pid))

In [69]:
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["accession"].str.contains("WP_013413995")]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
5002,WP_013413995.1,HMfA_(Methanothermus_fervidus),,,,2180.0,Methanothermus fervidus,Methanobacteriota,Methanobacteria,,,MGELPIAPIGRIIKNAGAERVSDDARIALAKVLEEMGEEIASEAVK...,,WP_013413995.1,henneman_structure_2018


In [70]:
# Make sure data is committed to the database
conn.commit()

## Add WP_013414263 as HMfB_(Methanothermus_fervidus)

In [71]:
accession = "WP_013414263"

In [72]:
with Entrez.efetch(db="protein", id=accession, rettype="gb", retmode="text") as handle:
    record = SeqIO.read(handle, "genbank")
    print(record)
print(record.seq)

ID: WP_013414263.1
Name: WP_013414263
Description: histone HmfB [Methanothermus fervidus]
Number of features: 4
/topology=linear
/data_file_division=BCT
/date=22-OCT-2023
/accessions=['WP_013414263']
/sequence_version=1
/keywords=['RefSeq']
/source=Methanothermus fervidus
/organism=Methanothermus fervidus
/taxonomy=['Archaea', 'Euryarchaeota', 'Methanomada group', 'Methanobacteria', 'Methanobacteriales', 'Methanothermaceae', 'Methanothermus']
/references=[Reference(title='Specific DNA binding of archaeal histones HMfA and HMfB', ...), Reference(title='Crystal structures of recombinant histones HMfA and HMfB from the hyperthermophilic archaeon Methanothermus fervidus', ...), Reference(title='NMR structure of HMfB from the hyperthermophile, Methanothermus fervidus, confirms that this archaeal protein is a histone', ...), Reference(title='HMf, a DNA-binding protein isolated from the hyperthermophilic archaeon Methanothermus fervidus, is most closely related to histones', ...)]
/comment=RE

In [73]:
accession = record.id

In [75]:
taxonomy_data = get_taxonomy_data(record)
data_sequence = {
    "accession": accession,
    "variant": "HMfB_(Methanothermus_fervidus)",
    "gi": None,
    "ncbi_gene_id": None,
    "hgnc_gene_name": None,
    "taxonomy_id": None,
    "organism": None,
    "phylum": None,
    "class": None,
    "taxonomy_group": None,
    "info": None,
    "sequence": str(record.seq),
    "variant_under_consideration": None,
}
data_sequence.update(taxonomy_data)
for k, v in data_sequence.items():
    print(k, v, type(v))

Fetched taxid from NCBI 2180
accession WP_013414263.1 <class 'str'>
variant HMfB_(Methanothermus_fervidus) <class 'str'>
gi None <class 'NoneType'>
ncbi_gene_id None <class 'NoneType'>
hgnc_gene_name None <class 'NoneType'>
taxonomy_id 2180 <class 'int'>
organism Methanothermus fervidus <class 'str'>
phylum Methanobacteriota <class 'str'>
class Methanobacteria <class 'str'>
taxonomy_group None <class 'NoneType'>
info None <class 'NoneType'>
sequence MELPIAPIGRIIKDAGAERVSDDARITLAKILEEMGRDIASEAIKLARHAGRKTIKAEDIELAVRRFKK <class 'str'>
variant_under_consideration None <class 'NoneType'>


In [76]:
cursor.execute(add_sequence, data_sequence)

In [77]:
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["accession"] == accession]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
5003,WP_013414263.1,HMfB_(Methanothermus_fervidus),,,,2180.0,Methanothermus fervidus,Methanobacteriota,Methanobacteria,,,MELPIAPIGRIIKDAGAERVSDDARITLAKILEEMGRDIASEAIKL...,,,


In [78]:
pid = "henneman_structure_2018"
query = f"SELECT * FROM publication WHERE id='{pid}'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,henneman_structure_2018,Structure and function of archaeal histones,10.1371/journal.pgen.1007582,,2018


In [79]:
cursor.execute(add_sequence_has_publication, (accession, pid))

In [80]:
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["accession"] == accession]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
5003,WP_013414263.1,HMfB_(Methanothermus_fervidus),,,,2180.0,Methanothermus fervidus,Methanobacteriota,Methanobacteria,,,MELPIAPIGRIIKDAGAERVSDDARITLAKILEEMGRDIASEAIKL...,,WP_013414263.1,henneman_structure_2018


In [81]:
# Make sure data is committed to the database
conn.commit()

## Correct references

In [84]:
accessions = ["WP_013413995.1", "WP_013414263.1"]

In [85]:
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
5002,WP_013413995.1,HMfA_(Methanothermus_fervidus),,,,2180.0,Methanothermus fervidus,Methanobacteriota,Methanobacteria,,,MGELPIAPIGRIIKNAGAERVSDDARIALAKVLEEMGEEIASEAVK...,,WP_013413995.1,henneman_structure_2018
5003,WP_013414263.1,HMfB_(Methanothermus_fervidus),,,,2180.0,Methanothermus fervidus,Methanobacteriota,Methanobacteria,,,MELPIAPIGRIIKDAGAERVSDDARITLAKILEEMGRDIASEAIKL...,,WP_013414263.1,henneman_structure_2018


In [87]:
query = (
    f"DELETE FROM sequence_has_publication WHERE sequence_accession='WP_013413995.1'"
)
cursor.execute(query)

query = (
    f"DELETE FROM sequence_has_publication WHERE sequence_accession='WP_013414263.1'"
)
cursor.execute(query)

In [88]:
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
5002,WP_013413995.1,HMfA_(Methanothermus_fervidus),,,,2180.0,Methanothermus fervidus,Methanobacteriota,Methanobacteria,,,MGELPIAPIGRIIKNAGAERVSDDARIALAKVLEEMGEEIASEAVK...,,,
5003,WP_013414263.1,HMfB_(Methanothermus_fervidus),,,,2180.0,Methanothermus fervidus,Methanobacteriota,Methanobacteria,,,MELPIAPIGRIIKDAGAERVSDDARITLAKILEEMGRDIASEAIKL...,,,


In [89]:
pid = "mattiroli_structure_2017"
query = f"SELECT * FROM publication WHERE id='{pid}'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year


In [90]:
data_publication = {
    "id": pid,
    "title": "Structure of Histone-based Chromatin in Archaea",
    "doi": "10.1126/science.aaj1849",
    "author": None,
    "year": "2017",
}
cursor.execute(add_publication, data_publication)

In [91]:
for acc in accessions:
    cursor.execute(add_sequence_has_publication, (acc, pid))

In [92]:
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
5002,WP_013413995.1,HMfA_(Methanothermus_fervidus),,,,2180.0,Methanothermus fervidus,Methanobacteriota,Methanobacteria,,,MGELPIAPIGRIIKNAGAERVSDDARIALAKVLEEMGEEIASEAVK...,,WP_013413995.1,mattiroli_structure_2017
5003,WP_013414263.1,HMfB_(Methanothermus_fervidus),,,,2180.0,Methanothermus fervidus,Methanobacteriota,Methanobacteria,,,MELPIAPIGRIIKDAGAERVSDDARITLAKILEEMGRDIASEAIKL...,,WP_013414263.1,mattiroli_structure_2017


In [93]:
# Make sure data is committed to the database
conn.commit()

# Thermococcus

In [110]:
query = "SELECT * FROM sequence "
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["organism"].str.contains("Thermococcus")]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
11,AAB53861.1,Nucleosomal,,,,1151117.0,Thermococcus zilligii AN1,Methanobacteriota,Thermococci,,,MAELPIAPIDRLIRKAGAERVSEDAAKALAEYLEEYAIEVGKKATE...,
93,ACJ15670.1,Nucleosomal,,,,523850.0,Thermococcus onnurineus NA1,Methanobacteriota,Thermococci,,,MAELPIAPIDRLIRKAGAERVSEEAAKVLAEYLEEYAMDLAKRAAE...,
94,ACJ16232.1,FtF,,,,523850.0,Thermococcus onnurineus NA1,Methanobacteriota,Thermococci,,,MAEMIVKSKVKEAVKAIDPEMRINPEFYEALEAEIKILIEKAVKRA...,
95,ACJ16723.1,Nucleosomal,,,,523850.0,Thermococcus onnurineus NA1,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAARVSEDAAKLLAEHLEEKALEIAKKAVD...,
105,ACS32979.1,Nucleosomal,,,,593117.0,Thermococcus gammatolerans EJ3,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAARVSEEAAKVLAEHLEEKAIEIAKKAVE...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3764,QEK14533.1,Nucleosomal,,,,2598455.0,Thermococcus aciditolerans,Methanobacteriota,Thermococci,,,MAELPIAPIDRLIRKAGAERVSEDAAKVLAEYLEEYAIELARKSAD...,
3765,QEK15571.1,Nucleosomal,,,,2598455.0,Thermococcus aciditolerans,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAARVSEDAAKLLAEHLEEKAMEIARKAVD...,
4414,SEV81931.1,Nucleosomal,,,,277988.0,Thermococcus thioreducens,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAARVSEDAAKLLAEHLEEKALEIAKKAVD...,
4415,SEV88938.1,FtF,,,,277988.0,Thermococcus thioreducens,Methanobacteriota,Thermococci,,,MAELIVKSKVKEAVKAIEPEMRVNPEFYEALEAEIKALIEKAVKRA...,


In [111]:
df_therm = df[df["organism"].str.contains("Thermococcus")]

In [112]:
df_therm["variant"].value_counts()

variant
Nucleosomal    91
FtF            41
Name: count, dtype: int64

In [113]:
df_therm[df_therm["organism"].str.contains("TS600")]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration


# Add HTkA and HTkB

## HTkA_(Thermococcales)

In [94]:
data_histone = {
    "id": "HTkA_(Thermococcales)",
    "level": "variant",
    "taxonomic_span": "Thermococcales",
    "taxonomic_span_id": "2258",
    "description": None,
    "parent": "Nucleosomal",
}
cursor.execute(add_histone, data_histone)

# Make sure data is committed to the database
conn.commit()

In [95]:
query = "SELECT * FROM histone"
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
histone_df[histone_df["id"] == "HTkA_(Thermococcales)"]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
197,HTkA_(Thermococcales),variant,Thermococcales,2258,,Nucleosomal


In [96]:
histone_description_summary = "HTkA_(Thermococcales) is a group of histones from various species of Thermococcales. These histones are homologs for the HTkA histone of Thermococcus kodakarensis [stevens_deep_2022]."
data_histone_description = (histone_description_summary,) + (None,) * 13
cursor.execute(add_histone_description, data_histone_description)

histone_description_id = cursor.lastrowid
query = f"UPDATE histone SET description={histone_description_id} WHERE id='HTkA_(Thermococcales)'"
cursor.execute(query)

# Make sure data is committed to the database
conn.commit()

In [97]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    "WHERE h.id='HTkA_(Thermococcales)'"
)
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,HTkA_(Thermococcales),variant,Thermococcales,2258,227,Nucleosomal,227,HTkA_(Thermococcales) is a group of histones f...,,,...,,,,,,,,,,


## HTkA_(Thermococcus_kodakarensis)

In [98]:
data_histone = {
    "id": "HTkA_(Thermococcus_kodakarensis)",
    "level": "variant",
    "taxonomic_span": "Thermococcus kodakarensis",
    "taxonomic_span_id": "311400 ",
    "description": None,
    "parent": "HTkA_(Thermococcales)",
}
cursor.execute(add_histone, data_histone)

# Make sure data is committed to the database
conn.commit()

In [99]:
query = "SELECT * FROM histone"
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
histone_df[histone_df["id"] == "HTkA_(Thermococcus_kodakarensis)"]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
198,HTkA_(Thermococcus_kodakarensis),variant,Thermococcus kodakarensis,311400,,HTkA_(Thermococcales)


In [100]:
histone_description_summary = "HTkA_(Thermococcus_kodakarensis) are histones encoded by the HTkA gene of Thermococcus_kodakarensis [stevens_deep_2022]."
data_histone_description = (histone_description_summary,) + (None,) * 13
cursor.execute(add_histone_description, data_histone_description)

histone_description_id = cursor.lastrowid
query = f"UPDATE histone SET description={histone_description_id} WHERE id='HTkA_(Thermococcus_kodakarensis)'"
cursor.execute(query)

# Make sure data is committed to the database
conn.commit()

In [101]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    "WHERE h.id='HTkA_(Thermococcus_kodakarensis)'"
)
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,HTkA_(Thermococcus_kodakarensis),variant,Thermococcus kodakarensis,311400,228,HTkA_(Thermococcales),228,HTkA_(Thermococcus_kodakarensis) are histones ...,,,...,,,,,,,,,,


## HTkB_(Thermococcales)

In [102]:
data_histone = {
    "id": "HTkB_(Thermococcales)",
    "level": "variant",
    "taxonomic_span": "Thermococcales",
    "taxonomic_span_id": "2258",
    "description": None,
    "parent": "Nucleosomal",
}
cursor.execute(add_histone, data_histone)

# Make sure data is committed to the database
conn.commit()

In [103]:
query = "SELECT * FROM histone"
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
histone_df[histone_df["id"] == "HTkB_(Thermococcales)"]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
199,HTkB_(Thermococcales),variant,Thermococcales,2258,,Nucleosomal


In [104]:
histone_description_summary = "HTkB_(Thermococcales) is a group of histones from various species of Thermococcales. These histones are homologs for the HTkB histone of Thermococcus kodakarensis [stevens_deep_2022]."
data_histone_description = (histone_description_summary,) + (None,) * 13
cursor.execute(add_histone_description, data_histone_description)

histone_description_id = cursor.lastrowid
query = f"UPDATE histone SET description={histone_description_id} WHERE id='HTkB_(Thermococcales)'"
cursor.execute(query)

# Make sure data is committed to the database
conn.commit()

In [105]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    "WHERE h.id='HTkB_(Thermococcales)'"
)
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,HTkB_(Thermococcales),variant,Thermococcales,2258,229,Nucleosomal,229,HTkB_(Thermococcales) is a group of histones f...,,,...,,,,,,,,,,


## HTkB_(Thermococcus_kodakarensis)

In [106]:
data_histone = {
    "id": "HTkB_(Thermococcus_kodakarensis)",
    "level": "variant",
    "taxonomic_span": "Thermococcus kodakarensis",
    "taxonomic_span_id": "311400",
    "description": None,
    "parent": "HTkB_(Thermococcales)",
}
cursor.execute(add_histone, data_histone)

# Make sure data is committed to the database
conn.commit()

In [107]:
query = "SELECT * FROM histone"
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
histone_df[histone_df["id"] == "HTkB_(Thermococcus_kodakarensis)"]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
200,HTkB_(Thermococcus_kodakarensis),variant,Thermococcus kodakarensis,311400,,HTkB_(Thermococcales)


In [108]:
histone_description_summary = "HTkB_(Thermococcus_kodakarensis) are histones encoded by the HTkB gene of Thermococcus kodakarensis [stevens_deep_2022]."
data_histone_description = (histone_description_summary,) + (None,) * 13
cursor.execute(add_histone_description, data_histone_description)

histone_description_id = cursor.lastrowid
query = f"UPDATE histone SET description={histone_description_id} WHERE id='HTkB_(Thermococcus_kodakarensis)'"
cursor.execute(query)

# Make sure data is committed to the database
conn.commit()

In [109]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    "WHERE h.id='HTkB_(Thermococcus_kodakarensis)'"
)
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,HTkB_(Thermococcus_kodakarensis),variant,Thermococcus kodakarensis,311400,230,HTkB_(Thermococcales),230,HTkB_(Thermococcus_kodakarensis) are histones ...,,,...,,,,,,,,,,


In [None]:
pid = "stevens_deep_2022"

# Close connections

In [114]:
cursor.close()
conn.close()
tunnel.stop()