In [1]:
import pandas as pd
from Bio import Entrez, SeqIO
from mysql.connector import connection
from sshtunnel import SSHTunnelForwarder

# Contact address Biopython attaches to the NCBI Entrez requests made below.
Entrez.email = "l.singh@intbio.org"

In [2]:
# Parse the KEY=VALUE settings file that describes the SSH jump host and the
# database endpoint. Blank lines and lines starting with "#" are ignored.
config = {}
with open("db_curated_server_info.txt", "r") as file:
    for line_number, raw_line in enumerate(file, start=1):
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue
        key, separator, value = line.partition("=")
        if not separator:
            raise ValueError(
                f"db_curated_server_info.txt:{line_number}: expected KEY=VALUE, got {line!r}"
            )
        # Strip the key as well as the value so "key = value" is accepted too.
        config[key.strip()] = value.strip()

# Fail early with a readable message instead of a TypeError from int(None).
# Key spellings (including "srever_port" and "db_adress") are kept exactly as
# the notebook has always looked them up, so existing settings files still work.
required_keys = (
    "server_name",
    "srever_port",
    "ssh_password",
    "ssh_username",
    "db_adress",
    "db_port",
)
missing_keys = [name for name in required_keys if name not in config]
if missing_keys:
    raise KeyError(
        "db_curated_server_info.txt is missing settings: " + ", ".join(missing_keys)
    )

server_name = config.get("server_name")
srever_port = int(config.get("srever_port"))
ssh_password = config.get("ssh_password")
ssh_username = config.get("ssh_username")
db_adress = config.get("db_adress")
db_port = int(config.get("db_port"))

In [3]:
# Open an SSH tunnel to the database host and report the local port that the
# forwarder bound, which the MySQL connection uses afterwards.
ssh_endpoint = (server_name, srever_port)
database_endpoint = (db_adress, db_port)
tunnel = SSHTunnelForwarder(
    ssh_endpoint,
    ssh_username=ssh_username,
    ssh_password=ssh_password,
    remote_bind_address=database_endpoint,
)
tunnel.start()
print(tunnel.local_bind_port)

36799


In [4]:
# Connect to MySQL through the local end of the SSH tunnel and open the cursor
# that every following cell reuses.
# NOTE(review): user, password and database are literal strings here; they look
# like placeholders — confirm, and prefer reading real values from the settings
# file or the environment instead of storing them in the notebook.
conn = connection.MySQLConnection(
    user="db_user",
    password="db_password",
    host="localhost",
    port=tunnel.local_bind_port,
    database="db_name",
)
cursor = conn.cursor()

In [5]:
# List the tables of the connected database as a quick connectivity check.
query = "SHOW TABLES;"
cursor.execute(query)
tables = cursor.fetchall()
tables

[('alternative_name',),
 ('histone',),
 ('histone_description',),
 ('histone_has_publication',),
 ('publication',),
 ('sequence',),
 ('sequence_has_publication',)]

In [6]:
# Parameterized INSERT templates used by this notebook. The column tuples are
# the single source for both the column list and the named placeholders.
publication_columns = ("id", "title", "doi", "author", "year")
add_publication = "INSERT INTO publication ({}) VALUES ({})".format(
    ", ".join(publication_columns),
    ", ".join(f"%({column})s" for column in publication_columns),
)

sequence_columns = (
    "accession",
    "variant",
    "gi",
    "ncbi_gene_id",
    "hgnc_gene_name",
    "taxonomy_id",
    "organism",
    "phylum",
    "class",
    "taxonomy_group",
    "info",
    "sequence",
    "variant_under_consideration",
)
add_sequence = "INSERT INTO sequence ({}) VALUES ({})".format(
    ", ".join(sequence_columns),
    ", ".join(f"%({column})s" for column in sequence_columns),
)

# Positional placeholders: (sequence_accession, publication_id).
add_sequence_has_publication = " ".join(
    [
        "INSERT INTO sequence_has_publication",
        "(sequence_accession, publication_id)",
        "VALUES (%s, %s)",
    ]
)

# Templates for the tables this notebook does not write to, kept disabled for
# reference.
# add_histone = (
#     "INSERT INTO histone "
#     "(id, level, taxonomic_span, taxonomic_span_id, description, parent) "
#     "VALUES (%(id)s, %(level)s, %(taxonomic_span)s, %(taxonomic_span_id)s, %(description)s, %(parent)s)"
# )
# add_histone_description = (
#     "INSERT INTO histone_description "
#     "(summary, taxonomy, genes, evolution, expression, knock_out, function, sequence, localization, deposition, structure, interactions, disease, caveats) "
#     "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
# )
# add_alternate_names = (
#     "INSERT INTO alternative_name "
#     "(name, taxonomy, gene, splice, histone) "
#     "VALUES (%(name)s, %(taxonomy)s, %(gene)s, %(splice)s, %(histone)s)"
# )
# add_histone_has_publication = (
#     "INSERT INTO histone_has_publication "
#     "(histone_id, publication_id) "
#     "VALUES (%s, %s)"
# )

In [7]:
def get_taxonomy_data(record):
    """Collect organism, NCBI taxonomy id, phylum and class for a sequence record.

    Args:
        record: Object with an ``annotations`` mapping containing "organism"
            and a ``features`` list whose first element exposes
            ``qualifiers["db_xref"]`` (the notebook passes records parsed by
            ``SeqIO.read(..., "genbank")``).

    Returns:
        dict with the keys "organism", "taxonomy_id", "phylum" and "class".
        "taxonomy_id" falls back to 1 when no "taxon:<id>" cross-reference can
        be read. "phylum" and "class" are plain ``str`` values, or ``None`` when
        the lineage lacks that rank or NCBI could not be queried.

    Raises:
        KeyError: if ``record.annotations`` has no "organism" entry.
    """
    import re

    max_attempts = 10
    taxonomy_data = {"organism": record.annotations["organism"]}

    # Cross-references look like "<database>:<identifier>"; only "taxon" matters.
    try:
        db_xrefs = record.features[0].qualifiers["db_xref"]
    except (IndexError, KeyError, AttributeError, TypeError):
        db_xrefs = []

    xref_pattern = re.compile(r"(\S+):(\S+)")
    taxonomy_id = None
    for xref in db_xrefs:
        match = xref_pattern.search(str(xref))
        if match is None or match.group(1) != "taxon":
            continue
        try:
            taxonomy_id = int(match.group(2))
        except ValueError:
            continue
        print(f"Fetched taxid from NCBI {match.group(2)}")

    # Always define the id: previously a record without any "taxon" entry left
    # it unset, which ended in a KeyError after ten pointless retries.
    if taxonomy_id is None:
        print("!!!!!!Unable to get TAXID for this record setting it to 1")
        taxonomy_id = 1  # unable to identify
    taxonomy_data["taxonomy_id"] = taxonomy_id

    lineage = {}
    for attempt in range(max_attempts):
        try:
            handle = Entrez.efetch(id=taxonomy_id, db="taxonomy", retmode="xml")
            try:
                tax_data = Entrez.read(handle)
            finally:
                # Release the connection whether or not parsing succeeded.
                handle.close()
            lineage = {
                entry["Rank"]: entry["ScientificName"]
                for entry in tax_data[0]["LineageEx"]
                if entry["Rank"] in ("class", "phylum")
            }
            break
        except Exception as error:  # KeyboardInterrupt/SystemExit pass through
            print(
                "Unexpected error: {}, Retrying, attempt {}".format(
                    type(error), attempt
                )
            )
            if attempt == max_attempts - 1:
                print(
                    f"FATAL ERROR could not get class and phylum from NCBI after 10 attempts for taxid:{taxonomy_id}. Will add None for class and phylum!"
                )

    # Convert the parsed values to plain str so they can be bound as SQL parameters.
    for rank in ("phylum", "class"):
        rank_name = lineage.get(rank)
        taxonomy_data[rank] = None if rank_name is None else str(rank_name)
    return taxonomy_data

In [55]:
# Accession of the record to fetch. It used to come only from kernel state that
# is no longer part of the notebook, so a fresh "Run All" failed with NameError.
# The value matches the accession this notebook inserts and later renames.
accession = "WP_013413995"

# Download the GenBank protein record and parse it into a single SeqRecord.
with Entrez.efetch(db="protein", id=accession, rettype="gb", retmode="text") as handle:
    record = SeqIO.read(handle, "genbank")
print(record)
print(record.seq)

ID: WP_013413995.1
Name: WP_013413995
Description: histone HmfA [Methanothermus fervidus]
Number of features: 4
/topology=linear
/data_file_division=BCT
/date=22-OCT-2023
/accessions=['WP_013413995']
/sequence_version=1
/keywords=['RefSeq']
/source=Methanothermus fervidus
/organism=Methanothermus fervidus
/taxonomy=['Archaea', 'Euryarchaeota', 'Methanomada group', 'Methanobacteria', 'Methanobacteriales', 'Methanothermaceae', 'Methanothermus']
/references=[Reference(title='Specific DNA binding of archaeal histones HMfA and HMfB', ...), Reference(title='Crystal structures of recombinant histones HMfA and HMfB from the hyperthermophilic archaeon Methanothermus fervidus', ...), Reference(title='HMf, a DNA-binding protein isolated from the hyperthermophilic archaeon Methanothermus fervidus, is most closely related to histones', ...)]
/comment=REFSEQ: This record represents a single, non-redundant, protein
sequence which may be annotated on many different RefSeq genomes
from the same, or dif

In [62]:
# Show the identifier of the parsed record.
record.id

'WP_013413995.1'

In [57]:
# Build the row for the `sequence` table: every column starts as None, the
# known values are filled in, and the taxonomy lookup overrides its own columns.
taxonomy_data = get_taxonomy_data(record)
data_sequence = dict.fromkeys(
    [
        "accession",
        "variant",
        "gi",
        "ncbi_gene_id",
        "hgnc_gene_name",
        "taxonomy_id",
        "organism",
        "phylum",
        "class",
        "taxonomy_group",
        "info",
        "sequence",
        "variant_under_consideration",
    ]
)
data_sequence["accession"] = accession
data_sequence["variant"] = "HMfA_(Methanothermus_fervidus)"
data_sequence["sequence"] = str(record.seq)
data_sequence.update(taxonomy_data)

# Print each column with its value and type before inserting.
for column, column_value in data_sequence.items():
    print(column, column_value, type(column_value))

Fetched taxid from NCBI 2180
accession WP_013413995 <class 'str'>
variant HMfA_(Methanothermus_fervidus) <class 'str'>
gi None <class 'NoneType'>
ncbi_gene_id None <class 'NoneType'>
hgnc_gene_name None <class 'NoneType'>
taxonomy_id 2180 <class 'int'>
organism Methanothermus fervidus <class 'str'>
phylum Methanobacteriota <class 'str'>
class Methanobacteria <class 'str'>
taxonomy_group None <class 'NoneType'>
info None <class 'NoneType'>
sequence MGELPIAPIGRIIKNAGAERVSDDARIALAKVLEEMGEEIASEAVKLAKHAGRKTIKAEDIELARKMFK <class 'str'>
variant_under_consideration None <class 'NoneType'>


In [60]:
# Insert the prepared row using the named-placeholder template `add_sequence`.
cursor.execute(add_sequence, data_sequence)

In [63]:
# Replace the version-less accession with the record id from the parsed record.
# Both values are bound as parameters so that text coming from the downloaded
# record is never spliced into the SQL statement.
query = "UPDATE sequence SET accession = %s WHERE accession = %s"
cursor.execute(query, (record.id, "WP_013413995"))

In [65]:
# Load every sequence row with its linked publication ids (if any), then show
# the rows whose accession contains "WP_013413995".
query = (
    "SELECT * FROM sequence s "
    "LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(rows, columns=column_names)
accession_matches = df["accession"].str.contains("WP_013413995")
df[accession_matches]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
5002,WP_013413995.1,HMfA_(Methanothermus_fervidus),,,,2180.0,Methanothermus fervidus,Methanobacteriota,Methanobacteria,,,MGELPIAPIGRIIKNAGAERVSDDARIALAKVLEEMGEEIASEAVK...,,,


In [66]:
# Check whether the publication is already registered. The id is bound as a
# query parameter instead of being formatted into the SQL text.
pid = "henneman_structure_2018"
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year


In [67]:
# Register the publication under the id stored in `pid`.
data_publication = dict(
    id=pid,
    title="Structure and function of archaeal histones",
    doi="10.1371/journal.pgen.1007582",
    author=None,
    year="2018",
)
cursor.execute(add_publication, data_publication)

In [68]:
# Link the sequence row to the publication `pid` (positional placeholders:
# sequence_accession, publication_id).
cursor.execute(add_sequence_has_publication, ("WP_013413995.1", pid))

In [69]:
# Reload sequences joined with their publications and show the rows whose
# accession contains "WP_013413995" to confirm the new link.
query = (
    "SELECT * FROM sequence s "
    "LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(rows, columns=column_names)
accession_matches = df["accession"].str.contains("WP_013413995")
df[accession_matches]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
5002,WP_013413995.1,HMfA_(Methanothermus_fervidus),,,,2180.0,Methanothermus fervidus,Methanobacteriota,Methanobacteria,,,MGELPIAPIGRIIKNAGAERVSDDARIALAKVLEEMGEEIASEAVK...,,WP_013413995.1,henneman_structure_2018


In [70]:
# Commit the open transaction so the sequence, publication and link rows
# written by the cells above are stored in the database.
conn.commit()

# Thermococcus

В базу уже добавлены 132 последовательности рода *Thermococcus*.

Некоторые последовательности попали в группу FtF. Скорее всего, это гистоны, похожие на бактериальные. В [статье](https://academic.oup.com/gbe/article/doi/10.1093/gbe/evab274/6459647) говорилось, что у представителей *Thermococcus* есть такие гистоны, поэтому это нужно будет проверить.

Гены hpkA (TK1413) и hpkB (TK2289) кодируют гистоны HTkA и HTkB, соответственно (см. [статью](https://academic.oup.com/gbe/article/doi/10.1093/gbe/evab274/6459647)). Accessions для HTkA: WP_011250364.1, BAA77575.1, BAD85602.1, CAT72447.1; для HTkB: WP_011251239.1, BAA77576.1, BAD86478.1, CAT71503.1.

**TODO:** BAD85229.1 классифицирован как FtF — проверить, верна ли эта классификация.

In [8]:
# Load the whole `sequence` table and show the rows whose organism name
# contains "Thermococcus".
query = "SELECT * FROM sequence "
cursor.execute(query)
rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(rows, columns=column_names)
is_thermococcus = df["organism"].str.contains("Thermococcus")
df[is_thermococcus]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
11,AAB53861.1,Nucleosomal,,,,1151117.0,Thermococcus zilligii AN1,Methanobacteriota,Thermococci,,,MAELPIAPIDRLIRKAGAERVSEDAAKALAEYLEEYAIEVGKKATE...,
93,ACJ15670.1,Nucleosomal,,,,523850.0,Thermococcus onnurineus NA1,Methanobacteriota,Thermococci,,,MAELPIAPIDRLIRKAGAERVSEEAAKVLAEYLEEYAMDLAKRAAE...,
94,ACJ16232.1,FtF,,,,523850.0,Thermococcus onnurineus NA1,Methanobacteriota,Thermococci,,,MAEMIVKSKVKEAVKAIDPEMRINPEFYEALEAEIKILIEKAVKRA...,
95,ACJ16723.1,Nucleosomal,,,,523850.0,Thermococcus onnurineus NA1,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAARVSEDAAKLLAEHLEEKALEIAKKAVD...,
105,ACS32979.1,Nucleosomal,,,,593117.0,Thermococcus gammatolerans EJ3,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAARVSEEAAKVLAEHLEEKAIEIAKKAVE...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3764,QEK14533.1,Nucleosomal,,,,2598455.0,Thermococcus aciditolerans,Methanobacteriota,Thermococci,,,MAELPIAPIDRLIRKAGAERVSEDAAKVLAEYLEEYAIELARKSAD...,
3765,QEK15571.1,Nucleosomal,,,,2598455.0,Thermococcus aciditolerans,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAARVSEDAAKLLAEHLEEKAMEIARKAVD...,
4414,SEV81931.1,Nucleosomal,,,,277988.0,Thermococcus thioreducens,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAARVSEDAAKLLAEHLEEKALEIAKKAVD...,
4415,SEV88938.1,FtF,,,,277988.0,Thermococcus thioreducens,Methanobacteriota,Thermococci,,,MAELIVKSKVKEAVKAIEPEMRVNPEFYEALEAEIKALIEKAVKRA...,


In [9]:
# Keep the Thermococcus subset under its own name for the checks that follow.
thermococcus_mask = df["organism"].str.contains("Thermococcus")
df_therm = df.loc[thermococcus_mask]

In [10]:
# Count how many Thermococcus sequences carry each variant label.
variant_counts = df_therm["variant"].value_counts()
variant_counts

variant
Nucleosomal    91
FtF            41
Name: count, dtype: int64

In [11]:
# Look for sequences whose organism name mentions "TS600".
mentions_ts600 = df_therm["organism"].str.contains("TS600")
df_therm[mentions_ts600]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration


In [14]:
# Show every stored sequence of Thermococcus kodakarensis.
is_kodakarensis = df_therm["organism"].str.contains("Thermococcus kodakarensis")
df_therm[is_kodakarensis]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
477,BAA77575.1,Nucleosomal,,,,69014.0,Thermococcus kodakarensis KOD1,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAERVSEDAAKVLAEYLEEYAIELSKKAVD...,
478,BAA77576.1,Nucleosomal,,,,69014.0,Thermococcus kodakarensis KOD1,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAARVSEEAAKVLAEHLEEKALEIAKKAVA...,
480,BAD85229.1,FtF,,,,69014.0,Thermococcus kodakarensis KOD1,Methanobacteriota,Thermococci,,,MAEMLVKSKVKEFVKSVDPEMRVSPEFYDALEAEVKALVEKAIKRA...,
481,BAD85602.1,Nucleosomal,,,,69014.0,Thermococcus kodakarensis KOD1,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAERVSEDAAKVLAEYLEEYAIELSKKAVD...,
482,BAD86478.1,Nucleosomal,,,,69014.0,Thermococcus kodakarensis KOD1,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAARVSEEAAKVLAEHLEEKALEIAKKAVA...,


In [12]:
# Accessions listed for the two histones, keyed by histone name; show which of
# the HTkA accessions are already present in the Thermococcus subset.
accessions = dict(
    HTkA=["WP_011250364.1", "BAA77575.1", "BAD85602.1", "CAT72447.1"],
    HTkB=["WP_011251239.1", "BAA77576.1", "BAD86478.1", "CAT71503.1"],
)
is_htka = df_therm["accession"].isin(accessions["HTkA"])
df_therm[is_htka]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
477,BAA77575.1,Nucleosomal,,,,69014.0,Thermococcus kodakarensis KOD1,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAERVSEDAAKVLAEYLEEYAIELSKKAVD...,
481,BAD85602.1,Nucleosomal,,,,69014.0,Thermococcus kodakarensis KOD1,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAERVSEDAAKVLAEYLEEYAIELSKKAVD...,


In [13]:
# Same presence check for the HTkB accessions.
is_htkb = df_therm["accession"].isin(accessions["HTkB"])
df_therm[is_htkb]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
478,BAA77576.1,Nucleosomal,,,,69014.0,Thermococcus kodakarensis KOD1,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAARVSEEAAKVLAEHLEEKALEIAKKAVA...,
482,BAD86478.1,Nucleosomal,,,,69014.0,Thermococcus kodakarensis KOD1,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAARVSEEAAKVLAEHLEEKALEIAKKAVA...,


# Correct variants for BAA77575.1 and BAD85602.1

In [15]:
# Relabel both HTkA accessions. One parameterized statement replaces the two
# copy-pasted f-strings (which contained no placeholders at all).
query = "UPDATE sequence SET variant = %s WHERE accession = %s"
for updated_accession in ("BAA77575.1", "BAD85602.1"):
    cursor.execute(query, ("HTkA_(Thermococcus_kodakarensis)", updated_accession))

In [18]:
# Reload sequences joined with their publications and show the HTkA rows.
query = (
    "SELECT * FROM sequence s "
    "LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(rows, columns=column_names)
is_htka = df["accession"].isin(accessions["HTkA"])
df[is_htka]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
483,BAA77575.1,HTkA_(Thermococcus_kodakarensis),,,,69014.0,Thermococcus kodakarensis KOD1,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAERVSEDAAKVLAEYLEEYAIELSKKAVD...,,BAA77575.1,schwab_histones_2024
487,BAD85602.1,HTkA_(Thermococcus_kodakarensis),,,,69014.0,Thermococcus kodakarensis KOD1,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAERVSEDAAKVLAEYLEEYAIELSKKAVD...,,BAD85602.1,schwab_histones_2024


In [19]:
# Check whether the publication is already registered. The id is bound as a
# query parameter instead of being formatted into the SQL text.
pid = "stevens_deep_2022"
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year


In [20]:
# Register the publication under the id stored in `pid`.
data_publication = dict(
    id=pid,
    title="Deep Conservation of Histone Variants in Thermococcales Archaea",
    doi="10.1093/gbe/evab274",
    author=None,
    year="2022",
)
cursor.execute(add_publication, data_publication)

In [21]:
# Link both relabelled HTkA sequences to the publication `pid`.
for linked_accession in ("BAA77575.1", "BAD85602.1"):
    link_row = (linked_accession, pid)
    cursor.execute(add_sequence_has_publication, link_row)

In [22]:
# Reload sequences joined with their publications and show the HTkA rows to
# confirm the new publication links.
query = (
    "SELECT * FROM sequence s "
    "LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(rows, columns=column_names)
is_htka = df["accession"].isin(accessions["HTkA"])
df[is_htka]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
483,BAA77575.1,HTkA_(Thermococcus_kodakarensis),,,,69014.0,Thermococcus kodakarensis KOD1,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAERVSEDAAKVLAEYLEEYAIELSKKAVD...,,BAA77575.1,schwab_histones_2024
484,BAA77575.1,HTkA_(Thermococcus_kodakarensis),,,,69014.0,Thermococcus kodakarensis KOD1,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAERVSEDAAKVLAEYLEEYAIELSKKAVD...,,BAA77575.1,stevens_deep_2022
488,BAD85602.1,HTkA_(Thermococcus_kodakarensis),,,,69014.0,Thermococcus kodakarensis KOD1,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAERVSEDAAKVLAEYLEEYAIELSKKAVD...,,BAD85602.1,schwab_histones_2024
489,BAD85602.1,HTkA_(Thermococcus_kodakarensis),,,,69014.0,Thermococcus kodakarensis KOD1,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAERVSEDAAKVLAEYLEEYAIELSKKAVD...,,BAD85602.1,stevens_deep_2022


In [23]:
# Commit the open transaction so the HTkA variant updates and publication links
# made since the previous commit are stored in the database.
conn.commit()

# Correct variants for BAA77576.1 and BAD86478.1

In [24]:
# Relabel both HTkB accessions. One parameterized statement replaces the two
# copy-pasted f-strings (which contained no placeholders at all).
query = "UPDATE sequence SET variant = %s WHERE accession = %s"
for updated_accession in ("BAA77576.1", "BAD86478.1"):
    cursor.execute(query, ("HTkB_(Thermococcus_kodakarensis)", updated_accession))

In [25]:
# Reload sequences joined with their publications and show the HTkB rows.
query = (
    "SELECT * FROM sequence s "
    "LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(rows, columns=column_names)
is_htkb = df["accession"].isin(accessions["HTkB"])
df[is_htkb]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
485,BAA77576.1,HTkB_(Thermococcus_kodakarensis),,,,69014.0,Thermococcus kodakarensis KOD1,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAARVSEEAAKVLAEHLEEKALEIAKKAVA...,,BAA77576.1,schwab_histones_2024
490,BAD86478.1,HTkB_(Thermococcus_kodakarensis),,,,69014.0,Thermococcus kodakarensis KOD1,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAARVSEEAAKVLAEHLEEKALEIAKKAVA...,,BAD86478.1,schwab_histones_2024


In [26]:
# Confirm the publication row exists before linking to it. The id is bound as a
# query parameter instead of being formatted into the SQL text.
pid = "stevens_deep_2022"
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,stevens_deep_2022,Deep Conservation of Histone Variants in Therm...,10.1093/gbe/evab274,,2022


In [27]:
# Link both relabelled HTkB sequences to the publication `pid`.
for linked_accession in ("BAA77576.1", "BAD86478.1"):
    link_row = (linked_accession, pid)
    cursor.execute(add_sequence_has_publication, link_row)

In [28]:
# Reload sequences joined with their publications and show the HTkB rows to
# confirm the new publication links.
query = (
    "SELECT * FROM sequence s "
    "LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(rows, columns=column_names)
is_htkb = df["accession"].isin(accessions["HTkB"])
df[is_htkb]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
485,BAA77576.1,HTkB_(Thermococcus_kodakarensis),,,,69014.0,Thermococcus kodakarensis KOD1,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAARVSEEAAKVLAEHLEEKALEIAKKAVA...,,BAA77576.1,schwab_histones_2024
486,BAA77576.1,HTkB_(Thermococcus_kodakarensis),,,,69014.0,Thermococcus kodakarensis KOD1,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAARVSEEAAKVLAEHLEEKALEIAKKAVA...,,BAA77576.1,stevens_deep_2022
491,BAD86478.1,HTkB_(Thermococcus_kodakarensis),,,,69014.0,Thermococcus kodakarensis KOD1,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAARVSEEAAKVLAEHLEEKALEIAKKAVA...,,BAD86478.1,schwab_histones_2024
492,BAD86478.1,HTkB_(Thermococcus_kodakarensis),,,,69014.0,Thermococcus kodakarensis KOD1,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAARVSEEAAKVLAEHLEEKALEIAKKAVA...,,BAD86478.1,stevens_deep_2022


In [29]:
# Commit the open transaction so the HTkB variant updates and publication links
# made since the previous commit are stored in the database.
conn.commit()

# Add WP_011250364.1 and CAT72447.1 as HTkA_(Thermococcus_kodakarensis)

These sequences are taken from the [article](https://academic.oup.com/gbe/article/doi/10.1093/gbe/evab274/6459647).

In [31]:
# HTkA accessions to add in this section.
accessions = [
    "WP_011250364.1",
    "CAT72447.1",
]

In [32]:
# Load the whole `sequence` table and check whether any of the accessions to
# add is already stored.
query = "SELECT * FROM sequence "
cursor.execute(query)
rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(rows, columns=column_names)
already_stored = df["accession"].isin(accessions)
df[already_stored]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration


In [36]:
# For each accession: download the GenBank protein record, look up its
# taxonomy, and build one row for the `sequence` table. Columns without a known
# value stay None; the taxonomy lookup fills its own columns last.
empty_sequence_row = dict.fromkeys(
    [
        "accession",
        "variant",
        "gi",
        "ncbi_gene_id",
        "hgnc_gene_name",
        "taxonomy_id",
        "organism",
        "phylum",
        "class",
        "taxonomy_group",
        "info",
        "sequence",
        "variant_under_consideration",
    ]
)
data_sequence = []
for acc in accessions:
    with Entrez.efetch(db="protein", id=acc, rettype="gb", retmode="text") as handle:
        record = SeqIO.read(handle, "genbank")
    taxonomy_data = get_taxonomy_data(record)
    sequence_row = {
        **empty_sequence_row,
        "accession": record.id,
        "variant": "HTkA_(Thermococcus_kodakarensis)",
        "sequence": str(record.seq),
    }
    sequence_row.update(taxonomy_data)
    data_sequence.append(sequence_row)

Fetched taxid from NCBI 2263
Fetched taxid from NCBI 69014


In [37]:
# Insert every prepared row with the named-placeholder template.
for sequence_row in data_sequence:
    cursor.execute(add_sequence, sequence_row)

In [38]:
# Reload sequences joined with their publications and show the rows that were
# just inserted.
query = (
    "SELECT * FROM sequence s "
    "LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(rows, columns=column_names)
is_selected = df["accession"].isin(accessions)
df[is_selected]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
555,CAT72447.1,HTkA_(Thermococcus_kodakarensis),,,,69014.0,Thermococcus kodakarensis KOD1,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAERVSEDAAKVLAEYLEEYAIELSKKAVD...,,,
5007,WP_011250364.1,HTkA_(Thermococcus_kodakarensis),,,,2263.0,Thermococcus,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAERVSEDAAKVLAEYLEEYAIELSKKAVD...,,,


In [39]:
# Confirm the publication row exists before linking to it. The id is bound as a
# query parameter instead of being formatted into the SQL text.
pid = "stevens_deep_2022"
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,stevens_deep_2022,Deep Conservation of Histone Variants in Therm...,10.1093/gbe/evab274,,2022


In [40]:
# Link each newly added accession to the publication `pid`.
for linked_accession in accessions:
    link_row = (linked_accession, pid)
    cursor.execute(add_sequence_has_publication, link_row)

In [41]:
# Reload sequences joined with their publications and show the new rows to
# confirm the publication links.
query = (
    "SELECT * FROM sequence s "
    "LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(rows, columns=column_names)
is_selected = df["accession"].isin(accessions)
df[is_selected]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
555,CAT72447.1,HTkA_(Thermococcus_kodakarensis),,,,69014.0,Thermococcus kodakarensis KOD1,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAERVSEDAAKVLAEYLEEYAIELSKKAVD...,,CAT72447.1,stevens_deep_2022
5007,WP_011250364.1,HTkA_(Thermococcus_kodakarensis),,,,2263.0,Thermococcus,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAERVSEDAAKVLAEYLEEYAIELSKKAVD...,,WP_011250364.1,stevens_deep_2022


In [42]:
# Commit the open transaction so the new HTkA sequence rows and their
# publication links are stored in the database.
conn.commit()

# Add WP_011251239.1 and CAT71503.1 as HTkB_(Thermococcus_kodakarensis)

These sequences are taken from the [article](https://academic.oup.com/gbe/article/doi/10.1093/gbe/evab274/6459647).

In [43]:
# HTkB accessions to add in this section.
accessions = [
    "WP_011251239.1",
    "CAT71503.1",
]

In [44]:
# Load the whole `sequence` table and check whether any of the accessions to
# add is already stored.
query = "SELECT * FROM sequence "
cursor.execute(query)
rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(rows, columns=column_names)
already_stored = df["accession"].isin(accessions)
df[already_stored]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration


In [46]:
# For each accession: download the GenBank protein record, look up its
# taxonomy, and build one row for the `sequence` table. Columns without a known
# value stay None; the taxonomy lookup fills its own columns last.
empty_sequence_row = dict.fromkeys(
    [
        "accession",
        "variant",
        "gi",
        "ncbi_gene_id",
        "hgnc_gene_name",
        "taxonomy_id",
        "organism",
        "phylum",
        "class",
        "taxonomy_group",
        "info",
        "sequence",
        "variant_under_consideration",
    ]
)
data_sequence = []
for acc in accessions:
    with Entrez.efetch(db="protein", id=acc, rettype="gb", retmode="text") as handle:
        record = SeqIO.read(handle, "genbank")
    taxonomy_data = get_taxonomy_data(record)
    sequence_row = {
        **empty_sequence_row,
        "accession": record.id,
        "variant": "HTkB_(Thermococcus_kodakarensis)",
        "sequence": str(record.seq),
    }
    sequence_row.update(taxonomy_data)
    data_sequence.append(sequence_row)

Fetched taxid from NCBI 2263
Fetched taxid from NCBI 69014


In [47]:
# Insert every prepared row with the named-placeholder template.
for sequence_row in data_sequence:
    cursor.execute(add_sequence, sequence_row)

In [48]:
# Reload sequences joined with their publications and show the rows that were
# just inserted.
query = (
    "SELECT * FROM sequence s "
    "LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(rows, columns=column_names)
is_selected = df["accession"].isin(accessions)
df[is_selected]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
555,CAT71503.1,HTkB_(Thermococcus_kodakarensis),,,,69014.0,Thermococcus kodakarensis KOD1,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAARVSEEAAKVLAEHLEEKALEIAKKAVA...,,,
5009,WP_011251239.1,HTkB_(Thermococcus_kodakarensis),,,,2263.0,Thermococcus,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAARVSEEAAKVLAEHLEEKALEIAKKAVA...,,,


In [49]:
# Confirm the publication row exists before linking to it. The id is bound as a
# query parameter instead of being formatted into the SQL text.
pid = "stevens_deep_2022"
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,stevens_deep_2022,Deep Conservation of Histone Variants in Therm...,10.1093/gbe/evab274,,2022


In [50]:
# Link each newly added accession to the publication `pid`.
for linked_accession in accessions:
    link_row = (linked_accession, pid)
    cursor.execute(add_sequence_has_publication, link_row)

In [51]:
# Reload sequences joined with their publications and show the new rows to
# confirm the publication links.
query = (
    "SELECT * FROM sequence s "
    "LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(rows, columns=column_names)
is_selected = df["accession"].isin(accessions)
df[is_selected]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
555,CAT71503.1,HTkB_(Thermococcus_kodakarensis),,,,69014.0,Thermococcus kodakarensis KOD1,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAARVSEEAAKVLAEHLEEKALEIAKKAVA...,,CAT71503.1,stevens_deep_2022
5009,WP_011251239.1,HTkB_(Thermococcus_kodakarensis),,,,2263.0,Thermococcus,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAARVSEEAAKVLAEHLEEKALEIAKKAVA...,,WP_011251239.1,stevens_deep_2022


In [52]:
# Commit the open transaction so the new HTkB sequence rows and their
# publication links are stored in the database.
conn.commit()

# Add other archaeal sequences as Nucleosomal

These sequences are taken from the [article](https://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.1007582).

In [84]:
# Accessions to check. Every entry needs its trailing comma: without them
# Python joins adjacent string literals, and the list collapsed into a single
# element holding one long concatenated string.
# NOTE(review): these accessions carry no version suffix, while the rows shown
# elsewhere in this notebook use versioned accessions (e.g. "WP_013413995.1"),
# so exact-match lookups may need the version stripped — TODO confirm.
# NOTE(review): "WP_02966277" and "WP_04104690" have one digit fewer than the
# other WP_ entries in this list — verify against the article.
accessions = [
    "BAI60563",
    "ADC47610",
    "WP_02966277",
    "KYK38613",
    "WP_048125684",
    "WP_011973395",
    "WP_010871171",
    "KYH40538",
    "KPV63666",
    "WP_04104690",
    "AIE90726",
    "BAJ48508",
    "KON33214",
    "WP_052884954",
    "WP_012186746",
    "OLS16336",
    "OLS15619",
    "OLS12771",
    "KKK44894",
    "KKK41688",
    "KKK45508",
    "EGQ43804",
    "AOV94489",
    "EHK02195",
    "OLS26110",
    "OLS24625",
    "OLS19133",
    "OLS18443",
    "OLS19133",  # listed twice in the original list; kept as-is
    "OLS22331",
    "AAC72546",
    "WP_008091782",
    "WP_015792102",
    "AAR39136",
]

In [85]:
# Reload sequences joined with their publications and show the rows whose
# accession is in `accessions`.
query = (
    "SELECT * FROM sequence s "
    "LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(rows, columns=column_names)
is_selected = df["accession"].isin(accessions)
df[is_selected]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
5002,WP_013413995.1,HMfA_(Methanothermus_fervidus),,,,2180.0,Methanothermus fervidus,Methanobacteriota,Methanobacteria,,,MGELPIAPIGRIIKNAGAERVSDDARIALAKVLEEMGEEIASEAVK...,,WP_013413995.1,henneman_structure_2018
5003,WP_013414263.1,HMfB_(Methanothermus_fervidus),,,,2180.0,Methanothermus fervidus,Methanobacteriota,Methanobacteria,,,MELPIAPIGRIIKDAGAERVSDDARITLAKILEEMGRDIASEAIKL...,,WP_013414263.1,henneman_structure_2018


# Close connections

In [53]:
# Tear down in reverse order of creation: the cursor first, then the MySQL
# connection, and finally the SSH tunnel that the connection was using.
cursor.close()
conn.close()
tunnel.stop()