In [95]:
from urllib.error import HTTPError

import pandas as pd
from Bio import Entrez, SeqIO
from mysql.connector import connection
from sshtunnel import SSHTunnelForwarder

Entrez.email = "l.singh@intbio.org"

In [96]:
# Parse the KEY=VALUE settings file that describes the SSH host and the
# database endpoint reachable behind it.
config = {}
with open("db_curated_server_info.txt", "r") as file:
    for line in file:
        line = line.strip()
        # Skip blank lines, comments, and anything that is not a KEY=VALUE pair
        # (previously a line without "=" crashed the unpacking below).
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, value = line.split("=", 1)
        # Strip the key too, so "key = value" and "key=value" are equivalent.
        config[key.strip()] = value.strip()

# NOTE: the misspelled keys ("srever_port", "db_adress") are what the settings
# file is read with, so they are kept as-is.
server_name = config.get("server_name")
# Index the ports directly: a missing entry raises a KeyError naming the key
# instead of an opaque TypeError from int(None).
srever_port = int(config["srever_port"])
ssh_password = config.get("ssh_password")
ssh_username = config.get("ssh_username")
db_adress = config.get("db_adress")
db_port = int(config["db_port"])

In [97]:
# Forward a local port to the database address through the SSH server.
ssh_endpoint = (server_name, srever_port)
db_endpoint = (db_adress, db_port)
tunnel = SSHTunnelForwarder(
    ssh_endpoint,
    ssh_username=ssh_username,
    ssh_password=ssh_password,
    remote_bind_address=db_endpoint,
)
tunnel.start()
# Show the local port the tunnel is bound to; the MySQL connection uses it.
print(tunnel.local_bind_port)

34235


In [98]:
# Connect to MySQL via the local end of the SSH tunnel.
mysql_settings = {
    "user": "db_user",
    "password": "db_password",
    "host": "localhost",
    "port": tunnel.local_bind_port,
    "database": "db_name",
}
conn = connection.MySQLConnection(**mysql_settings)
cursor = conn.cursor()

In [99]:
# List the tables of the connected database.
query = "SHOW TABLES;"
cursor.execute(query)
table_rows = cursor.fetchall()
table_rows

[('alternative_name',),
 ('histone',),
 ('histone_description',),
 ('histone_has_publication',),
 ('publication',),
 ('sequence',),
 ('sequence_has_publication',)]

In [6]:
# Parameterized INSERT templates for cursor.execute(). Templates with
# %(name)s placeholders take a dict of values; templates with %s placeholders
# take a tuple. The commented-out templates below are disabled; they target
# the histone, histone_description, alternative_name and
# histone_has_publication tables.

# add_histone = (
#     "INSERT INTO histone "
#     "(id, level, taxonomic_span, taxonomic_span_id, description, parent) "
#     "VALUES (%(id)s, %(level)s, %(taxonomic_span)s, %(taxonomic_span_id)s, %(description)s, %(parent)s)"
# )
# add_histone_description = (
#     "INSERT INTO histone_description "
#     "(summary, taxonomy, genes, evolution, expression, knock_out, function, sequence, localization, deposition, structure, interactions, disease, caveats) "
#     "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
# )

# Insert one row into `publication` (dict parameters).
add_publication = (
    "INSERT INTO publication "
    "(id, title, doi, author, year) "
    "VALUES (%(id)s, %(title)s, %(doi)s, %(author)s, %(year)s)"
)
# Insert one row into `sequence` (dict parameters).
add_sequence = (
    "INSERT INTO sequence "
    "(accession, variant, gi, ncbi_gene_id, hgnc_gene_name, taxonomy_id, organism, phylum, class, taxonomy_group, info, sequence, variant_under_consideration) "
    "VALUES (%(accession)s, %(variant)s, %(gi)s, %(ncbi_gene_id)s, %(hgnc_gene_name)s, %(taxonomy_id)s, %(organism)s, %(phylum)s, %(class)s, %(taxonomy_group)s, %(info)s, %(sequence)s, %(variant_under_consideration)s)"
)
# Link a sequence accession to a publication id (tuple parameters:
# (sequence_accession, publication_id)).
add_sequence_has_publication = (
    "INSERT INTO sequence_has_publication "
    "(sequence_accession, publication_id) "
    "VALUES (%s, %s)"
)
# add_alternate_names = (
#     "INSERT INTO alternative_name "
#     "(name, taxonomy, gene, splice, histone) "
#     "VALUES (%(name)s, %(taxonomy)s, %(gene)s, %(splice)s, %(histone)s)"
# )
# add_histone_has_publication = (
#     "INSERT INTO histone_has_publication "
#     "(histone_id, publication_id) "
#     "VALUES (%s, %s)"
# )

In [7]:
def get_taxonomy_data(record, max_attempts=10):
    """Collect organism, NCBI taxonomy id, phylum and class for a record.

    The taxonomy id is read from the ``taxon:<id>`` entry among the
    ``db_xref`` qualifiers of the record's first feature. Phylum and class are
    then looked up in the NCBI ``taxonomy`` database through ``Entrez``.

    Args:
        record: Object exposing ``annotations["organism"]`` and
            ``features[0].qualifiers["db_xref"]`` (presumably a Biopython
            ``SeqRecord`` -- confirm against the callers).
        max_attempts: How many times the Entrez lookup is tried before giving
            up. Defaults to 10.

    Returns:
        dict with the keys ``organism``, ``taxonomy_id`` (int; 1 when no taxon
        cross-reference could be read), ``phylum`` and ``class`` (``str`` or
        ``None`` when the lineage could not be fetched or lacks that rank).

    Raises:
        KeyError: if ``record.annotations`` has no ``"organism"`` entry.
    """
    import re
    import sys

    taxonomy_data = {"organism": record.annotations["organism"]}

    try:
        for xref in record.features[0].qualifiers["db_xref"]:
            match = re.search(r"(\S+):(\S+)", xref)
            # Ignore cross-references to other databases and malformed entries.
            if match is None or match.group(1) != "taxon":
                continue
            taxid = match.group(2)
            print(f"Fetched taxid from NCBI {taxid}")
            taxonomy_data["taxonomy_id"] = int(taxid)
    except Exception:
        # Best effort: any problem while reading the cross-references (no
        # features, no db_xref qualifier, non-numeric id) means "unknown".
        taxonomy_data.pop("taxonomy_id", None)

    # Also covers the case where db_xref exists but holds no taxon entry,
    # which used to leave "taxonomy_id" unset and crash further down.
    if "taxonomy_id" not in taxonomy_data:
        print("!!!!!!Unable to get TAXID for this record setting it to 1")
        taxonomy_data["taxonomy_id"] = 1  # unable to identify

    lineage = {}
    for attempt in range(max_attempts):
        try:
            handle = Entrez.efetch(
                id=taxonomy_data["taxonomy_id"], db="taxonomy", retmode="xml"
            )
            try:
                tax_data = Entrez.read(handle)
            finally:
                # Release the connection whether or not parsing succeeded.
                handle.close()
            lineage = {
                entry["Rank"]: entry["ScientificName"]
                for entry in tax_data[0]["LineageEx"]
                if entry["Rank"] in ["class", "phylum"]
            }
            break
        except Exception:
            # `Exception` rather than a bare except, so Ctrl-C can still
            # interrupt the retry loop.
            print(
                "Unexpected error: {}, Retrying, attempt {}".format(
                    sys.exc_info()[0], attempt
                )
            )
            if attempt == max_attempts - 1:
                print(
                    f"FATAL ERROR could not get class and phylum from NCBI after {max_attempts} attempts for taxid:{taxonomy_data['taxonomy_id']}. Will add None for class and phylum!"
                )

    # Store the ranks as plain `str` (or None when the rank is unavailable).
    for rank in ("phylum", "class"):
        value = lineage.get(rank)
        taxonomy_data[rank] = None if value is None else str(value)
    return taxonomy_data

# Add other archaeal sequences 3

These sequences are taken from this [article](https://academic.oup.com/gbe/article/doi/10.1093/gbe/evab274/6459647).

pid='stevens_deep_2022'

In [8]:
# Accessions grouped by the class label stored next to them in column 1.
# Group order and the order inside each group define the row order; some
# accessions appear in more than one group.
_accessions_by_class = {
    "HTkB-like": """
        AAL81846.1 BAA30815.1 CAB49394.1 AFK23164.1 AMM55014.1 ASJ17568.1
        AEC51213.1 ADT83796.1 AHF80071.1 BAD86478.1 AHL23243.1 AIU68885.1
        LT900021.1 ASJ01246.1 ACS32979.1 EEB74510.1 AJC71819.1 ASJ03532.1
        LANF01000006.1 ACS90646.1 EHR77909.1 KPU62412.1 KE387150.1 AEH23914.1
        MTLP01000042.1 AIF69774.1 ACJ16723.1 ANF22029.1 AEK73791.1 AFL95685.1
        KUH33491.1 QDA31915.1 ASJ05087.1 ASJ05842.1 ASJ07907.1 AJLF01000003.1
        ALV63354.1 ASJ10074.1 ASJ11296.1 AMQ18382.1 ASI99083.1 ASJ13045.1
        ASJ14748.1 RLF87512.1 RLF89200.1 RLF89622.1 RLF86230.1 RLF82482.1
        RLF78498.1 HDZ36007.1 NJE09500.1 NJE07720.1 NJE05021.1 NJD98225.1
        NJE26633.1 NJE02313.1 NJE85045.1 NJF25470.1 NJE60507.1 NJE53991.1
        NJE29684.1 CAD5244819.1
    """,
    "HTkA-like": """
        AAL81955.1 CAB49269.1 BAA30901.1 AEC51304.1 BAD85602.1 ADT83177.1
        AHF79684.1 AEH23840.1 EHR78220.1 ALV63249.1 KE387149.1 ANF23023.1
        AFL94617.1 AIU69875.1 ASJ06692.1 AHL21670.1 LT900021.1 ASJ00192.1
        AFK23282.1 AMM55009.1 ASJ17557.1 MTLP01000031.1 ACS90532.1 KPU62458.1
        ASJ04222.1 ACS34282.1 EEB74307.1 ASJ08794.1 AEK72656.1 KUH31667.1
        QDA30973.1 AJC71056.1 ASJ02315.1 AJLF01000001.1 AIF68887.1 ACJ15670.1
        LANF01000010.1 AMQ19125.1 ASI99810.1 ASJ12022.1 ASJ14408.1 RLF92116.1
        RLF90088.1 RLF83483.1 RLF79735.1 RLF76361.1 HDZ35934.1 NJE09850.1
        NJE07296.1 NJE06015.1 NJD99123.1 NJE26742.1 NJE27157.1 NJE01747.1
        NJE84772.1 NJF25404.1 NJE62344.1 NJE54925.1 NJE30753.1 NPA48162.1
        CAD5243811.1
    """,
    "bacteria-type singlets": """
        AAL81408.1 BAD85229.1 AHL22602.1 AIU70766.1 ACS33228.1 AJC71892.1
        EEB74725.1 ACJ16232.1 AFL96134.1 AEK72144.1 KUH33956.1 AMQ19473.1
        ASI99555.1 ASJ13525.1 ASJ13980.1 HDZ36063.1 NJE10392.1 NJE04579.1
        NJD99609.1 NJE01410.1 NJF24431.1 NJE61331.1 NJE53775.1 NJE31255.1
        CAD5244308.1 NJE27137.1
    """,
    "bacteria-type doublets": """
        AAL80748.1 BAD84939.1 ACS33435.1 EEB74519.1 AHL22334.1 LT900021.1
        AIU70291.1 AJC72509.1 QDA31581.1 AEK74020.1 AFL95897.1 ASJ04868.1
        ASJ00818.1 AJLF01000001.1 ASJ07429.1 LANF01000011.1 ADT84726.1 AHF81122.1
        EHR79820.1 KE387148.1 ACS89931.1 AMQ17906.1 RLF83406.1 HDZ36459.1
        NJE08284.1 NJF24142.1 NJE55587.1
    """,
}

# Flatten the groups into two parallel columns: accession (0) and label (1).
_accession_column = []
_class_column = []
for _label, _block in _accessions_by_class.items():
    _ids = _block.split()
    _accession_column.extend(_ids)
    _class_column.extend([_label] * len(_ids))

seq_stevens_deep_2022 = pd.DataFrame({0: _accession_column, 1: _class_column})
seq_stevens_deep_2022

Unnamed: 0,0,1
0,AAL81846.1,HTkB-like
1,BAA30815.1,HTkB-like
2,CAB49394.1,HTkB-like
3,AFK23164.1,HTkB-like
4,AMM55014.1,HTkB-like
...,...,...
171,RLF83406.1,bacteria-type doublets
172,HDZ36459.1,bacteria-type doublets
173,NJE08284.1,bacteria-type doublets
174,NJF24142.1,bacteria-type doublets


In [14]:
# Give the two positional columns descriptive names.
column_names = ["accession", "histone_class"]
seq_stevens_deep_2022.columns = column_names

In [15]:
# Accession column as a plain array, used for membership tests below.
accessions = seq_stevens_deep_2022.loc[:, "accession"].values
accessions

array(['AAL81846.1', 'BAA30815.1', 'CAB49394.1', 'AFK23164.1',
       'AMM55014.1', 'ASJ17568.1', 'AEC51213.1', 'ADT83796.1',
       'AHF80071.1', 'BAD86478.1', 'AHL23243.1', 'AIU68885.1',
       'LT900021.1', 'ASJ01246.1', 'ACS32979.1', 'EEB74510.1',
       'AJC71819.1', 'ASJ03532.1', 'LANF01000006.1', 'ACS90646.1',
       'EHR77909.1', 'KPU62412.1', 'KE387150.1', 'AEH23914.1',
       'MTLP01000042.1', 'AIF69774.1', 'ACJ16723.1', 'ANF22029.1',
       'AEK73791.1', 'AFL95685.1', 'KUH33491.1', 'QDA31915.1',
       'ASJ05087.1', 'ASJ05842.1', 'ASJ07907.1', 'AJLF01000003.1',
       'ALV63354.1', 'ASJ10074.1', 'ASJ11296.1', 'AMQ18382.1',
       'ASI99083.1', 'ASJ13045.1', 'ASJ14748.1', 'RLF87512.1',
       'RLF89200.1', 'RLF89622.1', 'RLF86230.1', 'RLF82482.1',
       'RLF78498.1', 'HDZ36007.1', 'NJE09500.1', 'NJE07720.1',
       'NJE05021.1', 'NJD98225.1', 'NJE26633.1', 'NJE02313.1',
       'NJE85045.1', 'NJF25470.1', 'NJE60507.1', 'NJE53991.1',
       'NJE29684.1', 'CAD5244819.1', 'AAL81

In [10]:
# Load the whole `sequence` table and keep the rows whose accession is in
# the list built above.
query = "SELECT * FROM sequence"
cursor.execute(query)
rows = cursor.fetchall()
column_names = [col[0] for col in cursor.description]
df = pd.DataFrame(rows, columns=column_names)
in_accession_list = df["accession"].isin(accessions)
df_stevens_deep_2022 = df[in_accession_list]
df_stevens_deep_2022

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
32,AAL81408.1,FtF,,,,186497.0,Pyrococcus furiosus DSM 3638,Methanobacteriota,Thermococci,,,MVELLVKSKVKEFVSSIDKDMRVSPEFYEALEAEVKALIEKAVKRA...,
33,AAL81846.1,Nucleosomal,,,,186497.0,Pyrococcus furiosus DSM 3638,Methanobacteriota,Thermococci,,,MGELPIAPVDRLIRKAGAQRVSEQAAKVLAEHLEEKAIEIAKKAVD...,
34,AAL81955.1,Nucleosomal,,,,186497.0,Pyrococcus furiosus DSM 3638,Methanobacteriota,Thermococci,,,MGELPIAPVDRLIRKAGAERVSEQAAKVLAEYLEEYAIEVAKKAVE...,
94,ACJ15670.1,Nucleosomal,,,,523850.0,Thermococcus onnurineus NA1,Methanobacteriota,Thermococci,,,MAELPIAPIDRLIRKAGAERVSEEAAKVLAEYLEEYAMDLAKRAAE...,
95,ACJ16232.1,FtF,,,,523850.0,Thermococcus onnurineus NA1,Methanobacteriota,Thermococci,,,MAEMIVKSKVKEAVKAIDPEMRINPEFYEALEAEIKILIEKAVKRA...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3755,QDA31915.1,Nucleosomal,,,,2586643.0,Thermococcus indicus,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAARVSEDAAKLLAEHLEEKAMEIARKAVD...,
4088,RLF76361.1,Nucleosomal,,,,2250254.0,Thermococci archaeon,Methanobacteriota,Thermococci,,,MELPIAPIDRLIRKAGAERVSEDAAKVLAEYLEDYAIELAKKSSDF...,
4089,RLF78498.1,Nucleosomal,,,,2250254.0,Thermococci archaeon,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAPRVSEEAAKVLAEHLEDKAMEIAKKAVE...,
4093,RLF89622.1,Nucleosomal,,,,2250254.0,Thermococci archaeon,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAPRVSEEAAKVLAEHLEDKAMEIAKKAVE...,


In [11]:
# How the matched rows are currently distributed over variants.
variant_column = df_stevens_deep_2022["variant"]
variant_column.value_counts()

variant
Nucleosomal                         80
FtF                                 24
HTkA_(Thermococcus_kodakarensis)     1
HTkB_(Thermococcus_kodakarensis)     1
Name: count, dtype: int64

In [12]:
# (unique accessions, total accessions) -- they differ when the list has
# repeated entries.
n_unique = len(set(accessions))
n_total = len(accessions)
(n_unique, n_total)

(173, 176)

In [17]:
# Pair the variant stored in the database with the class label from the
# accession table (default inner join on the accession).
db_variants = df_stevens_deep_2022[["accession", "variant"]]
df_merged = db_variants.merge(seq_stevens_deep_2022, on="accession")
df_merged

Unnamed: 0,accession,variant,histone_class
0,AAL81408.1,FtF,bacteria-type singlets
1,AAL81846.1,Nucleosomal,HTkB-like
2,AAL81955.1,Nucleosomal,HTkA-like
3,ACJ15670.1,Nucleosomal,HTkA-like
4,ACJ16232.1,FtF,bacteria-type singlets
...,...,...,...
101,QDA31915.1,Nucleosomal,HTkB-like
102,RLF76361.1,Nucleosomal,HTkA-like
103,RLF78498.1,Nucleosomal,HTkB-like
104,RLF89622.1,Nucleosomal,HTkB-like


Все последовательности, классифицированные как Face-to-Face (см. [статью](https://www.nature.com/articles/s41467-024-52337-y)), в [статье](https://academic.oup.com/gbe/article/doi/10.1093/gbe/evab274/6459647) определены в группу `bacteria-type singlets`.

In [18]:
# Rows whose database variant is FtF.
is_ftf = df_merged["variant"] == "FtF"
df_merged[is_ftf]

Unnamed: 0,accession,variant,histone_class
0,AAL81408.1,FtF,bacteria-type singlets
4,ACJ16232.1,FtF,bacteria-type singlets
7,ACS33228.1,FtF,bacteria-type singlets
17,AEK72144.1,FtF,bacteria-type singlets
24,AFL96134.1,FtF,bacteria-type singlets
28,AHL22602.1,FtF,bacteria-type singlets
34,AIU70766.1,FtF,bacteria-type singlets
37,AJC71892.1,FtF,bacteria-type singlets
44,AMQ19473.1,FtF,bacteria-type singlets
48,ASI99555.1,FtF,bacteria-type singlets


Все последовательности, классифицированные как Nucleosomal (см. [статью](https://www.nature.com/articles/s41467-024-52337-y)), в [статье](https://academic.oup.com/gbe/article/doi/10.1093/gbe/evab274/6459647) определены в группы `HTkA-like` и `HTkB-like`.

In [19]:
# Class labels of the rows whose database variant is Nucleosomal.
is_nucleosomal = df_merged["variant"] == "Nucleosomal"
df_merged.loc[is_nucleosomal, "histone_class"].value_counts()

histone_class
HTkB-like    41
HTkA-like    39
Name: count, dtype: int64

Посмотрим последовательности, которых еще нет в базе:

In [22]:
# Class labels of the accessions that have no row in the database yet.
already_in_db = seq_stevens_deep_2022["accession"].isin(
    df_stevens_deep_2022["accession"]
)
seq_stevens_deep_2022.loc[~already_in_db, "histone_class"].value_counts()

histone_class
bacteria-type doublets    27
HTkA-like                 21
HTkB-like                 20
bacteria-type singlets     2
Name: count, dtype: int64

## Оставим Face-to-Face (см. [статью](https://www.nature.com/articles/s41467-024-52337-y)) как были, так как назвать их бактериоподобными кажется слишком "грубым" (см. [статью](https://doi.org/10.1093/bioinformatics/bty1000)).

In [24]:
# Full database rows of the accessions whose variant is FtF.
ftf_accessions = df_merged.loc[df_merged["variant"] == "FtF", "accession"]
is_ftf_row = df_stevens_deep_2022["accession"].isin(ftf_accessions)
df_stevens_deep_2022[is_ftf_row]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
32,AAL81408.1,FtF,,,,186497.0,Pyrococcus furiosus DSM 3638,Methanobacteriota,Thermococci,,,MVELLVKSKVKEFVSSIDKDMRVSPEFYEALEAEVKALIEKAVKRA...,
95,ACJ16232.1,FtF,,,,523850.0,Thermococcus onnurineus NA1,Methanobacteriota,Thermococci,,,MAEMIVKSKVKEAVKAIDPEMRINPEFYEALEAEIKILIEKAVKRA...,
107,ACS33228.1,FtF,,,,593117.0,Thermococcus gammatolerans EJ3,Methanobacteriota,Thermococci,,,MAEIIVKSKVKEFVKSLDAEMRVSPEFYDALEAEVKVLIEKAVKRA...,
196,AEK72144.1,FtF,,,,1042877.0,Thermococcus sp. 4557,Methanobacteriota,Thermococci,,,MAEMIVKSKVKEAIKAIDPEMRVNPEFYEALEAEIKTLVEKAVKRA...,
208,AFL96134.1,FtF,,,,163003.0,Thermococcus cleftensis,Methanobacteriota,Thermococci,,,MAEMIVKSKVKEAVKAIDPEMRINPEFYEALEAELKVLIEKAVKRA...,
242,AHL22602.1,FtF,,,,195522.0,Thermococcus nautili,Methanobacteriota,Thermococci,,,MAEMLVKSKVKELVKNLDPEMRVSPEFYDALEEELKALVEKAVKRA...,
271,AIU70766.1,FtF,,,,1505907.0,Thermococcus eurythermalis,Methanobacteriota,Thermococci,,,MAEMLVKSKVKELVKNLDPEMRVSPEFYDALEEELKVLVEKAVKRA...,
278,AJC71892.1,FtF,,,,1432656.0,Thermococcus guaymasensis DSM 11113,Methanobacteriota,Thermococci,,,MAEIIVKSKVKEFVKSLDAEMRVSPEFYDALEAEIKVLIEKAIKRA...,
343,AMQ19473.1,FtF,,,,53952.0,Thermococcus peptonophilus,Methanobacteriota,Thermococci,,,MAEMLVKSKVKEFVKSVDPEMRVSPEFYDALEAEVKALVEKAIKRA...,
391,ASI99555.1,FtF,,,,1293037.0,Thermococcus celer Vu 13 = JCM 8558,Methanobacteriota,Thermococci,,,MVEMIVKSKVKEAVKAIDPEMRVNPEFYEALEAEVKALIEKAVKRA...,


## Update some Nucleosomal histones to HTkA_(Thermococcales)

In [31]:
# Taxonomic class of the database rows whose variant is Nucleosomal.
nucleosomal_accessions = df_merged.loc[
    df_merged["variant"] == "Nucleosomal", "accession"
]
is_nucleosomal_row = df_stevens_deep_2022["accession"].isin(nucleosomal_accessions)
df_stevens_deep_2022.loc[is_nucleosomal_row, "class"].value_counts()

class
Thermococci    80
Name: count, dtype: int64

In [34]:
# Accessions stored as Nucleosomal that carry the HTkA-like class label.
is_nucleosomal = df_merged["variant"] == "Nucleosomal"
is_htka_like = df_merged["histone_class"] == "HTkA-like"
upd_accessions = df_merged.loc[is_nucleosomal & is_htka_like, "accession"].values
upd_accessions

array(['AAL81955.1', 'ACJ15670.1', 'ACS34282.1', 'ACS90532.1',
       'ADT83177.1', 'AEC51304.1', 'AEH23840.1', 'AEK72656.1',
       'AFK23282.1', 'AFL94617.1', 'AHF79684.1', 'AHL21670.1',
       'AIF68887.1', 'AIU69875.1', 'AJC71056.1', 'ALV63249.1',
       'AMM55009.1', 'AMQ19125.1', 'ANF23023.1', 'ASI99810.1',
       'ASJ00192.1', 'ASJ02315.1', 'ASJ04222.1', 'ASJ06692.1',
       'ASJ08794.1', 'ASJ12022.1', 'ASJ14408.1', 'ASJ17557.1',
       'BAA30901.1', 'CAB49269.1', 'CAD5243811.1', 'EEB74307.1',
       'EHR78220.1', 'HDZ35934.1', 'KPU62458.1', 'KUH31667.1',
       'QDA30973.1', 'RLF76361.1', 'RLF90088.1'], dtype=object)

In [40]:
# Relabel the selected sequences as HTkA_(Thermococcales).
# The statement is parameterized, like the INSERT templates defined earlier,
# so the connector does the quoting instead of an f-string.
query = "UPDATE sequence SET variant=%s WHERE accession=%s"
for acc in upd_accessions:
    # str() ensures a plain Python string is handed to the connector.
    cursor.execute(query, ("HTkA_(Thermococcales)", str(acc)))

In [41]:
# Re-read the `sequence` table and show the rows that were just updated.
query = "SELECT * FROM sequence"
cursor.execute(query)
rows = cursor.fetchall()
column_names = [col[0] for col in cursor.description]
df = pd.DataFrame(rows, columns=column_names)
was_updated = df["accession"].isin(upd_accessions)
df[was_updated]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
34,AAL81955.1,HTkA_(Thermococcales),,,,186497.0,Pyrococcus furiosus DSM 3638,Methanobacteriota,Thermococci,,,MGELPIAPVDRLIRKAGAERVSEQAAKVLAEYLEEYAIEVAKKAVE...,
94,ACJ15670.1,HTkA_(Thermococcales),,,,523850.0,Thermococcus onnurineus NA1,Methanobacteriota,Thermococci,,,MAELPIAPIDRLIRKAGAERVSEEAAKVLAEYLEEYAMDLAKRAAE...,
108,ACS34282.1,HTkA_(Thermococcales),,,,593117.0,Thermococcus gammatolerans EJ3,Methanobacteriota,Thermococci,,,MAELPIAPIDRLIRKAGAERVSEEAAKVLAEYLEEYAIEIARKAND...,
109,ACS90532.1,HTkA_(Thermococcales),,,,604354.0,Thermococcus sibiricus MM 739,Methanobacteriota,Thermococci,,,MAELPIAPIDRLIRKAGAERVSEDAAKVLAEYLEEYAVEVAKKSVE...,
165,ADT83177.1,HTkA_(Thermococcales),,,,391623.0,Thermococcus barophilus MP,Methanobacteriota,Thermococci,,,MAELPIAPIDRLIRKAGAERVSEEAAKVLAEYLEEYAIELSKKAVE...,
177,AEC51304.1,HTkA_(Thermococcales),,,,342949.0,Pyrococcus sp. NA2,Methanobacteriota,Thermococci,,,MGELPIAPVDRLIRKAGAERVSEQAAKVLAEYLEEYAVEVAKKAVE...,
189,AEH23840.1,HTkA_(Thermococcales),,,,529709.0,Pyrococcus yayanosii CH1,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAERVSEEAAKILAEYLEEYAIEISKKAVE...,
197,AEK72656.1,HTkA_(Thermococcales),,,,1042877.0,Thermococcus sp. 4557,Methanobacteriota,Thermococci,,,MAELPIAPIDRLIRKAGAERVSEDAAKVLAEYLEEYAIELARKSAD...,
205,AFK23282.1,HTkA_(Thermococcales),,,,1183377.0,Pyrococcus sp. ST04,Methanobacteriota,Thermococci,,,MMGELPIAPVDRLIRKAGAERVSEEAAKILAEYLEEYAIEVSKKAV...,
206,AFL94617.1,HTkA_(Thermococcales),,,,163003.0,Thermococcus cleftensis,Methanobacteriota,Thermococci,,,MAELPIAPIDRLIRKAGAERVSEDAAKVLAEYLEEYAIELAKKAND...,


In [42]:
# Commit the pending transaction so the variant updates issued through
# `cursor` are persisted in the database.
conn.commit()

## Update some Nucleosomal histones to HTkB_(Thermococcales)

In [43]:
# Taxonomic class of the database rows whose variant was Nucleosomal
# (computed from the snapshot taken before the updates).
nucleosomal_accessions = df_merged.loc[
    df_merged["variant"] == "Nucleosomal", "accession"
]
is_nucleosomal_row = df_stevens_deep_2022["accession"].isin(nucleosomal_accessions)
df_stevens_deep_2022.loc[is_nucleosomal_row, "class"].value_counts()

class
Thermococci    80
Name: count, dtype: int64

In [44]:
# Accessions stored as Nucleosomal that carry the HTkB-like class label.
is_nucleosomal = df_merged["variant"] == "Nucleosomal"
is_htkb_like = df_merged["histone_class"] == "HTkB-like"
upd_accessions = df_merged.loc[is_nucleosomal & is_htkb_like, "accession"].values
upd_accessions

array(['AAL81846.1', 'ACJ16723.1', 'ACS32979.1', 'ACS90646.1',
       'ADT83796.1', 'AEC51213.1', 'AEH23914.1', 'AEK73791.1',
       'AFK23164.1', 'AFL95685.1', 'AHF80071.1', 'AHL23243.1',
       'AIF69774.1', 'AIU68885.1', 'AJC71819.1', 'ALV63354.1',
       'AMM55014.1', 'AMQ18382.1', 'ANF22029.1', 'ASI99083.1',
       'ASJ01246.1', 'ASJ03532.1', 'ASJ05087.1', 'ASJ05842.1',
       'ASJ07907.1', 'ASJ10074.1', 'ASJ11296.1', 'ASJ13045.1',
       'ASJ14748.1', 'ASJ17568.1', 'BAA30815.1', 'CAB49394.1',
       'CAD5244819.1', 'EEB74510.1', 'EHR77909.1', 'HDZ36007.1',
       'KPU62412.1', 'KUH33491.1', 'QDA31915.1', 'RLF78498.1',
       'RLF89622.1'], dtype=object)

In [45]:
# Relabel the selected sequences as HTkB_(Thermococcales).
# The statement is parameterized, like the INSERT templates defined earlier,
# so the connector does the quoting instead of an f-string.
query = "UPDATE sequence SET variant=%s WHERE accession=%s"
for acc in upd_accessions:
    # str() ensures a plain Python string is handed to the connector.
    cursor.execute(query, ("HTkB_(Thermococcales)", str(acc)))

In [46]:
# Re-read the `sequence` table and show the rows that were just updated.
query = "SELECT * FROM sequence"
cursor.execute(query)
rows = cursor.fetchall()
column_names = [col[0] for col in cursor.description]
df = pd.DataFrame(rows, columns=column_names)
was_updated = df["accession"].isin(upd_accessions)
df[was_updated]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
33,AAL81846.1,HTkB_(Thermococcales),,,,186497.0,Pyrococcus furiosus DSM 3638,Methanobacteriota,Thermococci,,,MGELPIAPVDRLIRKAGAQRVSEQAAKVLAEHLEEKAIEIAKKAVD...,
96,ACJ16723.1,HTkB_(Thermococcales),,,,523850.0,Thermococcus onnurineus NA1,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAARVSEDAAKLLAEHLEEKALEIAKKAVD...,
106,ACS32979.1,HTkB_(Thermococcales),,,,593117.0,Thermococcus gammatolerans EJ3,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAARVSEEAAKVLAEHLEEKAIEIAKKAVE...,
110,ACS90646.1,HTkB_(Thermococcales),,,,604354.0,Thermococcus sibiricus MM 739,Methanobacteriota,Thermococci,,,MLGMAELPIAPIDRLVRKAGAPRVSEEAAKVLAEHLEEKAMEIAKK...,
166,ADT83796.1,HTkB_(Thermococcales),,,,391623.0,Thermococcus barophilus MP,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAQRVSEEAAKLLAEHLEEKAVEIAKKAVD...,
176,AEC51213.1,HTkB_(Thermococcales),,,,342949.0,Pyrococcus sp. NA2,Methanobacteriota,Thermococci,,,MGELPIAPVDRLIRKAGAQRVSEKAAKLLAEHLEEKALEIARKAVD...,
190,AEH23914.1,HTkB_(Thermococcales),,,,529709.0,Pyrococcus yayanosii CH1,Methanobacteriota,Thermococci,,,MGAYRGVERPQNPKGLYISFALKENDKTFAGVVEMAELPIAPVDRL...,
198,AEK73791.1,HTkB_(Thermococcales),,,,1042877.0,Thermococcus sp. 4557,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAARVSEDAAKLLAEHLEEKAMEIARKAVD...,
204,AFK23164.1,HTkB_(Thermococcales),,,,1183377.0,Pyrococcus sp. ST04,Methanobacteriota,Thermococci,,,MMGELPIAPVDRLIRKAGAQRVSEQAAKLLAEHLEEKALEIARKAV...,
207,AFL95685.1,HTkB_(Thermococcales),,,,163003.0,Thermococcus cleftensis,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAARVSEDAAKLLAEHLEEKAMEIARKAVD...,


In [50]:
# Re-read the `sequence` table and count the current variants of every row
# that was Nucleosomal in the pre-update snapshot (`df_merged`).
query = "SELECT * FROM sequence"
cursor.execute(query)
rows = cursor.fetchall()
column_names = [col[0] for col in cursor.description]
df = pd.DataFrame(rows, columns=column_names)
formerly_nucleosomal = df_merged.loc[
    df_merged["variant"] == "Nucleosomal", "accession"
].values
df.loc[df["accession"].isin(formerly_nucleosomal), "variant"].value_counts()

variant
HTkB_(Thermococcales)    41
HTkA_(Thermococcales)    39
Name: count, dtype: int64

In [51]:
# Commit the pending transaction so the variant updates issued through
# `cursor` are persisted in the database.
conn.commit()

## Update references for updated HTkB_(Thermococcales) and HTkA_(Thermococcales)

In [52]:
# Every accession that was Nucleosomal in the pre-update snapshot.
was_nucleosomal = df_merged["variant"] == "Nucleosomal"
upd_accessions = df_merged.loc[was_nucleosomal, "accession"].values
upd_accessions

array(['AAL81846.1', 'AAL81955.1', 'ACJ15670.1', 'ACJ16723.1',
       'ACS32979.1', 'ACS34282.1', 'ACS90532.1', 'ACS90646.1',
       'ADT83177.1', 'ADT83796.1', 'AEC51213.1', 'AEC51304.1',
       'AEH23840.1', 'AEH23914.1', 'AEK72656.1', 'AEK73791.1',
       'AFK23164.1', 'AFK23282.1', 'AFL94617.1', 'AFL95685.1',
       'AHF79684.1', 'AHF80071.1', 'AHL21670.1', 'AHL23243.1',
       'AIF68887.1', 'AIF69774.1', 'AIU68885.1', 'AIU69875.1',
       'AJC71056.1', 'AJC71819.1', 'ALV63249.1', 'ALV63354.1',
       'AMM55009.1', 'AMM55014.1', 'AMQ18382.1', 'AMQ19125.1',
       'ANF22029.1', 'ANF23023.1', 'ASI99083.1', 'ASI99810.1',
       'ASJ00192.1', 'ASJ01246.1', 'ASJ02315.1', 'ASJ03532.1',
       'ASJ04222.1', 'ASJ05087.1', 'ASJ05842.1', 'ASJ06692.1',
       'ASJ07907.1', 'ASJ08794.1', 'ASJ10074.1', 'ASJ11296.1',
       'ASJ12022.1', 'ASJ13045.1', 'ASJ14408.1', 'ASJ14748.1',
       'ASJ17557.1', 'ASJ17568.1', 'BAA30815.1', 'BAA30901.1',
       'CAB49269.1', 'CAB49394.1', 'CAD5243811.1', 'CAD

In [53]:
# Look up the publication row that the updated sequences will be linked to.
pid = "stevens_deep_2022"
# Parameterized rather than f-string SQL: the connector quotes the id.
query = "SELECT * FROM publication WHERE id=%s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,stevens_deep_2022,Deep Conservation of Histone Variants in Therm...,10.1093/gbe/evab274,,2022


In [54]:
# Link every updated sequence to the publication `pid`.
# Failed inserts are collected instead of aborting the loop.
failed_toadd_publication = []
for acc in upd_accessions:
    try:
        cursor.execute(add_sequence_has_publication, (acc, pid))
    except Exception as exc:
        # `Exception` rather than a bare except, so Ctrl-C still interrupts;
        # the reason is printed so failures can be diagnosed.
        print(acc, exc)
        failed_toadd_publication.append(acc)

In [55]:
# Join each sequence with its publication links and show the rows of the
# sequences that were just linked.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
rows = cursor.fetchall()
column_names = [col[0] for col in cursor.description]
df = pd.DataFrame(rows, columns=column_names)
was_linked = df["accession"].isin(upd_accessions)
df[was_linked]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
38,AAL81846.1,HTkB_(Thermococcales),,,,186497.0,Pyrococcus furiosus DSM 3638,Methanobacteriota,Thermococci,,,MGELPIAPVDRLIRKAGAQRVSEQAAKVLAEHLEEKAIEIAKKAVD...,,AAL81846.1,schwab_histones_2024
39,AAL81846.1,HTkB_(Thermococcales),,,,186497.0,Pyrococcus furiosus DSM 3638,Methanobacteriota,Thermococci,,,MGELPIAPVDRLIRKAGAQRVSEQAAKVLAEHLEEKAIEIAKKAVD...,,AAL81846.1,stevens_deep_2022
40,AAL81955.1,HTkA_(Thermococcales),,,,186497.0,Pyrococcus furiosus DSM 3638,Methanobacteriota,Thermococci,,,MGELPIAPVDRLIRKAGAERVSEQAAKVLAEYLEEYAIEVAKKAVE...,,AAL81955.1,schwab_histones_2024
41,AAL81955.1,HTkA_(Thermococcales),,,,186497.0,Pyrococcus furiosus DSM 3638,Methanobacteriota,Thermococci,,,MGELPIAPVDRLIRKAGAERVSEQAAKVLAEYLEEYAIEVAKKAVE...,,AAL81955.1,stevens_deep_2022
105,ACJ15670.1,HTkA_(Thermococcales),,,,523850.0,Thermococcus onnurineus NA1,Methanobacteriota,Thermococci,,,MAELPIAPIDRLIRKAGAERVSEEAAKVLAEYLEEYAMDLAKRAAE...,,ACJ15670.1,schwab_histones_2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4371,RLF78498.1,HTkB_(Thermococcales),,,,2250254.0,Thermococci archaeon,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAPRVSEEAAKVLAEHLEDKAMEIAKKAVE...,,RLF78498.1,stevens_deep_2022
4375,RLF89622.1,HTkB_(Thermococcales),,,,2250254.0,Thermococci archaeon,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAPRVSEEAAKVLAEHLEDKAMEIAKKAVE...,,RLF89622.1,schwab_histones_2024
4376,RLF89622.1,HTkB_(Thermococcales),,,,2250254.0,Thermococci archaeon,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAPRVSEEAAKVLAEHLEDKAMEIAKKAVE...,,RLF89622.1,stevens_deep_2022
4377,RLF90088.1,HTkA_(Thermococcales),,,,2250254.0,Thermococci archaeon,Methanobacteriota,Thermococci,,,MELPIAPIDRLIRKAGAERVSEDAAKVLAEYLEDYAIELAKKSSDF...,,RLF90088.1,schwab_histones_2024


## Add new HTkA_(Thermococcales) histones

In [57]:
# Accessions in `seq_stevens_deep_2022` that do not occur in
# `df_stevens_deep_2022` and whose histone_class is "HTkA-like".
not_in_known = ~seq_stevens_deep_2022["accession"].isin(
    df_stevens_deep_2022["accession"]
)
is_htka_like = seq_stevens_deep_2022["histone_class"] == "HTkA-like"
add_accessions = seq_stevens_deep_2022.loc[
    not_in_known & is_htka_like, "accession"
].values
add_accessions

array(['KE387149.1', 'LT900021.1', 'MTLP01000031.1', 'AJLF01000001.1',
       'LANF01000010.1', 'RLF92116.1', 'RLF83483.1', 'RLF79735.1',
       'NJE09850.1', 'NJE07296.1', 'NJE06015.1', 'NJD99123.1',
       'NJE26742.1', 'NJE27157.1', 'NJE01747.1', 'NJE84772.1',
       'NJF25404.1', 'NJE62344.1', 'NJE54925.1', 'NJE30753.1',
       'NPA48162.1'], dtype=object)

In [58]:
# Number of selected accessions (the array is one-dimensional).
len(add_accessions)

21

In [59]:
# Check which of the selected accessions already exist in `sequence`.
query = "SELECT * FROM sequence"
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [col[0] for col in cursor.description]
df = pd.DataFrame(fetched_rows, columns=column_names)
df.loc[df["accession"].isin(add_accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration


In [60]:
# Fetch each selected accession from the Entrez `protein` database and build
# one row dict per record; `data_sequence` is later passed row by row to the
# `add_sequence` statement.
data_sequence = []
# Accessions Entrez rejected with HTTP 400, kept so they can be inspected
# programmatically instead of being copied by hand from the printed log.
failed_tofetch = []
for acc in add_accessions:
    print("------------------------------------------------")
    print(acc)
    try:
        with Entrez.efetch(
            db="protein", id=acc, rettype="gb", retmode="text"
        ) as handle:
            record = SeqIO.read(handle, "genbank")
        taxonomy_data = get_taxonomy_data(record)
        data_sequence.append(
            {
                "accession": record.id,
                "variant": "HTkA_(Thermococcales)",
                "gi": None,
                "ncbi_gene_id": None,
                "hgnc_gene_name": None,
                # NOTE(review): 9606 is the NCBI taxid of Homo sapiens; it looks
                # like a placeholder that `taxonomy_data` is expected to
                # overwrite below - confirm get_taxonomy_data always returns
                # "taxonomy_id".
                "taxonomy_id": 9606,
                "organism": None,
                "phylum": None,
                "class": None,
                "taxonomy_group": None,
                "info": None,
                "sequence": str(record.seq),
                "variant_under_consideration": None,
            }
        )
        # Overlay the fetched taxonomy fields on the defaults above.
        data_sequence[-1].update(taxonomy_data)
    except HTTPError as err:
        # Only "400 Bad Request" is tolerated; any other HTTP error is re-raised.
        if err.code != 400:
            raise
        print(err)
        failed_tofetch.append(acc)

------------------------------------------------
KE387149.1
HTTP Error 400: Bad Request
------------------------------------------------
LT900021.1
HTTP Error 400: Bad Request
------------------------------------------------
MTLP01000031.1
HTTP Error 400: Bad Request
------------------------------------------------
AJLF01000001.1
HTTP Error 400: Bad Request
------------------------------------------------
LANF01000010.1
HTTP Error 400: Bad Request
------------------------------------------------
RLF92116.1
Fetched taxid from NCBI 2250254
------------------------------------------------
RLF83483.1
Fetched taxid from NCBI 2250254
------------------------------------------------
RLF79735.1
Fetched taxid from NCBI 2250254
------------------------------------------------
NJE09850.1
Fetched taxid from NCBI 1638263
------------------------------------------------
NJE07296.1
Fetched taxid from NCBI 1638262
------------------------------------------------
NJE06015.1
Fetched taxid from NCBI 1638

**Not found:**
- KE387149.1 (gene identifier)
- LT900021.1 (gene identifier)
- MTLP01000031.1
- AJLF01000001.1
- LANF01000010.1

In [62]:
# Execute `add_sequence` once per prepared row. A failing row is reported with
# its accession, the error and every field (value and type), then recorded in
# `failed_toadd`; the loop continues with the next row.
failed_toadd = []
for seq_row in data_sequence:
    try:
        cursor.execute(add_sequence, seq_row)
    except Exception as exc:
        failed_accession = seq_row["accession"]
        print(failed_accession)
        print(exc)
        for field, value in seq_row.items():
            print(field, value, type(value))
        failed_toadd.append(failed_accession)

In [63]:
# Read `sequence` left-joined with `sequence_has_publication` and show the rows
# for `add_accessions` (inspection before the publication links are inserted).
query = (
    "SELECT * "
    "FROM sequence s "
    "LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [col[0] for col in cursor.description]
df = pd.DataFrame(fetched_rows, columns=column_names)
df.loc[df["accession"].isin(add_accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
2627,NJD99123.1,HTkA_(Thermococcales),,,,1638259.0,Thermococcus sp. LS1,Methanobacteriota,Thermococci,,,MAELPIAPIDRLIRKAGAERVSEEAAKVLAEYLEEYAIDVAKKAAE...,,,
2630,NJE01747.1,HTkA_(Thermococcales),,,,1638258.0,Thermococcus sp. JdF3,Methanobacteriota,Thermococci,,,MAELPIAPIDRLIRKAGAERVSEDAAKVLAEYLEEYAIELARKSAD...,,,
2633,NJE06015.1,HTkA_(Thermococcales),,,,1638261.0,Thermococcus sp. M36,Methanobacteriota,Thermococci,,,MAELPIAPIDRLIRKAGAERVSEEAAKVLAEYLEEYAIELSKKAAA...,,,
2634,NJE07296.1,HTkA_(Thermococcales),,,,1638262.0,Thermococcus sp. M39,Methanobacteriota,Thermococci,,,MAELPIAPIDRLIRKAGAERVSEEAAKVLAEYLEEYAIELSKKAVE...,,,
2635,NJE09850.1,HTkA_(Thermococcales),,,,1638263.0,Thermococcus sp. MAR1,Methanobacteriota,Thermococci,,,MAELPIAPIDRLIRKAGAERVSEEAAKILAEYLEEYAIEVGRKSVE...,,,
2637,NJE26742.1,HTkA_(Thermococcales),,,,1638272.0,Thermococcus sp. MV5,Methanobacteriota,Thermococci,,,MAELPIAPIDRLIRKAGAERVSEDAAKVLAEYLEEYAVELAKKSVE...,,,
2638,NJE27157.1,HTkA_(Thermococcales),,,,1638272.0,Thermococcus sp. MV5,Methanobacteriota,Thermococci,,,SEEAAKVLAEYLEEYAIELSKKAAAFARHAGRKTVKAEDIKLAIKS,,,
2639,NJE30753.1,HTkA_(Thermococcales),,,,1638210.0,Thermococcus sp. 18S1,Methanobacteriota,Thermococci,,,MAELPIAPIDRLIRKAGAERVSEDAAKVLAEYLEEYAIEVARKSTD...,,,
2644,NJE54925.1,HTkA_(Thermococcales),,,,1638223.0,Thermococcus sp. 21S9,Methanobacteriota,Thermococci,,,MAELPIAPIDRLIRKAGAERVSEEAAKVLAEYLEEYAIELAKKANE...,,,
2646,NJE62344.1,HTkA_(Thermococcales),,,,1638221.0,Thermococcus sp. 21S7,Methanobacteriota,Thermococci,,,MAELPIAPIDRLIRKAGAERVSEEAAKILAEYLEEYAIELSKKSVQ...,,,


In [64]:
# Link every accession in `add_accessions` to the publication `pid`.
# Failures are collected instead of raised so one bad row does not stop the
# loop (presumably these are accessions that were never inserted into
# `sequence` - verify against the printed error).
failed_toadd_publication = []
for nex_acc in add_accessions:
    try:
        cursor.execute(add_sequence_has_publication, (nex_acc, pid))
    except Exception as err:  # not a bare `except`, so Ctrl-C still interrupts
        # Report why the statement failed, not only which accession failed.
        print(nex_acc)
        print(err)
        failed_toadd_publication.append(nex_acc)

KE387149.1
LT900021.1
MTLP01000031.1
AJLF01000001.1
LANF01000010.1


In [65]:
# Re-run the joined query to inspect the `add_accessions` rows after the
# publication links were inserted.
query = (
    "SELECT * "
    "FROM sequence s "
    "LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [col[0] for col in cursor.description]
df = pd.DataFrame(fetched_rows, columns=column_names)
df.loc[df["accession"].isin(add_accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
2627,NJD99123.1,HTkA_(Thermococcales),,,,1638259.0,Thermococcus sp. LS1,Methanobacteriota,Thermococci,,,MAELPIAPIDRLIRKAGAERVSEEAAKVLAEYLEEYAIDVAKKAAE...,,NJD99123.1,stevens_deep_2022
2630,NJE01747.1,HTkA_(Thermococcales),,,,1638258.0,Thermococcus sp. JdF3,Methanobacteriota,Thermococci,,,MAELPIAPIDRLIRKAGAERVSEDAAKVLAEYLEEYAIELARKSAD...,,NJE01747.1,stevens_deep_2022
2633,NJE06015.1,HTkA_(Thermococcales),,,,1638261.0,Thermococcus sp. M36,Methanobacteriota,Thermococci,,,MAELPIAPIDRLIRKAGAERVSEEAAKVLAEYLEEYAIELSKKAAA...,,NJE06015.1,stevens_deep_2022
2634,NJE07296.1,HTkA_(Thermococcales),,,,1638262.0,Thermococcus sp. M39,Methanobacteriota,Thermococci,,,MAELPIAPIDRLIRKAGAERVSEEAAKVLAEYLEEYAIELSKKAVE...,,NJE07296.1,stevens_deep_2022
2635,NJE09850.1,HTkA_(Thermococcales),,,,1638263.0,Thermococcus sp. MAR1,Methanobacteriota,Thermococci,,,MAELPIAPIDRLIRKAGAERVSEEAAKILAEYLEEYAIEVGRKSVE...,,NJE09850.1,stevens_deep_2022
2637,NJE26742.1,HTkA_(Thermococcales),,,,1638272.0,Thermococcus sp. MV5,Methanobacteriota,Thermococci,,,MAELPIAPIDRLIRKAGAERVSEDAAKVLAEYLEEYAVELAKKSVE...,,NJE26742.1,stevens_deep_2022
2638,NJE27157.1,HTkA_(Thermococcales),,,,1638272.0,Thermococcus sp. MV5,Methanobacteriota,Thermococci,,,SEEAAKVLAEYLEEYAIELSKKAAAFARHAGRKTVKAEDIKLAIKS,,NJE27157.1,stevens_deep_2022
2639,NJE30753.1,HTkA_(Thermococcales),,,,1638210.0,Thermococcus sp. 18S1,Methanobacteriota,Thermococci,,,MAELPIAPIDRLIRKAGAERVSEDAAKVLAEYLEEYAIEVARKSTD...,,NJE30753.1,stevens_deep_2022
2644,NJE54925.1,HTkA_(Thermococcales),,,,1638223.0,Thermococcus sp. 21S9,Methanobacteriota,Thermococci,,,MAELPIAPIDRLIRKAGAERVSEEAAKVLAEYLEEYAIELAKKANE...,,NJE54925.1,stevens_deep_2022
2646,NJE62344.1,HTkA_(Thermococcales),,,,1638221.0,Thermococcus sp. 21S7,Methanobacteriota,Thermococci,,,MAELPIAPIDRLIRKAGAERVSEEAAKILAEYLEEYAIELSKKSVQ...,,NJE62344.1,stevens_deep_2022


In [66]:
# Commit the open transaction so the statements executed in this section are
# persisted. Statements that raised were caught and skipped by the loops above,
# so only the successful ones are written.
conn.commit()

## Add new HTkB_(Thermococcales) histones

In [67]:
# Accessions in `seq_stevens_deep_2022` that do not occur in
# `df_stevens_deep_2022` and whose histone_class is "HTkB-like".
not_in_known = ~seq_stevens_deep_2022["accession"].isin(
    df_stevens_deep_2022["accession"]
)
is_htkb_like = seq_stevens_deep_2022["histone_class"] == "HTkB-like"
add_accessions = seq_stevens_deep_2022.loc[
    not_in_known & is_htkb_like, "accession"
].values
add_accessions

array(['LT900021.1', 'LANF01000006.1', 'KE387150.1', 'MTLP01000042.1',
       'AJLF01000003.1', 'RLF87512.1', 'RLF89200.1', 'RLF86230.1',
       'RLF82482.1', 'NJE09500.1', 'NJE07720.1', 'NJE05021.1',
       'NJD98225.1', 'NJE26633.1', 'NJE02313.1', 'NJE85045.1',
       'NJF25470.1', 'NJE60507.1', 'NJE53991.1', 'NJE29684.1'],
      dtype=object)

In [68]:
# Number of selected accessions (the array is one-dimensional).
len(add_accessions)

20

In [69]:
# Check which of the selected accessions already exist in `sequence`.
query = "SELECT * FROM sequence"
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [col[0] for col in cursor.description]
df = pd.DataFrame(fetched_rows, columns=column_names)
df.loc[df["accession"].isin(add_accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration


In [70]:
# Fetch each selected accession from the Entrez `protein` database and build
# one row dict per record; `data_sequence` is later passed row by row to the
# `add_sequence` statement.
data_sequence = []
# Accessions Entrez rejected with HTTP 400, kept so they can be inspected
# programmatically instead of being copied by hand from the printed log.
failed_tofetch = []
for acc in add_accessions:
    print("------------------------------------------------")
    print(acc)
    try:
        with Entrez.efetch(
            db="protein", id=acc, rettype="gb", retmode="text"
        ) as handle:
            record = SeqIO.read(handle, "genbank")
        taxonomy_data = get_taxonomy_data(record)
        data_sequence.append(
            {
                "accession": record.id,
                "variant": "HTkB_(Thermococcales)",
                "gi": None,
                "ncbi_gene_id": None,
                "hgnc_gene_name": None,
                # NOTE(review): 9606 is the NCBI taxid of Homo sapiens; it looks
                # like a placeholder that `taxonomy_data` is expected to
                # overwrite below - confirm get_taxonomy_data always returns
                # "taxonomy_id".
                "taxonomy_id": 9606,
                "organism": None,
                "phylum": None,
                "class": None,
                "taxonomy_group": None,
                "info": None,
                "sequence": str(record.seq),
                "variant_under_consideration": None,
            }
        )
        # Overlay the fetched taxonomy fields on the defaults above.
        data_sequence[-1].update(taxonomy_data)
    except HTTPError as err:
        # Only "400 Bad Request" is tolerated; any other HTTP error is re-raised.
        if err.code != 400:
            raise
        print(err)
        failed_tofetch.append(acc)

------------------------------------------------
LT900021.1
HTTP Error 400: Bad Request
------------------------------------------------
LANF01000006.1
HTTP Error 400: Bad Request
------------------------------------------------
KE387150.1
HTTP Error 400: Bad Request
------------------------------------------------
MTLP01000042.1
HTTP Error 400: Bad Request
------------------------------------------------
AJLF01000003.1
HTTP Error 400: Bad Request
------------------------------------------------
RLF87512.1
Fetched taxid from NCBI 2250254
------------------------------------------------
RLF89200.1
Fetched taxid from NCBI 2250254
------------------------------------------------
RLF86230.1
Fetched taxid from NCBI 2250254
------------------------------------------------
RLF82482.1
Fetched taxid from NCBI 2250254
------------------------------------------------
NJE09500.1
Fetched taxid from NCBI 1638263
------------------------------------------------
NJE07720.1
Fetched taxid from NCBI 1638

**Not found:**
- LT900021.1 (gene identifier)
- LANF01000006.1
- KE387150.1 (gene identifier)
- MTLP01000042.1
- AJLF01000003.1

In [71]:
# Execute `add_sequence` once per prepared row. A failing row is reported with
# its accession, the error and every field (value and type), then recorded in
# `failed_toadd`; the loop continues with the next row.
failed_toadd = []
for seq_row in data_sequence:
    try:
        cursor.execute(add_sequence, seq_row)
    except Exception as exc:
        failed_accession = seq_row["accession"]
        print(failed_accession)
        print(exc)
        for field, value in seq_row.items():
            print(field, value, type(value))
        failed_toadd.append(failed_accession)

In [72]:
# Read `sequence` left-joined with `sequence_has_publication` and show the rows
# for `add_accessions` (inspection before the publication links are inserted).
query = (
    "SELECT * "
    "FROM sequence s "
    "LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [col[0] for col in cursor.description]
df = pd.DataFrame(fetched_rows, columns=column_names)
df.loc[df["accession"].isin(add_accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
2627,NJD98225.1,HTkB_(Thermococcales),,,,1638259.0,Thermococcus sp. LS1,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAARVSEDAAKLLAEHLEEKALEIAKKAVD...,,,
2632,NJE02313.1,HTkB_(Thermococcales),,,,1638258.0,Thermococcus sp. JdF3,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAARVSEDAAKLLAEHLEEKAMDIARKAVD...,,,
2635,NJE05021.1,HTkB_(Thermococcales),,,,1638261.0,Thermococcus sp. M36,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAARVSEEAAKVLAEHLEEKALEIAKKAVD...,,,
2638,NJE07720.1,HTkB_(Thermococcales),,,,1638262.0,Thermococcus sp. M39,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAQRVSEEAAKLLAEHLEEKAIEIAKKAVD...,,,
2639,NJE09500.1,HTkB_(Thermococcales),,,,1638263.0,Thermococcus sp. MAR1,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAARVSEDAAKLLAEHLEEKAMEIAKKAVD...,,,
2642,NJE26633.1,HTkB_(Thermococcales),,,,1638272.0,Thermococcus sp. MV5,Methanobacteriota,Thermococci,,,MAELPIAPIDRLIRKAGAPRVSEEAAKVMAEHLEEKAMEIAKKAVD...,,,
2645,NJE29684.1,HTkB_(Thermococcales),,,,1638210.0,Thermococcus sp. 18S1,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAARVSEDAAKLLAEHLEEKAMEIARKAVD...,,,
2651,NJE53991.1,HTkB_(Thermococcales),,,,1638223.0,Thermococcus sp. 21S9,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAARVSEEAAKVLAEHLEEKALEIAKKAVA...,,,
2653,NJE60507.1,HTkB_(Thermococcales),,,,1638221.0,Thermococcus sp. 21S7,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAARVSEDAAKLLAEHLEEKAMEIASKAVD...,,,
2659,NJE85045.1,HTkB_(Thermococcales),,,,163006.0,Thermococcus sp. CX2,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAARVSEDAAKLLAEHLEEKALEIAKKAVD...,,,


In [73]:
# Link every accession in `add_accessions` to the publication `pid`.
# Failures are collected instead of raised so one bad row does not stop the
# loop (presumably these are accessions that were never inserted into
# `sequence` - verify against the printed error).
failed_toadd_publication = []
for nex_acc in add_accessions:
    try:
        cursor.execute(add_sequence_has_publication, (nex_acc, pid))
    except Exception as err:  # not a bare `except`, so Ctrl-C still interrupts
        # Report why the statement failed, not only which accession failed.
        print(nex_acc)
        print(err)
        failed_toadd_publication.append(nex_acc)

LT900021.1
LANF01000006.1
KE387150.1
MTLP01000042.1
AJLF01000003.1


In [74]:
# Re-run the joined query to inspect the `add_accessions` rows after the
# publication links were inserted.
query = (
    "SELECT * "
    "FROM sequence s "
    "LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [col[0] for col in cursor.description]
df = pd.DataFrame(fetched_rows, columns=column_names)
df.loc[df["accession"].isin(add_accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
2627,NJD98225.1,HTkB_(Thermococcales),,,,1638259.0,Thermococcus sp. LS1,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAARVSEDAAKLLAEHLEEKALEIAKKAVD...,,NJD98225.1,stevens_deep_2022
2632,NJE02313.1,HTkB_(Thermococcales),,,,1638258.0,Thermococcus sp. JdF3,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAARVSEDAAKLLAEHLEEKAMDIARKAVD...,,NJE02313.1,stevens_deep_2022
2635,NJE05021.1,HTkB_(Thermococcales),,,,1638261.0,Thermococcus sp. M36,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAARVSEEAAKVLAEHLEEKALEIAKKAVD...,,NJE05021.1,stevens_deep_2022
2638,NJE07720.1,HTkB_(Thermococcales),,,,1638262.0,Thermococcus sp. M39,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAQRVSEEAAKLLAEHLEEKAIEIAKKAVD...,,NJE07720.1,stevens_deep_2022
2639,NJE09500.1,HTkB_(Thermococcales),,,,1638263.0,Thermococcus sp. MAR1,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAARVSEDAAKLLAEHLEEKAMEIAKKAVD...,,NJE09500.1,stevens_deep_2022
2642,NJE26633.1,HTkB_(Thermococcales),,,,1638272.0,Thermococcus sp. MV5,Methanobacteriota,Thermococci,,,MAELPIAPIDRLIRKAGAPRVSEEAAKVMAEHLEEKAMEIAKKAVD...,,NJE26633.1,stevens_deep_2022
2645,NJE29684.1,HTkB_(Thermococcales),,,,1638210.0,Thermococcus sp. 18S1,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAARVSEDAAKLLAEHLEEKAMEIARKAVD...,,NJE29684.1,stevens_deep_2022
2651,NJE53991.1,HTkB_(Thermococcales),,,,1638223.0,Thermococcus sp. 21S9,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAARVSEEAAKVLAEHLEEKALEIAKKAVA...,,NJE53991.1,stevens_deep_2022
2653,NJE60507.1,HTkB_(Thermococcales),,,,1638221.0,Thermococcus sp. 21S7,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAARVSEDAAKLLAEHLEEKAMEIASKAVD...,,NJE60507.1,stevens_deep_2022
2659,NJE85045.1,HTkB_(Thermococcales),,,,163006.0,Thermococcus sp. CX2,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAARVSEDAAKLLAEHLEEKALEIAKKAVD...,,NJE85045.1,stevens_deep_2022


In [75]:
# Commit the open transaction so the statements executed in this section are
# persisted. Statements that raised were caught and skipped by the loops above,
# so only the successful ones are written.
conn.commit()

## Other bacteria-type singlets NOT added

In [76]:
# Show the rows of `seq_stevens_deep_2022` whose accession does not occur in
# `df_stevens_deep_2022` and whose histone_class is "bacteria-type singlets".
not_in_known = ~seq_stevens_deep_2022["accession"].isin(
    df_stevens_deep_2022["accession"]
)
is_singlet = seq_stevens_deep_2022["histone_class"] == "bacteria-type singlets"
seq_stevens_deep_2022.loc[not_in_known & is_singlet]

Unnamed: 0,accession,histone_class
145,NJE53775.1,bacteria-type singlets
148,NJE27137.1,bacteria-type singlets


Based on the analysis of the article, `NJE27137.1` is a doublet; it is therefore added together with the bacteria-type doublets below.

## Add bacteria-type doublets

In [80]:
import numpy as np

In [82]:
# Accessions in `seq_stevens_deep_2022` that do not occur in
# `df_stevens_deep_2022` and whose histone_class is "bacteria-type doublets".
not_in_known = ~seq_stevens_deep_2022["accession"].isin(
    df_stevens_deep_2022["accession"]
)
is_doublet = seq_stevens_deep_2022["histone_class"] == "bacteria-type doublets"
doublet_accessions = seq_stevens_deep_2022.loc[
    not_in_known & is_doublet, "accession"
].values
# NJE27137.1 is listed under "bacteria-type singlets" in the table but is
# appended here by hand so that it is processed with the doublets.
add_accessions = np.append(doublet_accessions, "NJE27137.1")
add_accessions

array(['AAL80748.1', 'BAD84939.1', 'ACS33435.1', 'EEB74519.1',
       'AHL22334.1', 'LT900021.1', 'AIU70291.1', 'AJC72509.1',
       'QDA31581.1', 'AEK74020.1', 'AFL95897.1', 'ASJ04868.1',
       'ASJ00818.1', 'AJLF01000001.1', 'ASJ07429.1', 'LANF01000011.1',
       'ADT84726.1', 'AHF81122.1', 'EHR79820.1', 'KE387148.1',
       'ACS89931.1', 'AMQ17906.1', 'RLF83406.1', 'HDZ36459.1',
       'NJE08284.1', 'NJF24142.1', 'NJE55587.1', 'NJE27137.1'],
      dtype=object)

In [83]:
# Check which of the selected accessions already exist in `sequence`.
query = "SELECT * FROM sequence"
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [col[0] for col in cursor.description]
df = pd.DataFrame(fetched_rows, columns=column_names)
df.loc[df["accession"].isin(add_accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration


In [85]:
# Fetch each selected accession from the Entrez `protein` database and build
# one row dict per record; `data_sequence` is later passed row by row to the
# `add_sequence` statement.
data_sequence = []
# Accessions Entrez rejected with HTTP 400, kept so they can be inspected
# programmatically instead of being copied by hand from the printed log.
failed_tofetch = []
for acc in add_accessions:
    print("------------------------------------------------")
    print(acc)
    try:
        with Entrez.efetch(
            db="protein", id=acc, rettype="gb", retmode="text"
        ) as handle:
            record = SeqIO.read(handle, "genbank")
        taxonomy_data = get_taxonomy_data(record)
        data_sequence.append(
            {
                "accession": record.id,
                "variant": "Doublet",
                "gi": None,
                "ncbi_gene_id": None,
                "hgnc_gene_name": None,
                # NOTE(review): 9606 is the NCBI taxid of Homo sapiens; it looks
                # like a placeholder that `taxonomy_data` is expected to
                # overwrite below - confirm get_taxonomy_data always returns
                # "taxonomy_id".
                "taxonomy_id": 9606,
                "organism": None,
                "phylum": None,
                "class": None,
                "taxonomy_group": None,
                "info": None,
                "sequence": str(record.seq),
                "variant_under_consideration": None,
            }
        )
        # Overlay the fetched taxonomy fields on the defaults above.
        data_sequence[-1].update(taxonomy_data)
    except HTTPError as err:
        # Only "400 Bad Request" is tolerated; any other HTTP error is re-raised.
        if err.code != 400:
            raise
        print(err)
        failed_tofetch.append(acc)

------------------------------------------------
AAL80748.1
Fetched taxid from NCBI 186497
------------------------------------------------
BAD84939.1
Fetched taxid from NCBI 69014
------------------------------------------------
ACS33435.1
Fetched taxid from NCBI 593117
------------------------------------------------
EEB74519.1
Fetched taxid from NCBI 246969
------------------------------------------------
AHL22334.1
Fetched taxid from NCBI 195522
------------------------------------------------
LT900021.1
HTTP Error 400: Bad Request
------------------------------------------------
AIU70291.1
Fetched taxid from NCBI 1505907
------------------------------------------------
AJC72509.1
Fetched taxid from NCBI 1432656
------------------------------------------------
QDA31581.1
Fetched taxid from NCBI 2586643
------------------------------------------------
AEK74020.1
Fetched taxid from NCBI 1042877
------------------------------------------------
AFL95897.1
Fetched taxid from NCBI 163003

**Not found:**
- LT900021.1 (gene identifier)
- LANF01000011.1
- KE387148.1 (gene identifier)

In [86]:
# Execute `add_sequence` once per prepared row. A failing row is reported with
# its accession, the error and every field (value and type), then recorded in
# `failed_toadd`; the loop continues with the next row.
failed_toadd = []
for seq_row in data_sequence:
    try:
        cursor.execute(add_sequence, seq_row)
    except Exception as exc:
        failed_accession = seq_row["accession"]
        print(failed_accession)
        print(exc)
        for field, value in seq_row.items():
            print(field, value, type(value))
        failed_toadd.append(failed_accession)

In [87]:
# Read `sequence` left-joined with `sequence_has_publication` and show the rows
# for `add_accessions` (inspection before the publication links are inserted).
query = (
    "SELECT * "
    "FROM sequence s "
    "LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [col[0] for col in cursor.description]
df = pd.DataFrame(fetched_rows, columns=column_names)
df.loc[df["accession"].isin(add_accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
37,AAL80748.1,Doublet,,,,186497.0,Pyrococcus furiosus DSM 3638,Methanobacteriota,Thermococci,,,MAEMVIPYPHLQRILEKTCELAVTKPMAERMMEIVERKLADLFEVA...,,,
123,ACS33435.1,Doublet,,,,593117.0,Thermococcus gammatolerans EJ3,Methanobacteriota,Thermococci,,,MAEMIIPYPQLQKILERTCELAVIKPRAEEMMEIVEKKLADLFEVA...,,,
126,ACS89931.1,Doublet,,,,604354.0,Thermococcus sibiricus MM 739,Methanobacteriota,Thermococci,,,MREMIIPFPQLQKILEKTCELALIKPRAEEMMKIVERKLTDLFEVA...,,,
193,ADT84726.1,Doublet,,,,391623.0,Thermococcus barophilus MP,Methanobacteriota,Thermococci,,,MAEMIIPYPQLQKILERTCELAVIKPRAEEMMEIVEKKLSDLFEVA...,,,
232,AEK74020.1,Doublet,,,,1042877.0,Thermococcus sp. 4557,Methanobacteriota,Thermococci,,,MSEMVIPYPQLQKILERTCELAVIKPRAEEMMGVVEKKLADLFEVA...,,,
246,AFL95897.1,Doublet,,,,163003.0,Thermococcus cleftensis,Methanobacteriota,Thermococci,,,MAEMVIPYPQLQKILERTCELAVIKPRAEEMMEIVEKKLADLFEVA...,,,
282,AHF81122.1,Doublet,,,,582419.0,Thermococcus paralvinellae,Methanobacteriota,Thermococci,,,MAEMIIPYPQLKKILERTCELAVIKPRAEEMMEIVEKKLSDLFEVA...,,,
286,AHL22334.1,Doublet,,,,195522.0,Thermococcus nautili,Methanobacteriota,Thermococci,,,MAEMIIPYPQLQKILERTCELAVIKPRAEEMMDIVEKKLSDLFEVA...,,,
322,AIU70291.1,Doublet,,,,1505907.0,Thermococcus eurythermalis,Methanobacteriota,Thermococci,,,MAEMIIPYPQLQKILERTCELAVIKPRAEEMMEIVEKKLADLFEVA...,,,
333,AJC72509.1,Doublet,,,,1432656.0,Thermococcus guaymasensis DSM 11113,Methanobacteriota,Thermococci,,,MAEMIIPYPQLQKILERTCELAVVKPRAEEMMEIVEKKLADLFEVA...,,,


In [88]:
# Link every accession in `add_accessions` to the publication `pid`.
# Failures are collected instead of raised so one bad row does not stop the
# loop (presumably these are accessions that were never inserted into
# `sequence` - verify against the printed error).
failed_toadd_publication = []
for nex_acc in add_accessions:
    try:
        cursor.execute(add_sequence_has_publication, (nex_acc, pid))
    except Exception as err:  # not a bare `except`, so Ctrl-C still interrupts
        # Report why the statement failed, not only which accession failed.
        print(nex_acc)
        print(err)
        failed_toadd_publication.append(nex_acc)

LT900021.1
AJLF01000001.1
LANF01000011.1
KE387148.1


In [101]:
# Re-run the joined query to inspect the `add_accessions` rows after the
# publication links were inserted.
query = (
    "SELECT * "
    "FROM sequence s "
    "LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [col[0] for col in cursor.description]
df = pd.DataFrame(fetched_rows, columns=column_names)
df.loc[df["accession"].isin(add_accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
37,AAL80748.1,Doublet,,,,186497.0,Pyrococcus furiosus DSM 3638,Methanobacteriota,Thermococci,,,MAEMVIPYPHLQRILEKTCELAVTKPMAERMMEIVERKLADLFEVA...,,AAL80748.1,stevens_deep_2022
123,ACS33435.1,Doublet,,,,593117.0,Thermococcus gammatolerans EJ3,Methanobacteriota,Thermococci,,,MAEMIIPYPQLQKILERTCELAVIKPRAEEMMEIVEKKLADLFEVA...,,ACS33435.1,stevens_deep_2022
126,ACS89931.1,Doublet,,,,604354.0,Thermococcus sibiricus MM 739,Methanobacteriota,Thermococci,,,MREMIIPFPQLQKILEKTCELALIKPRAEEMMKIVERKLTDLFEVA...,,ACS89931.1,stevens_deep_2022
193,ADT84726.1,Doublet,,,,391623.0,Thermococcus barophilus MP,Methanobacteriota,Thermococci,,,MAEMIIPYPQLQKILERTCELAVIKPRAEEMMEIVEKKLSDLFEVA...,,ADT84726.1,stevens_deep_2022
232,AEK74020.1,Doublet,,,,1042877.0,Thermococcus sp. 4557,Methanobacteriota,Thermococci,,,MSEMVIPYPQLQKILERTCELAVIKPRAEEMMGVVEKKLADLFEVA...,,AEK74020.1,stevens_deep_2022
246,AFL95897.1,Doublet,,,,163003.0,Thermococcus cleftensis,Methanobacteriota,Thermococci,,,MAEMVIPYPQLQKILERTCELAVIKPRAEEMMEIVEKKLADLFEVA...,,AFL95897.1,stevens_deep_2022
282,AHF81122.1,Doublet,,,,582419.0,Thermococcus paralvinellae,Methanobacteriota,Thermococci,,,MAEMIIPYPQLKKILERTCELAVIKPRAEEMMEIVEKKLSDLFEVA...,,AHF81122.1,stevens_deep_2022
286,AHL22334.1,Doublet,,,,195522.0,Thermococcus nautili,Methanobacteriota,Thermococci,,,MAEMIIPYPQLQKILERTCELAVIKPRAEEMMDIVEKKLSDLFEVA...,,AHL22334.1,stevens_deep_2022
322,AIU70291.1,Doublet,,,,1505907.0,Thermococcus eurythermalis,Methanobacteriota,Thermococci,,,MAEMIIPYPQLQKILERTCELAVIKPRAEEMMEIVEKKLADLFEVA...,,AIU70291.1,stevens_deep_2022
333,AJC72509.1,Doublet,,,,1432656.0,Thermococcus guaymasensis DSM 11113,Methanobacteriota,Thermococci,,,MAEMIIPYPQLQKILERTCELAVVKPRAEEMMEIVEKKLADLFEVA...,,AJC72509.1,stevens_deep_2022


In [102]:
# Commit the open transaction so the statements executed in this section are
# persisted. Statements that raised were caught and skipped by the loops above,
# so only the successful ones are written.
conn.commit()

# Close connections

In [103]:
# Tear down in reverse order of creation: cursor, MySQL connection, SSH tunnel.
# The nested try/finally guarantees that a failure while closing the cursor or
# the connection does not leave the connection open or the tunnel running.
try:
    cursor.close()
finally:
    try:
        conn.close()
    finally:
        tunnel.stop()