In [1]:
from urllib.error import HTTPError

import pandas as pd
from Bio import Entrez, SeqIO
from mysql.connector import connection
from sshtunnel import SSHTunnelForwarder

# Contact e-mail attached to the NCBI Entrez requests issued through Biopython below.
Entrez.email = "l.singh@intbio.org"

In [2]:
# Load the SSH / database connection settings from a plain "key=value" text file.
# Blank lines and lines starting with "#" are ignored.
with open("db_curated_server_info.txt", "r") as file:
    lines = file.readlines()

config = {}

for line_number, line in enumerate(lines, start=1):
    line = line.strip()
    if not line or line.startswith("#"):
        continue
    key, separator, value = line.partition("=")
    if not separator:
        # Fail with the offending line instead of an opaque unpacking error.
        raise ValueError(
            f"db_curated_server_info.txt, line {line_number}: expected 'key=value', got {line!r}"
        )
    # Strip the key as well, so that "server_name = host" is found by config.get("server_name").
    config[key.strip()] = value.strip()


def _required_int(key):
    """Return ``config[key]`` converted to ``int``.

    Raises
    ------
    KeyError
        If ``key`` is absent from the configuration file (instead of the
        ``TypeError`` that ``int(None)`` would produce).
    ValueError
        If the stored value is not a valid integer.
    """
    raw_value = config.get(key)
    if raw_value is None:
        raise KeyError(f"'{key}' is missing from db_curated_server_info.txt")
    return int(raw_value)


# NOTE: "srever_port" and "db_adress" are the (misspelled) key names expected in the
# configuration file; they are kept as-is so existing files continue to work.
server_name = config.get("server_name")
srever_port = _required_int("srever_port")
ssh_password = config.get("ssh_password")
ssh_username = config.get("ssh_username")
db_adress = config.get("db_adress")
db_port = _required_int("db_port")

In [3]:
# Open an SSH tunnel to the server and forward a local port to the database host.
ssh_endpoint = (server_name, srever_port)
database_endpoint = (db_adress, db_port)

tunnel = SSHTunnelForwarder(
    ssh_endpoint,
    ssh_username=ssh_username,
    ssh_password=ssh_password,
    remote_bind_address=database_endpoint,
)
tunnel.start()

# The local port chosen for the tunnel; the MySQL connection below connects to it.
print(tunnel.local_bind_port)

46789


In [4]:
# Connect to MySQL through the local end of the SSH tunnel.
# NOTE(review): user, password and database name are hard-coded here; consider reading
# them from the same configuration file as the SSH settings.
connection_settings = {
    "user": "db_user",
    "password": "db_password",
    "host": "localhost",
    "port": tunnel.local_bind_port,
    "database": "db_name",
}
conn = connection.MySQLConnection(**connection_settings)
cursor = conn.cursor()

In [5]:
# Sanity check: list the tables available in the connected database.
query = "SHOW TABLES;"
cursor.execute(query)
table_rows = cursor.fetchall()
table_rows

[('alternative_name',),
 ('histone',),
 ('histone_description',),
 ('histone_has_publication',),
 ('publication',),
 ('sequence',),
 ('sequence_has_publication',)]

In [6]:
# Parameterized INSERT templates passed to cursor.execute() in later cells.
# Templates that are not used in this notebook are kept commented out for reference.

# add_histone = (
#     "INSERT INTO histone "
#     "(id, level, taxonomic_span, taxonomic_span_id, description, parent) "
#     "VALUES (%(id)s, %(level)s, %(taxonomic_span)s, %(taxonomic_span_id)s, %(description)s, %(parent)s)"
# )
# add_histone_description = (
#     "INSERT INTO histone_description "
#     "(summary, taxonomy, genes, evolution, expression, knock_out, function, sequence, localization, deposition, structure, interactions, disease, caveats) "
#     "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
# )
# Insert one publication row; expects a dict with keys id, title, doi, author, year.
add_publication = (
    "INSERT INTO publication "
    "(id, title, doi, author, year) "
    "VALUES (%(id)s, %(title)s, %(doi)s, %(author)s, %(year)s)"
)
# Insert one sequence row; expects a dict keyed by the column names listed below.
add_sequence = (
    "INSERT INTO sequence "
    "(accession, variant, gi, ncbi_gene_id, hgnc_gene_name, taxonomy_id, organism, phylum, class, taxonomy_group, info, sequence, variant_under_consideration) "
    "VALUES (%(accession)s, %(variant)s, %(gi)s, %(ncbi_gene_id)s, %(hgnc_gene_name)s, %(taxonomy_id)s, %(organism)s, %(phylum)s, %(class)s, %(taxonomy_group)s, %(info)s, %(sequence)s, %(variant_under_consideration)s)"
)
# Link a sequence to a publication; expects a (sequence_accession, publication_id) tuple.
add_sequence_has_publication = (
    "INSERT INTO sequence_has_publication "
    "(sequence_accession, publication_id) "
    "VALUES (%s, %s)"
)
# add_alternate_names = (
#     "INSERT INTO alternative_name "
#     "(name, taxonomy, gene, splice, histone) "
#     "VALUES (%(name)s, %(taxonomy)s, %(gene)s, %(splice)s, %(histone)s)"
# )
# add_histone_has_publication = (
#     "INSERT INTO histone_has_publication "
#     "(histone_id, publication_id) "
#     "VALUES (%s, %s)"
# )

In [7]:
def get_taxonomy_data(record, max_attempts=10):
    """Collect organism, taxonomy id, phylum and class for a sequence record.

    Parameters
    ----------
    record
        Object exposing ``annotations["organism"]`` and, optionally,
        ``features[0].qualifiers["db_xref"]`` as a list of ``"database:id"``
        strings (the notebook passes the result of ``SeqIO.read(handle, "genbank")``).
    max_attempts : int, optional
        How many times the NCBI taxonomy lookup is tried before giving up
        (default 10, the value that used to be hard-coded).

    Returns
    -------
    dict
        Keys ``organism``, ``taxonomy_id``, ``phylum`` and ``class``.
        ``taxonomy_id`` falls back to ``1`` when the record carries no usable
        ``taxon:<id>`` cross-reference; ``phylum`` / ``class`` are ``None``
        when the lineage could not be retrieved or lacks that rank.

    Raises
    ------
    KeyError
        If ``record.annotations`` has no ``"organism"`` entry.
    """
    taxonomy_data = {"organism": record.annotations["organism"]}

    # The cross-references live on the first feature; tolerate records without it.
    try:
        db_xrefs = record.features[0].qualifiers["db_xref"]
    except (AttributeError, IndexError, KeyError, TypeError):
        db_xrefs = []

    taxonomy_id = None
    for xref in db_xrefs:
        database, _, identifier = str(xref).partition(":")
        if database != "taxon":
            continue
        try:
            taxonomy_id = int(identifier)
        except ValueError:
            # Malformed taxon entry: ignore it and keep looking.
            continue
        print(f"Fetched taxid from NCBI {taxonomy_id}")

    if taxonomy_id is None:
        # Also reached when db_xref exists but contains no "taxon:" entry; previously
        # that case left the key unset and ended in an uncaught KeyError further down.
        print("!!!!!!Unable to get TAXID for this record setting it to 1")
        taxonomy_id = 1  # unable to identify
    taxonomy_data["taxonomy_id"] = taxonomy_id

    lineage = {}
    for attempt in range(max_attempts):
        try:
            # Context manager guarantees the network handle is closed after parsing.
            with Entrez.efetch(id=taxonomy_id, db="taxonomy", retmode="xml") as handle:
                tax_data = Entrez.read(handle)
            lineage = {
                node["Rank"]: node["ScientificName"]
                for node in tax_data[0]["LineageEx"]
                if node["Rank"] in ("class", "phylum")
            }
            break
        except Exception as err:  # best-effort lookup: report and retry
            print(
                "Unexpected error: {}, Retrying, attempt {}".format(type(err), attempt)
            )
    else:
        # Loop finished without a successful fetch.
        print(
            f"FATAL ERROR could not get class and phylum from NCBI after {max_attempts} attempts for taxid:{taxonomy_id}. Will add None for class and phylum!"
        )

    # Convert the parsed values to plain str so they can be bound as SQL parameters.
    for rank in ("phylum", "class"):
        rank_name = lineage.get(rank)
        taxonomy_data[rank] = None if rank_name is None else str(rank_name)
    return taxonomy_data

# Add other archaeal sequences as Nucleosomal

These sequences come from the supplementary materials of [this article](https://www.pnas.org/doi/suppl/10.1073/pnas.2007056117#supplementary-materials).

pid='stevens_histone_2020'

**Sequences were added only from the file pnas.2007056117.sd07.txt; the remaining supplementary files should still be parsed to check whether they contain additional sequences.**

In [8]:
# Accessions of the sequences handled in this section; later cells link them to the
# publication id "stevens_histone_2020" and insert the ones missing from the database.
# NOTE(review): "Msp_0924.b" does not look like a protein accession; the recorded output
# of the publication-linking cell further down lists it as failed — confirm the intended id.
accessions = [
    "ADC47610.1",
    "AMK15744.1",
    "ADC47581.1",
    "WP_042707783.1",
    "WP_040681874.1",
    "AGN16739.1",
    "AMK16160.1",
    "ADC45863.1",
    "ABQ87465.1",
    "CDF29678.1",
    "ALT67815.1",
    "WP_042694673.1",
    "RAP47890.1",
    "PAV07521.1",
    "OED29812.1",
    "Msp_0924.b",
    "OEC91430.1",
    "RAP45480.1",
    "RAP45776.1",
    "RAP50668.1",
    "RAP54481.1",
    "AWX32096.1",
    "RAP52332.1",
    "PAV06688.1",
    "ADC46625.1",
    "WP_042703161.1",
    "WP_042708287.1",
    "CDG64810.1",
    "WP_048080739.1",
    "ADZ10152.1",
    "WP_013645503.1",
    "AEG17722.1",
    "WP_048082423.1",
    "WP_048079919.1",
    "WP_048080127.1",
    "KUK01880.1",
    "BAM69466.1",
    "ADL58299.1",
    "AAB84760.1",
    "ADP77985.1",
    "ADP77717.1",
    "AEG18760.1",
    "WP_048080060.1",
    "CDG65471.1",
    "WP_048191214.1",
    "ADZ08863.1",
    "EKQ54952.1",
    "KUK74590.1",
    "AIS31421.1",
    "WP_048080037.1",
    "WP_048082428.1",
    "CDG65490.1",
    "KUK72323.1",
    "EKQ54972.1",
    "AIS31443.1",
    "AEG18782.1",
    "WP_048191238.1",
    "ADZ08844.1",
    "KUK00930.1",
    "OED30417.1",
    "RAP44245.1",
    "PAV07344.1",
    "ABC56541.1",
    "OEC90511.1",
    "RAP50801.1",
    "AWX32771.1",
    "RAP53893.1",
    "RAP43830.1",
    "RAP51060.1",
    "AWX32048.1",
    "RAP53014.1",
    "RAP54547.1",
    "RAP45660.1",
    "RAP45179.1",
    "RAP51231.1",
    "RAP48712.1",
    "RAP54459.1",
    "RAP52148.1",
    "AWX32163.1",
    "RAP50540.1",
    "RAP53588.1",
    "RAP50719.1",
    "AWX33002.1",
    "RAP51671.1",
    "AWX32353.1",
    "RAP54286.1",
    "AWX31846.1",
    "AWX33472.1",
    "RAP48522.1",
    "RAP45846.1",
    "RAP53015.1",
    "OEC91351.1",
    "ABC56917.1",
    "RAP45999.1",
    "RAP46909.1",
    "RAP51754.1",
    "ABC56586.1",
    "RAP49193.1",
    "RAP50856.1",
    "AWX32727.1",
    "RAP53921.1",
    "RAP43478.1",
    "PAV06641.1",
    "OED29760.1",
    "RAP44398.1",
    "PAV07629.1",
    "PAV06868.1",
    "RAP50195.1",
    "RAP53391.1",
    "RAP44877.1",
    "OED30104.1",
    "PAV06893.1",
    "RAP45065.1",
    "ABC57012.1",
    "OEC93501.1",
    "RAP54253.1",
    "AWX32394.1",
    "RAP51588.1",
    "OED30333.1",
    "PAV06619.1",
    "ABC57162.1",
    "OEC93458.1",
    "KUK00359.1",
    "BAM70796.1",
    "AAB86168.1",
    "ADL57882.1",
    "WP_048190683.1",
    "WP_048191910.1",
    "WP_048189825.1",
    "WP_048190788.1",
    "WP_048190279.1",
    "AAB85321.1",
    "BAM69986.1",
    "ADL58808.1",
    "AIS32303.1",
    "KUK71850.1",
    "EKQ55831.1",
    "WP_042703508.1",
    "ADC47536.1",
    "CDF28895.1",
    "ABQ87049.1",
    "WP_042691314.1",
    "WP_042706862.1",
    "AMK15548.1",
    "ADC47341.1",
    "ALT68631.1",
    "WP_042703701.1",
    "AGN16601.1",
    "WP_016358304.1",
    "WP_042707525.1",
    "WP_081738351.1",
    "ABQ86418.1",
    "CDF28750.1",
    "ALT69537.1",
    "AMK15711.1",
    "WP_016358130.1",
    "AGN16427.1",
    "RAP47010.1",
    "OEC87236.1",
    "ABC56784.1",
    "RAP44639.1",
    "PAV07855.1",
    "OED30073.1",
    "AWX32523.1",
    "RAP54124.1",
    "RAP51258.1",
    "KYC44834.1",
    "KYC49354.1",
]

In [10]:
# Load every sequence together with its (optional) publication link and show the rows
# whose accession appears in the list above.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(fetched_rows, columns=column_names)
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
12,AAB84760.1,Nucleosomal,,,,187420.0,Methanothermobacter thermautotrophicus str. De...,Methanobacteriota,Methanobacteria,,,MRWLIMELPIAPIGRIIKNAGAEIVSDDAREALAKVLEAKGEEIAE...,,AAB84760.1,schwab_histones_2024
13,AAB85321.1,Nucleosomal,,,,187420.0,Methanothermobacter thermautotrophicus str. De...,Methanobacteriota,Methanobacteria,,,MAELPIAPVGRIIKNAGAQRISDDAKEALAKALEEMGEEISRKAVE...,,AAB85321.1,schwab_histones_2024
14,AAB86168.1,Nucleosomal,,,,187420.0,Methanothermobacter thermautotrophicus str. De...,Methanobacteriota,Methanobacteria,,,MAELPIAPVGRIIKNAGAQRISDDAREALAKILEEKGEEIAKEAVK...,,AAB86168.1,schwab_histones_2024
59,ABC56541.1,Nucleosomal,,,,339860.0,Methanosphaera stadtmanae DSM 3091,Methanobacteriota,Methanobacteria,,,MAELPIAPVGRLLKNAGASRISDDAKEELAEVLETFGTSIAEDAVK...,,ABC56541.1,schwab_histones_2024
60,ABC56586.1,Nucleosomal,,,,339860.0,Methanosphaera stadtmanae DSM 3091,Methanobacteriota,Methanobacteria,,,MELPIAPVGRIIKNAGADRISDDAKAELTQILEQIGEEISKDAIQV...,,ABC56586.1,schwab_histones_2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4261,RAP54253.1,Nucleosomal,,,,1945578.0,Methanosphaera sp. rholeuAM130,Methanobacteriota,Methanobacteria,,,MEQLPIAPMGRILSNAGASRATKDAKIELSKRLSELGEAISAEAVA...,,RAP54253.1,schwab_histones_2024
4262,RAP54286.1,Nucleosomal,,,,1945578.0,Methanosphaera sp. rholeuAM130,Methanobacteriota,Methanobacteria,,,MAILPKAPVKRILSNSGVSRVSDDAVDELINILEEYGEEISKRSIK...,,RAP54286.1,schwab_histones_2024
4263,RAP54459.1,Nucleosomal,,,,1945578.0,Methanosphaera sp. rholeuAM130,Methanobacteriota,Methanobacteria,,,MAELPIAPIKRILKDAGAQRVSDDAAAALAKVLEEYGEQISEDANR...,,RAP54459.1,schwab_histones_2024
4264,RAP54481.1,Nucleosomal,,,,1945578.0,Methanosphaera sp. rholeuAM130,Methanobacteriota,Methanobacteria,,,MTEIPKAPITRIVKNAGAERISKDAEEKFVEAVEAYTAKLAEAAID...,,RAP54481.1,schwab_histones_2024


In [11]:
# Number of distinct accessions in the list.
len(set(accessions))

168

In [12]:
# For the listed accessions, count how many joined rows point at each publication id.
df[df["accession"].isin(accessions)]["publication_id"].value_counts()

publication_id
schwab_histones_2024        117
stevens_histone_2020          2
mattiroli_structure_2017      1
henneman_structure_2018       1
Name: count, dtype: int64

## Add reference for existing sequences

In [13]:
# Listed accessions that are already linked to publication "stevens_histone_2020".
df[df["accession"].isin(accessions) & (df["publication_id"] == "stevens_histone_2020")]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
183,ADP77717.1,HMfA_(Methanothermus_fervidus),,,,523846.0,Methanothermus fervidus DSM 2088,Methanobacteriota,Methanobacteria,,,MGELPIAPIGRIIKNAGAERVSDDARIALAKVLEEMGEEIASEAVK...,,ADP77717.1,stevens_histone_2020
187,ADP77985.1,HMfB_(Methanothermus_fervidus),,,,523846.0,Methanothermus fervidus DSM 2088,Methanobacteriota,Methanobacteria,,,MELPIAPIGRIIKDAGAERVSDDARITLAKILEEMGRDIASEAIKL...,,ADP77985.1,stevens_histone_2020


In [14]:
# Accessions from the list that are present in the database but not yet linked to
# "stevens_histone_2020" (those already linked are subtracted).
in_accession_list = df["accession"].isin(accessions)
linked_to_stevens = df["publication_id"] == "stevens_histone_2020"
other_publication = df["publication_id"] != "stevens_histone_2020"

already_linked = set(df.loc[in_accession_list & linked_to_stevens, "accession"].unique())
existing = set(df.loc[in_accession_list & other_publication, "accession"].unique())
existing = existing - already_linked
existing

{'AAB84760.1',
 'AAB85321.1',
 'AAB86168.1',
 'ABC56541.1',
 'ABC56586.1',
 'ABC56917.1',
 'ABC57012.1',
 'ABC57162.1',
 'ABQ86418.1',
 'ABQ87049.1',
 'ABQ87465.1',
 'ADC45863.1',
 'ADC46625.1',
 'ADC47341.1',
 'ADC47536.1',
 'ADC47581.1',
 'ADC47610.1',
 'ADL57882.1',
 'ADL58299.1',
 'ADL58808.1',
 'ADZ08844.1',
 'ADZ08863.1',
 'ADZ10152.1',
 'AEG17722.1',
 'AEG18760.1',
 'AEG18782.1',
 'AGN16427.1',
 'AGN16601.1',
 'AGN16739.1',
 'AIS31421.1',
 'AIS31443.1',
 'AIS32303.1',
 'ALT68631.1',
 'ALT69537.1',
 'AMK15548.1',
 'AMK15711.1',
 'AMK15744.1',
 'AWX32048.1',
 'AWX32096.1',
 'AWX32163.1',
 'AWX32353.1',
 'AWX32394.1',
 'AWX32523.1',
 'AWX32727.1',
 'AWX32771.1',
 'AWX33002.1',
 'AWX33472.1',
 'BAM69466.1',
 'BAM69986.1',
 'BAM70796.1',
 'CDF28895.1',
 'CDG64810.1',
 'CDG65471.1',
 'CDG65490.1',
 'EKQ54952.1',
 'EKQ54972.1',
 'EKQ55831.1',
 'KYC44834.1',
 'KYC49354.1',
 'OEC90511.1',
 'OEC91351.1',
 'OEC91430.1',
 'OEC93501.1',
 'OED29760.1',
 'OED29812.1',
 'OED30073.1',
 'OED30104

In [15]:
# Check that the publication row the sequences will be linked to exists.
pid = "stevens_histone_2020"
# Bind the id as a parameter (same %s style as the INSERT templates) instead of
# formatting it into the SQL text.
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,stevens_histone_2020,Histone variants in archaea and the evolution ...,10.1073/pnas.2007056117,,2020


In [16]:
# Link every already-present sequence to the publication; collect the ones that fail.
failed_toadd_publication = []
for ex_acc in existing:
    try:
        cursor.execute(add_sequence_has_publication, (ex_acc, pid))
    except Exception as e:
        # Report the failure and keep going with the remaining accessions.
        print(ex_acc)
        print(e)
        # Was `failed_toadd_publication(ex_acc)`, which calls the list and raises TypeError.
        failed_toadd_publication.append(ex_acc)

In [17]:
# Reload the joined table to verify the newly added publication links.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(fetched_rows, columns=column_names)
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
12,AAB84760.1,Nucleosomal,,,,187420.0,Methanothermobacter thermautotrophicus str. De...,Methanobacteriota,Methanobacteria,,,MRWLIMELPIAPIGRIIKNAGAEIVSDDAREALAKVLEAKGEEIAE...,,AAB84760.1,schwab_histones_2024
13,AAB84760.1,Nucleosomal,,,,187420.0,Methanothermobacter thermautotrophicus str. De...,Methanobacteriota,Methanobacteria,,,MRWLIMELPIAPIGRIIKNAGAEIVSDDAREALAKVLEAKGEEIAE...,,AAB84760.1,stevens_histone_2020
14,AAB85321.1,Nucleosomal,,,,187420.0,Methanothermobacter thermautotrophicus str. De...,Methanobacteriota,Methanobacteria,,,MAELPIAPVGRIIKNAGAQRISDDAKEALAKALEEMGEEISRKAVE...,,AAB85321.1,schwab_histones_2024
15,AAB85321.1,Nucleosomal,,,,187420.0,Methanothermobacter thermautotrophicus str. De...,Methanobacteriota,Methanobacteria,,,MAELPIAPVGRIIKNAGAQRISDDAKEALAKALEEMGEEISRKAVE...,,AAB85321.1,stevens_histone_2020
16,AAB86168.1,Nucleosomal,,,,187420.0,Methanothermobacter thermautotrophicus str. De...,Methanobacteriota,Methanobacteria,,,MAELPIAPVGRIIKNAGAQRISDDAREALAKILEEKGEEIAKEAVK...,,AAB86168.1,schwab_histones_2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4377,RAP54459.1,Nucleosomal,,,,1945578.0,Methanosphaera sp. rholeuAM130,Methanobacteriota,Methanobacteria,,,MAELPIAPIKRILKDAGAQRVSDDAAAALAKVLEEYGEQISEDANR...,,RAP54459.1,stevens_histone_2020
4378,RAP54481.1,Nucleosomal,,,,1945578.0,Methanosphaera sp. rholeuAM130,Methanobacteriota,Methanobacteria,,,MTEIPKAPITRIVKNAGAERISKDAEEKFVEAVEAYTAKLAEAAID...,,RAP54481.1,schwab_histones_2024
4379,RAP54481.1,Nucleosomal,,,,1945578.0,Methanosphaera sp. rholeuAM130,Methanobacteriota,Methanobacteria,,,MTEIPKAPITRIVKNAGAERISKDAEEKFVEAVEAYTAKLAEAAID...,,RAP54481.1,stevens_histone_2020
4380,RAP54547.1,Nucleosomal,,,,1945578.0,Methanosphaera sp. rholeuAM130,Methanobacteriota,Methanobacteria,,,MAELPIAPVKRIIKQAGGKRISDEAAVELANVLEEYGIEVAERANK...,,RAP54547.1,schwab_histones_2024


## Add other sequences

In [18]:
# Accessions from the list that have no row in the sequence table yet.
known_accessions = set(df["accession"])
not_exist = {accession for accession in accessions if accession not in known_accessions}
len(not_exist)

50

In [19]:
# Confirm that none of the "missing" accessions is present in the database (expect no rows).
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(fetched_rows, columns=column_names)
df.loc[df["accession"].isin(not_exist)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id


In [20]:
# Download each missing record from the NCBI protein database and turn it into a
# parameter dict for the `add_sequence` INSERT statement.
# Columns that are left empty unless get_taxonomy_data() supplies a value.
empty_columns = (
    "gi",
    "ncbi_gene_id",
    "hgnc_gene_name",
    "taxonomy_id",
    "organism",
    "phylum",
    "class",
    "taxonomy_group",
    "info",
)

data_sequence = []
for acc in not_exist:
    print("------------------------------------------------")
    print(acc)
    try:
        with Entrez.efetch(
            db="protein", id=acc, rettype="gb", retmode="text"
        ) as handle:
            record = SeqIO.read(handle, "genbank")
        taxonomy_data = get_taxonomy_data(record)
    except HTTPError as err:
        # Only HTTP 400 responses are tolerated: report them and skip this accession.
        if err.code != 400:
            raise
        print(err)
    else:
        sequence_row = {
            "accession": record.id,
            "variant": "Nucleosomal",
            **dict.fromkeys(empty_columns),
            "sequence": str(record.seq),
            "variant_under_consideration": None,
        }
        # Overlay organism / taxonomy_id / phylum / class from the taxonomy lookup.
        sequence_row.update(taxonomy_data)
        data_sequence.append(sequence_row)

------------------------------------------------
WP_042694673.1
Fetched taxid from NCBI 66851
------------------------------------------------
WP_040681874.1
Fetched taxid from NCBI 1348249
------------------------------------------------
WP_048082423.1
Fetched taxid from NCBI 2160
------------------------------------------------
WP_042707525.1
Fetched taxid from NCBI 190977
------------------------------------------------
WP_042691314.1
Fetched taxid from NCBI 2172
------------------------------------------------
WP_048080739.1
Fetched taxid from NCBI 2160
------------------------------------------------
WP_048080037.1
Fetched taxid from NCBI 2160
------------------------------------------------
KUK72323.1
Fetched taxid from NCBI 1641383
Unexpected error: <class 'urllib.error.HTTPError'>, Retrying, attempt 0
------------------------------------------------
WP_048079919.1
Fetched taxid from NCBI 2160
------------------------------------------------
WP_016358304.1
Fetched taxid from NCB

In [21]:
# Number of entries collected in data_sequence by the fetch loop above.
len(data_sequence)

49

In [22]:
# Inspect one collected entry: column name, value and Python type of the value.
sample_entry = data_sequence[1]
for field, value in sample_entry.items():
    print(field, value, type(value))

accession WP_040681874.1 <class 'str'>
variant Nucleosomal <class 'str'>
gi None <class 'NoneType'>
ncbi_gene_id None <class 'NoneType'>
hgnc_gene_name None <class 'NoneType'>
taxonomy_id 1348249 <class 'int'>
organism Methanobrevibacter boviskoreani <class 'str'>
phylum Methanobacteriota <class 'str'>
class Methanobacteria <class 'str'>
taxonomy_group None <class 'NoneType'>
info None <class 'NoneType'>
sequence MGIAKAPINRIIKEAGAERVSGEATDALVKYLEEEAEAIATKAIEYAKIAKRQTVKADDIELAIKDQ <class 'str'>
variant_under_consideration None <class 'NoneType'>


In [23]:
# Insert the fetched sequences; on failure print the offending row and remember its accession.
failed_toadd = []
for sequence_row in data_sequence:
    accession = sequence_row["accession"]
    # Only insert rows whose accession is one of the missing ones.
    if accession in not_exist:
        try:
            cursor.execute(add_sequence, sequence_row)
        except Exception as error:
            print(accession)
            print(error)
            for field, value in sequence_row.items():
                print(field, value, type(value))
            failed_toadd.append(accession)

In [24]:
# Verify that the new sequences are now in the table (publication columns still empty).
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(fetched_rows, columns=column_names)
df.loc[df["accession"].isin(not_exist)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
66,ABC56784.1,Nucleosomal,,,,339860.0,Methanosphaera stadtmanae DSM 3091,Methanobacteriota,Methanobacteria,,,MSELPLTPLGRIIKNGGAERVSEDAKVELSAFLEDTAEELAKLALN...,,,
413,ALT67815.1,Nucleosomal,,,,230361.0,Methanobrevibacter millerae,Methanobacteriota,Methanobacteria,,,MAEIPKAPIARIIKDTGAERVSEDAKVELAEALEEIARNIAIKANE...,,,
435,AMK16160.1,Nucleosomal,,,,294671.0,Methanobrevibacter olleyae,Methanobacteriota,Methanobacteria,,,MSIPVAPIGRIIKEAGAERVSEDAKKELNAYVEAQATEIAKKAIKF...,,,
574,AWX31846.1,Nucleosomal,,,,1789762.0,Methanosphaera sp. BMS,Methanobacteriota,Methanobacteria,,,MSENRELILNSIKTNGKQITDDALDNLEELLNVVEEDPFETFNDKI...,,,
717,CDF28750.1,Nucleosomal,,,,1263088.0,Methanobrevibacter smithii CAG:186,Methanobacteriota,Methanobacteria,,,MELPIAPVGRILKNAGAQRVSDDAKIALTEAIEECGNEIAQKAVGF...,,,
721,CDF29678.1,Nucleosomal,,,,1263088.0,Methanobrevibacter smithii CAG:186,Methanobacteriota,Methanobacteria,,,MSEIPKAPIARIIKDTGAERVSEDAKAELAEYLEEVARDVAIEANN...,,,
1989,KUK00359.1,Nucleosomal,,,,1635284.0,Methanobacteriaceae archaeon 41_258,Methanobacteriota,Methanobacteria,,,MVELPIAPVGRIIKNAGAERISNDAREELAKALEKMGEEIAASAVK...,,,
1990,KUK00930.1,Nucleosomal,,,,1635284.0,Methanobacteriaceae archaeon 41_258,Methanobacteriota,Methanobacteria,,,MGQLPIAPVGRIIKNAGAQRISDDAREALAKALEEIGEEIAASAVK...,,,
1991,KUK01880.1,Nucleosomal,,,,1635284.0,Methanobacteriaceae archaeon 41_258,Methanobacteriota,Methanobacteria,,,MELPIAPIGRIIKNAGAERVSDDAREALAKALEAKGEEIAAAAIKF...,,,
2000,KUK71850.1,Nucleosomal,,,,1641383.0,Methanobacterium sp. 42_16,Methanobacteriota,Methanobacteria,,,MLSTMADYCMKTDIFKYYRVIAKLGGENMAELPIAPVGRIIKNAGA...,,,


In [25]:
# Link the newly inserted sequences to the publication; collect the ones that fail
# (e.g. accessions whose sequence row could not be inserted).
failed_toadd_publication = []
for nex_acc in not_exist:
    try:
        cursor.execute(add_sequence_has_publication, (nex_acc, pid))
    except Exception as e:
        # `Exception` rather than a bare except, so Ctrl-C still interrupts the loop;
        # the error text is printed so the cause of the failure is visible.
        print(nex_acc)
        print(e)
        failed_toadd_publication.append(nex_acc)

Msp_0924.b


In [26]:
# Verify that the newly inserted sequences now carry the publication link.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(fetched_rows, columns=column_names)
df.loc[df["accession"].isin(not_exist)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
66,ABC56784.1,Nucleosomal,,,,339860.0,Methanosphaera stadtmanae DSM 3091,Methanobacteriota,Methanobacteria,,,MSELPLTPLGRIIKNGGAERVSEDAKVELSAFLEDTAEELAKLALN...,,ABC56784.1,stevens_histone_2020
413,ALT67815.1,Nucleosomal,,,,230361.0,Methanobrevibacter millerae,Methanobacteriota,Methanobacteria,,,MAEIPKAPIARIIKDTGAERVSEDAKVELAEALEEIARNIAIKANE...,,ALT67815.1,stevens_histone_2020
435,AMK16160.1,Nucleosomal,,,,294671.0,Methanobrevibacter olleyae,Methanobacteriota,Methanobacteria,,,MSIPVAPIGRIIKEAGAERVSEDAKKELNAYVEAQATEIAKKAIKF...,,AMK16160.1,stevens_histone_2020
574,AWX31846.1,Nucleosomal,,,,1789762.0,Methanosphaera sp. BMS,Methanobacteriota,Methanobacteria,,,MSENRELILNSIKTNGKQITDDALDNLEELLNVVEEDPFETFNDKI...,,AWX31846.1,stevens_histone_2020
717,CDF28750.1,Nucleosomal,,,,1263088.0,Methanobrevibacter smithii CAG:186,Methanobacteriota,Methanobacteria,,,MELPIAPVGRILKNAGAQRVSDDAKIALTEAIEECGNEIAQKAVGF...,,CDF28750.1,stevens_histone_2020
721,CDF29678.1,Nucleosomal,,,,1263088.0,Methanobrevibacter smithii CAG:186,Methanobacteriota,Methanobacteria,,,MSEIPKAPIARIIKDTGAERVSEDAKAELAEYLEEVARDVAIEANN...,,CDF29678.1,stevens_histone_2020
1989,KUK00359.1,Nucleosomal,,,,1635284.0,Methanobacteriaceae archaeon 41_258,Methanobacteriota,Methanobacteria,,,MVELPIAPVGRIIKNAGAERISNDAREELAKALEKMGEEIAASAVK...,,KUK00359.1,stevens_histone_2020
1990,KUK00930.1,Nucleosomal,,,,1635284.0,Methanobacteriaceae archaeon 41_258,Methanobacteriota,Methanobacteria,,,MGQLPIAPVGRIIKNAGAQRISDDAREALAKALEEIGEEIAASAVK...,,KUK00930.1,stevens_histone_2020
1991,KUK01880.1,Nucleosomal,,,,1635284.0,Methanobacteriaceae archaeon 41_258,Methanobacteriota,Methanobacteria,,,MELPIAPIGRIIKNAGAERVSDDAREALAKALEAKGEEIAAAAIKF...,,KUK01880.1,stevens_histone_2020
2000,KUK71850.1,Nucleosomal,,,,1641383.0,Methanobacterium sp. 42_16,Methanobacteriota,Methanobacteria,,,MLSTMADYCMKTDIFKYYRVIAKLGGENMAELPIAPVGRIIKNAGA...,,KUK71850.1,stevens_histone_2020


# Update variant for capstones

In [27]:
# Accessions whose variant is set to 'Capstones_(Methanobrevibacter)' in a later cell.
capstones_methanobrevibacter = [
    "WP_042694673.1",
    "ALT67815.1",
    "ABQ87465.1",
    "CDF29678.1",
    "AMK16160.1",
    "ADC45863.1",
    "ADC47581.1",
    "AMK15744.1",
    "ADC47610.1",
    "WP_042707783.1",
    "WP_040681874.1",
    "AGN16739.1",
]

# Accessions whose variant is set to 'Capstone_(Methanosphaera)' in a later cell.
capstones_methanosphaera = [
    "AWX32523.1",
    "RAP54124.1",
    "RAP51258.1",
    "PAV07855.1",
    "OED30073.1",
    "RAP44639.1",
    "RAP47010.1",
    "OEC87236.1",
    "ABC56784.1",
]

# Total number of capstone accessions across both lists.
len(capstones_methanobrevibacter+capstones_methanosphaera)

21

In [28]:
# Count how many distinct capstone accessions are present in the database.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(fetched_rows, columns=column_names)

capstone_accessions = capstones_methanobrevibacter + capstones_methanosphaera
df.loc[df["accession"].isin(capstone_accessions), "accession"].unique().size

21

In [29]:
# Re-label the Methanobrevibacter capstone sequences; collect accessions whose update fails.
# The statement is parameterized (same %s style as the INSERT templates) and built once,
# instead of formatting each accession into the SQL text.
query = "UPDATE sequence SET variant = %s WHERE accession = %s"
failed_toupd = []
for acc in capstones_methanobrevibacter:
    try:
        cursor.execute(query, ("Capstones_(Methanobrevibacter)", acc))
    except Exception as e:
        print(acc)
        print(e)
        failed_toupd.append(acc)

In [30]:
# Verify the updated variant of the Methanobrevibacter capstone sequences.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(fetched_rows, columns=column_names)
df.loc[df["accession"].isin(capstones_methanobrevibacter)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
95,ABQ87465.1,Capstones_(Methanobrevibacter),,,,420247.0,Methanobrevibacter smithii ATCC 35061,Methanobacteriota,Methanobacteria,,,MSEIPKAPIARIIKDTGAERVSEDAKAELAEYLEEVARDVAIEANN...,,ABQ87465.1,schwab_histones_2024
96,ABQ87465.1,Capstones_(Methanobrevibacter),,,,420247.0,Methanobrevibacter smithii ATCC 35061,Methanobacteriota,Methanobacteria,,,MSEIPKAPIARIIKDTGAERVSEDAKAELAEYLEEVARDVAIEANN...,,ABQ87465.1,stevens_histone_2020
157,ADC45863.1,Capstones_(Methanobrevibacter),,,,634498.0,Methanobrevibacter ruminantium M1,Methanobacteriota,Methanobacteria,,,MSIPVAPIGRIIKDAGAERVSEDAKKELNAYVTAQAEAVAKKAIEF...,,ADC45863.1,schwab_histones_2024
158,ADC45863.1,Capstones_(Methanobrevibacter),,,,634498.0,Methanobrevibacter ruminantium M1,Methanobacteriota,Methanobacteria,,,MSIPVAPIGRIIKDAGAERVSEDAKKELNAYVTAQAEAVAKKAIEF...,,ADC45863.1,stevens_histone_2020
167,ADC47581.1,Capstones_(Methanobrevibacter),,,,634498.0,Methanobrevibacter ruminantium M1,Methanobacteriota,Methanobacteria,,,MAIPKAPVNRIIKDAGAERVSAEAVDALVAYLEEDAAAISKKAIEY...,,ADC47581.1,schwab_histones_2024
168,ADC47581.1,Capstones_(Methanobrevibacter),,,,634498.0,Methanobrevibacter ruminantium M1,Methanobacteriota,Methanobacteria,,,MAIPKAPVNRIIKDAGAERVSAEAVDALVAYLEEDAAAISKKAIEY...,,ADC47581.1,stevens_histone_2020
169,ADC47610.1,Capstones_(Methanobrevibacter),,,,634498.0,Methanobrevibacter ruminantium M1,Methanobacteriota,Methanobacteria,,,MAIPKAPVKRIMKEEGAERVSAEAVDALVDYLETDADAIARKAIDY...,,ADC47610.1,mattiroli_structure_2017
170,ADC47610.1,Capstones_(Methanobrevibacter),,,,634498.0,Methanobrevibacter ruminantium M1,Methanobacteriota,Methanobacteria,,,MAIPKAPVKRIMKEEGAERVSAEAVDALVDYLETDADAIARKAIDY...,,ADC47610.1,stevens_histone_2020
303,AGN16739.1,Capstones_(Methanobrevibacter),,,,224719.0,Methanobrevibacter sp. AbM4,Methanobacteriota,Methanobacteria,,,MGIAKAPINRIIKEAGAERVSGDATDALVKYLEEEAEAIATKAIEY...,,AGN16739.1,schwab_histones_2024
304,AGN16739.1,Capstones_(Methanobrevibacter),,,,224719.0,Methanobrevibacter sp. AbM4,Methanobacteriota,Methanobacteria,,,MGIAKAPINRIIKEAGAERVSGDATDALVKYLEEEAEAIATKAIEY...,,AGN16739.1,stevens_histone_2020


In [31]:
# Re-label the Methanosphaera capstone sequences; collect accessions whose update fails.
# The statement is parameterized (same %s style as the INSERT templates) and built once,
# instead of formatting each accession into the SQL text.
# NOTE(review): this variant is spelled 'Capstone_' (singular) while the Methanobrevibacter
# cell uses 'Capstones_' (plural) — confirm both match the intended variant ids.
# NOTE: failed_toupd is reset here, so failures recorded by the previous update cell are dropped.
query = "UPDATE sequence SET variant = %s WHERE accession = %s"
failed_toupd = []
for acc in capstones_methanosphaera:
    try:
        cursor.execute(query, ("Capstone_(Methanosphaera)", acc))
    except Exception as e:
        print(acc)
        print(e)
        failed_toupd.append(acc)

In [32]:
# Verify the updated variant of the Methanosphaera capstone sequences.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(fetched_rows, columns=column_names)
df.loc[df["accession"].isin(capstones_methanosphaera)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
66,ABC56784.1,Capstone_(Methanosphaera),,,,339860.0,Methanosphaera stadtmanae DSM 3091,Methanobacteriota,Methanobacteria,,,MSELPLTPLGRIIKNGGAERVSEDAKVELSAFLEDTAEELAKLALN...,,ABC56784.1,stevens_histone_2020
585,AWX32523.1,Capstone_(Methanosphaera),,,,1789762.0,Methanosphaera sp. BMS,Methanobacteriota,Methanobacteria,,,MGELPLTPLGRIIKKGGAERVSEDAKEELSDFLEEEAETIAKLALD...,,AWX32523.1,schwab_histones_2024
586,AWX32523.1,Capstone_(Methanosphaera),,,,1789762.0,Methanosphaera sp. BMS,Methanobacteriota,Methanobacteria,,,MGELPLTPLGRIIKKGGAERVSEDAKEELSDFLEEEAETIAKLALD...,,AWX32523.1,stevens_histone_2020
3358,OEC87236.1,Capstone_(Methanosphaera),,,,1860157.0,Methanosphaera sp. A6,Methanobacteriota,Methanobacteria,,,MSELPLTPLGRIIKNGGAERVSEDAKVELSAFLEDTAEELAKLALN...,,OEC87236.1,stevens_histone_2020
3388,OED30073.1,Capstone_(Methanosphaera),,,,1561964.0,Methanosphaera sp. WGK6,Methanobacteriota,Methanobacteria,,,MTKLPLTPLGRIMKNGGAERVSEDAKEELSSFLEDQASELAKIALN...,,OED30073.1,schwab_histones_2024
3389,OED30073.1,Capstone_(Methanosphaera),,,,1561964.0,Methanosphaera sp. WGK6,Methanobacteriota,Methanobacteria,,,MTKLPLTPLGRIMKNGGAERVSEDAKEELSSFLEDQASELAKIALN...,,OED30073.1,stevens_histone_2020
3763,PAV07855.1,Capstone_(Methanosphaera),,,,1077256.0,Methanosphaera cuniculi,Methanobacteriota,Methanobacteria,,,MTRLPLTPLGRILKHGGAERVSESAKEELAKYLEEQAAAITEIALE...,,PAV07855.1,stevens_histone_2020
4320,RAP44639.1,Capstone_(Methanosphaera),,,,1945632.0,Methanosphaera sp. SHI1033,Methanobacteriota,Methanobacteria,,,GRIIKNGGAERVSEDAKVELSEFLEDQAAELAKLALDNAKENSRKT...,,RAP44639.1,stevens_histone_2020
4339,RAP47010.1,Capstone_(Methanosphaera),,,,1945580.0,Methanosphaera sp. rholeuAM6,Methanobacteriota,Methanobacteria,,,MAKLPLTPLGRIIKNGGAERVSENAKVALSEYLEDVSVDITKYALK...,,RAP47010.1,stevens_histone_2020
4364,RAP51258.1,Capstone_(Methanosphaera),,,,1945631.0,Methanosphaera sp. SHI613,Methanobacteriota,Methanobacteria,,,MGELPLTPLGRIIKKGGAERVSEDAKQELSVFLEQEAGVIAKLALD...,,RAP51258.1,schwab_histones_2024


In [33]:
# Final overview of all listed accessions after the inserts and updates.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(fetched_rows, columns=column_names)
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
12,AAB84760.1,Nucleosomal,,,,187420.0,Methanothermobacter thermautotrophicus str. De...,Methanobacteriota,Methanobacteria,,,MRWLIMELPIAPIGRIIKNAGAEIVSDDAREALAKVLEAKGEEIAE...,,AAB84760.1,schwab_histones_2024
13,AAB84760.1,Nucleosomal,,,,187420.0,Methanothermobacter thermautotrophicus str. De...,Methanobacteriota,Methanobacteria,,,MRWLIMELPIAPIGRIIKNAGAEIVSDDAREALAKVLEAKGEEIAE...,,AAB84760.1,stevens_histone_2020
14,AAB85321.1,Nucleosomal,,,,187420.0,Methanothermobacter thermautotrophicus str. De...,Methanobacteriota,Methanobacteria,,,MAELPIAPVGRIIKNAGAQRISDDAKEALAKALEEMGEEISRKAVE...,,AAB85321.1,schwab_histones_2024
15,AAB85321.1,Nucleosomal,,,,187420.0,Methanothermobacter thermautotrophicus str. De...,Methanobacteriota,Methanobacteria,,,MAELPIAPVGRIIKNAGAQRISDDAKEALAKALEEMGEEISRKAVE...,,AAB85321.1,stevens_histone_2020
16,AAB86168.1,Nucleosomal,,,,187420.0,Methanothermobacter thermautotrophicus str. De...,Methanobacteriota,Methanobacteria,,,MAELPIAPVGRIIKNAGAQRISDDAREALAKILEEKGEEIAKEAVK...,,AAB86168.1,schwab_histones_2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5366,WP_048190788.1,Nucleosomal,,,,1495336.0,Methanobacterium sp. SMA-27,Methanobacteriota,Methanobacteria,,,MGELPIAPIGRIIKNAGGLRVSEGAEITLDKYLEEYGENISRQAVK...,,WP_048190788.1,stevens_histone_2020
5367,WP_048191214.1,Nucleosomal,,,,2160.0,Methanobacterium,Methanobacteriota,Methanobacteria,,,MELPIAPIGRIIKNAGAERVSDDAREALAKALEEKGEMIASEAVKL...,,WP_048191214.1,stevens_histone_2020
5368,WP_048191238.1,Nucleosomal,,,,2160.0,Methanobacterium,Methanobacteriota,Methanobacteria,,,MTELPVAPVGRIIKNAGAQRISDDARDELAKVLEEAGEKIAVEAVK...,,WP_048191238.1,stevens_histone_2020
5369,WP_048191910.1,Nucleosomal,,,,1495336.0,Methanobacterium sp. SMA-27,Methanobacteriota,Methanobacteria,,,MNELPIAPIGRIIKHADANIRISEDAKEALGKVLEECGEDISKQAL...,,WP_048191910.1,stevens_histone_2020


In [34]:
# Make sure data is committed to the database.
# This is the only commit in the visible cells, so — unless autocommit is enabled on the
# connection — the INSERT/UPDATE statements above are not persisted before this point.
conn.commit()

# Close connections

In [35]:
# Release the database resources, then shut down the SSH tunnel.
try:
    cursor.close()
    conn.close()
finally:
    # Stop the tunnel even if closing the cursor/connection raised, so it is not leaked.
    tunnel.stop()