In [40]:
from urllib.error import HTTPError

import pandas as pd
from Bio import Entrez, SeqIO
from mysql.connector import connection
from sshtunnel import SSHTunnelForwarder

# Contact e-mail handed to Biopython's Entrez module for the NCBI requests made below.
Entrez.email = "l.singh@intbio.org"

In [2]:
# Load the SSH / database connection settings from a local KEY=VALUE file.
# Blank lines and lines starting with "#" are ignored.
config = {}
with open("db_curated_server_info.txt", "r") as file:
    for raw_line in file:
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue
        key, separator, value = line.partition("=")
        if not separator:
            raise ValueError(f"Malformed config line (expected KEY=VALUE): {line!r}")
        # Strip both sides so that "key = value" is read the same as "key=value".
        config[key.strip()] = value.strip()

# NOTE: the misspelled names ("srever_port", "db_adress") are kept on purpose:
# they are the keys this notebook reads from the config file and the variable
# names that later cells refer to.
server_name = config.get("server_name")
# Ports are mandatory: index directly so a missing key raises a KeyError that
# names it, instead of the opaque TypeError produced by int(None).
srever_port = int(config["srever_port"])
ssh_password = config.get("ssh_password")
ssh_username = config.get("ssh_username")
db_adress = config.get("db_adress")
db_port = int(config["db_port"])

In [3]:
# Open an SSH tunnel: a local port is forwarded through the SSH gateway
# to the database endpoint, and the chosen local port is printed.
ssh_gateway = (server_name, srever_port)
db_endpoint = (db_adress, db_port)
tunnel = SSHTunnelForwarder(
    ssh_gateway,
    ssh_username=ssh_username,
    ssh_password=ssh_password,
    remote_bind_address=db_endpoint,
)
tunnel.start()
print(tunnel.local_bind_port)

40783


In [4]:
# Connect to MySQL through the local end of the SSH tunnel and open a cursor.
db_settings = {
    "user": "db_user",
    "password": "db_password",
    "host": "localhost",
    "port": tunnel.local_bind_port,
    "database": "db_name",
}
conn = connection.MySQLConnection(**db_settings)
cursor = conn.cursor()

In [5]:
# Sanity check of the connection: list the tables of the selected database.
query = "SHOW TABLES;"
cursor.execute(query)
table_names = cursor.fetchall()
table_names

[('alternative_name',),
 ('histone',),
 ('histone_description',),
 ('histone_has_publication',),
 ('publication',),
 ('sequence',),
 ('sequence_has_publication',)]

In [7]:
def _named_insert(table, columns):
    """Build an INSERT statement for `table` with one %(name)s placeholder per column."""
    column_list = ", ".join(columns)
    placeholders = ", ".join(f"%({column})s" for column in columns)
    return f"INSERT INTO {table} ({column_list}) VALUES ({placeholders})"


# The commented-out templates below are not used by this notebook.
# add_histone = (
#     "INSERT INTO histone "
#     "(id, level, taxonomic_span, taxonomic_span_id, description, parent) "
#     "VALUES (%(id)s, %(level)s, %(taxonomic_span)s, %(taxonomic_span_id)s, %(description)s, %(parent)s)"
# )
# add_histone_description = (
#     "INSERT INTO histone_description "
#     "(summary, taxonomy, genes, evolution, expression, knock_out, function, sequence, localization, deposition, structure, interactions, disease, caveats) "
#     "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
# )

# Named-parameter INSERT for one row of `publication` (executed with a dict).
add_publication = _named_insert(
    "publication",
    ("id", "title", "doi", "author", "year"),
)

# Named-parameter INSERT for one row of `sequence` (executed with a dict).
add_sequence = _named_insert(
    "sequence",
    (
        "accession",
        "variant",
        "gi",
        "ncbi_gene_id",
        "hgnc_gene_name",
        "taxonomy_id",
        "organism",
        "phylum",
        "class",
        "taxonomy_group",
        "info",
        "sequence",
        "variant_under_consideration",
    ),
)

# Positional INSERT linking a sequence accession to a publication id
# (executed with a 2-tuple).
add_sequence_has_publication = (
    "INSERT INTO sequence_has_publication (sequence_accession, publication_id) "
    "VALUES (%s, %s)"
)
# add_alternate_names = (
#     "INSERT INTO alternative_name "
#     "(name, taxonomy, gene, splice, histone) "
#     "VALUES (%(name)s, %(taxonomy)s, %(gene)s, %(splice)s, %(histone)s)"
# )
# add_histone_has_publication = (
#     "INSERT INTO histone_has_publication "
#     "(histone_id, publication_id) "
#     "VALUES (%s, %s)"
# )

In [6]:
def get_taxonomy_data(record, max_attempts=10, retry_delay=1.0):
    """Collect organism, taxonomy id, phylum and class for a GenBank record.

    The taxonomy id is read from the ``taxon:<id>`` entry among the
    ``db_xref`` qualifiers of the record's first feature. Phylum and class
    are then looked up in the NCBI taxonomy database through ``Entrez``.

    Parameters
    ----------
    record
        Object exposing ``annotations["organism"]`` and
        ``features[0].qualifiers["db_xref"]`` (e.g. the record returned by
        ``SeqIO.read(handle, "genbank")`` elsewhere in this notebook).
    max_attempts : int, default 10
        How many times the taxonomy lookup is tried before giving up.
    retry_delay : float, default 1.0
        Seconds to wait between two failed lookup attempts.

    Returns
    -------
    dict
        Keys ``organism``, ``taxonomy_id``, ``phylum`` and ``class``.
        ``taxonomy_id`` is always present: it falls back to ``1`` when no
        taxon cross-reference can be read. ``phylum`` / ``class`` are ``None``
        when they are absent from the lineage or the lookup failed every time.

    Raises
    ------
    KeyError
        If the record has no ``"organism"`` annotation.
    """
    import re
    import time

    xref_pattern = re.compile(r"(\S+):(\S+)")

    taxonomy_data = {"organism": record.annotations["organism"]}

    taxid = None
    try:
        for xref in record.features[0].qualifiers["db_xref"]:
            match = xref_pattern.search(xref)
            if match is None or match.group(1) != "taxon":
                continue
            taxid = int(match.group(2))
            print(f"Fetched taxid from NCBI {taxid}")
    except Exception as err:
        # Missing features / qualifiers or a non-numeric id: handled by the
        # fallback below. `Exception` (not a bare except) keeps Ctrl-C working.
        print(f"Could not read db_xref qualifiers: {err!r}")
    if taxid is None:
        # Also covers records that have xrefs but none of them is a taxon;
        # without this the key would be missing from the returned dict.
        print("!!!!!!Unable to get TAXID for this record setting it to 1")
        taxid = 1  # unable to identify
    taxonomy_data["taxonomy_id"] = taxid

    lineage = {}
    for attempt in range(max_attempts):
        try:
            # The context manager closes the HTTP handle even if parsing fails.
            with Entrez.efetch(id=taxid, db="taxonomy", retmode="xml") as handle:
                tax_data = Entrez.read(handle)
            lineage = {
                node["Rank"]: node["ScientificName"]
                for node in tax_data[0]["LineageEx"]
                if node["Rank"] in ["class", "phylum"]
            }
            break
        except Exception as err:
            print(
                "Unexpected error: {}, Retrying, attempt {}".format(
                    type(err), attempt
                )
            )
            if attempt == max_attempts - 1:
                print(
                    f"FATAL ERROR could not get class and phylum from NCBI after {max_attempts} attempts for taxid:{taxid}. Will add None for class and phylum!"
                )
            else:
                # Brief pause so transient failures are not hammered back-to-back.
                time.sleep(retry_delay)

    # Convert the parsed values to plain `str`; leave missing ranks as None.
    for rank in ("phylum", "class"):
        value = lineage.get(rank)
        taxonomy_data[rank] = None if value is None else str(value)
    return taxonomy_data

# Add other archaeal sequences as Nucleosomal

These sequences are taken from this [article](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5747315/).

In [12]:
# Protein accessions to look up; entries are listed without a version suffix.
# NOTE: "OLS19133" appears twice in this list.
accessions = [
    "BAI60563",
    "ADC47610",
    "WP_02966277",
    "KYK38613",
    "WP_048125684",
    "WP_011973395",
    "WP_010871171",
    "KYH40538",
    "KPV63666",
    "WP_04104690",
    "AIE90726",
    "BAJ48508",
    "KON33214",
    "WP_052884954",
    "WP_012186746",
    "OLS16336",
    "OLS15619",
    "OLS12771",
    "KKK44894",
    "KKK41688",
    "KKK45508",
    "EGQ43804",
    "AOV94489",
    "EHK02195",
    "OLS26110",
    "OLS24625",
    "OLS19133",
    "OLS18443",
    "OLS19133",
    "OLS22331",
    "AAC72546",
    "WP_008091782",
    "WP_015792102",
    "AAR39136",
]

In [13]:
# Load every sequence with its publication links (LEFT JOIN keeps sequences
# that have no publication) and show the rows matching the raw accessions.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(cursor.fetchall(), columns=column_names)
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id


In [42]:
# Download each accession from the NCBI protein database and prepare one
# row dict per record for the `sequence` table.
data_sequence = []
accession_versions = []  # versioned ids (record.id) of the records that were fetched
separator = "------------------------------------------------"
for acc in accessions:
    print(separator)
    print(acc)
    try:
        with Entrez.efetch(
            db="protein", id=acc, rettype="gb", retmode="text"
        ) as handle:
            record = SeqIO.read(handle, "genbank")
        accession_versions.append(record.id)
        taxonomy_data = get_taxonomy_data(record)
        entry = {
            "accession": record.id,
            # ERROR !!! wrong variant label; a later cell overwrites it with "Nucleosomal"
            "variant": "Bacterial dimers",
            "gi": None,
            "ncbi_gene_id": None,
            "hgnc_gene_name": None,
            "taxonomy_id": 9606,
            "organism": None,
            "phylum": None,
            "class": None,
            "taxonomy_group": None,
            "info": None,
            "sequence": str(record.seq),
            "variant_under_consideration": None,
        }
        # Values returned by get_taxonomy_data replace the placeholders above.
        entry.update(taxonomy_data)
        data_sequence.append(entry)
    except HTTPError as err:
        # Only HTTP 400 (accession not accepted by NCBI) is tolerated.
        if err.code != 400:
            raise
        print(err)

------------------------------------------------
BAI60563
Fetched taxid from NCBI 304371
------------------------------------------------
ADC47610
Fetched taxid from NCBI 634498
------------------------------------------------
WP_02966277
HTTP Error 400: Bad Request
------------------------------------------------
KYK38613
Fetched taxid from NCBI 1803813
------------------------------------------------
WP_048125684
Fetched taxid from NCBI 2644672
------------------------------------------------
WP_011973395
Fetched taxid from NCBI 42879
------------------------------------------------
WP_010871171
Fetched taxid from NCBI 2190
------------------------------------------------
KYH40538
Fetched taxid from NCBI 1779371
------------------------------------------------
KPV63666
Fetched taxid from NCBI 1700836
Unexpected error: <class 'urllib.error.HTTPError'>, Retrying, attempt 0
------------------------------------------------
WP_04104690
HTTP Error 400: Bad Request
-------------------------

**Не найдено записей для WP_02966277 и WP_04104690**

In [43]:
# Reload sequences with their publication links and show the rows whose
# accession matches one of the fetched, versioned ids.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(cursor.fetchall(), columns=column_names)
df.loc[df["accession"].isin(accession_versions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
44,AAR39136.1,Nucleosomal,,,,228908.0,Nanoarchaeum equitans Kin4-M,Nanobdellota,Candidatus Nanoarchaeia,,,MPAKRDRGIPLAAVERILKEEAKKVGVTRVSDKAVRLLKEKLEQIY...,,AAR39136.1,schwab_histones_2024
250,AIE90726.1,Nucleosomal,,,,1455884.0,uncultured marine thaumarchaeote AD1000_06_A03,Nitrososphaerota,,,,MSDLEFGLAAVYRIIKKTGAERVGDDAAEELRTVLEEFGIKIAEQA...,,AIE90726.1,schwab_histones_2024
352,AOV94489.1,Nucleosomal,,,,1737403.0,Nanohaloarchaea archaeon SG9,Methanobacteriota,Candidatus Nanohaloarchaea,,,MEFSVAKMKDMIKTQGDKRVSEDGAEELGQVLEMFAGDVAEETIAI...,,AOV94489.1,schwab_histones_2024
494,BAI60563.1,Nucleosomal,,,,304371.0,Methanocella paludicola SANAE,Methanobacteriota,Methanomicrobia,,,MTEISKAPIARLLSQAGGDRISAEAVDEMVKYTEDYVLKVGTEASK...,,BAI60563.1,schwab_histones_2024
497,BAJ48508.1,Nucleosomal,,,,311458.0,Candidatus Caldarchaeum subterraneum,Nitrososphaerota,,,,MSEKEADIPSAPIHRIMKKAGAARVSEDAADELRKILENVGAMIAK...,,BAJ48508.1,schwab_histones_2024
1756,KKK44894.1,Nucleosomal,,,,1538547.0,Candidatus Lokiarchaeum sp. GC14_75,Promethearchaeota,Promethearchaeia,,,MWNFAWSPIRRLMKQQGASIVARNAVDLLIDHLEKTATGLTEQART...,,KKK44894.1,schwab_histones_2024
1757,KKK45508.1,Nucleosomal,,,,1538547.0,Candidatus Lokiarchaeum sp. GC14_75,Promethearchaeota,Promethearchaeia,,,MAAFAWSPLRALMKKAGAEIVSRAAVDKLMDYLEEYAKSLTGCALD...,,KKK45508.1,schwab_histones_2024
1777,KON33214.1,Nucleosomal,,,,1685126.0,miscellaneous Crenarchaeota group-6 archaeon A...,Candidatus Bathyarchaeota,Candidatus Bathyarchaeia,,,MTNSELAVAPMHRLCKKAGADRVSEAAAKELAKALEGIGIKIAKEA...,,KON33214.1,schwab_histones_2024
1790,KPV63666.1,Nucleosomal,,,,1700836.0,Candidatus Bathyarchaeota archaeon BA2,Candidatus Bathyarchaeota,,,,MVDSELAVAPMHRICKKAGANRVSEAATKALAKELEDVGIKIAKEA...,,KPV63666.1,schwab_histones_2024
1902,KYH40538.1,Nucleosomal,,,,1779371.0,Candidatus Bathyarchaeota archaeon B26-2,Candidatus Bathyarchaeota,,,,MVRKGSISVAAMHKICKKAGAERVSKSAAAELAEIIEEVGIKIAKE...,,KYH40538.1,schwab_histones_2024


In [44]:
# (number of fetched accessions, number of matching rows in df)
in_database = df["accession"].isin(accession_versions)
len(accession_versions), len(df.loc[in_database])

(32, 17)

## Add reference for existing sequences

In [45]:
# Accessions from the fetched set that are already present in the database.
already_stored = df["accession"].isin(accession_versions)
existing = df.loc[already_stored, "accession"].values
existing

array(['AAR39136.1', 'AIE90726.1', 'AOV94489.1', 'BAI60563.1',
       'BAJ48508.1', 'KKK44894.1', 'KKK45508.1', 'KON33214.1',
       'KPV63666.1', 'KYH40538.1', 'KYK38613.1', 'OLS12771.1',
       'OLS15619.1', 'OLS18443.1', 'OLS19133.1', 'OLS24625.1',
       'OLS26110.1'], dtype=object)

In [46]:
# Check that the publication to be linked exists in the `publication` table.
pid = "mattiroli_structure_2017"
# Bind the id as a parameter (like the INSERT templates in this notebook)
# instead of formatting it into the SQL text.
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,mattiroli_structure_2017,Structure of Histone-based Chromatin in Archaea,10.1126/science.aaj1849,,2017


In [47]:
# Link every already-stored sequence to the publication `pid`.
# Accessions whose INSERT fails are reported and collected, not fatal.
failed_toadd_publication = []
for ex_acc in existing:
    try:
        cursor.execute(add_sequence_has_publication, (ex_acc, pid))
    except Exception as err:
        print(ex_acc)
        print(err)
        # The list must be appended to; calling it raised TypeError in the handler.
        failed_toadd_publication.append(ex_acc)

In [48]:
# Reload after linking: rows for the fetched accessions should now also
# carry the newly added publication id.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(cursor.fetchall(), columns=column_names)
df.loc[df["accession"].isin(accession_versions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
44,AAR39136.1,Nucleosomal,,,,228908.0,Nanoarchaeum equitans Kin4-M,Nanobdellota,Candidatus Nanoarchaeia,,,MPAKRDRGIPLAAVERILKEEAKKVGVTRVSDKAVRLLKEKLEQIY...,,AAR39136.1,mattiroli_structure_2017
45,AAR39136.1,Nucleosomal,,,,228908.0,Nanoarchaeum equitans Kin4-M,Nanobdellota,Candidatus Nanoarchaeia,,,MPAKRDRGIPLAAVERILKEEAKKVGVTRVSDKAVRLLKEKLEQIY...,,AAR39136.1,schwab_histones_2024
251,AIE90726.1,Nucleosomal,,,,1455884.0,uncultured marine thaumarchaeote AD1000_06_A03,Nitrososphaerota,,,,MSDLEFGLAAVYRIIKKTGAERVGDDAAEELRTVLEEFGIKIAEQA...,,AIE90726.1,mattiroli_structure_2017
252,AIE90726.1,Nucleosomal,,,,1455884.0,uncultured marine thaumarchaeote AD1000_06_A03,Nitrososphaerota,,,,MSDLEFGLAAVYRIIKKTGAERVGDDAAEELRTVLEEFGIKIAEQA...,,AIE90726.1,schwab_histones_2024
354,AOV94489.1,Nucleosomal,,,,1737403.0,Nanohaloarchaea archaeon SG9,Methanobacteriota,Candidatus Nanohaloarchaea,,,MEFSVAKMKDMIKTQGDKRVSEDGAEELGQVLEMFAGDVAEETIAI...,,AOV94489.1,mattiroli_structure_2017
355,AOV94489.1,Nucleosomal,,,,1737403.0,Nanohaloarchaea archaeon SG9,Methanobacteriota,Candidatus Nanohaloarchaea,,,MEFSVAKMKDMIKTQGDKRVSEDGAEELGQVLEMFAGDVAEETIAI...,,AOV94489.1,schwab_histones_2024
497,BAI60563.1,Nucleosomal,,,,304371.0,Methanocella paludicola SANAE,Methanobacteriota,Methanomicrobia,,,MTEISKAPIARLLSQAGGDRISAEAVDEMVKYTEDYVLKVGTEASK...,,BAI60563.1,mattiroli_structure_2017
498,BAI60563.1,Nucleosomal,,,,304371.0,Methanocella paludicola SANAE,Methanobacteriota,Methanomicrobia,,,MTEISKAPIARLLSQAGGDRISAEAVDEMVKYTEDYVLKVGTEASK...,,BAI60563.1,schwab_histones_2024
501,BAJ48508.1,Nucleosomal,,,,311458.0,Candidatus Caldarchaeum subterraneum,Nitrososphaerota,,,,MSEKEADIPSAPIHRIMKKAGAARVSEDAADELRKILENVGAMIAK...,,BAJ48508.1,mattiroli_structure_2017
502,BAJ48508.1,Nucleosomal,,,,311458.0,Candidatus Caldarchaeum subterraneum,Nitrososphaerota,,,,MSEKEADIPSAPIHRIMKKAGAARVSEDAADELRKILENVGAMIAK...,,BAJ48508.1,schwab_histones_2024


## Add other sequences

**Не найдено записей для WP_02966277 и WP_04104690**

In [55]:
# Fetched accessions that are not in the database yet.
# One accession in the source list (and in the article) is duplicated,
# so the set is smaller than the list.
not_exist = set(accession_versions).difference(df["accession"])
len(not_exist)

14

In [59]:
# Confirm that none of the accessions to be added is in the database yet
# (the displayed frame is expected to be empty).
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(cursor.fetchall(), columns=column_names)
df.loc[df["accession"].isin(not_exist)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id


Ошиблась, нужно указать корректное название варианта гистона

In [63]:
# Replace the variant label of every prepared row with the correct one.
for entry in data_sequence:
    entry.update({"variant": "Nucleosomal"})

In [65]:
# Spot-check one prepared row: print each field with its value and Python type.
sample_entry = data_sequence[1]
for field in sample_entry:
    value = sample_entry[field]
    print(field, value, type(value))

accession ADC47610.1 <class 'str'>
variant Nucleosomal <class 'str'>
gi None <class 'NoneType'>
ncbi_gene_id None <class 'NoneType'>
hgnc_gene_name None <class 'NoneType'>
taxonomy_id 634498 <class 'int'>
organism Methanobrevibacter ruminantium M1 <class 'str'>
phylum Methanobacteriota <class 'str'>
class Methanobacteria <class 'str'>
taxonomy_group None <class 'NoneType'>
info None <class 'NoneType'>
sequence MAIPKAPVKRIMKEEGAERVSAEAVDALVDYLETDADAIARKAIDYAKLAKRQTVKAEDIALAIGRPETSESTAENPHNLLEVVQKVLDAAADGKGFEEIIKSFMKLEKKE <class 'str'>
variant_under_consideration None <class 'NoneType'>


In [66]:
# Insert the prepared rows whose accession is not in the database yet.
# A failing row is printed in full and its accession is remembered.
failed_toadd = []
rows_to_insert = [row for row in data_sequence if row["accession"] in not_exist]
for row in rows_to_insert:
    try:
        cursor.execute(add_sequence, row)
    except Exception as e:
        print(row["accession"])
        print(e)
        for field, value in row.items():
            print(field, value, type(value))
        failed_toadd.append(row["accession"])

In [68]:
# Reload after the INSERTs: the new sequences should be listed, still
# without a publication link.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(cursor.fetchall(), columns=column_names)
df.loc[df["accession"].isin(not_exist)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
30,AAC72546.1,Nucleosomal,,,,2320.0,Methanopyrus kandleri,Methanobacteriota,Methanopyri,,,MAVELPKAAIERIFRQGIGERRLSQDAKDTIYDFVPTMAEYVANAA...,,,
137,ADC47610.1,Nucleosomal,,,,634498.0,Methanobrevibacter ruminantium M1,Methanobacteriota,Methanobacteria,,,MAIPKAPVKRIMKEEGAERVSAEAVDALVDYLETDADAIARKAIDY...,,,
657,EGQ43804.1,Nucleosomal,,,,889948.0,Candidatus Nanosalina sp. J07AB43,Methanobacteriota,Candidatus Nanohaloarchaea,,,MEFSTSKMKEMIKDEGDKRVSEESAEELGQVIEMFAGDVAEEATAI...,,,
675,EHK02195.1,Nucleosomal,,,,1072681.0,Candidatus Haloredivivus sp. G17,Methanobacteriota,Candidatus Nanohaloarchaea,,,MEFSISKMKEVMKARTGKRVSREAAEELSADLESKGQEITASAIEI...,,,
1763,KKK41688.1,Nucleosomal,,,,1538547.0,Candidatus Lokiarchaeum sp. GC14_75,Promethearchaeota,Promethearchaeia,,,MAGSEYISWSPIRRLMKHNGALIVARDAVNELVDWMGRSAEKLTKT...,,,
3335,OLS16336.1,Nucleosomal,,,,1849166.0,Promethearchaeota archaeon CR_4,Promethearchaeota,,,,MPEKTKNLYFSKTPLRRLMKQAGAGPVSEDAIQSLITQLEKRGREI...,,,
3345,OLS22331.1,Nucleosomal,,,,1841598.0,Candidatus Heimdallarchaeota archaeon LC_3,Candidatus Heimdallarchaeota,,,,MEVLAKEQIINQKQERESIFYNLTRSVCPTCKKSIDAQILIRNNKV...,,,
5032,WP_008091782.1,Nucleosomal,,,,2251.0,Haloferax,Methanobacteriota,Halobacteria,,,MSVELPFAPVDAIIRQNAGELRVSAGAAEALARRIQDHGAELAIDA...,,,
5033,WP_010871171.1,Nucleosomal,,,,2190.0,Methanocaldococcus jannaschii,Methanobacteriota,Methanococci,,,MLPKATVKRIMKQHTDFNISAEAVDELCNMLEEIIKITTEVAEQNA...,,,
5036,WP_011973395.1,Nucleosomal,,,,42879.0,Methanococcus aeolicus,Methanobacteriota,Methanococci,,,MIPKGTVKRIMKQNTDMNVSAESVVKIVEILQEYIVTTTRLAEENA...,,,


In [69]:
# Link every newly inserted sequence to the publication `pid`.
# Accessions whose INSERT fails are reported and collected, not fatal.
failed_toadd_publication = []
for nex_acc in not_exist:
    try:
        cursor.execute(add_sequence_has_publication, (nex_acc, pid))
    except Exception as err:
        print(nex_acc)
        print(err)
        # The list must be appended to; calling it raised TypeError in the handler.
        failed_toadd_publication.append(nex_acc)

In [70]:
# Reload after linking: the new sequences should now carry the publication id.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(cursor.fetchall(), columns=column_names)
df.loc[df["accession"].isin(not_exist)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
30,AAC72546.1,Nucleosomal,,,,2320.0,Methanopyrus kandleri,Methanobacteriota,Methanopyri,,,MAVELPKAAIERIFRQGIGERRLSQDAKDTIYDFVPTMAEYVANAA...,,AAC72546.1,mattiroli_structure_2017
137,ADC47610.1,Nucleosomal,,,,634498.0,Methanobrevibacter ruminantium M1,Methanobacteriota,Methanobacteria,,,MAIPKAPVKRIMKEEGAERVSAEAVDALVDYLETDADAIARKAIDY...,,ADC47610.1,mattiroli_structure_2017
657,EGQ43804.1,Nucleosomal,,,,889948.0,Candidatus Nanosalina sp. J07AB43,Methanobacteriota,Candidatus Nanohaloarchaea,,,MEFSTSKMKEMIKDEGDKRVSEESAEELGQVIEMFAGDVAEEATAI...,,EGQ43804.1,mattiroli_structure_2017
675,EHK02195.1,Nucleosomal,,,,1072681.0,Candidatus Haloredivivus sp. G17,Methanobacteriota,Candidatus Nanohaloarchaea,,,MEFSISKMKEVMKARTGKRVSREAAEELSADLESKGQEITASAIEI...,,EHK02195.1,mattiroli_structure_2017
1763,KKK41688.1,Nucleosomal,,,,1538547.0,Candidatus Lokiarchaeum sp. GC14_75,Promethearchaeota,Promethearchaeia,,,MAGSEYISWSPIRRLMKHNGALIVARDAVNELVDWMGRSAEKLTKT...,,KKK41688.1,mattiroli_structure_2017
3335,OLS16336.1,Nucleosomal,,,,1849166.0,Promethearchaeota archaeon CR_4,Promethearchaeota,,,,MPEKTKNLYFSKTPLRRLMKQAGAGPVSEDAIQSLITQLEKRGREI...,,OLS16336.1,mattiroli_structure_2017
3345,OLS22331.1,Nucleosomal,,,,1841598.0,Candidatus Heimdallarchaeota archaeon LC_3,Candidatus Heimdallarchaeota,,,,MEVLAKEQIINQKQERESIFYNLTRSVCPTCKKSIDAQILIRNNKV...,,OLS22331.1,mattiroli_structure_2017
5032,WP_008091782.1,Nucleosomal,,,,2251.0,Haloferax,Methanobacteriota,Halobacteria,,,MSVELPFAPVDAIIRQNAGELRVSAGAAEALARRIQDHGAELAIDA...,,WP_008091782.1,mattiroli_structure_2017
5033,WP_010871171.1,Nucleosomal,,,,2190.0,Methanocaldococcus jannaschii,Methanobacteriota,Methanococci,,,MLPKATVKRIMKQHTDFNISAEAVDELCNMLEEIIKITTEVAEQNA...,,WP_010871171.1,mattiroli_structure_2017
5036,WP_011973395.1,Nucleosomal,,,,42879.0,Methanococcus aeolicus,Methanobacteriota,Methanococci,,,MIPKGTVKRIMKQNTDMNVSAESVVKIVEILQEYIVTTTRLAEENA...,,WP_011973395.1,mattiroli_structure_2017


# Add other archaeal sequences as Nucleosomal 2

These sequences are taken from this [article](https://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.1007582).

pid='henneman_structure_2018'

In [85]:
# Second batch of protein accessions; unlike the first batch these already
# include the version suffix (".1"). Rebinds the `accessions` name.
accessions = [
    "OLS22332.1",
    "OLS24873.1",
    "OLS21974.1",
    "KKK41979.1",
    "OLS16336.1",
    "OLS18261.1",
    "KXH71038.1",
    "OIO61677.1",
    "OIO41945.1",
    "PJB03565.1",
    "PJB04497.1",
    "PJA17623.1",
    "OIN88081.1",
    "EET90461.1",
    "EHK01841.1",
    "EGQ42849.1",
    "EGQ43804.1",
    "AAR39197.1",
    "AFU59009.1",
    "KYH36356.1",
    "KYH37304.1",
    "KON27866.1",
    "ABW02527.1",
    "ABL77757.1",
    "ADN51226.1",
    "WP_42707783.1",
    "WP_42706862.1",
    "AAB99668.1",
    "KGK98166.1",
    "BAD86478.1",
    "ADP77985.1",
]

In [86]:
# Load the plain `sequence` table (no join, so one row per accession)
# and show the rows matching the second batch.
query = "SELECT * FROM sequence"
cursor.execute(query)
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(cursor.fetchall(), columns=column_names)
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
21,AAB99668.1,Bridge_(Methanococcales),,,,243232.0,Methanocaldococcus jannaschii DSM 2661,Methanobacteriota,Methanococci,,,MLPKATVKRIMKQHTDFNISAEAVDELCNMLEEIIKITTEVAEQNA...,
42,AAR39197.1,Nucleosomal,,,,228908.0,Nanoarchaeum equitans Kin4-M,Nanobdellota,Candidatus Nanoarchaeia,,,MAKRKYPFPVAPLYRIMRQAGAKRVTKDAKEAFVEVAVEIAKRVAR...,
63,ABL77757.1,Nucleosomal,,,,368408.0,Thermofilum pendens Hrk 5,Thermoproteota,Thermoprotei,,,MSEKSPRTRQHEIPLAPLRRIFRSQGAERISDDAVVFLREYLEKLA...,
84,ABW02527.1,Nucleosomal,,,,397948.0,Caldivirga maquilingensis IC-167,Thermoproteota,Thermoprotei,,,MPEIPLAPIERIFKKAGAERVGEDAVIALRDVLENVAYEVSVKSIE...,
160,ADN51226.1,Nucleosomal,,,,572478.0,Vulcanisaeta distributa DSM 14429,Thermoproteota,Thermoprotei,,,MPELPLAPIDRIFHKAGAERVGEDAIQALRDILEYIAFDIASKSIE...,
163,ADP77985.1,HMfB_(Methanothermus_fervidus),,,,523846.0,Methanothermus fervidus DSM 2088,Methanobacteriota,Methanobacteria,,,MELPIAPIGRIIKDAGAERVSDDARITLAKILEEMGRDIASEAIKL...,
216,AFU59009.1,Nucleosomal,,,,1237085.0,Candidatus Nitrososphaera gargensis Ga9.2,Nitrososphaerota,Nitrososphaeria,,,MSSSGPEFGLAAMYRVMKKSGAERVSDDAADELRKVLEEVAERIAK...,
484,BAD86478.1,HTkB_(Thermococcus_kodakarensis),,,,69014.0,Thermococcus kodakarensis KOD1,Methanobacteriota,Thermococci,,,MAELPIAPVDRLIRKAGAARVSEEAAKVLAEHLEEKALEIAKKAVA...,
614,EET90461.1,Nucleosomal,,,,425595.0,Candidatus Micrarchaeum acidiphilum ARMAN-2,Candidatus Micrarchaeota,Candidatus Micrarchaeia,,,MYITKSTVKKMLKGAGATRVSESALSYFQEQLEKIALKAASNSVKL...,
629,EGQ42849.1,Nucleosomal,,,,889948.0,Candidatus Nanosalina sp. J07AB43,Methanobacteriota,Candidatus Nanohaloarchaea,,,MDLPNAPVERIIRKAGAERVSQDAVEELRQALEDLGQEIAVDANQM...,


In [74]:
# Inspect a single sequence together with all of its publication links.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE accession='AAB99668.1'"
)
cursor.execute(query)
column_names = [column[0] for column in cursor.description]
pd.DataFrame(cursor.fetchall(), columns=column_names)

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,AAB99668.1,Bridge_(Methanococcales),,,,243232,Methanocaldococcus jannaschii DSM 2661,Methanobacteriota,Methanococci,,,MLPKATVKRIMKQHTDFNISAEAVDELCNMLEEIIKITTEVAEQNA...,,AAB99668.1,Ofer_dna-bridging_2023
1,AAB99668.1,Bridge_(Methanococcales),,,,243232,Methanocaldococcus jannaschii DSM 2661,Methanobacteriota,Methanococci,,,MLPKATVKRIMKQHTDFNISAEAVDELCNMLEEIIKITTEVAEQNA...,,AAB99668.1,schwab_histones_2024


In [87]:
# (number of requested accessions, number of matching rows in df)
in_database = df["accession"].isin(accessions)
len(accessions), len(df.loc[in_database])

(31, 24)

## Add reference for existing sequences

In [88]:
# Accessions of the second batch that are already present in the database.
already_stored = df["accession"].isin(accessions)
existing = df.loc[already_stored, "accession"].values
existing.size

24

In [77]:
# Check that the publication to be linked exists in the `publication` table.
pid = "henneman_structure_2018"
# Bind the id as a parameter (like the INSERT templates in this notebook)
# instead of formatting it into the SQL text.
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,henneman_structure_2018,Structure and function of archaeal histones,10.1371/journal.pgen.1007582,,2018


In [78]:
# Link every already-stored sequence to the publication `pid`.
# Accessions whose INSERT fails are reported and collected, not fatal.
failed_toadd_publication = []
for ex_acc in existing:
    try:
        cursor.execute(add_sequence_has_publication, (ex_acc, pid))
    except Exception as err:
        print(ex_acc)
        print(err)
        # The list must be appended to; calling it raised TypeError in the handler.
        failed_toadd_publication.append(ex_acc)

In [79]:
# Reload after linking: rows for the second batch should now also carry
# the newly added publication id.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(cursor.fetchall(), columns=column_names)
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
21,AAB99668.1,Bridge_(Methanococcales),,,,243232.0,Methanocaldococcus jannaschii DSM 2661,Methanobacteriota,Methanococci,,,MLPKATVKRIMKQHTDFNISAEAVDELCNMLEEIIKITTEVAEQNA...,,AAB99668.1,henneman_structure_2018
22,AAB99668.1,Bridge_(Methanococcales),,,,243232.0,Methanocaldococcus jannaschii DSM 2661,Methanobacteriota,Methanococci,,,MLPKATVKRIMKQHTDFNISAEAVDELCNMLEEIIKITTEVAEQNA...,,AAB99668.1,Ofer_dna-bridging_2023
23,AAB99668.1,Bridge_(Methanococcales),,,,243232.0,Methanocaldococcus jannaschii DSM 2661,Methanobacteriota,Methanococci,,,MLPKATVKRIMKQHTDFNISAEAVDELCNMLEEIIKITTEVAEQNA...,,AAB99668.1,schwab_histones_2024
48,AAR39197.1,Nucleosomal,,,,228908.0,Nanoarchaeum equitans Kin4-M,Nanobdellota,Candidatus Nanoarchaeia,,,MAKRKYPFPVAPLYRIMRQAGAKRVTKDAKEAFVEVAVEIAKRVAR...,,AAR39197.1,henneman_structure_2018
49,AAR39197.1,Nucleosomal,,,,228908.0,Nanoarchaeum equitans Kin4-M,Nanobdellota,Candidatus Nanoarchaeia,,,MAKRKYPFPVAPLYRIMRQAGAKRVTKDAKEAFVEVAVEIAKRVAR...,,AAR39197.1,schwab_histones_2024
70,ABL77757.1,Nucleosomal,,,,368408.0,Thermofilum pendens Hrk 5,Thermoproteota,Thermoprotei,,,MSEKSPRTRQHEIPLAPLRRIFRSQGAERISDDAVVFLREYLEKLA...,,ABL77757.1,henneman_structure_2018
71,ABL77757.1,Nucleosomal,,,,368408.0,Thermofilum pendens Hrk 5,Thermoproteota,Thermoprotei,,,MSEKSPRTRQHEIPLAPLRRIFRSQGAERISDDAVVFLREYLEKLA...,,ABL77757.1,schwab_histones_2024
92,ABW02527.1,Nucleosomal,,,,397948.0,Caldivirga maquilingensis IC-167,Thermoproteota,Thermoprotei,,,MPEIPLAPIERIFKKAGAERVGEDAVIALRDVLENVAYEVSVKSIE...,,ABW02527.1,henneman_structure_2018
93,ABW02527.1,Nucleosomal,,,,397948.0,Caldivirga maquilingensis IC-167,Thermoproteota,Thermoprotei,,,MPEIPLAPIERIFKKAGAERVGEDAVIALRDVLENVAYEVSVKSIE...,,ABW02527.1,schwab_histones_2024
169,ADN51226.1,Nucleosomal,,,,572478.0,Vulcanisaeta distributa DSM 14429,Thermoproteota,Thermoprotei,,,MPELPLAPIDRIFHKAGAERVGEDAIQALRDILEYIAFDIASKSIE...,,ADN51226.1,henneman_structure_2018


## Add other sequences

**Не найдено записей для WP_42706862.1 и WP_42707783.1**

In [89]:
# Accessions of the second batch that are not in the database yet.
not_exist = set(accessions).difference(df["accession"])
not_exist

{'EHK01841.1',
 'KON27866.1',
 'KYH36356.1',
 'OLS18261.1',
 'OLS21974.1',
 'WP_42706862.1',
 'WP_42707783.1'}

In [93]:
# Download each missing accession from the NCBI protein database and prepare
# one row dict per record for the `sequence` table.
data_sequence = []
separator = "------------------------------------------------"
for acc in not_exist:
    print(separator)
    print(acc)
    try:
        with Entrez.efetch(
            db="protein", id=acc, rettype="gb", retmode="text"
        ) as handle:
            record = SeqIO.read(handle, "genbank")
        taxonomy_data = get_taxonomy_data(record)
        entry = {
            "accession": record.id,
            "variant": "Nucleosomal",
            "gi": None,
            "ncbi_gene_id": None,
            "hgnc_gene_name": None,
            "taxonomy_id": 9606,
            "organism": None,
            "phylum": None,
            "class": None,
            "taxonomy_group": None,
            "info": None,
            "sequence": str(record.seq),
            "variant_under_consideration": None,
        }
        # Values returned by get_taxonomy_data replace the placeholders above.
        entry.update(taxonomy_data)
        data_sequence.append(entry)
    except HTTPError as err:
        # Only HTTP 400 (accession not accepted by NCBI) is tolerated.
        if err.code != 400:
            raise
        print(err)

------------------------------------------------
WP_42706862.1
HTTP Error 400: Bad Request
------------------------------------------------
EHK01841.1
Fetched taxid from NCBI 1072681
------------------------------------------------
KYH36356.1
Fetched taxid from NCBI 1779367
------------------------------------------------
OLS18261.1
Fetched taxid from NCBI 1841599
------------------------------------------------
WP_42707783.1
HTTP Error 400: Bad Request
------------------------------------------------
OLS21974.1
Fetched taxid from NCBI 1841598
------------------------------------------------
KON27866.1
Fetched taxid from NCBI 1685135


In [94]:
# Confirm that none of the accessions to be added is in the database yet
# (the displayed frame is expected to be empty).
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(cursor.fetchall(), columns=column_names)
df.loc[df["accession"].isin(not_exist)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id


In [95]:
# Insert the prepared rows whose accession is not in the database yet.
# A failing row is printed in full and its accession is remembered.
failed_toadd = []
rows_to_insert = [row for row in data_sequence if row["accession"] in not_exist]
for row in rows_to_insert:
    try:
        cursor.execute(add_sequence, row)
    except Exception as e:
        print(row["accession"])
        print(e)
        for field, value in row.items():
            print(field, value, type(value))
        failed_toadd.append(row["accession"])

In [96]:
# Reload after the INSERTs: the new sequences should be listed, still
# without a publication link.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(cursor.fetchall(), columns=column_names)
df.loc[df["accession"].isin(not_exist)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
685,EHK01841.1,Nucleosomal,,,,1072681.0,Candidatus Haloredivivus sp. G17,Methanobacteriota,Candidatus Nanohaloarchaea,,,MKARTGKRVSREAAEELSADLESKGQEITASAIEIAERKGRVTVRA...,,,
1802,KON27866.1,Nucleosomal,,,,1685135.0,miscellaneous Crenarchaeota group archaeon SMT...,Candidatus Bathyarchaeota,,,,MKDNCLSNPAVHRLIELAGAERVGDDAVEELKKVLEEVAFFISKDA...,,,
1930,KYH36356.1,Nucleosomal,,,,1779367.0,Candidatus Bathyarchaeota archaeon B23,Candidatus Bathyarchaeota,,,,MEPGTGSRAASTKIRLSIVQKINRSRSLKGSGLAEEFTLAPMRRLL...,,,
3359,OLS18261.1,Nucleosomal,,,,1841599.0,Candidatus Odinarchaeum yellowstonii,Candidatus Odinarchaeota,Candidatus Odinarchaeia,,,MPKEKAERIIPLAPLDRLIRKAKVERVSEKAASELGKILEEIGLEI...,,,
3365,OLS21974.1,Nucleosomal,,,,1841598.0,Candidatus Heimdallarchaeota archaeon LC_3,Candidatus Heimdallarchaeota,,,,MPDIPLSSIDRIIRYTGATRIKPGATESLRNITENIIIIIAKKALE...,,,


In [98]:
# Link every accession in `not_exist` to the publication `pid`.
# Accessions whose INSERT fails are reported and collected, not fatal.
failed_toadd_publication = []
for nex_acc in not_exist:
    try:
        cursor.execute(add_sequence_has_publication, (nex_acc, pid))
    except Exception as err:
        # `Exception` rather than a bare except, so Ctrl-C still interrupts;
        # the error is printed so the reason for the failure is visible.
        print(nex_acc)
        print(err)
        failed_toadd_publication.append(nex_acc)

WP_42706862.1
WP_42707783.1


In [99]:
# Reload after linking: the new sequences should now carry the publication id.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(cursor.fetchall(), columns=column_names)
df.loc[df["accession"].isin(not_exist)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
685,EHK01841.1,Nucleosomal,,,,1072681.0,Candidatus Haloredivivus sp. G17,Methanobacteriota,Candidatus Nanohaloarchaea,,,MKARTGKRVSREAAEELSADLESKGQEITASAIEIAERKGRVTVRA...,,EHK01841.1,henneman_structure_2018
1802,KON27866.1,Nucleosomal,,,,1685135.0,miscellaneous Crenarchaeota group archaeon SMT...,Candidatus Bathyarchaeota,,,,MKDNCLSNPAVHRLIELAGAERVGDDAVEELKKVLEEVAFFISKDA...,,KON27866.1,henneman_structure_2018
1930,KYH36356.1,Nucleosomal,,,,1779367.0,Candidatus Bathyarchaeota archaeon B23,Candidatus Bathyarchaeota,,,,MEPGTGSRAASTKIRLSIVQKINRSRSLKGSGLAEEFTLAPMRRLL...,,KYH36356.1,henneman_structure_2018
3359,OLS18261.1,Nucleosomal,,,,1841599.0,Candidatus Odinarchaeum yellowstonii,Candidatus Odinarchaeota,Candidatus Odinarchaeia,,,MPKEKAERIIPLAPLDRLIRKAKVERVSEKAASELGKILEEIGLEI...,,OLS18261.1,henneman_structure_2018
3365,OLS21974.1,Nucleosomal,,,,1841598.0,Candidatus Heimdallarchaeota archaeon LC_3,Candidatus Heimdallarchaeota,,,,MPDIPLSSIDRIIRYTGATRIKPGATESLRNITENIIIIIAKKALE...,,OLS21974.1,henneman_structure_2018


In [100]:
# Make sure data is committed to the database
# (the statements executed through `cursor` above were issued on this connection).
conn.commit()

# Close connections

In [110]:
# Release resources in reverse order of creation:
# the cursor, then the database connection, then the SSH tunnel.
cursor.close()
conn.close()
tunnel.stop()