In [15]:
!pip install uniprot-id-mapper

Collecting uniprot-id-mapper
  Downloading uniprot_id_mapper-1.1.4-py3-none-any.whl.metadata (12 kB)
Downloading uniprot_id_mapper-1.1.4-py3-none-any.whl (45 kB)
[0mInstalling collected packages: uniprot-id-mapper
Successfully installed uniprot-id-mapper-1.1.4


In [11]:
import pandas as pd
from mysql.connector import connection
from sshtunnel import SSHTunnelForwarder

from Bio import Entrez
from Bio import SeqIO
Entrez.email = "l.singh@intbio.org"

In [16]:
from UniProtMapper import ProtMapper

In [2]:
with open("db_curated_server_info.txt", "r") as file:
    lines = file.readlines()

config = {}

for line in lines:
    line = line.strip()
    if line and not line.startswith("#"):
        key, value = line.split("=", 1)
        config[key] = value.strip()

server_name = config.get("server_name")
srever_port = int(config.get("srever_port"))
ssh_password = config.get("ssh_password")
ssh_username = config.get("ssh_username")
db_adress = config.get("db_adress")
db_port = int(config.get("db_port"))

In [3]:
tunnel = SSHTunnelForwarder(
    (server_name, srever_port),
    ssh_password=ssh_password,
    ssh_username=ssh_username,
    remote_bind_address=(db_adress, db_port),
)
tunnel.start()
print(tunnel.local_bind_port)

41075


In [4]:
conn = connection.MySQLConnection(
    user="db_user",
    password="db_password",
    host="localhost",
    port=tunnel.local_bind_port,
    database="db_name",
)
cursor = conn.cursor()

In [5]:
query = "SHOW TABLES;"
cursor.execute(query)
cursor.fetchall()

[('alternative_name',),
 ('histone',),
 ('histone_description',),
 ('histone_has_publication',),
 ('publication',),
 ('sequence',),
 ('sequence_has_publication',)]

In [24]:
# add_histone = (
#     "INSERT INTO histone "
#     "(id, level, taxonomic_span, taxonomic_span_id, description, parent) "
#     "VALUES (%(id)s, %(level)s, %(taxonomic_span)s, %(taxonomic_span_id)s, %(description)s, %(parent)s)"
# )
# add_histone_description = (
#     "INSERT INTO histone_description "
#     "(summary, taxonomy, genes, evolution, expression, knock_out, function, sequence, localization, deposition, structure, interactions, disease, caveats) "
#     "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
# )
add_publication = (
    "INSERT INTO publication "
    "(id, title, doi, author, year) "
    "VALUES (%(id)s, %(title)s, %(doi)s, %(author)s, %(year)s)"
)
add_sequence = (
    "INSERT INTO sequence "
    "(accession, variant, gi, ncbi_gene_id, hgnc_gene_name, taxonomy_id, organism, phylum, class, taxonomy_group, info, sequence, variant_under_consideration) "
    "VALUES (%(accession)s, %(variant)s, %(gi)s, %(ncbi_gene_id)s, %(hgnc_gene_name)s, %(taxonomy_id)s, %(organism)s, %(phylum)s, %(class)s, %(taxonomy_group)s, %(info)s, %(sequence)s, %(variant_under_consideration)s)"
)
add_sequence_has_publication = (
    "INSERT INTO sequence_has_publication "
    "(sequence_accession, publication_id) "
    "VALUES (%s, %s)"
)
# add_alternate_names = (
#     "INSERT INTO alternative_name "
#     "(name, taxonomy, gene, splice, histone) "
#     "VALUES (%(name)s, %(taxonomy)s, %(gene)s, %(splice)s, %(histone)s)"
# )
# add_histone_has_publication = (
#     "INSERT INTO histone_has_publication "
#     "(histone_id, publication_id) "
#     "VALUES (%s, %s)"
# )

In [10]:
def get_taxonomy_data(record):
    import sys
    import re
    taxonomy_data = {}
    taxonomy_data["organism"] = record.annotations["organism"]
    try:
        for a in record.features[0].qualifiers["db_xref"]:
            text = re.search("(\S+):(\S+)", a).group(1)
            taxid = re.search("(\S+):(\S+)", a).group(2)
            if text == "taxon":
                print(f"Fetched taxid from NCBI {taxid}")
                taxonomy_data["taxonomy_id"] = int(taxid)
            else:
                continue
    except:
        print("!!!!!!Unable to get TAXID for this record setting it to 1")
        taxonomy_data["taxonomy_id"] = 1  # unable to identify

    lineage = dict()
    for i in range(10):
        try:
            handle = Entrez.efetch(
                id=taxonomy_data["taxonomy_id"], db="taxonomy", retmode="xml"
            )
            tax_data = Entrez.read(handle)
            lineage = {
                d["Rank"]: d["ScientificName"]
                for d in tax_data[0]["LineageEx"]
                if d["Rank"] in ["class", "phylum"]
            }
            break
        except:
            print("Unexpected error: {}, Retrying, attempt {}".format(sys.exc_info()[0], i))
            if i == 9:
                print(
                    f"FATAL ERROR could not get class and phylum from NCBI after 10 attempts for taxid:{taxonomy_data['taxonomy_id']}. Will add None for class and phylum!"
                )
            else:
                continue
    taxonomy_data["phylum"] = lineage.get("phylum", None)
    taxonomy_data["class"] = lineage.get("class", None)
    if taxonomy_data["phylum"] is not None: taxonomy_data["phylum"] = str(taxonomy_data["phylum"])
    if taxonomy_data["class"] is not None: taxonomy_data["class"] = str(taxonomy_data["class"])
    return taxonomy_data

# Add sequences from schwab_histones_2024

[Article](https://www.nature.com/articles/s41467-024-52337-y)

## Adding Bacterial dimers

In [14]:
accessions_uniprot = ['A0A2A4X5H8', 'A0A3M1VH40', 'A0A7C7W092', 'A0A2D7HW20',
       'A0A2D7PIZ8', 'A0A2E7MAJ9', 'A0A4P5VUA6', 'A0A7V3FMV2',
       'A0A7W1B921', 'A0A353ESF2', 'A0A2H0PEF3', 'A0A1F8WE83',
       'A0A1F9FWF6', 'A0A1F9GKK6', 'A0A1F9H5X7', 'A0A1F9HMM2',
       'A0A4Q5XI17', 'A0A7C7ZM11', 'A0A7C8CPF4', 'A0A7Y3DDR4',
       'A0A7V0PNE8', 'A0A257SVB7', 'A0A7C1A0V2', 'A0A7C5NCB0',
       'A0A150WQJ8', 'Q6MRM1', 'K7YT41', 'M4V527', 'A0A258VAP8',
       'A0A0B8WV74', 'A0A2H0WGB2', 'A0A514WSE3', 'A0A3T0RMK7',
       'A0A254Q8K0', 'A0A514XD40', 'A0A2D5ZUQ7', 'A0A2D6JE81',
       'A0A2E3WWW7', 'A0A849WP48', 'A0A3D3GDM7', 'A0A838F3U4',
       'A0A849WY59', 'A0A1F3T376', 'A0A1F3TPU5', 'A0A1F3YCA2',
       'A0A3A0G721', 'A0A4Q6CE65', 'A0A424JF99', 'A0A6M1YKP4',
       'A0A6M1YTN2', 'A0A6M1Z6N0', 'A0A6M1ZI03', 'A0A6M1ZS67',
       'A0A6G4ZSJ3', 'A0A6M2A456', 'A0A7X5TDC4', 'A0A7X5TFU2',
       'A0A7Y3N6P9', 'A0A832DPN7', 'A0A2H0VRH6', 'F8L7X8', 'D6YWW1',
       'F8LDV9', 'A0A2E3LXH5', 'A0A136MFT4', 'A0A0C9PZ15', 'A0A399WQ92',
       'A0A640XSG0', 'A0A2A4PEU7', 'A0A2E1VTJ4', 'A0A7Y3U233',
       'A0A5C1QUP3', 'A0A370AY42', 'A0A841RH88', 'A0A2D7DN09',
       'A0A6I7PFK7', 'A0A8B5WPZ2', 'A0A1W9V968', 'A0A2G6I076',
       'A0A5C1QGA0', 'A0A7X8HDP6', 'A0A2N1R427', 'A0A496R340',
       'A0A660SUQ8', 'A0A660TA47', 'A0A1V6BQ79', 'A0A1G3KL33',
       'A0A7V7WKS1', 'A0A2H6JAE9', 'A0A1Z9BSG9', 'A0A523UV19',
       'A0A2A4U4F1', 'A0A3B9YFL9', 'A0A3C1EPS7', 'A0A3C1EX96',
       'A0A3D1N0D2', 'A0A7C4EZV5', 'A0A7V4QPX6', 'A0A7Y3L400',
       'A0A350W5Z9', 'A0A1J4YS72', 'A0A2H0DKN7', 'A0A1F9R7C0',
       'A0A1F9RV46', 'A0A2N2EYA9', 'A0A1F9VBW1', 'A0A1F9U0R5',
       'A0A1F9URQ3', 'A0A1F9X2L8', 'A0A1F9WVL4', 'A0A1F9VK35',
       'A0A1F9Z557', 'A0A1F9YS31', 'A0A2M8FXL5', 'A0A1F9MU12',
       'A0A5C7ZTX2', 'T0CBS3', 'T0RDQ6', 'T0RYW6', 'A0A1J5K7E3', 'T0RAZ7',
       'A0A1F3X3W9', 'A0A1F3XVV8', 'A0A2S3QLP3', 'A0A4Z0L0F4',
       'A0A1W9HT56', 'A0A1G1IVY5', 'A0A6M1ZCZ0', 'A0A6M2ACT9',
       'A0A6M1Z3K5', 'A0A6M2A084', 'A0A1F8JWK4', 'A0A286TUA2', 'Q1Q6V0',
       'A0A2E1H726', 'A0A1W9VSC9', 'A0A522YSX4', 'A0A2M7DT38',
       'A0A2E3W395', 'A0A7H9MJT3', 'A0A150WCZ0', 'A0A2E1Q3M8',
       'A0A831W2Z9', 'A0A2M6X6B6', 'A0A3D4Z4S7', 'A0A4Q6EML3',
       'A0A1G2X1Z7', 'A0A1G3BII4', 'A0A1G3CEY0', 'A0A1G3CIR7',
       'A0A1G3CMW1', 'A0A1G3C302', 'A0A2E5YFT3', 'A0A1F3Y6H6',
       'A0A3M2ATE9', 'A0A0B0EK37', 'A0A5C6D6E3', 'A0A1W9HK35',
       'A0A0F9MHI1', 'A0A7C1KLX5', 'A0A523SLD0', 'A0A1G3PAP0',
       'A0A352V7Z6', 'A0A7V1N960', 'A0A2E7GHL6', 'A0A7C7ZJY5',
       'A0A7C3XTR3', 'A0A0F9AH49', 'A0A2D6MMU7', 'A0A2N1SIW4',
       'A0A1F8JBD1', 'A0A8B3ST15', 'A0A6G7GTJ9', 'A0A1G2ZJ06',
       'A0A2C9CBK7', 'A0A7X9FPL7', 'A0A7V4S4G2', 'A0A2E9PQC6',
       'A0A524K7R2', 'A0A539ELE4', 'A0A6M1YF76', 'A0A7Y3J330',
       'A0A355E283', 'A0A2A5DHI0', 'A0A1F9L9J2', 'A0A1G3YFZ0',
       'A0A3D3FS87', 'A0A0F9T4K7', 'A0A2D5ZU05', 'A0A1G2XTA8',
       'A0A6M1ZCZ7', 'A0A850GJH6', 'A0A1F9WYX0', 'A0A1F3SVH3',
       'A0A2M7S363', 'A0A1W9N6V1', 'A0A2E6ELJ5', 'A0A1F9XAK9',
       'A0A1F9UH74', 'A0A1F9HA48', 'A0A3D2C7U9', 'A0A2E7RLV0',
       'A0A1F9XKH0', 'A0A1F3V8J0', 'A0A1F9U9V2', 'A0A1F9V6N1',
       'A0A1G2WND1', 'A0A1G3B2D1', 'A0A1G3C6W1', 'A0A2M7B213',
       'A0A2I0Q7B9', 'A0A2E3NG42']

In [17]:
mapper = ProtMapper()

accessions, failed = mapper.get(
    ids=accessions_uniprot, from_db="UniProtKB_AC-ID", to_db="EMBL-GenBank-DDBJ_CDS"
)
failed

Fetched: 211 / 220


['A0A7C7W092',
 'A0A424JF99',
 'A0A136MFT4',
 'A0A0C9PZ15',
 'A0A2D7DN09',
 'A0A4Z0L0F4',
 'A0A7H9MJT3',
 'A0A2E7GHL6',
 'A0A1F9L9J2']

accessions - GenBank accessions (для некоторых белков из UniProt может быть найдено несколько GenBank записей, мы будем брать все)

failed - идентификаторы UniProt, которые не были обнаружены. Однако, для них есть запись в UniParc. Брать их или нет??

In [None]:
data_sequence = []
for acc in accessions['To']:
    print('------------------------------------------------')
    print(acc)
    with Entrez.efetch(
        db="protein", id=acc, rettype="gb", retmode="text"
    ) as handle:
        record = SeqIO.read(handle, "genbank")
    #     print(record)
    # print(record.seq)
    taxonomy_data = get_taxonomy_data(record)
    data_sequence.append({
        "accession": acc,
        "variant": "Bacterial dimers",
        "gi": None,
        "ncbi_gene_id": None,
        "hgnc_gene_name": None,
        "taxonomy_id": 9606,
        "organism": None,
        "phylum": None,
        "class": None,
        "taxonomy_group": None,
        "info": None,
        "sequence": str(record.seq),
        "variant_under_consideration": None,
    })
    data_sequence[-1].update(taxonomy_data)
    # for k, v in data_sequence[-1].items():
    #     print(k, v, type(v))

In [21]:
failed_toadd = []
for ds in data_sequence:
    try:
        cursor.execute(add_sequence, ds)
    except:
        print(ds['accession'])
        failed_toadd(ds['accession'])

In [22]:
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df['accession'].isin(accessions['To'])]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
22,ADI38622.1,Bacterial dimers,,,,716544.0,Waddlia chondrophila WSU 86-1044,Chlamydiota,Chlamydiia,,,MNENLVVVSKVKKYIKSKAGMNTSANVMDQLSKIVEKEIEKAVQNA...,,,
23,AFX99764.1,Bacterial dimers,,,,1069642.0,Bdellovibrio bacteriovorus str. Tiberius,Bdellovibrionota,Bdellovibrionia,,,MAEVLVVTSKVKKLIKEKGQMNTSAETIDVLSKAIEQLCLKGVESA...,,,
24,AGH94288.1,Bacterial dimers,,,,1184267.0,Pseudobdellovibrio exovorus JSS,Bdellovibrionota,Bdellovibrionia,,,MSEEVVLVVTSKVKKFIKEKGEMNTSAETIDMLSKAIERLCLKGIE...,,,
25,ASD65320.1,Bacterial dimers,,,,959.0,Bdellovibrio bacteriovorus,Bdellovibrionota,Bdellovibrionia,,,MAEVLVVTSKVKKLIKEKGQMNTSAETIDVLSKAIEQLCLKGVESA...,,,
26,AZZ35344.1,Bacterial dimers,,,,1916293.0,Bdellovibrio sp. qaytius,Bdellovibrionota,Bdellovibrionia,,,MSDDILVVTSKVKKYIKEKGQMNTSAETIDMLTKAVERLCAKGIES...,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
731,TVQ38025.1,Bacterial dimers,,,,1898206.0,Spirochaetaceae bacterium,Spirochaetota,Spirochaetia,,,MSDKESLVIASKVKSYIKNTGDLKCSAAVMDVLSDKIRAICDEAIR...,,,
732,TVR55163.1,Bacterial dimers,,,,1898206.0,Spirochaetaceae bacterium,Spirochaetota,Spirochaetia,,,MRYIMSQHFFEGVSMGEKEVLVIASKVKSYIKSKGDLKCSAAVADV...,,,
733,TWU31271.1,Bacterial dimers,,,,2528033.0,Candidatus Brocadiaceae bacterium S225,Planctomycetota,Candidatus Brocadiia,,,MSDSNSEKEVLVVTSKLKKYIRESSGMSTSANVAPALSDTIRNLCN...,,,
734,TXI77600.1,Bacterial dimers,,,,2291710.0,Dokdonella sp.,Pseudomonadota,Gammaproteobacteria,,,MAETLVVVSKIKKMVKDKGLRTGGDYIEGLSKKVEDIVNAAVAKVQ...,,,


### Add publications

In [23]:
pid = "schwab_histones_2024"
query = f"SELECT * FROM publication WHERE id='{pid}'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year


In [25]:
data_publication = {
    "id": pid,
    "title": 'Histones and histone variant families in prokaryotes',
    "doi": '10.1038/s41467-024-52337-y',
    "author": None,
    "year": '2024',
}
cursor.execute(add_publication, data_publication)

In [26]:
query = f"SELECT * FROM publication WHERE id='{pid}'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,schwab_histones_2024,Histones and histone variant families in proka...,10.1038/s41467-024-52337-y,,2024


In [28]:
failed_toadd_publication = []
for ds in data_sequence:
    try:
        cursor.execute(add_sequence_has_publication, (ds['accession'], pid))
    except:
        print(ds['accession'])
        failed_toadd_publication(ds['accession'])

In [29]:
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df['accession'].isin(accessions['To'])]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
22,ADI38622.1,Bacterial dimers,,,,716544.0,Waddlia chondrophila WSU 86-1044,Chlamydiota,Chlamydiia,,,MNENLVVVSKVKKYIKSKAGMNTSANVMDQLSKIVEKEIEKAVQNA...,,ADI38622.1,schwab_histones_2024
23,AFX99764.1,Bacterial dimers,,,,1069642.0,Bdellovibrio bacteriovorus str. Tiberius,Bdellovibrionota,Bdellovibrionia,,,MAEVLVVTSKVKKLIKEKGQMNTSAETIDVLSKAIEQLCLKGVESA...,,AFX99764.1,schwab_histones_2024
24,AGH94288.1,Bacterial dimers,,,,1184267.0,Pseudobdellovibrio exovorus JSS,Bdellovibrionota,Bdellovibrionia,,,MSEEVVLVVTSKVKKFIKEKGEMNTSAETIDMLSKAIERLCLKGIE...,,AGH94288.1,schwab_histones_2024
25,ASD65320.1,Bacterial dimers,,,,959.0,Bdellovibrio bacteriovorus,Bdellovibrionota,Bdellovibrionia,,,MAEVLVVTSKVKKLIKEKGQMNTSAETIDVLSKAIEQLCLKGVESA...,,ASD65320.1,schwab_histones_2024
26,AZZ35344.1,Bacterial dimers,,,,1916293.0,Bdellovibrio sp. qaytius,Bdellovibrionota,Bdellovibrionia,,,MSDDILVVTSKVKKYIKEKGQMNTSAETIDMLTKAVERLCAKGIES...,,AZZ35344.1,schwab_histones_2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
731,TVQ38025.1,Bacterial dimers,,,,1898206.0,Spirochaetaceae bacterium,Spirochaetota,Spirochaetia,,,MSDKESLVIASKVKSYIKNTGDLKCSAAVMDVLSDKIRAICDEAIR...,,TVQ38025.1,schwab_histones_2024
732,TVR55163.1,Bacterial dimers,,,,1898206.0,Spirochaetaceae bacterium,Spirochaetota,Spirochaetia,,,MRYIMSQHFFEGVSMGEKEVLVIASKVKSYIKSKGDLKCSAAVADV...,,TVR55163.1,schwab_histones_2024
733,TWU31271.1,Bacterial dimers,,,,2528033.0,Candidatus Brocadiaceae bacterium S225,Planctomycetota,Candidatus Brocadiia,,,MSDSNSEKEVLVVTSKLKKYIRESSGMSTSANVAPALSDTIRNLCN...,,TWU31271.1,schwab_histones_2024
734,TXI77600.1,Bacterial dimers,,,,2291710.0,Dokdonella sp.,Pseudomonadota,Gammaproteobacteria,,,MAETLVVVSKIKKMVKDKGLRTGGDYIEGLSKKVEDIVNAAVAKVQ...,,TXI77600.1,schwab_histones_2024


### Update publication for Bd0055 (UniProt: Q6MRM1, GeneBank: CAE77736.1)

In [30]:
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df['accession']=='CAE77736.1']

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
33,CAE77736.1,Bacterial dimers,,,,264462.0,Bdellovibrio bacteriovorus HD100,Bdellovibrionota,Bdellovibrionia,,,MAEVLVVTSKVKKLIKEKGQMNTSAETIDVLSKAIEQLCLKGVESA...,,CAE77736.1,schwab_histones_2024


In [36]:
pid = "hocher_histones_2023"
query = f"SELECT * FROM publication WHERE id='{pid}'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year


In [37]:
data_publication = {
    "id": pid,
    "title": 'Histones with an unconventional DNA-binding mode in vitro are major chromatin constituents in the bacterium Bdellovibrio bacteriovorus',
    "doi": '10.1038/s41564-023-01492-x',
    "author": None,
    "year": '2023',
}
cursor.execute(add_publication, data_publication)

In [38]:
query = f"SELECT * FROM publication WHERE id='{pid}'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,hocher_histones_2023,Histones with an unconventional DNA-binding mo...,10.1038/s41564-023-01492-x,,2023


In [39]:
cursor.execute(add_sequence_has_publication, ('CAE77736.1', pid))

In [40]:
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df['accession']=='CAE77736.1']

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
33,CAE77736.1,Bacterial dimers,,,,264462.0,Bdellovibrio bacteriovorus HD100,Bdellovibrionota,Bdellovibrionia,,,MAEVLVVTSKVKKLIKEKGQMNTSAETIDVLSKAIEQLCLKGVESA...,,CAE77736.1,hocher_histones_2023
34,CAE77736.1,Bacterial dimers,,,,264462.0,Bdellovibrio bacteriovorus HD100,Bdellovibrionota,Bdellovibrionia,,,MAEVLVVTSKVKKLIKEKGQMNTSAETIDVLSKAIEQLCLKGVESA...,,CAE77736.1,schwab_histones_2024


In [41]:
# Make sure data is committed to the database
conn.commit()

## Correct description for Bacterial Dimers (change citation noauthor_histones_2023 to hocher_histones_2023)

In [53]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    "WHERE h.id='Bacterial dimers'"
)
cursor.execute(query)
curr_data = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])[['id', 'summary']].values[0]
curr_data

array(['Bacterial dimers', 215,
       'Bacterial dimers have been defined as a group of bacterial histones capable of binding to DNA in order to form nucleohistone filaments. These histones have the capacity to dimerise, yet do not form tetramers. For the bacterial singlet Bd0055, which is highly expressed, a crystallographic structure was obtained in which one monomer is covalently linked to its partner via double symmetry [noauthor_histones_2023]. This group of bacterial histones also includes histones predicted using AlphaFold2 and CLANS clustering, which are characterised by a shortened α3-helix and form stable dimers but do not form tetramers due to the absence of conserved residues, similar to Bd0055 [schwab_histones_2024].'],
      dtype=object)

In [55]:
new_desc = curr_data[2].replace('noauthor_histones_2023', 'hocher_histones_2023')
new_desc

'Bacterial dimers have been defined as a group of bacterial histones capable of binding to DNA in order to form nucleohistone filaments. These histones have the capacity to dimerise, yet do not form tetramers. For the bacterial singlet Bd0055, which is highly expressed, a crystallographic structure was obtained in which one monomer is covalently linked to its partner via double symmetry [hocher_histones_2023]. This group of bacterial histones also includes histones predicted using AlphaFold2 and CLANS clustering, which are characterised by a shortened α3-helix and form stable dimers but do not form tetramers due to the absence of conserved residues, similar to Bd0055 [schwab_histones_2024].'

In [56]:
query = f"UPDATE histone_description SET summary='{new_desc}' WHERE id=215"
cursor.execute(query)

In [57]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    "WHERE h.id='Bacterial dimers'"
)
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])[['id', 'summary']].values[0]

array(['Bacterial dimers', 215,
       'Bacterial dimers have been defined as a group of bacterial histones capable of binding to DNA in order to form nucleohistone filaments. These histones have the capacity to dimerise, yet do not form tetramers. For the bacterial singlet Bd0055, which is highly expressed, a crystallographic structure was obtained in which one monomer is covalently linked to its partner via double symmetry [hocher_histones_2023]. This group of bacterial histones also includes histones predicted using AlphaFold2 and CLANS clustering, which are characterised by a shortened α3-helix and form stable dimers but do not form tetramers due to the absence of conserved residues, similar to Bd0055 [schwab_histones_2024].'],
      dtype=object)

In [58]:
# Make sure data is committed to the database
conn.commit()

## Adding Coiled-coil histones

In [59]:
accessions_uniprot = ['A0A0U3E656', 'A0A150IHS7', 'A0A150IVM8', 'A0A150JGX6',
       'A0A151F470', 'A0A1J4REA8', 'A0A256YV44', 'A0A2D6JDB7',
       'A0A2H9MJ69', 'A0A2H9N012', 'A0A2H9NSA3', 'A0A2H9PHL5',
       'A0A2H9R306', 'A0A328SHR9', 'A0A3A5HKR9', 'A0A497IEL6',
       'A0A497S6Y3', 'A0A497SER2', 'A0A497SQN0', 'A0A497SRR2',
       'A0A497T2K0', 'A0A497T889', 'A0A497TAR1', 'A0A5E4JAX1',
       'A0A662QPX2', 'A0A7J2Z1W4', 'A0A7K4BVA4', 'A0A7K4DAI2',
       'A0A7L4PLV1', 'A0A7L4RRS9', 'A0A832UFM2', 'A0A832URZ6',
       'A0A832UYX0', 'A0A832UZ08', 'A0A832UZA3', 'A0A832XHQ1',
       'A0A832XJ01', 'A0A842WE24', 'A0A497T9W6', 'A0A256Y696',
       'A0A497RIW1', 'A0A497S061', 'A0A497S380', 'A0A256ZMA0',
       'A0A256YK67', 'A0A256XIQ4', 'A0A2D6LPF2', 'A0A7K4BZ95',
       'A0A7T9I1T4', 'A0A2H9LC46', 'A0A2G9P593', 'A0A7L4RG07',
       'A0A7L4RP69', 'A0A1J4USV2', 'A0A1J4UP21', 'A0A2G9PSX9',
       'A0A1F6R5L2', 'A0A2E7LTP1', 'A0A7J3RBG4', 'A0A662FES9',
       'A0A7J4JBI4', 'A0A5Q0UGA9', 'A0A5E4HUZ5', 'A0A5E4LHE8',
       'A0A075H085', 'A0A519BX03', 'A0A7J3S711', 'A0A7L4NYF7',
       'A0A117KQY0', 'A0A2I0PSE5', 'A0A1D3L3H9', 'A0A090I715', 'F0TAE6',
       'F6D3C8', 'A0A6A8RRC5', 'A0A7C6FZU1', 'A0A1D2WCC4', 'A0A347APG5',
       'A0A347AGT4', 'U6EBV9', 'A0A7J4TN52', 'A0A166CV13', 'A0A126QYE7',
       'A0A166ATU3', 'D3E1R1', 'D3E233', 'A5UK87', 'A0A219AJ82',
       'A0A125RBG3', 'A0A125RBG4', 'A0A315XP18', 'A0A315Y9D0',
       'A0A2U1S748', 'A0A2A2HET8', 'A0A2V2BQ81', 'A0A328SDM2',
       'A0A328SM80', 'A0A328RTI1', 'Q2NH93', 'A0A3G9CXG4', 'A0A6B9TEG1',
       'O27861', 'E3GZL0', 'A0A662QQ12', 'A0A662QRY3', 'H0A9P6',
       'A0A6N0NMA0', 'A0A2P6WBG2', 'A0A2P6W4G1', 'A0A2P6W6G1', 'G0QHT5',
       'G0QCR8', 'A0A7J3DLA7', 'A0A1V5QN68', 'A0A2H9LQL7', 'A0A2G9P8H6',
       'A0A0B5HDF4', 'A0A2G9PKJ3', 'A0A2D6M204', 'A0A5E4JAP0',
       'A0A7J2JKL8', 'A0A2A2H2E5', 'A0A832PC39', 'A0A2G9N406',
       'A0A832PFC8', 'A0A328PGV4', 'A0A842LAD1', 'A0A842M8H4',
       'A0A7J4IW72', 'A0A1G5VWZ1', 'A0A1V5MCH4', 'A0A089ZGR6',
       'A0A1D2WX89', 'A0A2H4U578', 'A0A843M5U2', 'B9AH23', 'D2ZML2',
       'A0A2H4VFB3', 'A0A2H4VLQ6', 'A0A843DEG8', 'A0A2H4VTQ1',
       'A0A843FCI8', 'A0A7L4P5S5', 'A0A842KRL2', 'A0A8J7X768',
       'A0A7C6YLQ9', 'A0A832VTF8', 'A0A842KV68', 'A0A843III2',
       'A0A1D2WKA2', 'A0A1H7KCX7', 'A0A3N5B4F0', 'A0A1V4TAQ1',
       'A0A223ZIG0', 'A0A371NBT6', 'A0A7J4MXG2', 'A0A842LRX0',
       'A0A1V6N220', 'A0A328SV21', 'A0A497RT10', 'A0A843APP9', 'D9PUV7',
       'T2GK27', 'A0A842YNW0', 'A0A7J2IEY3', 'A0A2Z4LB54', 'A0A843DBA4',
       'A0A1I4H134', 'A0A843EMQ5', 'A0A1V4YRH4', 'A0A5B9M0W5',
       'A0A6B9TJ95', 'K6U379', 'R9SMH5', 'A0A166AH38', 'A0A842Q7A3',
       'K2Q9P0', 'A0A843HD17', 'A0A843IP09', 'A0A7J4K2J8', 'A0A7J4KVC5',
       'A0A832RLN8', 'A0A842ULJ6', 'A0A1D2WJD0', 'A0A328Q1U1',
       'A0A8J7X7X0', 'A0A519BTD5', 'A0A2D6P6U7', 'A0A2H9QFL4',
       'A0A382EM18', 'A0A5E4JHR8', 'A0A842UMQ4', 'A0A2H9M0R4',
       'A0A848E5V1', 'A0A848ENE1', 'A0A843E4A0', 'A0A843J6T8',
       'A0A843G4C6', 'A0A1D8MTF9', 'A0A842K9Z9', 'A0A2P6WB21',
       'A0A7K4G440', 'A0A2P6W2M6', 'A0A1J4UUT3', 'A0A2G9P0L8',
       'A0A2G9PFR4', 'A0A2H9P6S2', 'A0A2H9Q701', 'A0A5E4ITA2',
       'A0A842V462', 'A0A5E4IZP4', 'A0A2H9L273', 'A0A7J4JXK6',
       'A0A7C3Z5J5', 'A0A7J2ZH22', 'A0A7L4RQ55', 'A0A7D6BGC7',
       'A0A2H9Q828', 'A0A2H9SAZ9', 'A0A7L4RAK1', 'R7PTX2', 'A0A1J4ULD3']

In [60]:
mapper = ProtMapper()

accessions, failed = mapper.get(
    ids=accessions_uniprot, from_db="UniProtKB_AC-ID", to_db="EMBL-GenBank-DDBJ_CDS"
)
failed

Retrying in 3s
Retrying in 3s
Retrying in 3s
Retrying in 3s
Fetched: 228 / 237


['A0A7K4BVA4',
 'A0A117KQY0',
 'A0A6A8RRC5',
 'A0A6N0NMA0',
 'A0A842LAD1',
 'A0A842M8H4',
 'A0A1V5MCH4',
 'A0A832VTF8',
 'A0A842K9Z9']

accessions - GenBank accessions (для некоторых белков из UniProt может быть найдено несколько GenBank записей, мы будем брать все)

failed - идентификаторы UniProt, которые не были обнаружены. Однако, для них есть запись в UniParc. Брать их или нет??

In [61]:
data_sequence = []
for acc in accessions['To']:
    # print('------------------------------------------------')
    print(acc)
    with Entrez.efetch(
        db="protein", id=acc, rettype="gb", retmode="text"
    ) as handle:
        record = SeqIO.read(handle, "genbank")
    #     print(record)
    # print(record.seq)
    taxonomy_data = get_taxonomy_data(record)
    data_sequence.append({
        "accession": acc,
        "variant": "Bacterial dimers",
        "gi": None,
        "ncbi_gene_id": None,
        "hgnc_gene_name": None,
        "taxonomy_id": 9606,
        "organism": None,
        "phylum": None,
        "class": None,
        "taxonomy_group": None,
        "info": None,
        "sequence": str(record.seq),
        "variant_under_consideration": None,
    })
    data_sequence[-1].update(taxonomy_data)
    # for k, v in data_sequence[-1].items():
    #     print(k, v, type(v))

ALT68400.1
Fetched taxid from NCBI 230361
KYC44541.1
Fetched taxid from NCBI 1705564
KYC46628.1
Fetched taxid from NCBI 1705564
KYC49116.1
Fetched taxid from NCBI 1705564
KYC48928.1
Fetched taxid from NCBI 1705564
KYC54340.1
Fetched taxid from NCBI 1705564
KYC56479.1
Fetched taxid from NCBI 1705564
KYK35687.1
Fetched taxid from NCBI 1803813
OIN85151.1
Fetched taxid from NCBI 1803500
OYT43604.1
Fetched taxid from NCBI 2012505
MAF89615.1
Fetched taxid from NCBI 2026739
PIV69099.1
Fetched taxid from NCBI 1974382
PIW40901.1
Fetched taxid from NCBI 1974383
PIY35550.1
Fetched taxid from NCBI 1974379
PIZ34779.1
Fetched taxid from NCBI 1974378
PJB74674.1
Fetched taxid from NCBI 1974381
RAP51354.1
Fetched taxid from NCBI 1945631
RJS48357.1
Fetched taxid from NCBI 2164
RLG21210.1
Fetched taxid from NCBI 2250274
RLI96176.1
Fetched taxid from NCBI 2250256
RLI97775.1
Fetched taxid from NCBI 3121606
RLJ01743.1
Fetched taxid from NCBI 3121606
RLJ01124.1
Fetched taxid from NCBI 3121606
RLJ06386.1
Fetc

In [62]:
failed_toadd = []
for ds in data_sequence:
    try:
        cursor.execute(add_sequence, ds)
    except:
        print(ds['accession'])
        failed_toadd(ds['accession'])

In [63]:
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df['accession'].isin(accessions['To'])]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
2,AAB86299.1,Bacterial dimers,,,,187420.0,Methanothermobacter thermautotrophicus str. De...,Methanobacteriota,Methanobacteria,,,MEVEIMENEEFQNNNADVTETEEETEELPFAKAEVVRLMKQHLDSD...,,,
14,ABC56810.1,Bacterial dimers,,,,339860.0,Methanosphaera stadtmanae DSM 3091,Methanobacteriota,Methanobacteria,,,MDDFEEMNKTTEVEDVEVDKTEANTEEYMDENEEKLPFAKAEVVRL...,,,
15,ABQ86615.1,Bacterial dimers,,,,420247.0,Methanobrevibacter smithii ATCC 35061,Methanobacteriota,Methanobacteria,,,MSDEEMIEETTEEFIEEDEEKLPFAKAEVVRLMKENLDDDKMIRER...,,,
25,ADC46472.1,Bacterial dimers,,,,634498.0,Methanobrevibacter ruminantium M1,Methanobacteriota,Methanobacteria,,,MAEDEKIEEIEEIEDKEEENLPFAKAEVVRLMKENLDDDKMIRERV...,,,
26,ADC46594.1,Bacterial dimers,,,,634498.0,Methanobrevibacter ruminantium M1,Methanobacteriota,Methanobacteria,,,MQDEPNKKINRAKDDENLLFARGEVIRLMKDHLDKDKMITERVKVE...,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
960,VVB70230.1,Bacterial dimers,,,,115547.0,uncultured archaeon,,,,,MEDEQTQTPQEDQSEDELDDDSENLPFPNARVVKIIKSNLVKEHQL...,,,
961,VVB73592.1,Bacterial dimers,,,,115547.0,uncultured archaeon,,,,,MADDSNEPINENGEAELSPEMDEKLPFPTARVVRLMKEGMTKPHQI...,,,
962,VVB74229.1,Bacterial dimers,,,,115547.0,uncultured archaeon,,,,,MSEVEAKEMAQDADSDSEEDVIDEPEEEGGVTATENTAEDAEKLPF...,,,
963,VVB76490.1,Bacterial dimers,,,,2885756.0,Candidatus Tiddalikarchaeum anstoanum,Nanobdellota,Candidatus Nanoarchaeia,,,MADDEKKEATEEEIDEAAENLAFTNAEIVRLIKKNLPEGRMIKKRV...,,,


### Add publications

In [64]:
pid = "schwab_histones_2024"
query = f"SELECT * FROM publication WHERE id='{pid}'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,schwab_histones_2024,Histones and histone variant families in proka...,10.1038/s41467-024-52337-y,,2024


In [65]:
failed_toadd_publication = []
for ds in data_sequence:
    try:
        cursor.execute(add_sequence_has_publication, (ds['accession'], pid))
    except:
        print(ds['accession'])
        failed_toadd_publication(ds['accession'])

In [66]:
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df['accession'].isin(accessions['To'])]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
2,AAB86299.1,Bacterial dimers,,,,187420.0,Methanothermobacter thermautotrophicus str. De...,Methanobacteriota,Methanobacteria,,,MEVEIMENEEFQNNNADVTETEEETEELPFAKAEVVRLMKQHLDSD...,,AAB86299.1,schwab_histones_2024
14,ABC56810.1,Bacterial dimers,,,,339860.0,Methanosphaera stadtmanae DSM 3091,Methanobacteriota,Methanobacteria,,,MDDFEEMNKTTEVEDVEVDKTEANTEEYMDENEEKLPFAKAEVVRL...,,ABC56810.1,schwab_histones_2024
15,ABQ86615.1,Bacterial dimers,,,,420247.0,Methanobrevibacter smithii ATCC 35061,Methanobacteriota,Methanobacteria,,,MSDEEMIEETTEEFIEEDEEKLPFAKAEVVRLMKENLDDDKMIRER...,,ABQ86615.1,schwab_histones_2024
25,ADC46472.1,Bacterial dimers,,,,634498.0,Methanobrevibacter ruminantium M1,Methanobacteriota,Methanobacteria,,,MAEDEKIEEIEEIEDKEEENLPFAKAEVVRLMKENLDDDKMIRERV...,,ADC46472.1,schwab_histones_2024
26,ADC46594.1,Bacterial dimers,,,,634498.0,Methanobrevibacter ruminantium M1,Methanobacteriota,Methanobacteria,,,MQDEPNKKINRAKDDENLLFARGEVIRLMKDHLDKDKMITERVKVE...,,ADC46594.1,schwab_histones_2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
960,VVB70230.1,Bacterial dimers,,,,115547.0,uncultured archaeon,,,,,MEDEQTQTPQEDQSEDELDDDSENLPFPNARVVKIIKSNLVKEHQL...,,VVB70230.1,schwab_histones_2024
961,VVB73592.1,Bacterial dimers,,,,115547.0,uncultured archaeon,,,,,MADDSNEPINENGEAELSPEMDEKLPFPTARVVRLMKEGMTKPHQI...,,VVB73592.1,schwab_histones_2024
962,VVB74229.1,Bacterial dimers,,,,115547.0,uncultured archaeon,,,,,MSEVEAKEMAQDADSDSEEDVIDEPEEEGGVTATENTAEDAEKLPF...,,VVB74229.1,schwab_histones_2024
963,VVB76490.1,Bacterial dimers,,,,2885756.0,Candidatus Tiddalikarchaeum anstoanum,Nanobdellota,Candidatus Nanoarchaeia,,,MADDEKKEATEEEIDEAAENLAFTNAEIVRLIKKNLPEGRMIKKRV...,,VVB76490.1,schwab_histones_2024


### Correct mistake - update variant feild to Coiled-coil

In [74]:
failed_toadd = []
for ds in data_sequence:
    # try:
    query = f"UPDATE sequence SET variant='Coiled-coil' WHERE accession='{ds['accession']}'"
    cursor.execute(query)
    # except:
    #     print(ds['accession'])
    #     failed_toadd.append(ds['accession'])

In [75]:
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df['accession'].isin(accessions['To'])]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
2,AAB86299.1,Coiled-coil,,,,187420.0,Methanothermobacter thermautotrophicus str. De...,Methanobacteriota,Methanobacteria,,,MEVEIMENEEFQNNNADVTETEEETEELPFAKAEVVRLMKQHLDSD...,,AAB86299.1,schwab_histones_2024
14,ABC56810.1,Coiled-coil,,,,339860.0,Methanosphaera stadtmanae DSM 3091,Methanobacteriota,Methanobacteria,,,MDDFEEMNKTTEVEDVEVDKTEANTEEYMDENEEKLPFAKAEVVRL...,,ABC56810.1,schwab_histones_2024
15,ABQ86615.1,Coiled-coil,,,,420247.0,Methanobrevibacter smithii ATCC 35061,Methanobacteriota,Methanobacteria,,,MSDEEMIEETTEEFIEEDEEKLPFAKAEVVRLMKENLDDDKMIRER...,,ABQ86615.1,schwab_histones_2024
25,ADC46472.1,Coiled-coil,,,,634498.0,Methanobrevibacter ruminantium M1,Methanobacteriota,Methanobacteria,,,MAEDEKIEEIEEIEDKEEENLPFAKAEVVRLMKENLDDDKMIRERV...,,ADC46472.1,schwab_histones_2024
26,ADC46594.1,Coiled-coil,,,,634498.0,Methanobrevibacter ruminantium M1,Methanobacteriota,Methanobacteria,,,MQDEPNKKINRAKDDENLLFARGEVIRLMKDHLDKDKMITERVKVE...,,ADC46594.1,schwab_histones_2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
960,VVB70230.1,Coiled-coil,,,,115547.0,uncultured archaeon,,,,,MEDEQTQTPQEDQSEDELDDDSENLPFPNARVVKIIKSNLVKEHQL...,,VVB70230.1,schwab_histones_2024
961,VVB73592.1,Coiled-coil,,,,115547.0,uncultured archaeon,,,,,MADDSNEPINENGEAELSPEMDEKLPFPTARVVRLMKEGMTKPHQI...,,VVB73592.1,schwab_histones_2024
962,VVB74229.1,Coiled-coil,,,,115547.0,uncultured archaeon,,,,,MSEVEAKEMAQDADSDSEEDVIDEPEEEGGVTATENTAEDAEKLPF...,,VVB74229.1,schwab_histones_2024
963,VVB76490.1,Coiled-coil,,,,2885756.0,Candidatus Tiddalikarchaeum anstoanum,Nanobdellota,Candidatus Nanoarchaeia,,,MADDEKKEATEEEIDEAAENLAFTNAEIVRLIKKNLPEGRMIKKRV...,,VVB76490.1,schwab_histones_2024


## Adding Face-to-face histones

In [76]:
accessions_uniprot = ['A0A063ZTS2', 'A0A081ES94', 'A0A097QWD9', 'A0A0B3AAJ7',
       'A0A0B3AHA4', 'A0A0B3AMP7', 'A0A0B3AXW7', 'A0A0B3B4K7',
       'A0A0B5GM27', 'A0A0B5HZ25', 'A0A0B5I1L7', 'A0A0D6JND7',
       'A0A0F7PG32', 'A0A0K1IPY9', 'A0A0M9AMB6', 'A0A0M9AS20',
       'A0A0N0BQL8', 'A0A0P7GMZ7', 'A0A0Q2QRD6', 'A0A0U5D191',
       'A0A0U5D251', 'A0A0U5H5P3', 'A0A0W1R3Z6', 'A0A0W1STI9',
       'A0A0X1KKW0', 'A0A124EBG8', 'A0A133VKQ4', 'A0A135VIL6',
       'A0A135VRW9', 'A0A142CXH1', 'A0A151A8D9', 'A0A151A8F6',
       'A0A151A9H5', 'A0A151AAS1', 'A0A151AGZ3', 'A0A161ZNN7',
       'A0A165L5R7', 'A0A165LI09', 'A0A166G4L7', 'A0A166RM48',
       'A0A1D2Q8Q6', 'A0A1D2QB17', 'A0A1D2QBN8', 'A0A1D2QII9',
       'A0A1D2QK59', 'A0A1D8MQW2', 'A0A1D8S744', 'A0A1F6Z251',
       'A0A1F6ZC06', 'A0A1G6NUX2', 'A0A1G7GY70', 'A0A1G7RI57',
       'A0A1G8TKY1', 'A0A1G9GB26', 'A0A1G9PJQ8', 'A0A1G9SIM0',
       'A0A1H1EAJ6', 'A0A1H1HWW5', 'A0A1H2SGR0', 'A0A1H3ABB9',
       'A0A1H3EK51', 'A0A1H3FXI0', 'A0A1H3J092', 'A0A1H3NXB2',
       'A0A1H3Z6Q1', 'A0A1H3ZW45', 'A0A1H4AA03', 'A0A1H5WDE9',
       'A0A1H6CCZ6', 'A0A1H6G4T3', 'A0A1H6HSI5', 'A0A1H6RCI3',
       'A0A1H7L5E4', 'A0A1H8IL87', 'A0A1H8MQB7', 'A0A1H8QLS1',
       'A0A1H8VL74', 'A0A1H9IX86', 'A0A1I0PU93', 'A0A1I0QGT7',
       'A0A1I1H7F4', 'A0A1I1L4Q6', 'A0A1I2LLW9', 'A0A1I2WB69',
       'A0A1I3B8U1', 'A0A1I3NNZ6', 'A0A1I4G9M3', 'A0A1I6FK86',
       'A0A1I6FUI6', 'A0A1I6G030', 'A0A1I6KB96', 'A0A1I6P7E5',
       'A0A1J4LPZ6', 'A0A1J4UJF4', 'A0A1J4VXJ4', 'A0A1J4W1A3',
       'A0A1J4W298', 'A0A1J4XXG0', 'A0A1J4ZF27', 'A0A1M5QHD7',
       'A0A1M6ZY81', 'A0A1M7B0G7', 'A0A1N7BQP2', 'A0A1N7CJQ4',
       'A0A1N7ELE7', 'A0A1N7EMX5', 'A0A1N7EWW1', 'A0A1N7EYK6',
       'A0A1N7EZX5', 'A0A1N7F9P1', 'A0A1N7FFA6', 'A0A1Q1FJI3',
       'A0A1Q9NJ58', 'A0A1Q9P034', 'A0A1Q9PF12', 'A0A1S6GXX4',
       'A0A1S6H1E5', 'A0A1S6H9N7', 'A0A1S6HEP4', 'A0A1S8B0B8',
       'A0A1U7EU63', 'A0A1V5TQF5', 'A0A1X4H9D8', 'A0A1Y3A6C6',
       'A0A1Y3AJJ9', 'A0A1Z2TMB6', 'A0A202E5N3', 'A0A218P3Q6',
       'A0A218P8Q1', 'A0A218PEF3', 'A0A238WDZ3', 'A0A238X234',
       'A0A238X3F0', 'A0A256HEQ5', 'A0A256HII9', 'A0A256HTZ0',
       'A0A256HV59', 'A0A256HYL0', 'A0A256IG49', 'A0A256JLX2',
       'A0A256KDR0', 'A0A256KI76', 'A0A256XU48', 'A0A256Y3L8',
       'A0A256YB44', 'A0A256ZZW0', 'A0A285P3L2', 'A0A2A2F9W7',
       'A0A2A2FDG5', 'A0A2A5QYS2', 'A0A2B7GTP7', 'A0A2D5V1C9',
       'A0A2D5XD57', 'A0A2D6F200', 'A0A2D6KD04', 'A0A2D6L4A9',
       'A0A2D6MJ46', 'A0A2D6NCI1', 'A0A2D6P4N4', 'A0A2D6PD18',
       'A0A2D6Q8E4', 'A0A2D6RKK9', 'A0A2D6SQC9', 'A0A2D6TIU0',
       'A0A2D6TNM1', 'A0A2D6U115', 'A0A2D6W4P7', 'A0A2E9LGY5',
       'A0A2G1WKG9', 'A0A2G1WM39', 'A0A2G1WMH2', 'A0A2G1WVC5',
       'A0A2G1WXA7', 'A0A2G1WZV7', 'A0A2G1X8N7', 'A0A2G9LUD0',
       'A0A2G9LVZ1', 'A0A2G9M6A5', 'A0A2G9MBT1', 'A0A2G9MDS7',
       'A0A2G9MFA4', 'A0A2G9MNS0', 'A0A2G9MS88', 'A0A2G9N4L4',
       'A0A2G9NGQ0', 'A0A2G9NHB9', 'A0A2G9NI47', 'A0A2G9NMS0',
       'A0A2G9NQI6', 'A0A2G9PBT6', 'A0A2G9PZ11', 'A0A2G9Q074',
       'A0A2H5A1U4', 'A0A2H5UVZ0', 'A0A2H5V7B8', 'A0A2H6H102',
       'A0A2H9L4R6', 'A0A2H9LQZ2', 'A0A2H9MH16', 'A0A2H9MZL1',
       'A0A2H9NGD1', 'A0A2H9NRT6', 'A0A2H9NVD1', 'A0A2H9P042',
       'A0A2H9PGU7', 'A0A2H9PKZ8', 'A0A2H9PST3', 'A0A2H9PW86',
       'A0A2H9Q037', 'A0A2H9Q3V2', 'A0A2H9QJH0', 'A0A2H9R221',
       'A0A2H9R6V4', 'A0A2H9RKA8', 'A0A2H9RN49', 'A0A2H9TER5',
       'A0A2I8VNF8', 'A0A2I8VRE2', 'A0A2J6H128', 'A0A2P4NUV2',
       'A0A2P6VYB6', 'A0A2R4WYC8', 'A0A2R6CGK9', 'A0A2R6CM35',
       'A0A2R6CWP3', 'A0A2R6D732', 'A0A2R6DCI8', 'A0A2R6DE40',
       'A0A2R6DGN1', 'A0A2R6DPU3', 'A0A2R6DTY3', 'A0A2R6DWQ1',
       'A0A2R6E1F2', 'A0A2R6EF64', 'A0A2R6ENL6', 'A0A2R6EQW3',
       'A0A2R6EZI6', 'A0A2R6F0P5', 'A0A2R6FGE0', 'A0A2R6FK99',
       'A0A2R6FQP5', 'A0A2R6FYR7', 'A0A2R6FZ97', 'A0A2R6G8U7',
       'A0A2R6GLD8', 'A0A2R6GMU3', 'A0A2R6GY04', 'A0A2R6H8C7',
       'A0A2R6H9V3', 'A0A2R6HBG3', 'A0A2R6HNW8', 'A0A2R6HWX7',
       'A0A2R6IEH7', 'A0A2R6INQ1', 'A0A2R6IQ80', 'A0A2R6J3N5',
       'A0A2R6JC94', 'A0A2R6JCB2', 'A0A2R6JF73', 'A0A2R6JL56',
       'A0A2R6K0P9', 'A0A2R6K7K5', 'A0A2R6KFC3', 'A0A2R6KR86',
       'A0A2R6KUJ0', 'A0A2R6KYT2', 'A0A2R6KZ23', 'A0A2R6L2G9',
       'A0A2R6LHP4', 'A0A2R6LMP3', 'A0A2R6LXV8', 'A0A2R6M5W0',
       'A0A2R6MJH8', 'A0A2R6ML42', 'A0A2R6MM89', 'A0A2R6MN12',
       'A0A2R6MXJ6', 'A0A2R6N0R0', 'A0A2R6N1N6', 'A0A2R6N893',
       'A0A2R6N921', 'A0A2Z2HU79', 'A0A2Z2MD79', 'A0A2Z2MDC9',
       'A0A2Z2MDI3', 'A0A2Z2MLN1', 'A0A2Z2N0L2', 'A0A329TBS8',
       'A0A343TJT5', 'A0A345E9C5', 'A0A346PA30', 'A0A365T8C9',
       'A0A368NAG4', 'A0A371K7B7', 'A0A371K8G2', 'A0A371KE49',
       'A0A371KRZ6', 'A0A371L674', 'A0A371L9H6', 'A0A371LA76',
       'A0A371LYX9', 'A0A371M3P9', 'A0A371M3Q8', 'A0A371M6R0',
       'A0A371MPL9', 'A0A371N421', 'A0A384KSN1', 'A0A384LG55',
       'A0A3A5JBC3', 'A0A3A5JDR3', 'A0A3A5JGU7', 'A0A3A5JIP1',
       'A0A3A5JV04', 'A0A3A6PP67', 'A0A3A6QA42', 'A0A3A6QKJ7',
       'A0A3L6JE44', 'A0A3L6JM86', 'A0A3M0CIK5', 'A0A3M0D146',
       'A0A3M0Y8H2', 'A0A3M1DGF3', 'A0A3M1F829', 'A0A3M1FC41',
       'A0A3M1HAI1', 'A0A3M1NJN7', 'A0A3N6LWK9', 'A0A3N6N0C5',
       'A0A3N6P7W4', 'A0A3P3R5D5', 'A0A3Q9EEG7', 'A0A3R7DBX8',
       'A0A3R7IAD7', 'A0A3R7IM04', 'A0A410TR08', 'A0A419FS79',
       'A0A421E540', 'A0A421EGY8', 'A0A421ELT2', 'A0A482T6M6',
       'A0A482TFH5', 'A0A482Y4A9', 'A0A495R235', 'A0A497EJ12',
       'A0A497PDA6', 'A0A497PSE8', 'A0A497PTC2', 'A0A497S989',
       'A0A497SC40', 'A0A497SNL9', 'A0A498E3E6', 'A0A498E545',
       'A0A498E9U8', 'A0A498EN39', 'A0A498ER75', 'A0A498FEQ6',
       'A0A498FM33', 'A0A498FNA3', 'A0A498G459', 'A0A498G6P0',
       'A0A498GA91', 'A0A498GC44', 'A0A498GJH2', 'A0A498KRS4',
       'A0A4C2EJH4', 'A0A4C2EMS7', 'A0A4C2EN04', 'A0A4C2EP19',
       'A0A4D6GYW0', 'A0A4D6HA07', 'A0A4D6HNT3', 'A0A4D6KCN0',
       'A0A4D7GYY6', 'A0A4P8WPY4', 'A0A4P8WQ42', 'A0A4S3TT61',
       'A0A4U5JDD9', 'A0A4U7C176', 'A0A4U7C1W4', 'A0A4U7C8U0',
       'A0A4U7CBD2', 'A0A4U7CYJ6', 'A0A4U7DBG4', 'A0A4U7DFJ1',
       'A0A4U7DPA0', 'A0A4U7DX51', 'A0A4U7DYQ2', 'A0A4U7DZA5',
       'A0A4U7EI78', 'A0A4U7EWV6', 'A0A4U7F007', 'A0A4U7F2V0',
       'A0A4V1IFG0', 'A0A4V6E172', 'A0A4V6E676', 'A0A4Y5SLD7',
       'A0A510NAH2', 'A0A518S9R1', 'A0A519BR08', 'A0A519BS89',
       'A0A521CHI1', 'A0A521ET06', 'A0A523XQR6', 'A0A524D048',
       'A0A524D1T5', 'A0A524DE19', 'A0A524DIH9', 'A0A524DTX6',
       'A0A524EQI0', 'A0A524ESI0', 'A0A524G590', 'A0A524L8G8',
       'A0A524LZ65', 'A0A532T4Y2', 'A0A544QNQ4', 'A0A554NC87',
       'A0A558GAR3', 'A0A5C0SI94', 'A0A5C0XQU7', 'A0A5C9E547',
       'A0A5C9EHF2', 'A0A5C9ER26', 'A0A5C9ETQ6', 'A0A5C9EZL2',
       'A0A5D5AS44', 'A0A5E4I285', 'A0A5E4IMX6', 'A0A5E4JRR1',
       'A0A5E4JRX4', 'A0A5E4JZQ5', 'A0A5E4K289', 'A0A5E4K6L5',
       'A0A5J5LB27', 'A0A5N1KWG6', 'A0A5N1L1M0', 'A0A5N1LAG1',
       'A0A5N5U4Y8', 'A0A5N5U9G2', 'A0A5P8LWJ1', 'A0A5P9P1Y3',
       'A0A5Q0UI58', 'A0A5Q0UIZ6', 'A0A643JUZ0', 'A0A643KCR1',
       'A0A650A1I4', 'A0A662EX90', 'A0A662FE67', 'A0A662FLX2',
       'A0A662K6T1', 'A0A662L487', 'A0A662QIX8', 'A0A662QNS9',
       'A0A662QUL5', 'A0A6A8G7R3', 'A0A6A8GDD5', 'A0A6A9S847',
       'A0A6A9SBS2', 'A0A6A9SJA9', 'A0A6A9SQ22', 'A0A6A9T0A4',
       'A0A6A9T9M9', 'A0A6B0GP28', 'A0A6B0HI17', 'A0A6B0SS20',
       'A0A6B0SWJ4', 'A0A6B0SZB7', 'A0A6B0T1E1', 'A0A6B0T3C7',
       'A0A6B0T9H2', 'A0A6B0VJ64', 'A0A6B1IRP3', 'A0A6B9FAN3',
       'A0A6B9FDY2', 'A0A6B9Y5B1', 'A0A6C0UN02', 'A0A6G0EF38',
       'A0A6G1YZI6', 'A0A6G2FX32', 'A0A6G2HCB3', 'A0A6G3ZER8',
       'A0A6G4RA29', 'A0A6G8T455', 'A0A6N0NSD0', 'A0A7C1MIP5',
       'A0A7C1NDG7', 'A0A7C1R3J6', 'A0A7C4E4D7', 'A0A7C4RMI9',
       'A0A7C4UZ97', 'A0A7C4WZT2', 'A0A7C5DKU2', 'A0A7C5K0B8',
       'A0A7C5YAT2', 'A0A7D3Y0A3', 'A0A7D4CUY9', 'A0A7D4XZZ6',
       'A0A7D4Y7F3', 'A0A7D4Y915', 'A0A7D5BZ01', 'A0A7D5GM11',
        'A0A7D5IIG8', 'A0A7D5IY97', 'A0A7D5KNH3', 'A0A7D5KP22',
       'A0A7D5L8T8', 'A0A7D5LAP3', 'A0A7D5NXK7', 'A0A7D5T6R6',
       'A0A7D5T910', 'A0A7D5TH01', 'A0A7D5TUJ4', 'A0A7D5TXP0',
       'A0A7D6CNW7', 'A0A7G2DBC0', 'A0A7J2GYZ3', 'A0A7J2IYX5',
       'A0A7J2JKI1', 'A0A7J2TZW3', 'A0A7J2TZX2', 'A0A7J2VTG8',
       'A0A7J2VWR3', 'A0A7J3CV08', 'A0A7J3LN08', 'A0A7J3MJ20',
       'A0A7J3UHZ5', 'A0A7J4HVR3', 'A0A7J4I3Z7', 'A0A7J4I8G3',
       'A0A7J4IMR8', 'A0A7J4J350', 'A0A7J4J7I9', 'A0A7J4JG57',
       'A0A7J4K2Q3', 'A0A7J4K7Y6', 'A0A7J4KSR3', 'A0A7J4L151',
       'A0A7J4LBP3', 'A0A7J4LY84', 'A0A7J4QDZ5', 'A0A7J4QGV4',
       'A0A7J4S1R1', 'A0A7J4U564', 'A0A7J4UK75', 'A0A7J9Q4U8',
       'A0A7J9SLQ7', 'A0A7K4C800', 'A0A7K4GKQ9', 'A0A7K4HAN7',
       'A0A7K4IH93', 'A0A7T3KU30', 'A0A7T5R8J5', 'A0A7T5UJN9',
       'A0A7T9I0X7', 'A0A830DPE4', 'A0A830EHH7', 'A0A830EUY3',
       'A0A830EX68', 'A0A830F6V0', 'A0A830F7V6', 'A0A830FBB9',
       'A0A830FDK8', 'A0A830FEU3', 'A0A830FTF4', 'A0A830FWT8',
       'A0A830FYC6', 'A0A830G3U1', 'A0A830G4X0', 'A0A830GAX3',
       'A0A830GEQ1', 'A0A830GFD9', 'A0A830GPV9', 'A0A833A2Z5',
       'A0A841HC11', 'A0A841HFF3', 'A0A841HFY0', 'A0A842K380',
       'A0A842NMV4', 'A0A842PLA1', 'A0A842Q6I5', 'A0A842QN38',
       'A0A842SAN4', 'A0A842TR04', 'A0A842U389', 'A0A842UL59',
       'A0A842US40', 'A0A842UZA4', 'A0A842V0K1', 'A0A842V7F3',
       'A0A842V8I8', 'A0A842W799', 'A0A842WBN8', 'A0A842WZD3',
       'A0A843JZZ9', 'A0A846PMY8', 'A0A846SXF2', 'A0A847UMM4',
       'A0A848EQ92', 'A0A849PVR0', 'A0A849QDU6', 'A0A849R7C1',
       'A0A895ADF3', 'A0A895AEX1', 'A0A897MUW8', 'A0A897N342',
       'A0A897NBV1', 'A0A897NL67', 'A0A8A2U385', 'A0A8A2VJ00', 'B0R7H1',
       'B0R8M9', 'B6YVF4', 'B7QZX8', 'B9LU10', 'C5A4R6', 'C7NV10',
       'C7P233', 'D2RS00', 'D3SVB1', 'D4GQ55', 'D4GZE0', 'D6GVQ6',
       'D8J532', 'D8JB14', 'D8JBI5', 'D8JCH2', 'E4NRI4', 'E6N2V7',
       'E6NA72', 'E7QMR0', 'F7PMF2', 'F7PPY7', 'F8DCN4', 'G0HKU4',
       'G0HTK4', 'G0LFQ3', 'G0QB19', 'G0QEU7', 'G2MHG6', 'G2MPE8',
       'H0ACJ8', 'I3R126', 'I3ZWQ0', 'I6V1Y9', 'I7CNZ7', 'I7CP19',
       'J3EX64', 'L0AI84', 'L0I8V8', 'L0JT31', 'L0JT86', 'L0K6J0',
       'L5NX63', 'L9VFZ5', 'L9WAH4', 'L9WKX4', 'L9X283', 'L9XBJ4',
       'L9XSE1', 'L9XVQ5', 'L9Y2W9', 'L9YQ59', 'L9YZP0', 'L9YZY6',
       'L9Z1D3', 'L9ZH87', 'L9ZR38', 'L9ZWA1', 'M0AHJ1', 'M0AXD1',
       'M0BAJ7', 'M0BL41', 'M0C7N5', 'M0CHU8', 'M0CX07', 'M0D6B5',
       'M0DBL8', 'M0DS10', 'M0DV55', 'M0E1N7', 'M0EBW8', 'M0EDY1',
       'M0EIJ4', 'M0EX78', 'M0F3S4', 'M0FH50', 'M0FSH5', 'M0FV11',
       'M0FZU4', 'M0GF17', 'M0GNY5', 'M0GUV1', 'M0HLI2', 'M0HW30',
       'M0I0R5', 'M0I5J9', 'M0II18', 'M0J5C0', 'M0J7X1', 'M0J9V1',
       'M0K0M4', 'M0K294', 'M0K3I4', 'M0K7B2', 'M0K7K6', 'M0L0Q3',
       'M0L6G7', 'M0LMY2', 'M0LQ94', 'M0M032', 'M0M6B8', 'M0M9N2',
       'M0MGB9', 'M0MGQ0', 'M0MKX2', 'M0MRS0', 'M0MSS2', 'M0MWD4',
       'M0N2Y1', 'M0N4K5', 'M0N5C0', 'M0N972', 'M0NA51', 'M0NBB3',
       'M0NDB0', 'M0NDM6', 'M0NDX9', 'M0NF95', 'M0NFK4', 'M0NGJ0',
       'M0NGU9', 'M0NRF5', 'M0NSL3', 'M0P8Y8', 'M0P9D2', 'M0PE95',
       'M0PLK7', 'M1XMW9', 'Q18E68', 'Q5JDW7', 'Q5UZD8', 'Q5V751',
       'Q8U1D0', 'Q9HHD8', 'Q9HN32', 'Q9UWP8', 'R4W3D9', 'U1NMU9',
       'U1NY90', 'U1NYI6', 'U1PU03', 'U1QKF4', 'V4GN65', 'V4HAY6',
       'V4X669', 'V4XTY4', 'V4XWK1', 'V4YBJ4', 'V5TQR0', 'V6DR83',
       'V6DVC9', 'V6DY05', 'W0JPW1', 'W0JZB9', 'W0K3X0', 'W0K8H7',
       'W8NTX3', 'A0A532THE7', 'A0A497P9U3', 'A0A497SR00', 'A0A497TFM2',
       'A0A256YVI3', 'A0A2D6M217', 'A0A7J4LID0', 'A0A2D6PIX6',
       'A0A7J4JBN3', 'A0A5E4JEQ3', 'A0A147JYK9', 'M0NBJ3', 'A0A2E5QFX1',
       'A0A6A8AM72', 'A0A0B3AJY9', 'A0A3M1VXD1', 'A0A7V3H538',
       'A0A7Y2E8Q7', 'A0A0G0ZRF0', 'A0A554IMG3', 'A0A554M6I4',
       'A0A0S1SPV4', 'A0A349D2I3', 'A0A2A4U237', 'A0A846PDF4',
       'A0A7Y2FGJ3', 'A0A3M2E106', 'A0A534W9A3', 'A0A534YH26',
       'A0A534YS08', 'A0A848ZJC5', 'A0A496XR62', 'A0A7X7C7S9',
       'A0A7X9HPB6', 'A0A450YNR7', 'A0A450Z010', 'A0A661EN10', 'T0CDL6',
       'T0REW7', 'T0SJP8', 'A0A1J5KGZ7', 'T0SMF1', 'A0A2D6JKW5',
       'A0A1F3VLF8', 'A0A2E1RDX6', 'A0A2E2UD47', 'A0A2E3W2R9',
       'A0A2E4D8G2', 'A0A2E4Y315', 'E1WYC3', 'A0A2E6H7C5', 'A0A2S3QU58',
       'A0A2G8HV32', 'A0A523CMH0', 'A0A7C2BU76', 'A0A7C4T0X9',
       'A0A7Y5VQT6', 'A0A1W9TAA6', 'A0A2A5DAK1', 'A0A2A5DLG9',
       'A0A2E1H2P9', 'A0A3M1TRV9', 'A0A7Y3XLQ0', 'A0A1G3BJX8',
       'A0A3D3KQ33', 'A0A7C3TW14', 'A0A533SKR9', 'A0A1V5NCE7',
       'A0A2E6NJW5', 'A0A2E7NCY3', 'A0A7C2ETS7', 'A0A1F4Q587',
       'A0A2M6WZT5', 'A0A1G1X2W2', 'A0A1F5YCB8', 'A0A1F6LPN0',
       'A0A2H0S0K3', 'A0A2H0RP58', 'A0A7Y4QFM2', 'A0A3M1S117',
       'A0A554K9C6', 'A0A0G1LC34', 'A0A2H0NZH6', 'A0A1F9UH79',
       'A0A1J4V3K3', 'A0A3N5W0L3', 'A0A2D6E7R0', 'A0A2D6BK07',
       'A0A2D6MPL4', 'A0A2D7I2C2', 'A0A2E3NEB3', 'A0A2E9GR49',
       'A0A3M1CW59', 'A0A3M1CXZ6', 'A0A3M2AIN5', 'A0A4Q3L5F5',
       'A0A7C2SBP3', 'A0A7C4PUZ6', 'A0A7V1IMG8', 'A0A7V3SQP7',
       'A0A523J6M5', 'A0A523JB45', 'A0A523KJV0', 'A0A534PT43',
       'A0A534Q2D1', 'A0A534Q5W1', 'A0A661NYP2', 'A0A1Q7M5D7',
       'A0A2H0PCU9', 'A0A2H0P3F2', 'A0A1F9H5K8', 'A0A1F9LHM1',
       'A0A0C2D3N4', 'A0A2S9XMT8', 'A0A2S9YIA0', 'D0LYZ1', 'A0A2E6DHA5',
       'A0A2E6VRS9', 'A0A7C7U4K3', 'A0A7C7Y5J0', 'A0A7Y3DDG0',
       'A0A7Y3GWD6', 'A0A1I2CCL3', 'A6GF99', 'A6GJS9', 'A0A2D5ZV39',
       'A0A2E3XN53', 'A0A1Y5F7X1', 'A0A2D5YQG1', 'A0A2E6H2U3',
       'A0A4V2B398', 'A0A1W9I3K5', 'A0A1U9NLH9', 'A0A0B0EII8',
       'A0A1Q2MET2', 'A0A0S8E952', 'A0A0S8KK53', 'A0A3D3UL95',
       'A0A7C1IS36', 'A0A7C3T5T2', 'A0A7V2N2F1', 'A0A7Y4RAC5',
       'A0A353P6V5', 'A0A2A4P2F3', 'A0A2D5NES6', 'A0A2D6A2E9',
       'A0A2E1VRS0', 'A0A2E1X7U2', 'A0A2E3A8C0', 'A0A2E5X7L9',
       'A0A2E6D4G7', 'A0A2E6NPX7', 'A0A2E7RLJ0', 'A0A2G6JFU2',
       'A0A3M2ACD4', 'A0A4P5XE30', 'A0A7C2AKV4', 'A0A7C3EKQ6',
       'A0A7C5EDQ6', 'A0A7C5Q091', 'A0A7C7YVX2', 'A0A7C8AGR3',
       'A0A7V2XGK7', 'A0A7V7D512', 'A0A7V7MIF8', 'A0A7X8IDD5',
       'A0A353FL24', 'A0A521H8P0', 'A0A521KA39', 'A0A523PUF1',
       'A0A660UTZ6', 'A0A660UZL0', 'A0A660V9Q7', 'A0A660VQL6',
       'A0A800E729', 'A0A800KK32', 'A0A800N249', 'A0A1V6G224',
       'A0A1V5N4A3', 'A0A1V5MYS4', 'A0A1G2X6B3', 'A0A1G2XG06',
       'A0A1G2YF46', 'A0A518D1Z8', 'A0A518EQB7', 'A0A1G2YMK7',
       'A0A1G3BGJ0', 'A0A0S8G1S7', 'A0A0S8J1C7', 'A0A1Z8SJX4',
       'A0A1W6LM19', 'B0SMR4', 'Q04ST8', 'A0A6N4QEX4', 'A0A4R9K957',
       'A0A2M9ZN55', 'M6YVB3', 'A0A2M8HRQ6', 'N1VM61', 'A0A2E9Q575',
       'A0A5C7LNW3', 'A0A1F6RHI5', 'A0A1F6QJS1', 'A0A388TCI8',
       'A0A388TEX5', 'A0A2D6IMW3', 'A0A496UR37', 'A0A661A437',
       'A0A2M7YQL6', 'A0A2G6G5I3', 'A0A1Z9BVY6', 'A0A2H0C4V8',
       'A0A1G2Y0R5', 'A0A357BV16', 'A0A0S7X332', 'A0A519CE04',
       'A0A1Z9I2D8', 'A0A2E7GY55', 'A0A3A5V4U5', 'A0A2A5VNN0',
       'A0A327GVC1', 'A0A2E5YCP5', 'A0A2D6DMN4', 'A0A2E7LR89',
       'A0A519DGH4', 'A0A7J4HKK3', 'A0A2D7BGF3', 'A0A2E5ZBY9',
       'A0A2E4CWL9', 'A0A2D7UYD2', 'A0A2E4WW87', 'A0A2E4JW32',
       'A0A2E8NPL2', 'A0A1V5QNA2', 'A0A829BVW3', 'M6UGI0', 'M6VS38',
       'M6Y516', 'T0FC67', 'A0A8J7RFZ6', 'A0A098MYJ0', 'A0A0C5XCQ1',
       'A0A0E2B405', 'A0A0E2D4E9', 'A0A0F6H834', 'A0A0F6I995',
       'A0A0M4N9I8', 'A0A1B9FHM7', 'A0A1N6WIY6', 'A0A1T1DVW9',
       'A0A2H1XGH2', 'A0A6L6R0V8', 'A0A828Y2Y9', 'A0A829D6S4',
       'A0A867ZGV8', 'A0A868AW84', 'A0A8I0U3R0', 'M3EU49', 'M3H7F9',
       'M3I354', 'M6F8L0', 'M6G861', 'M6HN23', 'M6KRD7', 'M6RIA4',
       'M6X1T2', 'N1UX82', 'Q8F3E8', 'A0A1D7V2G0', 'A0A2M9XTC8',
       'A0A2M9YM31', 'A0A2M9YZ09', 'A0A2N0BC82', 'A0A4R9L0L4',
       'A0A5F1YRQ6', 'A0A5F2BBU8', 'M6CMM3', 'N1W8F4', 'T0FUQ2',
       'A0A0E2BBQ2', 'A0A0E2DTF5', 'A0A0E3B7N1', 'A0A0G8BJ00',
       'A0A0M1YU07', 'A0A3S9KJR8', 'A0A5F2EJC2', 'A0A828Z049', 'K8Y1I2',
       'M3FMC7', 'M3GRV7', 'M6AHX1', 'M6BZX1', 'M6EFV1', 'M6FE73',
       'M6JFW5', 'M6QHV7', 'M6UUJ8', 'M6W4W9', 'M6WQT4', 'N1TYR4',
       'V6HVC6', 'A0A1E3XAB4', 'Q72S85', 'K6J159', 'M5V6J6', 'A0A8J8P565',
       'A0A7X7ET62', 'A0A8J7YBS1', 'A0A8J8GHG9', 'A0A8E8EX11',
       'A0A8J7UMW7', 'A0A8J7YKT9', 'A0A8G1H6G7', 'A0A7K4BZI2',
       'A0A8J8JMP9', 'A0A8J8TR97', 'A0A2E5RZU0', 'A0A2E8PVV0',
       'A0A8J8F1F2', 'A0A8J8DFG9', 'A0A8J8EGP3', 'A0A8J8A031',
       'A0A8J8P961', 'A0A8J8CTP3', 'A0A8J8TBP6', 'A0A524E8H0',
       'A0A6A8ATV4', 'A0A8J8E3T7', 'A0A1G2YIV9', 'A0A5E4JHT2',
       'A0A831P1B3', 'A0A1F9GLG1', 'A0A8J8CVC0', 'A0A8J7YJM1',
       'A0A8J7X5Y6', 'A0A842RMV7', 'A0A842UFQ0', 'A0A382ZDL5',
       'A0A8J8J1P3', 'A0A532TVR1', 'A0A7J4IVH2', 'A0A842WWZ7',
       'A0A2M9ZDC3', 'A0A4R9GML6', 'A0A4V3JCI2', 'S3UQY5', 'S3W750',
       'T0FHF5', 'V6HHX7', 'A0A5C9EMZ4', 'X1EPX6', 'A0A7K4GKH2',
       'A0A2E3XV65', 'A0A7K4HIJ1', 'X0ZCW4', 'A0A2N1UCC2', 'A0A2N0AHZ0',
       'A0A4R9I195', 'A0A4R9II19', 'A0A5F2DK70', 'A0A6H3P2J7',
       'A0A4R8MQL3', 'A0A4R9IY18', 'A0A6D2A794', 'A0A4R9JG39',
       'A0A4Z1AAV8', 'X1RQN5', 'A0A2M9XYU3', 'A0A2N0AUS7', 'A0A2P2DCH1',
       'A0A4R9JU60', 'A0A4R9LJ18', 'A0A4Z1A137', 'A0A5E8HCX5',
       'A0A7I0HSB2', 'N1W469', 'R9A929', 'A0A532UGV6', 'A0A2M6WB37',
       'A0A497Q9G7', 'A0A2D6F216', 'A0A2M9XC52', 'A0A2M9YF63',
       'A0A2M9ZW33', 'A0A2N0BT62', 'A0A2P2CZG5', 'A0A4R9FQ18',
       'A0A4R9FXZ6', 'A0A4R9HBA0', 'A0A4R9J4L6', 'A0A4R9K179',
       'A0A4Z1AFK2', 'A0A5F1ZPV8', 'A0A5F2DZT8', 'I0XMV3', 'M6CYJ4',
       'A0A2H0QFA2', 'A0A0B5HNK5', 'A0A7J4ISK7', 'A0A3C0VFA2',
       'A0A8J8BP93', 'A0A8J8A9S1', 'A0A524C7F5', 'A0A8J8EW64', 'X1C569',
       'A0A2G9N2E6', 'A0A2G9P5J0', 'A0A2H9QEH3', 'A0A8J8A6E3',
       'A0A842R1P2', 'A0A8J8ECR9', 'A0A524EYX9', 'A0A4R9M2J2',
       'A0A534NEC9', 'A0A5C6DKL3', 'A0A497SHD0', 'A0A5C9E226',
       'A0A0F9N7H2', 'A0A7J2QWE2', 'A0A7Y6PXF0', 'A0A1H3IXF0',
       'A0A8J8DXT3', 'A0A286U2I5', 'X0SUQ6', 'A0A0F9JBD6', 'A0A7C1LPF3',
       'A0A7C1KMI7', 'A0A7J4BB95', 'A0A534NQI6', 'A0A0F9SH49',
       'A0A7J2PRZ4', 'A0A0F9KN99', 'A0A1Q7EU55', 'A0A1Q7JG93',
       'A0A1Q7Q3U2', 'A0A1Q7Z2V5', 'A0A534UTD8', 'A0A538QLF7',
       'A0A524FH96', 'A0A7W0WZ13', 'A0A8J8E2X5', 'A0A8J8EVA8',
       'A0A8J8JHE4', 'A0A2E9CFW5', 'A0A2D5XZJ0', 'A0A497Q3V1',
       'A0A842U6F9', 'A0A8I2A1S4', 'A0A7W0VK42', 'A0A524DMA9',
       'A0A8J8P872', 'A0A1G2WP93', 'A0A1G2XKE9', 'A0A1G3B3S3',
       'A0A1G3C3L5', 'A0A7W0Q2Q3', 'A0A842URA9', 'A0A1N7F860',
       'A0A0F9ELD6', 'A0A7J2R5Q5', 'A0A842UHL0', 'A0A850GNR3',
       'A0A0B3AUF0', 'A0A0M0BKB4', 'A0A8J8AAC0', 'A0A523U8B5',
       'A0A850G3Q3', 'A0A651GSQ8', 'A0A8J3AB48', 'A0A7J4I684',
       'A0A2P2E1Q2', 'A0A7J3E3C0', 'A0A8J8AIE8', 'A0A7K4GWT1',
       'A0A7M3MNJ9', 'A0A2W4J0W2', 'A0A1Q9PCV8', 'A0A524FRC2',
       'A0A7W1RLJ1', 'A0A7W0VSJ5', 'X1A3E6', 'A0A0F9G6G4', 'A0A7C1LPI8',
       'A0A7C1UYJ2', 'A0A4V3JWN3', 'A0A7J4IA08', 'A0A1F9MS20',
       'A0A523W175', 'A0A532TFW0', 'A0A524E2Z5', 'A0A842RXA6',
       'A0A524G9Q8', 'A0A524CUC3', 'A0A7K4GXS5', 'A0A0C2CN73',
       'A0A8J7YEH9', 'A0A842QEA7', 'A0A7J2RB97', 'A0A0F9KZN1',
       'A0A534NQU5', 'A0A522Z6R6', 'A0A1Q2HPZ7', 'A0A539EFE7',
       'A0A0F8VEB5', 'A0A0F9PQ32', 'A0A7J2PRK2', 'A0A7C5J2E6',
       'A0A832UPC6', 'A0A832V5Z9', 'A0A524M6I4', 'A0A523REB4',
       'A0A534VYK3', 'A0A2D6WX84', 'A0A7K1I956', 'A0A524FFR6',
       'A0A1Q9NIW5', 'A0A2D6NEM9', 'A0A0B3AIX9', 'A0A842NSY6',
       'A0A0K1PEE4', 'A0A1Q9N5X0', 'A0A7C3WHW2', 'A0A842T4M6',
       'A0A7C3M469', 'A0A534PZD2', 'A0A3L6JLC1', 'A0A523VWN5',
       'A0A1Q7M6V4', 'A0A534UNA7', 'M0NGC3', 'A0A523S0R4', 'A0A2P6W1N7',
       'A0A2P6W4W1', 'A0A2P6W9G8', 'A0A2P6WB43', 'A0A849QGL7',
       'A0A1Q9NEZ9', 'M0MY02', 'A0A524LK47', 'M0NET3', 'A0A1S6H1R8',
       'A0A1Q9P2N0', 'A0A353CX55', 'A0A8J7TJK6', 'A0A356KCT9',
       'A0A7L4RC86', 'M0MUD7', 'A0A832UX73', 'A0A365T7K8', 'A0A7C7YQN0',
       'A0A523Z8N1', 'A0A7C4GHG0', 'A0A382C7D0', 'A0A151EEH2',
       'A0A842XSH6', 'A0A524G416', 'A0A842SQS3', 'A0A2D6NWY2',
       'A0A432IE34', 'A0A432H5E1', 'A0A7C3R9M7', 'A0A7K4CMB7',
       'A0A8J6XZ43', 'A0A524J7R6', 'A0A7C3ZG32', 'A0A660ZXP1',
       'A0A2H0RYN2', 'A0A8J7U6J6', 'A0A365T5Q6', 'A0A7J4V546', 'H2CFS2',
       'A0A7C1TCQ0', 'A0A833H2J8', 'A0A7C7H6C6', 'A0A842SE10',
       'A0A0F9UZQ9', 'A0A534QV65', 'A0A7J2ZLQ5', 'A0A8J7QKA2',
       'A0A523VQR1', 'A0A2A5A6T5', 'A0A842MTQ9', 'A0A7C4F236',
       'A0A842YQ84', 'A0A1J4UU12', 'A0A2G9NZ39', 'A0A3M2F8B9',
       'A0A7C5AAS0', 'A0A5E4I5J6', 'A0A2D7PHW0', 'A0A2G9PKP6',
       'A0A2H9Q9I2', 'A0A2H9SB56', 'A0A1F9YZ31', 'A0A5E4JTX3',
       'A0A522E257', 'A0A1F9ZV37', 'A0A2E5YTP9', 'A0A2M6Z2P5',
       'A0A358V0T9', 'A0A7C7ZK60', 'A0A554IS48', 'A0A2D7DKM0',
       'A0A424KA57', 'A0A7C3U3I2', 'D2EFI3', 'A0A2D5WBB2', 'A0A2D6Q7B9',
       'A0A5E4LTC7', 'A0A7C4JA02', 'A0A5E4LPW5', 'A0A2D5B233',
       'A0A7J3XE31', 'A0A7C7YJF2', 'A0A523DGR1', 'A0A5E4JFQ7',
       'A0A0F9RHQ8', 'A0A518BNL0', 'A0A523IPI7', 'A0A849PKT5',
       'A0A2H9L751', 'A0A2H9LXD1', 'A0A2D6HHA6', 'A0A6C0BZR0',
       'A0A1S6H1G1', 'A0A1J4VY00', 'A0A2G9NV83', 'A0A2D5XMR4',
       'A0A0F9NID7']

In [77]:
len(accessions_uniprot)

1395

In [78]:
mapper = ProtMapper()

accessions, failed = mapper.get(
    ids=accessions_uniprot, from_db="UniProtKB_AC-ID", to_db="EMBL-GenBank-DDBJ_CDS"
)
failed

Retrying in 3s
Retrying in 3s
Retrying in 3s
Retrying in 3s
Retrying in 3s
Retrying in 3s
Fetched: 500 / 568
Retrying in 3s
Retrying in 3s
Retrying in 3s
Retrying in 3s
Retrying in 3s
Retrying in 3s
Fetched: 500 / 567
Retrying in 3s
Retrying in 3s
Retrying in 3s
Retrying in 3s
Retrying in 3s
Retrying in 3s
Fetched: 393 / 422


['A0A256KDR0',
 'A0A256KI76',
 'A0A2R6MN12',
 'A0A662FLX2',
 'A0A662QNS9',
 'A0A6G8T455',
 'A0A6N0NSD0',
 'A0A7J3UHZ5',
 'A0A842K380',
 'G2MHG6',
 'G2MPE8',
 'M0GUV1',
 'V6DR83',
 'V6DVC9',
 'V6DY05',
 'W0JZB9',
 'W0K3X0',
 'W0K8H7',
 'A0A497TFM2',
 'A0A2E5QFX1',
 'A0A2E1RDX6',
 'A0A523CMH0',
 'A0A2D6E7R0',
 'A0A1F9LHM1',
 'A0A2E6DHA5',
 'A0A2D5NES6',
 'A0A4P5XE30',
 'A0A800E729',
 'A0A1V6G224',
 'A0A1Z9I2D8',
 'A0A2D7BGF3',
 'A0A829BVW3',
 'A0A098MYJ0',
 'A0A0C5XCQ1',
 'A0A1N6WIY6',
 'A0A2H1XGH2',
 'A0A6L6R0V8',
 'A0A867ZGV8',
 'A0A868AW84',
 'A0A8I0U3R0',
 'M6X1T2',
 'A0A3S9KJR8',
 'A0A5F2EJC2',
 'S3UQY5',
 'A0A5F2DK70',
 'A0A6D2A794',
 'A0A4R9LJ18',
 'A0A2N0BT62',
 'A0A5F2DZT8',
 'I0XMV3',
 'A0A8I2A1S4',
 'A0A0M0BKB4',
 'A0A7W1RLJ1',
 'A0A2D6WX84',
 'A0A7K1I956',
 'A0A1Q9N5X0',
 'A0A7C1TCQ0',
 'A0A842YQ84',
 'A0A2D7DKM0',
 'A0A424KA57']

In [79]:
len(accessions)

1497

accessions - GenBank accessions (для некоторых белков из UniProt может быть найдено несколько GenBank записей, мы будем брать все)

failed - идентификаторы UniProt, которые не были обнаружены. Однако, для них есть запись в UniParc. Брать их или нет??

In [80]:
data_sequence = []
for acc in accessions['To']:
    # print('------------------------------------------------')
    print(acc)
    with Entrez.efetch(
        db="protein", id=acc, rettype="gb", retmode="text"
    ) as handle:
        record = SeqIO.read(handle, "genbank")
    #     print(record)
    # print(record.seq)
    taxonomy_data = get_taxonomy_data(record)
    data_sequence.append({
        "accession": acc,
        "variant": "FtF",
        "gi": None,
        "ncbi_gene_id": None,
        "hgnc_gene_name": None,
        "taxonomy_id": None,
        "organism": None,
        "phylum": None,
        "class": None,
        "taxonomy_group": None,
        "info": None,
        "sequence": str(record.seq),
        "variant_under_consideration": None,
    })
    data_sequence[-1].update(taxonomy_data)
    # for k, v in data_sequence[-1].items():
    #     print(k, v, type(v))

KDE60276.1
Fetched taxid from NCBI 1495067
KDS90282.1
Fetched taxid from NCBI 2248
AIU70766.1
Fetched taxid from NCBI 1505907
KHO45689.1
Fetched taxid from NCBI 1579366
KHO49565.1
Fetched taxid from NCBI 1579371
KHO49746.1
Fetched taxid from NCBI 1579368
KHO53464.1
Fetched taxid from NCBI 1579372
KHO54282.1
Fetched taxid from NCBI 1579376
AJF25556.1
Fetched taxid from NCBI 1592728
AJF62573.1
Fetched taxid from NCBI 1579378
AJF62009.1
Fetched taxid from NCBI 1579373
CQR49364.1
Fetched taxid from NCBI 1476858
AKH98539.1
Fetched taxid from NCBI 1604004
ALG82981.1
Fetched taxid from NCBI 1604004
AKU06373.1
Fetched taxid from NCBI 35746
QOS10347.1
Fetched taxid from NCBI 35746
KOX94947.1
Fetched taxid from NCBI 1705562
NLV05077.1
Fetched taxid from NCBI 1705562
KOX97669.1
Fetched taxid from NCBI 1765655
KOX95396.1
Fetched taxid from NCBI 1765655
KPN29935.1
Fetched taxid from NCBI 699431
SFP36003.1
Fetched taxid from NCBI 699431
ASJ13525.1
Fetched taxid from NCBI 277988
KQH82541.1
Fetched ta

In [85]:
failed_toadd = []
for ds in data_sequence:
    try:
        cursor.execute(add_sequence, ds)
    except Exception as e:
        print(e)
        print(ds['accession'])
        failed_toadd.append(ds['accession'])

1062 (23000): Duplicate entry 'KDE60276.1' for key 'PRIMARY'
KDE60276.1
1062 (23000): Duplicate entry 'KDS90282.1' for key 'PRIMARY'
KDS90282.1
1062 (23000): Duplicate entry 'AIU70766.1' for key 'PRIMARY'
AIU70766.1
1062 (23000): Duplicate entry 'KHO45689.1' for key 'PRIMARY'
KHO45689.1
1062 (23000): Duplicate entry 'KHO49565.1' for key 'PRIMARY'
KHO49565.1
1062 (23000): Duplicate entry 'KHO49746.1' for key 'PRIMARY'
KHO49746.1
1062 (23000): Duplicate entry 'KHO53464.1' for key 'PRIMARY'
KHO53464.1
1062 (23000): Duplicate entry 'KHO54282.1' for key 'PRIMARY'
KHO54282.1
1062 (23000): Duplicate entry 'AJF25556.1' for key 'PRIMARY'
AJF25556.1
1062 (23000): Duplicate entry 'AJF62573.1' for key 'PRIMARY'
AJF62573.1
1062 (23000): Duplicate entry 'AJF62009.1' for key 'PRIMARY'
AJF62009.1
1062 (23000): Duplicate entry 'CQR49364.1' for key 'PRIMARY'
CQR49364.1
1062 (23000): Duplicate entry 'AKH98539.1' for key 'PRIMARY'
AKH98539.1
1062 (23000): Duplicate entry 'ALG82981.1' for key 'PRIMARY'
ALG

In [91]:
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df['accession'].isin(accessions['To'])]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
9,AAF03226.1,FtF,,,,2261.0,Pyrococcus furiosus,Methanobacteriota,Thermococci,,,MEVKEMVELLVKSKVKEFVSSIDKDMRVSPEFYEALEAEVKALIEK...,,,
10,AAG20389.1,FtF,,,,64091.0,Halobacterium salinarum NRC-1,Methanobacteriota,Halobacteria,,,MADLIVKAAVKEALDDKNVASDFYEALDEEVDDLLADAAERAEANG...,,,
11,AAG20741.1,FtF,,,,64091.0,Halobacterium salinarum NRC-1,Methanobacteriota,Halobacteria,,,MAESRSENLRTFMWWAGETCDGMSDLIVKAAVKDELSEHNVSADFY...,,,
12,AAG21067.1,FtF,,,,64091.0,Halobacterium salinarum NRC-1,Methanobacteriota,Halobacteria,,,MAESRSENLRTFMWWAGETCDGMSDLIVKAAVKDELSEHNVSADFY...,,,
14,AAL81408.1,FtF,,,,186497.0,Pyrococcus furiosus DSM 3638,Methanobacteriota,Thermococci,,,MVELLVKSKVKEFVSSIDKDMRVSPEFYEALEAEVKALIEKAVKRA...,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2440,VVB82006.1,FtF,,,,115547.0,uncultured archaeon,,,,,MGIIIKSNIRPLVKELDKDNAVSSVADEVEMALERKVENILSDAIK...,,,
2441,VVB83325.1,FtF,,,,115547.0,uncultured archaeon,,,,,MSLIVKSNIRKVVKELDKENEISSVADEVGTTLERKVEDLLINAIE...,,,
2442,VVB83693.1,FtF,,,,115547.0,uncultured archaeon,,,,,MGIIIKSQIRPLIKELDKENAISSVADEVATALDKKVEDILVNAIE...,,,
2444,VVC03116.1,FtF,,,,2885759.0,Candidatus Bilamarchaeum dharawalense,Candidatus Micrarchaeota,Candidatus Micrarchaeia,,,MKSNLIVSKNRVRIFLQQNGKRTSLDFYDALDAEVRALLKKAAKRA...,,,


### Add publications

In [92]:
pid = "schwab_histones_2024"
query = f"SELECT * FROM publication WHERE id='{pid}'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,schwab_histones_2024,Histones and histone variant families in proka...,10.1038/s41467-024-52337-y,,2024


In [94]:
failed_toadd_publication = []
for ds in data_sequence:
    try:
        cursor.execute(add_sequence_has_publication, (ds['accession'], pid))
    except Exception as e:
        print(e)
        print(ds['accession'])
        failed_toadd.append(ds['accession'])

1062 (23000): Duplicate entry 'KDE60276.1-schwab_histones_2024' for key 'PRIMARY'
KDE60276.1
1062 (23000): Duplicate entry 'KDS90282.1-schwab_histones_2024' for key 'PRIMARY'
KDS90282.1
1062 (23000): Duplicate entry 'AIU70766.1-schwab_histones_2024' for key 'PRIMARY'
AIU70766.1
1062 (23000): Duplicate entry 'KHO45689.1-schwab_histones_2024' for key 'PRIMARY'
KHO45689.1
1062 (23000): Duplicate entry 'KHO49565.1-schwab_histones_2024' for key 'PRIMARY'
KHO49565.1
1062 (23000): Duplicate entry 'KHO49746.1-schwab_histones_2024' for key 'PRIMARY'
KHO49746.1
1062 (23000): Duplicate entry 'KHO53464.1-schwab_histones_2024' for key 'PRIMARY'
KHO53464.1
1062 (23000): Duplicate entry 'KHO54282.1-schwab_histones_2024' for key 'PRIMARY'
KHO54282.1
1062 (23000): Duplicate entry 'AJF25556.1-schwab_histones_2024' for key 'PRIMARY'
AJF25556.1
1062 (23000): Duplicate entry 'AJF62573.1-schwab_histones_2024' for key 'PRIMARY'
AJF62573.1
1062 (23000): Duplicate entry 'AJF62009.1-schwab_histones_2024' for ke

In [95]:
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df['accession'].isin(accessions['To'])]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
9,AAF03226.1,FtF,,,,2261.0,Pyrococcus furiosus,Methanobacteriota,Thermococci,,,MEVKEMVELLVKSKVKEFVSSIDKDMRVSPEFYEALEAEVKALIEK...,,AAF03226.1,schwab_histones_2024
10,AAG20389.1,FtF,,,,64091.0,Halobacterium salinarum NRC-1,Methanobacteriota,Halobacteria,,,MADLIVKAAVKEALDDKNVASDFYEALDEEVDDLLADAAERAEANG...,,AAG20389.1,schwab_histones_2024
11,AAG20741.1,FtF,,,,64091.0,Halobacterium salinarum NRC-1,Methanobacteriota,Halobacteria,,,MAESRSENLRTFMWWAGETCDGMSDLIVKAAVKDELSEHNVSADFY...,,AAG20741.1,schwab_histones_2024
12,AAG21067.1,FtF,,,,64091.0,Halobacterium salinarum NRC-1,Methanobacteriota,Halobacteria,,,MAESRSENLRTFMWWAGETCDGMSDLIVKAAVKDELSEHNVSADFY...,,AAG21067.1,schwab_histones_2024
14,AAL81408.1,FtF,,,,186497.0,Pyrococcus furiosus DSM 3638,Methanobacteriota,Thermococci,,,MVELLVKSKVKEFVSSIDKDMRVSPEFYEALEAEVKALIEKAVKRA...,,AAL81408.1,schwab_histones_2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2440,VVB82006.1,FtF,,,,115547.0,uncultured archaeon,,,,,MGIIIKSNIRPLVKELDKDNAVSSVADEVEMALERKVENILSDAIK...,,VVB82006.1,schwab_histones_2024
2441,VVB83325.1,FtF,,,,115547.0,uncultured archaeon,,,,,MSLIVKSNIRKVVKELDKENEISSVADEVGTTLERKVEDLLINAIE...,,VVB83325.1,schwab_histones_2024
2442,VVB83693.1,FtF,,,,115547.0,uncultured archaeon,,,,,MGIIIKSQIRPLIKELDKENAISSVADEVATALDKKVEDILVNAIE...,,VVB83693.1,schwab_histones_2024
2444,VVC03116.1,FtF,,,,2885759.0,Candidatus Bilamarchaeum dharawalense,Candidatus Micrarchaeota,Candidatus Micrarchaeia,,,MKSNLIVSKNRVRIFLQQNGKRTSLDFYDALDAEVRALLKKAAKRA...,,VVC03116.1,schwab_histones_2024


In [96]:
# Make sure data is committed to the database
conn.commit()

## Adding Methanococcales histones

In [97]:
accessions_uniprot = ['A0A076LDI7', 'A0A2L1CD04', 'A0A2Z5PDL5', 'A0A2Z5PL52',
       'A0A7J9PFZ2', 'A0A7J9S218', 'A0A832SV22', 'A4FXG4', 'A6UP30',
       'A6UUU1', 'A6VFV1', 'A9AAT3', 'C7P6C0', 'C9REV3', 'D3S432',
       'D5VTW4', 'D7DTQ7', 'F6BAU5', 'F8AK59', 'G0H0G5', 'H1KZH1',
       'N6VQ57', 'Q03576', 'Q59041', 'Q6LYH6', 'A0A8D6SUC3', 'A0A8D6T0P6']

In [98]:
mapper = ProtMapper()

accessions, failed = mapper.get(
    ids=accessions_uniprot, from_db="UniProtKB_AC-ID", to_db="EMBL-GenBank-DDBJ_CDS"
)
failed

Retrying in 3s
Fetched: 39 / 40


['A0A8D6SUC3']

accessions - GenBank accessions (для некоторых белков из UniProt может быть найдено несколько GenBank записей, мы будем брать все)

failed - идентификаторы UniProt, которые не были обнаружены. Однако, для них есть запись в UniParc. Брать их или нет??

In [99]:
data_sequence = []
for acc in accessions['To']:
    # print('------------------------------------------------')
    print(acc)
    with Entrez.efetch(
        db="protein", id=acc, rettype="gb", retmode="text"
    ) as handle:
        record = SeqIO.read(handle, "genbank")
    #     print(record)
    # print(record.seq)
    taxonomy_data = get_taxonomy_data(record)
    data_sequence.append({
        "accession": acc,
        "variant": "Bridge_(Methanococcales)",
        "gi": None,
        "ncbi_gene_id": None,
        "hgnc_gene_name": None,
        "taxonomy_id": None,
        "organism": None,
        "phylum": None,
        "class": None,
        "taxonomy_group": None,
        "info": None,
        "sequence": str(record.seq),
        "variant_under_consideration": None,
    })
    data_sequence[-1].update(taxonomy_data)
    # for k, v in data_sequence[-1].items():
    #     print(k, v, type(v))

AIJ06296.1
Fetched taxid from NCBI 1301915
AVB77209.1
Fetched taxid from NCBI 39152
MBA2846904.1
Fetched taxid from NCBI 39152
MBA2850582.1
Fetched taxid from NCBI 39152
MBA2858017.1
Fetched taxid from NCBI 39152
MBA2863719.1
Fetched taxid from NCBI 39152
MBB6066762.1
Fetched taxid from NCBI 39152
MBB6496275.1
Fetched taxid from NCBI 39152
MBG0768609.1
Fetched taxid from NCBI 39152
MBM7409049.1
Fetched taxid from NCBI 39152
MBP2218765.1
Fetched taxid from NCBI 39152
BAP61292.1
Fetched taxid from NCBI 637914
BAP63195.1
Fetched taxid from NCBI 637915
MBA2862031.1
Fetched taxid from NCBI 39152
MBA2840206.1
Fetched taxid from NCBI 39152
MBA2852813.1
Fetched taxid from NCBI 39152
MBA2859918.1
Fetched taxid from NCBI 39152
MBA2868559.1
Fetched taxid from NCBI 39152
MBB6400837.1
Fetched taxid from NCBI 39152
HII59409.1
Fetched taxid from NCBI 2190
ABO34893.1
Fetched taxid from NCBI 402880
ABR54252.1
Fetched taxid from NCBI 406327
ABR56263.1
Fetched taxid from NCBI 419665
ABR65327.1
Fetched ta

In [100]:
failed_toadd = []
for ds in data_sequence:
    try:
        cursor.execute(add_sequence, ds)
    except Exception as e:
        print(e)
        print(ds['accession'])
        failed_toadd.append(ds['accession'])

In [101]:
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df['accession'].isin(accessions['To'])]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
3,AAB99668.1,Bridge_(Methanococcales),,,,243232.0,Methanocaldococcus jannaschii DSM 2661,Methanobacteriota,Methanococci,,,MLPKATVKRIMKQHTDFNISAEAVDELCNMLEEIIKITTEVAEQNA...,,,
26,ABO34893.1,Bridge_(Methanococcales),,,,402880.0,Methanococcus maripaludis C5,Methanobacteriota,Methanococci,,,MIPKGTVKRIMKENTEMNVSAESVAALVEILQEMVVTTTKIAEENA...,,,
28,ABR54252.1,Bridge_(Methanococcales),,,,406327.0,Methanococcus vannielii SB,Methanobacteriota,Methanococci,,,MIPKGTVKRIMKQHTEMNVSAESVEKLVELLQEIIVTTTQIAEQNA...,,,
29,ABR56263.1,Bridge_(Methanococcales),,,,419665.0,Methanococcus aeolicus Nankai-3,Methanobacteriota,Methanococci,,,MIPKGTVKRIMKQNTDMNVSAESVVKIVEILQEYIVTTTRLAEENA...,,,
30,ABR65327.1,Bridge_(Methanococcales),,,,426368.0,Methanococcus maripaludis C7,Methanobacteriota,Methanococci,,,MIPKGTIKRIMKENTDMNVSAESVAALVEILQEMVVTTTKIAEENA...,,,
31,ABX02456.1,Bridge_(Methanococcales),,,,444158.0,Methanococcus maripaludis C6,Methanobacteriota,Methanococci,,,MIPKGTVKRIMKENTDMNVSAESVAALVEILQEMVVTTTKIAEENA...,,,
46,ACV24102.1,Bridge_(Methanococcales),,,,573064.0,Methanocaldococcus fervens AG86,Methanobacteriota,Methanococci,,,MLPKATVKRLMKQYTDFNISAEAVDELCNMLEEIIKITTEVAEQNA...,,,
48,ACX72105.1,Bridge_(Methanococcales),,,,579137.0,Methanocaldococcus vulcanius M7,Methanobacteriota,Methanococci,,,MLPKATIKRIMKEHTDFNISSEAVDELCNMLEEIIKITTEVAEQNA...,,,
53,ADC69596.1,Bridge_(Methanococcales),,,,644281.0,Methanocaldococcus sp. FS406-22,Methanobacteriota,Methanococci,,,MLPKATVKRIMKQYTNFNISAEAVDELCNMLEEIIKITTEVAEQNA...,,,
57,ADG14017.1,Bridge_(Methanococcales),,,,573063.0,Methanocaldococcus infernus ME,Methanobacteriota,Methanococci,,,MLPKTTIKRVMKNYTDLNISSEAVDELINLLEEMIKVTTEVAEKNA...,,,


In [102]:
df[df['accession'].isin(accessions['To'])].shape

(39, 15)

### Add publications

In [103]:
pid = "schwab_histones_2024"
query = f"SELECT * FROM publication WHERE id='{pid}'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,schwab_histones_2024,Histones and histone variant families in proka...,10.1038/s41467-024-52337-y,,2024


In [104]:
failed_toadd_publication = []
for ds in data_sequence:
    try:
        cursor.execute(add_sequence_has_publication, (ds['accession'], pid))
    except Exception as e:
        print(e)
        print(ds['accession'])
        failed_toadd.append(ds['accession'])

In [105]:
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df['accession'].isin(accessions['To'])]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
3,AAB99668.1,Bridge_(Methanococcales),,,,243232.0,Methanocaldococcus jannaschii DSM 2661,Methanobacteriota,Methanococci,,,MLPKATVKRIMKQHTDFNISAEAVDELCNMLEEIIKITTEVAEQNA...,,AAB99668.1,schwab_histones_2024
26,ABO34893.1,Bridge_(Methanococcales),,,,402880.0,Methanococcus maripaludis C5,Methanobacteriota,Methanococci,,,MIPKGTVKRIMKENTEMNVSAESVAALVEILQEMVVTTTKIAEENA...,,ABO34893.1,schwab_histones_2024
28,ABR54252.1,Bridge_(Methanococcales),,,,406327.0,Methanococcus vannielii SB,Methanobacteriota,Methanococci,,,MIPKGTVKRIMKQHTEMNVSAESVEKLVELLQEIIVTTTQIAEQNA...,,ABR54252.1,schwab_histones_2024
29,ABR56263.1,Bridge_(Methanococcales),,,,419665.0,Methanococcus aeolicus Nankai-3,Methanobacteriota,Methanococci,,,MIPKGTVKRIMKQNTDMNVSAESVVKIVEILQEYIVTTTRLAEENA...,,ABR56263.1,schwab_histones_2024
30,ABR65327.1,Bridge_(Methanococcales),,,,426368.0,Methanococcus maripaludis C7,Methanobacteriota,Methanococci,,,MIPKGTIKRIMKENTDMNVSAESVAALVEILQEMVVTTTKIAEENA...,,ABR65327.1,schwab_histones_2024
31,ABX02456.1,Bridge_(Methanococcales),,,,444158.0,Methanococcus maripaludis C6,Methanobacteriota,Methanococci,,,MIPKGTVKRIMKENTDMNVSAESVAALVEILQEMVVTTTKIAEENA...,,ABX02456.1,schwab_histones_2024
46,ACV24102.1,Bridge_(Methanococcales),,,,573064.0,Methanocaldococcus fervens AG86,Methanobacteriota,Methanococci,,,MLPKATVKRLMKQYTDFNISAEAVDELCNMLEEIIKITTEVAEQNA...,,ACV24102.1,schwab_histones_2024
48,ACX72105.1,Bridge_(Methanococcales),,,,579137.0,Methanocaldococcus vulcanius M7,Methanobacteriota,Methanococci,,,MLPKATIKRIMKEHTDFNISSEAVDELCNMLEEIIKITTEVAEQNA...,,ACX72105.1,schwab_histones_2024
53,ADC69596.1,Bridge_(Methanococcales),,,,644281.0,Methanocaldococcus sp. FS406-22,Methanobacteriota,Methanococci,,,MLPKATVKRIMKQYTNFNISAEAVDELCNMLEEIIKITTEVAEQNA...,,ADC69596.1,schwab_histones_2024
57,ADG14017.1,Bridge_(Methanococcales),,,,573063.0,Methanocaldococcus infernus ME,Methanobacteriota,Methanococci,,,MLPKTTIKRVMKNYTDLNISSEAVDELINLLEEMIKVTTEVAEKNA...,,ADG14017.1,schwab_histones_2024


### Update publication for MJ1647  (UniProt: Q59041, GeneBank: AAB99668.1)

In [106]:
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df['accession']=='AAB99668.1']

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
3,AAB99668.1,Bridge_(Methanococcales),,,,243232.0,Methanocaldococcus jannaschii DSM 2661,Methanobacteriota,Methanococci,,,MLPKATVKRIMKQHTDFNISAEAVDELCNMLEEIIKITTEVAEQNA...,,AAB99668.1,schwab_histones_2024


In [107]:
pid = "Ofer_dna-bridging_2023"
query = f"SELECT * FROM publication WHERE id='{pid}'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year


In [108]:
data_publication = {
    "id": pid,
    "title": 'DNA-bridging by an archaeal histone variant via a unique tetramerisation interface',
    "doi": '10.1038/s42003-023-05348-2',
    "author": None,
    "year": '2023',
}
cursor.execute(add_publication, data_publication)

In [109]:
query = f"SELECT * FROM publication WHERE id='{pid}'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,Ofer_dna-bridging_2023,DNA-bridging by an archaeal histone variant vi...,10.1038/s42003-023-05348-2,,2023


In [110]:
cursor.execute(add_sequence_has_publication, ('AAB99668.1', pid))

In [111]:
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df['accession']=='AAB99668.1']

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
3,AAB99668.1,Bridge_(Methanococcales),,,,243232.0,Methanocaldococcus jannaschii DSM 2661,Methanobacteriota,Methanococci,,,MLPKATVKRIMKQHTDFNISAEAVDELCNMLEEIIKITTEVAEQNA...,,AAB99668.1,Ofer_dna-bridging_2023
4,AAB99668.1,Bridge_(Methanococcales),,,,243232.0,Methanocaldococcus jannaschii DSM 2661,Methanobacteriota,Methanococci,,,MLPKATVKRIMKQHTDFNISAEAVDELCNMLEEIIKITTEVAEQNA...,,AAB99668.1,schwab_histones_2024


In [113]:
# Make sure data is committed to the database
conn.commit()

## Adding Phage histones

In [114]:
accessions_uniprot = ['A0A2D6AQ05', 'A0A2D7X1W6', 'A0A516M1G1', 'A0A516L8B6',
       'A0A516LIY4', 'A0A516MJ05', 'A0A516L3F8']

In [115]:
mapper = ProtMapper()

accessions, failed = mapper.get(
    ids=accessions_uniprot, from_db="UniProtKB_AC-ID", to_db="EMBL-GenBank-DDBJ_CDS"
)
failed

Retrying in 3s
Fetched: 7 / 7


[]

accessions - GenBank accessions (для некоторых белков из UniProt может быть найдено несколько GenBank записей, мы будем брать все)

failed - идентификаторы UniProt, которые не были обнаружены. Однако, для них есть запись в UniParc. Брать их или нет??

In [116]:
data_sequence = []
for acc in accessions['To']:
    # print('------------------------------------------------')
    print(acc)
    with Entrez.efetch(
        db="protein", id=acc, rettype="gb", retmode="text"
    ) as handle:
        record = SeqIO.read(handle, "genbank")
    #     print(record)
    # print(record.seq)
    taxonomy_data = get_taxonomy_data(record)
    data_sequence.append({
        "accession": acc,
        "variant": "Phage-histones",
        "gi": None,
        "ncbi_gene_id": None,
        "hgnc_gene_name": None,
        "taxonomy_id": None,
        "organism": None,
        "phylum": None,
        "class": None,
        "taxonomy_group": None,
        "info": None,
        "sequence": str(record.seq),
        "variant_under_consideration": None,
    })
    data_sequence[-1].update(taxonomy_data)
    # for k, v in data_sequence[-1].items():
    #     print(k, v, type(v))

MAE83510.1
Fetched taxid from NCBI 2026740
MAK51185.1
Fetched taxid from NCBI 50741
QDP60017.1
Fetched taxid from NCBI 2591644
QDP50189.1
Fetched taxid from NCBI 2591644
QDP53893.1
Fetched taxid from NCBI 2591644
QDP66192.1
Fetched taxid from NCBI 2591644
QDP48474.1
Fetched taxid from NCBI 2591644


In [117]:
failed_toadd = []
for ds in data_sequence:
    try:
        cursor.execute(add_sequence, ds)
    except Exception as e:
        print(e)
        print(ds['accession'])
        failed_toadd.append(ds['accession'])

In [118]:
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df['accession'].isin(accessions['To'])]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
854,MAE83510.1,Phage-histones,,,,2026740.0,Flammeovirgaceae bacterium,Bacteroidota,Cytophagia,,,MTLIQKSKVKRLLNDLGVRVNPDAFDGINRIVESALTQLAGKVRED...,,,
892,MAK51185.1,Phage-histones,,,,50741.0,Marinobacter sp.,Pseudomonadota,Gammaproteobacteria,,,MEYIQKSKVKKFINSKGYRLRPDALDGINRTVEDVITSMLNNVEQD...,,,
1994,QDP48474.1,Phage-histones,,,,2591644.0,Prokaryotic dsDNA virus sp. Viruses.,,,,,MLLKKSEIKKLIKNEGYRISPESYEGINRAVESTIKQMLVQVANDN...,,,
1995,QDP50189.1,Phage-histones,,,,2591644.0,Prokaryotic dsDNA virus sp. Viruses.,,,,,MAYVQKSSIKKLVKDKGFRISPSSYDGINRSVESIILQMLEKVDQD...,,,
1996,QDP53893.1,Phage-histones,,,,2591644.0,Prokaryotic dsDNA virus sp. Viruses.,,,,,MESLMNYIQKSKIKSLVKDQGFRLSPNAIDGINRSVENLIKQMLNN...,,,
1997,QDP60017.1,Phage-histones,,,,2591644.0,Prokaryotic dsDNA virus sp. Viruses.,,,,,MMIQKSKVKKLINSKGLSVSSKSYESIDRVVTEVIEALCENTTEDG...,,,
1998,QDP66192.1,Phage-histones,,,,2591644.0,Prokaryotic dsDNA virus sp. Viruses.,,,,,MERKKLMSYIQKTKIKALVRDQGYRISPESFDGINRAVEGLIKGML...,,,


### Add publications

In [119]:
pid = "schwab_histones_2024"
query = f"SELECT * FROM publication WHERE id='{pid}'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,schwab_histones_2024,Histones and histone variant families in proka...,10.1038/s41467-024-52337-y,,2024


In [120]:
failed_toadd_publication = []
for ds in data_sequence:
    try:
        cursor.execute(add_sequence_has_publication, (ds['accession'], pid))
    except Exception as e:
        print(e)
        print(ds['accession'])
        failed_toadd.append(ds['accession'])

In [121]:
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df['accession'].isin(accessions['To'])]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
854,MAE83510.1,Phage-histones,,,,2026740.0,Flammeovirgaceae bacterium,Bacteroidota,Cytophagia,,,MTLIQKSKVKRLLNDLGVRVNPDAFDGINRIVESALTQLAGKVRED...,,MAE83510.1,schwab_histones_2024
892,MAK51185.1,Phage-histones,,,,50741.0,Marinobacter sp.,Pseudomonadota,Gammaproteobacteria,,,MEYIQKSKVKKFINSKGYRLRPDALDGINRTVEDVITSMLNNVEQD...,,MAK51185.1,schwab_histones_2024
1994,QDP48474.1,Phage-histones,,,,2591644.0,Prokaryotic dsDNA virus sp. Viruses.,,,,,MLLKKSEIKKLIKNEGYRISPESYEGINRAVESTIKQMLVQVANDN...,,QDP48474.1,schwab_histones_2024
1995,QDP50189.1,Phage-histones,,,,2591644.0,Prokaryotic dsDNA virus sp. Viruses.,,,,,MAYVQKSSIKKLVKDKGFRISPSSYDGINRSVESIILQMLEKVDQD...,,QDP50189.1,schwab_histones_2024
1996,QDP53893.1,Phage-histones,,,,2591644.0,Prokaryotic dsDNA virus sp. Viruses.,,,,,MESLMNYIQKSKIKSLVKDQGFRLSPNAIDGINRSVENLIKQMLNN...,,QDP53893.1,schwab_histones_2024
1997,QDP60017.1,Phage-histones,,,,2591644.0,Prokaryotic dsDNA virus sp. Viruses.,,,,,MMIQKSKVKKLINSKGLSVSSKSYESIDRVVTEVIEALCENTTEDG...,,QDP60017.1,schwab_histones_2024
1998,QDP66192.1,Phage-histones,,,,2591644.0,Prokaryotic dsDNA virus sp. Viruses.,,,,,MERKKLMSYIQKTKIKALVRDQGYRISPESFDGINRAVEGLIKGML...,,QDP66192.1,schwab_histones_2024


In [123]:
# Make sure data is committed to the database
conn.commit()

## Adding RdgC histones

In [124]:
accessions_uniprot = ['A0A1G7GE93', 'A0A1H4ACZ2', 'A0A1H9PRW3', 'A0A1N7H292',
       'A0A238XD76', 'A0A2R6LYY8', 'A0A384LIV1', 'A0A7D5GK18', 'D4GVY1',
       'A0A5J5LMT9', 'A0A151UT86', 'J8G8T7', 'A0A1Y5YTP1', 'A0A6I6Y755',
       'A0A854YNM5', 'A0A1G5ASE0', 'Q74P81', 'A0A7D6VNA5', 'A0A410X3G3',
       'A0A167ZPM0', 'A0A0F5R1Y5']

In [125]:
mapper = ProtMapper()

accessions, failed = mapper.get(
    ids=accessions_uniprot, from_db="UniProtKB_AC-ID", to_db="EMBL-GenBank-DDBJ_CDS"
)
failed

Retrying in 3s
Fetched: 17 / 22


['A0A151UT86', 'J8G8T7', 'A0A6I6Y755', 'A0A854YNM5', 'A0A167ZPM0']

accessions - GenBank accessions (для некоторых белков из UniProt может быть найдено несколько GenBank записей, мы будем брать все)

failed - идентификаторы UniProt, которые не были обнаружены. Однако, для них есть запись в UniParc. Брать их или нет??

In [126]:
data_sequence = []
for acc in accessions['To']:
    # print('------------------------------------------------')
    print(acc)
    with Entrez.efetch(
        db="protein", id=acc, rettype="gb", retmode="text"
    ) as handle:
        record = SeqIO.read(handle, "genbank")
    #     print(record)
    # print(record.seq)
    taxonomy_data = get_taxonomy_data(record)
    data_sequence.append({
        "accession": acc,
        "variant": "RdgC-histones",
        "gi": None,
        "ncbi_gene_id": None,
        "hgnc_gene_name": None,
        "taxonomy_id": None,
        "organism": None,
        "phylum": None,
        "class": None,
        "taxonomy_group": None,
        "info": None,
        "sequence": str(record.seq),
        "variant_under_consideration": None,
    })
    data_sequence[-1].update(taxonomy_data)
    # for k, v in data_sequence[-1].items():
    #     print(k, v, type(v))

SDE86450.1
Fetched taxid from NCBI 660518
SEA33849.1
Fetched taxid from NCBI 555874
SER50848.1
Fetched taxid from NCBI 1186196
SIS18945.1
Fetched taxid from NCBI 308853
SNR56538.1
Fetched taxid from NCBI 63740
PSQ46114.1
Fetched taxid from NCBI 1919201
ELY33287.1
Fetched taxid from NCBI 309800
QLG51074.1
Fetched taxid from NCBI 1699371
ADE04068.1
Fetched taxid from NCBI 309800
KAA9410722.1
Fetched taxid from NCBI 51589
MDK7394879.1
Fetched taxid from NCBI 2026187
SMD63384.1
Fetched taxid from NCBI 2026187
SCX80814.1
Fetched taxid from NCBI 582692
AAS44863.1
Fetched taxid from NCBI 222523
QLY77849.1
Fetched taxid from NCBI 36845
QAV21149.1
Fetched taxid from NCBI 79263
KKC46951.1
Fetched taxid from NCBI 665792


In [127]:
failed_toadd = []
for ds in data_sequence:
    try:
        cursor.execute(add_sequence, ds)
    except Exception as e:
        print(e)
        print(ds['accession'])
        failed_toadd.append(ds['accession'])

In [128]:
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df['accession'].isin(accessions['To'])]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
21,AAS44863.1,RdgC-histones,,,,222523.0,Bacillus cereus ATCC 10987,Bacillota,Bacilli,,,MKEELERFLKDQQTNSGENETTNSGDDTEEILGDEEVLVDENKSLL...,,,
59,ADE04068.1,RdgC-histones,,,,309800.0,Haloferax volcanii DS2,Methanobacteriota,Halobacteria,,,MTDDPEDIPDPVKPTTLQKLLRAYTPEDKQVGGESADYLKLQVDNT...,,,
298,ELY33287.1,RdgC-histones,,,,309800.0,Haloferax volcanii DS2,Methanobacteriota,Halobacteria,,,MTDDPEDIPDPVKPTTLQKLLRAYTPEDKQVGGESADYLKLQVDNT...,,,
762,KAA9410722.1,RdgC-histones,,,,51589.0,Haloarcula hispanica,Methanobacteriota,Halobacteria,,,MTLEEVETEDDPIKHAPLKSYVKSQTPEMWEGEEMWANDEAIDMMQ...,,,
789,KKC46951.1,RdgC-histones,,,,665792.0,Paenibacillus sp. D9,Bacillota,Bacilli,,,MENKEKQELIEDTQIKTSEDELEAIEATEEEIEDTLISLLYGNRLR...,,,
1072,MDK7394879.1,RdgC-histones,,,,2026187.0,Bacillus pacificus,Bacillota,Bacilli,,,MKEELERFLKDQQTNSGENETTNSGDDTEEILGDEEVLVDENKSLL...,,,
1959,PSQ46114.1,RdgC-histones,,,,1919201.0,Halobacteriales archaeon SW_6_65_15,Methanobacteriota,Halobacteria,,,MSDDPTDEHDPIKENPLHDLLRAYTPEDMQVGGDSTDYVKLQLENT...,,,
1984,QAV21149.1,RdgC-histones,,,,79263.0,Paenibacillus chitinolyticus,Bacillota,Bacilli,,,MSDNKLNEDFKEEDNELLKESATLDEEEDIQDTIASLLYGNRLRDL...,,,
2042,QLG51074.1,RdgC-histones,,,,1699371.0,Natrinema halophilum,Methanobacteriota,Halobacteria,,,MTGDEEDSNSPASPVKPHSLKNFVKEQSDMRAGSDAVDELHHHLDF...,,,
2053,QLY77849.1,RdgC-histones,,,,36845.0,Clostridium intestinale,Bacillota,Clostridia,,,MDQLKMELDIECKELIDIVNDEPLEEDTQNLIYKNRLRDLVKDIAN...,,,


In [129]:
df[df['accession'].isin(accessions['To'])].shape

(17, 15)

### Add publications

In [130]:
pid = "schwab_histones_2024"
query = f"SELECT * FROM publication WHERE id='{pid}'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,schwab_histones_2024,Histones and histone variant families in proka...,10.1038/s41467-024-52337-y,,2024


In [131]:
failed_toadd_publication = []
for ds in data_sequence:
    try:
        cursor.execute(add_sequence_has_publication, (ds['accession'], pid))
    except Exception as e:
        print(e)
        print(ds['accession'])
        failed_toadd.append(ds['accession'])

In [132]:
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df['accession'].isin(accessions['To'])]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
21,AAS44863.1,RdgC-histones,,,,222523.0,Bacillus cereus ATCC 10987,Bacillota,Bacilli,,,MKEELERFLKDQQTNSGENETTNSGDDTEEILGDEEVLVDENKSLL...,,AAS44863.1,schwab_histones_2024
59,ADE04068.1,RdgC-histones,,,,309800.0,Haloferax volcanii DS2,Methanobacteriota,Halobacteria,,,MTDDPEDIPDPVKPTTLQKLLRAYTPEDKQVGGESADYLKLQVDNT...,,ADE04068.1,schwab_histones_2024
298,ELY33287.1,RdgC-histones,,,,309800.0,Haloferax volcanii DS2,Methanobacteriota,Halobacteria,,,MTDDPEDIPDPVKPTTLQKLLRAYTPEDKQVGGESADYLKLQVDNT...,,ELY33287.1,schwab_histones_2024
762,KAA9410722.1,RdgC-histones,,,,51589.0,Haloarcula hispanica,Methanobacteriota,Halobacteria,,,MTLEEVETEDDPIKHAPLKSYVKSQTPEMWEGEEMWANDEAIDMMQ...,,KAA9410722.1,schwab_histones_2024
789,KKC46951.1,RdgC-histones,,,,665792.0,Paenibacillus sp. D9,Bacillota,Bacilli,,,MENKEKQELIEDTQIKTSEDELEAIEATEEEIEDTLISLLYGNRLR...,,KKC46951.1,schwab_histones_2024
1072,MDK7394879.1,RdgC-histones,,,,2026187.0,Bacillus pacificus,Bacillota,Bacilli,,,MKEELERFLKDQQTNSGENETTNSGDDTEEILGDEEVLVDENKSLL...,,MDK7394879.1,schwab_histones_2024
1959,PSQ46114.1,RdgC-histones,,,,1919201.0,Halobacteriales archaeon SW_6_65_15,Methanobacteriota,Halobacteria,,,MSDDPTDEHDPIKENPLHDLLRAYTPEDMQVGGDSTDYVKLQLENT...,,PSQ46114.1,schwab_histones_2024
1984,QAV21149.1,RdgC-histones,,,,79263.0,Paenibacillus chitinolyticus,Bacillota,Bacilli,,,MSDNKLNEDFKEEDNELLKESATLDEEEDIQDTIASLLYGNRLRDL...,,QAV21149.1,schwab_histones_2024
2042,QLG51074.1,RdgC-histones,,,,1699371.0,Natrinema halophilum,Methanobacteriota,Halobacteria,,,MTGDEEDSNSPASPVKPHSLKNFVKEQSDMRAGSDAVDELHHHLDF...,,QLG51074.1,schwab_histones_2024
2053,QLY77849.1,RdgC-histones,,,,36845.0,Clostridium intestinale,Bacillota,Clostridia,,,MDQLKMELDIECKELIDIVNDEPLEEDTQNLIYKNRLRDLVKDIAN...,,QLY77849.1,schwab_histones_2024


In [133]:
# Make sure data is committed to the database
conn.commit()

## Adding Nucleosomal histones

In [134]:
accessions_uniprot = ['A0A060HNU7', 'A0A062VBD0', 'A0A063ZDV2', 'A0A063ZF05',
       'A0A075FMJ7', 'A0A075GDY3', 'A0A075GE50', 'A0A075GP06',
       'A0A075H298', 'A0A075H2Z3', 'A0A075H4C9', 'A0A075HM24',
       'A0A075HM52', 'A0A075LSE8', 'A0A075LTS7', 'A0A075MS86',
       'A0A075WHX9', 'A0A075WLU9', 'A0A076LD81', 'A0A076LER7',
       'A0A076LGZ1', 'A0A081RNK8', 'A0A081S5T2', 'A0A087RPB5',
       'A0A087RPT5', 'A0A087RTC7', 'A0A087S1Q8', 'A0A089ZGW8',
       'A0A089ZI94', 'A0A090I773', 'A0A090JU23', 'A0A097QQZ5',
       'A0A097QTQ8', 'A0A099T1S2', 'A0A0A7GF90', 'A0A0A7GFH1',
       'A0A0A7UZZ0', 'A0A0B3A1U3', 'A0A0B3AB04', 'A0A0B3ADP2',
       'A0A0B3AFF2', 'A0A0B3AFK0', 'A0A0B3AHZ0', 'A0A0B3AT26',
       'A0A0B3ATL0', 'A0A0B3AVR6', 'A0A0B3AZ98', 'A0A0B3B0U1',
       'A0A0B5HVV8', 'A0A0B5HW46', 'A0A0C5BU64', 'A0A0D5C0N6',
       'A0A0E3H9F0', 'A0A0E3LHX2', 'A0A0E3NG37', 'A0A0E3NHW5',
       'A0A0E3NPC9', 'A0A0E3NZ12', 'A0A0E3P1N3', 'A0A0E3PAS7',
       'A0A0E3PKD3', 'A0A0E3PYN2', 'A0A0E3QE78', 'A0A0E3QQD1',
       'A0A0E3QYE2', 'A0A0E3R6J3', 'A0A0E3RAL4', 'A0A0E3RIW4',
       'A0A0E3RQE2', 'A0A0E3RVX7', 'A0A0E3S3N4', 'A0A0E3SIV7',
       'A0A0E3SKR9', 'A0A0E3X0P2', 'A0A0F7FKG7', 'A0A0F7IIG8',
       'A0A0F7IJ02', 'A0A0F8C434', 'A0A0F8CI72', 'A0A0F8E8R4',
       'A0A0F8FA02', 'A0A0F8FW05', 'A0A0F8JNF8', 'A0A0F8NBH6',
       'A0A0F8RXN7', 'A0A0F8VMC0', 'A0A0F8XIL9', 'A0A0F8XS92',
       'A0A0G3CDC7', 'A0A0H1QWJ6', 'A0A0H1R895', 'A0A0M0BHQ3',
       'A0A0M0BRN9', 'A0A0M0BXM3', 'A0A0N8KQP7', 'A0A0N8W8F9',
       'A0A0P8X8L7', 'A0A0P8X8R6', 'A0A0P9FXB0', 'A0A0Q0VKW9',
       'A0A0Q1AMB5', 'A0A0Q1FHD1', 'A0A0Q2QSA7', 'A0A0S1X8W5',
       'A0A0S1XAX1', 'A0A0U3E3M3', 'A0A0U3ED39', 'A0A0U3RZF1',
       'A0A0U3SZY8', 'A0A0X1KIH5', 'A0A0X1KKM7', 'A0A0X3BK74',
       'A0A0X3BM55', 'A0A100XXV4', 'A0A101DBV4', 'A0A101DNZ9',
       'A0A101DPW6', 'A0A101DRU7', 'A0A101E0F8', 'A0A101EMM8',
       'A0A101F003', 'A0A101F2M8', 'A0A101FUX7', 'A0A101J068',
       'A0A101WZH0', 'A0A101X6Q6', 'A0A101XB78', 'A0A101XF49',
       'A0A117L0W9', 'A0A117LQN1', 'A0A117SWT0', 'A0A120JZY5',
       'A0A124EAZ7', 'A0A124F9F5', 'A0A124G5Y5', 'A0A125RA37',
       'A0A125RD40', 'A0A126R090', 'A0A126R0E6', 'A0A126R110',
       'A0A127BCF9', 'A0A127BDW6', 'A0A128A571', 'A0A133U3G2',
       'A0A133U478', 'A0A133U5M3', 'A0A133U7S2', 'A0A133U8Y3',
       'A0A133U906', 'A0A133UCH2', 'A0A133UDS1', 'A0A133UFZ4',
       'A0A133UG88', 'A0A133UHB0', 'A0A133UMZ1', 'A0A133UNB9',
       'A0A133UNN0', 'A0A133US98', 'A0A133USZ1', 'A0A133UT36',
       'A0A133UVM9', 'A0A133UVQ9', 'A0A133UXA2', 'A0A133UYP5',
       'A0A133V0K3', 'A0A133V334', 'A0A133V410', 'A0A133V5Q9',
       'A0A133V6P2', 'A0A133V7X4', 'A0A133V8K5', 'A0A133VDP5',
       'A0A133VEP5', 'A0A133VF25', 'A0A133VHF9', 'A0A133VIH5',
       'A0A133VJV1', 'A0A133VQC7', 'A0A133VS39', 'A0A133VST3',
       'A0A135VEF4', 'A0A139CGA5', 'A0A139CVI3', 'A0A142CUD0',
       'A0A142CWH3', 'A0A147JRS5', 'A0A147JUX3', 'A0A147JVZ3',
       'A0A147JW97', 'A0A147JWG4', 'A0A147JY92', 'A0A150IXE6',
       'A0A150J7Z8', 'A0A150JIP2', 'A0A151B8U5', 'A0A151BBP6',
       'A0A151BJ84', 'A0A151BL46', 'A0A151BRZ6', 'A0A151E7M7',
       'A0A151EH72', 'A0A151EM82', 'A0A151EQV4', 'A0A151F2L7',
       'A0A151F3W7', 'A0A151F702', 'A0A151F7S4', 'A0A151F9P9',
       'A0A151FAR6', 'A0A151FCK3', 'A0A151FD80', 'A0A162FK86',
       'A0A165YUY4', 'A0A165ZGR7', 'A0A166A494', 'A0A166A9E0',
       'A0A166BDG4', 'A0A166BFF0', 'A0A166BMX3', 'A0A166BVE0',
       'A0A166CK19', 'A0A166CPM2', 'A0A166CVN8', 'A0A166CWZ9',
       'A0A170SU38', 'A0A170SYI5', 'A0A172WF87', 'A0A172WHT6',
       'A0A1B8X0S4', 'A0A1D2R4V2', 'A0A1D2RA51', 'A0A1D2RFK9',
       'A0A1D2UYK1', 'A0A1D2W468', 'A0A1D2WAA9', 'A0A1D2WAU0',
       'A0A1D2WC67', 'A0A1D2WCH3', 'A0A1D2WD20', 'A0A1D2WD29',
       'A0A1D2WG49', 'A0A1D2WH40', 'A0A1D2WHC1', 'A0A1D2WJA3',
       'A0A1D2WJL4', 'A0A1D2WML4', 'A0A1D2WNF3', 'A0A1D2WQ08',
       'A0A1D2WS93', 'A0A1D2WVU9', 'A0A1D2WVW6', 'A0A1D2X1F6',
       'A0A1D2X1S2', 'A0A1D2X2B9', 'A0A1D2X2I6', 'A0A1D2X3F1',
       'A0A1D3L0V3', 'A0A1D3L3N0', 'A0A1D3L3U5', 'A0A1D8MR63',
       'A0A1D8MR91', 'A0A1D8MTE2', 'A0A1E7GEG3', 'A0A1F2P6L7',
       'A0A1F2P867', 'A0A1F5CQV8', 'A0A1F5CTV2', 'A0A1F5CYU9',
       'A0A1F5DGN0', 'A0A1F5DGT1', 'A0A1F5DM47', 'A0A1F6QVP6',
       'A0A1F6QW03', 'A0A1F6R5J3', 'A0A1F6R5M8', 'A0A1G3XSU2',
       'A0A1G5UUZ6', 'A0A1G8XEC1', 'A0A1G8XZT9', 'A0A1H7F5F4',
       'A0A1H7H240', 'A0A1H7JA67', 'A0A1H9Y5W9', 'A0A1I0P664',
       'A0A1I4UAB6', 'A0A1I6YLZ9', 'A0A1J4RKY2', 'A0A1J4RM28',
       'A0A1J4UFM0', 'A0A1J4UIX7', 'A0A1J4UMG5', 'A0A1J4UNF5',
       'A0A1J4UPZ8', 'A0A1J4URK5', 'A0A1J4UV50', 'A0A1J4UWG2',
       'A0A1J4UZT7', 'A0A1J4VTS9', 'A0A1J4W3V9', 'A0A1J4XNT1',
       'A0A1J4XSR9', 'A0A1J4XZE5', 'A0A1J4Y2R3', 'A0A1J5CKR2',
       'A0A1J5CPR6', 'A0A1L3Q2D9', 'A0A1L9C3E0', 'A0A1L9GTC7',
       'A0A1L9GUI6', 'A0A1M2YGQ6', 'A0A1M2YIC3', 'A0A1M4MHQ6',
       'A0A1M4MJL2', 'A0A1Q6DU18', 'A0A1Q6DVQ6', 'A0A1Q7B955',
       'A0A1Q7CVK5', 'A0A1Q7G412', 'A0A1Q7IAN6', 'A0A1Q7KZN7',
       'A0A1Q7LTN1', 'A0A1Q7MCX9', 'A0A1Q7MFX0', 'A0A1Q7NB28',
       'A0A1Q7NLU4', 'A0A1Q7P4N1', 'A0A1Q7PX21', 'A0A1Q7ZL40',
       'A0A1Q9MSL6', 'A0A1Q9N0A2', 'A0A1Q9N0H4', 'A0A1Q9N418',
       'A0A1Q9N877', 'A0A1Q9N8N6', 'A0A1Q9NAM9', 'A0A1Q9NJR1',
       'A0A1Q9NJV8', 'A0A1Q9NR84', 'A0A1Q9NRJ0', 'A0A1Q9NRY6',
       'A0A1Q9NVH8', 'A0A1S6GZG5', 'A0A1S6HAZ0', 'A0A1S6HDW0',
       'A0A1V4TAU5', 'A0A1V4TAW0', 'A0A1V4TI62', 'A0A1V4TPJ7',
       'A0A1V4TWG7', 'A0A1V4TZ27', 'A0A1V4U1C6', 'A0A1V4U231',
       'A0A1V4U7M9', 'A0A1V4U8H4', 'A0A1V4UB28', 'A0A1V4UD94',
       'A0A1V4UDR7', 'A0A1V4UG65', 'A0A1V4UL24', 'A0A1V4UPZ5',
       'A0A1V4UQ44', 'A0A1V4UVH4', 'A0A1V4V0G8', 'A0A1V4V526',
       'A0A1V4V585', 'A0A1V4Y6S9', 'A0A1V4YHG5', 'A0A1V4YIQ6',
       'A0A1V4YK42', 'A0A1V4YMI9', 'A0A1V4YQP3', 'A0A1V4YRS3',
       'A0A1V4YST8', 'A0A1V4YYR5', 'A0A1V4Z5X7', 'A0A1V4Z7V1',
       'A0A1V4ZC95', 'A0A1V4ZG01', 'A0A1V4ZVF1', 'A0A1V4ZVZ5',
       'A0A1V4ZWX6', 'A0A1V4ZXA3', 'A0A1V4ZXH5', 'A0A1V5A0M7',
       'A0A1V5A1X1', 'A0A1V5A6D1', 'A0A1V5A9A2', 'A0A1V5AEC3',
       'A0A1V5AQ29', 'A0A1V5B3I1', 'A0A1V5B8F6', 'A0A1V5BA76',
       'A0A1V5IG55', 'A0A1V5IKR2', 'A0A1V5QLU4', 'A0A1V5SMG1',
       'A0A1V5SP42', 'A0A1V5SZX8', 'A0A1V5Z9I9', 'A0A1V6IPL5',
       'A0A1V6K4B5', 'A0A1V6K9M0', 'A0A1V6N1A9', 'A0A1V6N206',
       'A0A1V6N2G5', 'A0A1V6N405', 'A0A1Y3GG93', 'A0A1Z2TKZ9',
       'A0A1Z2TNJ4', 'A0A218NMX9', 'A0A218NNF1', 'A0A218P2E2',
       'A0A218P4F4', 'A0A218P4Y8', 'A0A218P7E2', 'A0A218PEJ1',
       'A0A218ZJ19', 'A0A218ZKU6', 'A0A219AKS3', 'A0A219ALA4',
       'A0A219ALW7', 'A0A219AM80', 'A0A223Z033', 'A0A223Z0W8',
       'A0A223ZDV6', 'A0A256XCC7', 'A0A256XD82', 'A0A256XJF0',
       'A0A256XU97', 'A0A256Y5A8', 'A0A256Y5A9', 'A0A256Y627',
       'A0A256Y9Q1', 'A0A256YNH4', 'A0A256YPT0', 'A0A256YR87',
       'A0A256YRF3', 'A0A256YYD9', 'A0A256ZEC3', 'A0A256ZSI3',
       'A0A256ZVM1', 'A0A256ZZW3', 'A0A257A093', 'A0A257A2F6',
       'A0A257AES5', 'A0A257AHD9', 'A0A257AJX6', 'A0A257ANS2',
       'A0A257AQV5', 'A0A284VSS1', 'A0A285GHI5', 'A0A2A2H1T0',
       'A0A2A2H272', 'A0A2A2H2I6', 'A0A2A2H7H2', 'A0A2A2H9P7',
       'A0A2A2HAD6', 'A0A2A2HAS3', 'A0A2A2HAV8', 'A0A2A2HB24',
       'A0A2A2HBF1', 'A0A2A2HBY3', 'A0A2A2HCD7', 'A0A2A2HDE4',
       'A0A2A2HDM6', 'A0A2A2HDW7', 'A0A2A2HH87', 'A0A2A2HXU9',
       'A0A2A3TCE9', 'A0A2D3C4B0', 'A0A2D5XDA9', 'A0A2D6EYS2',
       'A0A2D6JC81', 'A0A2D6LW70', 'A0A2D6NCT5', 'A0A2D6PIB2',
       'A0A2D6R6A0', 'A0A2D6RKX7', 'A0A2D6SN45', 'A0A2D6TIV0',
       'A0A2D6TJT5', 'A0A2D6V4T4', 'A0A2D6W4L7', 'A0A2D6WWJ0',
       'A0A2D6WXC0', 'A0A2D7BMS3', 'A0A2E5KGU4', 'A0A2E7RRS6',
       'A0A2E8EG63', 'A0A2E9MTI9', 'A0A2E9NZB2', 'A0A2E9U470',
       'A0A2E9U5A3', 'A0A2G4J1G2', 'A0A2G4JCS6', 'A0A2G9LIS2',
       'A0A2G9LK25', 'A0A2G9LS85', 'A0A2G9M3W5', 'A0A2G9M9C4',
        'A0A2G9MKD4', 'A0A2G9MLZ3', 'A0A2G9MQI8', 'A0A2G9MRI8',
       'A0A2G9N4L9', 'A0A2G9N5X3', 'A0A2G9NAC7', 'A0A2G9NZC8',
       'A0A2G9NZY5', 'A0A2G9P4M3', 'A0A2G9P9T1', 'A0A2G9PB37',
       'A0A2G9PGA5', 'A0A2G9PGE8', 'A0A2G9PHP3', 'A0A2G9PJN0',
       'A0A2G9PK59', 'A0A2G9PKV1', 'A0A2G9PN85', 'A0A2G9PSN2',
       'A0A2G9PTR3', 'A0A2H1EG24', 'A0A2H1FFW3', 'A0A2H4U5U6',
       'A0A2H4U7H9', 'A0A2H4U8L7', 'A0A2H4VCT0', 'A0A2H4VFB8',
       'A0A2H4VFH9', 'A0A2H4VG87', 'A0A2H4VLQ3', 'A0A2H4VM38',
       'A0A2H5UX19', 'A0A2H5V0N8', 'A0A2H5V7R5', 'A0A2H5VEN2',
       'A0A2H6GXE8', 'A0A2H6JX77', 'A0A2H6JXN1', 'A0A2H6K1K9',
       'A0A2H9HZD3', 'A0A2H9L1N0', 'A0A2H9L2P8', 'A0A2H9L354',
       'A0A2H9LBY9', 'A0A2H9LC79', 'A0A2H9LFS5', 'A0A2H9LM30',
       'A0A2H9M072', 'A0A2H9M7E3', 'A0A2H9MF74', 'A0A2H9MIY1',
       'A0A2H9MKT8', 'A0A2H9MM87', 'A0A2H9MMV6', 'A0A2H9MYR0',
       'A0A2H9MZW7', 'A0A2H9N872', 'A0A2H9NAU1', 'A0A2H9NE58',
       'A0A2H9NFX3', 'A0A2H9NRN7', 'A0A2H9NTP5', 'A0A2H9NW77',
       'A0A2H9P6F4', 'A0A2H9P7R4', 'A0A2H9PGN4', 'A0A2H9PGZ3',
       'A0A2H9PPU7', 'A0A2H9PXV7', 'A0A2H9Q6H7', 'A0A2H9Q7Y4',
       'A0A2H9Q9P0', 'A0A2H9QE47', 'A0A2H9QIF6', 'A0A2H9QMG2',
       'A0A2H9QWZ9', 'A0A2H9R330', 'A0A2H9R360', 'A0A2H9RSK2',
       'A0A2H9SBU6', 'A0A2I0CY31', 'A0A2I0NKU9', 'A0A2I0NP81',
       'A0A2I0NTY6', 'A0A2I0NWJ4', 'A0A2I0NZ22', 'A0A2I0P0J7',
       'A0A2I0P1Z0', 'A0A2I0P382', 'A0A2I0P8I0', 'A0A2I0P9V7',
       'A0A2I0PCM2', 'A0A2I0PGG9', 'A0A2I0PM58', 'A0A2I0PNB7',
       'A0A2I0PPQ5', 'A0A2I0PRL4', 'A0A2I0PUM5', 'A0A2I0PYJ6',
       'A0A2I0Q259', 'A0A2I0Q3J1', 'A0A2I0Q5R0', 'A0A2J2HHC6',
       'A0A2J4G7C7', 'A0A2J6H1A4', 'A0A2J6H1K4', 'A0A2J6MZ95',
       'A0A2J6N112', 'A0A2K2V4F4', 'A0A2K5AR63', 'A0A2L1C9N6',
       'A0A2L1CCC6', 'A0A2P0QLA5', 'A0A2P5K3D6', 'A0A2P6W055',
       'A0A2P6W073', 'A0A2P6W280', 'A0A2P6W315', 'A0A2P6W3I9',
       'A0A2P6W435', 'A0A2P6W5F6', 'A0A2P6W6T5', 'A0A2P6W7F2',
       'A0A2P6W7W7', 'A0A2P6W9A1', 'A0A2R6LU11', 'A0A2R6T6F8',
       'A0A2R7Y9E8', 'A0A2S2KRW9', 'A0A2T9WTH4', 'A0A2U0RYT9',
       'A0A2U0S3I8', 'A0A2U0S450', 'A0A2U1S7V9', 'A0A2U3CCH3',
       'A0A2U3CDB3', 'A0A2U3CDE4', 'A0A2U3CEV2', 'A0A2U3CIL6',
       'A0A2U3CIZ9', 'A0A2U3CL15', 'A0A2V2MRE4', 'A0A2V2N0P1',
       'A0A2V2N6I7', 'A0A2V2N989', 'A0A2V2ND76', 'A0A2V2NGN3',
       'A0A2V2NGN8', 'A0A2V2NKR6', 'A0A2V2U9P8', 'A0A2V3I9X5',
       'A0A2V3JXA2', 'A0A2Z2HIE0', 'A0A2Z2M6J5', 'A0A2Z2M7B4',
       'A0A2Z2MCL3', 'A0A2Z2MF54', 'A0A2Z2MFL5', 'A0A2Z2MHK2',
       'A0A2Z2MMR3', 'A0A2Z2MQ55', 'A0A2Z2MX31', 'A0A2Z2MY27',
       'A0A2Z2N618', 'A0A2Z4L757', 'A0A2Z4L782', 'A0A2Z4L7A5',
       'A0A2Z4L7N3', 'A0A2Z4L7R5', 'A0A2Z4L843', 'A0A2Z4L8Z7',
       'A0A2Z4L905', 'A0A2Z4L9W5', 'A0A2Z4LB91', 'A0A2Z5PCT8',
       'A0A2Z5PRB8', 'A0A2Z5PS53', 'A0A2Z5PTC0', 'A0A314ZYS6',
       'A0A315XKH9', 'A0A315XLE0', 'A0A315XNQ5', 'A0A315XQH9',
       'A0A315Y1M9', 'A0A328PCD0', 'A0A328PD16', 'A0A328PGI2',
       'A0A328PYC9', 'A0A328Q199', 'A0A328Q7S4', 'A0A328Q8U6',
       'A0A328Q9D9', 'A0A328QAY5', 'A0A328RR29', 'A0A328RVS8',
       'A0A328RX71', 'A0A328RZ87', 'A0A328RZS2', 'A0A328S0N1',
       'A0A328S0V0', 'A0A328S198', 'A0A328S2M7', 'A0A328S4G9',
       'A0A328S510', 'A0A328S5C9', 'A0A328S7E4', 'A0A328SA05',
       'A0A328SAD4', 'A0A328SAQ2', 'A0A328SBW4', 'A0A328SC08',
       'A0A328SD46', 'A0A328SEH2', 'A0A328SF46', 'A0A328SFG9',
       'A0A328SGZ5', 'A0A328SIF7', 'A0A328SIN8', 'A0A328SIW3',
       'A0A328SJL0', 'A0A328SJV5', 'A0A328SKI7', 'A0A328SL91',
       'A0A328SLK5', 'A0A328SMD9', 'A0A328SMI7', 'A0A328SNE1',
       'A0A328SPP1', 'A0A328SQE7', 'A0A328SQG9', 'A0A328SQJ3',
       'A0A328SUA1', 'A0A328SVZ7', 'A0A347ADK0', 'A0A347AG96',
       'A0A347AHX4', 'A0A347APC5', 'A0A347APW5', 'A0A347AQ50',
       'A0A366MA96', 'A0A366MEH5', 'A0A366MGH2', 'A0A368PFD0',
       'A0A368TD29', 'A0A368TDC7', 'A0A368TFZ9', 'A0A368TGL6',
       'A0A368TJ71', 'A0A368TJ81', 'A0A369SZS3', 'A0A369T538',
       'A0A370LI64', 'A0A371NBC5', 'A0A371NDF6', 'A0A371NGJ3',
       'A0A3A4UXJ7', 'A0A3A5HEZ9', 'A0A3A5HNI8', 'A0A3A5HNX1',
       'A0A3G1A6N1', 'A0A3G1B837', 'A0A3G9CYJ1', 'A0A3G9CZ30',
       'A0A3G9D0U5', 'A0A3L6JGJ4', 'A0A3L6JK82', 'A0A3L6JR83',
       'A0A3M0XED0', 'A0A3M1F950', 'A0A3M1H8D3', 'A0A3M1JJI6',
       'A0A3M1LHR8', 'A0A3M1NK61', 'A0A3M1REI3', 'A0A3M6K0G6',
       'A0A3M6K3P3', 'A0A3M6KFE9', 'A0A3M9LKV9', 'A0A3M9ZN94',
       'A0A3M9ZWA7', 'A0A3N0A1I8', 'A0A3N5BBP1', 'A0A3N5BTQ0',
       'A0A3N5C5D6', 'A0A3N5MSP9', 'A0A3N5NK59', 'A0A3N5RIN3',
       'A0A3N5V7R6', 'A0A3N9UP75', 'A0A3R7A682', 'A0A3R7B350',
       'A0A3R7DDS6', 'A0A3R7DRV3', 'A0A3R7E505', 'A0A3R7ERN9',
       'A0A3R7WEE9', 'A0A3R7XDV1', 'A0A3R9G6T4', 'A0A3R9QS97',
       'A0A3R9RIX5', 'A0A3S3RNM7', 'A0A401HRT1', 'A0A419FTD6',
       'A0A419JA72', 'A0A419JDF5', 'A0A419KFS9', 'A0A419KU74',
       'A0A419KZA8', 'A0A424YS37', 'A0A424Z3Q3', 'A0A429G5C5',
       'A0A429GPR3', 'A0A432Q392', 'A0A432Q8B2', 'A0A447G5G2',
       'A0A483CPA3', 'A0A483CYC1', 'A0A484I7B8', 'A0A497EIC5',
       'A0A497ERI4', 'A0A497FB77', 'A0A497FDY4', 'A0A497FPR7',
       'A0A497FQ09', 'A0A497FTV2', 'A0A497G7W9', 'A0A497GE10',
       'A0A497GGR7', 'A0A497GKR9', 'A0A497GQR5', 'A0A497HXV0',
       'A0A497HYU2', 'A0A497I0D0', 'A0A497I0D4', 'A0A497I2I7',
       'A0A497IC36', 'A0A497IC70', 'A0A497IKT8', 'A0A497IT48',
       'A0A497IVI6', 'A0A497IY74', 'A0A497IYM3', 'A0A497J867',
       'A0A497JCV3', 'A0A497JE91', 'A0A497JI14', 'A0A497JPK4',
       'A0A497K1D8', 'A0A497K8V0', 'A0A497KT42', 'A0A497KY88',
       'A0A497L1N7', 'A0A497LBH6', 'A0A497LLI7', 'A0A497LPE5',
       'A0A497LSP0', 'A0A497M5V2', 'A0A497MDE0', 'A0A497MLU2',
       'A0A497N0W1', 'A0A497NQV0', 'A0A497P9F5', 'A0A497PKU2',
       'A0A497Q5G6', 'A0A497QAM5', 'A0A497QKD9', 'A0A497QLP9',
       'A0A497QQA5', 'A0A497QQU3', 'A0A497QRM8', 'A0A497R864',
       'A0A497RDA0', 'A0A497RE85', 'A0A497RMK7', 'A0A497RNW9',
       'A0A497RZ18', 'A0A497S1Y9', 'A0A497S2W2', 'A0A497S6Z9',
       'A0A497S7X4', 'A0A497SE32', 'A0A497SE96', 'A0A497SPR0',
       'A0A497SQ46', 'A0A497SUS5', 'A0A497T2M2', 'A0A497T7X0',
       'A0A497T9M8', 'A0A497TCG4', 'A0A497TCL9', 'A0A498GU95',
       'A0A498H2R0', 'A0A498H6R1', 'A0A498HAH4', 'A0A4E0PW25',
       'A0A4P2VDV9', 'A0A4Y3FUX3', 'A0A4Y5SJB9', 'A0A4Y5SMB0',
       'A0A510BDN0', 'A0A514LBM4', 'A0A519BUX0', 'A0A519BV35',
       'A0A519C1I8', 'A0A519CE89', 'A0A520JV22', 'A0A520K3R6',
       'A0A520K659', 'A0A520K7N0', 'A0A520K964', 'A0A520K9K9',
       'A0A520KCW3', 'A0A520KG22', 'A0A520KPH9', 'A0A520KRE4',
       'A0A520KSW6', 'A0A520KYR6', 'A0A522E284', 'A0A522E2L9',
       'A0A522UE37', 'A0A522XX67', 'A0A523A8K9', 'A0A523A9Y0',
       'A0A523AEZ1', 'A0A523APY9', 'A0A523AR20', 'A0A523AR98',
       'A0A523ASH6', 'A0A523B1Q8', 'A0A523B765', 'A0A523BF84',
       'A0A523BKV4', 'A0A523QDQ7', 'A0A523R5Q7', 'A0A523R5S4',
       'A0A523RBP4', 'A0A523RBW8', 'A0A523RHE9', 'A0A523RJT3',
       'A0A523SPF5', 'A0A523T657', 'A0A523VTN5', 'A0A523VTX2',
       'A0A523W3L3', 'A0A523W4P4', 'A0A523WEW0', 'A0A524A405',
       'A0A524ACZ6', 'A0A524C4X0', 'A0A524C8P9', 'A0A524CB36',
       'A0A524CCQ1', 'A0A524CIF1', 'A0A524CK09', 'A0A524CKE8',
       'A0A524CPY4', 'A0A524CWT0', 'A0A524D286', 'A0A524D471',
       'A0A524D6E4', 'A0A524D8X3', 'A0A524DA31', 'A0A524DD47',
       'A0A524DEH8', 'A0A524DHF1', 'A0A524DHQ4', 'A0A524DIF9',
       'A0A524DKK1', 'A0A524DPR3', 'A0A524DQL5', 'A0A524E3J8',
       'A0A524E3Q8', 'A0A524E5G3', 'A0A524E5M2', 'A0A524EDY6',
       'A0A524EQ11', 'A0A524ET96', 'A0A524ETE0', 'A0A524F2R0',
       'A0A524F5S4', 'A0A524F647', 'A0A524F741', 'A0A524FH65',
       'A0A524FIC3', 'A0A524FJL1', 'A0A524FM84', 'A0A524FQH0',
       'A0A524FR67', 'A0A524FT65', 'A0A524LWR7', 'A0A524M540',
       'A0A524N5N7', 'A0A524P3R2', 'A0A524P513', 'A0A532T328',
       'A0A532TAS4', 'A0A532TCN5', 'A0A532TF22', 'A0A532TK07',
        'A0A532TPU3', 'A0A532TU16', 'A0A532TVI5', 'A0A532TXR5',
       'A0A533UAF2', 'A0A533UHN2', 'A0A533VBE3', 'A0A533VCS1',
       'A0A533VLJ7', 'A0A533VNC3', 'A0A533VUB7', 'A0A533W9N5',
       'A0A533WB40', 'A0A533WDX6', 'A0A537EPY5', 'A0A537EVA1',
       'A0A537H572', 'A0A537IDZ7', 'A0A538P8V2', 'A0A550GKM6',
       'A0A550GNL0', 'A0A550H0Z9', 'A0A550H4Y0', 'A0A550HK09',
       'A0A557SSF2', 'A0A564PY57', 'A0A564Q252', 'A0A564Q7Z0',
       'A0A564QAD2', 'A0A5B2Z0J9', 'A0A5B9D7L9', 'A0A5B9D861',
       'A0A5B9D9F4', 'A0A5B9DCD4', 'A0A5B9DF44', 'A0A5B9M1E3',
       'A0A5B9M3B4', 'A0A5B9M5J2', 'A0A5C0SJE3', 'A0A5C0SME8',
       'A0A5C0XQT5', 'A0A5C0XRU5', 'A0A5C9DZV5', 'A0A5C9E6X3',
       'A0A5C9E985', 'A0A5C9ECC8', 'A0A5C9ENJ5', 'A0A5C9EPV3',
       'A0A5C9ES89', 'A0A5C9ESL3', 'A0A5C9F072', 'A0A5E4HH55',
       'A0A5E4HQX0', 'A0A5E4HYS3', 'A0A5E4I398', 'A0A5E4I781',
       'A0A5E4IGQ7', 'A0A5E4IMT0', 'A0A5E4IRK5', 'A0A5E4IUK4',
       'A0A5E4J3F3', 'A0A5E4J4E6', 'A0A5E4JCF8', 'A0A5E4JH22',
       'A0A5E4JHD9', 'A0A5E4JKH4', 'A0A5E4JKJ5', 'A0A5E4JW93',
       'A0A5E4KEA8', 'A0A5E4KPP3', 'A0A5E4KWQ1', 'A0A5E4L5R9',
       'A0A5E4LEZ6', 'A0A5E4LJT6', 'A0A5E4LNQ3', 'A0A5E4LQB7',
       'A0A5E4LT27', 'A0A5E4M198', 'A0A5E4P6M9', 'A0A5J4DTC7',
       'A0A5N5U7J1', 'A0A5Q0UEJ8', 'A0A5Q0UFG0', 'A0A5Q0UGM6',
       'A0A654M110', 'A0A656YUK8', 'A0A656YWD1', 'A0A656YWZ7',
       'A0A656YYG9', 'A0A660HVJ0', 'A0A662F556', 'A0A662F7T7',
       'A0A662F9C7', 'A0A662FE05', 'A0A662FLL7', 'A0A662FZ57',
       'A0A662GJ74', 'A0A662GMF9', 'A0A662GSB0', 'A0A662GV93',
       'A0A662H3Y7', 'A0A662HHT5', 'A0A662HNJ0', 'A0A662HSR3',
       'A0A662I6V7', 'A0A662IMR9', 'A0A662IQE5', 'A0A662IY63',
       'A0A662J3U3', 'A0A662JKX6', 'A0A662M696', 'A0A662MDX9',
       'A0A662N8Z7', 'A0A662NID5', 'A0A662NKM8', 'A0A662NKV4',
       'A0A662NSF6', 'A0A662NTG7', 'A0A662NWU9', 'A0A662NYI9',
       'A0A662NZI4', 'A0A662P332', 'A0A662P5G3', 'A0A662P705',
       'A0A662PFS8', 'A0A662PGN3', 'A0A662PLU8', 'A0A662PPS0',
       'A0A662PT37', 'A0A662Q9I3', 'A0A662QA04', 'A0A662QDT7',
       'A0A662QGV5', 'A0A662QJ46', 'A0A662QR28', 'A0A662R274',
       'A0A662R2Q3', 'A0A662R8B2', 'A0A662RGM1', 'A0A662RK17',
       'A0A662RXD0', 'A0A662SGA6', 'A0A662SJT3', 'A0A662SK85',
       'A0A662T171', 'A0A662T1G1', 'A0A662T6N4', 'A0A662T7H4',
       'A0A662T9B5', 'A0A662T9D5', 'A0A662TC36', 'A0A662TEB6',
       'A0A662THH8', 'A0A662TKR5', 'A0A662TSZ5', 'A0A662TVK5',
       'A0A662U1B7', 'A0A662VSA8', 'A0A662W466', 'A0A662W6J0',
       'A0A662W6M7', 'A0A662W820', 'A0A662W8G7', 'A0A662W8P7',
       'A0A662WC68', 'A0A6A2F8D3', 'A0A6A7KEB3', 'A0A6A7KH31',
       'A0A6A7LG20', 'A0A6A8AKA6', 'A0A6A8RKA3', 'A0A6B0Y6X6',
       'A0A6B0YLV2', 'A0A6B1A0A9', 'A0A6B1BLQ2', 'A0A6B1CZW4',
       'A0A6B1FHF3', 'A0A6B2C028', 'A0A6B2CCE8', 'A0A6B2CLJ9',
       'A0A6B9TCY9', 'A0A6B9TE54', 'A0A6B9TGE5', 'A0A6B9TIV2',
       'A0A6B9TJY3', 'A0A6B9TKX6', 'A0A6G1WY19', 'A0A6G1YKC1',
       'A0A6G2J758', 'A0A6G2K9J4', 'A0A6G2KEW9', 'A0A6G2KHY0',
       'A0A6G2KUV7', 'A0A6G2KXC5', 'A0A6G3LL31', 'A0A6N0NPE3',
       'A0A6N0NQD7', 'A0A6N0NSE7', 'A0A6V8F395', 'A0A7C0TZ09',
       'A0A7C0UH29', 'A0A7C0UTV9', 'A0A7C0V589', 'A0A7C0VQW2',
       'A0A7C0VRX4', 'A0A7C0VS25', 'A0A7C0XSL3', 'A0A7C0YLM9',
       'A0A7C0YRF4', 'A0A7C0Z8B0', 'A0A7C0ZDH1', 'A0A7C1AY03',
       'A0A7C1B5F1', 'A0A7C1CBI8', 'A0A7C1DY32', 'A0A7C1E4W7',
       'A0A7C1E7H0', 'A0A7C1EB13', 'A0A7C1HSM5', 'A0A7C1I6C8',
       'A0A7C1IEQ7', 'A0A7C1ILI7', 'A0A7C1LFH7', 'A0A7C1LIE7',
       'A0A7C1LYM4', 'A0A7C1LZ36', 'A0A7C1Q566', 'A0A7C1QTI8',
       'A0A7C1RMI6', 'A0A7C1T153', 'A0A7C1ZPS3', 'A0A7C2HBZ2',
       'A0A7C2L9P5', 'A0A7C2LNR4', 'A0A7C2N9V8', 'A0A7C2Q112',
       'A0A7C2Q4E6', 'A0A7C2QHZ1', 'A0A7C2QII4', 'A0A7C2R431',
       'A0A7C2TM41', 'A0A7C2W5X9', 'A0A7C2XQE4', 'A0A7C2XV10',
       'A0A7C2ZZ02', 'A0A7C3IKY7', 'A0A7C3L3G3', 'A0A7C3Q1Q7',
       'A0A7C3R2F7', 'A0A7C3R612', 'A0A7C3SEN7', 'A0A7C3SF95',
       'A0A7C3TI96', 'A0A7C3TJ55', 'A0A7C3UBK9', 'A0A7C3UEG7',
       'A0A7C3V0T1', 'A0A7C3V131', 'A0A7C3V3E7', 'A0A7C3VDL3',
       'A0A7C3XE96', 'A0A7C3YI70', 'A0A7C3YJB5', 'A0A7C3ZCX3',
       'A0A7C3ZSX8', 'A0A7C4B8E1', 'A0A7C4BFG3', 'A0A7C4CHC4',
       'A0A7C4CNA1', 'A0A7C4CWD9', 'A0A7C4CWM7', 'A0A7C4D7L8',
       'A0A7C4D8H8', 'A0A7C4E574', 'A0A7C4F2H5', 'A0A7C4G9R0',
       'A0A7C4GEH9', 'A0A7C4H946', 'A0A7C4HDB3', 'A0A7C4HH67',
       'A0A7C4I5I4', 'A0A7C4JAA7', 'A0A7C4M1S6', 'A0A7C4MBU0',
       'A0A7C4MJG6', 'A0A7C4NDU2', 'A0A7C4NM03', 'A0A7C4NPH6',
       'A0A7C4RD73', 'A0A7C4RDY2', 'A0A7C4RZE8', 'A0A7C4SDK9',
       'A0A7C4SLZ5', 'A0A7C4TRW3', 'A0A7C4W5J0', 'A0A7C4WRH9',
       'A0A7C5AA23', 'A0A7C5DGC9', 'A0A7C5DWD0', 'A0A7C5GN15',
       'A0A7C5IXP6', 'A0A7C5K180', 'A0A7C5K283', 'A0A7C5NWI8',
       'A0A7C5PBA7', 'A0A7C5Q4T4', 'A0A7C5RNL5', 'A0A7C5T2F3',
       'A0A7C5TPW6', 'A0A7C5U4E2', 'A0A7C5VE65', 'A0A7C5XJ31',
       'A0A7C5XJM2', 'A0A7C5Y956', 'A0A7C6AIA7', 'A0A7C6BS58',
       'A0A7C6DSX0', 'A0A7C6E2H1', 'A0A7C6FV71', 'A0A7C6FY48',
       'A0A7C6YH25', 'A0A7C6YLB0', 'A0A7C6YMF9', 'A0A7C7QCX4',
       'A0A7C7RH28', 'A0A7C7ULB4', 'A0A7C7UPT2', 'A0A7D5E6T4',
       'A0A7D5MC89', 'A0A7D5R350', 'A0A7D5REM1', 'A0A7D5RGR9',
       'A0A7D7Z5I0', 'A0A7G2D6C9', 'A0A7G2D881', 'A0A7G9YMX3',
       'A0A7G9Z0C0', 'A0A7G9Z4G1', 'A0A7G9Z693', 'A0A7G9Z952',
       'A0A7G9ZCQ4', 'A0A7J2H9L4', 'A0A7J2HH21', 'A0A7J2IPA4',
       'A0A7J2IQW9', 'A0A7J2IVM9', 'A0A7J2K8L1', 'A0A7J2K8W7',
       'A0A7J2KEE3', 'A0A7J2KTU6', 'A0A7J2L5J8', 'A0A7J2LAC1',
       'A0A7J2LRB6', 'A0A7J2LY63', 'A0A7J2MN55', 'A0A7J2NFM3',
       'A0A7J2NQV4', 'A0A7J2NS51', 'A0A7J2NSG5', 'A0A7J2PQ49',
       'A0A7J2PU99', 'A0A7J2PZ41', 'A0A7J2R0K8', 'A0A7J2R824',
       'A0A7J2S2I1', 'A0A7J2THX7', 'A0A7J2TKZ3', 'A0A7J2TPC2',
       'A0A7J2TWH3', 'A0A7J2TWV6', 'A0A7J2TZA9', 'A0A7J2URR8',
       'A0A7J2VUH7', 'A0A7J2VW30', 'A0A7J2VXA4', 'A0A7J2WNY4',
       'A0A7J2WPF5', 'A0A7J2X207', 'A0A7J2Y944', 'A0A7J2YV50',
       'A0A7J2Z097', 'A0A7J2ZE94', 'A0A7J2ZLE3', 'A0A7J3A980',
       'A0A7J3AZB9', 'A0A7J3BD70', 'A0A7J3CJN0', 'A0A7J3CTP1',
       'A0A7J3D273', 'A0A7J3D5L4', 'A0A7J3DN03', 'A0A7J3DP28',
       'A0A7J3EDJ3', 'A0A7J3F251', 'A0A7J3FBQ5', 'A0A7J3FDF7',
       'A0A7J3HN97', 'A0A7J3IGZ5', 'A0A7J3J9I8', 'A0A7J3KEK0',
       'A0A7J3L113', 'A0A7J3LM54', 'A0A7J3LMD7', 'A0A7J3LMV4',
       'A0A7J3LYS1', 'A0A7J3LZY7', 'A0A7J3M025', 'A0A7J3MJS7',
       'A0A7J3MKZ7', 'A0A7J3NTU9', 'A0A7J3R0D4', 'A0A7J3S225',
       'A0A7J3S6S6', 'A0A7J3S844', 'A0A7J3SCY2', 'A0A7J3T055',
       'A0A7J3UHF7', 'A0A7J3UZ27', 'A0A7J3VP55', 'A0A7J3W5V8',
       'A0A7J3WF30', 'A0A7J3WNP9', 'A0A7J3WYH8', 'A0A7J3XEF3',
       'A0A7J3XVR6', 'A0A7J4B2J3', 'A0A7J4BCC8', 'A0A7J4DPV7',
       'A0A7J4E233', 'A0A7J4E6A5', 'A0A7J4ENF3', 'A0A7J4ES01',
       'A0A7J4EVK9', 'A0A7J4FBU6', 'A0A7J4FTJ9', 'A0A7J4FX44',
       'A0A7J4FZ72', 'A0A7J4FZL4', 'A0A7J4G2B4', 'A0A7J4HT06',
       'A0A7J4I4C6', 'A0A7J4I6P4', 'A0A7J4IKH1', 'A0A7J4ITH3',
       'A0A7J4IY90', 'A0A7J4JQ63', 'A0A7J4JSH2', 'A0A7J4JTF5',
       'A0A7J4JWE7', 'A0A7J4K016', 'A0A7J4K606', 'A0A7J4KBI9',
       'A0A7J4KCJ8', 'A0A7J4KE43', 'A0A7J4KZT2', 'A0A7J4L8L1',
       'A0A7J4LCH7', 'A0A7J4LGI1', 'A0A7J4LW29', 'A0A7J4LWN8',
       'A0A7J4LWV4', 'A0A7J4MTI8', 'A0A7J4MWE9', 'A0A7J4MX65',
       'A0A7J4NXI4', 'A0A7J4P3Y8', 'A0A7J4P5K8', 'A0A7J4PHR2',
       'A0A7J4PUZ3', 'A0A7J4Q7W9', 'A0A7J4QD62', 'A0A7J4R9X9',
       'A0A7J4RYI0', 'A0A7J4RYV1', 'A0A7J4TA12', 'A0A7J4TLK6',
       'A0A7J4TU65', 'A0A7J4UMS1', 'A0A7J4UY01', 'A0A7J9NH84',
       'A0A7J9PDK4', 'A0A7J9PXI0', 'A0A7J9Q924', 'A0A7J9Q9X2',
       'A0A7J9QJ28', 'A0A7J9QW00', 'A0A7J9R6U9', 'A0A7J9TRW8',
       'A0A7K3Y3P2', 'A0A7K3Y5F8', 'A0A7K3Y9A3', 'A0A7K3YB38',
       'A0A7K3YG31', 'A0A7K3YPD3', 'A0A7K3Z1U6', 'A0A7K4A3A1',
       'A0A7K4A3N9', 'A0A7K4A508', 'A0A7K4A692', 'A0A7K4AIW7',
        'A0A7K4AZD2', 'A0A7K4B1S6', 'A0A7K4B307', 'A0A7K4B6S5',
       'A0A7K4B8F7', 'A0A7K4B8P0', 'A0A7K4B8Z6', 'A0A7K4BAC3',
       'A0A7K4BAI2', 'A0A7K4BBF5', 'A0A7K4BC09', 'A0A7K4BDT2',
       'A0A7K4BH03', 'A0A7K4BWS1', 'A0A7K4BYK7', 'A0A7K4CB55',
       'A0A7K4CCL9', 'A0A7K4CHU8', 'A0A7K4CNI2', 'A0A7K4CSF8',
       'A0A7K4CW52', 'A0A7K4D1R2', 'A0A7K4DCK2', 'A0A7K4DFE1',
       'A0A7K4EV71', 'A0A7K4F1Z4', 'A0A7K4F7B9', 'A0A7K4G3Y5',
       'A0A7K4G492', 'A0A7K4GL68', 'A0A7K4GNE9', 'A0A7K4GRM7',
       'A0A7K4H090', 'A0A7K4H321', 'A0A7K4H5L8', 'A0A7K4HDZ6',
       'A0A7K4HFG3', 'A0A7K4HMG9', 'A0A7K4HQP1', 'A0A7K4IFB3',
       'A0A7K4IQD2', 'A0A7K4IRF2', 'A0A7K4MQF1', 'A0A7K4N3A9',
       'A0A7K4N886', 'A0A7K4NAM4', 'A0A7K4NM94', 'A0A7K4NQ33',
       'A0A7L4NT45', 'A0A7L4NTV5', 'A0A7L4NWN8', 'A0A7L4NXW3',
       'A0A7L4NYX3', 'A0A7L4P2S0', 'A0A7L4P3I0', 'A0A7L4P856',
       'A0A7L4PEQ6', 'A0A7L4PEZ0', 'A0A7L4PRV5', 'A0A7L4PY31',
       'A0A7L4Q6N8', 'A0A7L4QU78', 'A0A7L4QVF6', 'A0A7L4QXR9',
       'A0A7L4R391', 'A0A7L4RDY2', 'A0A7L4RF83', 'A0A7L4RHP3',
       'A0A7L4RK32', 'A0A7L4RRN2', 'A0A7L9FEQ8', 'A0A7R9R7H0',
       'A0A7R9R7J8', 'A0A7T5R859', 'A0A7T5UJC6', 'A0A7T5UKB8',
       'A0A7T5ULA2', 'A0A7T5UUU3', 'A0A7T9CDA9', 'A0A7T9DJ97',
       'A0A7T9HFE1', 'A0A7U3CZ78', 'A0A7U3CZR4', 'A0A7Z7AVU9',
       'A0A800UG46', 'A0A812A2A7', 'A0A822IN43', 'A0A830EFW1',
       'A0A830GU53', 'A0A831KBL1', 'A0A831LKL4', 'A0A831NPA5',
       'A0A831PG45', 'A0A831SK31', 'A0A831U201', 'A0A831Z6Y4',
       'A0A832AN65', 'A0A832AUK5', 'A0A832D2M6', 'A0A832D2T3',
       'A0A832ET17', 'A0A832EZM9', 'A0A832F6W9', 'A0A832FSL2',
       'A0A832JAS4', 'A0A832JRX0', 'A0A832JTF9', 'A0A832LA75',
       'A0A832LLS5', 'A0A832LPZ0', 'A0A832LSZ2', 'A0A832MXA9',
       'A0A832NPI6', 'A0A832PEW6', 'A0A832QQI6', 'A0A832QU13',
       'A0A832RJQ6', 'A0A832RLW9', 'A0A832RQ71', 'A0A832RWS0',
       'A0A832RY90', 'A0A832S1I0', 'A0A832S4B2', 'A0A832SG59',
       'A0A832SM56', 'A0A832SVF6', 'A0A832T5V1', 'A0A832T6E5',
       'A0A832TAF1', 'A0A832TKM6', 'A0A832TVW0', 'A0A832TWV0',
       'A0A832U9B3', 'A0A832UCH1', 'A0A832UCP3', 'A0A832UET7',
       'A0A832URP0', 'A0A832V3P7', 'A0A832VFN3', 'A0A832VST1',
       'A0A832VX68', 'A0A832VYG9', 'A0A832W6R4', 'A0A832WNR9',
       'A0A832WUV0', 'A0A832XGK4', 'A0A832XJ09', 'A0A832XJI5',
       'A0A832YR84', 'A0A832YTJ8', 'A0A832Z0E8', 'A0A832Z6S6',
       'A0A832Z7C7', 'A0A832Z9Q5', 'A0A832ZDC5', 'A0A832ZF40',
       'A0A832ZYZ3', 'A0A832ZZC4', 'A0A833A2P9', 'A0A833A3V2',
       'A0A833DVH5', 'A0A833DWB3', 'A0A833DYA9', 'A0A833E3T7',
       'A0A833E5Z0', 'A0A833EET1', 'A0A833EFK1', 'A0A833EGC8',
       'A0A835SKC7', 'A0A835X1X5', 'A0A835X590', 'A0A835XAI5',
       'A0A835XIG8', 'A0A836BGR1', 'A0A836BJZ9', 'A0A836PUZ8',
       'A0A836Q6C1', 'A0A836T1T1', 'A0A838D7C9', 'A0A838NAK9',
       'A0A838U2B4', 'A0A838W3B2', 'A0A838YFD4', 'A0A842K5E7',
       'A0A842K875', 'A0A842KA99', 'A0A842KL34', 'A0A842KTB9',
       'A0A842KTJ1', 'A0A842KUW2', 'A0A842L1P3', 'A0A842L308',
       'A0A842L4F2', 'A0A842LAD9', 'A0A842LDP8', 'A0A842LLN6',
       'A0A842LNQ8', 'A0A842LPL5', 'A0A842LQ77', 'A0A842LVV3',
       'A0A842M328', 'A0A842M3L4', 'A0A842M7P0', 'A0A842MAZ8',
       'A0A842MC87', 'A0A842MHT6', 'A0A842MNR3', 'A0A842MTE5',
       'A0A842N0B6', 'A0A842N0M5', 'A0A842N485', 'A0A842NF52',
       'A0A842NG71', 'A0A842NGP8', 'A0A842NNA2', 'A0A842NND5',
       'A0A842NUH0', 'A0A842NXW9', 'A0A842P956', 'A0A842Q2H2',
       'A0A842QB91', 'A0A842QC45', 'A0A842QF08', 'A0A842QKG6',
       'A0A842QPM6', 'A0A842QX41', 'A0A842R1Y8', 'A0A842R2Z9',
       'A0A842R7N2', 'A0A842R8C4', 'A0A842RB65', 'A0A842RC82',
       'A0A842RI51', 'A0A842RKW7', 'A0A842RQ13', 'A0A842RZC0',
       'A0A842S4G5', 'A0A842S9N4', 'A0A842SGQ0', 'A0A842SHI4',
       'A0A842T2J6', 'A0A842T4H2', 'A0A842TG16', 'A0A842TG18',
       'A0A842TKJ7', 'A0A842TMX1', 'A0A842TNE5', 'A0A842TTS4',
       'A0A842U002', 'A0A842UB21', 'A0A842UBN1', 'A0A842UED8',
       'A0A842UGY6', 'A0A842UVU6', 'A0A842UXH9', 'A0A842UYK1',
       'A0A842V1X2', 'A0A842V2M4', 'A0A842V7W8', 'A0A842VI65',
       'A0A842VJL9', 'A0A842VNJ5', 'A0A842VSX0', 'A0A842VT72',
       'A0A842W658', 'A0A842W676', 'A0A842W7F7', 'A0A842WGA9',
       'A0A842WW32', 'A0A842XCU9', 'A0A842XGG1', 'A0A842XMF0',
       'A0A842XUF6', 'A0A842XW77', 'A0A842Y270', 'A0A842Y9N3',
       'A0A842YMB4', 'A0A843AF87', 'A0A843AIP7', 'A0A843AIQ4',
       'A0A843ANQ7', 'A0A843B8J9', 'A0A843BU05', 'A0A843C1R5',
       'A0A843C5F4', 'A0A843CBR1', 'A0A843CDM7', 'A0A843CH02',
       'A0A843D0Y7', 'A0A843DJZ7', 'A0A843DMM8', 'A0A843DYI8',
       'A0A843E650', 'A0A843ELZ9', 'A0A843ESH5', 'A0A843F2K6',
       'A0A843F774', 'A0A843FGZ6', 'A0A843FL70', 'A0A843FLF9',
       'A0A843FZ61', 'A0A843G7U9', 'A0A843H7P9', 'A0A843HB92',
       'A0A843HDU0', 'A0A843HEV1', 'A0A843I985', 'A0A843IGR4',
       'A0A843IH58', 'A0A843IIA2', 'A0A843INH3', 'A0A843IYG7',
       'A0A843J8S3', 'A0A843J9Q6', 'A0A843JIE0', 'A0A843JNK3',
       'A0A843JV67', 'A0A843K6G8', 'A0A843KDQ8', 'A0A843KDQ9',
       'A0A843KG10', 'A0A843KK86', 'A0A843KRF9', 'A0A843KY10',
       'A0A843KY26', 'A0A843KYM1', 'A0A843L5G5', 'A0A843L8A9',
       'A0A843L8R8', 'A0A843LAC3', 'A0A843LHS4', 'A0A843LI99',
       'A0A843LIQ1', 'A0A843LMM1', 'A0A843LRW8', 'A0A843LT97',
       'A0A843M8X9', 'A0A843ML00', 'A0A846NFA3', 'A0A846NQM8',
       'A0A846NYK1', 'A0A846PH54', 'A0A846PK31', 'A0A846PLB3',
       'A0A846PMQ9', 'A0A846PN12', 'A0A846PVY9', 'A0A846T5R6',
       'A0A847V5Q3', 'A0A847V9T8', 'A0A848EPM0', 'A0A849PH61',
       'A0A849PQK5', 'A0A849PR52', 'A0A849PY48', 'A0A849Q054',
       'A0A849Q925', 'A0A849QDV4', 'A0A849QL36', 'A0A849QMK9',
       'A0A849QVL3', 'A0A849QVW9', 'A0A849QZ48', 'A0A849R2X4',
       'A0A849R562', 'A0A850LSW6', 'A0A850LUG3', 'A0A850LVB5',
       'A0A850LVM2', 'A0A850LWW6', 'A0A850LY41', 'A0A850M118',
       'A0A850M1W7', 'A0A850M3B1', 'A0A850M3C2', 'A0A850M3K8',
       'A0A850M4P4', 'A0A850M5A2', 'A0A850M6B8', 'A0A850MBP8',
       'A0A850MGW9', 'A0A850MMI5', 'A0A850MN19', 'A0A850MN65',
       'A0A850MPT7', 'A0A850MW10', 'A0A850MZ07', 'A0A850SLQ6',
       'A0A8A3S3Y5', 'A0A8A3S742', 'A0A8B2HWL5', 'A0A8B3S2G3', 'A0B552',
       'A1RX27', 'A2SQR2', 'A3CTR7', 'A3CWU7', 'A5UJP0', 'A5ULH1',
       'A5UMN7', 'A6UPZ2', 'A6URZ8', 'A6UUS1', 'A6UWK9', 'A6VGT4',
       'A6VJ21', 'A7I778', 'A8B1G1', 'A8B1G4', 'A8MAF0', 'A9A6R3',
       'A9A9W6', 'B1L604', 'B1L7I0', 'B3T8R2', 'B6YSY3', 'B6YXB0',
       'B7R0Z7', 'B7R276', 'B8GJY4', 'B9ACU5', 'B9AEH7', 'B9AFQ1',
       'C5A1L3', 'C5A417', 'C6A4I7', 'C6A4V1', 'C7DGD3', 'C7P5P3',
       'C7P645', 'C7P7Z5', 'C9RDY4', 'C9RE68', 'C9RGG7', 'D1JH27',
       'D1YVU1', 'D1Z211', 'D2RG36', 'D2RG91', 'D2ZNX1', 'D2ZQ30',
       'D2ZRU2', 'D3DYY6', 'D3DZ31', 'D3E264', 'D3E480', 'D3E4G7',
       'D3RX55', 'D3RZM5', 'D3S4U7', 'D3S6Z4', 'D3S8K2', 'D5E7P3',
       'D5VSU5', 'D7DR65', 'D7DRP5', 'D7EAB7', 'D9PUI4', 'D9PVQ1',
       'D9PX60', 'E1QV35', 'E1RKS0', 'E3GWR6', 'E3GZI5', 'E6N889',
       'F0LI23', 'F0LLL9', 'F0QTG3', 'F0TA21', 'F0TAB1', 'F0TB06',
       'F2KMN7', 'F2KNB3', 'F2KNH7', 'F4BTG3', 'F4HJH2', 'F4HJZ2',
       'F6BAL4', 'F6BE57', 'F6D3V8', 'F6D3Y0', 'F6D6K9', 'F7XPP0',
       'F8AFT9', 'F8AG78', 'F8ALG2', 'F8ALT6', 'F9CUB8', 'G0H212',
       'G0H314', 'G0HNE0', 'G0HNM8', 'G0QA93', 'G0QBX4', 'G0QF84',
       'G0QHW9', 'G7WMC0', 'H0AAQ7', 'H0AER2', 'H1KYR8', 'H1L006',
       'H1L1T6', 'H1Z4J4', 'H3ZPI5', 'H3ZQB5', 'H8I5G8', 'H8IAC6',
       'I3D0X7', 'I3RG04', 'I3RGC2', 'I3ZSD3', 'I3ZVF1', 'I6V1B6',
       'I6V3Z3', 'I7J805', 'I7LMT1', 'J1L208', 'J1L419', 'K0B5J9',
       'K0BEB2', 'K0INA0', 'K2R2P0', 'K2RAZ5', 'K2RQ61', 'K6U3A5',
       'K6U3B8', 'K6U3N2', 'L0HFF2', 'L0HJR8', 'L0KVH1', 'M1Q4J2',
       'N0BAY1', 'N0BCL7', 'N0BIK0', 'N6VQJ2', 'O27731', 'O28779',
       'O29910', 'O59627', 'O74092', 'O74098', 'P19267', 'P48781',
       'P48782', 'P48783', 'P48784', 'P50483', 'P50484', 'P50485',
       'P50486', 'P61881', 'P61882', 'P95669', 'Q0W2G9', 'Q0W3U2',
       'Q12W85', 'Q2FLE1', 'Q2FMI8', 'Q2FMX3', 'Q2FN15', 'Q2FNA8',
       'Q2NG91', 'Q2NGP1', 'Q2NGY6', 'Q2NHP7', 'Q2NHU2', 'Q2Y4Y8',
       'Q46C61', 'Q57632', 'Q58342', 'Q58655', 'Q60264', 'Q6LXK3',
       'Q6M085', 'Q6ZZY7', 'Q6ZZY8', 'Q74MT3', 'Q74NJ9', 'Q8PVY0',
       'Q8TSX3', 'Q9V1F5', 'Q9Y8I1', 'Q9Y8I2', 'R1E4W5', 'R1G8M2',
       'R7PU30', 'R9SI81', 'R9SKF3', 'R9SKV4', 'S5ZK75', 'T2GGT0',
       'T2GHK1', 'T2GJQ5', 'U6EAE9', 'U6EBB8', 'U6EC16', 'V6AUD8',
       'W0I0Z7', 'W0I1Z8', 'W8NVG4', 'W8P2M9', 'W9DS39', 'A0A7C5PA17',
       'A0A831SMI2', 'A0A836REH5', 'A0A523XJ05', 'A0A800BU02',
       'A0A1F3W058', 'A0A0Q1A6C2', 'A0A496UNE2', 'A0A7C5W0E3',
       'A0A1Q7PKL8', 'A0A1G1HBN4', 'A0A7Y5HEN7', 'A0A660TM59',
       'A0A497ALM5', 'A0A2E8TBW0', 'A0A1V5M6U6', 'A0A1V5MHC4',
       'A0A7C1V5Q5', 'A0A7C3TRR2', 'A0A661ARC1', 'A0A7V3RJU3',
       'A0A533RWB3', 'A0A644V2L4']

In [135]:
mapper = ProtMapper()

accessions, failed = mapper.get(
    ids=accessions_uniprot, from_db="UniProtKB_AC-ID", to_db="EMBL-GenBank-DDBJ_CDS"
)
failed

Retrying in 3s
Retrying in 3s
Retrying in 3s
Retrying in 3s
Retrying in 3s
Fetched: 500 / 618
Retrying in 3s
Retrying in 3s
Retrying in 3s
Retrying in 3s
Retrying in 3s
Fetched: 500 / 587
Retrying in 3s
Retrying in 3s
Retrying in 3s
Retrying in 3s
Retrying in 3s
Fetched: 500 / 654
Retrying in 3s
Retrying in 3s
Retrying in 3s
Retrying in 3s
Retrying in 3s
Fetched: 500 / 564
Retrying in 3s
Retrying in 3s
Fetched: 137 / 140


['A0A087RPB5',
 'A0A0M0BHQ3',
 'A0A101DNZ9',
 'A0A101DPW6',
 'A0A101DRU7',
 'A0A124F9F5',
 'A0A1Q9N877',
 'A0A1V5SMG1',
 'A0A1V5SP42',
 'A0A1V5SZX8',
 'A0A256Y5A8',
 'A0A256Y627',
 'A0A256Y9Q1',
 'A0A2A2HH87',
 'A0A2D6V4T4',
 'A0A2D6WWJ0',
 'A0A2D6WXC0',
 'A0A2G4J1G2',
 'A0A2I0PYJ6',
 'A0A2I0Q259',
 'A0A2I0Q3J1',
 'A0A2I0Q5R0',
 'A0A368PFD0',
 'A0A447G5G2',
 'A0A497FB77',
 'A0A497IT48',
 'A0A497IVI6',
 'A0A497SUS5',
 'A0A497T2M2',
 'A0A497T7X0',
 'A0A662N8Z7',
 'A0A662NKM8',
 'A0A662NKV4',
 'A0A662NSF6',
 'A0A662NWU9',
 'A0A662P5G3',
 'A0A662QJ46',
 'A0A6A8RKA3',
 'A0A6B2CCE8',
 'A0A6N0NPE3',
 'A0A6N0NQD7',
 'A0A6N0NSE7',
 'A0A7C2W5X9',
 'A0A7J2H9L4',
 'A0A7J2VXA4',
 'A0A7J3UHF7',
 'A0A7J4E233',
 'A0A7J4KZT2',
 'A0A7K3YPD3',
 'A0A7K4NAM4',
 'A0A7L4Q6N8',
 'A0A832RQ71',
 'A0A832RY90',
 'A0A832S1I0',
 'A0A832YR84',
 'A0A832Z0E8',
 'A0A833DVH5',
 'A0A833DWB3',
 'A0A833EET1',
 'A0A838NAK9',
 'A0A842K5E7',
 'A0A842K875',
 'A0A842KA99',
 'A0A842LAD9',
 'A0A842LDP8',
 'A0A842M328',
 'A0A842M7

In [139]:
accessions.shape

(2490, 2)

accessions - GenBank accessions (для некоторых белков из UniProt может быть найдено несколько GenBank записей, мы будем брать все)

failed - идентификаторы UniProt, которые не были обнаружены. Однако, для них есть запись в UniParc. Брать их или нет??

In [136]:
data_sequence = []
for acc in accessions['To']:
    # print('------------------------------------------------')
    print(acc)
    with Entrez.efetch(
        db="protein", id=acc, rettype="gb", retmode="text"
    ) as handle:
        record = SeqIO.read(handle, "genbank")
    #     print(record)
    # print(record.seq)
    taxonomy_data = get_taxonomy_data(record)
    data_sequence.append({
        "accession": acc,
        "variant": "Nucleosomal",
        "gi": None,
        "ncbi_gene_id": None,
        "hgnc_gene_name": None,
        "taxonomy_id": None,
        "organism": None,
        "phylum": None,
        "class": None,
        "taxonomy_group": None,
        "info": None,
        "sequence": str(record.seq),
        "variant_under_consideration": None,
    })
    data_sequence[-1].update(taxonomy_data)
    # for k, v in data_sequence[-1].items():
    #     print(k, v, type(v))

AIC15241.1
Fetched taxid from NCBI 926571
KCZ72620.1
Fetched taxid from NCBI 1392998
KDE55580.1
Fetched taxid from NCBI 1495314
KDE55212.1
Fetched taxid from NCBI 1495314
AIE90726.1
Fetched taxid from NCBI 1455884
AIF00232.1
Fetched taxid from NCBI 1455999
AIF02366.1
Fetched taxid from NCBI 1456022
AIF04820.1
Fetched taxid from NCBI 1456057
AIF08013.1
Fetched taxid from NCBI 1456107
AIF10516.1
Fetched taxid from NCBI 1456159
AIF10000.1
Fetched taxid from NCBI 1456147
AIF15442.1
Fetched taxid from NCBI 1456251
AIF15492.1
Fetched taxid from NCBI 1456253
AIF68887.1
Fetched taxid from NCBI 1343739
AIF69774.1
Fetched taxid from NCBI 1343739
AIF84426.1
Fetched taxid from NCBI 1459636
AIG97158.1
Fetched taxid from NCBI 1344584
AIG98503.1
Fetched taxid from NCBI 1344584
AIJ06475.1
Fetched taxid from NCBI 1301915
AIJ05317.1
Fetched taxid from NCBI 1301915
AIJ06137.1
Fetched taxid from NCBI 1301915
KEQ56781.1
Fetched taxid from NCBI 1502293
KER06285.1
Fetched taxid from NCBI 1502292
KFM15489.1
F

In [137]:
failed_toadd = []
for ds in data_sequence:
    try:
        cursor.execute(add_sequence, ds)
    except Exception as e:
        print(e)
        print(ds['accession'])
        failed_toadd.append(ds['accession'])

In [138]:
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df['accession'].isin(accessions['To'])]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
1,AAA67720.1,Nucleosomal,,,,2162.0,Methanobacterium formicicum,Methanobacteriota,Methanobacteria,,,MELPIAPIGRIIKNAGAERVSDDAREALAKALEEKGETIATEAVKL...,,,
2,AAA67721.1,Nucleosomal,,,,2162.0,Methanobacterium formicicum,Methanobacteriota,Methanobacteria,,,MAELPIAPVGRIIKNAGAPRVSDDARDALAKVLEEMGEGIAAEAVK...,,,
3,AAA67722.1,Nucleosomal,,,,2162.0,Methanobacterium formicicum,Methanobacteriota,Methanobacteria,,,MAELPIAPVGRIIKNAGAQRISDDAKEALAKALEENGEELAKKAVE...,,,
4,AAA72080.1,Nucleosomal,,,,2180.0,Methanothermus fervidus,Methanobacteriota,Methanobacteria,,,MELPIAPIGRIIKDAGAERVSDDARITLAKILEEMGRDIASEAIKL...,,,
5,AAA73196.1,Nucleosomal,,,,145262.0,Methanothermobacter thermautotrophicus,Methanobacteriota,Methanobacteria,,,MELPIAPIGRIIKNAGAEIVSDDAREALAKVLEAKGEEIAENAVKL...,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,VVC03710.1,Nucleosomal,,,,2885751.0,Candidatus Burarchaeum australiense,Candidatus Micrarchaeota,Candidatus Micrarchaeia,,,MTALDVLDVERIIRKAGADRVSEGAGRMLAEVLEEKATEIAERAVR...,,,
4996,VVC04047.1,Nucleosomal,,,,2885751.0,Candidatus Burarchaeum australiense,Candidatus Micrarchaeota,Candidatus Micrarchaeia,,,MPELPQSAVERLIRKAGAERVSEEASHSLTRILEDMAAGISVKAIR...,,,
4997,VVC04190.1,Nucleosomal,,,,2885751.0,Candidatus Burarchaeum australiense,Candidatus Micrarchaeota,Candidatus Micrarchaeia,,,MAKRSAAGGRHEGRILPRAAVERLIRRAGADRVGASASEALAEVLE...,,,
4998,VVC05766.1,Nucleosomal,,,,115547.0,uncultured archaeon,,,,,MEEVGTSIAKNAVEMAIHAGRKTIKAEYIRLAAKQFSKF,,,


### Add publications

In [140]:
pid = "schwab_histones_2024"
query = f"SELECT * FROM publication WHERE id='{pid}'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,schwab_histones_2024,Histones and histone variant families in proka...,10.1038/s41467-024-52337-y,,2024


In [141]:
failed_toadd_publication = []
for ds in data_sequence:
    try:
        cursor.execute(add_sequence_has_publication, (ds['accession'], pid))
    except Exception as e:
        print(e)
        print(ds['accession'])
        failed_toadd.append(ds['accession'])

In [142]:
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df['accession'].isin(accessions['To'])]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
1,AAA67720.1,Nucleosomal,,,,2162.0,Methanobacterium formicicum,Methanobacteriota,Methanobacteria,,,MELPIAPIGRIIKNAGAERVSDDAREALAKALEEKGETIATEAVKL...,,AAA67720.1,schwab_histones_2024
2,AAA67721.1,Nucleosomal,,,,2162.0,Methanobacterium formicicum,Methanobacteriota,Methanobacteria,,,MAELPIAPVGRIIKNAGAPRVSDDARDALAKVLEEMGEGIAAEAVK...,,AAA67721.1,schwab_histones_2024
3,AAA67722.1,Nucleosomal,,,,2162.0,Methanobacterium formicicum,Methanobacteriota,Methanobacteria,,,MAELPIAPVGRIIKNAGAQRISDDAKEALAKALEENGEELAKKAVE...,,AAA67722.1,schwab_histones_2024
4,AAA72080.1,Nucleosomal,,,,2180.0,Methanothermus fervidus,Methanobacteriota,Methanobacteria,,,MELPIAPIGRIIKDAGAERVSDDARITLAKILEEMGRDIASEAIKL...,,AAA72080.1,schwab_histones_2024
5,AAA73196.1,Nucleosomal,,,,145262.0,Methanothermobacter thermautotrophicus,Methanobacteriota,Methanobacteria,,,MELPIAPIGRIIKNAGAEIVSDDAREALAKVLEAKGEEIAENAVKL...,,AAA73196.1,schwab_histones_2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,VVC03710.1,Nucleosomal,,,,2885751.0,Candidatus Burarchaeum australiense,Candidatus Micrarchaeota,Candidatus Micrarchaeia,,,MTALDVLDVERIIRKAGADRVSEGAGRMLAEVLEEKATEIAERAVR...,,VVC03710.1,schwab_histones_2024
4996,VVC04047.1,Nucleosomal,,,,2885751.0,Candidatus Burarchaeum australiense,Candidatus Micrarchaeota,Candidatus Micrarchaeia,,,MPELPQSAVERLIRKAGAERVSEEASHSLTRILEDMAAGISVKAIR...,,VVC04047.1,schwab_histones_2024
4997,VVC04190.1,Nucleosomal,,,,2885751.0,Candidatus Burarchaeum australiense,Candidatus Micrarchaeota,Candidatus Micrarchaeia,,,MAKRSAAGGRHEGRILPRAAVERLIRRAGADRVGASASEALAEVLE...,,VVC04190.1,schwab_histones_2024
4998,VVC05766.1,Nucleosomal,,,,115547.0,uncultured archaeon,,,,,MEEVGTSIAKNAVEMAIHAGRKTIKAEYIRLAAKQFSKF,,VVC05766.1,schwab_histones_2024


In [143]:
# Make sure data is committed to the database
conn.commit()

# Close connections

In [144]:
cursor.close()
conn.close()
tunnel.stop()