In [27]:
from urllib.error import HTTPError

from io import StringIO
import pandas as pd
from Bio import Entrez, SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from mysql.connector import connection
from sshtunnel import SSHTunnelForwarder

# Contact address set on Bio.Entrez; it applies to the Entrez.efetch calls
# made further down in this notebook.
Entrez.email = "l.singh@intbio.org"

In [2]:
# Read the KEY=VALUE settings file describing the SSH gateway and the database
# host. Blank lines and lines starting with "#" are ignored.
with open("db_curated_server_info.txt", "r") as file:
    lines = file.readlines()

config = {}

for line_number, line in enumerate(lines, start=1):
    line = line.strip()
    if not line or line.startswith("#"):
        continue
    key, separator, value = line.partition("=")
    if not separator:
        # Only the line number is reported so a misplaced secret is not echoed.
        raise ValueError(
            f"db_curated_server_info.txt, line {line_number}: expected KEY=VALUE"
        )
    # Strip the key too, so "db_port = 3306" is stored under "db_port".
    config[key.strip()] = value.strip()

# Both ports are converted with int() below; fail with a readable message
# instead of "int() argument must be ... not 'NoneType'".
missing_ports = [name for name in ("srever_port", "db_port") if not config.get(name)]
if missing_ports:
    raise KeyError(
        "db_curated_server_info.txt is missing: " + ", ".join(missing_ports)
    )

# NOTE: "srever_port" and "db_adress" are misspelled, but they are the key
# names looked up in the settings file and the variable names used by the
# tunnel cell, so they are kept as they are.
server_name = config.get("server_name")
srever_port = int(config.get("srever_port"))
ssh_password = config.get("ssh_password")
ssh_username = config.get("ssh_username")
db_adress = config.get("db_adress")
db_port = int(config.get("db_port"))

In [3]:
# Open an SSH tunnel through the gateway to the database host and print the
# local port on which the forwarded connection is available.
ssh_gateway = (server_name, srever_port)
database_endpoint = (db_adress, db_port)
tunnel = SSHTunnelForwarder(
    ssh_gateway,
    ssh_username=ssh_username,
    ssh_password=ssh_password,
    remote_bind_address=database_endpoint,
)
tunnel.start()
print(tunnel.local_bind_port)

34855


In [4]:
# Connect to MySQL through the local end of the SSH tunnel.
# User, password and schema name are read from the settings file when it
# provides db_user / db_password / db_name. If a key is absent the previous
# literal value is used, so existing setups behave exactly as before.
conn = connection.MySQLConnection(
    user=config.get("db_user", "db_user"),
    password=config.get("db_password", "db_password"),
    host="localhost",
    port=tunnel.local_bind_port,
    database=config.get("db_name", "db_name"),
)
cursor = conn.cursor()

In [5]:
# Sanity check of the connection: list the tables of the selected schema.
query = "SHOW TABLES;"
cursor.execute(query)
cursor.fetchall()

[('alternative_name',),
 ('histone',),
 ('histone_description',),
 ('histone_has_publication',),
 ('publication',),
 ('sequence',),
 ('sequence_has_publication',)]

In [47]:
def _insert_statement(table, columns, named=True):
    """Return an ``INSERT INTO table (cols) VALUES (markers)`` statement.

    With ``named=True`` every column gets a ``%(column)s`` marker, so the
    statement is executed with a dict; with ``named=False`` positional ``%s``
    markers are produced, so it is executed with a tuple.
    """
    column_list = ", ".join(columns)
    if named:
        markers = ", ".join(f"%({column})s" for column in columns)
    else:
        markers = ", ".join("%s" for _ in columns)
    return f"INSERT INTO {table} ({column_list}) VALUES ({markers})"


add_histone = _insert_statement(
    "histone",
    ["id", "level", "taxonomic_span", "taxonomic_span_id", "description", "parent"],
)
add_publication = _insert_statement(
    "publication",
    ["id", "title", "doi", "author", "year"],
)
add_sequence = _insert_statement(
    "sequence",
    [
        "accession",
        "variant",
        "gi",
        "ncbi_gene_id",
        "hgnc_gene_name",
        "taxonomy_id",
        "organism",
        "phylum",
        "class",
        "taxonomy_group",
        "info",
        "sequence",
        "variant_under_consideration",
    ],
)
add_sequence_has_publication = _insert_statement(
    "sequence_has_publication",
    ["sequence_accession", "publication_id"],
    named=False,
)

# Statements for the remaining tables are not used in this notebook; they are
# kept commented out for reference.
# add_histone_description = (
#     "INSERT INTO histone_description "
#     "(summary, taxonomy, genes, evolution, expression, knock_out, function, sequence, localization, deposition, structure, interactions, disease, caveats) "
#     "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
# )
# add_alternate_names = (
#     "INSERT INTO alternative_name "
#     "(name, taxonomy, gene, splice, histone) "
#     "VALUES (%(name)s, %(taxonomy)s, %(gene)s, %(splice)s, %(histone)s)"
# )
# add_histone_has_publication = (
#     "INSERT INTO histone_has_publication "
#     "(histone_id, publication_id) "
#     "VALUES (%s, %s)"
# )

In [7]:
def _taxid_from_record(record):
    """Return the id from a ``taxon:<id>`` cross-reference, or ``None``.

    Only the ``db_xref`` qualifiers of the record's first feature are read.
    Entries that do not look like ``<db>:<value>`` are skipped; if several
    ``taxon`` entries are present the last one wins.
    """
    import re

    taxid = None
    for xref in record.features[0].qualifiers["db_xref"]:
        match = re.search(r"(\S+):(\S+)", xref)
        if match is None or match.group(1) != "taxon":
            continue
        print(f"Fetched taxid from NCBI {match.group(2)}")
        taxid = int(match.group(2))
    return taxid


def _fetch_lineage(taxid, max_attempts):
    """Return ``{rank: scientific name}`` for the class and phylum of ``taxid``.

    The taxonomy record is requested through ``Entrez.efetch``; a failing
    request is retried up to ``max_attempts`` times. An empty dict is returned
    when every attempt failed.
    """
    for attempt in range(max_attempts):
        try:
            # The handle is closed as soon as the record has been parsed.
            with Entrez.efetch(id=taxid, db="taxonomy", retmode="xml") as handle:
                tax_data = Entrez.read(handle)
            return {
                node["Rank"]: node["ScientificName"]
                for node in tax_data[0]["LineageEx"]
                if node["Rank"] in ["class", "phylum"]
            }
        except Exception as err:  # not a bare except: Ctrl-C must still interrupt
            print(
                "Unexpected error: {}, Retrying, attempt {}".format(type(err), attempt)
            )
    print(
        f"FATAL ERROR could not get class and phylum from NCBI after {max_attempts} attempts for taxid:{taxid}. Will add None for class and phylum!"
    )
    return {}


def get_taxonomy_data(record, max_attempts=10):
    """Collect organism, taxonomy id, phylum and class for a sequence record.

    Parameters
    ----------
    record
        Object exposing ``annotations["organism"]`` and
        ``features[0].qualifiers["db_xref"]`` (the notebook passes records
        read with ``SeqIO.read(handle, "genbank")``).
    max_attempts : int
        How many times the taxonomy request is tried before giving up.

    Returns
    -------
    dict
        Keys ``organism``, ``taxonomy_id``, ``phylum`` and ``class``.
        ``taxonomy_id`` falls back to 1 when the record carries no usable
        ``taxon`` cross-reference; ``phylum`` and ``class`` are ``None`` when
        they are absent from the lineage or the lineage could not be fetched.
    """
    taxonomy_data = {"organism": record.annotations["organism"]}

    try:
        taxid = _taxid_from_record(record)
    except (AttributeError, IndexError, KeyError, TypeError, ValueError):
        taxid = None
    if taxid is None:
        # Also covers a record whose db_xref list simply has no taxon entry;
        # previously that case left "taxonomy_id" unset and crashed later.
        print("!!!!!!Unable to get TAXID for this record setting it to 1")
        taxid = 1  # unable to identify
    taxonomy_data["taxonomy_id"] = taxid

    lineage = _fetch_lineage(taxid, max_attempts)
    for rank in ("phylum", "class"):
        name = lineage.get(rank)
        # Store plain str values (or None) under both keys.
        taxonomy_data[rank] = None if name is None else str(name)
    return taxonomy_data

# Add other sequences as H1-like_(Viruses)

These sequences are taken from this [article](https://www.nature.com/articles/s41467-024-53364-5#Sec28).

pid='toner_characterization_2024'

**Последовательности добавлены только с рисунка Figure S1 (кроме A. castellanii H1.3, L8GITG8), остальные необходимо найти**

In [9]:
# Protein accessions to fetch and store as H1-like_(Viruses).
# The organism notes are taken from the records as they appear in the
# verification query output after insertion.
accessions = [
    "BBI30246.1",  # Acanthamoeba castellanii medusavirus J1
    "QPB44292.1",  # Medusavirus stheno T3
    "QYA18369.1",  # Clandestinovirus
    "ELR15828.1",  # Acanthamoeba castellanii str. Neff
    "ELR13465.1",  # Acanthamoeba castellanii str. Neff
]

In [10]:
# Before inserting: load all sequence rows with their publication links and
# show the ones whose accession is in `accessions` (expected: none yet).
query = "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp ON s.accession = sp.sequence_accession "
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(fetched_rows, columns=column_names)
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id


In [11]:
# Download the GenBank record of every accession and turn it into a row for
# the `sequence` table. Fields that are not known stay None; organism,
# taxonomy id, phylum and class come from get_taxonomy_data().
blank_row = dict.fromkeys(
    (
        "accession",
        "variant",
        "gi",
        "ncbi_gene_id",
        "hgnc_gene_name",
        "taxonomy_id",
        "organism",
        "phylum",
        "class",
        "taxonomy_group",
        "info",
        "sequence",
        "variant_under_consideration",
    )
)
data_sequence = []
for acc in accessions:
    print("------------------------------------------------")
    print(acc)
    try:
        with Entrez.efetch(
            db="protein", id=acc, rettype="gb", retmode="text"
        ) as handle:
            record = SeqIO.read(handle, "genbank")
        taxonomy_data = get_taxonomy_data(record)
        row = dict(
            blank_row,
            accession=record.id,
            variant="H1-like_(Viruses)",
            sequence=str(record.seq),
        )
        row.update(taxonomy_data)
        data_sequence.append(row)
    except HTTPError as err:
        # Only a 400 response is tolerated: it is printed and the accession
        # is skipped. Any other HTTP error is re-raised.
        if err.code == 400:
            print(err)
        else:
            raise

------------------------------------------------
BBI30246.1
Fetched taxid from NCBI 3114988
------------------------------------------------
QPB44292.1
Fetched taxid from NCBI 3069717
------------------------------------------------
QYA18369.1
Fetched taxid from NCBI 2831644
------------------------------------------------
ELR15828.1
Fetched taxid from NCBI 1257118
------------------------------------------------
ELR13465.1
Fetched taxid from NCBI 1257118


In [12]:
# Number of rows prepared by the fetch loop above.
len(data_sequence)

5

In [13]:
# Show field, value and Python type of the second prepared row.
for field_name, field_value in data_sequence[1].items():
    print(field_name, field_value, type(field_value))

accession QPB44292.1 <class 'str'>
variant H1-like_(Viruses) <class 'str'>
gi None <class 'NoneType'>
ncbi_gene_id None <class 'NoneType'>
hgnc_gene_name None <class 'NoneType'>
taxonomy_id 3069717 <class 'int'>
organism Medusavirus stheno T3 <class 'str'>
phylum Nucleocytoviricota <class 'str'>
class Megaviricetes <class 'str'>
taxonomy_group None <class 'NoneType'>
info None <class 'NoneType'>
sequence MPRNSTYGPLVIAALKHLNEPTGSTADAIIECVNRLFHDKISASYKRAVVRAIKKGLENGDLARSGRRYRLLPPGGAIDPPPRELTFEEELIMIGCHGDGMFPDHLARDFVDYMDDDELEEAKKWLMDQGYLKLGSCPDGFEVYKWTSKARKTYCYVPGDEYSEASIDCFLL <class 'str'>
variant_under_consideration None <class 'NoneType'>


In [15]:
# Insert the prepared rows. A row whose accession is not in `accessions` is
# skipped; a failing insert is reported together with the row's content and
# its accession is collected in `failed_toadd`.
failed_toadd = []
for ds in data_sequence:
    if ds["accession"] in accessions:
        try:
            cursor.execute(add_sequence, ds)
        except Exception as err:
            print(ds["accession"])
            print(err)
            for field_name, field_value in ds.items():
                print(field_name, field_value, type(field_value))
            failed_toadd.append(ds["accession"])

In [16]:
# After inserting: the same look-up should now list the new accessions, still
# without publication links.
query = "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp ON s.accession = sp.sequence_accession "
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(fetched_rows, columns=column_names)
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
665,BBI30246.1,H1-like_(Viruses),,,,3114988.0,Acanthamoeba castellanii medusavirus J1,Nucleocytoviricota,Megaviricetes,,,MPKNIYAPLIARALKHLDDPAGSTAEAIADCVNRLFHDEITAGYRR...,,,
869,ELR13465.1,H1-like_(Viruses),,,,1257118.0,Acanthamoeba castellanii str. Neff,Discosea,,,,MPREPIPRRAKEVKREPTDYVSLIQPAFQPSQVVGHRKKEEEAKKE...,,,
870,ELR15828.1,H1-like_(Viruses),,,,1257118.0,Acanthamoeba castellanii str. Neff,Discosea,,,,MPKVTHASMVKDALKDLNEPTGSGVIAIIERVAKLYEGKLTSTYER...,,,
4271,QPB44292.1,H1-like_(Viruses),,,,3069717.0,Medusavirus stheno T3,Nucleocytoviricota,Megaviricetes,,,MPRNSTYGPLVIAALKHLNEPTGSTADAIIECVNRLFHDKISASYK...,,,
4304,QYA18369.1,H1-like_(Viruses),,,,2831644.0,Clandestinovirus,,,,,MEGDGNNSVPIETKKNVHPSYKRMVTLAIQSKDEKGASLKTIYGFL...,,,


In [17]:
# Check whether the publication is already registered.
# The id is passed as a bound parameter rather than formatted into the SQL
# text, like the INSERT statements of this notebook.
pid = 'toner_characterization_2024'
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year


In [18]:
# Register the publication the sequences above are taken from.
data_publication = dict(
    id=pid,
    title='Characterization of Medusavirus encoded histones reveals nucleosome-like structures and a unique linker histone',
    doi='10.1038/s41467-024-53364-5',
    author=None,
    year='2024',
)
cursor.execute(add_publication, data_publication)

In [19]:
# Confirm that the publication row is now present.
# The id is passed as a bound parameter rather than formatted into the SQL text.
pid = 'toner_characterization_2024'
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,toner_characterization_2024,Characterization of Medusavirus encoded histon...,10.1038/s41467-024-53364-5,,2024


In [20]:
# Link every accession to the publication `pid`. A failing insert is reported
# with its reason and the accession is collected in `failed_toadd_publication`.
failed_toadd_publication = []
for nex_acc in accessions:
    try:
        cursor.execute(add_sequence_has_publication, (nex_acc, pid))
    except Exception as err:
        # `Exception` instead of a bare except, so an interrupt is not
        # swallowed, and the cause is printed like in the sequence loop.
        print(nex_acc)
        print(err)
        failed_toadd_publication.append(nex_acc)

In [21]:
# After linking: the same look-up should now show the publication id next to
# every new accession.
query = "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp ON s.accession = sp.sequence_accession "
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(fetched_rows, columns=column_names)
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
665,BBI30246.1,H1-like_(Viruses),,,,3114988.0,Acanthamoeba castellanii medusavirus J1,Nucleocytoviricota,Megaviricetes,,,MPKNIYAPLIARALKHLDDPAGSTAEAIADCVNRLFHDEITAGYRR...,,BBI30246.1,toner_characterization_2024
869,ELR13465.1,H1-like_(Viruses),,,,1257118.0,Acanthamoeba castellanii str. Neff,Discosea,,,,MPREPIPRRAKEVKREPTDYVSLIQPAFQPSQVVGHRKKEEEAKKE...,,ELR13465.1,toner_characterization_2024
870,ELR15828.1,H1-like_(Viruses),,,,1257118.0,Acanthamoeba castellanii str. Neff,Discosea,,,,MPKVTHASMVKDALKDLNEPTGSGVIAIIERVAKLYEGKLTSTYER...,,ELR15828.1,toner_characterization_2024
4271,QPB44292.1,H1-like_(Viruses),,,,3069717.0,Medusavirus stheno T3,Nucleocytoviricota,Megaviricetes,,,MPRNSTYGPLVIAALKHLNEPTGSTADAIIECVNRLFHDKISASYK...,,QPB44292.1,toner_characterization_2024
4304,QYA18369.1,H1-like_(Viruses),,,,2831644.0,Clandestinovirus,,,,,MEGDGNNSVPIETKKNVHPSYKRMVTLAIQSKDEKGASLKTIYGFL...,,QYA18369.1,toner_characterization_2024


In [22]:
# Commit the statements executed so far on this connection (sequence rows,
# publication row and their links) so they are stored in the database.
conn.commit()

# Add other viral sequences as H1-like_(Viruses)

These sequences are taken from this [article](https://www.nature.com/articles/s41564-024-01707-9#data-availability).

pid='irwin_self-assembling_2024'

**Идентификаторы accession не искала, необходимо найти!!!**

**Возможно, добавлены не все последовательности (?)**

In [25]:
# Prefix of the locally generated accessions: the rows built below are named
# f'{accession_prefix}_{i}', with i the position of the record in the FASTA text.
accession_prefix = 'HISTDB_H1_like'

In [30]:
fasta_str = '''>M-3300001346-3.JGI20151J14362_10003186_8
MDNICLLGLGLLGLIYFSSKNNNILTSTDSLSKTGEIVYVDSKDQMDVILHNLMSNKFMNVNNSVVVWEVPLDEIFNHELHPFVGFDYFKIDNGNDNMTTVGRYIEISDEFKETVNKFIPDLEYCHVLKFNDGRNMTLIKITDKLIPLSAIKLNFIENINKNLSFCKLKSSYNKAIKNYSCKNIIDPAIDWFNVDLFSVDSSFYKELCDSEQTSLLETSQIMETDMNSDTSDDSMNDEQTSDNDVDDIKTEDLSDTSMNDEDGQILDSDVNMEDMDLSKTSMDDNNDNNDNNDNNDNSVNNQLMLSDSLELSNTSNNNLDINNNLSNTSFDLSNEDSTESRSPKKKASPKKKTSAKKTSAKKTSAKKPSAKKTSAKKPKKPKKPKKPKKPKKVKKGNTKLLIEEASLEGGSSSSLYNASKSILNRF
>M-3300009436-18.Ga0115008_10005292_2
MSSYLEKIAAAIIELKNRKGSSLNAIKTFLHADSSESRFINSALKKGVKSGALVKIDGLYKIKQTKALAKRRSIMVPGRFDIRCELDKFYEEVNAMQQHQQQQQFGQQQQWQQQHQQHQAPPASTFGFGGGRRKRRSSKLKRRSRRSSKLKRRSRRSSKLKRRNN
>M-3300017963-43.Ga0180437_10000057_151
MPTYQEMVVEAIKSMKRGRKGLSRVAIARWLQGNYTLNQKAFQRSLRDALKKGVAEGVFETTTGYSFRLSRDHSRKVRRRVRLKPEFGHLCSVCKRRRAQSWRT
>M-3300023174-207.Ga0214921_10000841_30
MTSLLDIFKIKNLDIVLNDNSKIKVFDKDIHNLYEDSNGEKKEFKISEEENTVKLTRPYDKQIKKRKIYEVPLMDDDLEIVFNRTREQNYELLKSLDYNEFEITSSQVPNDIYRIPYMSKGETKGYAVLVNRTEDAFLLDLNSLFVVKPLSSHINCLTSNTRYIDPQTPKKEKVVEVPQAPRKKTSTSSSFYSSKKLDFEEKEVASVDIPTGHIVTFIYKGDEKRVLVKETNDKYTEGICQTDNKYKKYLTRYIEKVNKVEDASSEDEDNLSEEPQPTYKNMILNTIKTCYVNGTRGLSRQALQAYIMRNYNIKIDNFHKHFIMTLKRLVETGCIIQTKQKFKLGDEGRKYLKEQKKPKNTIKYDNNVVQEGPIQDAIDNEKILDIMYDGGSIPDIKRPIRPKRVYKASNGNLILQATCLIDDKVKNFSLDKVKVIA
>M-3300023179-162.Ga0214923_10005706_6
MVRLTYLQMVVMAVNRLKVRGTKGVSRQAIKKYIANYYGRMINANISEPTFNKYVSNGINRGVERGILVQNKQSFKLGENGLREYRNIQKINGNILCKMPGARKTIHPKAMVRVRHHIRVNRTTGTRTVVASHIRKF
>S-1091232-186.1091232_contig_1680_30
MVKVDDVVAVIRKLQAPGSTKGVSRVAIKAALSITSVTPAQINLALRRGVANGKLTQIKDSFKVPTPKSKPAPRKKAPAKKKAAPKKKKSEIK
>S-ERX555917-73.ERX555917_contig_113_83
MSALRRISRRTSPIRRTSRRRRSISGGRKSDKVPNKPSYKDQIKAALLALKETRTGSSLPAIKKFLGASPAQYRFINAALRSGVASGFFIKNKGKYKLSPEAKKGPKKKKKKKKKKKKKALLKKCSDNKIRSPRGTRCRPGPRTPCRDPEKHLKRRKLSQRKTKHGGKVYECVKRKRRKSKK
'''
# Parse the FASTA text held in `fasta_str` into a list of SeqRecord objects;
# the in-memory stream is closed when the `with` block ends.
with StringIO(fasta_str) as fasta_io:
    seq_records = [parsed for parsed in SeqIO.parse(fasta_io, "fasta")]
seq_records

[SeqRecord(seq=Seq('MDNICLLGLGLLGLIYFSSKNNNILTSTDSLSKTGEIVYVDSKDQMDVILHNLM...NRF'), id='M-3300001346-3.JGI20151J14362_10003186_8', name='M-3300001346-3.JGI20151J14362_10003186_8', description='M-3300001346-3.JGI20151J14362_10003186_8', dbxrefs=[]),
 SeqRecord(seq=Seq('MSSYLEKIAAAIIELKNRKGSSLNAIKTFLHADSSESRFINSALKKGVKSGALV...RNN'), id='M-3300009436-18.Ga0115008_10005292_2', name='M-3300009436-18.Ga0115008_10005292_2', description='M-3300009436-18.Ga0115008_10005292_2', dbxrefs=[]),
 SeqRecord(seq=Seq('MPTYQEMVVEAIKSMKRGRKGLSRVAIARWLQGNYTLNQKAFQRSLRDALKKGV...WRT'), id='M-3300017963-43.Ga0180437_10000057_151', name='M-3300017963-43.Ga0180437_10000057_151', description='M-3300017963-43.Ga0180437_10000057_151', dbxrefs=[]),
 SeqRecord(seq=Seq('MTSLLDIFKIKNLDIVLNDNSKIKVFDKDIHNLYEDSNGEKKEFKISEEENTVK...VIA'), id='M-3300023174-207.Ga0214921_10000841_30', name='M-3300023174-207.Ga0214921_10000841_30', description='M-3300023174-207.Ga0214921_10000841_30', dbxrefs=[]),
 SeqRecord(seq=Seq('MVRLTYLQ

In [37]:
# Build one `sequence` row per parsed FASTA record. The accession is generated
# from `accession_prefix` and the record's position; the FASTA id is only
# printed. All other descriptive fields stay None.
blank_row = dict.fromkeys(
    (
        "accession",
        "variant",
        "gi",
        "ncbi_gene_id",
        "hgnc_gene_name",
        "taxonomy_id",
        "organism",
        "phylum",
        "class",
        "taxonomy_group",
        "info",
        "sequence",
        "variant_under_consideration",
    )
)
data_sequence = []
for i, record in enumerate(seq_records):
    print("------------------------------------------------")
    print(record.id)
    data_sequence.append(
        dict(
            blank_row,
            accession=f'{accession_prefix}_{i}',
            variant="H1-like_(Viruses)",
            sequence=str(record.seq),
        )
    )

------------------------------------------------
M-3300001346-3.JGI20151J14362_10003186_8
------------------------------------------------
M-3300009436-18.Ga0115008_10005292_2
------------------------------------------------
M-3300017963-43.Ga0180437_10000057_151
------------------------------------------------
M-3300023174-207.Ga0214921_10000841_30
------------------------------------------------
M-3300023179-162.Ga0214923_10005706_6
------------------------------------------------
S-1091232-186.1091232_contig_1680_30
------------------------------------------------
S-ERX555917-73.ERX555917_contig_113_83


In [38]:
# Show field, value and Python type of the second prepared row.
for field_name, field_value in data_sequence[1].items():
    print(field_name, field_value, type(field_value))

accession HISTDB_H1_like_1 <class 'str'>
variant H1-like_(Viruses) <class 'str'>
gi None <class 'NoneType'>
ncbi_gene_id None <class 'NoneType'>
hgnc_gene_name None <class 'NoneType'>
taxonomy_id None <class 'NoneType'>
organism None <class 'NoneType'>
phylum None <class 'NoneType'>
class None <class 'NoneType'>
taxonomy_group None <class 'NoneType'>
info None <class 'NoneType'>
sequence MSSYLEKIAAAIIELKNRKGSSLNAIKTFLHADSSESRFINSALKKGVKSGALVKIDGLYKIKQTKALAKRRSIMVPGRFDIRCELDKFYEEVNAMQQHQQQQQFGQQQQWQQQHQQHQAPPASTFGFGGGRRKRRSSKLKRRSRRSSKLKRRSRRSSKLKRRNN <class 'str'>
variant_under_consideration None <class 'NoneType'>


In [39]:
# Insert every prepared row. A failing insert is reported together with the
# row's content and its accession is collected in `failed_toadd`.
failed_toadd = []
for ds in data_sequence:
    try:
        cursor.execute(add_sequence, ds)
    except Exception as err:
        print(ds["accession"])
        print(err)
        for field_name, field_value in ds.items():
            print(field_name, field_value, type(field_value))
        failed_toadd.append(ds["accession"])

In [40]:
# Load the H1-like_(Viruses) rows with their publication links and show the
# ones whose accession starts with `accession_prefix`.
query = (
    "SELECT * FROM sequence s "
    "LEFT JOIN sequence_has_publication sp ON s.accession = sp.sequence_accession "
    "WHERE s.variant = 'H1-like_(Viruses)' "
)
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(fetched_rows, columns=column_names)
df.loc[df["accession"].str.startswith(accession_prefix)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
3,HISTDB_H1_like_0,H1-like_(Viruses),,,,,,,,,,MDNICLLGLGLLGLIYFSSKNNNILTSTDSLSKTGEIVYVDSKDQM...,,,
4,HISTDB_H1_like_1,H1-like_(Viruses),,,,,,,,,,MSSYLEKIAAAIIELKNRKGSSLNAIKTFLHADSSESRFINSALKK...,,,
5,HISTDB_H1_like_2,H1-like_(Viruses),,,,,,,,,,MPTYQEMVVEAIKSMKRGRKGLSRVAIARWLQGNYTLNQKAFQRSL...,,,
6,HISTDB_H1_like_3,H1-like_(Viruses),,,,,,,,,,MTSLLDIFKIKNLDIVLNDNSKIKVFDKDIHNLYEDSNGEKKEFKI...,,,
7,HISTDB_H1_like_4,H1-like_(Viruses),,,,,,,,,,MVRLTYLQMVVMAVNRLKVRGTKGVSRQAIKKYIANYYGRMINANI...,,,
8,HISTDB_H1_like_5,H1-like_(Viruses),,,,,,,,,,MVKVDDVVAVIRKLQAPGSTKGVSRVAIKAALSITSVTPAQINLAL...,,,
9,HISTDB_H1_like_6,H1-like_(Viruses),,,,,,,,,,MSALRRISRRTSPIRRTSRRRRSISGGRKSDKVPNKPSYKDQIKAA...,,,


In [41]:
# Check whether the publication is already registered.
# The id is passed as a bound parameter rather than formatted into the SQL
# text, like the INSERT statements of this notebook.
pid = 'irwin_self-assembling_2024'
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year


In [42]:
# Register the publication the sequences above are taken from.
data_publication = dict(
    id=pid,
    title='Self-assembling viral histones are evolutionary intermediates between archaeal and eukaryotic nucleosomes',
    doi='10.1038/s41564-024-01707-9',
    author=None,
    year='2024',
)
cursor.execute(add_publication, data_publication)

In [43]:
# Confirm that the publication row is now present.
# The id is passed as a bound parameter rather than formatted into the SQL text.
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,irwin_self-assembling_2024,Self-assembling viral histones are evolutionar...,10.1038/s41564-024-01707-9,,2024


In [44]:
# Link every prepared row to the publication `pid`. A failing insert is
# reported with its reason and the accession is collected in
# `failed_toadd_publication`.
failed_toadd_publication = []
for ds in data_sequence:
    try:
        cursor.execute(add_sequence_has_publication, (ds["accession"], pid))
    except Exception as err:
        # `Exception` instead of a bare except, so an interrupt is not
        # swallowed, and the cause is printed like in the sequence loop.
        print(ds["accession"])
        print(err)
        failed_toadd_publication.append(ds["accession"])

In [45]:
# After linking: the generated accessions should now carry the publication id.
query = (
    "SELECT * FROM sequence s "
    "LEFT JOIN sequence_has_publication sp ON s.accession = sp.sequence_accession "
    "WHERE s.variant = 'H1-like_(Viruses)' "
)
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(fetched_rows, columns=column_names)
df.loc[df["accession"].str.startswith(accession_prefix)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
3,HISTDB_H1_like_0,H1-like_(Viruses),,,,,,,,,,MDNICLLGLGLLGLIYFSSKNNNILTSTDSLSKTGEIVYVDSKDQM...,,HISTDB_H1_like_0,irwin_self-assembling_2024
4,HISTDB_H1_like_1,H1-like_(Viruses),,,,,,,,,,MSSYLEKIAAAIIELKNRKGSSLNAIKTFLHADSSESRFINSALKK...,,HISTDB_H1_like_1,irwin_self-assembling_2024
5,HISTDB_H1_like_2,H1-like_(Viruses),,,,,,,,,,MPTYQEMVVEAIKSMKRGRKGLSRVAIARWLQGNYTLNQKAFQRSL...,,HISTDB_H1_like_2,irwin_self-assembling_2024
6,HISTDB_H1_like_3,H1-like_(Viruses),,,,,,,,,,MTSLLDIFKIKNLDIVLNDNSKIKVFDKDIHNLYEDSNGEKKEFKI...,,HISTDB_H1_like_3,irwin_self-assembling_2024
7,HISTDB_H1_like_4,H1-like_(Viruses),,,,,,,,,,MVRLTYLQMVVMAVNRLKVRGTKGVSRQAIKKYIANYYGRMINANI...,,HISTDB_H1_like_4,irwin_self-assembling_2024
8,HISTDB_H1_like_5,H1-like_(Viruses),,,,,,,,,,MVKVDDVVAVIRKLQAPGSTKGVSRVAIKAALSITSVTPAQINLAL...,,HISTDB_H1_like_5,irwin_self-assembling_2024
9,HISTDB_H1_like_6,H1-like_(Viruses),,,,,,,,,,MSALRRISRRTSPIRRTSRRRRSISGGRKSDKVPNKPSYKDQIKAA...,,HISTDB_H1_like_6,irwin_self-assembling_2024


In [46]:
# Commit the statements executed since the previous commit (generated sequence
# rows, publication row and their links) so they are stored in the database.
conn.commit()

# Add H2A-like_(Viruses), H2B-like_(Viruses), H3-like_(Viruses), H4-like_(Viruses)

In [48]:
# Create one `histone` row per new viral variant (H2A, H2B, H3, H4) and
# insert them.
data_histone = []
for htype in ['H2A', 'H2B', 'H3', 'H4']:
    data_histone.append(
        dict(
            id=f"{htype}-like_(Viruses)",
            level="variant",
            taxonomic_span="Viruses",
            taxonomic_span_id="10239",
            description=None,
            parent="Singlet",
        )
    )
for dh in data_histone:
    cursor.execute(add_histone, dh)

In [49]:
# Read the `histone` table back and show the four variants just added.
query = "SELECT * FROM histone"
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
histone_df = pd.DataFrame(fetched_rows, columns=column_names)
new_variant_ids = [f"{htype}-like_(Viruses)" for htype in ['H2A', 'H2B', 'H3', 'H4']]
histone_df.loc[histone_df["id"].isin(new_variant_ids)]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
107,H2A-like_(Viruses),variant,Viruses,10239,,Singlet
153,H2B-like_(Viruses),variant,Viruses,10239,,Singlet
168,H3-like_(Viruses),variant,Viruses,10239,,Singlet
196,H4-like_(Viruses),variant,Viruses,10239,,Singlet


In [50]:
# Commit the new histone variant rows so they are stored in the database.
conn.commit()

# Add other viral sequences as H2A-like_(Viruses)

These sequences are taken from this [article](https://www.nature.com/articles/s41564-024-01707-9#data-availability).

pid='irwin_self-assembling_2024'

**Идентификаторы accession не искала, необходимо найти!!!**

**Возможно, добавлены не все последовательности (?)**

In [51]:
# Prefix of the locally generated accessions for this batch: the rows built
# below are named f'{accession_prefix}_{i}'.
accession_prefix = 'HISTDB_H2A_like'

In [52]:
fasta_str = '''>M-3300017989-17.Ga0180432_10000436_27
MAGTSTEKKNVSRSKRAGLNFPVGRINRMMRNNRYAERIGAGAPVYMASVLEYLTAEILELAGNAAADNKKTRITPRFLNLAIRSDEELNRLLNGVQISAGGVLPHINASLLPKTKKTVEATAE
>Mama.Mama_1_192
MDRVGKYGLFIKRISPKDADITKESLETVNNMLVFLAEKLTKQANIIIDQKTLRHDAFLWLLTDIQGELGKHSQDFANSVLYGEKELVFPTKRTENLMRKNTCLRISRSAVKTLTAILEYFCGQIMEASFSQAKKSKRKRIRPIDIEAAISQDKELHSMFGKGVISGR
>Me06.Me06_1_195
MDRVGKYGLFIKRISPKDADITKESLETVNNMLVFLAEKLTKQANIIIDQKTLRHDAFLWLLTDIQGELGKHSQDFANSVLYGEKELVFPTKRTENLMRKNTCLRISQSAVKTLTAILEYFCGQIMEASFSQAKKSKRKRIRPIDIEAAISQDKELHSMFGKGVISGR
>Medus.AP018495_BBI30458_1_318
MEIDSHVQPAEVLAAASESMQLEEQTQLPASAAGEALEELQTEGKKTSPAKKRTSSGKNVKRANERAGLKLPPGRIQKIIKANQTTDVGRSSPTASVFLTAVIEDIVKEIIKGADKKSEERGRIRISPQDILKYLTENGEAYMHILGDAFVSHGGVGQVAEMAAAAANTGIKKRKRAASTAEGAPKKKIAKKAAAKKATGAKKVVKKKSGSTKSKTTGKSVTKKASSRKVASA
>Tufo.Tufo_1_56
MERIGNYTLFVKRMTPEGFEVTKEAVSQINNMLVFLADKTITKALILLGDKKTLKHDILFWLLRDIPGELGKHGRDYVDSVLYANKELVFPTKRTENLIRKKSCKRVGKSSVQALTAILEYFCREILVSSAREAKRESRKRIKVLDIQKAVKKDMELSQVFGSGVFSGR
>M-3300009182-70.Ga0114959_10000291_17
MNTFRKTTLSDESDNGVRININRECPPGYYKKPNGKCYKIFFGDEDDEEPTPEKNHVSPLSLDGKPSSNHHRPPTIRRIADVLTKPNNRCKKGTRKNKDGICVPKNTETITNHDKSSNDNDTFAYDDKQSTKNPRRPTKRHISSVVTKPRTQKKKPVAVTTISGEDVSSNVITLEEVTNKTHTLLEMNSSIKVSKPTILHLFNLIKKYNPDELRQRNEDNSIMLPFNTYDNRQILLYLFNEIMDASINRTRDSKKKMVTIQIIQSVIENDAQLNELLMPLR
>M-3300023174-73.Ga0214921_10003059_5
MDPLVKKRCPNGTKKNKSGDCIKKDATSSKPKPQRTRKNLRAKKPDADPNKITLEEATQETQTLLAANPSIKVSKPTILHLFHFINKLNPAQLRVRNEEAILPLENYPDDKGVLLYLFREIMDLSINRTRDSKKKTVSVKIIEQVIQNDDQLRILFTA
>S-1016713-165.1016713_contig_440_11
MTPTFKFLHYVRLIVRRLTVGGELPQVQLSATFCKKIEEILFVVLMHFMGSIARTIRHNNRKTVDVADVQLGIYTLFPLHLRALVAKIVDEKIKNFETTTTGSHVDRANLTVPPSKCRRIFKSTYDDLRLSKGSPVAIAAMIEVVVTETMSLALKKSQEMQIQRMGSQHLDEILDNNLELSAFVNSAYAWVGVVSRV
>S-1017247-24.1017247_contig_228_13
MNKYMDTKKLLPTKDFQYSTEAKDYINQLLSSLLTKINSITIGERIVEDIDKILPEYIATGAKWEIQNSIRHNKPFLSIDKLLKQKKVSTQSKRNLSYLLEYLVREILDLAINCCRDQKKKRITKQHVEYVLRNDAEFPMIFGLKPWH
>S-1035085-51.1035085_contig_35_46
MVYKLASITELLQLIHPNLKIKNDSVMMLNNMISDHILHDLIVNNNILTVDIIKAHILTITNDKLLQDHAFNEINKSLTNLGLNLTLLKQDTIDIINEKYGISVNEDNVVTAITTLIEYILAEILELSGNITISNRRVYVTKYNIIQSIKDDEALNDLFK
>S-3300012936-62.3300012936_a_Ga0163109_10001679_7
MLKFSFANNVNKCDIQTFKCPISLDLMEDPVFLEDGYTYDRKNIEDHLVNSNSSPLTNEYLYDITLTSNFNLKSQINEWKIKNILKKEIIFYNNLFKYKDNNKNVFSDKIEMYHEDFTFKGSVKNGLKDGNGELILKNGTVYKFNWVNGEFDGKGEIIYNNGDVYEGYVINYNRNGKGKIVYKNGDFFDGYWLNDMRSGYGTYISNEEKYTGEWANDKKNGIGILEYKDFIIESKWLEDVENNSCKIYYNDGSKYIGEVKDHKKHGKGELFLKDGTIYRCNWINDEKVDDIEIIYKNGDFFKIHVNDYKCIKSRKSDNIYIDFKINNILKDLSSNVYINIVAKEYIDKICYNMLNILSNKFNNYLDNNNDEVNEELLEESSDDEIFKDLDNYSKKSLKNISIDTLLNLLKELVPGCLYSNCLSDIYKNINKIDSINRFHINKKFGFKIKVKSIIKFFKNKCDNLAINYIIMVSIVTLLEYIINEILDLSINNILIESNLINIENVKSVIKNDNELSEFVNNKLKIDL
'''
# Parse the FASTA text held in `fasta_str` into a list of SeqRecord objects;
# the in-memory stream is closed when the `with` block ends.
with StringIO(fasta_str) as fasta_io:
    seq_records = [parsed for parsed in SeqIO.parse(fasta_io, "fasta")]
seq_records

[SeqRecord(seq=Seq('MAGTSTEKKNVSRSKRAGLNFPVGRINRMMRNNRYAERIGAGAPVYMASVLEYL...TAE'), id='M-3300017989-17.Ga0180432_10000436_27', name='M-3300017989-17.Ga0180432_10000436_27', description='M-3300017989-17.Ga0180432_10000436_27', dbxrefs=[]),
 SeqRecord(seq=Seq('MDRVGKYGLFIKRISPKDADITKESLETVNNMLVFLAEKLTKQANIIIDQKTLR...SGR'), id='Mama.Mama_1_192', name='Mama.Mama_1_192', description='Mama.Mama_1_192', dbxrefs=[]),
 SeqRecord(seq=Seq('MDRVGKYGLFIKRISPKDADITKESLETVNNMLVFLAEKLTKQANIIIDQKTLR...SGR'), id='Me06.Me06_1_195', name='Me06.Me06_1_195', description='Me06.Me06_1_195', dbxrefs=[]),
 SeqRecord(seq=Seq('MEIDSHVQPAEVLAAASESMQLEEQTQLPASAAGEALEELQTEGKKTSPAKKRT...ASA'), id='Medus.AP018495_BBI30458_1_318', name='Medus.AP018495_BBI30458_1_318', description='Medus.AP018495_BBI30458_1_318', dbxrefs=[]),
 SeqRecord(seq=Seq('MERIGNYTLFVKRMTPEGFEVTKEAVSQINNMLVFLADKTITKALILLGDKKTL...SGR'), id='Tufo.Tufo_1_56', name='Tufo.Tufo_1_56', description='Tufo.Tufo_1_56', dbxrefs=[]),
 SeqRecord(seq=Seq('MNTFR

In [54]:
# Build one `sequence` row per parsed FASTA record. The accession is generated
# from `accession_prefix` and the record's position; the FASTA id is only
# printed. All other descriptive fields stay None.
blank_row = dict.fromkeys(
    (
        "accession",
        "variant",
        "gi",
        "ncbi_gene_id",
        "hgnc_gene_name",
        "taxonomy_id",
        "organism",
        "phylum",
        "class",
        "taxonomy_group",
        "info",
        "sequence",
        "variant_under_consideration",
    )
)
data_sequence = []
for i, record in enumerate(seq_records):
    print("------------------------------------------------")
    print(record.id)
    data_sequence.append(
        dict(
            blank_row,
            accession=f'{accession_prefix}_{i}',
            variant="H2A-like_(Viruses)",
            sequence=str(record.seq),
        )
    )

------------------------------------------------
M-3300017989-17.Ga0180432_10000436_27
------------------------------------------------
Mama.Mama_1_192
------------------------------------------------
Me06.Me06_1_195
------------------------------------------------
Medus.AP018495_BBI30458_1_318
------------------------------------------------
Tufo.Tufo_1_56
------------------------------------------------
M-3300009182-70.Ga0114959_10000291_17
------------------------------------------------
M-3300023174-73.Ga0214921_10003059_5
------------------------------------------------
S-1016713-165.1016713_contig_440_11
------------------------------------------------
S-1017247-24.1017247_contig_228_13
------------------------------------------------
S-1035085-51.1035085_contig_35_46
------------------------------------------------
S-3300012936-62.3300012936_a_Ga0163109_10001679_7


In [55]:
# Show field, value and Python type of the second prepared row.
for field_name, field_value in data_sequence[1].items():
    print(field_name, field_value, type(field_value))

accession HISTDB_H2A_like_1 <class 'str'>
variant H2A-like_(Viruses) <class 'str'>
gi None <class 'NoneType'>
ncbi_gene_id None <class 'NoneType'>
hgnc_gene_name None <class 'NoneType'>
taxonomy_id None <class 'NoneType'>
organism None <class 'NoneType'>
phylum None <class 'NoneType'>
class None <class 'NoneType'>
taxonomy_group None <class 'NoneType'>
info None <class 'NoneType'>
sequence MDRVGKYGLFIKRISPKDADITKESLETVNNMLVFLAEKLTKQANIIIDQKTLRHDAFLWLLTDIQGELGKHSQDFANSVLYGEKELVFPTKRTENLMRKNTCLRISRSAVKTLTAILEYFCGQIMEASFSQAKKSKRKRIRPIDIEAAISQDKELHSMFGKGVISGR <class 'str'>
variant_under_consideration None <class 'NoneType'>


In [56]:
# Insert the rows one at a time so a single failing record does not stop the
# loop; accessions that could not be inserted are collected in `failed_toadd`.
failed_toadd = []
for ds in data_sequence:
    try:
        cursor.execute(add_sequence, ds)
    except Exception as err:
        # Report the failing row: accession, error, then every field with its type.
        print(ds["accession"], err, sep="\n")
        for field, value in ds.items():
            print(field, value, type(value))
        failed_toadd.append(ds["accession"])

In [58]:
# Read back every sequence of this variant together with any linked
# publication (the LEFT JOIN keeps sequences that have no publication row).
query = (
    "SELECT * FROM sequence s "
    "LEFT JOIN sequence_has_publication sp ON s.accession = sp.sequence_accession "
    "WHERE s.variant = 'H2A-like_(Viruses)' "
)
cursor.execute(query)
rows = cursor.fetchall()
df = pd.DataFrame(rows, columns=[col[0] for col in cursor.description])
# Display only rows whose accession starts with the current prefix.
has_prefix = df["accession"].str.startswith(accession_prefix)
df[has_prefix]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2A_like_0,H2A-like_(Viruses),,,,,,,,,,MAGTSTEKKNVSRSKRAGLNFPVGRINRMMRNNRYAERIGAGAPVY...,,,
1,HISTDB_H2A_like_1,H2A-like_(Viruses),,,,,,,,,,MDRVGKYGLFIKRISPKDADITKESLETVNNMLVFLAEKLTKQANI...,,,
2,HISTDB_H2A_like_10,H2A-like_(Viruses),,,,,,,,,,MLKFSFANNVNKCDIQTFKCPISLDLMEDPVFLEDGYTYDRKNIED...,,,
3,HISTDB_H2A_like_2,H2A-like_(Viruses),,,,,,,,,,MDRVGKYGLFIKRISPKDADITKESLETVNNMLVFLAEKLTKQANI...,,,
4,HISTDB_H2A_like_3,H2A-like_(Viruses),,,,,,,,,,MEIDSHVQPAEVLAAASESMQLEEQTQLPASAAGEALEELQTEGKK...,,,
5,HISTDB_H2A_like_4,H2A-like_(Viruses),,,,,,,,,,MERIGNYTLFVKRMTPEGFEVTKEAVSQINNMLVFLADKTITKALI...,,,
6,HISTDB_H2A_like_5,H2A-like_(Viruses),,,,,,,,,,MNTFRKTTLSDESDNGVRININRECPPGYYKKPNGKCYKIFFGDED...,,,
7,HISTDB_H2A_like_6,H2A-like_(Viruses),,,,,,,,,,MDPLVKKRCPNGTKKNKSGDCIKKDATSSKPKPQRTRKNLRAKKPD...,,,
8,HISTDB_H2A_like_7,H2A-like_(Viruses),,,,,,,,,,MTPTFKFLHYVRLIVRRLTVGGELPQVQLSATFCKKIEEILFVVLM...,,,
9,HISTDB_H2A_like_8,H2A-like_(Viruses),,,,,,,,,,MNKYMDTKKLLPTKDFQYSTEAKDYINQLLSSLLTKINSITIGERI...,,,


In [59]:
# Look up the publication that the new sequences will be linked to, to
# confirm it exists in the `publication` table.
pid = 'irwin_self-assembling_2024'
# Bind the id as a query parameter instead of formatting it into the SQL
# text, so quoting/escaping is handled by the connector.
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,irwin_self-assembling_2024,Self-assembling viral histones are evolutionar...,10.1038/s41564-024-01707-9,,2024


In [60]:
# Link every new sequence to the publication `pid`. Failures are reported and
# collected in `failed_toadd_publication` instead of aborting the loop.
failed_toadd_publication = []
for ds in data_sequence:
    try:
        cursor.execute(add_sequence_has_publication, (ds["accession"], pid))
    except Exception as e:
        # `except Exception` (not a bare `except:`) lets KeyboardInterrupt /
        # SystemExit propagate; printing the error shows why the link failed.
        print(ds["accession"])
        print(e)
        failed_toadd_publication.append(ds["accession"])

In [62]:
# Re-run the read-back query to check that the publication link now appears
# next to each sequence of this variant.
query = (
    "SELECT * FROM sequence s "
    "LEFT JOIN sequence_has_publication sp ON s.accession = sp.sequence_accession "
    "WHERE s.variant = 'H2A-like_(Viruses)' "
)
cursor.execute(query)
rows = cursor.fetchall()
df = pd.DataFrame(rows, columns=[col[0] for col in cursor.description])
# Display only rows whose accession starts with the current prefix.
has_prefix = df["accession"].str.startswith(accession_prefix)
df[has_prefix]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2A_like_0,H2A-like_(Viruses),,,,,,,,,,MAGTSTEKKNVSRSKRAGLNFPVGRINRMMRNNRYAERIGAGAPVY...,,HISTDB_H2A_like_0,irwin_self-assembling_2024
1,HISTDB_H2A_like_1,H2A-like_(Viruses),,,,,,,,,,MDRVGKYGLFIKRISPKDADITKESLETVNNMLVFLAEKLTKQANI...,,HISTDB_H2A_like_1,irwin_self-assembling_2024
2,HISTDB_H2A_like_10,H2A-like_(Viruses),,,,,,,,,,MLKFSFANNVNKCDIQTFKCPISLDLMEDPVFLEDGYTYDRKNIED...,,HISTDB_H2A_like_10,irwin_self-assembling_2024
3,HISTDB_H2A_like_2,H2A-like_(Viruses),,,,,,,,,,MDRVGKYGLFIKRISPKDADITKESLETVNNMLVFLAEKLTKQANI...,,HISTDB_H2A_like_2,irwin_self-assembling_2024
4,HISTDB_H2A_like_3,H2A-like_(Viruses),,,,,,,,,,MEIDSHVQPAEVLAAASESMQLEEQTQLPASAAGEALEELQTEGKK...,,HISTDB_H2A_like_3,irwin_self-assembling_2024
5,HISTDB_H2A_like_4,H2A-like_(Viruses),,,,,,,,,,MERIGNYTLFVKRMTPEGFEVTKEAVSQINNMLVFLADKTITKALI...,,HISTDB_H2A_like_4,irwin_self-assembling_2024
6,HISTDB_H2A_like_5,H2A-like_(Viruses),,,,,,,,,,MNTFRKTTLSDESDNGVRININRECPPGYYKKPNGKCYKIFFGDED...,,HISTDB_H2A_like_5,irwin_self-assembling_2024
7,HISTDB_H2A_like_6,H2A-like_(Viruses),,,,,,,,,,MDPLVKKRCPNGTKKNKSGDCIKKDATSSKPKPQRTRKNLRAKKPD...,,HISTDB_H2A_like_6,irwin_self-assembling_2024
8,HISTDB_H2A_like_7,H2A-like_(Viruses),,,,,,,,,,MTPTFKFLHYVRLIVRRLTVGGELPQVQLSATFCKKIEEILFVVLM...,,HISTDB_H2A_like_7,irwin_self-assembling_2024
9,HISTDB_H2A_like_8,H2A-like_(Viruses),,,,,,,,,,MNKYMDTKKLLPTKDFQYSTEAKDYINQLLSSLLTKINSITIGERI...,,HISTDB_H2A_like_8,irwin_self-assembling_2024


In [63]:
# Commit the open transaction on `conn` so the statements executed above
# through `cursor` are persisted.
# NOTE(review): this runs even if `failed_toadd` / `failed_toadd_publication`
# are non-empty — inspect them before committing.
conn.commit()

# Add other viral sequences as H2B-like_(Viruses)

These sequences come from the [article](https://www.nature.com/articles/s41564-024-01707-9#data-availability).

pid='irwin_self-assembling_2024'

**I have not looked up the accession identifiers; they still need to be found!!!**

**Possibly not all sequences have been added??**

In [64]:
# Prefix used to build the synthetic accessions of the rows created in this section.
accession_prefix = 'HISTDB_H2B_like'

In [65]:
fasta_str = '''>M-3300017989-17.Ga0180432_10000436_28
MVKSEQQEQKVRRRKQRNTTNFNAYIYKVLKNVHPEHGISKKAMSVMNGICSDLFERIGAEAARVSRYNNRRTLSSKEIQTATRLILPGELSKHAVSAGIQSVTRFNSN
>Pabr.LT972217_583
MSASSASDSSPMDTDTPVTAPPAIEEQVGDLGDHTDGDASADSIAKHEPGTAKKTKGKKTLKKKKAVGTAKTESLDGVKRKAYRHKKDYASYSTFIYRVLKQVHPDVGISNKSMSIMNSFVNDMIDRIATEAGRLARTNKRNTITAREIQTAVRLIMQGELARHAVSEGTKAVTKYNEAVNAGSVGDDETAAA
>Pama.MG011691_1199
MSASSASDSSPMDTDTPVTAPPAIEEQVGDLGDHTDGDASADSIAKHEPGTAKKAKGKKTLKKKKAAGTPKTESLDGVKRKAYRHKKDYASYSTFIYRVLKQVHPDVGISNKSMSIMNSFVNDMIDRIATEAGRLARTNKRNTITAREIQTAVRLIMQGELARHAVSEGTKAVTKYNEAVNAGSVGDDETAAA
>Pane.MG011690_1287
MSAPSAASDSLPMDTDAPVAAPPTLEDQTSGLGTGDIGAAADTVAKHEPGAAKKTKKIAKKKKGAALGGVGKTESLDGVKRKTHRHKKDYASYSTFIYRVLKQVHPDVGISNKSMSIMNSFVNDMIDRIATEAGRLARTNKRNTITAREIQTAVRLIMQGELARHAVSEGTKAVTKYNEAVNAGSVGDDETAAA
>Pass.OFAI01000004_1301
MSASEPLPMDTDAPVASPPAATAAPADDAGAPEGGGAPAVHDTTTESVAKHEPGTAKAKKTIKKKKRSASGLGVSKAESVDGAKRKAHRHKKDYASYSTFIYRVLKQVHPDVGISNKSMSIMNSFVNDMIDRIATEAGRLARTNKRNTITAREIQTAVRLIMQGELARHAVSEGTKAVTKYNDAVNNSAVADDETAAA
>M-3300017792-3.Ga0163161_10000024_3
MSSPSSPIADSSADMIIDATLSTSIPALEAENGDMTIDSSSSDIVASTAASDVIIAPTSDDSIIAAASTPSAENTDAGGKTGDEEKKKKKKKKAAINKHSRFMNAVHNIGHRAAPGCQMSKKAISVGTHLLDNFLQRCASEAVSMAGEGKKATVTSTDIFASALKILPNELGKVSVNSSMPILVKYVDEQMARNAAAKIVAKASKDAKLAAAVALQESSDVTVSADIPSIASVAATA
>M-3300022916-4.Ga0233431_1013871_3
MSSKKGKSKSDDTVNHQSDSKVYCSGVVSILKAHGVIFSTSNINKQWRYYCQSKGSAKWIATCTRSSKKVSEDDAKFSMYVGIGGKISTAILIQHLIKELLTISIKHIKSDIDGLYELSATHFNNMIMVHKTLKKYYASIYNYEYNPDRQYHSDYDKCIDAYIKKLQSTNSIKQCKITKDGMNMLLFFVNSIIDDLMRCVLSIMEYSQKKTISEKMIYCAANILLDNSISASIDETTKCVLSHIPKAVKKDDINDDEDVVNDNDEDDEDDDDGDDEDSDDEDSNEDGDNSDQVNEKKINQKPSKKQSNSSKKCVDSRSDNDKDNEDVQETNDNQPLQKPKSGRKK
>Indi.Indi_1_164
MSTEKKPNKTTEKAGLTFNVNTIKQKLKSYYEGQDLLTLMFSGGHIAITATLEKLWETILHECLKRVGKDKSGVRQVNRESLQYSVLMHSGLERYFMSHFRYYDVSLEYKDQSPVINTELDKVMERVDKDMSLTSKARNLAHYMLLKVFSHLAVTAHGFVEYAKKKSLDGRSVTFAVSTVFHESVSSDFNKEITRVMKEFGEELEETHAANETPSDTQGTTPAGEADTGDGEEEKQSAKTKNTKKTTSTTSTTSSSTTSTKNAKKKTETIEEEADDKEEQEEQEEQDEKEDNDEKPEEAAPSKSTKKAVTGSTTKKSSNSPKQTKNGKK
>Medus.AP018495_BBI30201_1_61
MSQIDEHVTEMTEFEAEHESTYSEHSDEEEQELGARVPSRKQKGKAAAKKVKKAVAKKSGEERRRKKNYDSFATFIAKLVGPNGKGRKPGFSAKGMEVLESIVKSLATEMTIVANELAKHQGRQTLGAGDFRTALAVRGSLIAREPATVKALTEMGEKAVLKYQSSLGRPAKTAPKKKKATKKASA
>Padu.Padu_1_1099
MDTNTEVAPEPQAVASETADNEATAPSAVSEADAAPKAEPGTKATKKKAGKQGKKKIVLSTAAKAEGGAAAKVGKRGRGQRKKNYTSYSSFIYKVLKQVHPDVGISNKSMSVMNSFVNDMIDRIGTEAGRLAHSNKRNTIGTREIQTAVRLIMRGELARHAVSEGTKAVTKYNEAVNAAAETAAAAAIDTTAA
>Pain.Pain_1_895
MNSFVNDMIDRIGTEAGRLARSNKRNTIGTREIQTAVRLIMRGELARHAVSEGTKAVTKYNEAVNAAAEANAAAIDTTAA
>Papa.OFAJ01000016_971
MSAPSPMDTHTEVAPEPQAIASEAVDNEAAVPLAASEADAAPKAEPGTKATKKKAGKQGKKKIVLSTAAKAGEGGAAAKAGKRGRGQRKKNYTSYSSFIYKVLKQVHPDVGISNKSMSVMNSFVNDMIDRIGTEAGRLAHSNKRNTIGTREIQTAVRLIMRGELARHAVSEGTKAVTKYNEAVNAATEAHAAAAIDTTAA
>Paqu.MG011689_1423
MSAPSPSPMDTETVPEPQVPAEAVPAGEAAATETGTEAGTAAVPKPEPGAKKTKKKAAKRPKKIALSTKTEGDAGAVGKRGRGQRKKNYASYSSFIYKVLKQVHPDVGISNKSMSIMNSFVNDMIDRIGTEAGRLARSNKRNTIGTREIQTAVRLIMRGELARHAVSEGTKAVTKYNEAVNAAAEANAASAAIKSTAA
>Pasa.Pasa_1_1681
MSAPAPTDIEMTTDASVPADAAPTAAEPQHQRDMETETEAASAVPKPEPGATKTKKKAAKRTKKIALSTKKTQSGGDAGAAGKRGRGQRKKNYASYSSFVYKVLKQVHPDVGISNKSMSIMNSFVNDMIDRIGTEAGRLARTNKRNTIGTREIQTAVRLIMRGELARHAVSEGTKAVTKYNDAINAAAAEAHAVAVDTAAA
>S-ERX556017-93.ERX556017_contig_5615_8
MQTGFDITEDTVNKDLMTKIEAMLVTLLEDCIHVAVIYVNECGRNTITSTDMLYAVQYQAREFFKQENLIENIDNNVKLLKEEDEEDEDDEEDEGDEGDEEDEEDEEDEEDEEDEEDSDEEEFTRCENETNSIVKLMNEYHDSWNEWKPDHNYEILLKRVIDEKMCC
'''
# Parse the inline FASTA text into a list of SeqRecord objects. The context
# manager closes the in-memory buffer even if parsing raises.
with StringIO(fasta_str) as fasta_io:
    seq_records = list(SeqIO.parse(fasta_io, "fasta"))
seq_records

[SeqRecord(seq=Seq('MVKSEQQEQKVRRRKQRNTTNFNAYIYKVLKNVHPEHGISKKAMSVMNGICSDL...NSN'), id='M-3300017989-17.Ga0180432_10000436_28', name='M-3300017989-17.Ga0180432_10000436_28', description='M-3300017989-17.Ga0180432_10000436_28', dbxrefs=[]),
 SeqRecord(seq=Seq('MSASSASDSSPMDTDTPVTAPPAIEEQVGDLGDHTDGDASADSIAKHEPGTAKK...AAA'), id='Pabr.LT972217_583', name='Pabr.LT972217_583', description='Pabr.LT972217_583', dbxrefs=[]),
 SeqRecord(seq=Seq('MSASSASDSSPMDTDTPVTAPPAIEEQVGDLGDHTDGDASADSIAKHEPGTAKK...AAA'), id='Pama.MG011691_1199', name='Pama.MG011691_1199', description='Pama.MG011691_1199', dbxrefs=[]),
 SeqRecord(seq=Seq('MSAPSAASDSLPMDTDAPVAAPPTLEDQTSGLGTGDIGAAADTVAKHEPGAAKK...AAA'), id='Pane.MG011690_1287', name='Pane.MG011690_1287', description='Pane.MG011690_1287', dbxrefs=[]),
 SeqRecord(seq=Seq('MSASEPLPMDTDAPVASPPAATAAPADDAGAPEGGGAPAVHDTTTESVAKHEPG...AAA'), id='Pass.OFAI01000004_1301', name='Pass.OFAI01000004_1301', description='Pass.OFAI01000004_1301', dbxrefs=[]),
 SeqRecord(seq=Seq(

In [66]:
# Turn every parsed FASTA record into a row dict for the `sequence` table.
# Accessions are synthetic: the current prefix plus the record's position in
# the FASTA block. Annotation fields are set to None; only the sequence
# itself is filled in.
data_sequence = []
for i, record in enumerate(seq_records):
    # Echo the source FASTA id so the position -> original id mapping stays visible.
    print("------------------------------------------------", record.id, sep="\n")
    row = {
        "accession": f"{accession_prefix}_{i}",
        "variant": "H2B-like_(Viruses)",
        **dict.fromkeys(
            (
                "gi",
                "ncbi_gene_id",
                "hgnc_gene_name",
                "taxonomy_id",
                "organism",
                "phylum",
                "class",
                "taxonomy_group",
                "info",
            )
        ),
        "sequence": str(record.seq),
        "variant_under_consideration": None,
    }
    data_sequence.append(row)

------------------------------------------------
M-3300017989-17.Ga0180432_10000436_28
------------------------------------------------
Pabr.LT972217_583
------------------------------------------------
Pama.MG011691_1199
------------------------------------------------
Pane.MG011690_1287
------------------------------------------------
Pass.OFAI01000004_1301
------------------------------------------------
M-3300017792-3.Ga0163161_10000024_3
------------------------------------------------
M-3300022916-4.Ga0233431_1013871_3
------------------------------------------------
Indi.Indi_1_164
------------------------------------------------
Medus.AP018495_BBI30201_1_61
------------------------------------------------
Padu.Padu_1_1099
------------------------------------------------
Pain.Pain_1_895
------------------------------------------------
Papa.OFAJ01000016_971
------------------------------------------------
Paqu.MG011689_1423
------------------------------------------------
Pasa.Pa

In [67]:
# Spot-check the second generated row: print each field, its value and its Python type.
second_row = data_sequence[1]
for field in second_row:
    print(field, second_row[field], type(second_row[field]))

accession HISTDB_H2B_like_1 <class 'str'>
variant H2B-like_(Viruses) <class 'str'>
gi None <class 'NoneType'>
ncbi_gene_id None <class 'NoneType'>
hgnc_gene_name None <class 'NoneType'>
taxonomy_id None <class 'NoneType'>
organism None <class 'NoneType'>
phylum None <class 'NoneType'>
class None <class 'NoneType'>
taxonomy_group None <class 'NoneType'>
info None <class 'NoneType'>
sequence MSASSASDSSPMDTDTPVTAPPAIEEQVGDLGDHTDGDASADSIAKHEPGTAKKTKGKKTLKKKKAVGTAKTESLDGVKRKAYRHKKDYASYSTFIYRVLKQVHPDVGISNKSMSIMNSFVNDMIDRIATEAGRLARTNKRNTITAREIQTAVRLIMQGELARHAVSEGTKAVTKYNEAVNAGSVGDDETAAA <class 'str'>
variant_under_consideration None <class 'NoneType'>


In [68]:
# Insert the rows one at a time so a single failing record does not stop the
# loop; accessions that could not be inserted are collected in `failed_toadd`.
failed_toadd = []
for ds in data_sequence:
    try:
        cursor.execute(add_sequence, ds)
    except Exception as err:
        # Report the failing row: accession, error, then every field with its type.
        print(ds["accession"], err, sep="\n")
        for field, value in ds.items():
            print(field, value, type(value))
        failed_toadd.append(ds["accession"])

In [69]:
# Read back every sequence of this variant together with any linked
# publication (the LEFT JOIN keeps sequences that have no publication row).
query = (
    "SELECT * FROM sequence s "
    "LEFT JOIN sequence_has_publication sp ON s.accession = sp.sequence_accession "
    "WHERE s.variant = 'H2B-like_(Viruses)' "
)
cursor.execute(query)
rows = cursor.fetchall()
df = pd.DataFrame(rows, columns=[col[0] for col in cursor.description])
# Display only rows whose accession starts with the current prefix.
has_prefix = df["accession"].str.startswith(accession_prefix)
df[has_prefix]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_like_0,H2B-like_(Viruses),,,,,,,,,,MVKSEQQEQKVRRRKQRNTTNFNAYIYKVLKNVHPEHGISKKAMSV...,,,
1,HISTDB_H2B_like_1,H2B-like_(Viruses),,,,,,,,,,MSASSASDSSPMDTDTPVTAPPAIEEQVGDLGDHTDGDASADSIAK...,,,
2,HISTDB_H2B_like_10,H2B-like_(Viruses),,,,,,,,,,MNSFVNDMIDRIGTEAGRLARSNKRNTIGTREIQTAVRLIMRGELA...,,,
3,HISTDB_H2B_like_11,H2B-like_(Viruses),,,,,,,,,,MSAPSPMDTHTEVAPEPQAIASEAVDNEAAVPLAASEADAAPKAEP...,,,
4,HISTDB_H2B_like_12,H2B-like_(Viruses),,,,,,,,,,MSAPSPSPMDTETVPEPQVPAEAVPAGEAAATETGTEAGTAAVPKP...,,,
5,HISTDB_H2B_like_13,H2B-like_(Viruses),,,,,,,,,,MSAPAPTDIEMTTDASVPADAAPTAAEPQHQRDMETETEAASAVPK...,,,
6,HISTDB_H2B_like_14,H2B-like_(Viruses),,,,,,,,,,MQTGFDITEDTVNKDLMTKIEAMLVTLLEDCIHVAVIYVNECGRNT...,,,
7,HISTDB_H2B_like_2,H2B-like_(Viruses),,,,,,,,,,MSASSASDSSPMDTDTPVTAPPAIEEQVGDLGDHTDGDASADSIAK...,,,
8,HISTDB_H2B_like_3,H2B-like_(Viruses),,,,,,,,,,MSAPSAASDSLPMDTDAPVAAPPTLEDQTSGLGTGDIGAAADTVAK...,,,
9,HISTDB_H2B_like_4,H2B-like_(Viruses),,,,,,,,,,MSASEPLPMDTDAPVASPPAATAAPADDAGAPEGGGAPAVHDTTTE...,,,


In [70]:
# Look up the publication that the new sequences will be linked to, to
# confirm it exists in the `publication` table.
pid = 'irwin_self-assembling_2024'
# Bind the id as a query parameter instead of formatting it into the SQL
# text, so quoting/escaping is handled by the connector.
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,irwin_self-assembling_2024,Self-assembling viral histones are evolutionar...,10.1038/s41564-024-01707-9,,2024


In [71]:
# Link every new sequence to the publication `pid`. Failures are reported and
# collected in `failed_toadd_publication` instead of aborting the loop.
failed_toadd_publication = []
for ds in data_sequence:
    try:
        cursor.execute(add_sequence_has_publication, (ds["accession"], pid))
    except Exception as e:
        # `except Exception` (not a bare `except:`) lets KeyboardInterrupt /
        # SystemExit propagate; printing the error shows why the link failed.
        print(ds["accession"])
        print(e)
        failed_toadd_publication.append(ds["accession"])

In [72]:
# Re-run the read-back query to check that the publication link now appears
# next to each sequence of this variant.
query = (
    "SELECT * FROM sequence s "
    "LEFT JOIN sequence_has_publication sp ON s.accession = sp.sequence_accession "
    "WHERE s.variant = 'H2B-like_(Viruses)' "
)
cursor.execute(query)
rows = cursor.fetchall()
df = pd.DataFrame(rows, columns=[col[0] for col in cursor.description])
# Display only rows whose accession starts with the current prefix.
has_prefix = df["accession"].str.startswith(accession_prefix)
df[has_prefix]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_like_0,H2B-like_(Viruses),,,,,,,,,,MVKSEQQEQKVRRRKQRNTTNFNAYIYKVLKNVHPEHGISKKAMSV...,,HISTDB_H2B_like_0,irwin_self-assembling_2024
1,HISTDB_H2B_like_1,H2B-like_(Viruses),,,,,,,,,,MSASSASDSSPMDTDTPVTAPPAIEEQVGDLGDHTDGDASADSIAK...,,HISTDB_H2B_like_1,irwin_self-assembling_2024
2,HISTDB_H2B_like_10,H2B-like_(Viruses),,,,,,,,,,MNSFVNDMIDRIGTEAGRLARSNKRNTIGTREIQTAVRLIMRGELA...,,HISTDB_H2B_like_10,irwin_self-assembling_2024
3,HISTDB_H2B_like_11,H2B-like_(Viruses),,,,,,,,,,MSAPSPMDTHTEVAPEPQAIASEAVDNEAAVPLAASEADAAPKAEP...,,HISTDB_H2B_like_11,irwin_self-assembling_2024
4,HISTDB_H2B_like_12,H2B-like_(Viruses),,,,,,,,,,MSAPSPSPMDTETVPEPQVPAEAVPAGEAAATETGTEAGTAAVPKP...,,HISTDB_H2B_like_12,irwin_self-assembling_2024
5,HISTDB_H2B_like_13,H2B-like_(Viruses),,,,,,,,,,MSAPAPTDIEMTTDASVPADAAPTAAEPQHQRDMETETEAASAVPK...,,HISTDB_H2B_like_13,irwin_self-assembling_2024
6,HISTDB_H2B_like_14,H2B-like_(Viruses),,,,,,,,,,MQTGFDITEDTVNKDLMTKIEAMLVTLLEDCIHVAVIYVNECGRNT...,,HISTDB_H2B_like_14,irwin_self-assembling_2024
7,HISTDB_H2B_like_2,H2B-like_(Viruses),,,,,,,,,,MSASSASDSSPMDTDTPVTAPPAIEEQVGDLGDHTDGDASADSIAK...,,HISTDB_H2B_like_2,irwin_self-assembling_2024
8,HISTDB_H2B_like_3,H2B-like_(Viruses),,,,,,,,,,MSAPSAASDSLPMDTDAPVAAPPTLEDQTSGLGTGDIGAAADTVAK...,,HISTDB_H2B_like_3,irwin_self-assembling_2024
9,HISTDB_H2B_like_4,H2B-like_(Viruses),,,,,,,,,,MSASEPLPMDTDAPVASPPAATAAPADDAGAPEGGGAPAVHDTTTE...,,HISTDB_H2B_like_4,irwin_self-assembling_2024


In [73]:
# Commit the open transaction on `conn` so the statements executed above
# through `cursor` are persisted.
# NOTE(review): this runs even if `failed_toadd` / `failed_toadd_publication`
# are non-empty — inspect them before committing.
conn.commit()

# Add other viral sequences as H3-like_(Viruses)

These sequences come from the [article](https://www.nature.com/articles/s41564-024-01707-9#data-availability).

pid='irwin_self-assembling_2024'

**I have not looked up the accession identifiers; they still need to be found!!!**

**Possibly not all sequences have been added??**

In [74]:
# Prefix used to build the synthetic accessions of the rows created in this section.
accession_prefix = 'HISTDB_H3_like'

In [75]:
fasta_str = '''>M-3300025880-30.Ga0209534_10000479_25
MPRTKRVAESSVLLRSSDRRPVVAAKSPFRFIMKKHSQSRLRDANGNLIRKPYKYKPGTVALREIRKQQKRTDTVLPKASFQRLVRELSINFKSDLRFEREALLALQEAAESHLVNVFQHSHNLAIYTGRKTLRKTDVEMSTHIREDSTNIVIPKHISPHKCNAQLIATAPTAPTTASVDESATIAMNETQVEMETVEQHRQTDTLIDTLVDDFDD
>S-1063923-109.1063923_contig_2940_19
MAQKAPRKKDAQKAMKAPRSTDGAKKPHRFRPGTVALREIRRFQKSTDLLMPKLPFQRMVRIAADDVQKDVRFQASAIAALQEATEAYIVSLFEQANLAAIHSKRVTVMPKDIALALRIRPKNTR
>Arvu.Arvu_1_24
MEQTMNKLSLDDDEWEDEIEDEEEIKAEEENSEEESESDNEVEEVEEDEEEEEITDDCGEGSFIPKTPFKRLVKEIAQDFAVDLRFQKEAIDALQQVSEEYLIKLFHDANKCAEHAKRQTVTPDDIKLVRFLRQNKSMDFELS
>Medus.AP018495_BBI30395_1_255
MPKERAIKKSKSAVKKVAEKKAAIKSKAKKAATGVKKPHRFRPGTTAKRLSKKEQKLSSTKTTVRRAPFGRIVRTIASLSSADSMRFSANAVDLLQQGIELYMLDLMKNAALAAKQAKRMTLMGKDIDLIQTANHEMIDEAHAAKLANTSSAGFARRRVTKKE
>M-3300005517-3.Ga0070374_10002873_8
MARKIQVARKSFGGKAPLSKQAFKAALKIPISIKKKKRSRPGLIALKEIRKFQNSTETLIKRLPFQKLVKEIIQGFNSNFRCQSIAIDALQQAAEAFIVDMFENTNLCAIHAKRVTIQPKDMQLTSRIKPI
>M-3300009182-30.Ga0114959_10001569_12
MSNLTFGSVSERNVELMNYALSYLKENFPDFRFSAEAIAELQAEINVIEDAEISAKLSKRTNINEKDYMLARRLSNRPI
>M-3300009182-30.Ga0114959_10001569_15
MARTKQTARKSTGGKAPRKQLATKASRMTAPFAGGTKKPHRYRPGTVALREIRKYQRSTETLIPRIAFQRLVREVTQDRKADLKFQSSALLALQEGAEAYLVSLFNDSNLCAIHRKRITVTPFDMQLARRIRGERA
>M-3300009436-29.Ga0115008_10001158_7
MDSQRMNTTTIREISTSLTVDKTDALLDALTNDLDDTDMVTFTPGVPTQEDLDFIDTNPNPVAEAHSDSYDSCETDEHSDSDGEFVRSIALKNPHAVGDDGSIDFDKVDETNDIHDSDSASNTSQSDSDDDCEEPSDDQNCKKEELRFEEQQRDKQTPGGGRSRRVKKPRRFKPGTVALREIRKMQRGCDLLIPKLPFQRLVREITHYYRDDIRFTEDALEAIHEASEAYLIELFANTQRVAIHAHREQIEPRDMQIVCLLRESNRPITY
>M-3300010375-16.Ga0105239_10001517_25
MQFKQELRFQGSAILALQEASESYLVSLFEDTNLCAIHAKRVTIMPKDINLARRIRGERA
>M-3300014838-63.Ga0182030_10004491_6
MDNSSSEEAETNIDSIVTQLRLLAIKFERTKEKVFRLQNGTDLSDLIPKLKTEEDVINLAHRILDGTILSETGPYNDFKEEEKTYKEENEEENKDVNEGFGCMDEDEDDDERDPKREALVDLFIRNGYHNMWETWFDCFKYFGYEAPGNLKTYEMPNVYELPVVFEDAYDKETFDVPAYFKTFIEGQADASTLLTIVDAIVAYYIEEKEEMDSEELDEEERQENKYDIYGTRDMVSGAVDMLYEALANGEEDKKIPLIEDFWNEFREHEDDVDALMEIYEEVDAELDKLDDAEFDSYEQGCCEDCSVPCDVHEEDDIDDDIKPYPVLYNARGYKSREEWVVKKRKVFDEVDIPIEEIRVEQSQTDLIIDPRAFKRLTLEIVQDFREGGYHFEDGALEALQTAAEAHLIQNFEQASARAVKANRTYIGIADMKSTV
>M-3300017724-7.Ga0181388_1001570_2
MARTKQTARKSLGGLAPIKMMKQLATLASRKSAPATGGVKRPKRYRPGTVALREIRKYQKGTELLIRKLPFQRLVKEISQDYKTDLKFQSAAILALQEASEAYLVGLFEDTNMCAIHAKRVTIMPKDIHLARRIRGEKRY
>M-3300017765-1.Ga0181413_1000150_9
MARTKQTARKSTGGKAPRKQLATKVARMKARPQAGGVKKPHRYRPGTVALREIRKYQRSTELLIRKLPFQRLVREIAQDFKTDMRFQSHALLALQEASEAYLIGLFEDTNLCAIHAKRVTIMPKDIQLARRIRHER
>M-3300017971-10.Ga0180438_10005924_6
MHILARVVCWFVFTHHKKKVASFFLVLLFLSYLFFEKKPNPHISKNKKKRITQTKKSMARTKQTSRMIMGHKAPRKSPANPQPRAKPHRYRPGTVALREIRRYQKSTDLLIRKLPFQRLVRQIAQDFKDDLRFQASAVFALQETAEAYLVSLFEDTNLCAIHARRVTITPKDLQLARRIRGERPT
>M-3300017989-17.Ga0180432_10000796_46
MARIKQTNPKKTTGKAPRKKLNQKRSKTEKAPLTEAAATAAVGRDAPKTRKPHRFRPGTVALRQIRKYQKSTDLLLRKLPFQRLIREVAAEYKEDLRFQATAIEAIQEASENFLVNLYEDGNLLALHASRVTVQPEDLRLAKRLMNHNA
>M-3300018416-26.Ga0181553_10000392_19
MARVKSTNRVSTGGKMFRKQLATKAARKSFPNSINSIKKPYRYRPGTVALREIRKYQKSTDLLIRRLPFHRLVREIAQDFSQHLRFQSHAVLALQEAAEAYLVGIFEDTNLCAIHSNRITIMPRDIQLARRIRGESA
>M-3300018876-24.Ga0181564_10001970_2
MARTKQTARKCTGGKAPRRFLQAMAARKFAAPRPRRHHRYRPGTVALREIRKYQKSTELLIRKVPFQKLVREIAQDYKMDLRFQSSAILALQEAAEAYLVGLFEDTNLCAIHAKRVTIMVKDLQLARRIRGDRA
>M-3300020185-59.Ga0206131_10002306_3
MISKHVSFTDDLVDNLSDKENKPPHTDVDTMPQECVTDDDETSVASVVTHDVKSPSKLRPKPKLKPKAYRKKKDKWLSEILLQQASTKLTIPKAAFHRLVREMTIEFQTDIRYEANAFECLQEASEAFLVQMLEDAQLVCLFSGRRMISKKDIQVALRKKLF
>M-3300020185-80.Ga0206131_10002165_17
MARTKQTARKSLGGKAPRKTLATKAARSSAPTSGGVKKPHRYRPGTVALREIRKYQKSTELLIRKLPFQKLVKEISQDYKTDLRFQSTALLALQEASESYLVGLFEDTNMCAIHAKRITIMPKDIQLARRIRGESRSMSFHIVQRPCGSVAPEIIPILPSREVEGTESGETQAQVQGTESGETQAQ
>M-3300020187-94.Ga0206130_10010125_3
MARTKQTARKSLGGLVPIKMMKQLATLASRKSAPATGGVKRPKRYRPGTVALREIRKYQKGTELLIRKLPFQRLVKEISQDYKTDLKFQSAAILALQEASEAYLVGLFEDTNMCAIHAKRVTIMPKDIHLARRIRGEKRY
>M-3300020716-5.Ga0214207_1000346_6
MARTKQTARRSTGGKAPRKHLATKTVPPALRAVETPKGGVRKPRRYRPGTAALREIRKYQKSTELLIRKLPFQRLVREIAQDFKTDIRFQSSAVLAIQEAAEAYLIGLFEDCNLCAIHSKRVTIMPKDIQLARRIRGERS
>M-3300021354-14.Ga0194047_10000041_64
MELRSRKNQVGERVLRSGAPQAARKVPRGKPKPAPAQKVSVPKSRAERAAARNDRVPQMVNVPPVQPEPAIRRSASKPEVIAPAPVIRRGDSKTEVIEPEPVMRRGDSKTEVIEPEPVMRRGDSKMEVIEPEPVMRRGDSKMEVIEPEPVVGKRSSSKVDIAVEKVPSPVRPVQAEAPLAPEVAQKLLQQLDGTQAKHAMKTTGGRGGNPKKTGRQAPAKPGNQPAKPKTGGPNGIPKRRPEVDRKNIVNTERDRKKRFKLDKRLPHTNKNSRRAMLLRGVRIAQRNSSEDVFYKAPFRYLIIQIAQAHNKGDVRYTAESVKVLQEILEFEVIRLLEIAQMASSHARPGRGVADSKPKVLRSDIEFSYRMKFGNHFGDTGLDLVKVY
>M-3300023179-83.Ga0214923_10000013_208
MSNYCHKKFSMARTKQVARKVLAIKEPFTSAPATGGKMFPTMPSRIKFTSRRSAPATGGVKKPYRYRPGTVALREIRKYQKSTELLIRKLPFQRLVREITEGFKLDVRFQSTALLALQEASEAYLVSLFEDTNLCAIHAKRVTIMPKDMQLARRIRGERS
>M-3300023184-120.Ga0214919_10000160_32
MVNDQQIIGGSRGGQKAFKAVKQVKSRKQSAPKKANVKETTKKRRYRPGEKALREIRFYQRNTDLLIRRIPFGRLVREIQTYFFRKEYRWQAEAILALQEAAEAHLVGLFEDANLCTIHAKRITLMTKDIQLARRIRGPLRE
>M-3300023184-120.Ga0214919_10000160_49
MGFSVVNCTIYTDMARTKHTARKSTGGKAPRKLLATKAARRLAPQCGGVKRPHRYRPGVVALREIRKYQKSTDLLIRKVPFQRLVREVACDYKSDLKFQSAALLCLQEASEAYLTGLFEDTNLCAIHAKRVTIIPKDMQLARRLRGERS
>M-3300024510-1.Ga0255187_1000007_147
MRTPKFKYKISRDILASYMHQNITIEPFATNHTPILCTWIIFNPLDKILHHRQITVLTKNIHEFGLHVIVRDDEETISIERSKISKNTFDDVFIEYGRSRPDFSEAPIEPVDMKDVMTYKYPTQNSLVFVTSGLRGRVRVTLFHFFINSIIDFNSNALFFERICNDLNANYEKIVNSFSQIKTKENVTFVDNEKQYITFENEHGEVFATSIRKSGEYLPKELLPPASDDIVSEKEEEEEVPADNEDDKKRKREKVQSDDEELQSDDEEVQSDDENIVSDNAKKQRMCKRCRQPGHYAKNCGKEPKVSRTYTCSLCGETGHGIRSCPSRVGGRKKFKSKPFDNNGIRKIVHEQIVAEKKKGIPFAALVRCIRDESEKINADMKWQYSACAAISTAVHAYMNEVMEDVNNCAINRNSTTVEPRDFHLRALLRRETFPVVKTLKVGKTDVNVIMDDDYYE
>M-3300025310-20.Ga0209172_10003417_16
MELPRLENITKTITRNQSNGFHSGHEELSFNLSNSPTSVISYDVSSPSFNISPSRLVAIKSNKPTPRPKSPNPRKKEKPQRCPPGTRALMEIRKQQKRTNLLIRKGPFARLFKETFNRYSYLGRVTRVQPAAIEAVQHAAENYLIGLFHDANLCAIHSGRITIQPKDIQLVRRLRGLKEALF
>M-3300027721-1.Ga0209492_1001171_2
MLVREIAQDYKRDLKFQAEAINILQVVSEEYLLKLFQDSQQCANHAKRDTLTVDDMRLVKILRKNTTLDFEK
>M-3300027754-40.Ga0209596_1001834_3
MDEDKILKEISYYQNITHNLIPRESFKRLYYEVLQDFIANSCFSNNYE
>S-1026894-114.1026894_contig_93_26
MARVKQIAGKSIAGMRPAFGTKTLPLAKGGVKKPHRFRPGTVALREIRVYQKSTNLLIRKAPFQRLVREVAATYRPDLRFQSAAVMAIQEAAEAYLVGVFEDTNLAALHAKRVTIMPKDLKLARRIRGDIA
>S-1035106-53.1035106_contig_1019_6
MGRDEKSHNGVAKVAEKRAANSSSSRSERRMYRPGEKVLREIRFYQRSTFLLIRRAPFARLVREVQTLFFRNPIRWQAEAMLALQEAAESHIVGLFEDANLCTIHAKRVTIMPKDIQLARRIRGPLRE
>S-1035112-36.1035112_contig_121_30
MARTKQTARKSVGGKAPRKQLATKAARLACPTSGTFVCFFWLFFGIASSHDFSHPLLKNPTLSGGVKKPHRYRPGTVALREIRKYQKSTELLIRKLPFQRLIREITQDWKTDLRFQSSAMLALQESAEAYLIGLFEDTNLCAIHAKRVTIMPKDMALARRIRGERT
>S-1037377-105.1037377_contig_6351_10
MARTKQTARRSTGGKTPRKNLAAKAARKNPTPWSQMKPKKKRYRPGTVAIREIRKFQKSTELLIRKLPFQRLVKEVAQDVCSTPMRFQSLAILALQEATEAYLTGLFEDTNLCAIHAKRVTIMPQDMQLARRIRGH
>S-1063923-109.1063923_contig_6153_5
MARTKAKVVQNSGKVRLSEAARAAGKMPRKPQTQESSNEAKKPHRFRPGTVALREIRRYQKSTELLIRKMPFQRLVREIAQDFKPDLRFQASAVAALQEASEAYLVGLFEDTNLCALHARRVTIMPRDIQLARRIRGERT
>S-1091232-186.1091232_contig_1680_45
MARTKQTARKSTGGKAPRKQLATKAARTSAPATGGVKKPHRYRPGTVALREIRRYQKSVTLLIRKLPFQRLVREIAQEFKTDLRFQSKAVEALQEACEAYLTSLFEDINLCAIHAKRVTIQPKDIQLARRISTGFA
>S-1101168-109.1101168_contig_12964_5
MARTKQTARKSTGGKAPRKQLAAAARIGSFSDYERRLQEVRELQEQAQLQLDQVNSEILGEGAGVSSSINDEPYAQEEEAQQVDQALSTNINDKMTRQREVLQPLKVDSTEEPEPLGGGAPSRSRKRKQRKTQKRKNRRNRTRKN
>S-3300010368-67.3300010368_a_Ga0129324_10009559_6
MARTKQTANAHSAYAKKSPFRMILKQAFQSKLAGTRKPIRFKPGTVALRQIRKLQRTCDLLVPLQPFQRLIREVALSFKSDLRFEREALLAIQEAAEAHLIGMFQHTQNLAVHSGRKTIQDKDLSLAVHVRENATTPHIPPNRLVRVMPRFNNTPPIETSTQAQDSQHNGTALAQLASVVETLEGDTVQQVQSTLVDSPSQHAQHFEDSPRF
>S-3300013131-216.3300013131_a_Ga0172373_10001433_30
MPPKSSSVVANRAFHIAPRPTDKARRQHEVDARELVRHRRDALRHRSRREQEEEEEEEEEDEEEYEEEEEEQRGAAAASKVSSISSVLSSSRRRAAQKGSASALHKVSPSLLRSSEKATRAALLSAIARSQVSDTHLLSAPAFQRHVDMLAQSAGVTVGVEVIALLQEAVESYLVDLFTHARNVAAHACRDRVDGADIRIVCDIRSDHLRFPPA
>S-ERX555907-72.ERX555907_contig_2105_9
MSDDDEMSQCESSCSSSEGSIDIIYQLINVENFRVTPDKLYWMFEEFRRDEGHEPNINDIRAELVAKQFDVQDIHDQESSDEESEFDLQLYFTGEESDETEVDSEEEREMEEDIYLQDKHEEFESHLDKYLMEQSYLTGYEPDNYAVQDIYEKSYNESLSTQQAINEIKQLPISDEDKKEIEEFKEDTPVWLSEINIEQNKTDLCLNKRNFKNLVKEIANDCKTGLQFEPEVFEMLQHVSEDYLIKLFEDTQKMAIKAQRQEIMPKDMYTVRHLNKN
>S-ERX555967-131.ERX555967_contig_106_76
MSYSSFSHINYNSDNDIIYQLINIENFTIHPDKLHNIICEIQNIHNLEKPHLLLEYTRAILLAEQFNNLESSVNDSSTDSEFDFNETESEIDSDDEAEMENDMQFQDELEKKVKPLDNWLSEHTYLLPYLDDNFITTIYSQLQDNTITHTEALNHLFQHKHSIILEHPDYTPDDLWLREIKSEQEKTDLCLNTINFENLVREMVHNYTPHIIDFEPTVFTMLQTISESFLIETFQHANIKAISVNRHEIQPKDLH
>S-ERX555967-131.ERX555967_contig_365_3
MSDSESEENIDIVYELINTEGFNIDAEKLQSIEEEIREKYNLWNSDDKTVLNHMRAYLLEETFNINIDENKPQINNEDCIDEHDFEETETEVDSDEEEQMEKDMEIEDNMEEEESKLDNWLHENKYFESYMDDDVISQIYNDFRSGDLNKKDAIQNFEHHKAQSLSKIQAEDPEFVPDPTWLCEIKCQQEKTDFCLNKDNFENLIREITQDINSNITNFEPEVFEMLQSVSEDYLLEKFRFANYQAIHSGRDEIRPKDFYSIRF
>S-ERX556019-53.ERX556019_contig_403_29
MARTKQTARKSTGGKAPRSELAAKVSRNINMYGVRRRPHRYRPGTVALRQIRNYQKSTELLIRKLPFQRLVREVAQDFKCDLRFQSTALMALQEAAEAYLVGIMEDTNLCAIHAKRVTIMPKDMQLARCIRGERK
>S-ERX556028-76.ERX556028_contig_3277_7
MPCAISSASASASTAPRTGSIAAHARWRCATRRARWKGARTPRASASCIVEEKKKKKKKKKEVKKEEGKVELVSQIDKKQKTKTGSWRFPTILDYFLSMYWMETVDGSLRRARTGRSLISRQPPPSARDRSLFTFSLAVVAPVFVFVIRHPSLQSMAKTKQTARKSTGGKAPYLHLASKVPRQAARKSAPRKGGVRKPHRYRPGTVALREIRKYQRSTELLIRKLPFQRLVREISQDFKTDIRFQSTALLALQEAAEAYLVGIFEDTVGLCLRARCHTPPADPPHAHAHHARATHAPHL
>S-ERX556045-64.ERX556045_contig_3476_12
MPKVLHSQLTRLDIKRRYKYKPGTVALREVHKLQRTSHLLIPLSPFQELIREIAMSYKADLRFQREALLAIQEAAESHLVSIFQHTQNMAIHGGRKTIQDRDLTLALHVRDNNVEPFIPPVRTRRDLLMNNVAKMVDENSNSSGNTSPNITNTNTTVPELQSTNEVQTYVQREDEDEDEDEEHDY
>S-ERX556119-27.ERX556119_contig_957_6
MARTKQTARKSTGGKAPRKMLATKAARKTEPHSGGVKKPHKYRPGTVALREIRKYQKTTDLLLRKLPFSRLVREVSQGFRSDIRWQASAMQGIQEITEAFIIGLLQNANLGCIHAKRVTCMPKDVDLAHRIKYRQENGI
>Sylv.Sylv_21_653
MARTKQTARKSCSGKTPRKMLATKTARKAFPQDAGIKGKRKFRYRPGTVCLQEIRRFQRSTELLIKKMPFQRLVREIAEEFKHDLRFQSTAILALQEISESYLVGLFEDTNLCAIHAKRVTISVRDMHLALRIRGERS
'''
# Parse the inline FASTA text into a list of SeqRecord objects. The context
# manager closes the in-memory buffer even if parsing raises.
with StringIO(fasta_str) as fasta_io:
    seq_records = list(SeqIO.parse(fasta_io, "fasta"))
seq_records

[SeqRecord(seq=Seq('MPRTKRVAESSVLLRSSDRRPVVAAKSPFRFIMKKHSQSRLRDANGNLIRKPYK...FDD'), id='M-3300025880-30.Ga0209534_10000479_25', name='M-3300025880-30.Ga0209534_10000479_25', description='M-3300025880-30.Ga0209534_10000479_25', dbxrefs=[]),
 SeqRecord(seq=Seq('MAQKAPRKKDAQKAMKAPRSTDGAKKPHRFRPGTVALREIRRFQKSTDLLMPKL...NTR'), id='S-1063923-109.1063923_contig_2940_19', name='S-1063923-109.1063923_contig_2940_19', description='S-1063923-109.1063923_contig_2940_19', dbxrefs=[]),
 SeqRecord(seq=Seq('MEQTMNKLSLDDDEWEDEIEDEEEIKAEEENSEEESESDNEVEEVEEDEEEEEI...ELS'), id='Arvu.Arvu_1_24', name='Arvu.Arvu_1_24', description='Arvu.Arvu_1_24', dbxrefs=[]),
 SeqRecord(seq=Seq('MPKERAIKKSKSAVKKVAEKKAAIKSKAKKAATGVKKPHRFRPGTTAKRLSKKE...KKE'), id='Medus.AP018495_BBI30395_1_255', name='Medus.AP018495_BBI30395_1_255', description='Medus.AP018495_BBI30395_1_255', dbxrefs=[]),
 SeqRecord(seq=Seq('MARKIQVARKSFGGKAPLSKQAFKAALKIPISIKKKKRSRPGLIALKEIRKFQN...KPI'), id='M-3300005517-3.Ga0070374_10002873_8', name='M-33

In [77]:
# Turn every parsed FASTA record into a row dict for the `sequence` table.
# Accessions are synthetic: the current prefix plus the record's position in
# the FASTA block. Annotation fields are set to None; only the sequence
# itself is filled in.
data_sequence = []
for i, record in enumerate(seq_records):
    # Echo the source FASTA id so the position -> original id mapping stays visible.
    print("------------------------------------------------", record.id, sep="\n")
    row = {
        "accession": f"{accession_prefix}_{i}",
        "variant": "H3-like_(Viruses)",
        **dict.fromkeys(
            (
                "gi",
                "ncbi_gene_id",
                "hgnc_gene_name",
                "taxonomy_id",
                "organism",
                "phylum",
                "class",
                "taxonomy_group",
                "info",
            )
        ),
        "sequence": str(record.seq),
        "variant_under_consideration": None,
    }
    data_sequence.append(row)

------------------------------------------------
M-3300025880-30.Ga0209534_10000479_25
------------------------------------------------
S-1063923-109.1063923_contig_2940_19
------------------------------------------------
Arvu.Arvu_1_24
------------------------------------------------
Medus.AP018495_BBI30395_1_255
------------------------------------------------
M-3300005517-3.Ga0070374_10002873_8
------------------------------------------------
M-3300009182-30.Ga0114959_10001569_12
------------------------------------------------
M-3300009182-30.Ga0114959_10001569_15
------------------------------------------------
M-3300009436-29.Ga0115008_10001158_7
------------------------------------------------
M-3300010375-16.Ga0105239_10001517_25
------------------------------------------------
M-3300014838-63.Ga0182030_10004491_6
------------------------------------------------
M-3300017724-7.Ga0181388_1001570_2
------------------------------------------------
M-3300017765-1.Ga0181413_1000150_

In [78]:
# Spot-check the second generated row: print each field, its value and its Python type.
second_row = data_sequence[1]
for field in second_row:
    print(field, second_row[field], type(second_row[field]))

accession HISTDB_H3_like_1 <class 'str'>
variant H3-like_(Viruses) <class 'str'>
gi None <class 'NoneType'>
ncbi_gene_id None <class 'NoneType'>
hgnc_gene_name None <class 'NoneType'>
taxonomy_id None <class 'NoneType'>
organism None <class 'NoneType'>
phylum None <class 'NoneType'>
class None <class 'NoneType'>
taxonomy_group None <class 'NoneType'>
info None <class 'NoneType'>
sequence MAQKAPRKKDAQKAMKAPRSTDGAKKPHRFRPGTVALREIRRFQKSTDLLMPKLPFQRMVRIAADDVQKDVRFQASAIAALQEATEAYIVSLFEQANLAAIHSKRVTVMPKDIALALRIRPKNTR <class 'str'>
variant_under_consideration None <class 'NoneType'>


In [79]:
# Execute the add_sequence statement once per prepared record. A record that
# raises is reported (accession, error, full field dump) and its accession is
# collected in failed_toadd, so one bad record does not stop the others.
failed_toadd = []
for ds in data_sequence:
    try:
        cursor.execute(add_sequence, ds)
    except Exception as e:
        print(ds["accession"], e, sep="\n")
        for field, value in ds.items():
            print(field, value, type(value))
        failed_toadd.append(ds["accession"])

In [80]:
# Read back the H3-like viral rows, left-joined with their publication links,
# and display only the rows whose accession starts with accession_prefix.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.variant = 'H3-like_(Viruses)' "
)
cursor.execute(query)
df = pd.DataFrame(
    cursor.fetchall(),
    columns=[column[0] for column in cursor.description],
)
df.loc[df["accession"].str.startswith(accession_prefix)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H3_like_0,H3-like_(Viruses),,,,,,,,,,MPRTKRVAESSVLLRSSDRRPVVAAKSPFRFIMKKHSQSRLRDANG...,,,
1,HISTDB_H3_like_1,H3-like_(Viruses),,,,,,,,,,MAQKAPRKKDAQKAMKAPRSTDGAKKPHRFRPGTVALREIRRFQKS...,,,
2,HISTDB_H3_like_10,H3-like_(Viruses),,,,,,,,,,MARTKQTARKSLGGLAPIKMMKQLATLASRKSAPATGGVKRPKRYR...,,,
3,HISTDB_H3_like_11,H3-like_(Viruses),,,,,,,,,,MARTKQTARKSTGGKAPRKQLATKVARMKARPQAGGVKKPHRYRPG...,,,
4,HISTDB_H3_like_12,H3-like_(Viruses),,,,,,,,,,MHILARVVCWFVFTHHKKKVASFFLVLLFLSYLFFEKKPNPHISKN...,,,
5,HISTDB_H3_like_13,H3-like_(Viruses),,,,,,,,,,MARIKQTNPKKTTGKAPRKKLNQKRSKTEKAPLTEAAATAAVGRDA...,,,
6,HISTDB_H3_like_14,H3-like_(Viruses),,,,,,,,,,MARVKSTNRVSTGGKMFRKQLATKAARKSFPNSINSIKKPYRYRPG...,,,
7,HISTDB_H3_like_15,H3-like_(Viruses),,,,,,,,,,MARTKQTARKCTGGKAPRRFLQAMAARKFAAPRPRRHHRYRPGTVA...,,,
8,HISTDB_H3_like_16,H3-like_(Viruses),,,,,,,,,,MISKHVSFTDDLVDNLSDKENKPPHTDVDTMPQECVTDDDETSVAS...,,,
9,HISTDB_H3_like_17,H3-like_(Viruses),,,,,,,,,,MARTKQTARKSLGGKAPRKTLATKAARSSAPTSGGVKKPHRYRPGT...,,,


In [81]:
# Look up the publication row that the new sequences are going to be linked to.
pid = 'irwin_self-assembling_2024'
# The id is bound as a query parameter instead of being formatted into the SQL
# text, so quoting/escaping is handled by the connector.
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[column[0] for column in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,irwin_self-assembling_2024,Self-assembling viral histones are evolutionar...,10.1038/s41564-024-01707-9,,2024


In [82]:
# Link every prepared sequence to the publication `pid`. Accessions whose
# link statement fails are collected in failed_toadd_publication.
failed_toadd_publication = []
for ds in data_sequence:
    try:
        cursor.execute(add_sequence_has_publication, (ds["accession"], pid))
    except Exception as e:
        # `except Exception` instead of a bare `except` lets KeyboardInterrupt
        # and SystemExit propagate; the error is printed so the cause of a
        # failed link is visible, as in the sequence-insert loop.
        print(ds["accession"])
        print(e)
        failed_toadd_publication.append(ds["accession"])

In [83]:
# Same read-back as before the linking step: the publication columns of the
# joined rows show whether each accession now has a publication attached.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.variant = 'H3-like_(Viruses)' "
)
cursor.execute(query)
df = pd.DataFrame(
    cursor.fetchall(),
    columns=[column[0] for column in cursor.description],
)
has_section_prefix = df["accession"].str.startswith(accession_prefix)
df[has_section_prefix]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H3_like_0,H3-like_(Viruses),,,,,,,,,,MPRTKRVAESSVLLRSSDRRPVVAAKSPFRFIMKKHSQSRLRDANG...,,HISTDB_H3_like_0,irwin_self-assembling_2024
1,HISTDB_H3_like_1,H3-like_(Viruses),,,,,,,,,,MAQKAPRKKDAQKAMKAPRSTDGAKKPHRFRPGTVALREIRRFQKS...,,HISTDB_H3_like_1,irwin_self-assembling_2024
2,HISTDB_H3_like_10,H3-like_(Viruses),,,,,,,,,,MARTKQTARKSLGGLAPIKMMKQLATLASRKSAPATGGVKRPKRYR...,,HISTDB_H3_like_10,irwin_self-assembling_2024
3,HISTDB_H3_like_11,H3-like_(Viruses),,,,,,,,,,MARTKQTARKSTGGKAPRKQLATKVARMKARPQAGGVKKPHRYRPG...,,HISTDB_H3_like_11,irwin_self-assembling_2024
4,HISTDB_H3_like_12,H3-like_(Viruses),,,,,,,,,,MHILARVVCWFVFTHHKKKVASFFLVLLFLSYLFFEKKPNPHISKN...,,HISTDB_H3_like_12,irwin_self-assembling_2024
5,HISTDB_H3_like_13,H3-like_(Viruses),,,,,,,,,,MARIKQTNPKKTTGKAPRKKLNQKRSKTEKAPLTEAAATAAVGRDA...,,HISTDB_H3_like_13,irwin_self-assembling_2024
6,HISTDB_H3_like_14,H3-like_(Viruses),,,,,,,,,,MARVKSTNRVSTGGKMFRKQLATKAARKSFPNSINSIKKPYRYRPG...,,HISTDB_H3_like_14,irwin_self-assembling_2024
7,HISTDB_H3_like_15,H3-like_(Viruses),,,,,,,,,,MARTKQTARKCTGGKAPRRFLQAMAARKFAAPRPRRHHRYRPGTVA...,,HISTDB_H3_like_15,irwin_self-assembling_2024
8,HISTDB_H3_like_16,H3-like_(Viruses),,,,,,,,,,MISKHVSFTDDLVDNLSDKENKPPHTDVDTMPQECVTDDDETSVAS...,,HISTDB_H3_like_16,irwin_self-assembling_2024
9,HISTDB_H3_like_17,H3-like_(Viruses),,,,,,,,,,MARTKQTARKSLGGKAPRKTLATKAARSSAPTSGGVKKPHRYRPGT...,,HISTDB_H3_like_17,irwin_self-assembling_2024


In [84]:
# Commit the current transaction so the statements executed through `cursor`
# are persisted in the database.
# NOTE(review): the commit is unconditional - check that `failed_toadd` and
# `failed_toadd_publication` are empty before relying on the stored data.
conn.commit()

# Add other viral sequences as H4-like_(Viruses)

These sequences are taken from the [article](https://www.nature.com/articles/s41564-024-01707-9#data-availability) (see its data-availability section).

pid='irwin_self-assembling_2024'

**TODO: the accession identifiers have not been looked up yet — they still need to be found!**

**TODO: possibly not all of the sequences have been added — this needs to be checked.**

In [85]:
# Prefix of the locally generated accessions for this section; the records
# built below get "<accession_prefix>_<index>" as their accession.
accession_prefix = 'HISTDB_H4_like'

In [86]:
fasta_str = '''>M-3300010375-16.Ga0105239_10000898_19
MRRIRSALSGAVAAKPRTGSAPSSAASAPSSALSGSAQPASALSGSPASSALSPAPSVPPAPSPASASSFLSRSNVVVSSSPHPQDQNQTNTNAKDDDEETIGNEDEEKEQEPEQEDKEQEEQEDKENDNDDDDDKEENDKEDDDDTDTTNDKMNKMVTAKHVLINNKPSVPTQSKNMKGLGKFSYGGKGKHQSMVRHRKVLRDNIKGITKPAIRRLARRGGVKRLSGMIYEPTRNVLKVFLEETIRDAVTYTEHAHRKTVTTMDVVYSLKRRGRTLYGFGG
>M-3300012936-38.Ga0163109_10001438_2
MKTISLDKRALRHRNILWNNIQGISKNDIKRLARKGGVKRINGLIYDEIRCSLRGFLTNIIKDSVTYAEHSRRKTVTTMDILYALKRSGKTLYL
>M-3300013004-26.Ga0164293_10004614_11
MSKEISSLSLKRLFNRAGIKRVKHDCYNNLIKYISNFTKQILSDTTTLISNRKNKTITEEDIKNGHIVNNLLKIIDNIHNGGDSEGFCHGSPSQCGIVNQVIPVNAPEMQKGGTVDRIIPNSFCNGNPSQCFYSDLATECKVGGSVIRDNNEYIFSIPNTQFQRFLNTLQFNDVKMTKNSVNYLQYIVEQDAINYLIDKKKDMDDMNQKQLDLIEENDDSNNE
>M-3300017765-1.Ga0181413_1000103_21
MYGRSGPPEYSSAEIKQLLEDRERWERLEVTLREKGQWFAPDEIDKARRDAVYRAADEKAAAEKAAAEKAAAEKAAAQAKVRPTPRGRGKGGKGLGKGGSKRHRMFLRDNIQGITKPAIRRLARRGGVKRISGLIYEETRGVLKVFLEGVIRDAIEYTTHARRKTVNAIDVVYALKRQGRSLYGFGG
>M-3300017989-29.Ga0180432_10002178_29
MSVVRHRRRGLPVGPSKAAIAKLAKRGGVKRIGSDSSGRHTFDEVKGVLKQFIETVLKDAIMYTEYARRKTITVGDILYALKRQGRHVYGFDEGVVVKPKKTKVIPLHRRRITDYEIYVDGKHKANRSRQDGLWDEQRVDQLRRVYMEEKQEEARDGKHRYFKIYFRTRPRGRKAQEALYKVYINGRLLNNPSRSNGLWLNRELTAKLRRFKETHRELGRGFVKLYLENK
>M-3300017991-11.Ga0180434_10016851_5
MEDLYKYKMEDLTKPSITRLARRAGVKSVSDDCFVPIRNIIANRLDELIMTALVVNSEHQTKTLMSDDIYDALWLSGENTTQSNDLGTSTCSK
>M-3300023174-125.Ga0214921_10003031_21
MDYLNKSSINKLSRRAGVKSISEECNEKIRKIIENKLDEIISTIIIINSEHNTKTIMTNDVHESLHLLNHKITTYN
>M-3300023179-83.Ga0214923_10000013_213
MTEGRGKGKANIGKGGSKRHRKILRDSAHGITKPAIRRLARRGGVKRLSGLIYEETRGILRMFLENIMCDAITYTDYSRRKTVTAMDVVYALKRQGRTLYGGFETK
>S-1063923-109.1063923_contig_1741_4
MPGKKKPATAGGQPKRRITRVIYIRPFPTKASTRRLARRGGVKRLGGGVYDDMQKKIRGFVEETTRLAVTYMDHARRKTVRADDVVHALKRSGHVLYGY
>S-1091232-186.1091232_contig_1680_44
MSGRGKGKGGKGLGKGLGQGLRHRKILRDNIQGITKPAIRRLCRRGGVKRISGLIYDETRGVLKVFLENVIRDAVTYTEHAKRKTVTGLDVIYALKRQGRTLYGFGG
>S-ERX556119-27.ERX556119_contig_957_12
MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRSGIKRISGELYETVRGHTKTFLTNVICDAITYTEHARRRTVSAMDLVYALKRQDISAYGQWTGY
>M-3300001346-3.JGI20151J14362_10006060_6
MENLLLKTGFDNLENDDSNQEVQLQMIALLTVFMENAMKTAEIYTKEANRKVITSHDISLSLKRELFTFLDNEDIEERSLEILNEFKSELENHNETYNDESDEGDEGVEGDEGVEGDEGDEGDEGDEGDEGDEGDEGDEGDEEEFTICNSDCKICQEVNMYAEKWKTWQPTNNIEKILWSGINKIDEKFNLN
>M-3300005589-24.Ga0070729_10000551_85
MEITRPSITRLARRAGIKSVSEECFPSIKALIVYELENAIRASLIVNSEHQTKTLMTDDIYDGLALNGKRLTMSHDLGTATVAK
>M-3300009163-40.Ga0114970_10000255_13
MSKEISSLSLKRLFNRAGIKRVKHESYEQLAKYMTNFTKQILSDTNVLISNRKNKIMTEEDIKNGLIVNNLLHIIDNIHKGGNSENFCHGNNSQCGSDPPIIAPPEPGKMTGGTYDSITNTFCNGNTSQCYFDDVVDCNMGGGGLLNNDEYVFSIPNTQFQRFLNTLHFNNMKITKSSVNYLQYFIEQNAIKYLIAKKKDMEDMNQKQFDLLEENNNDSDTSI
>M-3300012952-6.Ga0163180_10010018_5
MSTPETTQDNITDKGFTKPAITRLARRAGVKSMSDDCVNPIRNLIGMELDRILSTVLIVNEQHATKTIMPDDIYEALGLLGTHVAKTEEF
>M-3300012953-22.Ga0163179_10001168_4
MEHITKPSMTRLARRAGVKSMSEDCYPVIHQEIGEMIEDIMRVALVVNSTRSTKTLMVEDMYDALRLKGYNVAHSTDMGTGTCLK
>M-3300017786-5.Ga0181424_10000068_27
MTDRGIPDQIKTGFNLIKDDGQDVIENIASIVLVFMENAVKSADIYAKHAKRNTITSEDIKRALMLEVFFMKQRPNMLEQCEEMKKTIQDIIKEEEDSDEEFEIFDENEEESFKESECTCPMCGCMNTIYTRWENFTPETTIEIAMAKHIETIS
>M-3300018065-4.Ga0180430_10003326_10
MDNVPNSTLRKIAQRAGIQRVDHDCYPIMRQLLSARLDNIMAVIFAIRESRATKTITQDIVNSALVSLGYSVLPEDVTGSYTLVRDIPDEDDSISEDSGNTV
>M-3300020187-27.Ga0206130_10001190_21
MNNLFQILFGGNIKKKKNCNLTNSSIKRFARKAGIMYLPKNTYDNIRKIFNNKFKKTLSELENITELRNGKIITSNDFQFYILKKKYEKLYNKYHNSNFIGGNDNPSYCDGPTNLTQCMDSLDTCNTLNGGSDNPSYCDGPTNLTQCMDSLDTCNTLNGGSNNPSYCDGPTNLTQCMDSLDTCNTLNGGSDNPSYCDGPTNLTQCIDSLDTCNTQRGGQSTKNKDLNFLFPHKTFKRYLKEYKNTDFKISNKFAIKLHLYLENYIHHTLVKASLLENKSKNLKISLDKFL
>M-3300021364-5.Ga0213859_10000473_10
MNENNKFSDTNEQISKGSIVKLARRSGIQIMNKDTIEIVNDIISYELDDIIKSSISVKNVNKEKTLMKDHVYSAMKLKGDNYQKIL
>M-3300022752-15.Ga0214917_10000327_48
MEITKPSITRLSRRAGVKSLSDECHEMIRKIIENKLDEVLNAVLCVNSEHNTKTILSSDVYGALHLLNHKVTASTEL
>M-3300023174-198.Ga0214921_10000796_51
MDITKPSITKLSRRAGVKSLSDDCHDMIRNIMENKLTDVIKAVIAVNSEHNTKTIMSNDVYDALSLLNHRVTQSNDLNV
>M-3300023179-108.Ga0214923_10000479_29
MEGITKPSLTRLARRAGVKSLSDDCFDTVRNLIGMKLTEVIKTINIVNSEHQTKTIMPSDIYESLHLLNYNVTQSNDLSINK
>M-3300023179-111.Ga0214923_10000143_35
MEITKPSITRLSRRAGVKSLSDECHDTIRKIIETKLDEILKTVITVNSEHNTKTIMTADVYEALHLLNHNITTSNDLNS
>M-3300023179-146.Ga0214923_10000147_31
MENIDNISKPSITRLARQSGIKSLSDDCFETIRNIMDEKIDEIVKTILIINSEHQTKTVMVSDAYHALQMLNHNITESTELNTKSKN
>M-3300023179-159.Ga0214923_10009027_5
MENNIDNISKPSITRLARQSGIKSLSEDCFETIRNLIDEKLDEITKTILIVNSEHQTKTIMVSDVYKALQILNYNVAESTELNTKNK
>M-3300023179-32.Ga0214923_10001249_9
METISKPSLTKLARQAGIKSLSDDCFETIRHVMNNKIDEIIKTMLIVNSEHKTKTIIVNDVYESLQILNHNVAESSELNTKS
>M-3300023179-45.Ga0214923_10001383_30
MDYLNKSSINKLSRRAGVKSISEECNEKIRKIIELKLDEIISTIIVINSEHNTKTIMVNDVYDSLHLLNHKITSSSGL
>M-3300023179-83.Ga0214923_10000013_140
MDVITKISITKLSKRAGIKCISDDCYDTIKQIIETKLSDVLHKILIVNSENKSKTIMNTDVYAALEIMGETLTQSNDLGLHKQKLK
>M-3300023184-29.Ga0214919_10008089_5
MENIDNISKPSITRLARQAGIKSLSEDCFETVRNLIDEKLNEVIKTIIVVNSEHQTKTIMVSDVYKGLQLLNYNVAESSYLNTKNKS
>M-3300024319-1.Ga0228670_1000005_4
MESITSPSITRLARKAGIKSMSNECYDCIRNIAQEELVNIVKTMLVVNSEHNTKTIMQDDIYDALKLKGHFVAQSQELSS
>M-3300025676-16.Ga0209657_1000031_8
MESITRPSITRLARKAGIKSMSNDCYDCIRGIAQEELVNIVKTMLVVNSEHNTKTIMQDNIYDALKLKGHFVAQSQELSS
>M-3300027687-8.Ga0209710_1002799_3
MTHIIYAYLLQMNTTQPKQIESDTVENVMAIVLSFMENAIHDAGTYVEHAGRTIVSKQDIRMALQAETFEYMRREDIEGALLYYKEEVHKDIEQYDNPHYEDSDSGEDEVQTFLQNKVVPDSDMETFTKSLCKCPICSRIHRAVAMWDTWNPTTDMEKVIKKVIDTQLID
>M-3300027697-22.Ga0209033_1001294_11
MENITKPSITRLARRAGVKSLSDDCYNNIRDIVNKQLSDIIVAALVVNSEHNTKTLMPEDIYEAFRLRGYNVTQSSELGTSTCAK
>S-1016713-169.1016713_contig_219_51
MDYITKPSISRLAKRAGIKTISDDCYLIIHESIGEEINKIISTALAVNKTKTLMVEDIQAAFRLNGYNIAKSNDIGSGKY
>S-1016716-111.1016716_contig_9580_5
MENITRPSITRLARRAGVKSVSDNCFDTVRELIGQRLQEVIAVSLIVNSEHQTKTLMADDVYEALRLLSYNVTQSSDMGTSTCSK
>S-1030632-100.1030632_contig_8437_5
MEHITKPSITRLARRAGVKSISEDCYPVIHDSIGAVIEDVIGVALLVNAARSTKTLMEEDIHDALRLKGYNVAQSTDLGVSTCLR
>S-1038524-41.1038524_contig_17_104
MEDITKPSITRLARRAGVKSVSDDCFNAIRHLIANRLDELILAALIVNSEHQTKTLMSDDVYDAFSLIGQNVTQSSDLGTSTCSK
>S-1101173-79.1101173_contig_5_106
MENIDNISKPSITRLARQAGIKSLSEDCFETVRNLIDDKLNEVVKAIIVVNSEHQTKTVMVSDVYKALQILNYNVAESSYLNTKNKS
>S-3300002186-40.3300002186_a_JGI24539J26755_10000119_24
MTIEVDGVTKPAMIRLARRAGIKSVADECFPFIRKIIHDKADDIIRTSIFINSEHQTKTLMTEDIYESLRISGHNVAKSDDFNGSK
>S-3300010368-70.3300010368_a_Ga0129324_10000075_58
MSGITKPSITRLARRAGVKSSSDECFDTIRGLIRDKLDEVINVAMVVNSEHQTKTLMVENIYDALHLMNENVTHSMDLGKTTYIN
>S-3300013131-216.3300013131_a_Ga0172373_10003265_4
MRSQVRRIGAEVPPRRSDRIHADAKPSSSIVHTDHPVGITASSIRRIARRAGVLTISNDVYLEGTRCYDNVIGTLVHEVCINAKHARRASVSMDDVRAVLRERDLVVFK
>S-ERX555957-35.ERX555957_contig_6704_7
MENLILKTGFNNSINNSSQSEEYELQMMALMTVFIENAVKTAEIYTKHSNRKTITSIDISLGLKKELFTFLDNDDIEERALAIFNEFKNEDFSTSSSDNDEESVEDDDDDDDDDLDREILEEDPFGNFCHLYDTNTKEKKNEIDKNEEEDEEEFKKSECECDVCKKTNEYAELWKTWEPTNRIEEILYSSIKNIDNKFNLLD
>S-ERX556045-99.ERX556045_contig_436_43
MEHITKPSIIRLARRAGVKSISEECYPVIHDTIGSTIEDIVRVALIVNSARSTKTLMVEDIHDALSLEGYNVAQSTDLGTGSCIR
>Medus.AP018495_BBI30394_1_254
MPKKIAARRSSKHIKNLGEEIGNSAVRKTVLRTGVVFRLDKTVRPKFHKVMLSKLYEAVNIAKLAAKHSGRSTIQPKDVRLGLKLASIKLLA
'''
# Parse the in-memory FASTA text into a list of SeqRecord objects.
# The context manager closes the buffer even if parsing raises; list() drains
# the parser while the buffer is still open.
with StringIO(fasta_str) as fasta_io:
    seq_records = list(SeqIO.parse(fasta_io, "fasta"))
seq_records

[SeqRecord(seq=Seq('MRRIRSALSGAVAAKPRTGSAPSSAASAPSSALSGSAQPASALSGSPASSALSP...FGG'), id='M-3300010375-16.Ga0105239_10000898_19', name='M-3300010375-16.Ga0105239_10000898_19', description='M-3300010375-16.Ga0105239_10000898_19', dbxrefs=[]),
 SeqRecord(seq=Seq('MKTISLDKRALRHRNILWNNIQGISKNDIKRLARKGGVKRINGLIYDEIRCSLR...LYL'), id='M-3300012936-38.Ga0163109_10001438_2', name='M-3300012936-38.Ga0163109_10001438_2', description='M-3300012936-38.Ga0163109_10001438_2', dbxrefs=[]),
 SeqRecord(seq=Seq('MSKEISSLSLKRLFNRAGIKRVKHDCYNNLIKYISNFTKQILSDTTTLISNRKN...NNE'), id='M-3300013004-26.Ga0164293_10004614_11', name='M-3300013004-26.Ga0164293_10004614_11', description='M-3300013004-26.Ga0164293_10004614_11', dbxrefs=[]),
 SeqRecord(seq=Seq('MYGRSGPPEYSSAEIKQLLEDRERWERLEVTLREKGQWFAPDEIDKARRDAVYR...FGG'), id='M-3300017765-1.Ga0181413_1000103_21', name='M-3300017765-1.Ga0181413_1000103_21', description='M-3300017765-1.Ga0181413_1000103_21', dbxrefs=[]),
 SeqRecord(seq=Seq('MSVVRHRRRGLPVGPSKAAIAKLAKRGGV

In [87]:
# Build one parameter dict per parsed FASTA record.
# Only accession, variant and sequence carry values; every annotation field is
# None. The original FASTA id is echoed to the output but not stored in the row.
empty_fields = (
    "gi",
    "ncbi_gene_id",
    "hgnc_gene_name",
    "taxonomy_id",
    "organism",
    "phylum",
    "class",
    "taxonomy_group",
    "info",
)
data_sequence = []
for i, record in enumerate(seq_records):
    print("------------------------------------------------")
    print(record.id)
    # Keys are added in the same order as the columns are listed elsewhere:
    # accession, variant, the None-valued annotations, sequence, and last
    # variant_under_consideration.
    row = {
        "accession": f"{accession_prefix}_{i}",
        "variant": "H4-like_(Viruses)",
    }
    row.update(dict.fromkeys(empty_fields))
    row["sequence"] = str(record.seq)
    row["variant_under_consideration"] = None
    data_sequence.append(row)

------------------------------------------------
M-3300010375-16.Ga0105239_10000898_19
------------------------------------------------
M-3300012936-38.Ga0163109_10001438_2
------------------------------------------------
M-3300013004-26.Ga0164293_10004614_11
------------------------------------------------
M-3300017765-1.Ga0181413_1000103_21
------------------------------------------------
M-3300017989-29.Ga0180432_10002178_29
------------------------------------------------
M-3300017991-11.Ga0180434_10016851_5
------------------------------------------------
M-3300023174-125.Ga0214921_10003031_21
------------------------------------------------
M-3300023179-83.Ga0214923_10000013_213
------------------------------------------------
S-1063923-109.1063923_contig_1741_4
------------------------------------------------
S-1091232-186.1091232_contig_1680_44
------------------------------------------------
S-ERX556119-27.ERX556119_contig_957_12
-----------------------------------------------

In [88]:
# Spot-check the second prepared H4-like record: field name, value and the
# Python type of the value.
for field, value in data_sequence[1].items():
    print(field, value, type(value))

accession HISTDB_H4_like_1 <class 'str'>
variant H4-like_(Viruses) <class 'str'>
gi None <class 'NoneType'>
ncbi_gene_id None <class 'NoneType'>
hgnc_gene_name None <class 'NoneType'>
taxonomy_id None <class 'NoneType'>
organism None <class 'NoneType'>
phylum None <class 'NoneType'>
class None <class 'NoneType'>
taxonomy_group None <class 'NoneType'>
info None <class 'NoneType'>
sequence MKTISLDKRALRHRNILWNNIQGISKNDIKRLARKGGVKRINGLIYDEIRCSLRGFLTNIIKDSVTYAEHSRRKTVTTMDILYALKRSGKTLYL <class 'str'>
variant_under_consideration None <class 'NoneType'>


In [89]:
# Execute the add_sequence statement once per prepared record. A record that
# raises is reported (accession, error, full field dump) and its accession is
# collected in failed_toadd, so one bad record does not stop the others.
failed_toadd = []
for ds in data_sequence:
    try:
        cursor.execute(add_sequence, ds)
    except Exception as e:
        print(ds["accession"], e, sep="\n")
        for field, value in ds.items():
            print(field, value, type(value))
        failed_toadd.append(ds["accession"])

In [90]:
# Read back the H4-like viral rows, left-joined with their publication links,
# and display only the rows whose accession starts with accession_prefix.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.variant = 'H4-like_(Viruses)' "
)
cursor.execute(query)
df = pd.DataFrame(
    cursor.fetchall(),
    columns=[column[0] for column in cursor.description],
)
df.loc[df["accession"].str.startswith(accession_prefix)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H4_like_0,H4-like_(Viruses),,,,,,,,,,MRRIRSALSGAVAAKPRTGSAPSSAASAPSSALSGSAQPASALSGS...,,,
1,HISTDB_H4_like_1,H4-like_(Viruses),,,,,,,,,,MKTISLDKRALRHRNILWNNIQGISKNDIKRLARKGGVKRINGLIY...,,,
2,HISTDB_H4_like_10,H4-like_(Viruses),,,,,,,,,,MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRSGIKR...,,,
3,HISTDB_H4_like_11,H4-like_(Viruses),,,,,,,,,,MENLLLKTGFDNLENDDSNQEVQLQMIALLTVFMENAMKTAEIYTK...,,,
4,HISTDB_H4_like_12,H4-like_(Viruses),,,,,,,,,,MEITRPSITRLARRAGIKSVSEECFPSIKALIVYELENAIRASLIV...,,,
5,HISTDB_H4_like_13,H4-like_(Viruses),,,,,,,,,,MSKEISSLSLKRLFNRAGIKRVKHESYEQLAKYMTNFTKQILSDTN...,,,
6,HISTDB_H4_like_14,H4-like_(Viruses),,,,,,,,,,MSTPETTQDNITDKGFTKPAITRLARRAGVKSMSDDCVNPIRNLIG...,,,
7,HISTDB_H4_like_15,H4-like_(Viruses),,,,,,,,,,MEHITKPSMTRLARRAGVKSMSEDCYPVIHQEIGEMIEDIMRVALV...,,,
8,HISTDB_H4_like_16,H4-like_(Viruses),,,,,,,,,,MTDRGIPDQIKTGFNLIKDDGQDVIENIASIVLVFMENAVKSADIY...,,,
9,HISTDB_H4_like_17,H4-like_(Viruses),,,,,,,,,,MDNVPNSTLRKIAQRAGIQRVDHDCYPIMRQLLSARLDNIMAVIFA...,,,


In [91]:
# Look up the publication row that the new sequences are going to be linked to.
pid = 'irwin_self-assembling_2024'
# The id is bound as a query parameter instead of being formatted into the SQL
# text, so quoting/escaping is handled by the connector.
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[column[0] for column in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,irwin_self-assembling_2024,Self-assembling viral histones are evolutionar...,10.1038/s41564-024-01707-9,,2024


In [92]:
# Link every prepared sequence to the publication `pid`. Accessions whose
# link statement fails are collected in failed_toadd_publication.
failed_toadd_publication = []
for ds in data_sequence:
    try:
        cursor.execute(add_sequence_has_publication, (ds["accession"], pid))
    except Exception as e:
        # `except Exception` instead of a bare `except` lets KeyboardInterrupt
        # and SystemExit propagate; the error is printed so the cause of a
        # failed link is visible, as in the sequence-insert loop.
        print(ds["accession"])
        print(e)
        failed_toadd_publication.append(ds["accession"])

In [93]:
# Same read-back as before the linking step: the publication columns of the
# joined rows show whether each accession now has a publication attached.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.variant = 'H4-like_(Viruses)' "
)
cursor.execute(query)
df = pd.DataFrame(
    cursor.fetchall(),
    columns=[column[0] for column in cursor.description],
)
has_section_prefix = df["accession"].str.startswith(accession_prefix)
df[has_section_prefix]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H4_like_0,H4-like_(Viruses),,,,,,,,,,MRRIRSALSGAVAAKPRTGSAPSSAASAPSSALSGSAQPASALSGS...,,HISTDB_H4_like_0,irwin_self-assembling_2024
1,HISTDB_H4_like_1,H4-like_(Viruses),,,,,,,,,,MKTISLDKRALRHRNILWNNIQGISKNDIKRLARKGGVKRINGLIY...,,HISTDB_H4_like_1,irwin_self-assembling_2024
2,HISTDB_H4_like_10,H4-like_(Viruses),,,,,,,,,,MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRSGIKR...,,HISTDB_H4_like_10,irwin_self-assembling_2024
3,HISTDB_H4_like_11,H4-like_(Viruses),,,,,,,,,,MENLLLKTGFDNLENDDSNQEVQLQMIALLTVFMENAMKTAEIYTK...,,HISTDB_H4_like_11,irwin_self-assembling_2024
4,HISTDB_H4_like_12,H4-like_(Viruses),,,,,,,,,,MEITRPSITRLARRAGIKSVSEECFPSIKALIVYELENAIRASLIV...,,HISTDB_H4_like_12,irwin_self-assembling_2024
5,HISTDB_H4_like_13,H4-like_(Viruses),,,,,,,,,,MSKEISSLSLKRLFNRAGIKRVKHESYEQLAKYMTNFTKQILSDTN...,,HISTDB_H4_like_13,irwin_self-assembling_2024
6,HISTDB_H4_like_14,H4-like_(Viruses),,,,,,,,,,MSTPETTQDNITDKGFTKPAITRLARRAGVKSMSDDCVNPIRNLIG...,,HISTDB_H4_like_14,irwin_self-assembling_2024
7,HISTDB_H4_like_15,H4-like_(Viruses),,,,,,,,,,MEHITKPSMTRLARRAGVKSMSEDCYPVIHQEIGEMIEDIMRVALV...,,HISTDB_H4_like_15,irwin_self-assembling_2024
8,HISTDB_H4_like_16,H4-like_(Viruses),,,,,,,,,,MTDRGIPDQIKTGFNLIKDDGQDVIENIASIVLVFMENAVKSADIY...,,HISTDB_H4_like_16,irwin_self-assembling_2024
9,HISTDB_H4_like_17,H4-like_(Viruses),,,,,,,,,,MDNVPNSTLRKIAQRAGIQRVDHDCYPIMRQLLSARLDNIMAVIFA...,,HISTDB_H4_like_17,irwin_self-assembling_2024


In [94]:
# Commit the current transaction so the statements executed through `cursor`
# are persisted in the database.
# NOTE(review): the commit is unconditional - check that `failed_toadd` and
# `failed_toadd_publication` are empty before relying on the stored data.
conn.commit()

# Add H2A-H2B-like_(Viruses), H2B-H2A-like_(Viruses), H4-H3-like_(Viruses)

In [95]:
# Prepare the three viral doublet variants as rows for the add_histone
# statement: each is a "variant"-level entry under the parent "Doublet" with
# taxonomic span Viruses (id 10239) and no description.
data_histone = []
for htype in ("H2A-H2B", "H2B-H2A", "H4-H3"):
    data_histone.append(
        {
            "id": f"{htype}-like_(Viruses)",
            "level": "variant",
            "taxonomic_span": "Viruses",
            "taxonomic_span_id": "10239",
            "description": None,
            "parent": "Doublet",
        }
    )

# Insert only after all rows have been prepared.
for dh in data_histone:
    cursor.execute(add_histone, dh)

In [96]:
# Read the whole histone table and display only the three doublet variants.
query = "SELECT * FROM histone"
cursor.execute(query)
histone_df = pd.DataFrame(
    cursor.fetchall(),
    columns=[column[0] for column in cursor.description],
)
new_variant_ids = [f"{htype}-like_(Viruses)" for htype in ("H2A-H2B", "H2B-H2A", "H4-H3")]
histone_df.loc[histone_df["id"].isin(new_variant_ids)]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
107,H2A-H2B-like_(Viruses),variant,Viruses,10239,,Doublet
154,H2B-H2A-like_(Viruses),variant,Viruses,10239,,Doublet
198,H4-H3-like_(Viruses),variant,Viruses,10239,,Doublet


In [97]:
# Commit the current transaction so the histone rows inserted through `cursor`
# are persisted in the database.
conn.commit()

# Add other viral sequences as H2A-H2B-like_(Viruses)

These sequences are taken from the [article](https://www.nature.com/articles/s41564-024-01707-9#data-availability) (see its data-availability section).

pid='irwin_self-assembling_2024'

**TODO: the accession identifiers have not been looked up yet — they still need to be found!**

**TODO: possibly not all of the sequences have been added — this needs to be checked.**

In [99]:
# Histone type handled in this section, and the accession prefix derived from
# it (dashes are replaced by underscores).
type_like = "H2A-H2B"
accession_prefix = "HISTDB_" + type_like.replace("-", "_") + "_like"
print("accession_prefix:", accession_prefix)

accession_prefix: HISTDB_H2A_H2B_like


In [100]:
fasta_str = '''>M-3300004273-3.Ga0066608_1004297_5
MCDKSKKAGLQFSVNGTIDYINKFITPSYLEYKISVEVAVIITSTLEYLSAEILELSGNNVSSSKVIHDNHIEKGICSDEELNGVFGYYFVNKKNETFLQNEITKIGRHLDLTDGIKKVLKQVHPNLDISDESCSSVCFLLSVMRKNLIEKSINLCKFIKKDIVENEDILIIIRMIISNELGKHALLEASKAVYKFTKD
>Indi.Indi_5_569
MEVQCGLQMSVNAIKKILINELELRGITYNISNDVAIRITAIIEYLVAEIMELGGNVTLNKNRKRLSIDHVILAIQTDSELNKLYNSTNNIISNKPKDETPRLSTWTCKILTQIHPDTNISRDAKRFIDKLVYDTVKKFSDMFPLNEKDININDMIDLVIPNELSKHAKNEMNKAIAKFNSK
'''
# Parse the in-memory FASTA text into a list of SeqRecord objects.
# The context manager closes the buffer even if parsing raises; list() drains
# the parser while the buffer is still open.
with StringIO(fasta_str) as fasta_io:
    seq_records = list(SeqIO.parse(fasta_io, "fasta"))
seq_records

[SeqRecord(seq=Seq('MCDKSKKAGLQFSVNGTIDYINKFITPSYLEYKISVEVAVIITSTLEYLSAEIL...TKD'), id='M-3300004273-3.Ga0066608_1004297_5', name='M-3300004273-3.Ga0066608_1004297_5', description='M-3300004273-3.Ga0066608_1004297_5', dbxrefs=[]),
 SeqRecord(seq=Seq('MEVQCGLQMSVNAIKKILINELELRGITYNISNDVAIRITAIIEYLVAEIMELG...NSK'), id='Indi.Indi_5_569', name='Indi.Indi_5_569', description='Indi.Indi_5_569', dbxrefs=[])]

In [101]:
# Build one parameter dict per parsed FASTA record.
# Only accession, variant and sequence carry values; every annotation field is
# None. The original FASTA id is echoed to the output but not stored in the row.
empty_fields = (
    "gi",
    "ncbi_gene_id",
    "hgnc_gene_name",
    "taxonomy_id",
    "organism",
    "phylum",
    "class",
    "taxonomy_group",
    "info",
)
data_sequence = []
for i, record in enumerate(seq_records):
    print("------------------------------------------------")
    print(record.id)
    # Keys are added in the same order as the columns are listed elsewhere:
    # accession, variant, the None-valued annotations, sequence, and last
    # variant_under_consideration.
    row = {
        "accession": f"{accession_prefix}_{i}",
        "variant": f"{type_like}-like_(Viruses)",
    }
    row.update(dict.fromkeys(empty_fields))
    row["sequence"] = str(record.seq)
    row["variant_under_consideration"] = None
    data_sequence.append(row)

------------------------------------------------
M-3300004273-3.Ga0066608_1004297_5
------------------------------------------------
Indi.Indi_5_569


In [102]:
# Spot-check the first prepared record: field name, value and the Python type
# of the value.
for field, value in data_sequence[0].items():
    print(field, value, type(value))

accession HISTDB_H2A_H2B_like_0 <class 'str'>
variant H2A-H2B-like_(Viruses) <class 'str'>
gi None <class 'NoneType'>
ncbi_gene_id None <class 'NoneType'>
hgnc_gene_name None <class 'NoneType'>
taxonomy_id None <class 'NoneType'>
organism None <class 'NoneType'>
phylum None <class 'NoneType'>
class None <class 'NoneType'>
taxonomy_group None <class 'NoneType'>
info None <class 'NoneType'>
sequence MCDKSKKAGLQFSVNGTIDYINKFITPSYLEYKISVEVAVIITSTLEYLSAEILELSGNNVSSSKVIHDNHIEKGICSDEELNGVFGYYFVNKKNETFLQNEITKIGRHLDLTDGIKKVLKQVHPNLDISDESCSSVCFLLSVMRKNLIEKSINLCKFIKKDIVENEDILIIIRMIISNELGKHALLEASKAVYKFTKD <class 'str'>
variant_under_consideration None <class 'NoneType'>


In [103]:
# Execute the add_sequence statement once per prepared record. A record that
# raises is reported (accession, error, full field dump) and its accession is
# collected in failed_toadd, so one bad record does not stop the others.
failed_toadd = []
for ds in data_sequence:
    try:
        cursor.execute(add_sequence, ds)
    except Exception as e:
        print(ds["accession"], e, sep="\n")
        for field, value in ds.items():
            print(field, value, type(value))
        failed_toadd.append(ds["accession"])

In [104]:
# Read back the rows of the current variant, left-joined with their
# publication links, and display only the rows whose accession starts with
# accession_prefix.
# The variant name is bound as a query parameter rather than interpolated
# into the SQL text, so quoting/escaping is handled by the connector.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.variant = %s"
)
cursor.execute(query, (f"{type_like}-like_(Viruses)",))
df = pd.DataFrame(cursor.fetchall(), columns=[column[0] for column in cursor.description])
df[df["accession"].str.startswith(accession_prefix)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2A_H2B_like_0,H2A-H2B-like_(Viruses),,,,,,,,,,MCDKSKKAGLQFSVNGTIDYINKFITPSYLEYKISVEVAVIITSTL...,,,
1,HISTDB_H2A_H2B_like_1,H2A-H2B-like_(Viruses),,,,,,,,,,MEVQCGLQMSVNAIKKILINELELRGITYNISNDVAIRITAIIEYL...,,,


In [105]:
# Look up the publication row that the new sequences are going to be linked to.
pid = 'irwin_self-assembling_2024'
# The id is bound as a query parameter instead of being formatted into the SQL
# text, so quoting/escaping is handled by the connector.
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[column[0] for column in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,irwin_self-assembling_2024,Self-assembling viral histones are evolutionar...,10.1038/s41564-024-01707-9,,2024


In [106]:
# Link every prepared sequence to the publication `pid`. Accessions whose
# link statement fails are collected in failed_toadd_publication.
failed_toadd_publication = []
for ds in data_sequence:
    try:
        cursor.execute(add_sequence_has_publication, (ds["accession"], pid))
    except Exception as e:
        # `except Exception` instead of a bare `except` lets KeyboardInterrupt
        # and SystemExit propagate; the error is printed so the cause of a
        # failed link is visible, as in the sequence-insert loop.
        print(ds["accession"])
        print(e)
        failed_toadd_publication.append(ds["accession"])

In [107]:
# Same read-back as before the linking step: the publication columns of the
# joined rows show whether each accession now has a publication attached.
# The variant name is bound as a query parameter rather than interpolated
# into the SQL text, so quoting/escaping is handled by the connector.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.variant = %s"
)
cursor.execute(query, (f"{type_like}-like_(Viruses)",))
df = pd.DataFrame(cursor.fetchall(), columns=[column[0] for column in cursor.description])
df[df["accession"].str.startswith(accession_prefix)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2A_H2B_like_0,H2A-H2B-like_(Viruses),,,,,,,,,,MCDKSKKAGLQFSVNGTIDYINKFITPSYLEYKISVEVAVIITSTL...,,HISTDB_H2A_H2B_like_0,irwin_self-assembling_2024
1,HISTDB_H2A_H2B_like_1,H2A-H2B-like_(Viruses),,,,,,,,,,MEVQCGLQMSVNAIKKILINELELRGITYNISNDVAIRITAIIEYL...,,HISTDB_H2A_H2B_like_1,irwin_self-assembling_2024


In [108]:
# Commit the current transaction so the statements executed through `cursor`
# are persisted in the database.
# NOTE(review): the commit is unconditional - check that `failed_toadd` and
# `failed_toadd_publication` are empty before relying on the stored data.
conn.commit()

# Add other viral sequences as H2B-H2A-like_(Viruses)

These sequences are taken from the [article](https://www.nature.com/articles/s41564-024-01707-9#data-availability) (see its data-availability section).

pid='irwin_self-assembling_2024'

**TODO: the accession identifiers have not been looked up yet — they still need to be found!**

**TODO: possibly not all of the sequences have been added — this needs to be checked.**

In [110]:
# Histone type handled in this section, and the accession prefix derived from
# it (dashes are replaced by underscores).
type_like = "H2B-H2A"
accession_prefix = "HISTDB_" + type_like.replace("-", "_") + "_like"
print("accession_prefix:", accession_prefix)

accession_prefix: HISTDB_H2B_H2A_like


In [109]:
fasta_str = '''>M-3300005613-23.Ga0074649_1000658_4
MVSDTKYNIYIKRVKNGINREIRISSDAIEIMNDIINYSISILMKACNSLKSNKKTLEAREIKTAVNLILPGELEKHAIVEGFKSVTRAKGNKDGSIQEKSELKFPISRISNEMKKLSLYERQGKMASVYMTSVIEYLVVEIIEVSVFTLYKKKRKTITVDDIKNTIKIKDAELVKLYENITLKGKYYKVGNHNK
>M-3300015360-154.Ga0163144_10009201_25
MAVYTTTKFDTFIHRVLKQVHPDTGMTGDAMSCIDNAIRIVIKKIMFGVNRLQLHSQKKTITSREIQSAVRLIFPGEIAKHAVSTGTKAVVKYNDAVEKKSARKEKSKKDKSPKKRHSVSRTHLAGLEFPVTRVERIMMEFSVSSRKSASAAVYLAAVLEYITAEILELAGNAARDYKKHRITPRHIVLAVKNDEELDMLYTKTVFSGGVIQSFSR
>M-3300021354-14.Ga0194047_10000041_42
MDPTAKRSSKKGSKRGSRKVDLTRPKVAWIHAILDGIDKDMRISKRAVYAVVDLCHNMASNICVEAAKITHSLKKKTVGMGEVLTACKTGLPMLLLKTVEAHAFAAVQRCEDMNLNVHQEKIGLIFGVGQSRNLLKSKSNFKVATTALVFLAAMQEAICRLFLIKAVERSKDMRMVTVKLDQAAFVGKRDEEFSKFWNSSA
>M-3300023174-67.Ga0214921_10015819_2
MENFEVYIKKIQKQIHPDQRISKDSLLLIQFIVNFVLLKLAQKAVMLAQPLDYEAKPRKTPLGKKTISSRDIQTAVKLVLPGELANHAVTEGTKAVVKYTSFEPKRGQRPPIKASTKAGLHFSVARTHNLLRRNISLRIGETAPVYLAAVLEYIAGEILELAGNHSQEQKMNTVMPRHIKDAVVKDEELHRLMRDIHVILPGVKDQHNKNDVVHNTGLLWVM
>M-3300023184-71.Ga0214919_10000112_94
MEHFDVYIHKVLKQVHPDTRISHESLALMNFIVNFLCVKIAHKAVMLVNPLNYESKKKTTTTAKKTISSRDIQAAVRLLLEGGLSKHAVSSGTKAVTKYTSFHPKSKKPTTAAEKAGLQFSVSRTSNLLRRNISLRVGETASVYLAAVLEYICAEILELAGNNSKDHKMKTIMPRHMKEIVVEDEELHRLMRDLHVILPGVKDQPHIKKHAMYNESAFWVM
>M-3300023210-3.Ga0233412_10000202_35
MSQNFNIYIKKVLKQVDPGKHISQQCSFQLNALLNYLGDKISQLAITLAQTEKKKTVTARDIQISETMILGRELAKYANKEGIKAVTNYKSSTKKKGNRPVDRAGIVFPPVRARHLIEKNLPKHFHIGDAAPVYLAAVLEYLTAEFIQQTGNCTRDNNMNTLTLSCLYKATYNDEELNQLLSKIKFRFATTKVLPNIHRVLQKKPKPKRKSKKRSKSRSRR
>S-1016704-142.1016704_contig_227_43
MSLRTYIRKQLKLIASESRISSVAIGEIEALLINALEKVVMNAIVIKMNCRKRTVEARDIEASVKVSLPSEISKYGRSRALKAYTRYNATTVHPKGTSRSTKAGLDIPVSIVENMIRSMVADNEEKGLRVSDESGVYAGGVLQYLTIELLEVSLHEADKENKKTITDTIVVNAVKKDIDLGELFHCRYFEPSRMDFVVCN
>S-1063923-109.1063923_contig_4197_4
MATKRGGGGGSGGGARDRYDTYVYNVLKRVHPDLGITRGAMRSVETMVRDLVARVCTAANDLTLLTKKQTVTAREIHAATRQVIQGDSLVASAVSDALRAVTKYTGSFKDNGGRAKVAKSRAQRAGLVFSPARVERQLRACSSAKRVGENAPVYLAAVAESITGEILELAGDQARDSKRKRITPRDLTLVLRGNEPLDRTFSGFVAGGGVLPHIHRDLLPPPSKKAKTAAAAAARSGNMLSGWDDRFDDDYDDDAELDYVPGHGDYDYDDDDE
>S-1063923-109.1063923_contig_6429_10
MASKALMKRGGANRYDAYIEKVLKQVHPGHGLTWDAMRSIDSMIKDVVSRLCHAANDLTQANKKQTVTARDIQTAVMQVFPAPELDRHALAEGTKALTKFNSTATAAGKNKSKGHSGRTLRAGLVFSPARVERQLRSCSSAKRVGAGAPVFLAAVAEYVAAEVLELAGNSSRDRKRRHGTRRLRITSRDVMLAVRNDMDLDALLGGVIARGGVRPNIEHQLLPTKKKQHAENYMRCGWDDRFDDEAELDYVGGGDDDYDDDDDDDDL
>S-1092409-41.1092409_contig_1547_3
MKQVKPSDYEFKKLTGKQVDNINFDALCDGAVQSEFVNKNKQTAIFYGLVRDKNIDAVIMIDQPSANMSDALKRDLKKQGLQPENAIEIVAFCVAPAARKGNLGRHLMHEALKKTQAHTIYLFQSESEIEADQWQFHPGASRLYSKVFGSGEKSIYEQDMMFFAAPRTEVMQNLRTLLSLNTAAKKSAKSRTKVSQAVAAKSTLQKRKRSSGQTGCRVPQKARKDTFQTYTYKVLRQNEPDNGISQSGMCSIETLIDELLGKLLKESKRLLLRTGKKTLTAKEVQTATRLLLRGELAKTAVAQAEQALTQYNVAGTGQTRSNRAKLLFPVGRVERLVRQQQTANRISDSAPVYLAAILQYVVAEILQGAAMSARKRKFKRITPRDIMLTVEMDDELNKLFSGVFAHSGNVYQIEPALLPASKKTKLAK
>S-1096103-95.1096103_contig_746_2
MVMSKRKYDKFDTYIERVLKHLYPELGMTSEAKMEVNNLIKASFQKIMDAVNYLNAQSDKKTISAKHLGITTDIVLGKELAKHVHQEGTRAVTEYNTSVSSSQGKSDTTTTRSERAKLIFPVGRIENTWIRPAMCGRDTRLSEDAPVYMAAVLEYITADILELAGHVAKDDKKMRITTRHLALALQRDDELDHFLGDYIVSGGVVPHIHSALLLKKKTRKKKTDA
>S-1096103-95.1096103_contig_823_25
MADVEKRRKRKDYSSFDTYIQKVLKQVHPDTGIKGEAMVEIDNFVKAVLHEIMRVVNMLTVANNKKTITHREVQTAYQAVVPEELARHGVSEGVKAVTKYNSSISGDQSSSRGPVSSSNRAGLQFPVTRIKTKWMKPLASVDRVGDSAAVYMAAVLEYLTAEIMELAGNAARDHGKVRITDRHIALAIQNDEELNKMSKNMVFAGGVVPYIPTELLPK
>S-1096103-95.1096103_contig_3144_3
MATKARTKSKKVTKKNTGSVSVSKKTTQESSASQESRLKKKKNYDTFRTFIQKLLQDVHPGGISGGALDVVDSYVKINCDKLVRNSDGLLKRSKKKTLSEKEIESAVYLTVGDDLAQESNKRGTEAVNKYMSNLDKKGSTQRSTKSSKAELIFPVSRVGERVKSASSLDGLRIGEKAIVFLTAVLEYLTQELLRMAGEVADTGKNKHKRITPRDIKLAINKDKGLEELTRNVYIPGGVPVRPRKR
>S-1096109-37.1096109_contig_4615_5
MNTAEPELNFTIYTYKVLKQVHPDTGISSGANTQMNVIINAIGEKISEKAAFLAIKAGRKTISAKDIQSAVRLVLPGELAKHAVSEGTKAVTKYSDSPYARKIRKRSARAGLQFPVSRVEKFVRAAANKGSLDSSCKKGGCLRVTDTTGVYLAGAIEYLVAEILELGGNAARDNKKARITSRHLFLGIVNDEELSKMMETLNIFIPNSGVLPHIHSALLPKRRR
>S-1101174-77.1101174_contig_724_8
MENFEVYIKKVLKQVHPDTHINQTTVSLINFMINKIGDSIVQESNRLIHPTHYKNLSSHVDEKNTIGSREIQTSTRLILPGELVKHAVSQGTKAVTKFTSSSTHTSKSARAGLQFSVDRCKDMIGKHSNCRVDETAAVYLAGVLEYLTGEILELAGNVCKDERKKTITPHHVKNAVENDGELLILIFEQNILLPGTMNVKYPKSKKNIPPNFGMKAALYSSNVGGNF
>S-3300001748-13.JGI11772J19994_1000085_13
MNTLNLQTYIRKIAKQIHPDLRVSAEFTTSVNNMINVLGKKLILTAIAIKINAGRVELQAKDIKAATEVVLLPDLYKYATSNATKAVTIASSGKIEGRRVTRTEEAKLTLSVPRVETFMRDNIPKRAGRTRISKYAIVYLTAVLEYIAAETCELSGNEATRNKKVTMTKTHYNTVSAKDVSLKQILDCHAR
>S-3300001748-13.JGI11772J19994_1000123_7
MNTLNLQTYIRKIAKQIHPDLRVSAEFTTSVNNMINVLGKKLILTAIAIKINAGRVELQAKDIKAATEVVLLPDLYKYATSNATKAVTIASSGKIEGRRVTRTEEAKLTLSVPRVETFMRDNIPKRAGRTRISKYAIVYLTAVLEYIAAETCELSGNEATRNKKVTMTKTHYNTVSAKDVSLKQILDCHAR
>S-ERX556017-116.ERX556017_contig_12137_6
MSSRKRTKRKVYTFKSHIYKVLKQVHPDTGFTTDAKEQMNHFVEHFADKIADEAAFLAKKEHKKTISSREVQTAVRLVLPGELAKHAVSEGTKAVTKYTSHYSRKGKKVSKSWRAGLQFPVSRFKTILQKHAPRGFRIGEGAPVYLAAVMEYLTAEMLELGGNAARDNRKSNINSRHLQLASRNDEELSKLLGKHNIVGGGTLPNIHAVLLPSKHKKQKHQSARRKKARKNSRKPKKK
>Brma.Brma_1_159
MERVGKYGLYVKRMSPKDFDVTKEAVEQINNMLVFLADKMMTKALILLGDKKTLKHDALFWLLRDVPGELGKHGKDYVDSALYGNKELVFPTKRTENLMRKKTCKRVGQSSVKTLTAILEYFCQEILNSSAREAGKESRKRIKVSDVQGAVKKDRELFQVFGSGIFSGR
>Brma.Brma_1_444
MASKATKEKSKKSEDASARKKPSKDINFKVGIRRVLAQVHPDQSIRAEALEELDNIAIFLGKAISKDAAILVGTESKTINGRAISSAAVALLGGELGKHAHAQAGKAITASQAGDGKGESRSIKARLQLSVARSERLIREHACAYRVSATAGIALAAVLEYIIAEIIELAGNASRDSKKVRIAVKHIQLAVQNDAELHHLLGKGIFSGGGVQLVGPQYFSVKPPKAKKATSPKKKPASPKKKPASPKKKVQKRKSSGKDEEGRSPSF
>Laus.Laus_1_58
MERIGKYGLFIKRMTPKDTEVTKEAVEQISSMLSFLAETLLRKSTILLGDKKTIKHEVIFWLLRDIPGELGKHSKDYVDSVVYGKRELIFPTKRTENLMRTKTCKRVGQSAVKTLTAILEYFCREVLEAASKEAKRNSRKRIKVLDIQNSVKKDAELFKVFGSGIFSGR
>Laus.Laus_1_415
MASRTKQEKSKPEGTVSRKKPSKDVNFQVGIRRVLAQVHPDQSIKGEALRQLDSIAVYLGKKIAHDAAVIVGTESKTINGRAVLLATRALFDGELAKHAHALAGKAVTHYEAAGDKGDSRSAKAHLQLSVSRAERLIREHGGCGYRVSATAGVVLAAALEYIIAEIIELAGNASRDSKKVRISIKHVQLAVQNDADLFRLLGKGVFSAGGVLLVGVPAPPRARKSPAKKAASPAKKASPKKKAASPKKGSAVQKMALRKSKAIAALQKEREGMSPSF
>Mama.Mama_1_476
MATQKETTRKRDKSVNFRLGLRNMLAQIHPDISVQTEALSELSNIAVFLGKKISHGAVTLLPEGTKTIKSSAVLLAAGDLYGKDLGRHAVGEMTKAVTRYGSAKESKEGSRSSKAKLQISVARSERLLREHGGCSRVSEGAAVALAAAIEYFMGEVLELAGNAARDSKKVRISVKHITLAIQNDAALFAVVGKGVFSGAGVSLISVPIPRKKARKTTEKEASSPKKKAAPKKKKAASKQKKSLSDKELAKLTKKELAKYEKEQGMSPGY
>Me06.Me06_1_474
MATQKETTRKRDKSVNFRLGLRNMLAQIHPDISVQTEALSELSNIAVFLGKKISHGAVTLLPEGTKTIKSSAVLLAAGDLYGKDLGRHAVGEMTKAVTRYGSAKESKEGSRSSKAKLQISVARSERLLREHGGCSRVSEGAAVALAAAIEYFMGEVLELAGNAARDSKKVRISVKHITLAIQNDAALFAVVGKGVFSGAGVSLISVPIPRKKARKTTEKEASSPKKKAAPKKKKAASKQKKSLSDKELAKLTKKELAKYEKEQGMSPGY
>Noum.Noum_1_96
MASTTKSKPEGTVSRKKPSRDVNFEVGIRRVLAQVHPDQSIKGEAMRQLDSIAVYLGKKIAHDAAVIVGTESKTINGRAVSLAARALMGGELGKHAHSGAAKAITHYQAAGDKGDSRSAKARLQLSVSRAERLIREHGGCAYRVSATAGVALAAAIEYIIAEILELAGNAARDSKKVRIAVKHIQAAVQADAELFGLLGKGVFSGGGVQLVATAAIPRRKSPAKKAASPAKKKASPAKKKASPASKKKAAAKKMAMEKSKSPFF
>Noum.Noum_1_478
MERVGKYGLFIKRMTPKDTEVTKEALEQISAMLSFLAETLLRKANILLGEKKTVSHEVIFWLLRDIPGELGKHSKDYVDSVLYGKRELIFPTKRTENLMRTKTCKRVGQSAVRTLTATLEYFCREILEASSKEAKRNSRKRIRVLDIQNSVKKDGELYKVFGSGIFSGR
>Povi.Povi_1_58
MERIGKYGLFIKRMTPKDTEVTKEAVEQISSMLSFLAETLLRKSTILLGDKKTIKHEVIFWLLRDIPGELGKHSKDYVDSVVYGKRELIFPTKRTENLMRTKTCKRVGQSAVKTLTAILEYFCSEVLEAASKEAKRNSRKRIKVLDIQNSVKKDAELFKVFGSGIFSGR
>Povi.Povi_1_414
MASRTKQEKSKPEGTVSRKKPSKDVNFQVGIRRVLAQVHPDQSIKGEALRQLDSIAVYLGKKIAHDAAVIVGTESKTINGRAVLLATRALFDGELAKHAHALAGKAVTHYEAAGDKGDSRSAKAHLQLSVSRAERLIREHGGCGYRVSATAGVVLAAALEYIIAEIIELAGNASRDSKKVRISIKHVQLAVQNDADLFRLLGKGVFSAGGVLLVGVPAPPRARKSPAKKAASPAKKASPKKKAASPKKGSAAQKMALRKSKAIAALQKEREGMSPSF
>Tokv.Tokv_1_111
MATQKETTRKRDKSVNFRLGLRSLLSQIHPDISVQTEALAELSNITVFVGKKISHGAATLLPEGTKTISAEAVILSAEDLFGKDLGRHAASEARKAVASYKASKVSEGSRSSKAKLKLSVARSERVLREHSACSRVSGGASVALAAALEYFMAEIIELAGNAARDSKKVRISVKHIMRAIQDDAALFHVVGKGVFSGAGVSLVQVHVPRRKPRKTTEEKAPKKKPSAAKKPTKKSQKSLSERQLAKLSKKELAKYEKEQGMSPSY
>Tokv.Tokv_1_227
MDRVGKYGLFVKRISPKDADVTKEALETVNNMLVFLAEKLTKQATIIVGDKKTIKHDVFLWLLTDIQGELGKHSQDFANSVLYGGKELVFPTKRTENLMRKNTCLRISQNAVRALTAILEYFCGQIMEASFSQAKRNKRKRIRPFEVEAAVAKDKELYSMFGKGVISGR
>Tufo.Tufo_1_362
MASKAVKQKSKKSGEASDRKKPSKDINFKVGIYRVLKQVHPDQSIRVEALEELDKIALFVGKKIAKDAAILVGSESKTINGRAIMGATRALLPGELGKHAISDITKAITHTASANASEEHKGESRSHKAKLQMSVARAERVIREEACSYRVSESAGIALAAALEYLIAEIVELAGNAARDSKKVRVSVKHIQLAVHNDSEMMALLGKGIFSGGGVKLVSMSYSRSKPKKTAAASPKKKTSPKKKASPAKKKSPAKKRTVKKTRSLKKDEEGRSPFF
>M-3300005613-23.Ga0074649_1000021_129
MPELKLNFEVYIKRVVSQVHPTLLLEKRASECLNAIINYSLLRLTQTCNTIKPEDKKTLSPLEIQTAVKLVIPGELSKHGVVEGFMSVNKASSAYSKKGSLTEKANLLVSVPRVSSGMRKLSIYERQGKYAGVYAAAVLEYLLAEISELAGNITRKKGKKTITVADVKMAINNDEELRHFYKDVIIKGPFRT
>M-3300007072-7.Ga0073932_1004367_13
MEAEIPKDPSRRIDYSDYRIYIRRVLEEIDNTVQITTDTIDSINDLIVSLIQRILYVADVVISYSEKKTLSTDELIVAVKAVLPPAIANNSVSAGIQAERAVAESAPERATGELIGVSRRRGKTPVATPKSPEADGKTEGRKTRTLLTKQAGIIFSVSRIRGLIEEQMEKGYLNVKRISTGSPIFLAAAIEIVTAKILSAALGYTKGDKRVRIIHSDLVYGLYNDDIFFDIYGFGSWIIREINLLEVQ
>M-3300009488-16.Ga0114925_10000125_8
MNFSAPITKVLSQVYADKVSITEDGLKQMNIFLNKIASIMVSTTAELVTVKPTNESIEMLSNPKPVDKKTVEAAAKLLLPQELASYVIEEAERATSKSVVFPTNIISNLLNSKTIKVKKGAEVYFASILETISEELIEFGGEAANQNGSSVISPRWLKLVITYEPPYKQLAETVNFDFM
>M-3300017987-33.Ga0180431_10007534_8
MLPYWNMVGGGNCTLCRSPGTNKSTCPLNPKAKRPNPAKHPLVGSVPGPVPTPAPPKTTPKRIPVPVPPKTTPKRIPSSKLPKSAPKPAATKATKASVSRDFKADIEKILLQVAPTVELNESAVGALNFILNGINNQILAQGIDSYIQGMGDLGKYIKVEIQRATDRPENVVFSYNKNAGADELKRTAALEYITAELLELTANVALDNDRHWATSADVADAIAGDEEFSKVLPSLIDSLEYVLIRKLLLNSDYEIEKLLDWLNEDVIGSRKPVTLNTGIYVHADDDTFPDAKWLTDAKAYDIDLLVFEPKRGKAITVDDVMMALKRFEAPFSPNRGYFYEGLDKGGNLNWGS
>M-3300018080-43.Ga0180433_10002388_30
MSKKITDRFDRYIRLVFSKVLPDCTLKPEGVTYLHGHLVTILKRACKESFDYMHTSKPPRVTLMKTDVLLGIQQILPHSLRSFAEEFIRTVLERYEDPSVKIRSKTKKAGLHVAIGRVAVYVRRFVANHQIRHEAFVILAAFLEYLLVEVFQGVKTYLETTLEGTRAVTADIVEVVWKEDRDFSKLVE
>M-3300023174-172.Ga0214921_10003464_13
MSIYKKAINVVLSQIQPTLKIGNEGLKAIEEYLTKTGEDIIKKLVLLKSISIPIDDEKKDTIEARDVQFAVRLLIPKDLGRHAISEGTKGALKFSLGDHVNIYYDINNFNPKFKKISGDAAAYFSCVLEYLSAEILELSGNAARDNKRVTINKDFVDRAIKNDTELEKLGCLIGYNKKSSEKMVKKSSKSLKDTRKASRKNSSRKSRKASRRSRKASRKASRKASRKSRKASRKSRKASRKSRRKASRKSRKASRKSRRKASRKSRRKASRKSRRKASRKSRRKASRKSRKTSRKSRRKASRQSRKASRKASRQSRKASRKASRKASRKASRRSRKTKK
>M-3300023179-113.Ga0214923_10010245_10
MSFTENYFGKKTKQVLMQIYPELSISNKGIQFLYLFFSEIRKIMNDSISLLKKYDKHSIEDLIKCILTGTNELAIHAIKNMEIYETVFTGQIWDTHHSLVDKYFSTTLEYILAEVLELSGNVTKDNQKKRITPYYIWYSISLDEELLVLFKKIGFDQYKFEKEELIKNRPQLNNFLYYHTKNGQIYKMKKEELFDIYKDYL
>M-3300023179-146.Ga0214923_10009034_4
MINFANYIKRVLKELNPILQINGDSLEIINKILNNVYIKIINDKKFPNMMDVIDNLFLRNENEMKKEIIQDIQKSLHQYENNNKVKIPFLSIKFNVDEKISYLIGSVLQYITVDILEISGNITRKYGKTRITVEYITEAIKKDKQVKAFIQNNRIINFKEDVLKSPQIIKKIKNSRRKQVKKSSRRKSIRKSSRKTN
>M-3300023184-117.Ga0214919_10000753_32
MNSTISKVLKQVHPDLRINGQAKEVISVMINNVLSAIVNLSPSMSLDDLTNTVKNLLPGELAKHALSTANKDLVKYSRSEGPHGLKNVTKIVLLQLSPVTIKKIIKNMGADVKDVPVESFIFLASALEYLIAELLEIGGNAARDNRKGTIGKWQIELVLLNDEELIELFNRVGNPLAEHWRTPAQYPKMSKYQMRKDLEKLGIKVELPYPTRVKTFNMYPDKDNYPKKQCKGAEKDPSVYVCNPRTGKYIKKNSYDAHFI
>S-1021933-23.1021933_contig_5_281
MEEYSENIIKVLDQVGPYSEIDQTGLNFLFEVLNFLNSKFLNVINLEEIKNILPTIFEKELARHALSEGNKAIARYSFNINSITRETVSKKSLLVLDTNITKKHLPSLTTENIVFITAVYEYILAEILELSSNTAQNMKTSTINEKIIIKTMKQDEELNIFYKKYFKSIFKKVEKKSPRYEILKKIFSLIQDDHEIDNEDIDNIFSDFLNNKSEMKIDEIDEQEIIKFILKKYPIHANFRYFRDYYEFLNTYPENIKKSIEIIIKNEEEKPEIKTLDDLFQIIKFFKKPPIYFNNNNENESFAILKSSSVVNYLNLKEIFNLTMNNKKPKPEFKLKIIFEDETKEYRLFSFFLEKFDYFRNLINFNNSMKSSITVNHEKTGEDLIEFIITGMIEMDYVDLKNNDTKRFVSLYEVADMLQMTDLMKICENVLMHSELNIAICSYYNAEDPFISSQKCFD
>S-1056828-40.1056828_contig_1521_4
MSFGKFIVELIHEMYSGFKIHSSSLDILNFLLNHFCKKIYSGSLIFFQFSQENIKLELKHLIGSIELNVTQDLKKRMKEEIDIYLIGKSKFSLSVKIIESILSSFNYKILISKDYVSCMCIIIEYLTAELIENSITACKLRKSVVISDKDIGNGIDGDTDELNVIFNDVMLIKNICP
>S-1062768-28.1062768_contig_11_6
MYNKMIRFYDSLFFGDEGMMHKLKDKEAKNLAEYFDLCLELAGAGLLTQKKFDCFKTHMWQNAIDKKTIKVGMVISIGKEKHQIDHVSSVNIKASGKTISFSDITDVQEKKKDVTEKQTTKSKVATRVFTEYGLTDVEILKVLSQMHPDNKIRQDAIEEVKGIVQKLYTKLITLPMYSIENFQTVLPGELGKHAYSELKKYTVAERMLNHPAIFHSVDGKGTNIILEYIVAELLELSGNVAHDFHKVNITPDHLKTAIGNDVELSQMMAIINDKKLMYEFTWKYGDSIHKKPVTGQHTWEWEEGKQQMFGLVVDFMQDKFDDTDKEFEFEEAIRDTFDTKAYIKPVGNIEAVWTTTKVWNAQGTGTDKVDILKLADGKDSYLGIQIKAKTD
>S-1101173-79.1101173_contig_5_16
MMKFKNYIKRILKHVNPTLQLNEDSLEVVNKILGNTHFAIINNKHFPNISNVVDELFLNNHCEMKKEILEYIERVLYTNERVNIVFDKIKFNEDKNVSLLIATIIEYLAVDILEISGIITRENLKVRITNEYLNKALERDEGLKMFLKNNKLFDFKDSGNLHRFRVSKSKRKTKKSKKKY
>S-3300009702-216.3300009702_a_Ga0114931_10012995_2
MSDAPKRKKDYSTFNTFTYKVLKQVHPDAGITSGAMTVVDNFVKINLEKLIKNSNLLLFHSGKKTLGAKEIKSSVWLTMEGEIASNSNKEGERAVMKFNSSKSKASSGGGRVTASERAGLLFPVSRIRNKWLKELSIANRVGEDASIFLSAVLEYLTAELLEQAGDKAKDDKRVRITPRHLKLGIENDAELEKLMRDVYVPGGVQVQPKKNANQEEEKNEE
>Gold.Gold_1_409
MASETKEKSTNPKKKELNFRVGISAITKDVSPSLGTTAEALEELNNISLYIGKCVAKGAAIIVDKESKTINGRAVALAARASFPDELGDKSFEKAMGAIKTSKKAGEAKGARSEKAGLMISVARAERLIREHGGCSLSCLLSPPGIALAAVIEYVVTEVIMAAALRASDSKKVRLAVKHIQEAVHEDTELMQLLGKGVFSGAGVHLHRGERARRAPTKKPAAKEGACRQRNPATKKPAAKKAPAKRAQKKE
>M-3300017989-1.Ga0180432_10005342_9
MPVRPRDYSTYITQQWKKLAEINGINVQAQAVKELNILLLFTLDKLLDKSLELLEISGKAQLNWETVQTAMRFVMPEPLRIPAETFAVERVEDFKSNWESRRYPRKQRCSGLIFSLTVAGNSIRAKNTNVSDPTTVFLTAVCEYLCSDILDMMFVSHSTVYAAVSRDIDLQEFFKEIRFLTISHVRLSEEELQEYSRHIHQYTDAKLTRDGLITLKMYTDIPAVEDGRFLKRTIDDCVAVESLKKRKIITEETVLDVVRLHGLTPVR
'''
# Parse the in-memory FASTA text into a list of SeqRecord objects; the
# context manager closes the StringIO buffer as soon as parsing is finished.
with StringIO(fasta_str) as fasta_io:
    seq_records = list(SeqIO.parse(fasta_io, "fasta"))
seq_records

[SeqRecord(seq=Seq('MVSDTKYNIYIKRVKNGINREIRISSDAIEIMNDIINYSISILMKACNSLKSNK...HNK'), id='M-3300005613-23.Ga0074649_1000658_4', name='M-3300005613-23.Ga0074649_1000658_4', description='M-3300005613-23.Ga0074649_1000658_4', dbxrefs=[]),
 SeqRecord(seq=Seq('MAVYTTTKFDTFIHRVLKQVHPDTGMTGDAMSCIDNAIRIVIKKIMFGVNRLQL...FSR'), id='M-3300015360-154.Ga0163144_10009201_25', name='M-3300015360-154.Ga0163144_10009201_25', description='M-3300015360-154.Ga0163144_10009201_25', dbxrefs=[]),
 SeqRecord(seq=Seq('MDPTAKRSSKKGSKRGSRKVDLTRPKVAWIHAILDGIDKDMRISKRAVYAVVDL...SSA'), id='M-3300021354-14.Ga0194047_10000041_42', name='M-3300021354-14.Ga0194047_10000041_42', description='M-3300021354-14.Ga0194047_10000041_42', dbxrefs=[]),
 SeqRecord(seq=Seq('MENFEVYIKKIQKQIHPDQRISKDSLLLIQFIVNFVLLKLAQKAVMLAQPLDYE...WVM'), id='M-3300023174-67.Ga0214921_10015819_2', name='M-3300023174-67.Ga0214921_10015819_2', description='M-3300023174-67.Ga0214921_10015819_2', dbxrefs=[]),
 SeqRecord(seq=Seq('MEHFDVYIHKVLKQVHPDTRISHESL

In [111]:
# Build one row dict per parsed FASTA record. The accession is synthetic
# ("<accession_prefix>_<position in seq_records>"); the original FASTA id is
# only echoed to the cell output and is not stored in the row.
null_columns = (
    "gi",
    "ncbi_gene_id",
    "hgnc_gene_name",
    "taxonomy_id",
    "organism",
    "phylum",
    "class",
    "taxonomy_group",
    "info",
)

data_sequence = []
for index, record in enumerate(seq_records):
    print("------------------------------------------------")
    print(record.id)
    row = {
        "accession": f"{accession_prefix}_{index}",
        "variant": f"{type_like}-like_(Viruses)",
    }
    # Annotation columns are left empty (None) for these records.
    row.update(dict.fromkeys(null_columns))
    row["sequence"] = str(record.seq)
    row["variant_under_consideration"] = None
    data_sequence.append(row)

------------------------------------------------
M-3300005613-23.Ga0074649_1000658_4
------------------------------------------------
M-3300015360-154.Ga0163144_10009201_25
------------------------------------------------
M-3300021354-14.Ga0194047_10000041_42
------------------------------------------------
M-3300023174-67.Ga0214921_10015819_2
------------------------------------------------
M-3300023184-71.Ga0214919_10000112_94
------------------------------------------------
M-3300023210-3.Ga0233412_10000202_35
------------------------------------------------
S-1016704-142.1016704_contig_227_43
------------------------------------------------
S-1063923-109.1063923_contig_4197_4
------------------------------------------------
S-1063923-109.1063923_contig_6429_10
------------------------------------------------
S-1092409-41.1092409_contig_1547_3
------------------------------------------------
S-1096103-95.1096103_contig_746_2
------------------------------------------------
S-1096103

In [112]:
# Sanity check: print every field of the first prepared row with its Python type.
first_row = data_sequence[0]
for field, value in first_row.items():
    print(field, value, type(value))

accession HISTDB_H2B_H2A_like_0 <class 'str'>
variant H2B-H2A-like_(Viruses) <class 'str'>
gi None <class 'NoneType'>
ncbi_gene_id None <class 'NoneType'>
hgnc_gene_name None <class 'NoneType'>
taxonomy_id None <class 'NoneType'>
organism None <class 'NoneType'>
phylum None <class 'NoneType'>
class None <class 'NoneType'>
taxonomy_group None <class 'NoneType'>
info None <class 'NoneType'>
sequence MVSDTKYNIYIKRVKNGINREIRISSDAIEIMNDIINYSISILMKACNSLKSNKKTLEAREIKTAVNLILPGELEKHAIVEGFKSVTRAKGNKDGSIQEKSELKFPISRISNEMKKLSLYERQGKMASVYMTSVIEYLVVEIIEVSVFTLYKKKRKTITVDDIKNTIKIKDAELVKLYENITLKGKYYKVGNHNK <class 'str'>
variant_under_consideration None <class 'NoneType'>


In [113]:
# Insert the prepared rows one at a time so that a single failing record does
# not stop the others. For every failure the accession, the error and the full
# row are printed, and the accession is collected in `failed_toadd`.
failed_toadd = []
for row in data_sequence:
    try:
        cursor.execute(add_sequence, row)
    except Exception as err:
        accession = row["accession"]
        print(accession, err, sep="\n")
        for field, value in row.items():
            print(field, value, type(value))
        failed_toadd.append(accession)

In [114]:
# Fetch the sequences of the current "<type_like>-like_(Viruses)" variant
# together with their publication links (LEFT JOIN, so unlinked sequences are
# kept), then show only the rows whose accession starts with `accession_prefix`.
# The variant name is passed as a bound parameter rather than formatted into
# the SQL text, in line with the parameterized INSERT statements of this notebook.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.variant = %s"
)
cursor.execute(query, (f"{type_like}-like_(Viruses)",))
df = pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])
df[df["accession"].str.startswith(accession_prefix)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_H2A_like_0,H2B-H2A-like_(Viruses),,,,,,,,,,MVSDTKYNIYIKRVKNGINREIRISSDAIEIMNDIINYSISILMKA...,,,
1,HISTDB_H2B_H2A_like_1,H2B-H2A-like_(Viruses),,,,,,,,,,MAVYTTTKFDTFIHRVLKQVHPDTGMTGDAMSCIDNAIRIVIKKIM...,,,
2,HISTDB_H2B_H2A_like_10,H2B-H2A-like_(Viruses),,,,,,,,,,MVMSKRKYDKFDTYIERVLKHLYPELGMTSEAKMEVNNLIKASFQK...,,,
3,HISTDB_H2B_H2A_like_11,H2B-H2A-like_(Viruses),,,,,,,,,,MADVEKRRKRKDYSSFDTYIQKVLKQVHPDTGIKGEAMVEIDNFVK...,,,
4,HISTDB_H2B_H2A_like_12,H2B-H2A-like_(Viruses),,,,,,,,,,MATKARTKSKKVTKKNTGSVSVSKKTTQESSASQESRLKKKKNYDT...,,,
5,HISTDB_H2B_H2A_like_13,H2B-H2A-like_(Viruses),,,,,,,,,,MNTAEPELNFTIYTYKVLKQVHPDTGISSGANTQMNVIINAIGEKI...,,,
6,HISTDB_H2B_H2A_like_14,H2B-H2A-like_(Viruses),,,,,,,,,,MENFEVYIKKVLKQVHPDTHINQTTVSLINFMINKIGDSIVQESNR...,,,
7,HISTDB_H2B_H2A_like_15,H2B-H2A-like_(Viruses),,,,,,,,,,MNTLNLQTYIRKIAKQIHPDLRVSAEFTTSVNNMINVLGKKLILTA...,,,
8,HISTDB_H2B_H2A_like_16,H2B-H2A-like_(Viruses),,,,,,,,,,MNTLNLQTYIRKIAKQIHPDLRVSAEFTTSVNNMINVLGKKLILTA...,,,
9,HISTDB_H2B_H2A_like_17,H2B-H2A-like_(Viruses),,,,,,,,,,MSSRKRTKRKVYTFKSHIYKVLKQVHPDTGFTTDAKEQMNHFVEHF...,,,


In [115]:
# Look up the publication record that the new sequences are linked to below.
# The id is passed as a bound parameter rather than formatted into the SQL text.
pid = 'irwin_self-assembling_2024'
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,irwin_self-assembling_2024,Self-assembling viral histones are evolutionar...,10.1038/s41564-024-01707-9,,2024


In [116]:
# Link every prepared sequence to publication `pid`, one row at a time.
# Failures are reported (accession and error message) and collected in
# `failed_toadd_publication` instead of aborting the loop. `Exception` is
# caught rather than using a bare `except:` so that KeyboardInterrupt and
# SystemExit still propagate and the cause of each failure is visible.
failed_toadd_publication = []
for ds in data_sequence:
    try:
        cursor.execute(add_sequence_has_publication, (ds["accession"], pid))
    except Exception as e:
        print(ds["accession"])
        print(e)
        failed_toadd_publication.append(ds["accession"])

In [117]:
# Fetch the sequences of the current "<type_like>-like_(Viruses)" variant
# together with their publication links (LEFT JOIN, so unlinked sequences are
# kept), then show only the rows whose accession starts with `accession_prefix`.
# The variant name is passed as a bound parameter rather than formatted into
# the SQL text, in line with the parameterized INSERT statements of this notebook.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.variant = %s"
)
cursor.execute(query, (f"{type_like}-like_(Viruses)",))
df = pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])
df[df["accession"].str.startswith(accession_prefix)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_H2A_like_0,H2B-H2A-like_(Viruses),,,,,,,,,,MVSDTKYNIYIKRVKNGINREIRISSDAIEIMNDIINYSISILMKA...,,HISTDB_H2B_H2A_like_0,irwin_self-assembling_2024
1,HISTDB_H2B_H2A_like_1,H2B-H2A-like_(Viruses),,,,,,,,,,MAVYTTTKFDTFIHRVLKQVHPDTGMTGDAMSCIDNAIRIVIKKIM...,,HISTDB_H2B_H2A_like_1,irwin_self-assembling_2024
2,HISTDB_H2B_H2A_like_10,H2B-H2A-like_(Viruses),,,,,,,,,,MVMSKRKYDKFDTYIERVLKHLYPELGMTSEAKMEVNNLIKASFQK...,,HISTDB_H2B_H2A_like_10,irwin_self-assembling_2024
3,HISTDB_H2B_H2A_like_11,H2B-H2A-like_(Viruses),,,,,,,,,,MADVEKRRKRKDYSSFDTYIQKVLKQVHPDTGIKGEAMVEIDNFVK...,,HISTDB_H2B_H2A_like_11,irwin_self-assembling_2024
4,HISTDB_H2B_H2A_like_12,H2B-H2A-like_(Viruses),,,,,,,,,,MATKARTKSKKVTKKNTGSVSVSKKTTQESSASQESRLKKKKNYDT...,,HISTDB_H2B_H2A_like_12,irwin_self-assembling_2024
5,HISTDB_H2B_H2A_like_13,H2B-H2A-like_(Viruses),,,,,,,,,,MNTAEPELNFTIYTYKVLKQVHPDTGISSGANTQMNVIINAIGEKI...,,HISTDB_H2B_H2A_like_13,irwin_self-assembling_2024
6,HISTDB_H2B_H2A_like_14,H2B-H2A-like_(Viruses),,,,,,,,,,MENFEVYIKKVLKQVHPDTHINQTTVSLINFMINKIGDSIVQESNR...,,HISTDB_H2B_H2A_like_14,irwin_self-assembling_2024
7,HISTDB_H2B_H2A_like_15,H2B-H2A-like_(Viruses),,,,,,,,,,MNTLNLQTYIRKIAKQIHPDLRVSAEFTTSVNNMINVLGKKLILTA...,,HISTDB_H2B_H2A_like_15,irwin_self-assembling_2024
8,HISTDB_H2B_H2A_like_16,H2B-H2A-like_(Viruses),,,,,,,,,,MNTLNLQTYIRKIAKQIHPDLRVSAEFTTSVNNMINVLGKKLILTA...,,HISTDB_H2B_H2A_like_16,irwin_self-assembling_2024
9,HISTDB_H2B_H2A_like_17,H2B-H2A-like_(Viruses),,,,,,,,,,MSSRKRTKRKVYTFKSHIYKVLKQVHPDTGFTTDAKEQMNHFVEHF...,,HISTDB_H2B_H2A_like_17,irwin_self-assembling_2024


In [118]:
# Commit the open transaction on `conn` so that the rows inserted through
# `cursor` in the preceding cells are persisted in the database.
conn.commit()

# Add other archaeal sequences as H4-H3-like_(Viruses)

These sequences were taken from the [article](https://www.nature.com/articles/s41564-024-01707-9#data-availability).

pid='irwin_self-assembling_2024'

**TODO: the accession identifiers have not been looked up yet — they still need to be found!**

**TODO: possibly not all sequences have been added — needs checking.**

In [119]:
# Histone type pair handled in this section. It is used for the variant name
# and, with "-" replaced by "_", for the prefix of the generated accessions.
type_like = 'H4-H3'
type_token = type_like.replace("-", "_")
accession_prefix = "HISTDB_" + type_token + "_like"
print(f'accession_prefix: {accession_prefix}')

accession_prefix: HISTDB_H4_H3_like


In [120]:
fasta_str = '''>M-3300005613-23.Ga0074649_1000021_124
MAAKPNKKTTSAKSKKVSAKTTTKTSKPKKNKDNSESLEDTLEGQIRQGPLRNIMSKAGMERVNEKAFEVVRNHTQKYLTSLLKDAYAFTTADKRTTIGSEDVNKVVEARKYSHTFEDVVKMIDEADEKILFIANANAERTCRKIADSDGFYEGEKKPRFGNDTIKCFRYYLEAELVRLVESVLLVTQHNNRKTIKAEDIELVLRIEERCPKCV
>M-3300005613-23.Ga0074649_1000021_130
MPPRVRKILRNNIHGVSTPVIKRFYRRGGALAVPTPLVEYTRSYLSRIMKAYLHDIIVIADYNKKKTVSTDHFKTALELRNKSLASSKKPNVKACKNAATVYKKITAAKRVEYQQKNTDCLIIPRLPFNRLTREIAQDFETGLAFSSDFLIYFQIFIEMTLSDLTTEAVKHAKHAKRKGVTTEDLDMASVHIEDHLM
>M-3300010354-11.Ga0129333_10001603_23
MSCKTQHDGGSPKPPSTFGGTVTKEVKNLTTKKIAKSGTTRKVYRDNVYGITNNVIARLAHKAGVKTISGLIYKELRAIINNNFLEPVVQASLTYMEHANRVTVSREDVLEGFRIKGFTIYSGGDEANPKKCKDYDSTLKEKKSDKKVKPGMKSLKEIRHYQTQSDCVYIARAAFERLVKEIVQDVRTDAVRWSSDAMGLLQQSIEAEIVNFIENANLCAIHAGRQTLQPKDLQLVMKILNIKT
>M-3300015360-154.Ga0163144_10009201_26
MPTPKKHRKVLRDNIQGITSPALKRILHRAGVKRINTIVYEELRGRLLNFARVTISKVIIFTQHVHRRTVSVKDLECALDIMGISLAAGHNANAKKTVTLQSCNSRGKTSAVSKGGRKKKPGEMVTRDIRYQQKNSDCLAIPKLNFERLVREVGQDFETDLRYSPGFFDLFQLVCEDYLYHMCVFAYSIAVAAGRETLTGKDLHLAVSIATG
>M-3300023087-13.Ga0255774_10000023_167
MSTKVKIPVRRARVTLKNANIAKPSLVRVARRAGVVSLQELVYPEMRRIIKMQIDNIIKFAYIFAKYRDRRTITFDDVKLALERLGLKVWGVNRIEKVNRSGLKTTSPKRKTQRGTKAVREVKKYQQSTSLLMARAPIERLIRKSMGSWSRNFRIEARAISLIHVVVEKQMTQLLAKSQIAAIHAKRKTVQTKNVSLIRKLNNAVANWEV
>M-3300023184-71.Ga0214919_10000112_93
MSYQELGDITKPAILRLAHRVGVVRVSGLMFEETRGILGSFLQEFLRRVVIHTEHDRKHTISVNHVYASMWPNKVLLTTKDLKDCQSKSKDIQTCLSIPKAPFERLVRAIVQVYKKDLRIQGEAALLIQYYTEMYLLKLWSLAHRMAMHGHRMTVEPRDLQSARYALKMAM
>M-3300023210-3.Ga0233412_10000202_36
MLKFCDILLIVIFFSINKNMNKVQDLNKTAIRRIAQRAGIKSISGLLYEETRGVAAVFLENLLRIVVSFTQNARRKTVSVKDVENALKADLLPMGDFASLEKGHVTHCKNLNPRSKSGAKTSKFRPGVKALMNIRKAQKADCLYFPAASFSRFVRKIGDKFMKGLRFSQESLDLIQIVLENHLILLFEDANLCAISAGRQTVYPKDVQLVRRIIGMRS
>M-3300027721-1.Ga0209492_1002999_3
MNDSALKRISYRAGVYRTHSIVYFHMRQIGEKYLSSVIYYAIIYTQYGRRTTISEEDVINAIEKTGYISLYPTTGSVKKCKISKHKKILSKIREYQQQYDCFTLAKAPIEALIKQETATYDTSWKKYRWSLDALRTLHFALEYMLYKLFFAANKIAIHSDRSTIQHKDISLAIELIEDNCQK
>M-3300027790-3.Ga0209273_10001459_19
MPRRRSASRRKSSKPRRSGRRTSGKRRSSSRAPPMRYRHRKVLRDNIHGFKNPQIKRISLKAGVGQLSGSVYDLTKRYIGVFLHELVKTAVTYAEHSRRKTITVQDILDAFERITAGHKIYGTDSSEGCKMYKGKKGKVSQATYITRQIGFYQRQHDCFYFTRTAVDRFLKEVTQDYNLNMRWTAEAKIAAQAALENYILQLFQKAGLIAHEQKRTTVMDKDLFLAARLCGAFNMNRPLEY
>S-1041346-124.1041346_contig_12464_3
MSEKPAVNHAEWKRLSEKAGIPQLSYTAYPYLDYISEEFLKKIISKTLIFMEHDNRITVYRKDVENAISVAGYSPLFDVPDEGLKRCKISNARRPASRTRSYQSQYNCLMTAKAPFRAAVIRLIGNNRIRLCIGSDRILLSEKSLTSLHIAFETFIHSILTSSAEVMYNAKRKTLHRSDIKVASKIVLKNCSVMNLQDRKLKLWQDVF
>S-1064190-106.1064190_contig_13255_8
MSTKVKIPVRRARVTLKNANIAKPSLVRVARRAGVVSLQELVYPEMRRIINMQIDNIIKFAYIFAKYRDRRTITFDDVKLALERLGLKVWGVNRIEKVNRSGLKTTSPKRKTQRGTKAVREVKKYQQSTSLLMARAPIERLIRKSMGSWSRNFRIEARAISLIHVVVEKQMTQLLAKSQIAAIHAKRKTVQTKNVSLIRKLNNAVANWEV
>S-1092401-23.1092401_contig_1062_13
MKGYGMIGEGSWDMTKGKRKDMKEMKGNGGGDEISYENEKWAEMAGGAKKHRKHKILTNNIKNVNNASIKNMSLVAQIESLSSASNDIVRNLLYAYLNELIHRTVIITDHNKSKTILKDHVVMALHSIDRAFRISQYYGDMDDNEKIKRCTKPEVDVKDKKKKQERTHKIAAERHGECIYVSREGFVRLIREIAQNYVWETKISQDASRYIQLLAEQFVIETMTRANIVARQVGRSTVTPSDFIVVFKLRNVPKTVYQ
>S-1096103-95.1096103_contig_746_3
MASPEGIESITNPDITRLARKGGAKIIAENVYDFAREIMRDYVGELIRGSVASMRYSGKKTLKPADLDTALTLNHKVLLAGVGPSLDTCRSYKPTQGKSRSGQNALREIGYYQKHSDCLVVRKLPFDRLLREVSQQYEDIKQISKDFVLLSQLVTESFLVNIFKDAVLTAIHSGRQKITDNDIKLVLTIRRVPY
>S-1096103-95.1096103_contig_3144_2
MTTRNRKQVTSTKNTEIEDPITGFKSIQIRRLAYRAGAERIESDTYDCVRQVISEQVSDLLRNILVFTTHTDRKTVTLEDLEGALESKGKFLMAGGNTARDIKKSKAKAKKPKKVPTDEGSSSDPPKKPHRFRPGTVANQEIKRNQETSDRLIFKQNRFEAYVRDLTKKQLSNWGKSQTTLRFSKEFFKLFQVVMEEYLVRLLERATKASAHANRKSISKKDVDFIYAITSQ
>S-1096109-37.1096109_contig_580_4
MAGRHRMVLRDPIQGITKPAITRIANRAGCKRLSGLIYEETRGIALVRMEKIVKAAVVYTDNAGRQTVKESDVSAALADLGDGAAWIKPRKKKSGRRLVGYIALEQRERAFRMDEDDDGIADLFDDPARGPAPGGVRRRHRYRPGTVALRNIRREQKKTNLMIPRQSFNRIVRKVGSYYKSDLRYSDSALELIQMAVEHYLVLLFEEANILAIHAKRQTVQPKDLQAARRVRGDRS
>S-1097288-23.1097288_contig_41_5
MNDSALKRISYRAGVYRTHSMVYLQLRKIGEKYLSSVIRYAIIYAEYAKRRTISEDDVLNALEKTGFVFLYPTIGKIKQCKISNRKDVISKIREYQQQHDCFTLAKAPIEALIKKKSDFKWSFNAIRTLHLALEYMLYKLCFSANKIAINSNRRTVQHKDVSLAIELINDNCQFEALQDFNFRV
>S-3300009084-278.3300009084_a_Ga0105046_10001788_30
MVQGSKKRPAKKTVSKAYTTKTSRKKYDETRPKRGSLRRMIKRYLDIRISSTLYNAVGEYAYNLTSLIVDNAYTYAEYARRVTILDRDVEDAARLYGIEKKDYKSLSLGDLPKCDMYTKERGKGLKQKITRAARKASEGKKKNIEQYSREVEKVLRVDFTDLECVTLQKASFRKFVTHIMEDIGIDMATAKDGSKVKTRWSSAAMDLLQLIVEGLILKYAAGAAAIAKSGKRETITGSDFQIAKRIGGNKKLSF
>S-3300009702-216.3300009702_a_Ga0114931_10012995_3
MSTRKGRRRSSATKRKTKTMKDPIEGLKNPQIKRLAYRAGVQRIDGGIYDMVRKIIHERSDKLVGKTLIFTTNSKRKTVSVEDLQGALESEGLYLMAGGNTAKEFRGCKARPKKTFKVGAKKHRFRPGTVARRQITYNQKNSDCLVFEQKPFKVFIRDLAERKIKEWKIRDLKLRFKEDFFKLFQFVIEEYLVGLLRGALRIAEHSKNQTVRPKDIALADTLDVYSDRTI
>S-403982-34.403982_contig_1015_12
MTKISKEVLNKLAKKAGVKSLSGLTYEEIIGTVLVMIQKILRNCILISNNRKTITLDDIKYSFDFLGHKLYTYDEDISKCKTFTKTKVKNKYLKQIKYYQNQADCVYIPKASFKKIVKNELNNLQYDLPDDLQYSKNMKISNIALDHFQFAIESLCIELLEKAYKLAITHAKRITLQPKDIAAVRYILSNSDPLLGS
>Anmi.NC_023848_1_144
MNDSALQRIAYKAGITRISESVYGFVREAGNMYLRSITEYAEIYAEYEGKKTISGDHAIHAIENTGFADQYKIKDSLKPCKISGKVKVLAKIREYQKQHDCNTLAKAPIERDIKSFTHFKISKEGVQNIHSALEYFIYKLFFSALLITLNAKRKTVSVDDVKLTLRMINDNCKSINFR
>Brma.Brma_1_445
MSKSGKKTLARPLGHLASFVHSKETQLPKATTQHLLRKAGSRTSAADAFEPITGFVHMKLEKLLGKALLSMQFAKRTTLLKEDVQKAAEMMHLPVFAIPSKKETGAKGSVFLSCRQSGSGSELKGKETNVQEIRKQQRQTCMIIPKERFRTIVKEIADKLSVAESVRLSEKALDLLQVIVESCTVRLLEKALALTMEAGKDRVTGRHIEAIFLIEHGPL
>Gold.Gold_1_410
MSARALSSFLNRGETRLPKATTQHLLKKAGARTASSDTFEPICGYVAMKIDKILKRALMTMQYANRSTLLKGDVLKAAEMLHIPVFAVPKKGESGTRGSVFLSCRQDGSGSQLKGKETGTQEVRKQMKQTCLIIPKARFRDIVQESAVKIGAGYADVRFSEKALDILQVIVESMTVRLLEKARILTTAGKRDRVTGKDIDGAFFIEHGPY
>Inr2.Inr2_1_141
MNNSALQRLAHKAGATRVSSEVYNRARNIGDQYLKSIVNYAIIYCEHENKKIITEEHALHAIEHVGFSGMYRAVGDIKKCKTSNKKKLITRIKEYQNQSDCVTLSKAPIEQQIKSLGSGFKWSKEALINIQFALEFMLYQLLYSALKVTINAKRITMLDSDVDLTIDLITTNCKNIRL
>Inr4.Inr4_1_154
MNNSALQRIAHKAGATRVSSEVYNRARSIGDQYLDSIVRYAIIYCDHEKKKVVTEDHALHAIEHVGFSGMYRVSGDTPKCKTSNKKKLITRIKEYQNQSDCVTLSKAPIEHQIKSLGSGYKWSKESLINIQFALEFMLYQLLSSALKVTINAKRVTMSESDLDLTIDLITTNCKNIRI
>Inr5.Inr5_1_146
MNNSALQRLAHKAGATRVSSEVYNRARNIGDQYLKSIVNYAIIYCEHENKKIITEEHALHAIEHVGFSGMYRAVGDIKKCKTSNKKKLITRIKEYQNQSDCVTLSKAPIEQQIKSLGSGFKWSKEALINIQFALEFMLYQLLYSALKVTINAKRITMLDSDVDLTIDLITTNCKNIRL
>Laus.Laus_1_416
MSKSGKKTLAPAPGYLASFVRSKETQLPRATTQHLLRKAGSRTSTADTFEPITGFVHMKLEKLLGKALLAMQFAKRTTLLKEDVKKAAEMMHLPVFAVPSKKESGAKGSVFLSCRQSGSGSELKGKETNMQEIRKQQRQTCMIIPKARFRDIAKEIVDRELEGVRLSEKALDLLQLIVESLTVRLLEKAVALTLEAQKDRVTGRSIEAIFKIEHGPL
>Mama.Mama_1_475
MSKAGKKVKAQQHGHLADHVSVGETQIPKASTQHLLRKAGSLSAAGDTEVPIRGFVHMKLHKLVQKSLLAMQLAKRKTIMKSDVKKAAELMHLPVFAIPTKDSGAKGSVFLSCRQKGAGSAGTGSETNSQEVRSQMRSTCLIIPKERFRTMAKEISKKEGHDVHIAEAALDMLQVIVESCTVRLLEKALVITYSGKRTRVTSKDIETAFMLEHGPL
>Me06.Me06_1_473
MSKAGKKVKAQQHGHLADHVSVGETQIPKASTQHLLRKAGSLSAAGDTEVPIRGFVHMKLHKLVQKSLLAMQLAKRKTIMKSDVKKAAELMHLPVFAIPTKDSGAKGSVFLSCRQKGAGSAGTGSETNSQEVRSQMKSTCLIIPKERFRTMAKEISKKEGHDVHIAEAALDMLQVIVESCTVRLLEKALVITYSGKRTRVTSKDIETAFMLEHGPL
>Noum.Noum_1_95
MSKSGKKSAAHAPGHLASFVRSKETQLPKATTQHLLRKAGARTSSASTFEPITGFVHMKLEKLLDKALLSMQFAKRTTLLKEDVKKAAEMMHLPVFAIPSKKESGAKGSIFLSCRQSGSGSELKGKETNAQEIRKQQRQTCMIIPKARFRDIVKEIADRQSTEKDVRLSEKALDLLQIIVESLTVRLLEKALVLTLEAKRDRVTGHDIETMFKIEHGPL
>Povi.Povi_1_415
MSKSGKKTLAPAPGYLASFVRSKETQLPRATTQHLLRKAGSRTSTADTFEPITGFVHMKLEKLLGKALLAMQFAKRTTLLKEDVKKAAEMMHLPVFAVPSKKESGAKGSVFLSCRQSGSGSELKGKETNMQEIRKQQRQTCMIIPKARFRDIAKEIVDRELEGVRLSEKALDLLQLIVESLTVRLLEKAVALTLEAQKDRVTGRSIEAIFKIEHGPL
>Tokv.Tokv_1_112
MSKAAKKSKSQQHGHLASYVLGGETQIPKATTQHLLRKAGSLSATDDTEVPIRGFVRMKLHKLLQKSLLAMQLAKRKTILKGDVKKAAELMHLPVFAIPTKESGAKGSVFLSCRQKGAGSAGSGSDTNSQEVRNQMKSSCLIIPKERFRTMAKEISKREGHDVHIAEAALDMLQVIVESCTVRLLEKALVITYEKDKKRVTSRDIETAFMLEHGPL
>Tufo.Tufo_1_363
MSKSGKKTLTRTSGHLSSFVHDKETQLPKATTQHLLRKAGSHTSSATAFEPITGFVHMKLHKLLGKALLAMQYAKRTTLLKGDVVKAAEMMHLPVFAIPSKKESGAKGSVFLSCRQAGSGSELKGKETNVQEIRKQQRQTCMIIPKERFRTIVKEIADKLAMVDSVRLSEKALDLLQLIVESCTVRLLEKALALTKSAKRDRVNGQDIETVFLIEHGPL
>Wiir.Wiir_1_84
MNNSALQRIAHKAGATRVSSEVYNRARSIGDQYLDSIVRYAIIYCDHEKKKVVTEDHALHAIEHVGFSGMYRVSGDTPKCKTSNKKKLVTRIKEYQNQSDCVTLSKAPIEHQIKSFGSGYKWSKESLINIQFALEFMLYQLLFSALKVTINAKRVTMSESDLDLTIDLITTNCKNIRI
>M-3300009488-16.Ga0114925_10000125_7
MPKDKLSRTYIKNIANSVGVPKVSGLIYEQLRSMTKDYLQKVLSKAVTYSEYYGKNTINKGIIFLATDKEKWSEDLPGKKCKAPNLKVQNTVEKRVQWYKKEQGENRCLHFPTSSFSSFVKDVLEDTTTKKIKITKQAMIYLQYIVEAYMANILKNGLLFVKHRGGPVLQPKDLTIGMRYKKI
>M-3300023174-67.Ga0214921_10015819_1
MNTLVGITKPALIRLAKRAGIVRFSGLSYEESRMILFTYLEEFLRRVAIHTDHDRKSTVSVNHVYASMWPNKVILTTKDVKECKSQSKQIQTCTTFPKASFERLVRAILGEYKSDLRIQHEAALLIQYYAEMYLLKVYGLALRMALHDHHRLTVEPRDLIQARYAIHLSGGKW
>M-3300023184-66.Ga0214919_10000202_63
MCHRMGIQRISQPVFNEIRFFLFSWIGDVLLKLNEIMSYDDHKTTVTWEDVMKVLSPKIILRRRKVPRCRPNTPVLPNCFVFPVQSFLRVLRLHTMDLEQNLRFTPEALTLLQFALETYFAEKVKIAYENMTRARRKTLFSEDFSTH
>S-1096103-95.1096103_contig_823_24
MSRGLGKSRLVFTGGMGLSGAKVYRRPIYRDNTQGITKPVILRLAHRGGVKSLSGVMYEETRGILKVFLQDLIRLSNLSKVYARRSTYQVKDLEFALNVKNKYLVAGVDPKSKTTSSLQSCKLRKRAEKEPGKQRRRAKSGTNAIREIRYVQENSDCLLIPHLAFKRLVLEIAQEYSDDDIRVSDAFARLIQLVTEEYLTALFEDANFAAIHSGRVTVNPKDMRLARRIRKERA
>S-1101174-77.1101174_contig_724_7
MGNEDLKKPQIARLIERVGIDRISGLTYEESHGIIRQFLEYFLKKVVIYTDYYRKKTVSVDHVMMSMKSSLFLIQKNIPTCSLNDRKPTTCLVFQKAPFQRLIREIASYYKKDLSFEQEALQLIQYYTEIYMTNVYTEARYILMASKRETLEPRDLQLARDVTGRGKKIHP
>M-3300005613-23.Ga0074649_1000658_5
MQSNDITNSSINKFLRRGGALDISGDSHGVVRSYIRRVVEKYIRDVSIITNFYKQRTVSLEHFKRALEISNQALASSKPKGSKSEIKQCKDASTVYKKITGKTRVKYQQNNTDCLIIPKATFKRYTKSVLEDVSELQKVSVDFLLYFQLFIEYIISDMALEATKHAKFREGSEDNFKVTAKDINMAIAHDNYI
'''
# Parse the in-memory FASTA text into a list of SeqRecord objects; the
# context manager closes the StringIO buffer as soon as parsing is finished.
with StringIO(fasta_str) as fasta_io:
    seq_records = list(SeqIO.parse(fasta_io, "fasta"))
seq_records

[SeqRecord(seq=Seq('MAAKPNKKTTSAKSKKVSAKTTTKTSKPKKNKDNSESLEDTLEGQIRQGPLRNI...KCV'), id='M-3300005613-23.Ga0074649_1000021_124', name='M-3300005613-23.Ga0074649_1000021_124', description='M-3300005613-23.Ga0074649_1000021_124', dbxrefs=[]),
 SeqRecord(seq=Seq('MPPRVRKILRNNIHGVSTPVIKRFYRRGGALAVPTPLVEYTRSYLSRIMKAYLH...HLM'), id='M-3300005613-23.Ga0074649_1000021_130', name='M-3300005613-23.Ga0074649_1000021_130', description='M-3300005613-23.Ga0074649_1000021_130', dbxrefs=[]),
 SeqRecord(seq=Seq('MSCKTQHDGGSPKPPSTFGGTVTKEVKNLTTKKIAKSGTTRKVYRDNVYGITNN...IKT'), id='M-3300010354-11.Ga0129333_10001603_23', name='M-3300010354-11.Ga0129333_10001603_23', description='M-3300010354-11.Ga0129333_10001603_23', dbxrefs=[]),
 SeqRecord(seq=Seq('MPTPKKHRKVLRDNIQGITSPALKRILHRAGVKRINTIVYEELRGRLLNFARVT...ATG'), id='M-3300015360-154.Ga0163144_10009201_26', name='M-3300015360-154.Ga0163144_10009201_26', description='M-3300015360-154.Ga0163144_10009201_26', dbxrefs=[]),
 SeqRecord(seq=Seq('MSTKVKIPVRRARVTLK

In [121]:
# Build one row dict per parsed FASTA record. The accession is synthetic
# ("<accession_prefix>_<position in seq_records>"); the original FASTA id is
# only echoed to the cell output and is not stored in the row.
null_columns = (
    "gi",
    "ncbi_gene_id",
    "hgnc_gene_name",
    "taxonomy_id",
    "organism",
    "phylum",
    "class",
    "taxonomy_group",
    "info",
)

data_sequence = []
for index, record in enumerate(seq_records):
    print("------------------------------------------------")
    print(record.id)
    row = {
        "accession": f"{accession_prefix}_{index}",
        "variant": f"{type_like}-like_(Viruses)",
    }
    # Annotation columns are left empty (None) for these records.
    row.update(dict.fromkeys(null_columns))
    row["sequence"] = str(record.seq)
    row["variant_under_consideration"] = None
    data_sequence.append(row)

------------------------------------------------
M-3300005613-23.Ga0074649_1000021_124
------------------------------------------------
M-3300005613-23.Ga0074649_1000021_130
------------------------------------------------
M-3300010354-11.Ga0129333_10001603_23
------------------------------------------------
M-3300015360-154.Ga0163144_10009201_26
------------------------------------------------
M-3300023087-13.Ga0255774_10000023_167
------------------------------------------------
M-3300023184-71.Ga0214919_10000112_93
------------------------------------------------
M-3300023210-3.Ga0233412_10000202_36
------------------------------------------------
M-3300027721-1.Ga0209492_1002999_3
------------------------------------------------
M-3300027790-3.Ga0209273_10001459_19
------------------------------------------------
S-1041346-124.1041346_contig_12464_3
------------------------------------------------
S-1064190-106.1064190_contig_13255_8
------------------------------------------------

In [122]:
# Sanity check: print every field of the first prepared row with its Python type.
first_row = data_sequence[0]
for field, value in first_row.items():
    print(field, value, type(value))

accession HISTDB_H4_H3_like_0 <class 'str'>
variant H4-H3-like_(Viruses) <class 'str'>
gi None <class 'NoneType'>
ncbi_gene_id None <class 'NoneType'>
hgnc_gene_name None <class 'NoneType'>
taxonomy_id None <class 'NoneType'>
organism None <class 'NoneType'>
phylum None <class 'NoneType'>
class None <class 'NoneType'>
taxonomy_group None <class 'NoneType'>
info None <class 'NoneType'>
sequence MAAKPNKKTTSAKSKKVSAKTTTKTSKPKKNKDNSESLEDTLEGQIRQGPLRNIMSKAGMERVNEKAFEVVRNHTQKYLTSLLKDAYAFTTADKRTTIGSEDVNKVVEARKYSHTFEDVVKMIDEADEKILFIANANAERTCRKIADSDGFYEGEKKPRFGNDTIKCFRYYLEAELVRLVESVLLVTQHNNRKTIKAEDIELVLRIEERCPKCV <class 'str'>
variant_under_consideration None <class 'NoneType'>


In [123]:
# Execute the `add_sequence` statement once per prepared row.
# A failing row does not stop the loop: its accession, the error and all of
# its column values are printed, and the accession is kept in `failed_toadd`.
failed_toadd = []
for ds in data_sequence:
    try:
        cursor.execute(add_sequence, ds)
    except Exception as e:
        accession = ds["accession"]
        print(accession)
        print(e)
        for k in ds:
            v = ds[k]
            print(k, v, type(v))
        failed_toadd.append(accession)

In [124]:
# Read back the sequences of the current variant together with any linked
# publication (LEFT JOIN keeps sequences that have no publication yet).
# The variant name is passed as a bound parameter instead of being spliced
# into the SQL text, so quoting is handled by the connector.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.variant = %s"
)
cursor.execute(query, (f"{type_like}-like_(Viruses)",))
df = pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])
# Show only the rows whose accession starts with `accession_prefix`.
df[df["accession"].str.startswith(accession_prefix)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H4_H3_like_0,H4-H3-like_(Viruses),,,,,,,,,,MAAKPNKKTTSAKSKKVSAKTTTKTSKPKKNKDNSESLEDTLEGQI...,,,
1,HISTDB_H4_H3_like_1,H4-H3-like_(Viruses),,,,,,,,,,MPPRVRKILRNNIHGVSTPVIKRFYRRGGALAVPTPLVEYTRSYLS...,,,
2,HISTDB_H4_H3_like_10,H4-H3-like_(Viruses),,,,,,,,,,MSTKVKIPVRRARVTLKNANIAKPSLVRVARRAGVVSLQELVYPEM...,,,
3,HISTDB_H4_H3_like_11,H4-H3-like_(Viruses),,,,,,,,,,MKGYGMIGEGSWDMTKGKRKDMKEMKGNGGGDEISYENEKWAEMAG...,,,
4,HISTDB_H4_H3_like_12,H4-H3-like_(Viruses),,,,,,,,,,MASPEGIESITNPDITRLARKGGAKIIAENVYDFAREIMRDYVGEL...,,,
5,HISTDB_H4_H3_like_13,H4-H3-like_(Viruses),,,,,,,,,,MTTRNRKQVTSTKNTEIEDPITGFKSIQIRRLAYRAGAERIESDTY...,,,
6,HISTDB_H4_H3_like_14,H4-H3-like_(Viruses),,,,,,,,,,MAGRHRMVLRDPIQGITKPAITRIANRAGCKRLSGLIYEETRGIAL...,,,
7,HISTDB_H4_H3_like_15,H4-H3-like_(Viruses),,,,,,,,,,MNDSALKRISYRAGVYRTHSMVYLQLRKIGEKYLSSVIRYAIIYAE...,,,
8,HISTDB_H4_H3_like_16,H4-H3-like_(Viruses),,,,,,,,,,MVQGSKKRPAKKTVSKAYTTKTSRKKYDETRPKRGSLRRMIKRYLD...,,,
9,HISTDB_H4_H3_like_17,H4-H3-like_(Viruses),,,,,,,,,,MSTRKGRRRSSATKRKTKTMKDPIEGLKNPQIKRLAYRAGVQRIDG...,,,


In [125]:
# Look up the publication that the sequences will be linked to.
# `pid` is bound as a query parameter rather than formatted into the SQL.
pid = 'irwin_self-assembling_2024'
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,irwin_self-assembling_2024,Self-assembling viral histones are evolutionar...,10.1038/s41564-024-01707-9,,2024


In [126]:
# Execute `add_sequence_has_publication` for every (accession, pid) pair.
# Failures are reported and collected in `failed_toadd_publication` instead
# of aborting the loop.
failed_toadd_publication = []
for ds in data_sequence:
    try:
        cursor.execute(add_sequence_has_publication, (ds["accession"], pid))
    except Exception as e:
        # `Exception` rather than a bare `except:` so that Ctrl-C / SystemExit
        # still interrupt the cell; the error itself is printed so the cause
        # of a failed insert is visible.
        print(ds["accession"])
        print(e)
        failed_toadd_publication.append(ds["accession"])

In [127]:
# Read back the sequences of the current variant together with their linked
# publication (LEFT JOIN keeps sequences that have no publication).
# The variant name is passed as a bound parameter instead of being spliced
# into the SQL text, so quoting is handled by the connector.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.variant = %s"
)
cursor.execute(query, (f"{type_like}-like_(Viruses)",))
df = pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])
# Show only the rows whose accession starts with `accession_prefix`.
df[df["accession"].str.startswith(accession_prefix)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H4_H3_like_0,H4-H3-like_(Viruses),,,,,,,,,,MAAKPNKKTTSAKSKKVSAKTTTKTSKPKKNKDNSESLEDTLEGQI...,,HISTDB_H4_H3_like_0,irwin_self-assembling_2024
1,HISTDB_H4_H3_like_1,H4-H3-like_(Viruses),,,,,,,,,,MPPRVRKILRNNIHGVSTPVIKRFYRRGGALAVPTPLVEYTRSYLS...,,HISTDB_H4_H3_like_1,irwin_self-assembling_2024
2,HISTDB_H4_H3_like_10,H4-H3-like_(Viruses),,,,,,,,,,MSTKVKIPVRRARVTLKNANIAKPSLVRVARRAGVVSLQELVYPEM...,,HISTDB_H4_H3_like_10,irwin_self-assembling_2024
3,HISTDB_H4_H3_like_11,H4-H3-like_(Viruses),,,,,,,,,,MKGYGMIGEGSWDMTKGKRKDMKEMKGNGGGDEISYENEKWAEMAG...,,HISTDB_H4_H3_like_11,irwin_self-assembling_2024
4,HISTDB_H4_H3_like_12,H4-H3-like_(Viruses),,,,,,,,,,MASPEGIESITNPDITRLARKGGAKIIAENVYDFAREIMRDYVGEL...,,HISTDB_H4_H3_like_12,irwin_self-assembling_2024
5,HISTDB_H4_H3_like_13,H4-H3-like_(Viruses),,,,,,,,,,MTTRNRKQVTSTKNTEIEDPITGFKSIQIRRLAYRAGAERIESDTY...,,HISTDB_H4_H3_like_13,irwin_self-assembling_2024
6,HISTDB_H4_H3_like_14,H4-H3-like_(Viruses),,,,,,,,,,MAGRHRMVLRDPIQGITKPAITRIANRAGCKRLSGLIYEETRGIAL...,,HISTDB_H4_H3_like_14,irwin_self-assembling_2024
7,HISTDB_H4_H3_like_15,H4-H3-like_(Viruses),,,,,,,,,,MNDSALKRISYRAGVYRTHSMVYLQLRKIGEKYLSSVIRYAIIYAE...,,HISTDB_H4_H3_like_15,irwin_self-assembling_2024
8,HISTDB_H4_H3_like_16,H4-H3-like_(Viruses),,,,,,,,,,MVQGSKKRPAKKTVSKAYTTKTSRKKYDETRPKRGSLRRMIKRYLD...,,HISTDB_H4_H3_like_16,irwin_self-assembling_2024
9,HISTDB_H4_H3_like_17,H4-H3-like_(Viruses),,,,,,,,,,MSTRKGRRRSSATKRKTKTMKDPIEGLKNPQIKRLAYRAGVQRIDG...,,HISTDB_H4_H3_like_17,irwin_self-assembling_2024


In [128]:
# Commit the open transaction so the sequence rows and publication links
# inserted above are persisted in the database.
# NOTE(review): `failed_toadd` / `failed_toadd_publication` are not checked
# before committing - confirm they are empty.
conn.commit()

# Add H2B-H2A-H3-like_(Viruses)

In [129]:
# Insert the new variant into the `histone` table.
# Every row shares the fields below and differs only in its id.
shared_fields = {
    "level": "variant",
    "taxonomic_span": "Viruses",
    "taxonomic_span_id": "10239",
    "description": None,
    "parent": "Triplet",
}
data_histone = []
for htype in ['H2B-H2A-H3']:
    data_histone.append({"id": f"{htype}-like_(Viruses)", **shared_fields})
for dh in data_histone:
    cursor.execute(add_histone, dh)

In [130]:
# Load the whole `histone` table and display only the variant added above.
query = "SELECT * FROM histone"
cursor.execute(query)
histone_rows = cursor.fetchall()
histone_columns = [col[0] for col in cursor.description]
histone_df = pd.DataFrame(histone_rows, columns=histone_columns)
added_ids = [f"{htype}-like_(Viruses)" for htype in ['H2B-H2A-H3']]
histone_df[histone_df["id"].isin(added_ids)]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
154,H2B-H2A-H3-like_(Viruses),variant,Viruses,10239,,Triplet


In [131]:
# Commit the open transaction so the histone row inserted above is
# persisted in the database.
conn.commit()

# Add other archaeal sequences as H2B-H2A-H3-like_(Viruses)

These sequences come from the data-availability section of this [article](https://www.nature.com/articles/s41564-024-01707-9#data-availability).

pid='irwin_self-assembling_2024'

**TODO: accession identifiers have not been looked up yet — they still need to be found!**

**TODO: possibly not all sequences have been added — needs checking.**

In [132]:
# Histone type handled in this section, and the accession prefix derived
# from it: dashes become underscores, wrapped in "HISTDB_" ... "_like".
type_like = 'H2B-H2A-H3'
accession_prefix = "HISTDB_{}_like".format(type_like.replace('-', '_'))
print('accession_prefix:', accession_prefix)

accession_prefix: HISTDB_H2B_H2A_H3_like


In [133]:
fasta_str = '''>M-3300005589-24.Ga0070729_10000551_84
MEAKQQNLTESVASVAKVMRKKTRYFETYISRVLKNVASENGITSNAKQQLNSAICILARILSSVMTKLTVSSKKKTLSVKEVKNASILYITGTLLENAVKHAEESVVKFSQGETKHSSRQDKAGILFPPSITEKFLRDFGLSKVMVTKTAPIYFAAILEYLTTVILENASVLARENTRVRITIRDLEIAVRSDPDMNKLWEKCGISFIGGGVIPQIHDSLLAKKPRRKRKVKDTATATKKGHRFRPGTVSLREIKKYQKASNCLTFAKFPFERLVRSVISEQQEGMKISKDVFIVLQYYIEQFIVDFLRDAGSAAIHSGRVKLMPSDIQFISNLRHYPQLDATPFKKEKKVETEDQQEGQKLELETV
>M-3300012952-6.Ga0163180_10010018_4
MSVEQNSENLNNLDTKHKKKKFRFYDSYIPKILKQSFDNNGITSDARQQLNSILIIFSKQIANLAHELTLIAGKKTISVKEINGAVMIYITGELQNHAITEGKKAVEEFNKNINNKGSSRQTKAGILFPPSVVEKFLRKFDTSMIMVTHGAPVFLAAVLEYICLEIIELSAILAKEDKRIRITVSDLESAIKSDVELSKLLTNNNIKFLGGAVEQYIHPNLISNKMGKQVKRVIKTATNEEKVIKYKAGSIAIKDIKKYQKMGNTLIFAKQPFEKFVRQIISEYKDNVKISKVVFSIIQYVIEDYLVNFLISANAAAIHAGRVKLMAIDIDFIHTQKQNKKNIADNKIYNLFEMLENKQKKDDTNVESDELESEESESEKTNDNSTDVYNFNSDKISEQDIITTNQKSKDHFFLGSQQKHVTVDDDFDQLEEETQYTKILPNVPENSEIVIGS
>M-3300012953-22.Ga0163179_10001168_3
MNRSTQKLNGKRKRTRFYEIYLTKLLKQVSGENGITSNSKQQLNSILCVITRLVSTKVNELTEMAKKKTLSDREVIYALQVLFHGDLGKGMITTCRQAVQKYTLDSDTKGVTRQDRAGIIFPPSVTEKYLRNFGYSKTMVSNNAPVALAAAIEYLAGEMLENAAVVAKQKKRVRITIRDLEIGVRTDRELSTFFAENNLSFLGGGVMPYIHPSLLVKKTKRRRGRKSAGKRTHRYRPGTVSIREIRRFQKTSNCLTLARSPFEKYTRLVVQKLVGDKESNMKISKNVFSTLQYFIEQQMVTTINKSYMAAIHAGRVKLTAPDLKFVKGLVGLKINDEEPVKSLVIDNDDSSKEDVEEDVEEDVEEDVEEEEEEDVEEDVEEEEEEVEEEVEEEVVEEEDVESPDFNEWKDKQIESN
>M-3300017991-11.Ga0180434_10016851_6
MQSRAKSQPPPKAATLVKAAVGKKKKSRFFETYISKVLKQVSPNNGITSNSKQQLNSALCIIARTISLMVVKLTEIAKKKTLSDKEVCNAVKVLFTGELAKHAIAEGTKSVTRFSGTSIKGTSRQGKAGIIFPPSITEKFLRNFGYSKVMVTSSAPVFLASILEYLVAEILDLSSKSANSNKRIRITIRDIQISIGKDEELSVLFDKLNISFLGGGIVPFIHPCLLTKKPRKKKKNTDSTPGVKKPHRFRPGTVAIREIKKFQKMSNCLTFAKFPFERSVRAIVNQYNPSMKISKDVFIILQYFIEQYIVNLLKNANAAAIHAGRVKLMLSDIEFICNIKKLSTEYKLTSVDEVTNGNENQEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEELVDE
>M-3300020163-17.Ga0194039_1005660_3
MSETKDTKNKHKSRYFDTYIFKVFKIISGKHGMTSNAKKQLNSVLCILANIIQERCMKILSVSKKKTITVREIMCAIKLTFSGNLQTAILKECVESLTKYDNLNQTSNRRSKHTRIGMVFSPSVSEKFLRNFGLCKALISDNAPVCLSTALEYICFEILSSSIKYMPQKKSRLSIREIYLGIKNDTEMNTFFVNHGIYLIGEGVKQNFFSDSDEKNCRSLCVTDEIKKIQGVYDCLFISKSFFKKLVRYYVVKHTGATKLNKNMVTILQYVIENYIVHLLQESNRLSIYTGRKKVTGDDIKFILSLTEGEEITVTCGTFQQNDKIQEDDFSDISDVENC
>M-3300021364-5.Ga0213859_10000024_20
MSTETHKPVLKEKYSDSVLSEYYRYNGVSSLKTYIRRNVDKVALKNNLNLSITDETLNLLDYVIILFASKISELSISLAKLNNMKTVFSQNILTASKLTFGDEVYSLLKNEIDGFIENFENTKSSKKTKKQSRSKQTGIDIPISYCERYLRHFDLCVAADAPVALASVLEAFSIYIIDLAKDLAASNKRIRLINNDIYKTVTENSELTRLFSKYNILLLGMGSIQFIDERIIESYKEKSNKKLQKNSGEKRRFRPGTVALKSIRQQQKNTNNIIPRTIFWKIIKSICKNYSDDNNTILTKNSITYLQPLIEYEITNLIKYANDIALCNGRTTLFEEDFAYVLRIMNIDFDDSLDKESEDYCIIGESSMRKLSKKSGVYRYSIDCNNIIKKFTYFTIKKYIHDLILLLSSQGKSTLNVETIQYYLSNVHNIELFVSLKSLKTNSKNKSNEDEDEEDEEDEEDEEVDEDEEDDEEEVDEVDEDEEDDEEEVDEDEEDEDDE
>M-3300021364-5.Ga0213859_10000473_11
MSTENIPTLKNEIEDPFPPAQRPSMSGTESHESQVDLAPEDTILKRSSRELESYILQILKNISPDNSMKNSAKFQFNSILKNLSQSLSEKILFLNTNSSTKTIDTKTVKNTIQIISTQDFFEKIDSFSTSILQNFNNNKSPDQKKFISRQKKADIVFPPSFFEKIIRKHSSKTHINKEAPIYIAAFMEFLCKDILLISSQICSENLKKRLSIRDIFIAINTTEHLKYIFNSLNVKFLGTGVVPSVQHDLYNDKSIRKKNIIKQIRKYQHSSSLIFSKTPFERIVRNKVLSYSENIKISKEVFNILQTYIEQKLTDFFKKVFLIISANGRIKAVSNDLYLVCSLENINIERRDIELLSLASNLEDTDLESNIDNDSNVDYETDSENDSD
>M-3300022752-15.Ga0214917_10000327_47
MEEQQKHAKKKNHFFEIYISKVLKQVSPDNGITSNAKQQLNSFLCILLKEITTIVSELTIISKRKTISLKEIKNALNVILFGELLSCCMKEGERACDAFSIETKGSKHSRANIIFPPSLIDKFLRNNCNFSIASLAPVYLAAVLEFITFDILDISVKHSKETKHNRITVRDMELSVRNDIELDTLFKKYNISFLGGGVVPFIHNSFQTNTKSLAIKNINKQQKNSDSLVLAKSPFEKLVRHVLKEHFSDHSKISKDVFIILQHFIEQYIIDLLYNANYLTIHAGRIKLIPLDIQLYLSFKTQNSFNSLNKNPYLNSEDISLLSIENE
>M-3300023174-125.Ga0214921_10003031_20
MELKKKSHYFEIYISKVLKQISEYSAITLNAKQQLNSFLLIILKCICTYIFNLISVTKKKTINIKEVENSLKLILSGELLSNSIKEGNKSCQTFSQNNVKGNRQHKAKIIFPVSVIENFLRNTNSNIMISTLVPVYIASVLEYLTFEILDMSVILTNEYKHNRITVRDLELSVRNDIEFDLLFKKHNISFLGGGVVPYIHESLVKKKSTLAIQNIIKQQNKTSLIFSKLPFKKLVRHIFKSKLNYSIKICKNVFTVLQCFIEQYIIKLLYESNFLSIHAGRVKMIPIDIDLYESLCNNKVNPYTNSKNVELLILNSDENLI
>M-3300023174-198.Ga0214921_10000796_50
MDEQQQKTNIVAKKKTHFFETYISKLLKNIAPQNGITSNAKQQLNSFLCFFLKTIASTINDLTIIAKKKTISTKEVENSLNIILLGELRNLCINEGKKACDSFSSHDNKGSKQTRANIVFPPSMIEKFLRNFGYSKIMVANLAPVYLASVLEFITHEILDISVNYCNQGKHVRITVRDMELAVRNDVELHALFKKLNISFLGGGVVPFIHSSLLAKKNKKKVVPKKDTHRYRYGTMALKNIKKQQKQSDSLVLSKSPFEKLVRQIIKNNLDDNVKISKDVFTILQYFIEQYIIDILYHSNYLTIHAGRVKLLPVDIQLYNSFQNHNNISSLNKNPYITNEIVNLLSIDSNNDNENEI
>M-3300023179-108.Ga0214923_10000479_30
MEEKNRKKKTHFFEIYISKVLKQICDCGITGNAKQQLNSFLCILSKKVSNTVLDLTLFGKKKTISDKEIINALNVILPGELLKNSISEGKKSIDSFKNNNEKDIKGTRQTKAQIIFPPSITEKFLRNFGNTKIMVTSTSPVFLAAVLEYLTYEILDLASIYCKDNKRIRITIRDLEVVIRSDEELNKLFTKLNITFLGGGVVPYIHESLLKKNKKKRTTNNVKNTNNPKQHRFRPGTVAIRDIKKYQKTSDNLILAKSSFEKLVRQIFKENRDDNENVKISKDVFIILQHFIEQYIVKLFYNSNFLAIHSGRVKLLSIDIAFISYLYNDSKNPYNSLLNNNSVFSIYENNEEVNGEVNYEEVNDEEVNDEEVNDEEVNDEEVNDEEVNDEEVNDEVNDEVNEE
>M-3300023179-111.Ga0214923_10000143_36
MDEQQKIIKKKNHFFETYISKVLKQIAPNNGITSNAKQQLNSFLCILIKKIAFTVSELTSLAKKKTISNKEIENALKIILFGELLNCCIKEGRKASEMYVSQDNIKGSRQNRANIIFSPSIVEKFLRNSKLMVSNLSPIYLAAVLEFITYEILDISIGICKQHKRIRLTIRDLELAVRNDIELNVLFKKLNVSFLGGGVVPFIHSSLLNKKKITSKKDTHKYRYGTIALKNIKKQQKNSDSLVLAKSQFEKLVRYLLKNNLEDDTNIKICKGVFIVLQYFIEQYIINILYNANYLTIHAGRIKLLPVDIQLYNSFLTHNSFNLFNKNPYINSENMTLLSIEDVNDEEDVNEE
>M-3300023179-146.Ga0214923_10000147_30
MEEKQMQNEEILNSKILKKKRTHMFETYISKVLKQISSQNGITHNAKQQLNSALCHILKYISQNTIKLTSIAKKKTISLKEVENALKLSLSGKLLVNSLNIGNKSLENIASNGTNVNVNSSRQLKAGIIFPPSIVEKFLRCFGTSKIMVSGNSPIFLASVLEYICYEILDLSVGLCSEGKHIRITIRDLELSVRNDTELNDLFCKNNISFLGGGVVPFIHSSLLNKTKIKKKTKKEQTTSHHRFRYGTLAIKNIKKQQKISNSLVLSKSPFEKLLRNYFKMNQSENQKISKEVFTVLQYFIEQYIVNILKNSNFLAIHSGRVKVIPSDISLYMSFMRNGKNNPYTSLNLNLVSVDLIDDDLSFPNNDNDLDEENCDENSDE
>M-3300023179-159.Ga0214923_10009027_6
MEEQKIETISNKVLTKKKKSHMFETYISKVLKQISSSNGITNNAKQQLNSVLCFISKHISLLTYKLTSAGKRKTISLKEVENALNIVLSGKLLENALKEGHKSCGNISNSSAADVNYSSRQNKAGIIFPPSLTEKFLRDFGSSNIMISNLSPIFLASVLEYICFEILDLSINNCKENKHVRITIRDLELSVRNDEELNNLFIKLNISFLGGGVIPYIHSSLLKKTKIKKKKNNETKTKENNHRFRYGTLAIKNIRKQQKISNCLILSKSPFEKLVRTIFWKNNTSNQKFSKDVFVVLQYFIEQYIVNLLRNSNFLSIHAGRVKLIPYDILLYNSFTNGGKGNPYTKPILNLFSLENENLNFMLETNAESSNNDINCVDDDDLEEEDEEELEEEENSSEE
>M-3300023179-32.Ga0214923_10001249_7
MEEPKEVININKNCIKKKKSHMFEIYISKVLKQISATNGITNNAKQQLNSAMCHILKYISSLILKLTISGRKKTISVKEVENSLKIVLSGELLNCALEEGNKSCLNISSSSAENINLSRQQKASIIFPPSVVEKFLRNFGSSKIMVNSLAPVFLASVIEFISYEILELSVNFCKENKHNRITIRDLELSVRSDVELNLLFQKLNLTFLGGGVLPYIHTSLFNKVTKKKKPLNAAAKESHHRFRHGTLAIKNIKKQQKLSNTLTLAKTTFEKIIRSKFRQFHQPNDGLIKISKEVFVVLQYFIEQYIVQILNHSNYLSIHSGRVKVIPNDILLYLFFKNENQNNPYILSKLNLFSLDSENLLNQQSPTSNMTSSILSDNSLFTEPSLTNSNSDENNFEDDEQPQLIEE
>M-3300023179-45.Ga0214923_10001383_29
MELKKKSHYFEIYISKVLKQISEHSAITLNAKQQLNSFLLIILKCICTYIFNLISITKKKTITIKEVENSLKLVLSGELLSNSIKEGNKSCEIYTTNDIKGNRQNKAKIIFPVSVIENFLRNNNTNIMISALVPVYIASVLEYLTYEILDMSVILTNEYKHNRITVRDLELSVRNDIEFDLLFKKHNISFLGGGVIPYIHESLIKKKSTLSIQNIIKQQNKTSLIFSKLPFKKLVRHIFKSKLNHPIKICKNVFIILQYFIEQYIIKLLYESNFLSIHAGRVKMIPIDIDLYESLCNNKLNPYTTSNKNIELLILNSDEI
>M-3300023179-83.Ga0214923_10000013_148
MKNENEYATTNKKKKTHLFEIFIVKVLKQISPSNNLTNNAKQQLNSFVCVFLKQLVNILFNLVSYAKKKTYSVREVENALQFLLSGDLLHNCLLEGKSAVERYMSWKNLSVKTSSSKQNKAGILFPPTIIEKILKTSNVMIASNTPIYLATICEYIIAEILEVAVFYCKQERRTRIIIRDIELGIQNDIELKKLLSSLKVNFIGGGVVPYIHNSLFTKKTTKKHTSKMMKAMKIQQKLSNHLVMAKSSFERMVRHIISNDSFNSSQTVKIDKISKSVFLILQYYIEQYIIKLLHKANFLTIHANRIKLMPIDIELINSLLNNTANPYIIEETILLDIENTSV
>M-3300023184-29.Ga0214919_10008089_6
MEENKNVINNKVVKKKKSHMFETYISKVLKQISPSNGITNNAKQQLNSILCHIIKHISMLTVKLTMAGKKKTISLKEVENSLTLVLFGKLLENSLKEGKKSCDNISNNEKNINSSRQNKAGIIFPPSLVEKFLRDFGSSNIMIGNLSPIFLASVLEYICFEILDLSVNYCKENKHIRITIRDLELSVRNDVELNNLFVKVNMSFLGGGVLPYIHSSLLNKTKTKIKKKKNDEKENTHRFRYGTLAIKNIKKQQKLSNCLILSKSPFEKLVRSIFKKNKIEFQKFSKDVFIVLQYFIEQYIVELLRNSNFLAIHSNRVKLIPYDILLYNSFINGGRENPYTKSSLNLFSLENQNLIFDGTQDQDEETSDDVQIPYLEEDEDDENNY
>M-3300024319-1.Ga0228670_1000005_5
MDTTTTRVNKKKRSRVFETYISKILKTIAPSNGITANSKQQLNSVICSISKIICDKVFSLTEISKKKTISEKEIKNAIKVLFPTDLADSITEQGDMAISNFKNKEISKGVSRQDKACIIFPPSQSEKFLRNFGYSKTMVTSHAPVFLAGSLEYLTTLILESAVSQAVDNKRVRLTIRDLELSIRSNKNINHFFKDMNICFLGGGVEPYIHPSLLKKKNKKKKSKKNDDTEINEKKKHRFRPGTVSIREIKKYQKLSNSLTFAKYPFEKILRETINQENQTDIPIKISKDVFTITQYFLEQKLIEILKKANFAAIHAGRVKLMPIDIKFISSICLGHENPHSFQELDKNEVEVDDLDDVEDKDDFIEEVDDEIIEEFLEEEEEIDEEEDESLCEVTN
>M-3300025676-16.Ga0209657_1000031_9
MDTTTIRVNKKKRSRVFETYISKILKSIAPSNGITANSKQQLNSVICSISKIICDKVFLLTEISKKKTISEKEIKNAIKVLFPTDLAYSITEQGDMAISNFKNKEISKGVSRQAKACIIFPPSQSEKFLRNFGYSKTMVTSHAPVFLAGSLEFLTTLILENAVSQAVENKRVRLTIRDLELSIRSNKNINHFFKEMNICFLGGGVQPYIHPSLLKKKNKKKKSKKSDDADSNEKKKHRFRPGTVSIREIKKYQKLSNSLTFAKYPFEKILRETINQENQTDIPIKISKDVFTITQYFLEQKLIEILKKANFAAIHAGRVKLMPIDIKFISSICLGHENPHLNEELDKNEVEVEVEVEVGDLDDLDDLDDVEDKEDFEEEVDDDIIEEFLEEDEEIDEDGESLCEAAN
>M-3300027697-22.Ga0209033_1001294_12
MVVPQKDVVVSQKIKKKKTRFFETYISKVLKQVSESNGITANSKQQLNSALCLISRLIASTVITLTEMAKKKTMSDKEVKNALLLILPEQLAANAIIEGQKAVASFEKGDNVKGTSRQEKASILFSPAISEKFLRNFGYSKVMVTSQAPVYMAGALEYLTSEILENASASARDNKRVRISIRDLELGVRNDNELNTFFTNNNISFLGGGVTPFIHQSLLLKKNRNKKRCKKTETDGDKKKHRFRPGTVSLREIRRFQKMSNCLTFAKFPFEKLVRQVVKTHNNDSSMKISKDVFIVLQYFIEQQLTSLLRNANFAAIHAGRVKLMPIDIDFVSAISSGTQNPYQTGSIANIEEGPVINDLVSADEEENGENGGEDGGGEDGGEDGDEDGDEDGDEEELLEEE
>M-3300027770-59.Ga0209086_10000072_16
MPNLFENHIQRVFKNVLNQTYTSIPIYGNSIVSFTKTQLSNALNIICLKIIKISIELSKYKLRKKINEDDITNAVSLVFPGELLKNFIRFNVNINSSKYFIFPLNSMKKIIKFNTDDMILSKKCPNILSVALEYICAEILTISIVEAHFGKKARINTLHLESAIRKDKEIGSIFKNITFSTNSFTIPKEIFKQFINKMYTDFNISKNVLLNIQSYIEQYIKNIIKSAKLLYTHSGRDFLKARDLNFVLKNIMCKEHNENSLIII
>S-1016713-169.1016713_contig_219_50
MLKGKNYDQYLSKLLKRVSPTNGITSNSKQQLCSVLCYITRVISYTVFELLSVTNKRTISDKEIIKSIIILFPKELAKTMISMCEQSIENFTKVNELGVTKQDRAGIIFPPSISEKYIRKFGMSKIMVSNTSSITLVTAIECLAEEILEISSLSAKQNKRVRITIRDLEIGVRTDKDICKFFVDNKISFLGGGVIPCIHPKLLNNKNLPQKFKNQTRYRPGVISLRNIKRVQKTSNCLILSKLPFERYTRFKLKEYQPYSDKTVKVSKDVFIILQHFVEQRIISLLKKAGMLVIHAGRLKLMAADINLVKIIDSGHNNYLHDDWKEI
>S-1016716-111.1016716_contig_9580_6
METQQQITGHKKKKTRFFETYISKVLKQVSDNNGITSNSKQQLNSALCIIARVLSETVGKLTEIAHKKTLSEKEVINALAMVLPGELSKNAIGEGTKAVEKFQRTPGKGSRQDKAGIIFPPSIAEKFLRGFGYSKIMVTSSAPVCLAAALEYITAEILELASNSAKDHKHVRITIRDIELGVRNDCELNNFFVKYKITFLGGGSQPFIHPSLLVKKNRKKKKISATAEPGVKKPHRFRPGTVSIREIRKFQKMSNCLTFAKFPFEKAVRGVVNTNNTRWCTMKISKEVFIVLQYFIEQSIVNVLRNANYAAIHAGRVKLMPNDINFVCAVQNGNDNPYSTKDNNVSPTISEEESVEESVEEELVEEELVEEELVEEESVEEESVEEELVEEELVEEESVEEELEEEESVEEELEEEEIEEDELEEEEIEEEELEEEELEEE
>S-1030632-100.1030632_contig_8437_6
MNGSTQKLNGKKKRTRFYEIYLTKLLKQISGENGITSNSKQQLNSILCSTTRLISTRVNDLTEMAKKKTMSDKEVVNALHVLFPGDLGKGMSAMCDQAVENYRNDDTSKGITRQKRAGIIFPPSVTEKYLRNFGYSKTMVSNTAPVALAAAMEYLAGEILENASVYAKQKKRVRITIRDLEIGVRTDNEINSFFEENNLSFLGGGVVPYIHPSLLVKKVQKRKSKKPVGKRTHRYRPGTVSIREIRRFQKTSNCLTLARSPFEKYTRSVIQKILGDGNSIKVSKKVFLSLQYFVEQRLVSLLQKAYMAAIHARRVKLSVADIEFIRSLNGTKAVEDPVKTLNVDDESSFRDETKEEFDKIDTNHDGKISVTEFKEWKEKKLSRKSTLYAI
>S-1038524-41.1038524_contig_17_103
MSSKAGIKTKTAPRVKETTPDKATASDKGAAAKRKKSRFFETYISKVLKQVADKNGITSNSKQQLNSALCIVARTISLVVTRLTEIAKKKTMSDKEVANTVRVLFSGDLAENSIREGVKSVEKFSAEASKGSSRQGKAGIIFPPSIAEKFLRNFGYSKVMVTSTAPVFLAAVLEYLVAEILILASKSATNNKRMRITIRDLQLSVGEDQELSTLFDKLNVSFLGGGVVPYIHSCLITKKPRKKKTKTVDAAGVKKPHRFRPGTVALREIKKFQKMSNCLTFAKFPFERFVRQVVNKNNTGMKISKDVFIILQYFIEQYVVGILKDANAAAIHAGRVKLMLTDIEFISSIRGLSTNGFSDKPVKKITKTKKTVDNADNADNAETVENESESEDEEPVVEDEEESSDEDEPSDEDELVDE
>S-1101173-79.1101173_contig_5_105
MEEHKNVINKVVKKKKSHMFETYISKVLKQISASNGITNNAKQQLNSILCHITKHISMLTVKLTLAGKKKTISLKEVENSLTLVLSGKLLENSFKEGRKSCDNISNNEKNINSSRQNKAGIIFPPSLVEKFLRDFGSSNIMIGSLSPIFLASVLEYICFEILDLSVNCCKENKHVRITIRDLELSVRNDIELNDLFVKLNMSFLGGGVLPYIHASLLNKTKTKIKKKKTSDDNNENTHRFRYGTLAIKNIKKQQKLSNCLILSKSPFEKLVRNIFKQNSVETPKFSKDVFTVLQYFMEQYIVEFLRNSNFLAIHAGRVKIIPYDILLYNSFINGGRENPYTKSSVNLFSLENQSNLNLIFDGNQDEEKSDESPIFIDEEDYDEEN
>S-3300002186-40.3300002186_a_JGI24539J26755_10000119_25
MHFFDTYISKVLRQVSCENGIKSNAKQQLNSILCLICERISSQVIRLTKMSKKKTLSVKEVRNAIRITMNGELAENSILHGDTAIKSFNDNNCKNTSRQNKSGIIFPPSITEKFLRDFEMSNIMITGDSPIYLAAVMEYICREILDLACNIAIEYKRKRISIRDLELAVQTDSEINLLFISLNLSFLGGGVVPYIHPLLFNKKSKKKKKICITSEKDRSLKKHRFKPGTVALREIRKYQKSTTPLFAKQPFARMIRSTVNKYNYEMKISKEVFTIMQYFIEQYITDVLYTIGLAAIHSGRVKIISDDISFVCRCKNIQDPTIIDQQEIEILSISDQDSNLDDIDSESESDLGVCKEENDLSDNGNDMSDKDDLSDNGNDMSDKDDDDGNDDCCNSEK
>S-3300010368-70.3300010368_a_Ga0129324_10000075_59
MDAQQKQSCYKKKKSRFFETYISKIIKQISETNGITYNAKQQLNSFLCIISKFISHKVRELTEIAKKKTLSEKEIINTLQIILSGQLAKLTISESITAVKQYNSKECNGSSRQNKAGIIFPPSITEKFLRQFGYSKIMVSCNAPVCLAAALEYITTEILKLASTSAKNNKRIRITIRDLELSVRNDTELNILFNKLNINFLGGGSTPFIHSSLVVKKSKKKSSKQPLSITNDTTKKTRRFRPGTVAIREIKRFQKISNCLTFAKYPFVKLVRQIVGEIEPNHKISKEVFIVLQYFIEQYIVDILKQANLAAIHAGRVKLIPSDIKFILSVKNYKHINQLQDILSVEEEVEEVDEEVDEEDEEDEDEEEEEDEEDEDEEELEEEDEER
>S-ERX556045-99.ERX556045_contig_436_42
MNGTNQKLNNKKKRTRFYEIYLTKLLKQVSSENGITSNSKQQLNSILCSTTKLISTKVNQLTEMVRKKTTSEKEVINALNVMFLGDLGKGMTAMCKQAVENYNDDVSTKGITRQERAGIIFPPSVAEKYLRNFGYSRTMISNGAPVALAAAMEYFAGEILENAAVLAKQKKRVRITIRDLEMGVRTDNEINNFFEKNKFSFLGGGVVPYIHPNLLVKRTRRRKSKKSEGKRSHRYRPGTVSIREIRRFQKSSNCLTLAKSPFEKYTRSVIQKFIGNSDSIKVSKNVFLTLQYFVEQRIVSLLHKAYLAAIHTGRVKLTVSDIEFIRSLNITNTEDDPVKTLNMDDSSLPRDIKEEFEEIDSNNDGKISITEFKDWKDKQVKDSLLF
>M-3300009786-10.Ga0114999_10020985_3
MNERNCAKRKSQYFLNYIPIVLKSICNKSGITTDAKVHLNKSFIIISRNISELAFKLTVDENRKTISKKNLLAALGMFLIGDLYKNSLKESDKALYNFNKNKYKKKCGRHIKACIIFPPYLCEKFLRNFNNNQIMICKETPIILAAVLEYICSQILECVRDNLYNEKRTRFKTIDIDFAVKNDVELSYLFKKNNIKFINGFLIPYIHPTLLLKIEQTKPKKKSKKDPPILRNIKKFQKHGYKLTLCKTSFITLVKNIIDGKFHKNVRLSPNIITVLQYLVEADMIELLFKANLLTLHSERVQLLERDVIFMSQIIKDMHHIKNNMHLIKLGDVSDVEKEENIVI
>S-3300010158-109.3300010158_a_Ga0114960_10000830_4
MIYETYVENDNINSRNVNNSGGLDTDHLRSSPPADPTDPIKKKKKKTRHFEHHIRKILKEISSDRDITQIAKTQLNDLAVITCKLIKQKVLVVLQSNKKRTITNVEIEAAIHLLFTGQLEQRLIEEGKKCVQQYIDNTKTKELKGQSRNTKAEILIPPSILERMLRSDSFQMFQMGFSAPIFLAGVIEYFIAQVLQLSVLTSKTVRITVQDLEHGIKSDREIKRYMIDQNIYLFEAGIVPFIHPGIRQLTGRNDKKSVKLMAKLQESNSSIFPKRFFEKLCKHFVMLLYPDIRFQKGCFVYLQDYIEKWIVGILQHTNILTLYSKKSRVTATDIEMVVSIMERRSPSFLHQDQLISDMEVLVLN
'''
# Parse the in-memory FASTA text into a list of SeqRecord objects.
# The context manager closes the buffer even if parsing raises.
with StringIO(fasta_str) as fasta_io:
    seq_records = list(SeqIO.parse(fasta_io, "fasta"))
seq_records

[SeqRecord(seq=Seq('MEAKQQNLTESVASVAKVMRKKTRYFETYISRVLKNVASENGITSNAKQQLNSA...ETV'), id='M-3300005589-24.Ga0070729_10000551_84', name='M-3300005589-24.Ga0070729_10000551_84', description='M-3300005589-24.Ga0070729_10000551_84', dbxrefs=[]),
 SeqRecord(seq=Seq('MSVEQNSENLNNLDTKHKKKKFRFYDSYIPKILKQSFDNNGITSDARQQLNSIL...IGS'), id='M-3300012952-6.Ga0163180_10010018_4', name='M-3300012952-6.Ga0163180_10010018_4', description='M-3300012952-6.Ga0163180_10010018_4', dbxrefs=[]),
 SeqRecord(seq=Seq('MNRSTQKLNGKRKRTRFYEIYLTKLLKQVSGENGITSNSKQQLNSILCVITRLV...ESN'), id='M-3300012953-22.Ga0163179_10001168_3', name='M-3300012953-22.Ga0163179_10001168_3', description='M-3300012953-22.Ga0163179_10001168_3', dbxrefs=[]),
 SeqRecord(seq=Seq('MQSRAKSQPPPKAATLVKAAVGKKKKSRFFETYISKVLKQVSPNNGITSNSKQQ...VDE'), id='M-3300017991-11.Ga0180434_10016851_6', name='M-3300017991-11.Ga0180434_10016851_6', description='M-3300017991-11.Ga0180434_10016851_6', dbxrefs=[]),
 SeqRecord(seq=Seq('MSETKDTKNKHKSRYFDTYIFKVFKIISGKHG

In [134]:
# Build one parameter dict per parsed FASTA record for the `add_sequence`
# statement. Only accession, variant and the residue string are filled in;
# every other column is left as None.
unset_columns = (
    "gi",
    "ncbi_gene_id",
    "hgnc_gene_name",
    "taxonomy_id",
    "organism",
    "phylum",
    "class",
    "taxonomy_group",
    "info",
)
data_sequence = []
for i, record in enumerate(seq_records):
    print("------------------------------------------------")
    print(record.id)
    row = {
        "accession": f'{accession_prefix}_{i}',
        "variant": f"{type_like}-like_(Viruses)",
    }
    # Inserted between variant and sequence to keep the original key order.
    row.update(dict.fromkeys(unset_columns))
    row["sequence"] = str(record.seq)
    row["variant_under_consideration"] = None
    data_sequence.append(row)

------------------------------------------------
M-3300005589-24.Ga0070729_10000551_84
------------------------------------------------
M-3300012952-6.Ga0163180_10010018_4
------------------------------------------------
M-3300012953-22.Ga0163179_10001168_3
------------------------------------------------
M-3300017991-11.Ga0180434_10016851_6
------------------------------------------------
M-3300020163-17.Ga0194039_1005660_3
------------------------------------------------
M-3300021364-5.Ga0213859_10000024_20
------------------------------------------------
M-3300021364-5.Ga0213859_10000473_11
------------------------------------------------
M-3300022752-15.Ga0214917_10000327_47
------------------------------------------------
M-3300023174-125.Ga0214921_10003031_20
------------------------------------------------
M-3300023174-198.Ga0214921_10000796_50
------------------------------------------------
M-3300023179-108.Ga0214923_10000479_30
------------------------------------------------

In [135]:
# Sanity check: list every column of the first prepared row together with
# its value and the Python type of that value.
first_row = data_sequence[0]
for k in first_row:
    v = first_row[k]
    print(k, v, type(v))

accession HISTDB_H2B_H2A_H3_like_0 <class 'str'>
variant H2B-H2A-H3-like_(Viruses) <class 'str'>
gi None <class 'NoneType'>
ncbi_gene_id None <class 'NoneType'>
hgnc_gene_name None <class 'NoneType'>
taxonomy_id None <class 'NoneType'>
organism None <class 'NoneType'>
phylum None <class 'NoneType'>
class None <class 'NoneType'>
taxonomy_group None <class 'NoneType'>
info None <class 'NoneType'>
sequence MEAKQQNLTESVASVAKVMRKKTRYFETYISRVLKNVASENGITSNAKQQLNSAICILARILSSVMTKLTVSSKKKTLSVKEVKNASILYITGTLLENAVKHAEESVVKFSQGETKHSSRQDKAGILFPPSITEKFLRDFGLSKVMVTKTAPIYFAAILEYLTTVILENASVLARENTRVRITIRDLEIAVRSDPDMNKLWEKCGISFIGGGVIPQIHDSLLAKKPRRKRKVKDTATATKKGHRFRPGTVSLREIKKYQKASNCLTFAKFPFERLVRSVISEQQEGMKISKDVFIVLQYYIEQFIVDFLRDAGSAAIHSGRVKLMPSDIQFISNLRHYPQLDATPFKKEKKVETEDQQEGQKLELETV <class 'str'>
variant_under_consideration None <class 'NoneType'>


In [136]:
# Execute the `add_sequence` statement once per prepared row.
# A failing row does not stop the loop: its accession, the error and all of
# its column values are printed, and the accession is kept in `failed_toadd`.
failed_toadd = []
for ds in data_sequence:
    try:
        cursor.execute(add_sequence, ds)
    except Exception as e:
        accession = ds["accession"]
        print(accession)
        print(e)
        for k in ds:
            v = ds[k]
            print(k, v, type(v))
        failed_toadd.append(accession)

In [137]:
# Read back the sequences of the current variant together with any linked
# publication (LEFT JOIN keeps sequences that have no publication yet).
# The variant name is passed as a bound parameter instead of being spliced
# into the SQL text, so quoting is handled by the connector.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.variant = %s"
)
cursor.execute(query, (f"{type_like}-like_(Viruses)",))
df = pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])
# Show only the rows whose accession starts with `accession_prefix`.
df[df["accession"].str.startswith(accession_prefix)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_H2A_H3_like_0,H2B-H2A-H3-like_(Viruses),,,,,,,,,,MEAKQQNLTESVASVAKVMRKKTRYFETYISRVLKNVASENGITSN...,,,
1,HISTDB_H2B_H2A_H3_like_1,H2B-H2A-H3-like_(Viruses),,,,,,,,,,MSVEQNSENLNNLDTKHKKKKFRFYDSYIPKILKQSFDNNGITSDA...,,,
2,HISTDB_H2B_H2A_H3_like_10,H2B-H2A-H3-like_(Viruses),,,,,,,,,,MEEKNRKKKTHFFEIYISKVLKQICDCGITGNAKQQLNSFLCILSK...,,,
3,HISTDB_H2B_H2A_H3_like_11,H2B-H2A-H3-like_(Viruses),,,,,,,,,,MDEQQKIIKKKNHFFETYISKVLKQIAPNNGITSNAKQQLNSFLCI...,,,
4,HISTDB_H2B_H2A_H3_like_12,H2B-H2A-H3-like_(Viruses),,,,,,,,,,MEEKQMQNEEILNSKILKKKRTHMFETYISKVLKQISSQNGITHNA...,,,
5,HISTDB_H2B_H2A_H3_like_13,H2B-H2A-H3-like_(Viruses),,,,,,,,,,MEEQKIETISNKVLTKKKKSHMFETYISKVLKQISSSNGITNNAKQ...,,,
6,HISTDB_H2B_H2A_H3_like_14,H2B-H2A-H3-like_(Viruses),,,,,,,,,,MEEPKEVININKNCIKKKKSHMFEIYISKVLKQISATNGITNNAKQ...,,,
7,HISTDB_H2B_H2A_H3_like_15,H2B-H2A-H3-like_(Viruses),,,,,,,,,,MELKKKSHYFEIYISKVLKQISEHSAITLNAKQQLNSFLLIILKCI...,,,
8,HISTDB_H2B_H2A_H3_like_16,H2B-H2A-H3-like_(Viruses),,,,,,,,,,MKNENEYATTNKKKKTHLFEIFIVKVLKQISPSNNLTNNAKQQLNS...,,,
9,HISTDB_H2B_H2A_H3_like_17,H2B-H2A-H3-like_(Viruses),,,,,,,,,,MEENKNVINNKVVKKKKSHMFETYISKVLKQISPSNGITNNAKQQL...,,,


In [138]:
# Look up the publication that the sequences will be linked to.
# `pid` is bound as a query parameter rather than formatted into the SQL.
pid = 'irwin_self-assembling_2024'
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,irwin_self-assembling_2024,Self-assembling viral histones are evolutionar...,10.1038/s41564-024-01707-9,,2024


In [139]:
# Execute `add_sequence_has_publication` for every (accession, pid) pair.
# Failures are reported and collected in `failed_toadd_publication` instead
# of aborting the loop.
failed_toadd_publication = []
for ds in data_sequence:
    try:
        cursor.execute(add_sequence_has_publication, (ds["accession"], pid))
    except Exception as e:
        # `Exception` rather than a bare `except:` so that Ctrl-C / SystemExit
        # still interrupt the cell; the error itself is printed so the cause
        # of a failed insert is visible.
        print(ds["accession"])
        print(e)
        failed_toadd_publication.append(ds["accession"])

In [140]:
# Read back the sequences of the current variant together with their linked
# publication (LEFT JOIN keeps sequences that have no publication).
# The variant name is passed as a bound parameter instead of being spliced
# into the SQL text, so quoting is handled by the connector.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.variant = %s"
)
cursor.execute(query, (f"{type_like}-like_(Viruses)",))
df = pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])
# Show only the rows whose accession starts with `accession_prefix`.
df[df["accession"].str.startswith(accession_prefix)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_H2A_H3_like_0,H2B-H2A-H3-like_(Viruses),,,,,,,,,,MEAKQQNLTESVASVAKVMRKKTRYFETYISRVLKNVASENGITSN...,,HISTDB_H2B_H2A_H3_like_0,irwin_self-assembling_2024
1,HISTDB_H2B_H2A_H3_like_1,H2B-H2A-H3-like_(Viruses),,,,,,,,,,MSVEQNSENLNNLDTKHKKKKFRFYDSYIPKILKQSFDNNGITSDA...,,HISTDB_H2B_H2A_H3_like_1,irwin_self-assembling_2024
2,HISTDB_H2B_H2A_H3_like_10,H2B-H2A-H3-like_(Viruses),,,,,,,,,,MEEKNRKKKTHFFEIYISKVLKQICDCGITGNAKQQLNSFLCILSK...,,HISTDB_H2B_H2A_H3_like_10,irwin_self-assembling_2024
3,HISTDB_H2B_H2A_H3_like_11,H2B-H2A-H3-like_(Viruses),,,,,,,,,,MDEQQKIIKKKNHFFETYISKVLKQIAPNNGITSNAKQQLNSFLCI...,,HISTDB_H2B_H2A_H3_like_11,irwin_self-assembling_2024
4,HISTDB_H2B_H2A_H3_like_12,H2B-H2A-H3-like_(Viruses),,,,,,,,,,MEEKQMQNEEILNSKILKKKRTHMFETYISKVLKQISSQNGITHNA...,,HISTDB_H2B_H2A_H3_like_12,irwin_self-assembling_2024
5,HISTDB_H2B_H2A_H3_like_13,H2B-H2A-H3-like_(Viruses),,,,,,,,,,MEEQKIETISNKVLTKKKKSHMFETYISKVLKQISSSNGITNNAKQ...,,HISTDB_H2B_H2A_H3_like_13,irwin_self-assembling_2024
6,HISTDB_H2B_H2A_H3_like_14,H2B-H2A-H3-like_(Viruses),,,,,,,,,,MEEPKEVININKNCIKKKKSHMFEIYISKVLKQISATNGITNNAKQ...,,HISTDB_H2B_H2A_H3_like_14,irwin_self-assembling_2024
7,HISTDB_H2B_H2A_H3_like_15,H2B-H2A-H3-like_(Viruses),,,,,,,,,,MELKKKSHYFEIYISKVLKQISEHSAITLNAKQQLNSFLLIILKCI...,,HISTDB_H2B_H2A_H3_like_15,irwin_self-assembling_2024
8,HISTDB_H2B_H2A_H3_like_16,H2B-H2A-H3-like_(Viruses),,,,,,,,,,MKNENEYATTNKKKKTHLFEIFIVKVLKQISPSNNLTNNAKQQLNS...,,HISTDB_H2B_H2A_H3_like_16,irwin_self-assembling_2024
9,HISTDB_H2B_H2A_H3_like_17,H2B-H2A-H3-like_(Viruses),,,,,,,,,,MEENKNVINNKVVKKKKSHMFETYISKVLKQISPSNGITNNAKQQL...,,HISTDB_H2B_H2A_H3_like_17,irwin_self-assembling_2024


In [141]:
# Commit the open transaction so the sequence rows and publication links
# inserted above are persisted in the database.
# NOTE(review): `failed_toadd` / `failed_toadd_publication` are not checked
# before committing - confirm they are empty.
conn.commit()

# Add H2B-H2A-H3-H4-like_(Viruses), H2B-H2A-H4-H3-like_(Viruses), H4-H3-H2B-H2A-like_(Viruses)

In [142]:
# Insert the three new variants into the `histone` table.
# Every row shares the fields below and differs only in its id.
shared_fields = {
    "level": "variant",
    "taxonomic_span": "Viruses",
    "taxonomic_span_id": "10239",
    "description": None,
    "parent": "Quadruplet",
}
data_histone = []
for htype in ['H2B-H2A-H3-H4', 'H2B-H2A-H4-H3', 'H4-H3-H2B-H2A']:
    data_histone.append({"id": f"{htype}-like_(Viruses)", **shared_fields})
for dh in data_histone:
    cursor.execute(add_histone, dh)

In [144]:
# Load the whole `histone` table and display only the three variants added above.
query = "SELECT * FROM histone"
cursor.execute(query)
histone_rows = cursor.fetchall()
histone_columns = [col[0] for col in cursor.description]
histone_df = pd.DataFrame(histone_rows, columns=histone_columns)
added_ids = [
    f"{htype}-like_(Viruses)"
    for htype in ['H2B-H2A-H3-H4', 'H2B-H2A-H4-H3', 'H4-H3-H2B-H2A']
]
histone_df[histone_df["id"].isin(added_ids)]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
154,H2B-H2A-H3-H4-like_(Viruses),variant,Viruses,10239,,Quadruplet
156,H2B-H2A-H4-H3-like_(Viruses),variant,Viruses,10239,,Quadruplet
201,H4-H3-H2B-H2A-like_(Viruses),variant,Viruses,10239,,Quadruplet


In [145]:
# Commit the open transaction so the histone rows inserted above are
# persisted in the database.
conn.commit()

# Add other archaeal sequences as H2B-H2A-H3-H4-like_(Viruses)

These sequences come from the data-availability section of this [article](https://www.nature.com/articles/s41564-024-01707-9#data-availability).

pid='irwin_self-assembling_2024'

**TODO: accession identifiers have not been looked up yet — they still need to be found!**

**TODO: possibly not all sequences have been added — needs checking.**

In [146]:
# Histone type handled in this section, and the accession prefix derived
# from it: dashes become underscores, wrapped in "HISTDB_" ... "_like".
type_like = 'H2B-H2A-H3-H4'
accession_prefix = "HISTDB_{}_like".format(type_like.replace('-', '_'))
print('accession_prefix:', accession_prefix)

accession_prefix: HISTDB_H2B_H2A_H3_H4_like


In [147]:
fasta_str = '''>M-3300017963-53.Ga0180437_10003874_10
MSTKSAKTSKNEKSTPNKPETASEPVDVAVAVENVDDDVDQTTKSRKKTKDREPFKVHFKNLLKNIDVSNNCVSAQLLSQLNSLAMVVSKDWASTTSNIVKGIGKGRITSHYLETALCAMLGESRASEYAAHANEALDSYEKAAEAEDVKKCDAIKCTLSLPPHIFRKVLRETGVHVSKDSPIYMAAVVEKMFADLLGSLIKLTETKGRKTLTPKSLYKAVTQDETLVQFASNVLWCETELGELINKQLFNDDRKRKLATARRNRRKAEDTGDADGATTSGKPRLLPGTKVLRQIKKLQRGTETLQRKAHFERFIRAQIGDDAKCGSDVMTEIQRIVENRMVQLHRDALEIMVHAKRNTLEASDLDLAWKMTQPARLNTDNYDGIDNLAQPGLHRISLKAGVKCEATSCYDAVRRIMCEYTNHLLSPVACIVKANGTRIINHSVLRKYLGAVGYNVL
>M-3300020201-24.Ga0163154_10002449_25
MHKSFNVYIHRTLKKSATSFHITKPALEAIDSVIRVTAVNLVNKSLLLTLTNNKKTVSQAELETSVKLVLPPLLMEGSLAFANTAVESFNNFVEGEEKSRTRESRAGLIFSVSATEKYLRCFGQNGLHVSATAPVFLAGVLEFLISEILKLSSVHTCKKVTITVKHIFTAIHSDPEFERFINNIGLVFLDAGVTAHIEPSLLEPKNRKRINSNAGGVKKPHRWRPGTKTLMNIRKLQKTSDLIIQHAPFNRLVRDVGSETKKLRYTNDFLLSLQSFVEDRMIRVMKCANRLALHTGRETVYACDVTLAQEFMEPLLVTSKEPTDNTVIPEAALRKLALRAGIKRYGDDSTETYTRLVVDFVGNYVRDIILCAELHQVQTLTVKLMIESLGMRGLHPATIPRTRKMSKKTNSSRSTSVAVTDNNVSDVEDGELGLPDIDE
>M-3300024336-7.Ga0233447_1001120_5
MSDSEDTIIDRASEESEIESETEESSEETFDDETDSDVEISDEDKHTTEDEDKHATEDKHATEDEDITEDKDEDEEKKEEKVVSSVARKSKTERVASSIFGRYISRVLKTVQSDMDLTSAGKEEINYIIVAIGEHLSNISTELCVSNKKKTITPEMMRDAIKILIPGELGKYANSEAVKMETKWHTYIYETTTKKTPGARKESRIGMIFPVSRCASMLRRYGTTSLAVSATAPFALATTLEYLAAEILELSGSAAKDHSRTRITPRHIFTACYNDKEISALLTTCNIQLLGGGAECHIDAVLTKKTTKKRTKKVQPGVKKAHRFRPGTVALREIRKYQMSYGLLFQRQPFERFVRYITNLLKTDVRFSRNAIINFQYHIERQLVKLYQKAQQIAIHAHRTTIDTSDFRLLFGLERGSIYDNPGVTAHKWTTMFGGEETELTDMSKLPLSIPSFKRLARRAGVKRISHQSYGLVMSHIEYLMVKYLRVVLIVAESRKRRTITVDMLRVGLKLSGVNLIALKFLQR
>M-3300024529-3.Ga0255044_10001051_6
MTNSLLKKKQNRFFASYIGKLLKVEHPKKGITTNTKQQLNSVFCYLSNCLSADSKKMARISSKKTISESEILAVVCINFPRKIYLKVKKKFTAACDKSEKSDEKSVSRQEKAGVTFPPSITEKFLRDDSNMMITKKAPLALSCALEGLCEEIFDSAVGYSDLNKKIRLTIRDIELAVKNTAANEVFKKFSIKLLGCGYVPFIHPALTVKKPQKRKKKGDTSGPKTHRFKPGTVALRDIKKLQKVFNRVILAKSPFEKVIRALVKNKGHNVKISKQVFSILQYYLESYIVGILHRSNLAAVHAGRVKLLPEDIDFILALEDRSLPLNINIDDVKEDDDPVDTKSITAPALQRLARQGGVKTMSSECVDVIRNLILMKAGFICDAILVVNNQGGTKTIMPSDIYSAIEYLGIRLAKAEKFGEPNAN
>S-1101172-105.1101172_contig_94_5
MYEKSFKVYIYRILKKIHPEIHIAKVSTDAINSIISILSKHIVNKSLLLTLHENKKTISSNEIKTSTNLLLSDYKNITEFASTAETLYEKSEQERKNAPVEKPQTRESRSGLIISVSAVEKFLRQSNYHVSSTAPVYLAGVIEHIVTELLNKAGNITKESKRITVTIRHLFLAVSQIPNLNTLLPNLGVVFLQSGVEPQTIENKKHRPIRRRRIVKVAQSEQESPETPVHTPVHTPVHTPVHTPVHMPVHTPVHTPHRWRSGTKTIMEIRRLQKTTDTLIQHAPFKRLVKQITALFTQTKLRFTGEFFSALQGLTEDRIISLMKSSNRLASHANRETVYSRDITLAGELTGLFIRKNEVDTLIPEATLRHLALRAGIQRYGDCCTDTYRDYIFTFLSSCLRDIVMCSEHHKVQTLNTKIFLEAMNMKGIYPTITPKKRKAIKKEETSQVPELSDIEEEAAN
>S-3300010316-51.3300010316_a_Ga0136655_1000533_15
MNDRSFKTYIHKVLRQVHPKCGVAKDVTDTYDGIVRSLFQHLSYTARSLAIKSNTKTVSDRNVRSAVQLTFPAELSDIACKEGDSACLKYTTSTLTKSDSLDENGVSENKTVAIMRESRAGLQFPVSLVEKYLRSFSNLNVGSTAPVFLTAAVECVMAELLDLSGNVAREYKRTRISVRHLALAISEHDSFIKLFDSCNIVMLNGGVAPNMDDRLKINPKVRRTRRILPVQTDENGQRVPRPHRFRPGIVARREVARYQKTCDTLLQHAPFERLVRSLTSRFINDVRFTKDVLLKIQSFVEMKLVNLFEVCQQISLHASRETVSDSDFNLFCRLNKIENISHLAEKNTEIVNAGIQRVGRRGGVVRIKSEFYDCVRSYIVYILDEMLHNMAVLVELERRRTISVKTFQESLSLQGVNLAVIYKKIRYNKSKRVGDGEDGDDGDEEVDGDEEVDEKANEDVDEKVNEEVKSEVKSDVKSEVKSDVKSEVKSEMKSEMKSEVDVKVNYKSTNKALKSTLAKKPRRVHRVKVDA
'''
# Parse the in-memory FASTA text into a list of SeqRecord objects.
# The context manager closes the buffer even if parsing raises.
with StringIO(fasta_str) as fasta_io:
    seq_records = list(SeqIO.parse(fasta_io, "fasta"))
seq_records

[SeqRecord(seq=Seq('MSTKSAKTSKNEKSTPNKPETASEPVDVAVAVENVDDDVDQTTKSRKKTKDREP...NVL'), id='M-3300017963-53.Ga0180437_10003874_10', name='M-3300017963-53.Ga0180437_10003874_10', description='M-3300017963-53.Ga0180437_10003874_10', dbxrefs=[]),
 SeqRecord(seq=Seq('MHKSFNVYIHRTLKKSATSFHITKPALEAIDSVIRVTAVNLVNKSLLLTLTNNK...IDE'), id='M-3300020201-24.Ga0163154_10002449_25', name='M-3300020201-24.Ga0163154_10002449_25', description='M-3300020201-24.Ga0163154_10002449_25', dbxrefs=[]),
 SeqRecord(seq=Seq('MSDSEDTIIDRASEESEIESETEESSEETFDDETDSDVEISDEDKHTTEDEDKH...LQR'), id='M-3300024336-7.Ga0233447_1001120_5', name='M-3300024336-7.Ga0233447_1001120_5', description='M-3300024336-7.Ga0233447_1001120_5', dbxrefs=[]),
 SeqRecord(seq=Seq('MTNSLLKKKQNRFFASYIGKLLKVEHPKKGITTNTKQQLNSVFCYLSNCLSADS...NAN'), id='M-3300024529-3.Ga0255044_10001051_6', name='M-3300024529-3.Ga0255044_10001051_6', description='M-3300024529-3.Ga0255044_10001051_6', dbxrefs=[]),
 SeqRecord(seq=Seq('MYEKSFKVYIYRILKKIHPEIHIAKVSTDAINSII

In [148]:
# Build one parameter dict per parsed FASTA record for the `add_sequence`
# statement. Only accession, variant and the residue string are filled in;
# every other column is left as None.
unset_columns = (
    "gi",
    "ncbi_gene_id",
    "hgnc_gene_name",
    "taxonomy_id",
    "organism",
    "phylum",
    "class",
    "taxonomy_group",
    "info",
)
data_sequence = []
for i, record in enumerate(seq_records):
    print("------------------------------------------------")
    print(record.id)
    row = {
        "accession": f'{accession_prefix}_{i}',
        "variant": f"{type_like}-like_(Viruses)",
    }
    # Inserted between variant and sequence to keep the original key order.
    row.update(dict.fromkeys(unset_columns))
    row["sequence"] = str(record.seq)
    row["variant_under_consideration"] = None
    data_sequence.append(row)

------------------------------------------------
M-3300017963-53.Ga0180437_10003874_10
------------------------------------------------
M-3300020201-24.Ga0163154_10002449_25
------------------------------------------------
M-3300024336-7.Ga0233447_1001120_5
------------------------------------------------
M-3300024529-3.Ga0255044_10001051_6
------------------------------------------------
S-1101172-105.1101172_contig_94_5
------------------------------------------------
S-3300010316-51.3300010316_a_Ga0136655_1000533_15


In [149]:
# Sanity check: list every column of the first prepared row together with
# its value and the Python type of that value.
first_row = data_sequence[0]
for k in first_row:
    v = first_row[k]
    print(k, v, type(v))

accession HISTDB_H2B_H2A_H3_H4_like_0 <class 'str'>
variant H2B-H2A-H3-H4-like_(Viruses) <class 'str'>
gi None <class 'NoneType'>
ncbi_gene_id None <class 'NoneType'>
hgnc_gene_name None <class 'NoneType'>
taxonomy_id None <class 'NoneType'>
organism None <class 'NoneType'>
phylum None <class 'NoneType'>
class None <class 'NoneType'>
taxonomy_group None <class 'NoneType'>
info None <class 'NoneType'>
sequence MSTKSAKTSKNEKSTPNKPETASEPVDVAVAVENVDDDVDQTTKSRKKTKDREPFKVHFKNLLKNIDVSNNCVSAQLLSQLNSLAMVVSKDWASTTSNIVKGIGKGRITSHYLETALCAMLGESRASEYAAHANEALDSYEKAAEAEDVKKCDAIKCTLSLPPHIFRKVLRETGVHVSKDSPIYMAAVVEKMFADLLGSLIKLTETKGRKTLTPKSLYKAVTQDETLVQFASNVLWCETELGELINKQLFNDDRKRKLATARRNRRKAEDTGDADGATTSGKPRLLPGTKVLRQIKKLQRGTETLQRKAHFERFIRAQIGDDAKCGSDVMTEIQRIVENRMVQLHRDALEIMVHAKRNTLEASDLDLAWKMTQPARLNTDNYDGIDNLAQPGLHRISLKAGVKCEATSCYDAVRRIMCEYTNHLLSPVACIVKANGTRIINHSVLRKYLGAVGYNVL <class 'str'>
variant_under_consideration None <class 'NoneType'>


In [150]:
# Insert the prepared rows one at a time. A failing row is reported (accession,
# error, and a full field dump) and its accession is remembered; the loop continues.
failed_toadd = []
for row in data_sequence:
    try:
        cursor.execute(add_sequence, row)
    except Exception as err:
        accession = row["accession"]
        print(accession)
        print(err)
        for field, value in row.items():
            print(field, value, type(value))
        failed_toadd.append(accession)

In [151]:
# Fetch every sequence row of the current variant together with any linked
# publication (the LEFT JOIN keeps sequences that have no publication yet).
# The variant is bound as a query parameter instead of being formatted into the
# SQL text, matching the parameterized INSERT statements used in this notebook.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.variant = %s"
)
cursor.execute(query, (f"{type_like}-like_(Viruses)",))
df = pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])
# Display only the rows whose accession carries this batch's prefix.
df[df["accession"].str.startswith(accession_prefix)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_H2A_H3_H4_like_0,H2B-H2A-H3-H4-like_(Viruses),,,,,,,,,,MSTKSAKTSKNEKSTPNKPETASEPVDVAVAVENVDDDVDQTTKSR...,,,
1,HISTDB_H2B_H2A_H3_H4_like_1,H2B-H2A-H3-H4-like_(Viruses),,,,,,,,,,MHKSFNVYIHRTLKKSATSFHITKPALEAIDSVIRVTAVNLVNKSL...,,,
2,HISTDB_H2B_H2A_H3_H4_like_2,H2B-H2A-H3-H4-like_(Viruses),,,,,,,,,,MSDSEDTIIDRASEESEIESETEESSEETFDDETDSDVEISDEDKH...,,,
3,HISTDB_H2B_H2A_H3_H4_like_3,H2B-H2A-H3-H4-like_(Viruses),,,,,,,,,,MTNSLLKKKQNRFFASYIGKLLKVEHPKKGITTNTKQQLNSVFCYL...,,,
4,HISTDB_H2B_H2A_H3_H4_like_4,H2B-H2A-H3-H4-like_(Viruses),,,,,,,,,,MYEKSFKVYIYRILKKIHPEIHIAKVSTDAINSIISILSKHIVNKS...,,,
5,HISTDB_H2B_H2A_H3_H4_like_5,H2B-H2A-H3-H4-like_(Viruses),,,,,,,,,,MNDRSFKTYIHKVLRQVHPKCGVAKDVTDTYDGIVRSLFQHLSYTA...,,,


In [152]:
# Confirm that the publication these sequences will be linked to exists.
# The id is bound as a query parameter rather than interpolated into the SQL text.
pid = 'irwin_self-assembling_2024'
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,irwin_self-assembling_2024,Self-assembling viral histones are evolutionar...,10.1038/s41564-024-01707-9,,2024


In [153]:
# Link every prepared sequence to publication `pid`. A failing link is reported
# with its accession and the error, and the accession is remembered; the loop continues.
failed_toadd_publication = []
for ds in data_sequence:
    try:
        cursor.execute(add_sequence_has_publication, (ds["accession"], pid))
    except Exception as e:
        # `except Exception` (not a bare `except`) lets KeyboardInterrupt/SystemExit
        # propagate, and printing the error shows why the link failed.
        print(ds["accession"])
        print(e)
        failed_toadd_publication.append(ds["accession"])

In [154]:
# Re-run the variant query to check that each sequence row now has a linked
# publication (the LEFT JOIN keeps sequences that have no publication).
# The variant is bound as a query parameter instead of being formatted into the
# SQL text, matching the parameterized INSERT statements used in this notebook.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.variant = %s"
)
cursor.execute(query, (f"{type_like}-like_(Viruses)",))
df = pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])
# Display only the rows whose accession carries this batch's prefix.
df[df["accession"].str.startswith(accession_prefix)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_H2A_H3_H4_like_0,H2B-H2A-H3-H4-like_(Viruses),,,,,,,,,,MSTKSAKTSKNEKSTPNKPETASEPVDVAVAVENVDDDVDQTTKSR...,,HISTDB_H2B_H2A_H3_H4_like_0,irwin_self-assembling_2024
1,HISTDB_H2B_H2A_H3_H4_like_1,H2B-H2A-H3-H4-like_(Viruses),,,,,,,,,,MHKSFNVYIHRTLKKSATSFHITKPALEAIDSVIRVTAVNLVNKSL...,,HISTDB_H2B_H2A_H3_H4_like_1,irwin_self-assembling_2024
2,HISTDB_H2B_H2A_H3_H4_like_2,H2B-H2A-H3-H4-like_(Viruses),,,,,,,,,,MSDSEDTIIDRASEESEIESETEESSEETFDDETDSDVEISDEDKH...,,HISTDB_H2B_H2A_H3_H4_like_2,irwin_self-assembling_2024
3,HISTDB_H2B_H2A_H3_H4_like_3,H2B-H2A-H3-H4-like_(Viruses),,,,,,,,,,MTNSLLKKKQNRFFASYIGKLLKVEHPKKGITTNTKQQLNSVFCYL...,,HISTDB_H2B_H2A_H3_H4_like_3,irwin_self-assembling_2024
4,HISTDB_H2B_H2A_H3_H4_like_4,H2B-H2A-H3-H4-like_(Viruses),,,,,,,,,,MYEKSFKVYIYRILKKIHPEIHIAKVSTDAINSIISILSKHIVNKS...,,HISTDB_H2B_H2A_H3_H4_like_4,irwin_self-assembling_2024
5,HISTDB_H2B_H2A_H3_H4_like_5,H2B-H2A-H3-H4-like_(Viruses),,,,,,,,,,MNDRSFKTYIHKVLRQVHPKCGVAKDVTDTYDGIVRSLFQHLSYTA...,,HISTDB_H2B_H2A_H3_H4_like_5,irwin_self-assembling_2024


In [155]:
# Commit the open transaction on `conn` so the rows inserted for this variant
# (sequences and their publication links) are persisted in the database.
# NOTE(review): this runs even when failed_toadd / failed_toadd_publication are
# non-empty — check both lists before committing.
conn.commit()

# Add other archaeal sequences as H2B-H2A-H4-H3-like_(Viruses)

These sequences come from the data-availability section of this [article](https://www.nature.com/articles/s41564-024-01707-9#data-availability).

pid='irwin_self-assembling_2024'

**TODO: the accession identifiers for these sequences have not been looked up yet — they still need to be found!**

**TODO: possibly not all of the sequences have been added — verify.**

In [156]:
# Variant label for this batch; the accession prefix is the same label with
# every '-' turned into '_', wrapped as HISTDB_<label>_like.
type_like = 'H2B-H2A-H4-H3'
accession_prefix = "HISTDB_" + "_".join(type_like.split("-")) + "_like"
print('accession_prefix: ' + accession_prefix)

accession_prefix: HISTDB_H2B_H2A_H4_H3_like


In [157]:
# Inline FASTA input for this batch: each record is a '>' header line followed by
# its sequence on a single line. The text is parsed with SeqIO right below.
fasta_str = '''>M-3300010354-74.Ga0129333_10001180_1
MKELNYNIGIPKVLKIVHPDAQLSKSALLLLNDMIIRLAYKLIISSNKILKSVNKKLLSINDISTAVKILLGDQLAKIAIEEGTKAVNIYKKARKNNKKNMAKENYASLYFKISTTHILIKKYITNGNSVFEDATVFLTSVLEYMALEILELSGNRARDNKRIQIMPRDIYITIKNDFELNQIFKGYILGTGVIPNINYKITNGGAPKKEIVKDSILGITKPGLQRLMYRAGVKYISGIIYEESRSILKTFLEKILYNTMILTERKNHTTVMYEDGIEALNILNISIYNAKGYPGTIAPCKGSEKISDLFSNKTSKKGKKKRRKNKPKTNLLKIIEKYQKTSCTLLPHESIAKLIREIGSDYNKKITRYESNYLWLIHAVVEDYMVSLYNKAMLLALHADRLTLMPKDIKLVQKIEDL
>M-3300021958-16.Ga0222718_10000241_21
MEKRIWFLMLQVYKFFVNKNPNYNIMKQINFITDIRVVHKQLGSDIRLSDASLKLINNMIIDLARRIVFLSIKIAQYSETNVISWKDIKFGTNIILSGEVIKHAISNGDKILQKYNKIETTNEKNTNIKDSAIGLVFKSSKCKHIISEFIGKRNTISHRACIYLAAIIEYIAAEILENSSNAAKDNRKGTLFLKHIYLAVRNDEELSHSFMGYIIGSPNILSIKDDSDKYIIQEPFSKKFFTDASIKRILYKAGVKYISKDVYIKVRSLIYDFVKNILTVIFNIQLKNNKKIITYEDGLMALKTMHISFYTSDNFGIKGNCQRSINVSDLESKTKRKTRRKPRTNIPKLIQKYTKTECAILPHSSVKRIIKEIGFEIRSSRKYTIKMTDKFVWLVQGILEQYLIGILGVSYSVTIHSRRTKLEPKDIGIILTITHNGFNF
>S-1035145-40.1035145_contig_2366_8
MSAFFGNSQRFERALPSVAASQARRASPLARSASSTSGASSPRRASPKRSKSSTSSSPSQRSVSPKRSPDSQDGSEPPRKKRPHCADGTLSSGKPIFRRYISQVLHGTDSKISGQGMGLLNKLVESIADDVWKTAILITQKNNRKTVGSRDANTATTILFTRKLMAQFNPFVKERLDQYVATSRGEKSHPVTRSDRAQLRFSVGRVERLARSKHLRSHQRISSTCAVYVSSVLEFITTEIIKESEAVMQTTNKKILTPRHFLLGIANNADLGCLVQNYNIINAGVVPTHAASRGQSGGASAKDEDDGYEYDECNDGEQHGGKDGYPPNVTKAGIVRLMRRAGVERVGNMNSGTYETSLSIIDDALGLLLKATIAMLNHRRLKTIRVREVVDAIEILNELKQAEVFKINLQGNCLSDINVKHAKPNRSPRAKSQDAAEADLVADSAPKRKSKPGVKARRDIKRYQNTVELLLQPTGFSRAIRYECDAHNSDTRHSATALRFIQSIVENYLVGIFQDANKLAAHAKRKTLLVKDILLTQRSNDLGMCV
>S-1037377-56.1037377_contig_17_48
MEKRIRSLMLQVYKFFVNKNPNYNIMKQINFITDLREVHKQLGNDIRLSEVSLKLINNMIIDLARRIVFLSIKIAQYSETNVISWKDIKFGTNIVLSGEVIKHAISSGDKILQKYNKIETTNEKNTNIKDSAIGLVFKSSKCKHIISEFIVKRNTISHRACIYLAAIIEYIAAEILEISSNAAKDNRNGTLFLKHIYLAVRNDEELSHSFMGYIIGSPNILSIKDDSDKYIIQEPFGKKFFTDASIKRILYKAGVKYISKDVYIKVRSLIYDFVKNILTVIFNIQLKNNKKIITYEDGLMALKTMHISFYTSDNFGTKGNCRRSINVLDLESKTKRKTRRKPRTNIPKLIQKYTKTECAILPHSSVKRIIKEIGFEIRSSRKYTIKMTDQFVWLVHGILEQYLIGILGVSYSVTIHSRTKLEPKDIGIILAITHNGFNF
>S-403982-43.403982_contig_128_34
MENKINFKSGIIKVQKQLHENSLQKSSLNIINDMIFKLGEKIIRTAVRLGKFGNKKTIGHQEIKIASILLIPGELGKHALSEGNKYILKDNWKKLQFKLGKSRKLIELNILKNNLIGQRGIIYLSAVLEYIAAEIIELGGRAARDNNFIQIKPRHVMLAIRNDEELNKLLKGYILGSGVFPNINDALCNNRQKGGKIMKKVQQDGKVLQKNKTNIKDPAFRKLLYKAGVKYTSKSVYEESRKSLQDILENILRIVFQHVNYEGRKTVMYKDGIEALKILNISIFNSRGYPGKLFSCKGSDKIKDLPKTKIRRKPSNNLLRLIRKYQNTHCTLLPHAPIKRLIKQIGQDIGDVKYETDFIWLLQAVCEEYLISLYRSSLLITLHCERKTLTPKDILLVQEIRSEM
'''
# Parse the inline FASTA text from an in-memory buffer; the `with` block closes
# the buffer once every record has been collected into the list.
with StringIO(fasta_str) as fasta_io:
    seq_records = [rec for rec in SeqIO.parse(fasta_io, "fasta")]
seq_records

[SeqRecord(seq=Seq('MKELNYNIGIPKVLKIVHPDAQLSKSALLLLNDMIIRLAYKLIISSNKILKSVN...EDL'), id='M-3300010354-74.Ga0129333_10001180_1', name='M-3300010354-74.Ga0129333_10001180_1', description='M-3300010354-74.Ga0129333_10001180_1', dbxrefs=[]),
 SeqRecord(seq=Seq('MEKRIWFLMLQVYKFFVNKNPNYNIMKQINFITDIRVVHKQLGSDIRLSDASLK...FNF'), id='M-3300021958-16.Ga0222718_10000241_21', name='M-3300021958-16.Ga0222718_10000241_21', description='M-3300021958-16.Ga0222718_10000241_21', dbxrefs=[]),
 SeqRecord(seq=Seq('MSAFFGNSQRFERALPSVAASQARRASPLARSASSTSGASSPRRASPKRSKSST...MCV'), id='S-1035145-40.1035145_contig_2366_8', name='S-1035145-40.1035145_contig_2366_8', description='S-1035145-40.1035145_contig_2366_8', dbxrefs=[]),
 SeqRecord(seq=Seq('MEKRIRSLMLQVYKFFVNKNPNYNIMKQINFITDLREVHKQLGNDIRLSEVSLK...FNF'), id='S-1037377-56.1037377_contig_17_48', name='S-1037377-56.1037377_contig_17_48', description='S-1037377-56.1037377_contig_17_48', dbxrefs=[]),
 SeqRecord(seq=Seq('MENKINFKSGIIKVQKQLHENSLQKSSLNIINDMIFKLGEKIIR

In [158]:
# Turn every parsed FASTA record into a row dict for the `sequence` table insert.
# Only accession, variant and sequence carry values; every other field is None.
data_sequence = []
for idx, rec in enumerate(seq_records):
    # Echo a separator and the original FASTA id so each row can be traced to its source.
    print("------------------------------------------------", rec.id, sep="\n")
    row = {
        "accession": f'{accession_prefix}_{idx}',
        "variant": f"{type_like}-like_(Viruses)",
    }
    # Annotation fields are not known for these sequences; dict.fromkeys fills them with None.
    row.update(
        dict.fromkeys(
            (
                "gi",
                "ncbi_gene_id",
                "hgnc_gene_name",
                "taxonomy_id",
                "organism",
                "phylum",
                "class",
                "taxonomy_group",
                "info",
            )
        )
    )
    row["sequence"] = str(rec.seq)
    row["variant_under_consideration"] = None
    data_sequence.append(row)

------------------------------------------------
M-3300010354-74.Ga0129333_10001180_1
------------------------------------------------
M-3300021958-16.Ga0222718_10000241_21
------------------------------------------------
S-1035145-40.1035145_contig_2366_8
------------------------------------------------
S-1037377-56.1037377_contig_17_48
------------------------------------------------
S-403982-43.403982_contig_128_34


In [159]:
# Sanity check: show each field of the first prepared row with its value and Python type.
first_row = data_sequence[0]
for field, value in first_row.items():
    print(field, value, type(value))

accession HISTDB_H2B_H2A_H4_H3_like_0 <class 'str'>
variant H2B-H2A-H4-H3-like_(Viruses) <class 'str'>
gi None <class 'NoneType'>
ncbi_gene_id None <class 'NoneType'>
hgnc_gene_name None <class 'NoneType'>
taxonomy_id None <class 'NoneType'>
organism None <class 'NoneType'>
phylum None <class 'NoneType'>
class None <class 'NoneType'>
taxonomy_group None <class 'NoneType'>
info None <class 'NoneType'>
sequence MKELNYNIGIPKVLKIVHPDAQLSKSALLLLNDMIIRLAYKLIISSNKILKSVNKKLLSINDISTAVKILLGDQLAKIAIEEGTKAVNIYKKARKNNKKNMAKENYASLYFKISTTHILIKKYITNGNSVFEDATVFLTSVLEYMALEILELSGNRARDNKRIQIMPRDIYITIKNDFELNQIFKGYILGTGVIPNINYKITNGGAPKKEIVKDSILGITKPGLQRLMYRAGVKYISGIIYEESRSILKTFLEKILYNTMILTERKNHTTVMYEDGIEALNILNISIYNAKGYPGTIAPCKGSEKISDLFSNKTSKKGKKKRRKNKPKTNLLKIIEKYQKTSCTLLPHESIAKLIREIGSDYNKKITRYESNYLWLIHAVVEDYMVSLYNKAMLLALHADRLTLMPKDIKLVQKIEDL <class 'str'>
variant_under_consideration None <class 'NoneType'>


In [160]:
# Insert the prepared rows one at a time. A failing row is reported (accession,
# error, and a full field dump) and its accession is remembered; the loop continues.
failed_toadd = []
for row in data_sequence:
    try:
        cursor.execute(add_sequence, row)
    except Exception as err:
        accession = row["accession"]
        print(accession)
        print(err)
        for field, value in row.items():
            print(field, value, type(value))
        failed_toadd.append(accession)

In [161]:
# Fetch every sequence row of the current variant together with any linked
# publication (the LEFT JOIN keeps sequences that have no publication yet).
# The variant is bound as a query parameter instead of being formatted into the
# SQL text, matching the parameterized INSERT statements used in this notebook.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.variant = %s"
)
cursor.execute(query, (f"{type_like}-like_(Viruses)",))
df = pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])
# Display only the rows whose accession carries this batch's prefix.
df[df["accession"].str.startswith(accession_prefix)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_H2A_H4_H3_like_0,H2B-H2A-H4-H3-like_(Viruses),,,,,,,,,,MKELNYNIGIPKVLKIVHPDAQLSKSALLLLNDMIIRLAYKLIISS...,,,
1,HISTDB_H2B_H2A_H4_H3_like_1,H2B-H2A-H4-H3-like_(Viruses),,,,,,,,,,MEKRIWFLMLQVYKFFVNKNPNYNIMKQINFITDIRVVHKQLGSDI...,,,
2,HISTDB_H2B_H2A_H4_H3_like_2,H2B-H2A-H4-H3-like_(Viruses),,,,,,,,,,MSAFFGNSQRFERALPSVAASQARRASPLARSASSTSGASSPRRAS...,,,
3,HISTDB_H2B_H2A_H4_H3_like_3,H2B-H2A-H4-H3-like_(Viruses),,,,,,,,,,MEKRIRSLMLQVYKFFVNKNPNYNIMKQINFITDLREVHKQLGNDI...,,,
4,HISTDB_H2B_H2A_H4_H3_like_4,H2B-H2A-H4-H3-like_(Viruses),,,,,,,,,,MENKINFKSGIIKVQKQLHENSLQKSSLNIINDMIFKLGEKIIRTA...,,,


In [162]:
# Confirm that the publication these sequences will be linked to exists.
# The id is bound as a query parameter rather than interpolated into the SQL text.
pid = 'irwin_self-assembling_2024'
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,irwin_self-assembling_2024,Self-assembling viral histones are evolutionar...,10.1038/s41564-024-01707-9,,2024


In [163]:
# Link every prepared sequence to publication `pid`. A failing link is reported
# with its accession and the error, and the accession is remembered; the loop continues.
failed_toadd_publication = []
for ds in data_sequence:
    try:
        cursor.execute(add_sequence_has_publication, (ds["accession"], pid))
    except Exception as e:
        # `except Exception` (not a bare `except`) lets KeyboardInterrupt/SystemExit
        # propagate, and printing the error shows why the link failed.
        print(ds["accession"])
        print(e)
        failed_toadd_publication.append(ds["accession"])

In [164]:
# Re-run the variant query to check that each sequence row now has a linked
# publication (the LEFT JOIN keeps sequences that have no publication).
# The variant is bound as a query parameter instead of being formatted into the
# SQL text, matching the parameterized INSERT statements used in this notebook.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.variant = %s"
)
cursor.execute(query, (f"{type_like}-like_(Viruses)",))
df = pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])
# Display only the rows whose accession carries this batch's prefix.
df[df["accession"].str.startswith(accession_prefix)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_H2A_H4_H3_like_0,H2B-H2A-H4-H3-like_(Viruses),,,,,,,,,,MKELNYNIGIPKVLKIVHPDAQLSKSALLLLNDMIIRLAYKLIISS...,,HISTDB_H2B_H2A_H4_H3_like_0,irwin_self-assembling_2024
1,HISTDB_H2B_H2A_H4_H3_like_1,H2B-H2A-H4-H3-like_(Viruses),,,,,,,,,,MEKRIWFLMLQVYKFFVNKNPNYNIMKQINFITDIRVVHKQLGSDI...,,HISTDB_H2B_H2A_H4_H3_like_1,irwin_self-assembling_2024
2,HISTDB_H2B_H2A_H4_H3_like_2,H2B-H2A-H4-H3-like_(Viruses),,,,,,,,,,MSAFFGNSQRFERALPSVAASQARRASPLARSASSTSGASSPRRAS...,,HISTDB_H2B_H2A_H4_H3_like_2,irwin_self-assembling_2024
3,HISTDB_H2B_H2A_H4_H3_like_3,H2B-H2A-H4-H3-like_(Viruses),,,,,,,,,,MEKRIRSLMLQVYKFFVNKNPNYNIMKQINFITDLREVHKQLGNDI...,,HISTDB_H2B_H2A_H4_H3_like_3,irwin_self-assembling_2024
4,HISTDB_H2B_H2A_H4_H3_like_4,H2B-H2A-H4-H3-like_(Viruses),,,,,,,,,,MENKINFKSGIIKVQKQLHENSLQKSSLNIINDMIFKLGEKIIRTA...,,HISTDB_H2B_H2A_H4_H3_like_4,irwin_self-assembling_2024


In [165]:
# Commit the open transaction on `conn` so the rows inserted for this variant
# (sequences and their publication links) are persisted in the database.
# NOTE(review): this runs even when failed_toadd / failed_toadd_publication are
# non-empty — check both lists before committing.
conn.commit()

# Add other archaeal sequences as H4-H3-H2B-H2A-like_(Viruses)

These sequences come from the data-availability section of this [article](https://www.nature.com/articles/s41564-024-01707-9#data-availability).

pid='irwin_self-assembling_2024'

**TODO: the accession identifiers for these sequences have not been looked up yet — they still need to be found!**

**TODO: possibly not all of the sequences have been added — verify.**

In [166]:
# Variant label for this batch; the accession prefix is the same label with
# every '-' turned into '_', wrapped as HISTDB_<label>_like.
type_like = 'H4-H3-H2B-H2A'
accession_prefix = "HISTDB_" + "_".join(type_like.split("-")) + "_like"
print('accession_prefix: ' + accession_prefix)

accession_prefix: HISTDB_H4_H3_H2B_H2A_like


In [167]:
# Inline FASTA input for this batch: each record is a '>' header line followed by
# its sequence on a single line. The text is parsed with SeqIO right below.
fasta_str = '''>M-3300002367-2.B570J29646_100015_7
MDIIEGMNIVSTINKPSIVRMARRAGVKSLNGLAYDEVRGHMFYMLDKWISNIVNYTSYNRKKTIDVNAVTAGIPHKYFSKPVSKSLCKPKKYKVNIADNEIKYYQELSGCLMIPKLIFARIVKSLIQYYNTELRVSRDAMVLIQHCAENCVINMLNHANKNAIHAGRIGVKPSDINLYNVNSEAGRCGNGGLSAGAPVANFALFLSRIQKTICPDMKLNKISKSQINQFLNLLASAICEKAKFLNEKKKKATISPGTVLYASRILLSGGLSKAVEVTGTKAVANYASSKVSGGPRKGKQERAGLVLPVTRVSKFFKKYNCRVGSATAVYLAAVLEHISVEIFDICASTARELGKNMINSRILKLSLGNDDELSELAKALCFDVVDGGVVPNNA
>S-1101182-111.1101182_contig_689_6
MPRMKIVRDSIYGITKPAIARLARKAGVKRISSFIYEEARGQMKIFLENVLRQAVMNVDFHSRSTVLSQDIENGLPVKMFSEDMKKKLCKVKSVTRKRASGVKSIDEIKHYQGCEYLILPRLSFARFVREVAQDFKTDLRFSTDAITVLQYAVENYVIEMLEDANLAAIHAKRTTIQPKDIQLASRIAGDNVVNNPGPMPASKVKVHQTFDLYIKAVLKQVHPALRISKDSVSQINFIVNQLAAKIGLAAKGMMTSEKTITVKMIKVAVNLSLPGELARHGTGEGTKAVTNFANSSKSGKKTDRSGLKVPPSRIHRIIADTCKQRIGTTASIYCAAVVEYIIAEILELAGSRTTQETRKVISSGDLFMAINHDEELHKLVKNNLNIEILTGSVIPNIHSILLPRNK
'''
# Parse the inline FASTA text from an in-memory buffer; the `with` block closes
# the buffer once every record has been collected into the list.
with StringIO(fasta_str) as fasta_io:
    seq_records = [rec for rec in SeqIO.parse(fasta_io, "fasta")]
seq_records

[SeqRecord(seq=Seq('MDIIEGMNIVSTINKPSIVRMARRAGVKSLNGLAYDEVRGHMFYMLDKWISNIV...NNA'), id='M-3300002367-2.B570J29646_100015_7', name='M-3300002367-2.B570J29646_100015_7', description='M-3300002367-2.B570J29646_100015_7', dbxrefs=[]),
 SeqRecord(seq=Seq('MPRMKIVRDSIYGITKPAIARLARKAGVKRISSFIYEEARGQMKIFLENVLRQA...RNK'), id='S-1101182-111.1101182_contig_689_6', name='S-1101182-111.1101182_contig_689_6', description='S-1101182-111.1101182_contig_689_6', dbxrefs=[])]

In [168]:
# Turn every parsed FASTA record into a row dict for the `sequence` table insert.
# Only accession, variant and sequence carry values; every other field is None.
data_sequence = []
for idx, rec in enumerate(seq_records):
    # Echo a separator and the original FASTA id so each row can be traced to its source.
    print("------------------------------------------------", rec.id, sep="\n")
    row = {
        "accession": f'{accession_prefix}_{idx}',
        "variant": f"{type_like}-like_(Viruses)",
    }
    # Annotation fields are not known for these sequences; dict.fromkeys fills them with None.
    row.update(
        dict.fromkeys(
            (
                "gi",
                "ncbi_gene_id",
                "hgnc_gene_name",
                "taxonomy_id",
                "organism",
                "phylum",
                "class",
                "taxonomy_group",
                "info",
            )
        )
    )
    row["sequence"] = str(rec.seq)
    row["variant_under_consideration"] = None
    data_sequence.append(row)

------------------------------------------------
M-3300002367-2.B570J29646_100015_7
------------------------------------------------
S-1101182-111.1101182_contig_689_6


In [169]:
# Sanity check: show each field of the first prepared row with its value and Python type.
first_row = data_sequence[0]
for field, value in first_row.items():
    print(field, value, type(value))

accession HISTDB_H4_H3_H2B_H2A_like_0 <class 'str'>
variant H4-H3-H2B-H2A-like_(Viruses) <class 'str'>
gi None <class 'NoneType'>
ncbi_gene_id None <class 'NoneType'>
hgnc_gene_name None <class 'NoneType'>
taxonomy_id None <class 'NoneType'>
organism None <class 'NoneType'>
phylum None <class 'NoneType'>
class None <class 'NoneType'>
taxonomy_group None <class 'NoneType'>
info None <class 'NoneType'>
sequence MDIIEGMNIVSTINKPSIVRMARRAGVKSLNGLAYDEVRGHMFYMLDKWISNIVNYTSYNRKKTIDVNAVTAGIPHKYFSKPVSKSLCKPKKYKVNIADNEIKYYQELSGCLMIPKLIFARIVKSLIQYYNTELRVSRDAMVLIQHCAENCVINMLNHANKNAIHAGRIGVKPSDINLYNVNSEAGRCGNGGLSAGAPVANFALFLSRIQKTICPDMKLNKISKSQINQFLNLLASAICEKAKFLNEKKKKATISPGTVLYASRILLSGGLSKAVEVTGTKAVANYASSKVSGGPRKGKQERAGLVLPVTRVSKFFKKYNCRVGSATAVYLAAVLEHISVEIFDICASTARELGKNMINSRILKLSLGNDDELSELAKALCFDVVDGGVVPNNA <class 'str'>
variant_under_consideration None <class 'NoneType'>


In [170]:
# Insert the prepared rows one at a time. A failing row is reported (accession,
# error, and a full field dump) and its accession is remembered; the loop continues.
failed_toadd = []
for row in data_sequence:
    try:
        cursor.execute(add_sequence, row)
    except Exception as err:
        accession = row["accession"]
        print(accession)
        print(err)
        for field, value in row.items():
            print(field, value, type(value))
        failed_toadd.append(accession)

In [171]:
# Fetch every sequence row of the current variant together with any linked
# publication (the LEFT JOIN keeps sequences that have no publication yet).
# The variant is bound as a query parameter instead of being formatted into the
# SQL text, matching the parameterized INSERT statements used in this notebook.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.variant = %s"
)
cursor.execute(query, (f"{type_like}-like_(Viruses)",))
df = pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])
# Display only the rows whose accession carries this batch's prefix.
df[df["accession"].str.startswith(accession_prefix)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H4_H3_H2B_H2A_like_0,H4-H3-H2B-H2A-like_(Viruses),,,,,,,,,,MDIIEGMNIVSTINKPSIVRMARRAGVKSLNGLAYDEVRGHMFYML...,,,
1,HISTDB_H4_H3_H2B_H2A_like_1,H4-H3-H2B-H2A-like_(Viruses),,,,,,,,,,MPRMKIVRDSIYGITKPAIARLARKAGVKRISSFIYEEARGQMKIF...,,,


In [172]:
# Confirm that the publication these sequences will be linked to exists.
# The id is bound as a query parameter rather than interpolated into the SQL text.
pid = 'irwin_self-assembling_2024'
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,irwin_self-assembling_2024,Self-assembling viral histones are evolutionar...,10.1038/s41564-024-01707-9,,2024


In [173]:
# Link every prepared sequence to publication `pid`. A failing link is reported
# with its accession and the error, and the accession is remembered; the loop continues.
failed_toadd_publication = []
for ds in data_sequence:
    try:
        cursor.execute(add_sequence_has_publication, (ds["accession"], pid))
    except Exception as e:
        # `except Exception` (not a bare `except`) lets KeyboardInterrupt/SystemExit
        # propagate, and printing the error shows why the link failed.
        print(ds["accession"])
        print(e)
        failed_toadd_publication.append(ds["accession"])

In [174]:
# Re-run the variant query to check that each sequence row now has a linked
# publication (the LEFT JOIN keeps sequences that have no publication).
# The variant is bound as a query parameter instead of being formatted into the
# SQL text, matching the parameterized INSERT statements used in this notebook.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.variant = %s"
)
cursor.execute(query, (f"{type_like}-like_(Viruses)",))
df = pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])
# Display only the rows whose accession carries this batch's prefix.
df[df["accession"].str.startswith(accession_prefix)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H4_H3_H2B_H2A_like_0,H4-H3-H2B-H2A-like_(Viruses),,,,,,,,,,MDIIEGMNIVSTINKPSIVRMARRAGVKSLNGLAYDEVRGHMFYML...,,HISTDB_H4_H3_H2B_H2A_like_0,irwin_self-assembling_2024
1,HISTDB_H4_H3_H2B_H2A_like_1,H4-H3-H2B-H2A-like_(Viruses),,,,,,,,,,MPRMKIVRDSIYGITKPAIARLARKAGVKRISSFIYEEARGQMKIF...,,HISTDB_H4_H3_H2B_H2A_like_1,irwin_self-assembling_2024


In [175]:
# Commit the open transaction on `conn` so the rows inserted for this variant
# (sequences and their publication links) are persisted in the database.
# NOTE(review): this runs even when failed_toadd / failed_toadd_publication are
# non-empty — check both lists before committing.
conn.commit()

# Close connections

In [176]:
# Release resources in reverse order of acquisition (cursor, connection, tunnel).
# The nested try/finally guarantees the SSH tunnel is stopped even if closing the
# cursor or the MySQL connection raises, so the forwarded port is not left open.
try:
    cursor.close()
finally:
    try:
        conn.close()
    finally:
        tunnel.stop()