In [15]:
import pandas as pd
from mysql.connector import connection
from sshtunnel import SSHTunnelForwarder

from Bio import Entrez
from Bio import SeqIO
Entrez.email = "l.singh@intbio.org"

In [2]:
# Load the SSH / database connection settings from a plain KEY=VALUE text file.
# Blank lines and lines starting with "#" are ignored.
with open("db_curated_server_info.txt", "r") as file:
    lines = file.readlines()

config = {}

for line_number, line in enumerate(lines, start=1):
    line = line.strip()
    if not line or line.startswith("#"):
        continue
    # Split on the first "=" only, so a value may itself contain "=".
    key, separator, value = line.partition("=")
    if not separator:
        raise ValueError(
            f"db_curated_server_info.txt line {line_number}: "
            f"expected KEY=VALUE, got {line!r}"
        )
    # Strip the key as well as the value so that "key = value" and
    # "key=value" are read the same way.
    config[key.strip()] = value.strip()

# The two port settings are converted with int(), so report their absence
# explicitly instead of failing with "int() argument must be ... not NoneType".
# The key spellings ("srever_port", "db_adress") are kept exactly as this
# notebook reads them from the settings file.
missing_ports = [name for name in ("srever_port", "db_port") if name not in config]
if missing_ports:
    raise KeyError(
        "db_curated_server_info.txt is missing: " + ", ".join(missing_ports)
    )

server_name = config.get("server_name")
srever_port = int(config["srever_port"])
ssh_password = config.get("ssh_password")
ssh_username = config.get("ssh_username")
db_adress = config.get("db_adress")
db_port = int(config["db_port"])

In [3]:
# Open an SSH tunnel to the server and forward a local port to the database
# address; the chosen local port is printed so it can be checked.
ssh_endpoint = (server_name, srever_port)
database_endpoint = (db_adress, db_port)

tunnel = SSHTunnelForwarder(
    ssh_endpoint,
    ssh_username=ssh_username,
    ssh_password=ssh_password,
    remote_bind_address=database_endpoint,
)
tunnel.start()
print(tunnel.local_bind_port)

40079


In [4]:
# Connect to MySQL through the local end of the SSH tunnel.
# Database credentials are read from the settings already loaded into `config`
# (keys "db_user", "db_password", "db_name") so they do not have to be written
# into the notebook; when a key is absent the previous literal value is used,
# which keeps the earlier behaviour unchanged.
conn = connection.MySQLConnection(
    user=config.get("db_user", "db_user"),
    password=config.get("db_password", "db_password"),
    host="localhost",
    port=tunnel.local_bind_port,
    database=config.get("db_name", "db_name"),
)
cursor = conn.cursor()

In [5]:
# List the tables of the connected database as a quick connectivity check.
query = "SHOW TABLES;"
cursor.execute(query)
table_rows = cursor.fetchall()
table_rows

[('alternative_name',),
 ('histone',),
 ('histone_description',),
 ('histone_has_publication',),
 ('publication',),
 ('sequence',),
 ('sequence_has_publication',)]

In [52]:
# add_histone = (
#     "INSERT INTO histone "
#     "(id, level, taxonomic_span, taxonomic_span_id, description, parent) "
#     "VALUES (%(id)s, %(level)s, %(taxonomic_span)s, %(taxonomic_span_id)s, %(description)s, %(parent)s)"
# )
# add_histone_description = (
#     "INSERT INTO histone_description "
#     "(summary, taxonomy, genes, evolution, expression, knock_out, function, sequence, localization, deposition, structure, interactions, disease, caveats) "
#     "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
# )
# Parameterised INSERT linking a sequence accession to a publication id;
# both values are supplied as a 2-tuple at execute() time.
add_sequence_has_publication = "INSERT INTO sequence_has_publication (sequence_accession, publication_id) VALUES (%s, %s)"

# Update H2B.O sequences

In [6]:
# Show the H2B.O rows of the `sequence` table as they are before any update.
query = "SELECT * FROM sequence WHERE variant='H2B.O'"
cursor.execute(query)
rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
pd.DataFrame(rows, columns=column_names)

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
0,XP_001511074.1,H2B.O,,,,9258,Ornithorhynchus anatinus,Chordata,Mammalia,,,YSIYVYKVLKQVHPLTSISTKAVGIMDSFINDIFERIASEASRLAR...,
1,XP_001521160.2,H2B.O,,,,9258,Ornithorhynchus anatinus,Chordata,Mammalia,,,WENYVYKVLKQVHPLTSISTKAVGIVDSFIDIFKRITSDASHLARY...,
2,XP_028926523.1,H2B.O,,,,9258,Ornithorhynchus anatinus,Chordata,Mammalia,,,YSIYVYKVLKQVHPLTSISTKAVGIMDSFINDIFDRIASEASRLAR...,


In [11]:
# Collect the H2B.O accessions into one comma-separated string, the form in
# which they are later passed as the `id` argument of Entrez.efetch.
query = "SELECT accession FROM sequence WHERE variant='H2B.O'"
cursor.execute(query)
accession_rows = cursor.fetchall()
accessions = ",".join(row[0] for row in accession_rows)
accessions

'XP_001511074.1,XP_001521160.2,XP_028926523.1'

In [21]:
# Manual override of the list read from the database: the third accession is
# XP_028926540.1 here instead of the stored XP_028926523.1.
accessions = ",".join(
    [
        "XP_001511074.1",
        "XP_001521160.2",
        "XP_028926540.1",
    ]
)

## Get all sequences from NCBI

In [22]:
# Download the GenBank entries for the selected accessions from the NCBI
# protein database and parse them into a list of SeqRecord objects.
with Entrez.efetch(
    db="protein",
    id=accessions,
    rettype="gb",
    retmode="text",
) as genbank_handle:
    records = [record for record in SeqIO.parse(genbank_handle, "genbank")]
print(records)

[SeqRecord(seq=Seq('MSDSVKSVPVPTEGSWKAVTKDQKAREKRKHRRRENYSIYVYKVLKQVHPLTSI...SAK'), id='XP_001511074.1', name='XP_001511074', description='histone H2B type 1-P-like [Ornithorhynchus anatinus]', dbxrefs=['BioProject:PRJNA534073']), SeqRecord(seq=Seq('MPVPTEGSWKAVTKDQKTHKKRKHSCWENYVYKVLKQVHPLTSISTKAVGIVDS...TAK'), id='XP_001521160.2', name='XP_001521160', description='histone H2B type 2-E-like [Ornithorhynchus anatinus]', dbxrefs=['BioProject:PRJNA534073']), SeqRecord(seq=Seq('MSDTVKSVPVPTEGSWKAVAKDQKTRKKRKHSRRENYSIYVYKVLKQVHPLTSI...SAK'), id='XP_028926540.1', name='XP_028926540', description='histone H2B type 1-P-like [Ornithorhynchus anatinus]', dbxrefs=['BioProject:PRJNA534073'])]


In [23]:
# Print id, full sequence and sequence length of every fetched record,
# one value per line.
for record in records:
    print(record.id, record.seq, len(record.seq), sep="\n")

XP_001511074.1
MSDSVKSVPVPTEGSWKAVTKDQKAREKRKHRRRENYSIYVYKVLKQVHPLTSISTKAVGIMDSFINDIFERIASEASRLARYNKRSTITSREIQTAVLLTLPGELARHAVSEGTKAITKYTSAK
125
XP_001521160.2
MPVPTEGSWKAVTKDQKTHKKRKHSCWENYVYKVLKQVHPLTSISTKAVGIVDSFIDIFKRITSDASHLARYNKCSTITSREIQTAVQLMLPGELDRYAGSEGTKAITKYTTAK
114
XP_028926540.1
MSDTVKSVPVPTEGSWKAVAKDQKTRKKRKHSRRENYSIYVYKVLKQVHPLTSISTKAVGIMDSFINDIFDRIASEASRLARYTKRSTIASREIQTAVLLTLPGELARHAVSEGTKAITKYTSAK
125


## Updating XP_001511074.1 and XP_001521160.2

In [33]:
# Build, and print for review, the UPDATE that stores the sequence of the
# first fetched record for XP_001511074.1. The statement is only executed by
# a later cell that reads `query`.
# NOTE(review): the sequence text is spliced directly into the SQL string;
# passing it as an execute() parameter would be safer - confirm before reuse.
full_sequence = str(records[0].seq)
query = (
    "UPDATE sequence SET sequence='"
    + full_sequence
    + "' WHERE accession='XP_001511074.1'"
)
print(query)

UPDATE sequence SET sequence='MSDSVKSVPVPTEGSWKAVTKDQKAREKRKHRRRENYSIYVYKVLKQVHPLTSISTKAVGIMDSFINDIFERIASEASRLARYNKRSTITSREIQTAVLLTLPGELARHAVSEGTKAITKYTSAK' WHERE accession='XP_001511074.1'


In [34]:
# Run the statement currently held in `query` (the UPDATE printed just above).
cursor.execute(query)

In [28]:
# Build, and print for review, the UPDATE that stores the sequence of the
# second fetched record for XP_001521160.2. The statement is only executed by
# a later cell that reads `query`.
# NOTE(review): the sequence text is spliced directly into the SQL string;
# passing it as an execute() parameter would be safer - confirm before reuse.
full_sequence = str(records[1].seq)
query = (
    "UPDATE sequence SET sequence='"
    + full_sequence
    + "' WHERE accession='XP_001521160.2'"
)
print(query)

UPDATE sequence SET sequence='MPVPTEGSWKAVTKDQKTHKKRKHSCWENYVYKVLKQVHPLTSISTKAVGIVDSFIDIFKRITSDASHLARYNKCSTITSREIQTAVQLMLPGELDRYAGSEGTKAITKYTTAK' WHERE accession='XP_001521160.2'


In [29]:
# Run the statement currently held in `query` (the UPDATE printed just above).
cursor.execute(query)

In [35]:
# Re-read the H2B.O rows to check the two sequence updates.
query = "SELECT * FROM sequence WHERE variant='H2B.O'"
cursor.execute(query)
rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
pd.DataFrame(rows, columns=column_names)

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
0,XP_001511074.1,H2B.O,,,,9258,Ornithorhynchus anatinus,Chordata,Mammalia,,,MSDSVKSVPVPTEGSWKAVTKDQKAREKRKHRRRENYSIYVYKVLK...,
1,XP_001521160.2,H2B.O,,,,9258,Ornithorhynchus anatinus,Chordata,Mammalia,,,MPVPTEGSWKAVTKDQKTHKKRKHSCWENYVYKVLKQVHPLTSIST...,
2,XP_028926523.1,H2B.O,,,,9258,Ornithorhynchus anatinus,Chordata,Mammalia,,,YSIYVYKVLKQVHPLTSISTKAVGIMDSFINDIFDRIASEASRLAR...,


In [36]:
# Commit the current transaction so the sequence UPDATEs executed above for
# XP_001511074.1 and XP_001521160.2 are written to the database.
conn.commit()

## Updating XP_028926540.1 (prev XP_028926523.1)

In [37]:
# Fetch the XP_028926523.1 row together with any publication links.
# The LEFT JOIN keeps the sequence row even when it has no publication.
query = " ".join(
    [
        "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp",
        "ON s.accession = sp.sequence_accession",
        "WHERE s.accession='XP_028926523.1'",
    ]
)
cursor.execute(query)
rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(rows, columns=column_names)
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_028926523.1,H2B.O,,,,9258,Ornithorhynchus anatinus,Chordata,Mammalia,,,YSIYVYKVLKQVHPLTSISTKAVGIMDSFINDIFDRIASEASRLAR...,,XP_028926523.1,35099534


In [38]:
# Collapse the joined rows to one row per accession carrying the list of its
# publication ids; an accession with no non-null id gets an empty list.
publication_groups = df.groupby(["accession"])["publication_id"]
publication_groups.apply(
    lambda ids: list(ids.unique()) if not ids.dropna().empty else []
).reset_index()

Unnamed: 0,accession,publication_id
0,XP_028926523.1,[35099534]


### Delete relations with publications before updating the sequence record

In [39]:
# Publication id(s) currently linked to XP_028926523.1 (as shown by the join
# output above), kept so the links can be re-created for the new accession
# after the old links are deleted.
publications = ["35099534"]

In [40]:
# Remove every publication link of the old accession XP_028926523.1.
query = (
    "DELETE FROM sequence_has_publication "
    "WHERE sequence_accession='XP_028926523.1'"
)
cursor.execute(query)

In [41]:
# Re-run the join for XP_028926523.1 and summarise its publication ids per
# accession, to check the result of the DELETE.
query = " ".join(
    [
        "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp",
        "ON s.accession = sp.sequence_accession",
        "WHERE s.accession='XP_028926523.1'",
    ]
)
cursor.execute(query)
rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(rows, columns=column_names)
publication_groups = df.groupby(["accession"])["publication_id"]
publication_groups.apply(
    lambda ids: list(ids.unique()) if not ids.dropna().empty else []
).reset_index()

Unnamed: 0,accession,publication_id
0,XP_028926523.1,[]


### Updating

In [46]:
# Inspect the third fetched record: its accession and full sequence as plain
# strings, i.e. the values used in the UPDATE below.
str(records[2].id), str(records[2].seq)

('XP_028926540.1',
 'MSDTVKSVPVPTEGSWKAVAKDQKTRKKRKHSRRENYSIYVYKVLKQVHPLTSISTKAVGIMDSFINDIFDRIASEASRLARYTKRSTIASREIQTAVLLTLPGELARHAVSEGTKAITKYTSAK')

In [47]:
# Build, and print for review, the UPDATE that renames XP_028926523.1 to
# XP_028926540.1 and stores the sequence of the third fetched record in the
# same statement. The statement is only executed by a later cell that reads
# `query`.
# NOTE(review): the sequence text is spliced directly into the SQL string;
# passing it as an execute() parameter would be safer - confirm before reuse.
new_sequence_text = str(records[2].seq)
query = (
    "UPDATE sequence SET accession='XP_028926540.1', sequence='"
    + new_sequence_text
    + "' WHERE accession='XP_028926523.1'"
)
print(query)

UPDATE sequence SET accession='XP_028926540.1', sequence='MSDTVKSVPVPTEGSWKAVAKDQKTRKKRKHSRRENYSIYVYKVLKQVHPLTSISTKAVGIMDSFINDIFDRIASEASRLARYTKRSTIASREIQTAVLLTLPGELARHAVSEGTKAITKYTSAK' WHERE accession='XP_028926523.1'


In [48]:
# Run the statement currently held in `query` (the UPDATE printed just above).
cursor.execute(query)

In [49]:
# Query the old accession XP_028926523.1 again, to check the rename.
query = " ".join(
    [
        "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp",
        "ON s.accession = sp.sequence_accession",
        "WHERE s.accession='XP_028926523.1'",
    ]
)
cursor.execute(query)
rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(rows, columns=column_names)
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id


In [50]:
# Query the new accession XP_028926540.1 together with any publication links.
query = " ".join(
    [
        "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp",
        "ON s.accession = sp.sequence_accession",
        "WHERE s.accession='XP_028926540.1'",
    ]
)
cursor.execute(query)
rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(rows, columns=column_names)
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_028926540.1,H2B.O,,,,9258,Ornithorhynchus anatinus,Chordata,Mammalia,,,MSDTVKSVPVPTEGSWKAVAKDQKTRKKRKHSRRENYSIYVYKVLK...,,,


### Restore relations to publications

In [51]:
# Show the publication ids saved earlier; they are linked to the new accession
# in the next step.
publications

['35099534']

In [53]:
# Link each saved publication id to the new accession XP_028926540.1, using
# the parameterised INSERT template.
for publication_id in publications:
    link_values = ("XP_028926540.1", publication_id)
    cursor.execute(add_sequence_has_publication, link_values)

In [54]:
# Query XP_028926540.1 once more, to check its publication link.
query = " ".join(
    [
        "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp",
        "ON s.accession = sp.sequence_accession",
        "WHERE s.accession='XP_028926540.1'",
    ]
)
cursor.execute(query)
rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(rows, columns=column_names)
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_028926540.1,H2B.O,,,,9258,Ornithorhynchus anatinus,Chordata,Mammalia,,,MSDTVKSVPVPTEGSWKAVAKDQKTRKKRKHSRRENYSIYVYKVLK...,,XP_028926540.1,35099534


In [55]:
# Commit the current transaction so the statements executed since the previous
# commit (link deletion, accession/sequence update, link re-creation) are
# written to the database.
conn.commit()

In [56]:
# Final check: list the H2B.O rows of the `sequence` table after all changes.
query = "SELECT * FROM sequence WHERE variant='H2B.O'"
cursor.execute(query)
rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
pd.DataFrame(rows, columns=column_names)

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
0,XP_001511074.1,H2B.O,,,,9258,Ornithorhynchus anatinus,Chordata,Mammalia,,,MSDSVKSVPVPTEGSWKAVTKDQKAREKRKHRRRENYSIYVYKVLK...,
1,XP_001521160.2,H2B.O,,,,9258,Ornithorhynchus anatinus,Chordata,Mammalia,,,MPVPTEGSWKAVTKDQKTHKKRKHSCWENYVYKVLKQVHPLTSIST...,
2,XP_028926540.1,H2B.O,,,,9258,Ornithorhynchus anatinus,Chordata,Mammalia,,,MSDTVKSVPVPTEGSWKAVAKDQKTRKKRKHSRRENYSIYVYKVLK...,


# Close connections

In [57]:
# Release resources in reverse order of creation: the cursor first, then the
# MySQL connection, and finally the SSH tunnel the connection was opened
# through.
cursor.close()
conn.close()
tunnel.stop()