In [43]:
import pandas as pd
from mysql.connector import connection
from sshtunnel import SSHTunnelForwarder

from Bio import Entrez
from Bio import SeqIO
Entrez.email = "l.singh@intbio.org"

In [44]:
# Parse the KEY=VALUE settings file into `config`.
# Blank lines and lines starting with "#" are ignored.
config = {}
with open("db_curated_server_info.txt", "r") as file:
    for line_number, raw_line in enumerate(file, start=1):
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue
        key, separator, value = line.partition("=")
        if not separator:
            # Fail with the offending line instead of an opaque unpacking error.
            raise ValueError(
                f"db_curated_server_info.txt line {line_number}: "
                f"expected KEY=VALUE, got {line!r}"
            )
        # Strip both sides so "key = value" is read the same as "key=value".
        config[key.strip()] = value.strip()

# The key spellings below ("srever_port", "db_adress") must match the ones
# used in the settings file, so they are kept exactly as they were.
server_name = config.get("server_name")
# Ports are mandatory: a missing key now raises KeyError naming the key,
# rather than a TypeError from int(None).
srever_port = int(config["srever_port"])
ssh_password = config.get("ssh_password")
ssh_username = config.get("ssh_username")
db_adress = config.get("db_adress")
db_port = int(config["db_port"])

In [45]:
# Open an SSH tunnel: (server_name, srever_port) is the SSH server and
# (db_adress, db_port) is the address the tunnel binds to on the remote side.
ssh_endpoint = (server_name, srever_port)
database_endpoint = (db_adress, db_port)
tunnel = SSHTunnelForwarder(
    ssh_endpoint,
    ssh_username=ssh_username,
    ssh_password=ssh_password,
    remote_bind_address=database_endpoint,
)
tunnel.start()
# The local port of the tunnel; the MySQL connection below connects to it.
print(tunnel.local_bind_port)

34419


In [46]:
# Connect to MySQL through the local end of the SSH tunnel.
# Credentials and database name are read from the settings file when it
# provides them (keys db_user / db_password / db_name) so they do not have to
# live in the notebook; the previous literals are kept as fallbacks so
# existing setups behave exactly as before.
conn = connection.MySQLConnection(
    user=config.get("db_user", "db_user"),
    password=config.get("db_password", "db_password"),
    host="localhost",
    port=tunnel.local_bind_port,
    database=config.get("db_name", "db_name"),
)
cursor = conn.cursor()

In [47]:
# List the tables of the connected database as a quick connectivity check.
cursor.execute("SHOW TABLES;")
cursor.fetchall()

[('alternative_name',),
 ('histone',),
 ('histone_description',),
 ('histone_has_publication',),
 ('publication',),
 ('sequence',),
 ('sequence_has_publication',)]

In [48]:
# Templates for the histone tables, kept for reference (not used in this notebook).
# add_histone = (
#     "INSERT INTO histone "
#     "(id, level, taxonomic_span, taxonomic_span_id, description, parent) "
#     "VALUES (%(id)s, %(level)s, %(taxonomic_span)s, %(taxonomic_span_id)s, %(description)s, %(parent)s)"
# )
# add_histone_description = (
#     "INSERT INTO histone_description "
#     "(summary, taxonomy, genes, evolution, expression, knock_out, function, sequence, localization, deposition, structure, interactions, disease, caveats) "
#     "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
# )

# Columns of the `sequence` table in insert order; the same names are used as
# the named placeholders, so the INSERT is executed with a dict of values.
sequence_columns = (
    "accession",
    "variant",
    "gi",
    "ncbi_gene_id",
    "hgnc_gene_name",
    "taxonomy_id",
    "organism",
    "phylum",
    "class",
    "taxonomy_group",
    "info",
    "sequence",
    "variant_under_consideration",
)
sequence_placeholders = ["%(" + column + ")s" for column in sequence_columns]
add_sequence = (
    "INSERT INTO sequence "
    + "(" + ", ".join(sequence_columns) + ") "
    + "VALUES (" + ", ".join(sequence_placeholders) + ")"
)

# Link between a sequence and a publication; executed with a
# (sequence_accession, publication_id) tuple.
add_sequence_has_publication = " ".join(
    [
        "INSERT INTO sequence_has_publication",
        "(sequence_accession, publication_id)",
        "VALUES (%s, %s)",
    ]
)

In [49]:
def get_taxonomy_data(record, max_attempts=10):
    """Collect organism, NCBI taxonomy id, phylum and class for a record.

    The taxonomy id is read from the ``db_xref`` qualifiers of the record's
    first feature (entries of the form ``taxon:<id>``). Phylum and class are
    then looked up in the NCBI ``taxonomy`` database through ``Entrez``.

    Args:
        record: Object exposing ``annotations["organism"]`` and ``features``
            (for example the GenBank record returned by ``SeqIO.read``).
        max_attempts: Number of attempts for the NCBI taxonomy request before
            giving up. Defaults to 10, the previously hard-coded value.

    Returns:
        dict with the keys ``organism``, ``taxonomy_id`` (int, 1 when it
        could not be determined), ``phylum`` and ``class`` (``str`` or
        ``None`` when the lineage could not be fetched).
    """
    import re

    taxonomy_data = {"organism": record.annotations["organism"]}

    # "<database>:<identifier>"; compiled once and written as a raw string so
    # that "\S" is not treated as an (invalid) string escape.
    xref_pattern = re.compile(r"(\S+):(\S+)")
    try:
        for xref in record.features[0].qualifiers["db_xref"]:
            match = xref_pattern.search(xref)
            # Skip malformed entries and references to other databases rather
            # than aborting the whole scan on the first unexpected value.
            if match is None or match.group(1) != "taxon":
                continue
            taxid = match.group(2)
            print(f"Fetched taxid from NCBI {taxid}")
            taxonomy_data["taxonomy_id"] = int(taxid)
    except (AttributeError, LookupError, TypeError, ValueError):
        # No features, no db_xref qualifier, or a non-numeric taxon id.
        pass
    # Always return a taxonomy_id: previously the key stayed unset when the
    # qualifiers simply contained no "taxon" entry.
    if "taxonomy_id" not in taxonomy_data:
        print("!!!!!!Unable to get TAXID for this record setting it to 1")
        taxonomy_data["taxonomy_id"] = 1  # unable to identify

    lineage = {}
    for attempt in range(max_attempts):
        try:
            # Context manager closes the Entrez handle after parsing.
            with Entrez.efetch(
                id=taxonomy_data["taxonomy_id"], db="taxonomy", retmode="xml"
            ) as handle:
                tax_data = Entrez.read(handle)
            lineage = {
                d["Rank"]: d["ScientificName"]
                for d in tax_data[0]["LineageEx"]
                if d["Rank"] in ["class", "phylum"]
            }
            break
        except Exception as exc:
            # Best-effort lookup: report and retry, but let KeyboardInterrupt
            # and SystemExit propagate (the old bare except swallowed them).
            print("Unexpected error: {}, Retrying, attempt {}".format(type(exc), attempt))
            if attempt == max_attempts - 1:
                print(
                    f"FATAL ERROR could not get class and phylum from NCBI after {max_attempts} attempts for taxid:{taxonomy_data['taxonomy_id']}. Will add None for class and phylum!"
                )

    # Convert to plain str so the values can be bound as SQL parameters.
    for rank in ("phylum", "class"):
        name = lineage.get(rank)
        taxonomy_data[rank] = None if name is None else str(name)
    return taxonomy_data

# Correct gH2B sequences

In [10]:
# gH2B accessions handled in this notebook.
# The first three entries carry a literature reference; the remaining ones
# (marked "?") are addressed later via g_h2b_accessions[3:], so the order of
# this list matters.
g_h2b_accessions = [
    "CUT18449.1", # ena accession LN906619.1 (26846354, alvarez-venegas_canonical_2019)
    "CUT18450.1", # ena accession LN906620.1 (26846354, alvarez-venegas_canonical_2019)
    # Versioned like every other entry and like ACCESSION / the database row.
    "BAA96095.1", # (alvarez-venegas_canonical_2019)
    "CUT18445.1", # ?
    "CUT18446.1", # ?
    "CUT18447.1", # ?
    "CUT18448.1", # ?
    "CUT18451.1", # ?
    "CUT18452.1", # ?
]

In [8]:
# Current gH2B rows joined with their publication links
# (LEFT JOIN: a sequence with several links appears once per link).
query = (
    "SELECT * FROM sequence s "
    "LEFT JOIN sequence_has_publication sp ON s.accession = sp.sequence_accession "
    "WHERE s.variant='gH2B'"
)
cursor.execute(query)
rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(rows, columns=column_names)
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,CUT18445.1,gH2B,,,,1473204,Lilium davidii var. unicolor,Streptophyta,Magnoliopsida,,,MAPKSEKKPAEKKPVAEKPAAEEEKKSAPAPAAAEKKPAEKKPKAG...,,CUT18445.1,26846354
1,CUT18446.1,gH2B,,,,1473204,Lilium davidii var. unicolor,Streptophyta,Magnoliopsida,,,AEKKPKAGKKVPASKEGEKKKKRSKKSVETYKIYIFKVLKQVHPDI...,,CUT18446.1,26846354
2,CUT18447.1,gH2B,,,,1473204,Lilium davidii var. unicolor,Streptophyta,Magnoliopsida,,,MAPKAEKKPAAKKPAATPPPEEEKEVVPPPPAEKKPKAGKKLPAAK...,,CUT18447.1,26846354
3,CUT18448.1,gH2B,,,,1473204,Lilium davidii var. unicolor,Streptophyta,Magnoliopsida,,,AATPPPEEEKEVVPPPAEKKPAEKKPKAGKKLPASKEGDAKKKKKS...,,CUT18448.1,26846354
4,CUT18449.1,gH2B,,,,1473204,Lilium davidii var. unicolor,Streptophyta,Magnoliopsida,,,MPPRRKKTAAGAAAGGKAAAAAVGKAGFMPPKKPKKGKKKTPIMRY...,,CUT18449.1,26846354
5,CUT18450.1,gH2B,,,,1473204,Lilium davidii var. unicolor,Streptophyta,Magnoliopsida,,,MAPKKKPSKLVGTVTKTRKVTETQTLKVSLTKGLKPEDQQTTTNKF...,,CUT18450.1,26846354
6,CUT18450.1,gH2B,,,,1473204,Lilium davidii var. unicolor,Streptophyta,Magnoliopsida,,,MAPKKKPSKLVGTVTKTRKVTETQTLKVSLTKGLKPEDQQTTTNKF...,,CUT18450.1,32716939
7,CUT18451.1,gH2B,,,,1473204,Lilium davidii var. unicolor,Streptophyta,Magnoliopsida,,,MAPKAEKKPAAKKPAATPPPEEEKEVVPPPPAEKKPKAGKKLPAAK...,,CUT18451.1,26846354
8,CUT18452.1,gH2B,,,,1473204,Lilium davidii var. unicolor,Streptophyta,Magnoliopsida,,,MAPKSEKKPAEKKPVAEKPAAEEEKKAAPAAAPAEKKAAEKKPKA,,CUT18452.1,26846354


# Add publication ID for CUT18449.1 and CUT18450.1

In [26]:
# Look up the publication record that the sequences will be linked to.
query = "SELECT * FROM publication WHERE id='alvarez-venegas_canonical_2019'"
cursor.execute(query)
publication_rows = cursor.fetchall()
publication_columns = [column[0] for column in cursor.description]
pd.DataFrame(publication_rows, columns=publication_columns)

Unnamed: 0,id,title,doi,author,year
0,alvarez-venegas_canonical_2019,,,,


In [27]:
# Link both sequences to the same publication, one INSERT per accession.
for linked_accession in ("CUT18449.1", "CUT18450.1"):
    cursor.execute(
        add_sequence_has_publication,
        (linked_accession, "alvarez-venegas_canonical_2019"),
    )

In [28]:
# Re-read the gH2B rows to verify the publication links.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp"
    " ON s.accession = sp.sequence_accession"
    " WHERE s.variant='gH2B'"
)
cursor.execute(query)
fetched_rows = cursor.fetchall()
df = pd.DataFrame(
    fetched_rows,
    columns=[description[0] for description in cursor.description],
)
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,BAA96095.1,gH2B,,54145.0,H2BC12L,4690,Lilium longiflorum,Streptophyta,Magnoliopsida,,,MPPRRKKKAAAAAAAAAAAAAAAGKAAAGKDGKAGIMTPKKPKKGK...,,BAA96095.1,alvarez-venegas_canonical_2019
1,CUT18445.1,gH2B,,,,1473204,Lilium davidii var. unicolor,Streptophyta,Magnoliopsida,,,MAPKSEKKPAEKKPVAEKPAAEEEKKSAPAPAAAEKKPAEKKPKAG...,,CUT18445.1,26846354
2,CUT18446.1,gH2B,,,,1473204,Lilium davidii var. unicolor,Streptophyta,Magnoliopsida,,,AEKKPKAGKKVPASKEGEKKKKRSKKSVETYKIYIFKVLKQVHPDI...,,CUT18446.1,26846354
3,CUT18447.1,gH2B,,,,1473204,Lilium davidii var. unicolor,Streptophyta,Magnoliopsida,,,MAPKAEKKPAAKKPAATPPPEEEKEVVPPPPAEKKPKAGKKLPAAK...,,CUT18447.1,26846354
4,CUT18448.1,gH2B,,,,1473204,Lilium davidii var. unicolor,Streptophyta,Magnoliopsida,,,AATPPPEEEKEVVPPPAEKKPAEKKPKAGKKLPASKEGDAKKKKKS...,,CUT18448.1,26846354
5,CUT18449.1,gH2B,,,,1473204,Lilium davidii var. unicolor,Streptophyta,Magnoliopsida,,,MPPRRKKTAAGAAAGGKAAAAAVGKAGFMPPKKPKKGKKKTPIMRY...,,CUT18449.1,26846354
6,CUT18449.1,gH2B,,,,1473204,Lilium davidii var. unicolor,Streptophyta,Magnoliopsida,,,MPPRRKKTAAGAAAGGKAAAAAVGKAGFMPPKKPKKGKKKTPIMRY...,,CUT18449.1,alvarez-venegas_canonical_2019
7,CUT18450.1,gH2B,,,,1473204,Lilium davidii var. unicolor,Streptophyta,Magnoliopsida,,,MAPKKKPSKLVGTVTKTRKVTETQTLKVSLTKGLKPEDQQTTTNKF...,,CUT18450.1,26846354
8,CUT18450.1,gH2B,,,,1473204,Lilium davidii var. unicolor,Streptophyta,Magnoliopsida,,,MAPKKKPSKLVGTVTKTRKVTETQTLKVSLTKGLKPEDQQTTTNKF...,,CUT18450.1,32716939
9,CUT18450.1,gH2B,,,,1473204,Lilium davidii var. unicolor,Streptophyta,Magnoliopsida,,,MAPKKKPSKLVGTVTKTRKVTETQTLKVSLTKGLKPEDQQTTTNKF...,,CUT18450.1,alvarez-venegas_canonical_2019


# Add new sequence BAA96095.1

In [12]:
# Protein accession (with version) of the sequence fetched from NCBI and
# inserted into the `sequence` table in the cells below.
ACCESSION = "BAA96095.1"

## Get sequence from NCBI

In [13]:
# Download the GenBank-format entry for ACCESSION from the NCBI protein
# database and parse it into a single record.
fetch_options = {"db": "protein", "id": ACCESSION, "rettype": "gb", "retmode": "text"}
with Entrez.efetch(**fetch_options) as genbank_handle:
    record = SeqIO.read(genbank_handle, "genbank")
    print(record)

ID: BAA96095.1
Name: BAA96095
Description: gH2B [Lilium longiflorum]
Number of features: 7
/topology=linear
/data_file_division=PLN
/date=02-APR-2004
/accessions=['BAA96095']
/sequence_version=1
/db_source=accession AB003780.1
/keywords=['']
/source=Lilium longiflorum (trumpet lily)
/organism=Lilium longiflorum
/taxonomy=['Eukaryota', 'Viridiplantae', 'Streptophyta', 'Embryophyta', 'Tracheophyta', 'Spermatophyta', 'Magnoliopsida', 'Liliopsida', 'Liliales', 'Liliaceae', 'Lilium']
/references=[Reference(title='Unusual core histones specifically expressed in male gametic cells of Lilium longiflorum', ...), Reference(title='Direct Submission', ...)]
/molecule_type=protein
Seq('MPPRRKKKAAAAAAAAAAAAAAAGKAAAGKDGKAGIMTPKKPKKGKKKIPLMKY...QQT')


In [14]:
# Print the full sequence (the record summary above abbreviates it).
print(record.seq)

MPPRRKKKAAAAAAAAAAAAAAAGKAAAGKDGKAGIMTPKKPKKGKKKIPLMKYRVYIRRVLTQVRPELGISSKSMLIMNNFVVHNFQNIAKEASILAQYSKKKTITVKELKAAVKLVLPHQLLEYADRDGDRAVHNFESETSKKNSQGRKRGRGQQT


In [15]:
# Organism name from the record annotations; get_taxonomy_data reads the same field.
record.annotations["organism"]

'Lilium longiflorum'

In [16]:
# Organism, taxonomy id, phylum and class for the fetched record
# (get_taxonomy_data looks the lineage up in the NCBI taxonomy database).
taxonomy_data = get_taxonomy_data(record)
taxonomy_data

Fetched taxid from NCBI 4690


{'organism': 'Lilium longiflorum',
 'taxonomy_id': 4690,
 'phylum': 'Streptophyta',
 'class': 'Magnoliopsida'}

## Adding

In [17]:
# Values for one row of the `sequence` table, keyed by the named placeholders
# of add_sequence. The taxonomy fields start as placeholders and are
# overwritten below with the values in taxonomy_data.
data_sequence = {
    "accession": ACCESSION,
    "variant": "gH2B",
    "gi": None,
    "ncbi_gene_id": None,
    "hgnc_gene_name": None,
    # Fallback 1 = "unable to identify", the same convention get_taxonomy_data
    # uses. The previous placeholder 9606 (the human taxon id) would have
    # silently labelled this sequence as human whenever taxonomy_data did not
    # carry a taxonomy_id.
    "taxonomy_id": 1,
    "organism": None,
    "phylum": None,
    "class": None,
    "taxonomy_group": None,
    "info": None,
    "sequence": str(record.seq),
    "variant_under_consideration": None,
}
# Fill organism / taxonomy_id / phylum / class from the NCBI lookup.
data_sequence.update(taxonomy_data)
data_sequence

{'accession': 'BAA96095.1',
 'variant': 'gH2B',
 'gi': None,
 'ncbi_gene_id': 54145,
 'hgnc_gene_name': 'H2BC12L',
 'taxonomy_id': 4690,
 'organism': 'Lilium longiflorum',
 'phylum': 'Streptophyta',
 'class': 'Magnoliopsida',
 'taxonomy_group': None,
 'info': None,
 'sequence': 'MPPRRKKKAAAAAAAAAAAAAAAGKAAAGKDGKAGIMTPKKPKKGKKKIPLMKYRVYIRRVLTQVRPELGISSKSMLIMNNFVVHNFQNIAKEASILAQYSKKKTITVKELKAAVKLVLPHQLLEYADRDGDRAVHNFESETSKKNSQGRKRGRGQQT',
 'variant_under_consideration': None}

In [18]:
# Show the Python type of every value before it is bound in the INSERT.
for field_name, field_value in data_sequence.items():
    print(field_name, type(field_value))

accession <class 'str'>
variant <class 'str'>
gi <class 'NoneType'>
ncbi_gene_id <class 'int'>
hgnc_gene_name <class 'str'>
taxonomy_id <class 'int'>
organism <class 'str'>
phylum <class 'str'>
class <class 'str'>
taxonomy_group <class 'NoneType'>
info <class 'NoneType'>
sequence <class 'str'>
variant_under_consideration <class 'NoneType'>


In [20]:
# Insert the new row; the named placeholders of add_sequence are filled from
# the data_sequence dict. The change is committed later via conn.commit().
cursor.execute(add_sequence, data_sequence)

In [21]:
# Read back the inserted row together with its publication links.
# The accession is bound as a query parameter (same style as the INSERT
# templates) instead of being formatted into the SQL text.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession=%s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,BAA96095.1,gH2B,,54145,H2BC12L,4690,Lilium longiflorum,Streptophyta,Magnoliopsida,,,MPPRRKKKAAAAAAAAAAAAAAAGKAAAGKDGKAGIMTPKKPKKGK...,,,


## Add publication

In [22]:
# Publication that the new sequence will be linked to.
pid = "alvarez-venegas_canonical_2019"
# Look the publication up first; the id is bound as a query parameter rather
# than interpolated into the SQL string.
query = "SELECT * FROM publication WHERE id=%s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,alvarez-venegas_canonical_2019,,,,


In [23]:
# Link the new sequence (ACCESSION) to the publication identified by pid.
cursor.execute(add_sequence_has_publication, (ACCESSION, pid))

In [24]:
# Verify the publication link of the new sequence.
# The accession is passed as a bound parameter instead of an f-string.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession=%s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,BAA96095.1,gH2B,,54145,H2BC12L,4690,Lilium longiflorum,Streptophyta,Magnoliopsida,,,MPPRRKKKAAAAAAAAAAAAAAAGKAAAGKDGKAGIMTPKKPKKGK...,,BAA96095.1,alvarez-venegas_canonical_2019


In [30]:
# Overview of all gH2B rows and their publication links before committing.
query = (
    "SELECT * FROM sequence s "
    "LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.variant='gH2B'"
)
cursor.execute(query)
result_rows = cursor.fetchall()
result_columns = [column[0] for column in cursor.description]
df = pd.DataFrame(result_rows, columns=result_columns)
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,BAA96095.1,gH2B,,54145.0,H2BC12L,4690,Lilium longiflorum,Streptophyta,Magnoliopsida,,,MPPRRKKKAAAAAAAAAAAAAAAGKAAAGKDGKAGIMTPKKPKKGK...,,BAA96095.1,alvarez-venegas_canonical_2019
1,CUT18445.1,gH2B,,,,1473204,Lilium davidii var. unicolor,Streptophyta,Magnoliopsida,,,MAPKSEKKPAEKKPVAEKPAAEEEKKSAPAPAAAEKKPAEKKPKAG...,,CUT18445.1,26846354
2,CUT18446.1,gH2B,,,,1473204,Lilium davidii var. unicolor,Streptophyta,Magnoliopsida,,,AEKKPKAGKKVPASKEGEKKKKRSKKSVETYKIYIFKVLKQVHPDI...,,CUT18446.1,26846354
3,CUT18447.1,gH2B,,,,1473204,Lilium davidii var. unicolor,Streptophyta,Magnoliopsida,,,MAPKAEKKPAAKKPAATPPPEEEKEVVPPPPAEKKPKAGKKLPAAK...,,CUT18447.1,26846354
4,CUT18448.1,gH2B,,,,1473204,Lilium davidii var. unicolor,Streptophyta,Magnoliopsida,,,AATPPPEEEKEVVPPPAEKKPAEKKPKAGKKLPASKEGDAKKKKKS...,,CUT18448.1,26846354
5,CUT18449.1,gH2B,,,,1473204,Lilium davidii var. unicolor,Streptophyta,Magnoliopsida,,,MPPRRKKTAAGAAAGGKAAAAAVGKAGFMPPKKPKKGKKKTPIMRY...,,CUT18449.1,26846354
6,CUT18449.1,gH2B,,,,1473204,Lilium davidii var. unicolor,Streptophyta,Magnoliopsida,,,MPPRRKKTAAGAAAGGKAAAAAVGKAGFMPPKKPKKGKKKTPIMRY...,,CUT18449.1,alvarez-venegas_canonical_2019
7,CUT18450.1,gH2B,,,,1473204,Lilium davidii var. unicolor,Streptophyta,Magnoliopsida,,,MAPKKKPSKLVGTVTKTRKVTETQTLKVSLTKGLKPEDQQTTTNKF...,,CUT18450.1,26846354
8,CUT18450.1,gH2B,,,,1473204,Lilium davidii var. unicolor,Streptophyta,Magnoliopsida,,,MAPKKKPSKLVGTVTKTRKVTETQTLKVSLTKGLKPEDQQTTTNKF...,,CUT18450.1,32716939
9,CUT18450.1,gH2B,,,,1473204,Lilium davidii var. unicolor,Streptophyta,Magnoliopsida,,,MAPKKKPSKLVGTVTKTRKVTETQTLKVSLTKGLKPEDQQTTTNKF...,,CUT18450.1,alvarez-venegas_canonical_2019


In [31]:
# Commit the current transaction so that the changes executed on this
# connection so far are persisted in the database.
conn.commit()

# Edit other accessions

In [34]:
# Everything from index 3 onward: the accessions marked "?" in the list
# definition, i.e. those updated in the next cell.
g_h2b_accessions[3:]

['CUT18445.1',
 'CUT18446.1',
 'CUT18447.1',
 'CUT18448.1',
 'CUT18451.1',
 'CUT18452.1']

In [35]:
# Move the accessions from index 3 onward out of the confirmed gH2B variant:
# clear `variant` and record gH2B in `variant_under_consideration` instead.
# Values are bound as query parameters (as in the INSERT templates) rather
# than formatted into the SQL text.
query = (
    "UPDATE sequence SET variant=null, variant_under_consideration=%s "
    "WHERE accession=%s"
)
for accession in g_h2b_accessions[3:]:
    cursor.execute(query, ("gH2B", accession))

In [40]:
# Rows that are still classified as gH2B after the update.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp ON "
    "s.accession = sp.sequence_accession WHERE "
    "s.variant='gH2B'"
)
cursor.execute(query)
confirmed_rows = cursor.fetchall()
df = pd.DataFrame(
    confirmed_rows,
    columns=[column[0] for column in cursor.description],
)
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,BAA96095.1,gH2B,,,,4690,Lilium longiflorum,Streptophyta,Magnoliopsida,,,MPPRRKKKAAAAAAAAAAAAAAAGKAAAGKDGKAGIMTPKKPKKGK...,,BAA96095.1,alvarez-venegas_canonical_2019
1,CUT18449.1,gH2B,,,,1473204,Lilium davidii var. unicolor,Streptophyta,Magnoliopsida,,,MPPRRKKTAAGAAAGGKAAAAAVGKAGFMPPKKPKKGKKKTPIMRY...,,CUT18449.1,26846354
2,CUT18449.1,gH2B,,,,1473204,Lilium davidii var. unicolor,Streptophyta,Magnoliopsida,,,MPPRRKKTAAGAAAGGKAAAAAVGKAGFMPPKKPKKGKKKTPIMRY...,,CUT18449.1,alvarez-venegas_canonical_2019
3,CUT18450.1,gH2B,,,,1473204,Lilium davidii var. unicolor,Streptophyta,Magnoliopsida,,,MAPKKKPSKLVGTVTKTRKVTETQTLKVSLTKGLKPEDQQTTTNKF...,,CUT18450.1,26846354
4,CUT18450.1,gH2B,,,,1473204,Lilium davidii var. unicolor,Streptophyta,Magnoliopsida,,,MAPKKKPSKLVGTVTKTRKVTETQTLKVSLTKGLKPEDQQTTTNKF...,,CUT18450.1,32716939
5,CUT18450.1,gH2B,,,,1473204,Lilium davidii var. unicolor,Streptophyta,Magnoliopsida,,,MAPKKKPSKLVGTVTKTRKVTETQTLKVSLTKGLKPEDQQTTTNKF...,,CUT18450.1,alvarez-venegas_canonical_2019


In [37]:
# Rows whose gH2B assignment is now only "under consideration".
query = (
    "SELECT * FROM sequence s "
    "LEFT JOIN sequence_has_publication sp ON s.accession = sp.sequence_accession "
    "WHERE s.variant_under_consideration='gH2B'"
)
cursor.execute(query)
candidate_rows = cursor.fetchall()
candidate_columns = [column[0] for column in cursor.description]
df = pd.DataFrame(candidate_rows, columns=candidate_columns)
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,CUT18445.1,,,,,1473204,Lilium davidii var. unicolor,Streptophyta,Magnoliopsida,,,MAPKSEKKPAEKKPVAEKPAAEEEKKSAPAPAAAEKKPAEKKPKAG...,gH2B,CUT18445.1,26846354
1,CUT18446.1,,,,,1473204,Lilium davidii var. unicolor,Streptophyta,Magnoliopsida,,,AEKKPKAGKKVPASKEGEKKKKRSKKSVETYKIYIFKVLKQVHPDI...,gH2B,CUT18446.1,26846354
2,CUT18447.1,,,,,1473204,Lilium davidii var. unicolor,Streptophyta,Magnoliopsida,,,MAPKAEKKPAAKKPAATPPPEEEKEVVPPPPAEKKPKAGKKLPAAK...,gH2B,CUT18447.1,26846354
3,CUT18448.1,,,,,1473204,Lilium davidii var. unicolor,Streptophyta,Magnoliopsida,,,AATPPPEEEKEVVPPPAEKKPAEKKPKAGKKLPASKEGDAKKKKKS...,gH2B,CUT18448.1,26846354
4,CUT18451.1,,,,,1473204,Lilium davidii var. unicolor,Streptophyta,Magnoliopsida,,,MAPKAEKKPAAKKPAATPPPEEEKEVVPPPPAEKKPKAGKKLPAAK...,gH2B,CUT18451.1,26846354
5,CUT18452.1,,,,,1473204,Lilium davidii var. unicolor,Streptophyta,Magnoliopsida,,,MAPKSEKKPAEKKPVAEKPAAEEEKKAAPAAAPAEKKAAEKKPKA,gH2B,CUT18452.1,26846354


In [41]:
# Commit the current transaction so that the UPDATE statements executed above
# are persisted in the database.
conn.commit()

# Delete GBG59214.1 and GBG60584.1

We delete these sequences because, according to the [article](https://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.1008964), they are not H2B.S.

In [50]:
# Load every sequence row with its publication links, then show the two
# accessions that are about to be deleted.
query = (
    "SELECT * FROM sequence s "
    "LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
all_rows = cursor.fetchall()
df = pd.DataFrame(all_rows, columns=[column[0] for column in cursor.description])
accessions_to_delete = ["GBG59214.1", "GBG60584.1"]
df[df["accession"].isin(accessions_to_delete)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
68,GBG59214.1,H2B.S,,,,69332.0,Chara braunii,Streptophyta,Charophyceae,,,MAEGGYPLEAVAGDICGTSMDPLDPSGTVRRRSPRGDGPDDQGVGR...,,GBG59214.1,32716939
69,GBG60584.1,H2B.S,,,,69332.0,Chara braunii,Streptophyta,Charophyceae,,,MWNLLRMPPGQWSSRSAASSLPRQNGVVGVRRGRSAAVVVLEDSGD...,,GBG60584.1,32716939


## Delete relations with publications before deleting sequence records

In [51]:
# Remove the publication links of GBG59214.1.
cursor.execute(
    "DELETE FROM sequence_has_publication WHERE sequence_accession='GBG59214.1'"
)

In [52]:
# Remove the publication links of GBG60584.1.
cursor.execute(
    "DELETE FROM sequence_has_publication WHERE sequence_accession='GBG60584.1'"
)

In [53]:
# Reload the joined table and show the same two accessions again to check
# their publication columns after the link deletion.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp"
    " ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
joined_rows = cursor.fetchall()
joined_columns = [column[0] for column in cursor.description]
df = pd.DataFrame(joined_rows, columns=joined_columns)
is_deleted_accession = df["accession"].isin(["GBG59214.1", "GBG60584.1"])
df[is_deleted_accession]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
68,GBG59214.1,H2B.S,,,,69332.0,Chara braunii,Streptophyta,Charophyceae,,,MAEGGYPLEAVAGDICGTSMDPLDPSGTVRRRSPRGDGPDDQGVGR...,,,
69,GBG60584.1,H2B.S,,,,69332.0,Chara braunii,Streptophyta,Charophyceae,,,MWNLLRMPPGQWSSRSAASSLPRQNGVVGVRRGRSAAVVVLEDSGD...,,,


## Delete sequence records

In [54]:
# Delete the sequence record GBG59214.1 itself.
cursor.execute("DELETE FROM sequence WHERE accession='GBG59214.1'")

In [55]:
# Delete the sequence record GBG60584.1 itself.
cursor.execute("DELETE FROM sequence WHERE accession='GBG60584.1'")

In [56]:
# Reload the complete joined table into df (reused by the next cell) and
# filter for the two accessions that were just deleted.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp ON "
    "s.accession = sp.sequence_accession "
)
cursor.execute(query)
remaining_rows = cursor.fetchall()
df = pd.DataFrame(
    remaining_rows,
    columns=[column[0] for column in cursor.description],
)
deleted_accessions = ["GBG59214.1", "GBG60584.1"]
df[df["accession"].isin(deleted_accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id


In [57]:
# H2B.S rows in the frame loaded by the previous cell (after the deletions).
df[df["variant"]=="H2B.S"]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
28,CAF1924216.1,H2B.S,,,,3708.0,Brassica napus,Streptophyta,Magnoliopsida,,,MAPKKSKKVVSVTKKKKVVEETIKVTVTDGVPNVTTETDTQETQEL...,,CAF1924216.1,32716939
67,ESR37664.1,H2B.S,,,,85681.0,Citrus clementina,Streptophyta,Magnoliopsida,,,MPPRRSARVVLTKKVVTETVEVSVVNEKKKGKQEIAIHSEETLPSK...,,ESR37664.1,32716939
109,HISTDB_H2B_S_0,H2B.S,,,,264402.0,Capsella grandiflora,Streptophyta,Magnoliopsida,,,MAPRKPKVVSVTKKKTVVEETVKVTVAEGGDPNVTTEITENDQETQ...,,HISTDB_H2B_S_0,32716939
110,HISTDB_H2B_S_1,H2B.S,,,,72658.0,Boechera stricta,Streptophyta,Magnoliopsida,,,MAPRKPKVVSVTKKKKVVEETVKVTVTEGGDPNATTEITENDQETQ...,,HISTDB_H2B_S_1,32716939
111,HISTDB_H2B_S_10,H2B.S,,,,13216.0,Piper nigrum,Streptophyta,Magnoliopsida,,,MASTRQGRRNTPEVVSTVVKKKTTRKVVNETTIAAVAVVESNEPPI...,,HISTDB_H2B_S_10,32716939
112,HISTDB_H2B_S_2,H2B.S,,,,29656.0,Spirodela polyrhiza,Streptophyta,Magnoliopsida,,,MVRTTRKVVQETIEVSVVKEKDATAGRKKVVEVKVQDTTEMPQPQA...,,HISTDB_H2B_S_2,32716939
113,HISTDB_H2B_S_3,H2B.S,,,,38727.0,Panicum virgatum,Streptophyta,Magnoliopsida,,,MAPKRRGGGKVVGSVVKTKVVQETVEVTTAVVPDGEPEQRGTEALA...,,HISTDB_H2B_S_3,32716939
114,HISTDB_H2B_S_4,H2B.S,,,,4577.0,Zea mays,Streptophyta,Magnoliopsida,,,MAPKRRGNKVVGSVVKTKLVQETVEVIVADDDGLHAEKQQVPEALA...,,HISTDB_H2B_S_4,32716939
115,HISTDB_H2B_S_5,H2B.S,,,,1071399.0,Brachypodium stacei,Streptophyta,Magnoliopsida,,,MAPKRRGKQVVSSVVRKTTKVVKETVQVSTAAIVADDSTHPEYTEP...,,HISTDB_H2B_S_5,32716939
116,HISTDB_H2B_S_6,H2B.S,,,,4641.0,Musa acuminata,Streptophyta,Magnoliopsida,,,MAPKRTSRVLKTTKTVIEETVEVVVEAKDAQGPKEDLGEGKEAEPE...,,HISTDB_H2B_S_6,32716939


In [58]:
# Commit the current transaction so that the DELETE statements executed above
# are persisted in the database.
conn.commit()

# Close connections

In [59]:
# Release resources in reverse order of creation. The nested try/finally
# blocks make sure the connection is closed and the SSH tunnel is stopped
# even if an earlier close call raises.
try:
    try:
        cursor.close()
    finally:
        conn.close()
finally:
    tunnel.stop()