In [1]:
import pandas as pd
from Bio import Entrez, SeqIO
from mysql.connector import connection
from sshtunnel import SSHTunnelForwarder

# Contact e-mail set on the Bio.Entrez module.
# NOTE(review): presumably required so NCBI can identify the client; no Entrez
# call appears in the visible cells, so confirm it is still needed.
Entrez.email = "l.singh@intbio.org"

In [2]:
# Read the SSH/database connection settings from a plain KEY=VALUE text file.
# Blank lines and lines starting with "#" are skipped; everything after the
# first "=" is the value, so values may themselves contain "=".
with open("db_curated_server_info.txt", "r") as file:
    lines = file.readlines()

config = {}

for line_number, line in enumerate(lines, start=1):
    line = line.strip()
    if not line or line.startswith("#"):
        continue
    key, separator, value = line.partition("=")
    if not separator:
        # Still a ValueError, but one that names the offending line.
        raise ValueError(
            "db_curated_server_info.txt, line {}: expected KEY=VALUE, got {!r}".format(
                line_number, line
            )
        )
    # Strip the key as well as the value so that "db_port = 3306" is stored
    # under "db_port" rather than "db_port ".
    config[key.strip()] = value.strip()

# The port settings are converted with int(); fail early with a readable
# message when one is absent instead of a bare TypeError from int(None).
missing_ports = [name for name in ("srever_port", "db_port") if name not in config]
if missing_ports:
    raise KeyError(
        "db_curated_server_info.txt is missing required setting(s): "
        + ", ".join(missing_ports)
    )

# NOTE: the spellings "srever_port" and "db_adress" are kept as-is: they are
# the keys looked up in the config file and the variable names that the
# tunnel cell refers to.
server_name = config.get("server_name")
srever_port = int(config["srever_port"])
ssh_password = config.get("ssh_password")
ssh_username = config.get("ssh_username")
db_adress = config.get("db_adress")
db_port = int(config["db_port"])

In [3]:
# Open an SSH tunnel from this machine to the database host, using the
# settings loaded from the config file. No local bind address is given here.
ssh_endpoint = (server_name, srever_port)
db_endpoint = (db_adress, db_port)

tunnel = SSHTunnelForwarder(
    ssh_endpoint,
    ssh_username=ssh_username,
    ssh_password=ssh_password,
    remote_bind_address=db_endpoint,
)
tunnel.start()

# Show the local port of the tunnel; the MySQL connection is opened against it.
print(tunnel.local_bind_port)

39737


In [4]:
# Connect to MySQL through the local end of the SSH tunnel.
# The user, password and database name are read from the loaded config
# ("db_user", "db_password", "db_name") when it provides them, so real
# credentials do not need to be written into the notebook. If a key is absent,
# the value previously hard-coded in this cell is used, which keeps existing
# config files working unchanged.
conn = connection.MySQLConnection(
    user=config.get("db_user", "db_user"),
    password=config.get("db_password", "db_password"),
    host="localhost",
    port=tunnel.local_bind_port,
    database=config.get("db_name", "db_name"),
)
cursor = conn.cursor()

In [7]:
# List the tables available in the connected database.
query = "SHOW TABLES;"
cursor.execute(query)
tables = cursor.fetchall()
tables

[('alternative_name',),
 ('histone',),
 ('histone_description',),
 ('histone_has_publication',),
 ('publication',),
 ('sequence',),
 ('sequence_has_publication',)]

In [39]:
# add_histone = (
#     "INSERT INTO histone "
#     "(id, level, taxonomic_span, taxonomic_span_id, description, parent) "
#     "VALUES (%(id)s, %(level)s, %(taxonomic_span)s, %(taxonomic_span_id)s, %(description)s, %(parent)s)"
# )
# add_histone_description = (
#     "INSERT INTO histone_description "
#     "(summary, taxonomy, genes, evolution, expression, knock_out, function, sequence, localization, deposition, structure, interactions, disease, caveats) "
#     "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
# )
# add_publication = (
#     "INSERT INTO publication "
#     "(id, title, doi, author, year) "
#     "VALUES (%(id)s, %(title)s, %(doi)s, %(author)s, %(year)s)"
# )
# add_sequence_has_publication = (
#     "INSERT INTO sequence_has_publication "
#     "(sequence_accession, publication_id) "
#     "VALUES (%s, %s)"
# )
# add_alternate_names = (
#     "INSERT INTO alternative_name "
#     "(name, taxonomy, gene, splice, histone) "
#     "VALUES (%(name)s, %(taxonomy)s, %(gene)s, %(splice)s, %(histone)s)"
# )
# add_histone_has_publication = (
#     "INSERT INTO histone_has_publication "
#     "(histone_id, publication_id) "
#     "VALUES (%s, %s)"
# )

# Delete H3.6_(Homo_sapiens)

This is a pseudogene according to [this article](https://epigeneticsandchromatin.biomedcentral.com/articles/10.1186/s13072-022-00467-2).

In [8]:
# Inspect the H3.6_(Homo_sapiens) histone row together with its description
# record; the LEFT JOIN keeps the histone row even without a description.
query = (
    "SELECT * FROM histone h "
    "LEFT JOIN histone_description hd ON h.description = hd.id "
    "WHERE h.id='H3.6_(Homo_sapiens)'"
)
cursor.execute(query)
rows = cursor.fetchall()
# The first element of each cursor.description entry is used as the column label.
column_names = [column[0] for column in cursor.description]
pd.DataFrame(rows, columns=column_names)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
0,H3.6_(Homo_sapiens),variant_group,,,53,H3,53,H3.6_(Homo_sapiens) is a human histone variant...,,,...,,,,,,,H3.6 nucleosomes are substantially unstable du...,,,


In [9]:
# Show the H3.6_(Homo_sapiens) histone row with any publication links attached
# to it through histone_has_publication.
query = (
    "SELECT * FROM histone h "
    "LEFT JOIN histone_has_publication hp ON h.id = hp.histone_id "
    "WHERE h.id='H3.6_(Homo_sapiens)'"
)
cursor.execute(query)
records = cursor.fetchall()
headers = [field[0] for field in cursor.description]
pd.DataFrame(records, columns=headers)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
0,H3.6_(Homo_sapiens),variant_group,,,53,H3,H3.6_(Homo_sapiens),taguchi_crystal_2017


## Delete relationships first

In [19]:
# Remove the publication links that point at H3.6_(Homo_sapiens).
query = (
    "DELETE FROM histone_has_publication "
    "WHERE histone_id = 'H3.6_(Homo_sapiens)'"
)
cursor.execute(query)

In [16]:
# Detach H3.6_(Homo_sapiens) from its description record.
# NOTE(review): the histone_description row itself is not deleted in any
# visible cell - confirm that leaving it in place is intended.
query = (
    "UPDATE histone SET description=null "
    "WHERE id = 'H3.6_(Homo_sapiens)'"
)
print(query)
cursor.execute(query)

UPDATE histone SET description=null WHERE id = 'H3.6_(Homo_sapiens)'


## Delete H3.6_(Homo_sapiens)

In [19]:
# Delete the H3.6_(Homo_sapiens) row from the histone table.
query = (
    "DELETE FROM histone "
    "WHERE id = 'H3.6_(Homo_sapiens)'"
)
print(query)
cursor.execute(query)

In [22]:
# Look up H3.8_(Homo_sapiens) and any publication links attached to it.
query = (
    "SELECT * FROM histone h "
    "LEFT JOIN histone_has_publication hp ON h.id = hp.histone_id "
    "WHERE h.id='H3.8_(Homo_sapiens)'"
)
cursor.execute(query)
result_rows = cursor.fetchall()
result_columns = [entry[0] for entry in cursor.description]
pd.DataFrame(result_rows, columns=result_columns)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
0,H3.8_(Homo_sapiens),variant_group,,,55,H3,H3.8_(Homo_sapiens),taguchi_crystal_2017


In [21]:
# Commit on the connection so the statements executed so far are persisted
# in the database.
conn.commit()

# Delete H3.8_(Homo_sapiens)

This is a pseudogene according to [this article](https://epigeneticsandchromatin.biomedcentral.com/articles/10.1186/s13072-022-00467-2).

In [23]:
# Show the H3.8_(Homo_sapiens) histone row with its publication links before
# anything is removed.
query = (
    "SELECT * FROM histone h "
    "LEFT JOIN histone_has_publication hp ON h.id = hp.histone_id "
    "WHERE h.id='H3.8_(Homo_sapiens)'"
)
cursor.execute(query)
fetched = cursor.fetchall()
labels = [desc[0] for desc in cursor.description]
pd.DataFrame(fetched, columns=labels)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
0,H3.8_(Homo_sapiens),variant_group,,,55,H3,H3.8_(Homo_sapiens),taguchi_crystal_2017


## Delete relationships first

In [25]:
# Remove the publication links that point at H3.8_(Homo_sapiens).
query = (
    "DELETE FROM histone_has_publication "
    "WHERE histone_id = 'H3.8_(Homo_sapiens)'"
)
cursor.execute(query)

In [27]:
# Detach H3.8_(Homo_sapiens) from its description record.
# NOTE(review): the histone_description row itself is not deleted in any
# visible cell - confirm that leaving it in place is intended.
query = (
    "UPDATE histone SET description=null "
    "WHERE id = 'H3.8_(Homo_sapiens)'"
)
print(query)
cursor.execute(query)

UPDATE histone SET description=null WHERE id = 'H3.8_(Homo_sapiens)'


In [28]:
# Re-run the H3.8_(Homo_sapiens) lookup to check the state of the row and its
# publication links at this point.
query = (
    "SELECT * FROM histone h "
    "LEFT JOIN histone_has_publication hp ON h.id = hp.histone_id "
    "WHERE h.id='H3.8_(Homo_sapiens)'"
)
cursor.execute(query)
remaining = cursor.fetchall()
remaining_columns = [col[0] for col in cursor.description]
pd.DataFrame(remaining, columns=remaining_columns)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
0,H3.8_(Homo_sapiens),variant_group,,,,H3,,


## Delete H3.8_(Homo_sapiens)

In [29]:
# Delete the H3.8_(Homo_sapiens) row from the histone table.
query = (
    "DELETE FROM histone "
    "WHERE id = 'H3.8_(Homo_sapiens)'"
)
print(query)
cursor.execute(query)

DELETE FROM histone WHERE id = 'H3.8_(Homo_sapiens)'


In [30]:
# Query for H3.8_(Homo_sapiens) again; an empty frame means no histone row
# with that id is left.
query = (
    "SELECT * FROM histone h "
    "LEFT JOIN histone_has_publication hp ON h.id = hp.histone_id "
    "WHERE h.id='H3.8_(Homo_sapiens)'"
)
cursor.execute(query)
leftover_rows = cursor.fetchall()
leftover_columns = [item[0] for item in cursor.description]
pd.DataFrame(leftover_rows, columns=leftover_columns)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id


In [31]:
# Query for H3.6_(Homo_sapiens) again; an empty frame means no histone row
# with that id is left.
query = (
    "SELECT * FROM histone h "
    "LEFT JOIN histone_has_publication hp ON h.id = hp.histone_id "
    "WHERE h.id='H3.6_(Homo_sapiens)'"
)
cursor.execute(query)
check_rows = cursor.fetchall()
check_columns = [meta[0] for meta in cursor.description]
pd.DataFrame(check_rows, columns=check_columns)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id


In [32]:
# Commit on the connection so the statements executed so far are persisted
# in the database.
conn.commit()

# Close connections

In [33]:
# Release resources in reverse order of creation. The nested try/finally
# ensures the SSH tunnel is still stopped when closing the cursor or the
# connection raises, so a failed close does not leave the tunnel running.
try:
    cursor.close()
finally:
    try:
        conn.close()
    finally:
        tunnel.stop()