In [1]:
import pandas as pd
from Bio import Entrez, SeqIO
from mysql.connector import connection
from sshtunnel import SSHTunnelForwarder

# Contact e-mail address set as a module-level attribute on Biopython's Entrez module.
Entrez.email = "l.singh@intbio.org"

In [2]:
def _required_int(settings, key):
    """Return ``settings[key]`` as an int.

    Raises:
        KeyError: if ``key`` is absent, naming the missing setting instead of
            failing later with ``int(None)``.
        ValueError: if the stored text is not a valid integer.
    """
    if key not in settings:
        raise KeyError(f"'{key}' is missing from db_curated_server_info.txt")
    return int(settings[key])


# Read the connection settings from a simple ``key=value`` text file.
with open("db_curated_server_info.txt", "r") as file:
    lines = file.readlines()

config = {}

for line_number, line in enumerate(lines, start=1):
    line = line.strip()
    # Blank lines and '#' comment lines carry no settings.
    if not line or line.startswith("#"):
        continue
    # Split on the first '=' only, so values may themselves contain '='.
    key, separator, value = line.partition("=")
    if not separator:
        raise ValueError(
            f"db_curated_server_info.txt line {line_number}: expected 'key=value', got {line!r}"
        )
    # Strip the key as well as the value so 'key = value' is stored under 'key'.
    config[key.strip()] = value.strip()

# The misspelled names ('srever_port', 'db_adress') are the keys and variable
# names the rest of the notebook uses, so they are kept as-is.
server_name = config.get("server_name")
srever_port = _required_int(config, "srever_port")
ssh_password = config.get("ssh_password")
ssh_username = config.get("ssh_username")
db_adress = config.get("db_adress")
db_port = _required_int(config, "db_port")

In [3]:
# Open an SSH tunnel to the server and forward a local port to the database
# address/port read from the config file.
ssh_endpoint = (server_name, srever_port)
db_endpoint = (db_adress, db_port)
tunnel = SSHTunnelForwarder(
    ssh_endpoint,
    ssh_username=ssh_username,
    ssh_password=ssh_password,
    remote_bind_address=db_endpoint,
)
tunnel.start()
# The local port is needed by the MySQL connection below.
print(tunnel.local_bind_port)

33727


In [4]:
# Connect to MySQL through the local end of the SSH tunnel.
# NOTE(review): user, password and database are literal strings here —
# presumably redacted placeholders; confirm before running.
mysql_settings = {
    "user": "db_user",
    "password": "db_password",
    "host": "localhost",
    "port": tunnel.local_bind_port,
    "database": "db_name",
}
conn = connection.MySQLConnection(**mysql_settings)
cursor = conn.cursor()

In [5]:
# List the tables of the connected database.
query = "SHOW TABLES;"
cursor.execute(query)
table_rows = cursor.fetchall()
table_rows

[('alternative_name',),
 ('histone',),
 ('histone_description',),
 ('histone_has_publication',),
 ('publication',),
 ('sequence',),
 ('sequence_has_publication',)]

In [39]:
# INSERT templates for the curated database.
# Templates that are commented out are currently disabled; only
# `add_publication` and `add_histone_has_publication` are defined.

# add_histone = (
#     "INSERT INTO histone "
#     "(id, level, taxonomic_span, taxonomic_span_id, description, parent) "
#     "VALUES (%(id)s, %(level)s, %(taxonomic_span)s, %(taxonomic_span_id)s, %(description)s, %(parent)s)"
# )
# add_histone_description = (
#     "INSERT INTO histone_description "
#     "(summary, taxonomy, genes, evolution, expression, knock_out, function, sequence, localization, deposition, structure, interactions, disease, caveats) "
#     "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
# )

# Named placeholders: executed with a dict holding id/title/doi/author/year.
add_publication = " ".join(
    [
        "INSERT INTO publication",
        "(id, title, doi, author, year)",
        "VALUES (%(id)s, %(title)s, %(doi)s, %(author)s, %(year)s)",
    ]
)

# add_sequence_has_publication = (
#     "INSERT INTO sequence_has_publication "
#     "(sequence_accession, publication_id) "
#     "VALUES (%s, %s)"
# )
# add_alternate_names = (
#     "INSERT INTO alternative_name "
#     "(name, taxonomy, gene, splice, histone) "
#     "VALUES (%(name)s, %(taxonomy)s, %(gene)s, %(splice)s, %(histone)s)"
# )

# Positional placeholders: executed with a (histone_id, publication_id) tuple.
add_histone_has_publication = " ".join(
    [
        "INSERT INTO histone_has_publication",
        "(histone_id, publication_id)",
        "VALUES (%s, %s)",
    ]
)

# Update H3.6_(Homo_sapiens)

In [11]:
# Join every histone with its linked publications (LEFT JOIN keeps histones
# without any) and show the rows whose id contains "H3.6".
query = (
    "SELECT * FROM histone h LEFT JOIN histone_has_publication hp "
    "ON h.id = hp.histone_id"
)
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
# regex=False matches the literal text; by default str.contains treats the
# pattern as a regular expression, in which "." matches any character.
histone_df[histone_df["id"].str.contains("H3.6", regex=False)]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
362,H3.6_(Mammals?)?,variant_group,,,53,H3,,


In [12]:
# Rename the histone entry 'H3.6_(Mammals?)?' to 'H3.6_(Homo_sapiens)'.
# The statement is static, so a plain string literal is used.
query = "UPDATE histone SET id='H3.6_(Homo_sapiens)' WHERE id='H3.6_(Mammals?)?'"
cursor.execute(query)

In [17]:
# Re-run the histone / publication join and show the rows whose id contains
# "H3.6" to check the rename.
query = (
    "SELECT * FROM histone h LEFT JOIN histone_has_publication hp "
    "ON h.id = hp.histone_id"
)
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
# regex=False matches the literal text; by default str.contains treats the
# pattern as a regular expression, in which "." matches any character.
histone_df[histone_df["id"].str.contains("H3.6", regex=False)]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
362,H3.6_(Homo_sapiens),variant_group,,,53,H3,,


In [15]:
# Display the current contents of histone_df (the frame built by the last query cell that was run).
histone_df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
0,Archaeal,type,,,1,,,
1,cenH3_(Animals),variant,Homo sapiens,9606,94,cenH3_(Eukarya),,
2,cenH3_(Eukarya),variant_group,,,46,H3,,
3,cenH3_(Fungi),variant,Homo sapiens,9606,95,cenH3_(Eukarya),,
4,cenH3_(Homo_sapiens),variant,Homo sapiens,9606,158,cenH3_(Mammalia),,
...,...,...,...,...,...,...,...,...
401,TS H3.10,variant,,,139,H3.3-like_(Plants),,
402,TS_H1.6,variant_group,Mammalia,40674,19,H1,TS_H1.6,22650316
403,TS_H1.7,variant_group,Mammalia,40674,20,H1,TS_H1.7,22650316
404,TS_H1.9,variant_group,Mammalia,40674,21,H1,TS_H1.9,22650316


In [13]:
# Commit the pending transaction on `conn` so the UPDATE of the H3.6 histone id
# is stored in the database.
conn.commit()

## Update description of H3.6_(Homo_sapiens) (id=53)

In [16]:
# List the column names of histone_description.
# Only the result-set metadata is needed, so LIMIT 0 avoids transferring every
# row of the table just to discard it.
query = "SELECT * FROM histone_description LIMIT 0"
cursor.execute(query)
# Drain the (empty) result set, as the original cell did, before reading the metadata.
cursor.fetchall()
[i[0] for i in cursor.description]

['id',
 'summary',
 'taxonomy',
 'genes',
 'evolution',
 'expression',
 'knock_out',
 'function',
 'sequence',
 'localization',
 'deposition',
 'structure',
 'interactions',
 'disease',
 'caveats']

In [29]:
# Curated text for H3.6_(Homo_sapiens). The summary ends with the same
# sentence that is stored on its own in the structure field.
structure = (
    "H3.6 nucleosomes are substantially unstable due to the Val62 residue, "
    "which weakens interactions with H4 [taguchi_crystal_2017]."
)
summary = (
    "H3.6_(Homo_sapiens) is a human histone variant, similar to histone variant "
    "H3.3 in amino acid sequence, encoded by the H3F3AP6 gene. Expression levels "
    "of H3F3AP6 are extremely low, as compared to H3F3B gene (encodes histone "
    "H3.3) [taguchi_crystal_2017]. " + structure
)

In [30]:
# Store the curated summary and structure text in histone_description row 53.
# The values are passed as query parameters instead of being formatted into
# the SQL text, so quotes or other special characters in the prose cannot
# break or alter the statement.
query = "UPDATE histone_description SET summary=%s, structure=%s WHERE id=%s"
cursor.execute(query, (summary, structure, 53))

In [31]:
# Join every histone with its description row and show the rows whose id
# contains "H3.6" to check the updated text.
query = (
    "SELECT h.*, d.summary, d.taxonomy, d.genes, d.evolution, d.expression, d.knock_out, d.function, d.sequence, d.localization, d.deposition, d.structure, d.interactions, d.disease, d.caveats "
    "FROM histone h LEFT JOIN histone_description d "
    "ON h.description = d.id"
)
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
# regex=False matches the literal text; by default str.contains treats the
# pattern as a regular expression, in which "." matches any character.
histone_df[histone_df["id"].str.contains("H3.6", regex=False)]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,summary,taxonomy,genes,evolution,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
172,H3.6_(Homo_sapiens),variant_group,,,53,H3,H3.6_(Homo_sapiens) is a human histone variant...,,,,,,,,,,H3.6 nucleosomes are substantially unstable du...,,,


In [32]:
# Commit the pending transaction on `conn` so the updated H3.6 description text
# is stored in the database.
conn.commit()

## Update publications of H3.6_(Homo_sapiens)

In [33]:
# Load the histone-to-publication link table and show the links that belong
# to H3.6_(Homo_sapiens).
query = "SELECT * FROM histone_has_publication"
cursor.execute(query)
link_rows = cursor.fetchall()
link_columns = [column[0] for column in cursor.description]
histone_df = pd.DataFrame(link_rows, columns=link_columns)
histone_df.loc[histone_df["histone_id"].eq("H3.6_(Homo_sapiens)")]

Unnamed: 0,histone_id,publication_id


In [34]:
# Look up the taguchi_crystal_2017 row in the publication table.
query = "SELECT * FROM publication WHERE id='taguchi_crystal_2017'"
cursor.execute(query)
publication_rows = cursor.fetchall()
publication_columns = [column[0] for column in cursor.description]
pd.DataFrame(publication_rows, columns=publication_columns)

Unnamed: 0,id,title,doi,author,year


In [37]:
# Insert a publication row that carries only the id; every other field is
# None, which the connector sends as NULL.
publication_fields = ("title", "doi", "author", "year")
data_publication = {"id": "taguchi_crystal_2017", **dict.fromkeys(publication_fields)}
cursor.execute(add_publication, data_publication)

In [38]:
# Read the taguchi_crystal_2017 publication row back to check the insert.
query = "SELECT * FROM publication WHERE id='taguchi_crystal_2017'"
cursor.execute(query)
publication_rows = cursor.fetchall()
publication_columns = [column[0] for column in cursor.description]
pd.DataFrame(publication_rows, columns=publication_columns)

Unnamed: 0,id,title,doi,author,year
0,taguchi_crystal_2017,,,,


In [41]:
# Link H3.6_(Homo_sapiens) to the taguchi_crystal_2017 publication.
histone_publication_link = ("H3.6_(Homo_sapiens)", "taguchi_crystal_2017")
cursor.execute(add_histone_has_publication, histone_publication_link)

In [42]:
# Reload the histone-to-publication link table and show the links that now
# belong to H3.6_(Homo_sapiens).
query = "SELECT * FROM histone_has_publication"
cursor.execute(query)
link_rows = cursor.fetchall()
link_columns = [column[0] for column in cursor.description]
histone_df = pd.DataFrame(link_rows, columns=link_columns)
histone_df.loc[histone_df["histone_id"].eq("H3.6_(Homo_sapiens)")]

Unnamed: 0,histone_id,publication_id
286,H3.6_(Homo_sapiens),taguchi_crystal_2017


In [43]:
# Commit the pending transaction on `conn` so the new publication row and the
# H3.6 histone-publication link are stored in the database.
conn.commit()

In [44]:
# Final check for H3.6: join histones with their publications and show the
# rows whose id contains "H3.6".
query = (
    "SELECT * FROM histone h LEFT JOIN histone_has_publication hp "
    "ON h.id = hp.histone_id"
)
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
# regex=False matches the literal text; by default str.contains treats the
# pattern as a regular expression, in which "." matches any character.
histone_df[histone_df["id"].str.contains("H3.6", regex=False)]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
362,H3.6_(Homo_sapiens),variant_group,,,53,H3,H3.6_(Homo_sapiens),taguchi_crystal_2017


# Update H3.7_(Homo_sapiens)

In [45]:
# Join every histone with its linked publications (LEFT JOIN keeps histones
# without any) and show the rows whose id contains "H3.7".
query = (
    "SELECT * FROM histone h LEFT JOIN histone_has_publication hp "
    "ON h.id = hp.histone_id"
)
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
# regex=False matches the literal text; by default str.contains treats the
# pattern as a regular expression, in which "." matches any character.
histone_df[histone_df["id"].str.contains("H3.7", regex=False)]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
363,H3.7_(Mammals?)?,variant_group,,,54,H3,,


In [46]:
# Rename the histone entry 'H3.7_(Mammals?)?' to 'H3.7_(Homo_sapiens)'.
# The statement is static, so a plain string literal is used.
query = "UPDATE histone SET id='H3.7_(Homo_sapiens)' WHERE id='H3.7_(Mammals?)?'"
cursor.execute(query)

In [47]:
# Re-run the histone / publication join and show the rows whose id contains
# "H3.7" to check the rename.
query = (
    "SELECT * FROM histone h LEFT JOIN histone_has_publication hp "
    "ON h.id = hp.histone_id"
)
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
# regex=False matches the literal text; by default str.contains treats the
# pattern as a regular expression, in which "." matches any character.
histone_df[histone_df["id"].str.contains("H3.7", regex=False)]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
363,H3.7_(Homo_sapiens),variant_group,,,54,H3,,


In [48]:
# Commit the pending transaction on `conn` so the UPDATE of the H3.7 histone id
# is stored in the database.
conn.commit()

## Update description of H3.7_(Homo_sapiens) (id=54)

In [49]:
# Curated text for H3.7_(Homo_sapiens). The summary ends with the same
# sentence that is stored on its own in the structure field.
structure = (
    "In vitro results showed that H3.7 failed to form nucleosomes "
    "[taguchi_crystal_2017]."
)
summary = (
    "H3.7_(Homo_sapiens) is a human histone variant, similar to histone variant "
    "H3.1 in amino acid sequence, encoded by the HIST2H3PS2 gene. Expression "
    "levels of HIST2H3PS2 are extremely low, as compared to H3F3B gene (encodes "
    "histone H3.3) [taguchi_crystal_2017]. " + structure
)

In [50]:
# Store the curated summary and structure text in histone_description row 54.
# The values are passed as query parameters instead of being formatted into
# the SQL text, so quotes or other special characters in the prose cannot
# break or alter the statement.
query = "UPDATE histone_description SET summary=%s, structure=%s WHERE id=%s"
cursor.execute(query, (summary, structure, 54))

In [51]:
# Join every histone with its description row and show the rows whose id
# contains "H3.7" to check the updated text.
query = (
    "SELECT h.*, d.summary, d.taxonomy, d.genes, d.evolution, d.expression, d.knock_out, d.function, d.sequence, d.localization, d.deposition, d.structure, d.interactions, d.disease, d.caveats "
    "FROM histone h LEFT JOIN histone_description d "
    "ON h.description = d.id"
)
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
# regex=False matches the literal text; by default str.contains treats the
# pattern as a regular expression, in which "." matches any character.
histone_df[histone_df["id"].str.contains("H3.7", regex=False)]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,summary,taxonomy,genes,evolution,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
173,H3.7_(Homo_sapiens),variant_group,,,54,H3,H3.7_(Homo_sapiens) is a human histone variant...,,,,,,,,,,In vitro results showed that H3.7 failed to fo...,,,


In [52]:
# Commit the pending transaction on `conn` so the updated H3.7 description text
# is stored in the database.
conn.commit()

## Update publications of H3.7_(Homo_sapiens)

In [54]:
# Load the histone-to-publication link table and show the links that belong
# to H3.7_(Homo_sapiens).
query = "SELECT * FROM histone_has_publication"
cursor.execute(query)
link_rows = cursor.fetchall()
link_columns = [column[0] for column in cursor.description]
histone_df = pd.DataFrame(link_rows, columns=link_columns)
histone_df.loc[histone_df["histone_id"].eq("H3.7_(Homo_sapiens)")]

Unnamed: 0,histone_id,publication_id


In [55]:
# Check that the taguchi_crystal_2017 publication row exists before linking it.
query = "SELECT * FROM publication WHERE id='taguchi_crystal_2017'"
cursor.execute(query)
publication_rows = cursor.fetchall()
publication_columns = [column[0] for column in cursor.description]
pd.DataFrame(publication_rows, columns=publication_columns)

Unnamed: 0,id,title,doi,author,year
0,taguchi_crystal_2017,,,,


In [56]:
# Link H3.7_(Homo_sapiens) to the taguchi_crystal_2017 publication.
histone_publication_link = ("H3.7_(Homo_sapiens)", "taguchi_crystal_2017")
cursor.execute(add_histone_has_publication, histone_publication_link)

In [57]:
# Reload the histone-to-publication link table and show the links that now
# belong to H3.7_(Homo_sapiens).
query = "SELECT * FROM histone_has_publication"
cursor.execute(query)
link_rows = cursor.fetchall()
link_columns = [column[0] for column in cursor.description]
histone_df = pd.DataFrame(link_rows, columns=link_columns)
histone_df.loc[histone_df["histone_id"].eq("H3.7_(Homo_sapiens)")]

Unnamed: 0,histone_id,publication_id
287,H3.7_(Homo_sapiens),taguchi_crystal_2017


In [58]:
# Commit the pending transaction on `conn` so the H3.7 histone-publication link
# is stored in the database.
conn.commit()

In [59]:
# Final check for H3.7: join histones with their publications and show the
# rows whose id contains "H3.7".
query = (
    "SELECT * FROM histone h LEFT JOIN histone_has_publication hp "
    "ON h.id = hp.histone_id"
)
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
# regex=False matches the literal text; by default str.contains treats the
# pattern as a regular expression, in which "." matches any character.
histone_df[histone_df["id"].str.contains("H3.7", regex=False)]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
363,H3.7_(Homo_sapiens),variant_group,,,54,H3,H3.7_(Homo_sapiens),taguchi_crystal_2017


# Update H3.8_(Homo_sapiens)

In [60]:
# Join every histone with its linked publications (LEFT JOIN keeps histones
# without any) and show the rows whose id contains "H3.8".
query = (
    "SELECT * FROM histone h LEFT JOIN histone_has_publication hp "
    "ON h.id = hp.histone_id"
)
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
# regex=False matches the literal text; by default str.contains treats the
# pattern as a regular expression, in which "." matches any character.
histone_df[histone_df["id"].str.contains("H3.8", regex=False)]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
364,H3.8_(Mammals?)?,variant_group,,,55,H3,,


In [61]:
# Rename the histone entry 'H3.8_(Mammals?)?' to 'H3.8_(Homo_sapiens)'.
# The statement is static, so a plain string literal is used.
query = "UPDATE histone SET id='H3.8_(Homo_sapiens)' WHERE id='H3.8_(Mammals?)?'"
cursor.execute(query)

In [62]:
# Re-run the histone / publication join and show the rows whose id contains
# "H3.8" to check the rename.
query = (
    "SELECT * FROM histone h LEFT JOIN histone_has_publication hp "
    "ON h.id = hp.histone_id"
)
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
# regex=False matches the literal text; by default str.contains treats the
# pattern as a regular expression, in which "." matches any character.
histone_df[histone_df["id"].str.contains("H3.8", regex=False)]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
364,H3.8_(Homo_sapiens),variant_group,,,55,H3,,


In [63]:
# Commit the pending transaction on `conn` so the UPDATE of the H3.8 histone id
# is stored in the database.
conn.commit()

## Update description of H3.8_(Homo_sapiens) (id=55)

In [64]:
# Curated text for H3.8_(Homo_sapiens). The summary ends with the same
# sentence that is stored on its own in the structure field.
structure = "H3.8 nucleosomes are extremely unstable [taguchi_crystal_2017]."
summary = (
    "H3.8_(Homo_sapiens) is a human histone variant, similar to histone variant "
    "H3.3 in amino acid sequence, encoded by the H3F3AP5 gene. Expression levels "
    "of H3F3AP5 are extremely low, as compared to H3F3B gene (encodes histone "
    "H3.3) [taguchi_crystal_2017]. " + structure
)

In [65]:
# Store the curated summary and structure text in histone_description row 55.
# The values are passed as query parameters instead of being formatted into
# the SQL text, so quotes or other special characters in the prose cannot
# break or alter the statement.
query = "UPDATE histone_description SET summary=%s, structure=%s WHERE id=%s"
cursor.execute(query, (summary, structure, 55))

In [66]:
# Join every histone with its description row and show the rows whose id
# contains "H3.8" to check the updated text.
query = (
    "SELECT h.*, d.summary, d.taxonomy, d.genes, d.evolution, d.expression, d.knock_out, d.function, d.sequence, d.localization, d.deposition, d.structure, d.interactions, d.disease, d.caveats "
    "FROM histone h LEFT JOIN histone_description d "
    "ON h.description = d.id"
)
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
# regex=False matches the literal text; by default str.contains treats the
# pattern as a regular expression, in which "." matches any character.
histone_df[histone_df["id"].str.contains("H3.8", regex=False)]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,summary,taxonomy,genes,evolution,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
174,H3.8_(Homo_sapiens),variant_group,,,55,H3,H3.8_(Homo_sapiens) is a human histone variant...,,,,,,,,,,H3.8 nucleosomes are extremely unstable [taguc...,,,


In [67]:
# Commit the pending transaction on `conn` so the updated H3.8 description text
# is stored in the database.
conn.commit()

## Update publications of H3.8_(Homo_sapiens)

In [68]:
# Load the histone-to-publication link table and show the links that belong
# to H3.8_(Homo_sapiens).
query = "SELECT * FROM histone_has_publication"
cursor.execute(query)
link_rows = cursor.fetchall()
link_columns = [column[0] for column in cursor.description]
histone_df = pd.DataFrame(link_rows, columns=link_columns)
histone_df.loc[histone_df["histone_id"].eq("H3.8_(Homo_sapiens)")]

Unnamed: 0,histone_id,publication_id


In [69]:
# Check that the taguchi_crystal_2017 publication row exists before linking it.
query = "SELECT * FROM publication WHERE id='taguchi_crystal_2017'"
cursor.execute(query)
publication_rows = cursor.fetchall()
publication_columns = [column[0] for column in cursor.description]
pd.DataFrame(publication_rows, columns=publication_columns)

Unnamed: 0,id,title,doi,author,year
0,taguchi_crystal_2017,,,,


In [70]:
# Link H3.8_(Homo_sapiens) to the taguchi_crystal_2017 publication.
histone_publication_link = ("H3.8_(Homo_sapiens)", "taguchi_crystal_2017")
cursor.execute(add_histone_has_publication, histone_publication_link)

In [71]:
# Reload the histone-to-publication link table and show the links that now
# belong to H3.8_(Homo_sapiens).
query = "SELECT * FROM histone_has_publication"
cursor.execute(query)
link_rows = cursor.fetchall()
link_columns = [column[0] for column in cursor.description]
histone_df = pd.DataFrame(link_rows, columns=link_columns)
histone_df.loc[histone_df["histone_id"].eq("H3.8_(Homo_sapiens)")]

Unnamed: 0,histone_id,publication_id
288,H3.8_(Homo_sapiens),taguchi_crystal_2017


In [72]:
# Commit the pending transaction on `conn` so the H3.8 histone-publication link
# is stored in the database.
conn.commit()

In [73]:
# Final check for H3.8: join histones with their publications and show the
# rows whose id contains "H3.8".
query = (
    "SELECT * FROM histone h LEFT JOIN histone_has_publication hp "
    "ON h.id = hp.histone_id"
)
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
# regex=False matches the literal text; by default str.contains treats the
# pattern as a regular expression, in which "." matches any character.
histone_df[histone_df["id"].str.contains("H3.8", regex=False)]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
364,H3.8_(Homo_sapiens),variant_group,,,55,H3,H3.8_(Homo_sapiens),taguchi_crystal_2017


# Close connections

In [74]:
# Release resources in reverse order of creation: cursor, connection, tunnel.
# The nested try/finally blocks make sure a failure while closing the cursor
# or the connection does not leave the SSH tunnel running.
try:
    cursor.close()
finally:
    try:
        conn.close()
    finally:
        tunnel.stop()