In [1]:
import pandas as pd
from Bio import Entrez, SeqIO
from mysql.connector import connection
from sshtunnel import SSHTunnelForwarder

# Contact address sent along with NCBI Entrez requests made through Biopython.
Entrez.email = "l.singh@intbio.org"

In [2]:
# Read the SSH / database connection settings from a plain KEY=VALUE file.
# Blank lines and lines starting with "#" are ignored.
with open("db_curated_server_info.txt", "r") as file:
    lines = file.readlines()

config = {}

for line_no, raw_line in enumerate(lines, start=1):
    line = raw_line.strip()
    if not line or line.startswith("#"):
        continue
    key, separator, value = line.partition("=")
    if not separator:
        # Report the offending line number instead of an opaque unpacking error.
        raise ValueError(
            f"db_curated_server_info.txt, line {line_no}: expected KEY=VALUE"
        )
    # Strip both sides so "key = value" is accepted as well as "key=value".
    config[key.strip()] = value.strip()

# Both port settings go through int(); fail with a readable message rather
# than the TypeError that int(None) would raise for a missing key.
for port_key in ("srever_port", "db_port"):
    if port_key not in config:
        raise KeyError(
            f"db_curated_server_info.txt: missing required key {port_key!r}"
        )

# NOTE: the misspelt names "srever_port" and "db_adress" are kept on purpose:
# they are the keys looked up in the config file and the variable names that
# the tunnel cell refers to.
server_name = config.get("server_name")
srever_port = int(config.get("srever_port"))
ssh_password = config.get("ssh_password")
ssh_username = config.get("ssh_username")
db_adress = config.get("db_adress")
db_port = int(config.get("db_port"))

In [3]:
# Open an SSH tunnel to the database host and print the local port it binds.
ssh_gateway = (server_name, srever_port)
remote_db = (db_adress, db_port)
tunnel = SSHTunnelForwarder(
    ssh_gateway,
    ssh_username=ssh_username,
    ssh_password=ssh_password,
    remote_bind_address=remote_db,
)
tunnel.start()
print(tunnel.local_bind_port)

37653


In [4]:
# Connect to MySQL through the local end of the SSH tunnel.
# NOTE(review): user / password / database look like placeholder literals;
# confirm, and keep real credentials out of the notebook.
mysql_settings = {
    "user": "db_user",
    "password": "db_password",
    "host": "localhost",
    "port": tunnel.local_bind_port,
    "database": "db_name",
}
conn = connection.MySQLConnection(**mysql_settings)
cursor = conn.cursor()

In [5]:
# List the tables of the connected database as a quick connectivity check.
query = "SHOW TABLES;"
cursor.execute(query)
cursor.fetchall()

[('alternative_name',),
 ('histone',),
 ('histone_description',),
 ('histone_has_publication',),
 ('publication',),
 ('sequence',),
 ('sequence_has_publication',)]

In [106]:
# INSERT statement templates for the curated database tables.
# Templates use connector placeholders (%s / %(name)s) so values are passed
# separately to cursor.execute(). Only add_publication and
# add_histone_has_publication are active; the rest are kept commented out.
# NOTE(review): add_publication is not used in the visible part of the notebook.
# add_histone = (
#     "INSERT INTO histone "
#     "(id, level, taxonomic_span, taxonomic_span_id, description, parent) "
#     "VALUES (%(id)s, %(level)s, %(taxonomic_span)s, %(taxonomic_span_id)s, %(description)s, %(parent)s)"
# )
# add_histone_description = (
#     "INSERT INTO histone_description "
#     "(summary, taxonomy, genes, evolution, expression, knock_out, function, sequence, localization, deposition, structure, interactions, disease, caveats) "
#     "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
# )
add_publication = (
    "INSERT INTO publication "
    "(id, title, doi, author, year) "
    "VALUES (%(id)s, %(title)s, %(doi)s, %(author)s, %(year)s)"
)
# add_sequence_has_publication = (
#     "INSERT INTO sequence_has_publication "
#     "(sequence_accession, publication_id) "
#     "VALUES (%s, %s)"
# )
# add_alternate_names = (
#     "INSERT INTO alternative_name "
#     "(name, taxonomy, gene, splice, histone) "
#     "VALUES (%(name)s, %(taxonomy)s, %(gene)s, %(splice)s, %(histone)s)"
# )
# Links one histone id to one publication id (positional parameters).
add_histone_has_publication = (
    "INSERT INTO histone_has_publication "
    "(histone_id, publication_id) "
    "VALUES (%s, %s)"
)

# Change name for some variants

gH2A → gH2A_(Lilium)

gH2B → gH2B_(Lilium)

H2B.S → H2B.S_(Magnoliopsida)

H2A.M → H2A.M_(Viridiplantae)

H2A.W → H2A.W_(Magnoliopsida)

## Change name from gH2A to gH2A_(Lilium)

In [8]:
# Show the gH2A row together with its publication links.
query = (
    "SELECT * "
    "FROM histone h "
    "LEFT JOIN histone_has_publication hp ON h.id = hp.histone_id "
    "WHERE h.id='gH2A'"
)
cursor.execute(query)
pd.DataFrame(
    cursor.fetchall(),
    columns=[col[0] for col in cursor.description],
)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
0,gH2A,variant_group,Lilium,4688,23,H2A,gH2A,alvarez-venegas_canonical_2019
1,gH2A,variant_group,Lilium,4688,23,H2A,gH2A,ueda_male_2005


In [7]:
# Show the gH2A row together with its description record.
query = (
    "SELECT * "
    "FROM histone h "
    "LEFT JOIN histone_description hd ON h.description = hd.id "
    "WHERE h.id='gH2A'"
)
cursor.execute(query)
pd.DataFrame(
    cursor.fetchall(),
    columns=[col[0] for col in cursor.description],
)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
0,gH2A,variant_group,Lilium,4688,23,H2A,23,gH2A is a male-gamete-specific variant found i...,Suggested to be a distinctive variant that evl...,gH2A gene in Lilium longiflorum. The gene is i...,...,,,This histone variant is expected to be specifi...,,,,,,,


### Save publications and description id

In [11]:
# Remember which publications are linked to gH2A so they can be re-linked
# after the rename.
query = (
    "SELECT * "
    "FROM histone h "
    "LEFT JOIN histone_has_publication hp ON h.id = hp.histone_id "
    "WHERE h.id='gH2A'"
)
cursor.execute(query)
histone_pub_links = pd.DataFrame(
    cursor.fetchall(),
    columns=[col[0] for col in cursor.description],
)
publications = histone_pub_links["publication_id"].values
publications

array(['alvarez-venegas_canonical_2019', 'ueda_male_2005'], dtype=object)

In [14]:
# Remember the description id referenced by gH2A so it can be restored
# after the rename.
query = (
    "SELECT * "
    "FROM histone h "
    "LEFT JOIN histone_description hd ON h.description = hd.id "
    "WHERE h.id='gH2A'"
)
cursor.execute(query)
histone_with_desc = pd.DataFrame(
    cursor.fetchall(),
    columns=[col[0] for col in cursor.description],
)
desc_id = histone_with_desc["description"].values[0]
desc_id

23

### Delete relations

In [16]:
# Remove the publication links of gH2A; they are re-created under the new id
# from the saved `publications` array further down.
# NOTE(review): presumably required because histone_id references histone.id -- confirm against the schema.
query = "DELETE FROM histone_has_publication WHERE histone_id = 'gH2A'"
print(query)
cursor.execute(query)

DELETE FROM histone_has_publication WHERE histone_id = 'gH2A'


In [17]:
# Temporarily clear the description reference of gH2A; the saved `desc_id`
# is written back once the row has its new id.
query = "UPDATE histone SET description=null WHERE id = 'gH2A'"
print(query)
cursor.execute(query)

UPDATE histone SET description=null WHERE id = 'gH2A'


In [18]:
# Check that gH2A no longer has a description reference or publication links.
query = (
    "SELECT * "
    "FROM histone h "
    "LEFT JOIN histone_has_publication hp ON h.id = hp.histone_id "
    "WHERE h.id='gH2A'"
)
cursor.execute(query)
pd.DataFrame(
    cursor.fetchall(),
    columns=[col[0] for col in cursor.description],
)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
0,gH2A,variant_group,Lilium,4688,,H2A,,


### Update name

In [20]:
# Rename the histone id from gH2A to gH2A_(Lilium).
query = "UPDATE histone SET id='gH2A_(Lilium)' WHERE id = 'gH2A'"
print(query)
cursor.execute(query)

UPDATE histone SET id='gH2A_(Lilium)' WHERE id = 'gH2A'


In [22]:
# Check that the row is now reachable under its new id.
query = (
    "SELECT * "
    "FROM histone h "
    "LEFT JOIN histone_has_publication hp ON h.id = hp.histone_id "
    "WHERE h.id='gH2A_(Lilium)'"
)
cursor.execute(query)
pd.DataFrame(
    cursor.fetchall(),
    columns=[col[0] for col in cursor.description],
)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
0,gH2A_(Lilium),variant_group,Lilium,4688,,H2A,,


### Restore relations

In [25]:
# Re-attach the saved description id to the renamed histone row.
query = f"UPDATE histone SET description={desc_id} WHERE id = 'gH2A_(Lilium)'"
print(query)
cursor.execute(query)

UPDATE histone SET description=23 WHERE id = 'gH2A_(Lilium)'


In [27]:
# Check that gH2A_(Lilium) points at its description record again.
query = (
    "SELECT * "
    "FROM histone h "
    "LEFT JOIN histone_description hd ON h.description = hd.id "
    "WHERE h.id='gH2A_(Lilium)'"
)
cursor.execute(query)
pd.DataFrame(
    cursor.fetchall(),
    columns=[col[0] for col in cursor.description],
)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
0,gH2A_(Lilium),variant_group,Lilium,4688,23,H2A,23,gH2A is a male-gamete-specific variant found i...,Suggested to be a distinctive variant that evl...,gH2A gene in Lilium longiflorum. The gene is i...,...,,,This histone variant is expected to be specifi...,,,,,,,


In [29]:
# Build the new summary text: the first mention of the old name is replaced
# by the new name.
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    "WHERE h.id='gH2A_(Lilium)'"
)
cursor.execute(query)
current_summary = pd.DataFrame(
    cursor.fetchall(), columns=[i[0] for i in cursor.description]
)["summary"].values[0]
# "gH2A" is a prefix of "gH2A_(Lilium)", so re-running this cell after the
# summary was already updated would yield "gH2A_(Lilium)_(Lilium)".
# Only rewrite when the first occurrence of the old name is not already the
# new name.
first_hit = current_summary.find("gH2A")
if first_hit != -1 and not current_summary.startswith("gH2A_(Lilium)", first_hit):
    summary = current_summary.replace("gH2A", "gH2A_(Lilium)", 1)
else:
    summary = current_summary
summary

'gH2A_(Lilium) is a male-gamete-specific variant found in the genus Lilium [alvarez-venegas_canonical_2019,ueda_male_2005].'

In [33]:
# Store the rewritten summary. The values are passed as parameters so that
# quotes or other special characters in the free-text summary cannot break
# (or alter) the SQL statement.
query = "UPDATE histone_description SET summary=%s WHERE id = %s"
# desc_id comes out of a pandas column as a numpy scalar; convert it to a
# plain int, which the connector can bind as a parameter.
summary_params = (summary, int(desc_id))
print(query, summary_params)
cursor.execute(query, summary_params)

UPDATE histone_description SET summary='gH2A_(Lilium) is a male-gamete-specific variant found in the genus Lilium [alvarez-venegas_canonical_2019,ueda_male_2005].' WHERE id = '23'


In [34]:
# Check the stored summary of gH2A_(Lilium) after the update.
query = (
    "SELECT * "
    "FROM histone h "
    "LEFT JOIN histone_description hd ON h.description = hd.id "
    "WHERE h.id='gH2A_(Lilium)'"
)
cursor.execute(query)
pd.DataFrame(
    cursor.fetchall(),
    columns=[col[0] for col in cursor.description],
)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
0,gH2A_(Lilium),variant_group,Lilium,4688,23,H2A,23,gH2A_(Lilium) is a male-gamete-specific varian...,Suggested to be a distinctive variant that evl...,gH2A gene in Lilium longiflorum. The gene is i...,...,,,This histone variant is expected to be specifi...,,,,,,,


In [35]:
# Re-create the publication links under the new histone id.
query = "SELECT id FROM publication"
cursor.execute(query)
# A set gives direct membership tests for the known publication ids.
exist_pubs = {row[0] for row in cursor.fetchall()}
for pid in publications:
    if pid not in exist_pubs:
        # Unknown (or NULL) publication id: report it and do not insert a
        # link that would point at a non-existent publication.
        print(f"Strange {pid}")
        continue
    cursor.execute(add_histone_has_publication, ("gH2A_(Lilium)", pid))

In [36]:
# Check that the publication links exist again under gH2A_(Lilium).
query = (
    "SELECT * "
    "FROM histone h "
    "LEFT JOIN histone_has_publication hp ON h.id = hp.histone_id "
    "WHERE h.id='gH2A_(Lilium)'"
)
cursor.execute(query)
pd.DataFrame(
    cursor.fetchall(),
    columns=[col[0] for col in cursor.description],
)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
0,gH2A_(Lilium),variant_group,Lilium,4688,23,H2A,gH2A_(Lilium),alvarez-venegas_canonical_2019
1,gH2A_(Lilium),variant_group,Lilium,4688,23,H2A,gH2A_(Lilium),ueda_male_2005


In [37]:
# Commit the pending statements of the gH2A -> gH2A_(Lilium) rename so they
# are stored permanently in the database.
conn.commit()

## Change name from gH2B to gH2B_(Lilium)

In [38]:
# Old and new histone id used by every cell of this section.
prev_name = "gH2B"
new_name = "gH2B_(Lilium)"

In [39]:
# Show the row for prev_name together with its publication links.
query = (
    "SELECT * "
    "FROM histone h "
    "LEFT JOIN histone_has_publication hp ON h.id = hp.histone_id "
    f"WHERE h.id='{prev_name}'"
)
cursor.execute(query)
pd.DataFrame(
    cursor.fetchall(),
    columns=[col[0] for col in cursor.description],
)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
0,gH2B,variant_group,Lilium,4688,35,H2B,gH2B,alvarez-venegas_canonical_2019
1,gH2B,variant_group,Lilium,4688,35,H2B,gH2B,jiang_evolution_2020
2,gH2B,variant_group,Lilium,4688,35,H2B,gH2B,ueda_unusual_2000
3,gH2B,variant_group,Lilium,4688,35,H2B,gH2B,yang_proteomic_2016


In [40]:
# Show the row for prev_name together with its description record.
query = (
    "SELECT * "
    "FROM histone h "
    "LEFT JOIN histone_description hd ON h.description = hd.id "
    f"WHERE h.id='{prev_name}'"
)
cursor.execute(query)
pd.DataFrame(
    cursor.fetchall(),
    columns=[col[0] for col in cursor.description],
)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
0,gH2B,variant_group,Lilium,4688,35,H2B,35,gH2B is a group of plant H2B variants found in...,,,...,,,,,,,,,,


### Save publications and description id

In [41]:
# Remember the publication ids linked to prev_name so they can be re-linked
# to new_name later.
query = (
    "SELECT * "
    "FROM histone h "
    "LEFT JOIN histone_has_publication hp ON h.id = hp.histone_id "
    f"WHERE h.id='{prev_name}'"
)
cursor.execute(query)
histone_pub_links = pd.DataFrame(
    cursor.fetchall(),
    columns=[col[0] for col in cursor.description],
)
publications = histone_pub_links["publication_id"].values
publications

array(['alvarez-venegas_canonical_2019', 'jiang_evolution_2020',
       'ueda_unusual_2000', 'yang_proteomic_2016'], dtype=object)

In [42]:
# Remember the description id referenced by prev_name so it can be restored
# on new_name later.
query = (
    "SELECT * "
    "FROM histone h "
    "LEFT JOIN histone_description hd ON h.description = hd.id "
    f"WHERE h.id='{prev_name}'"
)
cursor.execute(query)
histone_with_desc = pd.DataFrame(
    cursor.fetchall(),
    columns=[col[0] for col in cursor.description],
)
desc_id = histone_with_desc["description"].values[0]
desc_id

35

### Delete relations

In [43]:
# Remove the publication links of prev_name; they are re-created for
# new_name from the saved `publications` array further down.
# NOTE(review): presumably required because histone_id references histone.id -- confirm against the schema.
query = f"DELETE FROM histone_has_publication WHERE histone_id = '{prev_name}'"
print(query)
cursor.execute(query)

DELETE FROM histone_has_publication WHERE histone_id = 'gH2B'


In [44]:
# Temporarily clear the description reference of prev_name; the saved
# `desc_id` is written back once the row has its new id.
query = f"UPDATE histone SET description=null WHERE id = '{prev_name}'"
print(query)
cursor.execute(query)

UPDATE histone SET description=null WHERE id = 'gH2B'


In [45]:
# Check that prev_name no longer has a description reference or publication links.
query = (
    "SELECT * "
    "FROM histone h "
    "LEFT JOIN histone_has_publication hp ON h.id = hp.histone_id "
    f"WHERE h.id='{prev_name}'"
)
cursor.execute(query)
pd.DataFrame(
    cursor.fetchall(),
    columns=[col[0] for col in cursor.description],
)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
0,gH2B,variant_group,Lilium,4688,,H2B,,


### Update name

In [46]:
# Rename the histone id from prev_name to new_name.
query = f"UPDATE histone SET id='{new_name}' WHERE id = '{prev_name}'"
print(query)
cursor.execute(query)

UPDATE histone SET id='gH2B_(Lilium)' WHERE id = 'gH2B'


In [47]:
# Check that the row is now reachable under new_name.
query = (
    "SELECT * "
    "FROM histone h "
    "LEFT JOIN histone_has_publication hp ON h.id = hp.histone_id "
    f"WHERE h.id='{new_name}'"
)
cursor.execute(query)
pd.DataFrame(
    cursor.fetchall(),
    columns=[col[0] for col in cursor.description],
)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
0,gH2B_(Lilium),variant_group,Lilium,4688,,H2B,,


### Restore relations

In [48]:
# Re-attach the saved description id to the renamed histone row.
query = f"UPDATE histone SET description={desc_id} WHERE id = '{new_name}'"
print(query)
cursor.execute(query)

UPDATE histone SET description=35 WHERE id = 'gH2B_(Lilium)'


In [49]:
# Check that new_name points at its description record again.
query = (
    "SELECT * "
    "FROM histone h "
    "LEFT JOIN histone_description hd ON h.description = hd.id "
    f"WHERE h.id='{new_name}'"
)
cursor.execute(query)
pd.DataFrame(
    cursor.fetchall(),
    columns=[col[0] for col in cursor.description],
)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
0,gH2B_(Lilium),variant_group,Lilium,4688,35,H2B,35,gH2B is a group of plant H2B variants found in...,,,...,,,,,,,,,,


In [50]:
# Build the new summary text: the first mention of prev_name is replaced by
# new_name.
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    f"WHERE h.id='{new_name}'"
)
cursor.execute(query)
current_summary = pd.DataFrame(
    cursor.fetchall(), columns=[i[0] for i in cursor.description]
)["summary"].values[0]
# prev_name is a prefix of new_name here, so re-running this cell after the
# summary was already updated would append the suffix a second time.
# Only rewrite when the first occurrence of prev_name is not already new_name.
first_hit = current_summary.find(prev_name)
if first_hit != -1 and not current_summary.startswith(new_name, first_hit):
    summary = current_summary.replace(prev_name, new_name, 1)
else:
    summary = current_summary
summary

'gH2B_(Lilium) is a group of plant H2B variants found in Lilium that are highly divergent from the canonical H2B and are expressed in the generative cell of the bicellular pollen where it may be necessary for chromatin remodeling of the male germline [alvarez-venegas_canonical_2019,yang_proteomic_2016,ueda_unusual_2000]. The subvariants so far identified are named gH2B in Lilium longiflorum, mgH2B in Lilium davidii, and mgH2B.in in Lilium davidii. These subvariants are rather different and are grouped due to lack of further information. Some phylogenetic reconstructions cluster these variant together with H2B.S [alvarez-venegas_canonical_2019], however, this may be likely to long branch attraction, and the exact phylogeny remains to be studied [jiang_evolution_2020].'

In [51]:
# Store the rewritten summary. The values are passed as parameters so that
# quotes or other special characters in the free-text summary cannot break
# (or alter) the SQL statement.
query = "UPDATE histone_description SET summary=%s WHERE id = %s"
# desc_id comes out of a pandas column as a numpy scalar; convert it to a
# plain int, which the connector can bind as a parameter.
summary_params = (summary, int(desc_id))
print(query, summary_params)
cursor.execute(query, summary_params)

UPDATE histone_description SET summary='gH2B_(Lilium) is a group of plant H2B variants found in Lilium that are highly divergent from the canonical H2B and are expressed in the generative cell of the bicellular pollen where it may be necessary for chromatin remodeling of the male germline [alvarez-venegas_canonical_2019,yang_proteomic_2016,ueda_unusual_2000]. The subvariants so far identified are named gH2B in Lilium longiflorum, mgH2B in Lilium davidii, and mgH2B.in in Lilium davidii. These subvariants are rather different and are grouped due to lack of further information. Some phylogenetic reconstructions cluster these variant together with H2B.S [alvarez-venegas_canonical_2019], however, this may be likely to long branch attraction, and the exact phylogeny remains to be studied [jiang_evolution_2020].' WHERE id = '35'


In [52]:
# Check the stored summary of new_name after the update.
query = (
    "SELECT * "
    "FROM histone h "
    "LEFT JOIN histone_description hd ON h.description = hd.id "
    f"WHERE h.id='{new_name}'"
)
cursor.execute(query)
pd.DataFrame(
    cursor.fetchall(),
    columns=[col[0] for col in cursor.description],
)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
0,gH2B_(Lilium),variant_group,Lilium,4688,35,H2B,35,gH2B_(Lilium) is a group of plant H2B variants...,,,...,,,,,,,,,,


In [53]:
# Re-create the publication links under the new histone id.
query = "SELECT id FROM publication"
cursor.execute(query)
# A set gives direct membership tests for the known publication ids.
exist_pubs = {row[0] for row in cursor.fetchall()}
for pid in publications:
    if pid not in exist_pubs:
        # Unknown (or NULL) publication id: report it and do not insert a
        # link that would point at a non-existent publication.
        print(f"Strange {pid}")
        continue
    cursor.execute(add_histone_has_publication, (new_name, pid))

In [54]:
# Check that the publication links exist again under new_name.
query = (
    "SELECT * "
    "FROM histone h "
    "LEFT JOIN histone_has_publication hp ON h.id = hp.histone_id "
    f"WHERE h.id='{new_name}'"
)
cursor.execute(query)
pd.DataFrame(
    cursor.fetchall(),
    columns=[col[0] for col in cursor.description],
)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
0,gH2B_(Lilium),variant_group,Lilium,4688,35,H2B,gH2B_(Lilium),alvarez-venegas_canonical_2019
1,gH2B_(Lilium),variant_group,Lilium,4688,35,H2B,gH2B_(Lilium),jiang_evolution_2020
2,gH2B_(Lilium),variant_group,Lilium,4688,35,H2B,gH2B_(Lilium),ueda_unusual_2000
3,gH2B_(Lilium),variant_group,Lilium,4688,35,H2B,gH2B_(Lilium),yang_proteomic_2016


In [55]:
# Commit the pending statements of the gH2B -> gH2B_(Lilium) rename so they
# are stored permanently in the database.
conn.commit()

## Change name from H2B.S to H2B.S_(Magnoliopsida)

In [56]:
# Old and new histone id used by every cell of this section.
prev_name = "H2B.S"
new_name = "H2B.S_(Magnoliopsida)"

In [57]:
# Show the row for prev_name together with its publication links.
query = (
    "SELECT * "
    "FROM histone h "
    "LEFT JOIN histone_has_publication hp ON h.id = hp.histone_id "
    f"WHERE h.id='{prev_name}'"
)
cursor.execute(query)
pd.DataFrame(
    cursor.fetchall(),
    columns=[col[0] for col in cursor.description],
)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
0,H2B.S,variant_group,Magnoliopsida,3398,40,H2B,H2B.S,jiang_evolution_2020


In [58]:
# Show the row for prev_name together with its description record.
query = (
    "SELECT * "
    "FROM histone h "
    "LEFT JOIN histone_description hd ON h.description = hd.id "
    f"WHERE h.id='{prev_name}'"
)
cursor.execute(query)
pd.DataFrame(
    cursor.fetchall(),
    columns=[col[0] for col in cursor.description],
)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
0,H2B.S,variant_group,Magnoliopsida,3398,40,H2B,40,H2B.S is a class a new class of highly diverg...,Flowering plants (angiosperms) [jiang_evolutio...,"HTB8 gene in Arabidopsis, Solyc06g074750.1 in ...",...,Arabidopsis HTB8 is specifically expressed in ...,,For Arabidopsis an adaptive function in cell t...,The angiosperm-specific clade of Arabidopsis H...,,,"Two HTB8 residues, Arg152 and Met179, were hig...",,,


### Save publications and description id

In [59]:
# Remember the publication ids linked to prev_name so they can be re-linked
# to new_name later.
query = (
    "SELECT * "
    "FROM histone h "
    "LEFT JOIN histone_has_publication hp ON h.id = hp.histone_id "
    f"WHERE h.id='{prev_name}'"
)
cursor.execute(query)
histone_pub_links = pd.DataFrame(
    cursor.fetchall(),
    columns=[col[0] for col in cursor.description],
)
publications = histone_pub_links["publication_id"].values
publications

array(['jiang_evolution_2020'], dtype=object)

In [60]:
# Remember the description id referenced by prev_name so it can be restored
# on new_name later.
query = (
    "SELECT * "
    "FROM histone h "
    "LEFT JOIN histone_description hd ON h.description = hd.id "
    f"WHERE h.id='{prev_name}'"
)
cursor.execute(query)
histone_with_desc = pd.DataFrame(
    cursor.fetchall(),
    columns=[col[0] for col in cursor.description],
)
desc_id = histone_with_desc["description"].values[0]
desc_id

40

### Delete relations

In [61]:
# Remove the publication links of prev_name; they are re-created for
# new_name from the saved `publications` array further down.
# NOTE(review): presumably required because histone_id references histone.id -- confirm against the schema.
query = f"DELETE FROM histone_has_publication WHERE histone_id = '{prev_name}'"
print(query)
cursor.execute(query)

DELETE FROM histone_has_publication WHERE histone_id = 'H2B.S'


In [62]:
# Temporarily clear the description reference of prev_name; the saved
# `desc_id` is written back once the row has its new id.
query = f"UPDATE histone SET description=null WHERE id = '{prev_name}'"
print(query)
cursor.execute(query)

UPDATE histone SET description=null WHERE id = 'H2B.S'


In [63]:
# Check that prev_name no longer has a description reference or publication links.
query = (
    "SELECT * "
    "FROM histone h "
    "LEFT JOIN histone_has_publication hp ON h.id = hp.histone_id "
    f"WHERE h.id='{prev_name}'"
)
cursor.execute(query)
pd.DataFrame(
    cursor.fetchall(),
    columns=[col[0] for col in cursor.description],
)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
0,H2B.S,variant_group,Magnoliopsida,3398,,H2B,,


### Update name

In [64]:
# Rename the histone id from prev_name to new_name.
query = f"UPDATE histone SET id='{new_name}' WHERE id = '{prev_name}'"
print(query)
cursor.execute(query)

UPDATE histone SET id='H2B.S_(Magnoliopsida)' WHERE id = 'H2B.S'


In [65]:
# Check that the row is now reachable under new_name.
query = (
    "SELECT * "
    "FROM histone h "
    "LEFT JOIN histone_has_publication hp ON h.id = hp.histone_id "
    f"WHERE h.id='{new_name}'"
)
cursor.execute(query)
pd.DataFrame(
    cursor.fetchall(),
    columns=[col[0] for col in cursor.description],
)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
0,H2B.S_(Magnoliopsida),variant_group,Magnoliopsida,3398,,H2B,,


### Restore relations

In [66]:
# Re-attach the saved description id to the renamed histone row.
query = f"UPDATE histone SET description={desc_id} WHERE id = '{new_name}'"
print(query)
cursor.execute(query)

UPDATE histone SET description=40 WHERE id = 'H2B.S_(Magnoliopsida)'


In [67]:
# Check that new_name points at its description record again.
query = (
    "SELECT * "
    "FROM histone h "
    "LEFT JOIN histone_description hd ON h.description = hd.id "
    f"WHERE h.id='{new_name}'"
)
cursor.execute(query)
pd.DataFrame(
    cursor.fetchall(),
    columns=[col[0] for col in cursor.description],
)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
0,H2B.S_(Magnoliopsida),variant_group,Magnoliopsida,3398,40,H2B,40,H2B.S is a class a new class of highly diverg...,Flowering plants (angiosperms) [jiang_evolutio...,"HTB8 gene in Arabidopsis, Solyc06g074750.1 in ...",...,Arabidopsis HTB8 is specifically expressed in ...,,For Arabidopsis an adaptive function in cell t...,The angiosperm-specific clade of Arabidopsis H...,,,"Two HTB8 residues, Arg152 and Met179, were hig...",,,


In [68]:
# Build the new summary text: the first mention of prev_name is replaced by
# new_name.
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    f"WHERE h.id='{new_name}'"
)
cursor.execute(query)
current_summary = pd.DataFrame(
    cursor.fetchall(), columns=[i[0] for i in cursor.description]
)["summary"].values[0]
# prev_name is a prefix of new_name here, so re-running this cell after the
# summary was already updated would append the suffix a second time.
# Only rewrite when the first occurrence of prev_name is not already new_name.
first_hit = current_summary.find(prev_name)
if first_hit != -1 and not current_summary.startswith(new_name, first_hit):
    summary = current_summary.replace(prev_name, new_name, 1)
else:
    summary = current_summary
summary

'H2B.S_(Magnoliopsida)  is a class a new class of highly divergent H2B variants identified by Jiang et al. that specifically accumulate during chromatin compaction of dry seed embryos in multiple species of flowering plants [jiang_evolution_2020].'

In [69]:
# Store the rewritten summary. The values are passed as parameters so that
# quotes or other special characters in the free-text summary cannot break
# (or alter) the SQL statement.
query = "UPDATE histone_description SET summary=%s WHERE id = %s"
# desc_id comes out of a pandas column as a numpy scalar; convert it to a
# plain int, which the connector can bind as a parameter.
summary_params = (summary, int(desc_id))
print(query, summary_params)
cursor.execute(query, summary_params)

UPDATE histone_description SET summary='H2B.S_(Magnoliopsida)  is a class a new class of highly divergent H2B variants identified by Jiang et al. that specifically accumulate during chromatin compaction of dry seed embryos in multiple species of flowering plants [jiang_evolution_2020].' WHERE id = '40'


In [70]:
# Check the stored summary of new_name after the update.
query = (
    "SELECT * "
    "FROM histone h "
    "LEFT JOIN histone_description hd ON h.description = hd.id "
    f"WHERE h.id='{new_name}'"
)
cursor.execute(query)
pd.DataFrame(
    cursor.fetchall(),
    columns=[col[0] for col in cursor.description],
)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
0,H2B.S_(Magnoliopsida),variant_group,Magnoliopsida,3398,40,H2B,40,H2B.S_(Magnoliopsida) is a class a new class ...,Flowering plants (angiosperms) [jiang_evolutio...,"HTB8 gene in Arabidopsis, Solyc06g074750.1 in ...",...,Arabidopsis HTB8 is specifically expressed in ...,,For Arabidopsis an adaptive function in cell t...,The angiosperm-specific clade of Arabidopsis H...,,,"Two HTB8 residues, Arg152 and Met179, were hig...",,,


In [71]:
# Re-create the publication links under the new histone id.
query = "SELECT id FROM publication"
cursor.execute(query)
# A set gives direct membership tests for the known publication ids.
exist_pubs = {row[0] for row in cursor.fetchall()}
for pid in publications:
    if pid not in exist_pubs:
        # Unknown (or NULL) publication id: report it and do not insert a
        # link that would point at a non-existent publication.
        print(f"Strange {pid}")
        continue
    cursor.execute(add_histone_has_publication, (new_name, pid))

In [72]:
# Check that the publication links exist again under new_name.
query = (
    "SELECT * "
    "FROM histone h "
    "LEFT JOIN histone_has_publication hp ON h.id = hp.histone_id "
    f"WHERE h.id='{new_name}'"
)
cursor.execute(query)
pd.DataFrame(
    cursor.fetchall(),
    columns=[col[0] for col in cursor.description],
)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
0,H2B.S_(Magnoliopsida),variant_group,Magnoliopsida,3398,40,H2B,H2B.S_(Magnoliopsida),jiang_evolution_2020


In [73]:
# Commit the pending statements of the H2B.S -> H2B.S_(Magnoliopsida) rename
# so they are stored permanently in the database.
conn.commit()

## Change name from H2A.M to H2A.M_(Viridiplantae)

In [74]:
# Old and new histone id used by every cell of this section.
prev_name = "H2A.M"
new_name = "H2A.M_(Viridiplantae)"

In [75]:
# Show the row for prev_name together with its publication links.
query = (
    "SELECT * "
    "FROM histone h "
    "LEFT JOIN histone_has_publication hp ON h.id = hp.histone_id "
    f"WHERE h.id='{prev_name}'"
)
cursor.execute(query)
pd.DataFrame(
    cursor.fetchall(),
    columns=[col[0] for col in cursor.description],
)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
0,H2A.M,variant_group,Viridiplantae,33090,25,H2A,H2A.M,kawashima_diversification_2015


In [76]:
# Show the row for prev_name together with its description record.
query = (
    "SELECT * "
    "FROM histone h "
    "LEFT JOIN histone_description hd ON h.description = hd.id "
    f"WHERE h.id='{prev_name}'"
)
cursor.execute(query)
pd.DataFrame(
    cursor.fetchall(),
    columns=[col[0] for col in cursor.description],
)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
0,H2A.M,variant_group,Viridiplantae,33090,25,H2A,25,H2A.M is a plant specific variant related to H...,H2A.M was characterized in genomes of liverwor...,,...,,,,"In the L1 loop, all H2A.M variants share the m...",,,,,,


### Save publications and description id

In [77]:
# Remember the publication ids linked to prev_name so they can be re-linked
# to new_name later.
query = (
    "SELECT * "
    "FROM histone h "
    "LEFT JOIN histone_has_publication hp ON h.id = hp.histone_id "
    f"WHERE h.id='{prev_name}'"
)
cursor.execute(query)
histone_pub_links = pd.DataFrame(
    cursor.fetchall(),
    columns=[col[0] for col in cursor.description],
)
publications = histone_pub_links["publication_id"].values
publications

array(['kawashima_diversification_2015'], dtype=object)

In [78]:
# Remember the description id referenced by prev_name so it can be restored
# on new_name later.
query = (
    "SELECT * "
    "FROM histone h "
    "LEFT JOIN histone_description hd ON h.description = hd.id "
    f"WHERE h.id='{prev_name}'"
)
cursor.execute(query)
histone_with_desc = pd.DataFrame(
    cursor.fetchall(),
    columns=[col[0] for col in cursor.description],
)
desc_id = histone_with_desc["description"].values[0]
desc_id

25

### Delete relations

In [79]:
# Remove the publication links of prev_name; they are re-created for
# new_name from the saved `publications` array further down.
# NOTE(review): presumably required because histone_id references histone.id -- confirm against the schema.
query = f"DELETE FROM histone_has_publication WHERE histone_id = '{prev_name}'"
print(query)
cursor.execute(query)

DELETE FROM histone_has_publication WHERE histone_id = 'H2A.M'


In [80]:
# Temporarily clear the description reference of prev_name; the saved
# `desc_id` is written back once the row has its new id.
query = f"UPDATE histone SET description=null WHERE id = '{prev_name}'"
print(query)
cursor.execute(query)

UPDATE histone SET description=null WHERE id = 'H2A.M'


In [81]:
# Check that prev_name no longer has a description reference or publication links.
query = (
    "SELECT * "
    "FROM histone h "
    "LEFT JOIN histone_has_publication hp ON h.id = hp.histone_id "
    f"WHERE h.id='{prev_name}'"
)
cursor.execute(query)
pd.DataFrame(
    cursor.fetchall(),
    columns=[col[0] for col in cursor.description],
)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
0,H2A.M,variant_group,Viridiplantae,33090,,H2A,,


### Update name

In [82]:
# Rename the histone id from prev_name to new_name.
query = f"UPDATE histone SET id='{new_name}' WHERE id = '{prev_name}'"
print(query)
cursor.execute(query)

UPDATE histone SET id='H2A.M_(Viridiplantae)' WHERE id = 'H2A.M'


In [83]:
# Check that the row is now reachable under new_name.
query = (
    "SELECT * "
    "FROM histone h "
    "LEFT JOIN histone_has_publication hp ON h.id = hp.histone_id "
    f"WHERE h.id='{new_name}'"
)
cursor.execute(query)
pd.DataFrame(
    cursor.fetchall(),
    columns=[col[0] for col in cursor.description],
)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
0,H2A.M_(Viridiplantae),variant_group,Viridiplantae,33090,,H2A,,


### Restore relations

In [84]:
# Re-attach the saved description id to the renamed histone row.
query = f"UPDATE histone SET description={desc_id} WHERE id = '{new_name}'"
print(query)
cursor.execute(query)

UPDATE histone SET description=25 WHERE id = 'H2A.M_(Viridiplantae)'


In [85]:
# Check that new_name points at its description record again.
query = (
    "SELECT * "
    "FROM histone h "
    "LEFT JOIN histone_description hd ON h.description = hd.id "
    f"WHERE h.id='{new_name}'"
)
cursor.execute(query)
pd.DataFrame(
    cursor.fetchall(),
    columns=[col[0] for col in cursor.description],
)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
0,H2A.M_(Viridiplantae),variant_group,Viridiplantae,33090,25,H2A,25,H2A.M is a plant specific variant related to H...,H2A.M was characterized in genomes of liverwor...,,...,,,,"In the L1 loop, all H2A.M variants share the m...",,,,,,


In [86]:
# Build the new summary text: the first mention of prev_name is replaced by
# new_name.
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    f"WHERE h.id='{new_name}'"
)
cursor.execute(query)
current_summary = pd.DataFrame(
    cursor.fetchall(), columns=[i[0] for i in cursor.description]
)["summary"].values[0]
# prev_name is a prefix of new_name here, so re-running this cell after the
# summary was already updated would append the suffix a second time.
# Only rewrite when the first occurrence of prev_name is not already new_name.
first_hit = current_summary.find(prev_name)
if first_hit != -1 and not current_summary.startswith(new_name, first_hit):
    summary = current_summary.replace(prev_name, new_name, 1)
else:
    summary = current_summary
summary

'H2A.M_(Viridiplantae) is a plant specific variant related to H2A.W found in non-flowering plants such as liverworts, mosses, and lycophytes. H2A.M variants are characterized by having a long C-terminal tail domain, rich in lysine, serine and acidic residues, not present in the other H2A variants. The variant was described by Kawashima et al. [kawashima_diversification_2015].'

In [87]:
# Store the rewritten summary. The values are passed as parameters so that
# quotes or other special characters in the free-text summary cannot break
# (or alter) the SQL statement.
query = "UPDATE histone_description SET summary=%s WHERE id = %s"
# desc_id comes out of a pandas column as a numpy scalar; convert it to a
# plain int, which the connector can bind as a parameter.
summary_params = (summary, int(desc_id))
print(query, summary_params)
cursor.execute(query, summary_params)

UPDATE histone_description SET summary='H2A.M_(Viridiplantae) is a plant specific variant related to H2A.W found in non-flowering plants such as liverworts, mosses, and lycophytes. H2A.M variants are characterized by having a long C-terminal tail domain, rich in lysine, serine and acidic residues, not present in the other H2A variants. The variant was described by Kawashima et al. [kawashima_diversification_2015].' WHERE id = '25'


In [88]:
# Check the stored summary of new_name after the update.
query = (
    "SELECT * "
    "FROM histone h "
    "LEFT JOIN histone_description hd ON h.description = hd.id "
    f"WHERE h.id='{new_name}'"
)
cursor.execute(query)
pd.DataFrame(
    cursor.fetchall(),
    columns=[col[0] for col in cursor.description],
)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
0,H2A.M_(Viridiplantae),variant_group,Viridiplantae,33090,25,H2A,25,H2A.M_(Viridiplantae) is a plant specific vari...,H2A.M was characterized in genomes of liverwor...,,...,,,,"In the L1 loop, all H2A.M variants share the m...",,,,,,


In [89]:
# Re-create the publication links under the new histone id.
query = "SELECT id FROM publication"
cursor.execute(query)
# A set gives direct membership tests for the known publication ids.
exist_pubs = {row[0] for row in cursor.fetchall()}
for pid in publications:
    if pid not in exist_pubs:
        # Unknown (or NULL) publication id: report it and do not insert a
        # link that would point at a non-existent publication.
        print(f"Strange {pid}")
        continue
    cursor.execute(add_histone_has_publication, (new_name, pid))

In [90]:
# Check that the publication links exist again under new_name.
query = (
    "SELECT * "
    "FROM histone h "
    "LEFT JOIN histone_has_publication hp ON h.id = hp.histone_id "
    f"WHERE h.id='{new_name}'"
)
cursor.execute(query)
pd.DataFrame(
    cursor.fetchall(),
    columns=[col[0] for col in cursor.description],
)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
0,H2A.M_(Viridiplantae),variant_group,Viridiplantae,33090,25,H2A,H2A.M_(Viridiplantae),kawashima_diversification_2015


In [91]:
# Commit the pending statements of the H2A.M -> H2A.M_(Viridiplantae) rename
# so they are stored permanently in the database.
conn.commit()

## Change name from H2A.W to H2A.W_(Magnoliopsida)

In [92]:
# Old and new histone id used by every cell of this section.
prev_name = "H2A.W"
new_name = "H2A.W_(Magnoliopsida)"

In [93]:
# Show the row for prev_name together with its publication links.
query = (
    "SELECT * "
    "FROM histone h "
    "LEFT JOIN histone_has_publication hp ON h.id = hp.histone_id "
    f"WHERE h.id='{prev_name}'"
)
cursor.execute(query)
pd.DataFrame(
    cursor.fetchall(),
    columns=[col[0] for col in cursor.description],
)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
0,H2A.W,variant_group,Magnoliopsida,3398,27,H2A,,


In [94]:
# Show the histone row for `prev_name` joined with its description record.
# The name is passed as a bound parameter so the driver handles quoting.
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    "WHERE h.id = %s"
)
cursor.execute(query, (prev_name,))
pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
0,H2A.W,variant_group,Magnoliopsida,3398,27,H2A,27,H2A.W is a plant specific variant found in ang...,H2A.W variant is found exclusively in angiospe...,"Arabidopsis has three H2A.W genes (HTA6, HTA7,...",...,In Arabidopsis HTA6 and HTA7 were found to hav...,In Arabidopsis single mutants of H2A.W genes d...,H2A.W participates in constitutive heterochrom...,A characteristic feature of H2A.W sequences is...,Genome-wide analysis showed that H2A.W variant...,Specific deposition mechanism are not known. C...,The extended C-terminal tail of H2A.W interact...,It was hypothesized that C-terminal tail of H2...,,


### Set publications list and save description id

In [95]:
# Publication ids to attach to the renamed histone. A later cell checks each
# id against publication.id and links it via add_histone_has_publication.
publications = [
    "alvarez-venegas_canonical_2019",
    "kawashima_diversification_2015",
    "menges_genome-wide_2003",
    "yelagandula_histone_2014",
    "lorkovic_compartmentalization_2017",
    "osakabe_histone_2018"
]
# Display the list as the cell output.
publications

['alvarez-venegas_canonical_2019',
 'kawashima_diversification_2015',
 'menges_genome-wide_2003',
 'yelagandula_histone_2014',
 'lorkovic_compartmentalization_2017',
 'osakabe_histone_2018']

In [96]:
# Remember which description record the old histone row points to, so the
# link can be re-established after the row has been renamed.
# The name is passed as a bound parameter so the driver handles quoting.
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    "WHERE h.id = %s"
)
cursor.execute(query, (prev_name,))
# `.values[0]` takes the value from the first returned row; it raises
# IndexError when no histone with this id exists.
desc_id = pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])[
    "description"
].values[0]
desc_id

27

### Detach relations (unset the description link before renaming)

In [97]:
# Unset the description link on the old histone row before it is renamed.
# The id is bound as a parameter instead of being formatted into the SQL text.
query = "UPDATE histone SET description=null WHERE id = %s"
params = (prev_name,)
print(query, params)
cursor.execute(query, params)

UPDATE histone SET description=null WHERE id = 'H2A.W'


In [98]:
# Re-check the old histone row (joined with its publication links) after the
# description link was unset.
# The name is passed as a bound parameter so the driver handles quoting.
query = (
    "SELECT * FROM histone h LEFT JOIN histone_has_publication hp "
    "ON h.id = hp.histone_id "
    "WHERE h.id = %s"
)
cursor.execute(query, (prev_name,))
pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
0,H2A.W,variant_group,Magnoliopsida,3398,,H2A,,


### Update name

In [99]:
# Rename the histone by rewriting its id column.
# Both names are bound as parameters instead of being formatted into the SQL
# text, so names containing quotes cannot break the statement.
query = "UPDATE histone SET id=%s WHERE id = %s"
params = (new_name, prev_name)
print(query, params)
cursor.execute(query, params)

UPDATE histone SET id='H2A.W_(Magnoliopsida)' WHERE id = 'H2A.W'


In [100]:
# Check the renamed histone row joined with its publication links.
# The name is passed as a bound parameter so the driver handles quoting.
query = (
    "SELECT * FROM histone h LEFT JOIN histone_has_publication hp "
    "ON h.id = hp.histone_id "
    "WHERE h.id = %s"
)
cursor.execute(query, (new_name,))
pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
0,H2A.W_(Magnoliopsida),variant_group,Magnoliopsida,3398,,H2A,,


### Restore relations

In [101]:
# Point the renamed histone row back at the description record saved in
# `desc_id`. Values are bound as parameters rather than formatted into the SQL.
query = "UPDATE histone SET description=%s WHERE id = %s"
# desc_id was read through pandas `.values[0]`, so it may be a NumPy scalar;
# convert it to a built-in int, which the connector is known to accept.
params = (int(desc_id), new_name)
print(query, params)
cursor.execute(query, params)

UPDATE histone SET description=27 WHERE id = 'H2A.W_(Magnoliopsida)'


In [102]:
# Check that the renamed histone row is joined to its description record again.
# The name is passed as a bound parameter so the driver handles quoting.
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    "WHERE h.id = %s"
)
cursor.execute(query, (new_name,))
pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
0,H2A.W_(Magnoliopsida),variant_group,Magnoliopsida,3398,27,H2A,27,H2A.W is a plant specific variant found in ang...,H2A.W variant is found exclusively in angiospe...,"Arabidopsis has three H2A.W genes (HTA6, HTA7,...",...,In Arabidopsis HTA6 and HTA7 were found to hav...,In Arabidopsis single mutants of H2A.W genes d...,H2A.W participates in constitutive heterochrom...,A characteristic feature of H2A.W sequences is...,Genome-wide analysis showed that H2A.W variant...,Specific deposition mechanism are not known. C...,The extended C-terminal tail of H2A.W interact...,It was hypothesized that C-terminal tail of H2...,,


In [103]:
# Build the summary text for the renamed histone: take the stored summary and
# replace the first mention of the old name with the new one.
# The name is passed as a bound parameter so the driver handles quoting.
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    "WHERE h.id = %s"
)
cursor.execute(query, (new_name,))
current_summary = pd.DataFrame(
    cursor.fetchall(), columns=[col[0] for col in cursor.description]
)["summary"].values[0]

# Keep the cell safe to re-run: when the old name is a prefix of the new one,
# a second replacement would turn "<new_name>" into "<new_name>" plus a
# duplicated suffix. Skip the replacement if the first mention of the old name
# already reads as the new name.
first_hit = current_summary.find(prev_name)
if first_hit != -1 and not current_summary.startswith(new_name, first_hit):
    summary = current_summary.replace(prev_name, new_name, 1)
else:
    summary = current_summary
summary

'H2A.W_(Magnoliopsida) is a plant specific variant found in angiosperms (flowering plants) having a potentially DNA minor-groove-binding SPKK (sometime reffered to as KSPKKA) motif within its C-terminal tail, it is enriched in heterochromatin and implicated in gene silencing and DNA damage response.'

In [104]:
# Store the rewritten summary on the description record.
# The summary is free text: formatting it into the SQL between single quotes
# breaks (or alters) the statement as soon as the text contains an apostrophe,
# so both values are bound as parameters and quoted by the driver.
query = "UPDATE histone_description SET summary=%s WHERE id = %s"
# desc_id was read through pandas `.values[0]`, so it may be a NumPy scalar;
# convert it to a built-in int, which the connector is known to accept.
params = (summary, int(desc_id))
print(query, params)
cursor.execute(query, params)

UPDATE histone_description SET summary='H2A.W_(Magnoliopsida) is a plant specific variant found in angiosperms (flowering plants) having a potentially DNA minor-groove-binding SPKK (sometime reffered to as KSPKKA) motif within its C-terminal tail, it is enriched in heterochromatin and implicated in gene silencing and DNA damage response.' WHERE id = '27'


In [105]:
# Check the renamed histone row together with its updated description record.
# The name is passed as a bound parameter so the driver handles quoting.
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    "WHERE h.id = %s"
)
cursor.execute(query, (new_name,))
pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
0,H2A.W_(Magnoliopsida),variant_group,Magnoliopsida,3398,27,H2A,27,H2A.W_(Magnoliopsida) is a plant specific vari...,H2A.W variant is found exclusively in angiospe...,"Arabidopsis has three H2A.W genes (HTA6, HTA7,...",...,In Arabidopsis HTA6 and HTA7 were found to hav...,In Arabidopsis single mutants of H2A.W genes d...,H2A.W participates in constitutive heterochrom...,A characteristic feature of H2A.W sequences is...,Genome-wide analysis showed that H2A.W variant...,Specific deposition mechanism are not known. C...,The extended C-terminal tail of H2A.W interact...,It was hypothesized that C-terminal tail of H2...,,


In [107]:
# Read the ids of all publications currently stored in the database.
query = "SELECT id FROM publication"
cursor.execute(query)
exist_pubs = [pub_id for (pub_id,) in cursor.fetchall()]

for pid in publications:
    if pid not in exist_pubs:
        # Unknown publication: insert a placeholder row that carries only the id.
        data_publication = dict(id=pid, title=None, doi=None, author=None, year=None)
        cursor.execute(add_publication, data_publication)
    # Attach the publication to the renamed histone.
    cursor.execute(add_histone_has_publication, (new_name, pid))

In [108]:
# Check the publication links now attached to the renamed histone.
# The name is passed as a bound parameter so the driver handles quoting.
query = (
    "SELECT * FROM histone h LEFT JOIN histone_has_publication hp "
    "ON h.id = hp.histone_id "
    "WHERE h.id = %s"
)
cursor.execute(query, (new_name,))
pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
0,H2A.W_(Magnoliopsida),variant_group,Magnoliopsida,3398,27,H2A,H2A.W_(Magnoliopsida),alvarez-venegas_canonical_2019
1,H2A.W_(Magnoliopsida),variant_group,Magnoliopsida,3398,27,H2A,H2A.W_(Magnoliopsida),kawashima_diversification_2015
2,H2A.W_(Magnoliopsida),variant_group,Magnoliopsida,3398,27,H2A,H2A.W_(Magnoliopsida),lorkovic_compartmentalization_2017
3,H2A.W_(Magnoliopsida),variant_group,Magnoliopsida,3398,27,H2A,H2A.W_(Magnoliopsida),menges_genome-wide_2003
4,H2A.W_(Magnoliopsida),variant_group,Magnoliopsida,3398,27,H2A,H2A.W_(Magnoliopsida),osakabe_histone_2018
5,H2A.W_(Magnoliopsida),variant_group,Magnoliopsida,3398,27,H2A,H2A.W_(Magnoliopsida),yelagandula_histone_2014


In [109]:
# Commit the statements executed on this connection so far, so that the
# rename and the new publication links are persisted in the database.
conn.commit()

# Close connections

In [110]:
# Release resources in reverse order of creation (cursor, connection, tunnel).
# The nested try/finally makes sure the connection is still closed and the SSH
# tunnel is still stopped if an earlier close call raises.
try:
    cursor.close()
finally:
    try:
        conn.close()
    finally:
        tunnel.stop()