In [1]:
import pandas as pd
from Bio import Entrez, SeqIO
from mysql.connector import connection
from sshtunnel import SSHTunnelForwarder

Entrez.email = "l.singh@intbio.org"

In [2]:
with open("db_curated_server_info.txt", "r") as file:
    lines = file.readlines()

config = {}

for line in lines:
    line = line.strip()
    if line and not line.startswith("#"):
        key, value = line.split("=", 1)
        config[key] = value.strip()

server_name = config.get("server_name")
srever_port = int(config.get("srever_port"))
ssh_password = config.get("ssh_password")
ssh_username = config.get("ssh_username")
db_adress = config.get("db_adress")
db_port = int(config.get("db_port"))

In [3]:
tunnel = SSHTunnelForwarder(
    (server_name, srever_port),
    ssh_password=ssh_password,
    ssh_username=ssh_username,
    remote_bind_address=(db_adress, db_port),
)
tunnel.start()
print(tunnel.local_bind_port)

43717


In [4]:
conn = connection.MySQLConnection(
    user="db_user",
    password="db_password",
    host="localhost",
    port=tunnel.local_bind_port,
    database="db_name",
)
cursor = conn.cursor()

In [5]:
query = "SHOW TABLES;"
cursor.execute(query)
cursor.fetchall()

[('alternative_name',),
 ('histone',),
 ('histone_description',),
 ('histone_has_publication',),
 ('publication',),
 ('sequence',),
 ('sequence_has_publication',)]

In [24]:
# add_histone = (
#     "INSERT INTO histone "
#     "(id, level, taxonomic_span, taxonomic_span_id, description, parent) "
#     "VALUES (%(id)s, %(level)s, %(taxonomic_span)s, %(taxonomic_span_id)s, %(description)s, %(parent)s)"
# )
# add_histone_description = (
#     "INSERT INTO histone_description "
#     "(summary, taxonomy, genes, evolution, expression, knock_out, function, sequence, localization, deposition, structure, interactions, disease, caveats) "
#     "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
# )
# add_publication = (
#     "INSERT INTO publication "
#     "(id, title, doi, author, year) "
#     "VALUES (%(id)s, %(title)s, %(doi)s, %(author)s, %(year)s)"
# )
# add_sequence_has_publication = (
#     "INSERT INTO sequence_has_publication "
#     "(sequence_accession, publication_id) "
#     "VALUES (%s, %s)"
# )
# add_alternate_names = (
#     "INSERT INTO alternative_name "
#     "(name, taxonomy, gene, splice, histone) "
#     "VALUES (%(name)s, %(taxonomy)s, %(gene)s, %(splice)s, %(histone)s)"
# )
add_histone_has_publication = (
    "INSERT INTO histone_has_publication "
    "(histone_id, publication_id) "
    "VALUES (%s, %s)"
)

# Change name for some variants

short_H2A → short_H2A_(Eutheria)

## Change name from short_H2A → short_H2A_(Eutheria)

In [6]:
prev_name, new_name = "short_H2A", "short_H2A_(Eutheria)"

In [7]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_has_publication hp "
    "ON h.id = hp.histone_id "
    f"WHERE h.id='{prev_name}'"
)
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
0,short_H2A,variant_group,Eutheria,9347,31,H2A,short_H2A,chew_short_2021
1,short_H2A,variant_group,Eutheria,9347,31,H2A,short_H2A,jiang_short_2020
2,short_H2A,variant_group,Eutheria,9347,31,H2A,short_H2A,molaro_evolutionary_2018


In [9]:
query = (
    "SELECT * FROM histone "
    f"WHERE parent='{prev_name}'"
)
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
0,H2A.B,variant,Eutheria,9347,83,short_H2A
1,H2A.L,variant,Eutheria,9347,84,short_H2A
2,H2A.P,variant,Eutheria,9347,85,short_H2A
3,H2A.Q,variant,Eutheria,9347,86,short_H2A


### Save children and publications

In [10]:
query = (
    "SELECT * FROM histone "
    f"WHERE parent='{prev_name}'"
)
cursor.execute(query)
children = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])["id"].values
children

array(['H2A.B', 'H2A.L', 'H2A.P', 'H2A.Q'], dtype=object)

In [11]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_has_publication hp "
    "ON h.id = hp.histone_id "
    f"WHERE h.id='{prev_name}'"
)
cursor.execute(query)
publications = pd.DataFrame(
    cursor.fetchall(), columns=[i[0] for i in cursor.description]
)["publication_id"].values
publications

array(['chew_short_2021', 'jiang_short_2020', 'molaro_evolutionary_2018'],
      dtype=object)

### Delete relations

In [12]:
query = f"DELETE FROM histone_has_publication WHERE histone_id = '{prev_name}'" 
print(query) 
cursor.execute(query)

DELETE FROM histone_has_publication WHERE histone_id = 'short_H2A'


In [13]:
query = f"UPDATE histone SET parent=null WHERE parent = '{prev_name}'" 
print(query) 
cursor.execute(query) 

UPDATE histone SET parent=null WHERE parent = 'short_H2A'


In [14]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_has_publication hp "
    "ON h.id = hp.histone_id "
    f"WHERE h.id='{prev_name}'"
)
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
0,short_H2A,variant_group,Eutheria,9347,31,H2A,,


In [15]:
query = (
    "SELECT * FROM histone "
    f"WHERE parent='{prev_name}'"
)
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent


### Update name

In [16]:
query = f"UPDATE histone SET id='{new_name}' WHERE id = '{prev_name}'" 
print(query) 
cursor.execute(query) 

UPDATE histone SET id='short_H2A_(Eutheria)' WHERE id = 'short_H2A'


In [17]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_has_publication hp "
    "ON h.id = hp.histone_id "
    f"WHERE h.id='{new_name}'"
)
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
0,short_H2A_(Eutheria),variant_group,Eutheria,9347,31,H2A,,


### Return relations

In [19]:
query = (
    "SELECT * FROM histone "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["id"].isin(children)]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
98,H2A.B,variant,Eutheria,9347,83,
108,H2A.L,variant,Eutheria,9347,84,
117,H2A.P,variant,Eutheria,9347,85,
120,H2A.Q,variant,Eutheria,9347,86,


In [20]:
for ch in children:
    query = f"UPDATE histone SET parent='{new_name}' WHERE id = '{ch}'" 
    print(query)
    cursor.execute(query)

UPDATE histone SET parent='short_H2A_(Eutheria)' WHERE id = 'H2A.B'
UPDATE histone SET parent='short_H2A_(Eutheria)' WHERE id = 'H2A.L'
UPDATE histone SET parent='short_H2A_(Eutheria)' WHERE id = 'H2A.P'
UPDATE histone SET parent='short_H2A_(Eutheria)' WHERE id = 'H2A.Q'


In [22]:
query = (
    "SELECT * FROM histone "
    f"WHERE parent='{new_name}'"
)
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
0,H2A.B,variant,Eutheria,9347,83,short_H2A_(Eutheria)
1,H2A.L,variant,Eutheria,9347,84,short_H2A_(Eutheria)
2,H2A.P,variant,Eutheria,9347,85,short_H2A_(Eutheria)
3,H2A.Q,variant,Eutheria,9347,86,short_H2A_(Eutheria)


In [25]:
query = "SELECT id FROM publication"
cursor.execute(query)
exist_pubs = [i[0] for i in cursor.fetchall()]
for pid in publications:
    if pid not in exist_pubs:
        print(f"Strange {pid}")
    cursor.execute(add_histone_has_publication, (new_name, pid))

In [26]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_has_publication hp "
    "ON h.id = hp.histone_id "
    f"WHERE h.id='{new_name}'"
)
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
0,short_H2A_(Eutheria),variant_group,Eutheria,9347,31,H2A,short_H2A_(Eutheria),chew_short_2021
1,short_H2A_(Eutheria),variant_group,Eutheria,9347,31,H2A,short_H2A_(Eutheria),jiang_short_2020
2,short_H2A_(Eutheria),variant_group,Eutheria,9347,31,H2A,short_H2A_(Eutheria),molaro_evolutionary_2018


In [27]:
# Make sure data is committed to the database
conn.commit()

# Поменять поле description у short_H2A_(Eutheria) и всех подвариантов

Все значения полей у short_H2A_(Eutheria) и его подвариантов попали в summary. Необходимо разнести их на свои места.

In [28]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    f"WHERE h.id = '{new_name}' "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
0,short_H2A_(Eutheria),variant_group,Eutheria,9347,31,H2A,31,short_H2A is a class encompassing several hist...,,,...,,,,,,,,,,


In [32]:
df["summary"].values[0].replace(prev_name, new_name, 1)

'short_H2A_(Eutheria) is a class encompassing several histone H2A variants in placental (eutherian) mammals with shortened C-terminus expressed mainly during mammalian male germ cell development before the nearly complete replacement of histones by protamines in sperm nuclei. The repertoires of short histone H2A variants vary extensively among eutherian mammals due to lineage-specific gains and losses. Short H2A variants include H2A.B, H2A.L, H2A.P, H2A.Q, their genes are usually located on X chromosome and are intronless. These four clades of eutherian mammal short H2A variants emerged from a single, well-supported monophyletic clade, confirming their common ancestry [molaro_evolutionary_2018]. Due to shortened docking domain and changes within the acidic patch nucleosomes incorporating short H2As wrap less DNA (120-130 bp) and form loosely packed chromatin. There are few conserved residues in the histone fold domain of sH2As that distinguish them from each other, instead much of thei

In [34]:
desc_dict = {
    "summary": "short_H2A_(Eutheria) is a class encompassing several histone H2A variants in placental (eutherian) mammals with shortened C-terminus expressed mainly during mammalian male germ cell development before the nearly complete replacement of histones by protamines in sperm nuclei. The repertoires of short histone H2A variants vary extensively among eutherian mammals due to lineage-specific gains and losses. Short H2A variants include H2A.B, H2A.L, H2A.P, H2A.Q, their genes are usually located on X chromosome and are intronless. These four clades of eutherian mammal short H2A variants emerged from a single, well-supported monophyletic clade, confirming their common ancestry [molaro_evolutionary_2018]. Due to shortened docking domain and changes within the acidic patch nucleosomes incorporating short H2As wrap less DNA (120-130 bp) and form loosely packed chromatin. There are few conserved residues in the histone fold domain of sH2As that distinguish them from each other, instead much of their specialization may stem from changes in the N- and C-terminal tails of these variatns [molaro_evolutionary_2018].  Abberant short H2A upregulation was reported in a broad range of cancers [chew_short_2021].", 
    "caveats": "H2A.B is also expressed in the brain [jiang_short_2020].",
}
desc_dict

{'summary': 'short_H2A_(Eutheria) is a class encompassing several histone H2A variants in placental (eutherian) mammals with shortened C-terminus expressed mainly during mammalian male germ cell development before the nearly complete replacement of histones by protamines in sperm nuclei. The repertoires of short histone H2A variants vary extensively among eutherian mammals due to lineage-specific gains and losses. Short H2A variants include H2A.B, H2A.L, H2A.P, H2A.Q, their genes are usually located on X chromosome and are intronless. These four clades of eutherian mammal short H2A variants emerged from a single, well-supported monophyletic clade, confirming their common ancestry [molaro_evolutionary_2018]. Due to shortened docking domain and changes within the acidic patch nucleosomes incorporating short H2As wrap less DNA (120-130 bp) and form loosely packed chromatin. There are few conserved residues in the histone fold domain of sH2As that distinguish them from each other, instead 

In [36]:
desk_str = ', '.join([f'{k}="{v}"' for k, v in desc_dict.items()])
query = (
    f"UPDATE histone_description SET {desk_str} "
    "WHERE id = 31"
)
print(query)
cursor.execute(query)

UPDATE histone_description SET summary="short_H2A_(Eutheria) is a class encompassing several histone H2A variants in placental (eutherian) mammals with shortened C-terminus expressed mainly during mammalian male germ cell development before the nearly complete replacement of histones by protamines in sperm nuclei. The repertoires of short histone H2A variants vary extensively among eutherian mammals due to lineage-specific gains and losses. Short H2A variants include H2A.B, H2A.L, H2A.P, H2A.Q, their genes are usually located on X chromosome and are intronless. These four clades of eutherian mammal short H2A variants emerged from a single, well-supported monophyletic clade, confirming their common ancestry [molaro_evolutionary_2018]. Due to shortened docking domain and changes within the acidic patch nucleosomes incorporating short H2As wrap less DNA (120-130 bp) and form loosely packed chromatin. There are few conserved residues in the histone fold domain of sH2As that distinguish the

In [38]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    f"WHERE h.id = '{new_name}' "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
0,short_H2A_(Eutheria),variant_group,Eutheria,9347,31,H2A,31,short_H2A_(Eutheria) is a class encompassing s...,,,...,,,,,,,,,,H2A.B is also expressed in the brain [jiang_sh...


In [39]:
df["summary"].values[0]

'short_H2A_(Eutheria) is a class encompassing several histone H2A variants in placental (eutherian) mammals with shortened C-terminus expressed mainly during mammalian male germ cell development before the nearly complete replacement of histones by protamines in sperm nuclei. The repertoires of short histone H2A variants vary extensively among eutherian mammals due to lineage-specific gains and losses. Short H2A variants include H2A.B, H2A.L, H2A.P, H2A.Q, their genes are usually located on X chromosome and are intronless. These four clades of eutherian mammal short H2A variants emerged from a single, well-supported monophyletic clade, confirming their common ancestry [molaro_evolutionary_2018]. Due to shortened docking domain and changes within the acidic patch nucleosomes incorporating short H2As wrap less DNA (120-130 bp) and form loosely packed chromatin. There are few conserved residues in the histone fold domain of sH2As that distinguish them from each other, instead much of thei

## Поменять поле description у H2A.B

In [78]:
ch_name = "H2A.B"

In [41]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    f"WHERE h.id = '{ch_name}' "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
0,H2A.B,variant,Eutheria,9347,83,short_H2A_(Eutheria),83,"H2A.B, previously known as ""Barr body deficien...",,,...,,,,,,,,,,


In [80]:
desc_id = df["description"].values[0]
desc_id

83

In [None]:
df["summary"].values[0]

In [82]:
desc_dict = {
    "summary": 'H2A.B_(Eutheria), previously known as "Barr body deficient" (H2A.Bbd) variant  is a short replication independent H2A variant found in eutherian mammals implicated in spermiogenesis, transcription regulation, splicing, DNA synthesis.',
    "genes": "Common ancestor of eutherian mammals encoded two or three H2A.B genes. Genes are usually located on X chromosome in three conserved locations (except in mouse, where H2A.B genes are located at other locations on X chromosome with unclear kinship to the ancestral genes that relocated to autosomes and decayed [molaro_evolutionary_2018]). Human and mouse have three genes encoding H2A.Bs [molaro_evolutionary_2018].",
    "evolution": "H2A.B is a rapildy evolving variant which is closely related to H2A.L, H2A.P, H2A.Q. It was suggested that H2A.B have been the subject to diversifying selection in simian primates, although mucj of the increased divergence of short histone H2A variants may be better explained by relaxed purifying selection [molaro_evolutionary_2018].",
    "knock_out": "H2A.B knock-out mice are viable, subfertile and display changes in splicing events [anuar_gene_2019].",
    "sequence": "round 50% identity with the canonical H2A, has truncated docking domain, divergent histone fold domain, altered acidic patch, arginine rich N-terminus [molaro_evolutionary_2018].",
    "localization": "H2A.B is expressed during mammalian male germ cell development and in the brain [molaro_evolutionary_2018,jiang_short_2020]. Originally, H2A.B  was characterized by its exclusion from the inactive X chromosome if overexpressed in female somatic cells [chadwick_novel_2001]. However, experiments in mouse testis revealed that H2A.B is in fact present on the inactive X chromosome  [soboleva_unique_2011]. Short H2A variants localize to sites of open chromatin and potentiate DNA synthesis, transcription, and splicing [molaro_evolutionary_2018]. This histone variant can bind to RNA directly in vitro and in vivo, and associates with mRNA at intron—exon boundaries [soboleva_new_2017]. Structural effects: H2A.B containing nucleosomes wrap less DNA (~120-130 bp instead of ~150 bp) [sugiyama_distinct_2014,doyen_dissection_2006], form loosely packed chromatin.",
    "disease": "H2A.B is upregulated in cancer as other short H2A variants [chew_short_2021].",
    "caveats": "Due to rapid evolution H2A.B function in different species may vary. For example, human H2A.B is retained during spermiogenesis, while is mouse it disappears and H2A.L is retained instead [molaro_evolutionary_2018]. Mouse H2A.B has additional negative residue in acidic patch, which is thought to increase its propensity to compact nucleosomal arrays relative to human H2A.B. H2A.B is also expressed in the brain [jiang_short_2020].",
}
desc_dict

{'summary': 'H2A.B_(Eutheria), previously known as "Barr body deficient" (H2A.Bbd) variant  is a short replication independent H2A variant found in eutherian mammals implicated in spermiogenesis, transcription regulation, splicing, DNA synthesis.',
 'genes': 'Common ancestor of eutherian mammals encoded two or three H2A.B genes. Genes are usually located on X chromosome in three conserved locations (except in mouse, where H2A.B genes are located at other locations on X chromosome with unclear kinship to the ancestral genes that relocated to autosomes and decayed [molaro_evolutionary_2018]). Human and mouse have three genes encoding H2A.Bs [molaro_evolutionary_2018].',
 'evolution': 'H2A.B is a rapildy evolving variant which is closely related to H2A.L, H2A.P, H2A.Q. It was suggested that H2A.B have been the subject to diversifying selection in simian primates, although mucj of the increased divergence of short histone H2A variants may be better explained by relaxed purifying selection 

In [84]:
desk_str = ', '.join([f"{k}='{v}'" for k, v in desc_dict.items()])
query = (
    f"UPDATE histone_description SET {desk_str} "
    f"WHERE id = {desc_id}"
)
print(query)
cursor.execute(query)

UPDATE histone_description SET summary='H2A.B_(Eutheria), previously known as "Barr body deficient" (H2A.Bbd) variant  is a short replication independent H2A variant found in eutherian mammals implicated in spermiogenesis, transcription regulation, splicing, DNA synthesis.', genes='Common ancestor of eutherian mammals encoded two or three H2A.B genes. Genes are usually located on X chromosome in three conserved locations (except in mouse, where H2A.B genes are located at other locations on X chromosome with unclear kinship to the ancestral genes that relocated to autosomes and decayed [molaro_evolutionary_2018]). Human and mouse have three genes encoding H2A.Bs [molaro_evolutionary_2018].', evolution='H2A.B is a rapildy evolving variant which is closely related to H2A.L, H2A.P, H2A.Q. It was suggested that H2A.B have been the subject to diversifying selection in simian primates, although mucj of the increased divergence of short histone H2A variants may be better explained by relaxed p

In [85]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    f"WHERE h.id = '{ch_name}' "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
0,H2A.B,variant,Eutheria,9347,83,short_H2A_(Eutheria),83,"H2A.B_(Eutheria), previously known as ""Barr bo...",,Common ancestor of eutherian mammals encoded t...,...,,"H2A.B knock-out mice are viable, subfertile an...",,"round 50% identity with the canonical H2A, has...",H2A.B is expressed during mammalian male germ ...,,,,H2A.B is upregulated in cancer as other short ...,Due to rapid evolution H2A.B function in diffe...


In [86]:
df["summary"].values[0]

'H2A.B_(Eutheria), previously known as "Barr body deficient" (H2A.Bbd) variant  is a short replication independent H2A variant found in eutherian mammals implicated in spermiogenesis, transcription regulation, splicing, DNA synthesis.'

In [87]:
df["caveats"].values[0]

'Due to rapid evolution H2A.B function in different species may vary. For example, human H2A.B is retained during spermiogenesis, while is mouse it disappears and H2A.L is retained instead [molaro_evolutionary_2018]. Mouse H2A.B has additional negative residue in acidic patch, which is thought to increase its propensity to compact nucleosomal arrays relative to human H2A.B. H2A.B is also expressed in the brain [jiang_short_2020].'

## Поменять поле description у H2A.L

In [88]:
ch_name = "H2A.L"

In [58]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    f"WHERE h.id = '{ch_name}' "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
0,H2A.L,variant,Eutheria,9347,84,short_H2A_(Eutheria),84,H2A.L - is a class of short H2A variants in eu...,,,...,,,,,,,,,,


In [90]:
desc_id = df["description"].values[0]
desc_id

84

In [60]:
df["summary"].values[0]

"H2A.L - is a class of short H2A variants in eutherian mammals implicated in spermatogenesis and replacement of histones with protamines. Genes: Сommon ancestor of eutherian mammals encoded three H2A.L genes. These genes are usually located on X chromosome at three locations (named H2A.L.1, H2A.L.2 and H2A.L.3 by Molaro et al.) [molaro_evolutionary_2018]. Humans have two putative genes (H2AL1Q, H2AL3, located at H2A.L.1 and H2A.L.3 loci, respectively) and H2AL1MP pseudogene (located at H2A.L.2 loci has inactivating mutation in all primates). The H2AL1Q and H2AL3 genes have unusually long extensions at their 3'-ends and have not so far been detected at protein level. Mouse genome has a leneage specific expansion of H2A.L genes, encoding a total of 18 genes (15 genes on X chromosome H2a1a-H2a1o and H2al3; two genes on Y-chromosome H2al2c and H2al2b; one gene on chromosome 2 H2al2a) and two pseudogenes [molaro_evolutionary_2018].  At H2A.L.1 locus mouse has a pseudogene H2al1q-ps. The mos

In [91]:
desc_dict = {
    "summary": "H2A.L_(Eutheria) - is a class of short H2A variants in eutherian mammals implicated in spermatogenesis and replacement of histones with protamines.",
    "genes": "Сommon ancestor of eutherian mammals encoded three H2A.L genes. These genes are usually located on X chromosome at three locations (named H2A.L.1, H2A.L.2 and H2A.L.3 by Molaro et al.) [molaro_evolutionary_2018]. Humans have two putative genes (H2AL1Q, H2AL3, located at H2A.L.1 and H2A.L.3 loci, respectively) and H2AL1MP pseudogene (located at H2A.L.2 loci has inactivating mutation in all primates). The H2AL1Q and H2AL3 genes have unusually long extensions at their 3'-ends and have not so far been detected at protein level. Mouse genome has a leneage specific expansion of H2A.L genes, encoding a total of 18 genes (15 genes on X chromosome H2a1a-H2a1o and H2al3; two genes on Y-chromosome H2al2c and H2al2b; one gene on chromosome 2 H2al2a) and two pseudogenes [molaro_evolutionary_2018].  At H2A.L.1 locus mouse has a pseudogene H2al1q-ps. The most studied gene in mice is H2al2a. This gene was first identified in [govin_pericentric_2007] (NCBI protein id NP_080903) and was refered to as H2AL2 or H2A.L.2 histone in the following papers. However, this gene is not located at the H2A.L.2 locus (as defined by Molaro et al.) on X-chromosome, but it is rather located on chromosome 2. A synthenic location of H2A.L.2 locus in mouse is occupied by H2al1m gene [Seal et al. unpublished]. Previous names for mouse H2A.Ls include H2A.Lap2, H2A.Lap3, H2A.Lap4, H2AL1, H2AL2 [soboleva_unique_2011, govin_pericentric_2007]. H2AL1 has been used to reffer to H2al1a gene [govin_pericentric_2007].",
    "evolution": "Molaro et al. found that short H2A variants show greater evolutionary divergence between species than even CENPA, the fastest-evolving histone variant examined to date [molaro_evolutionary_2018]. Evolutionary analysis of purifying selection suggested that H2A.L function may have been lost in Old World monkeys and hominoids but retained in New World monkeys [molaro_evolutionary_2018].",
    "knock_out": "H2A.L.2 knock out mice are infertile because transition proteins can no longer associate with chromatin [barral_histone_2017].",
    "function": "Required for the histone–protamine exchange process [barral_histone_2017].",
    "sequence": "Sequence is divergent from cH2A. Identity with cH2A may be as low as 30%. H2A.L variants have a shortened C-termus, truncated docking domain, altered acidic patch, arginine rich N-terminus. Nucleosomes incorporating H2A.L warp less DNA and from more loosely packed chromatin [molaro_evolutionary_2018].",
    "localization": "Accumulates in spermatid nuclei until the end of spermatogenesis and remains in mature sperm chromatin even after protamine exchange in mouse, eventually disappearing from the paternal pronucleus following fertilization [molaro_evolutionary_2018]. Involved in pericentric chromatin organization in spermatids, is retained after histone-to-protamine replacement [hoghoughi_rna-guided_2020]. H2A.L.2 is maximally expressed at later stages of spermatogenesis (condensing spermatids) when histones are bound by transition proteins and then replaced with protamines.",
    "deposition": "Likely mediated by interaction with RNA [hoghoughi_rna-guided_2020].",
    "interactions": "H2A.L.2 preferentially dimerizes with H2B.1 (TH2B) at least in mice [govin_pericentric_2007]. Intranuclear localization of H2A.L.2 is controlled by its ability to bind RNA via its N-terminus [hoghoughi_rna-guided_2020].",
    "disease": "No information, since no H2A.L have been so far detect at protein level in humans.",
    "caveats": "There is some confusion in literature with respect to numbering H2A.L subvariants, especially in mouse (see caveats in description of mouse H2A.L variants).",
}
desc_dict

{'summary': 'H2A.L_(Eutheria) - is a class of short H2A variants in eutherian mammals implicated in spermatogenesis and replacement of histones with protamines.',
 'genes': "Сommon ancestor of eutherian mammals encoded three H2A.L genes. These genes are usually located on X chromosome at three locations (named H2A.L.1, H2A.L.2 and H2A.L.3 by Molaro et al.) [molaro_evolutionary_2018]. Humans have two putative genes (H2AL1Q, H2AL3, located at H2A.L.1 and H2A.L.3 loci, respectively) and H2AL1MP pseudogene (located at H2A.L.2 loci has inactivating mutation in all primates). The H2AL1Q and H2AL3 genes have unusually long extensions at their 3'-ends and have not so far been detected at protein level. Mouse genome has a leneage specific expansion of H2A.L genes, encoding a total of 18 genes (15 genes on X chromosome H2a1a-H2a1o and H2al3; two genes on Y-chromosome H2al2c and H2al2b; one gene on chromosome 2 H2al2a) and two pseudogenes [molaro_evolutionary_2018].  At H2A.L.1 locus mouse has a 

In [93]:
desk_str = ', '.join([f'{k}="{v}"' for k, v in desc_dict.items()])
query = (
    f"UPDATE histone_description SET {desk_str} "
    f"WHERE id = {desc_id}"
)
print(query)
cursor.execute(query)

UPDATE histone_description SET summary="H2A.L_(Eutheria) - is a class of short H2A variants in eutherian mammals implicated in spermatogenesis and replacement of histones with protamines.", genes="Сommon ancestor of eutherian mammals encoded three H2A.L genes. These genes are usually located on X chromosome at three locations (named H2A.L.1, H2A.L.2 and H2A.L.3 by Molaro et al.) [molaro_evolutionary_2018]. Humans have two putative genes (H2AL1Q, H2AL3, located at H2A.L.1 and H2A.L.3 loci, respectively) and H2AL1MP pseudogene (located at H2A.L.2 loci has inactivating mutation in all primates). The H2AL1Q and H2AL3 genes have unusually long extensions at their 3'-ends and have not so far been detected at protein level. Mouse genome has a leneage specific expansion of H2A.L genes, encoding a total of 18 genes (15 genes on X chromosome H2a1a-H2a1o and H2al3; two genes on Y-chromosome H2al2c and H2al2b; one gene on chromosome 2 H2al2a) and two pseudogenes [molaro_evolutionary_2018].  At H2A

In [94]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    f"WHERE h.id = '{ch_name}' "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
0,H2A.L,variant,Eutheria,9347,84,short_H2A_(Eutheria),84,H2A.L_(Eutheria) - is a class of short H2A var...,,Сommon ancestor of eutherian mammals encoded t...,...,,H2A.L.2 knock out mice are infertile because t...,Required for the histone–protamine exchange pr...,Sequence is divergent from cH2A. Identity with...,Accumulates in spermatid nuclei until the end ...,Likely mediated by interaction with RNA [hogho...,,H2A.L.2 preferentially dimerizes with H2B.1 (T...,"No information, since no H2A.L have been so fa...",There is some confusion in literature with res...


In [95]:
df["summary"].values[0]

'H2A.L_(Eutheria) - is a class of short H2A variants in eutherian mammals implicated in spermatogenesis and replacement of histones with protamines.'

## Поменять поле description у H2A.P

In [96]:
ch_name = "H2A.P"

In [67]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    f"WHERE h.id = '{ch_name}' "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
0,H2A.P,variant,Eutheria,9347,85,short_H2A_(Eutheria),85,H2A.P - is a class of testis-specific short H2...,,,...,,,,,,,,,,


In [68]:
desc_id = df["description"].values[0]
desc_id

85

In [69]:
df["summary"].values[0]

"H2A.P - is a class of testis-specific short H2A variants in eutherian mammals expressed at post-meiotic stages of spermatogensis. It is not well studied. Its expression in mouse has been shown only at mRNA level [el_kennani_ms_histonedb_2017], although evolutionary analysis strongly argues that it is a protein coding gene [molaro_evolutionary_2018]. In human H2A.P gene H2ap was previously named HYPM (Huntingtin-interacting protein M) since in yeast two-hybrid experiments it was shown to interact with huntingtin, which contains an expanded polyglutamine tract in individuals with Huntington's disease [faber_huntingtin_1998]. Genes: Сommon ancestor of eutherian mammals encoded a single H2A.P gene. These genes are usually located on X chromosome as do other short H2As [molaro_evolutionary_2018]. Humans and mouse have one gene name H2AP and H2ap, respectively. Evolution: Molaro et al. found that short H2A variants show greater evolutionary divergence between species than even CENPA, the fa

In [71]:
desc_dict = {
    "summary": "H2A.P_(Eutheria) - is a class of testis-specific short H2A variants in eutherian mammals expressed at post-meiotic stages of spermatogensis. It is not well studied. Its expression in mouse has been shown only at mRNA level [el_kennani_ms_histonedb_2017], although evolutionary analysis strongly argues that it is a protein coding gene [molaro_evolutionary_2018]. In human H2A.P gene H2ap was previously named HYPM (Huntingtin-interacting protein M) since in yeast two-hybrid experiments it was shown to interact with huntingtin, which contains an expanded polyglutamine tract in individuals with Huntington's disease [faber_huntingtin_1998].",
    "genes": "Сommon ancestor of eutherian mammals encoded a single H2A.P gene. These genes are usually located on X chromosome as do other short H2As [molaro_evolutionary_2018]. Humans and mouse have one gene name H2AP and H2ap, respectively.",
    "evolution": "Molaro et al. found that short H2A variants show greater evolutionary divergence between species than even CENPA, the fastest-evolving histone variant examined to date. Their results indicate that H2A.P and possible H2A.B have been subject to diversifying selection in simian primates, which could partly ecplame the greater divergence of short H2A histone variants compared to other H2A histones in mammals [molaro_evolutionary_2018].",
    "knock_out": "No studies.",
    "function": "No specific studies have been reported, but by its similarity to H2A.L it is likely that it participates in the histone–protamine exchange process.",
    "sequence": "Has one of the most divergent sequences from cH2A. Identity with cH2A may be as low as 24%. H2A.P variants have a shortened C-termus, truncated docking domain, altered acidic patch, arginine rich N-terminus. H2A.P have lost two key conserved arginine (R) residues in Loop 1 and 2 that contact the DNA minor-groove and acquired many acidic residues at sites including contacts with DNA and H2B. The last 14 residues in H2A.P are more evolutionary contrained than the rest of the protein, suggesting their potential interaction with non-histone proteins.",
    "localization": "H2A.P presence has not been confirmed at protein level. H2A.P mRNA were found to be strongly enriched in round and elongating spermatids [govin_pericentric_2007], expressed in the post-meiotic stages of spermatogenesis [el_kennani_ms_histonedb_2017].",
    "structure": "Nucleosomes incorporating H2A.P are predicted to be highly destabilized [molaro_evolutionary_2018].",
    "interactions": "In yeast two-hybrid screens was shown to interact with huntingtin [faber_huntingtin_1998].",
    "disease": "No information.",
    "caveats": "H2A.P in mice was previously name H2AL3 [govin_pericentric_2007] and H2A.Lap4 [soboleva_unique_2011].",
}
desc_dict

{'summary': "H2A.P_(Eutheria) - is a class of testis-specific short H2A variants in eutherian mammals expressed at post-meiotic stages of spermatogensis. It is not well studied. Its expression in mouse has been shown only at mRNA level [el_kennani_ms_histonedb_2017], although evolutionary analysis strongly argues that it is a protein coding gene [molaro_evolutionary_2018]. In human H2A.P gene H2ap was previously named HYPM (Huntingtin-interacting protein M) since in yeast two-hybrid experiments it was shown to interact with huntingtin, which contains an expanded polyglutamine tract in individuals with Huntington's disease [faber_huntingtin_1998].",
 'genes': 'Сommon ancestor of eutherian mammals encoded a single H2A.P gene. These genes are usually located on X chromosome as do other short H2As [molaro_evolutionary_2018]. Humans and mouse have one gene name H2AP and H2ap, respectively.',
 'evolution': 'Molaro et al. found that short H2A variants show greater evolutionary divergence betw

In [73]:
desk_str = ', '.join([f'{k}="{v}"' for k, v in desc_dict.items()])
query = (
    f"UPDATE histone_description SET {desk_str} "
    f"WHERE id = {desc_id}"
)
print(query)
cursor.execute(query)

UPDATE histone_description SET summary="H2A.P_(Eutheria) - is a class of testis-specific short H2A variants in eutherian mammals expressed at post-meiotic stages of spermatogensis. It is not well studied. Its expression in mouse has been shown only at mRNA level [el_kennani_ms_histonedb_2017], although evolutionary analysis strongly argues that it is a protein coding gene [molaro_evolutionary_2018]. In human H2A.P gene H2ap was previously named HYPM (Huntingtin-interacting protein M) since in yeast two-hybrid experiments it was shown to interact with huntingtin, which contains an expanded polyglutamine tract in individuals with Huntington's disease [faber_huntingtin_1998].", genes="Сommon ancestor of eutherian mammals encoded a single H2A.P gene. These genes are usually located on X chromosome as do other short H2As [molaro_evolutionary_2018]. Humans and mouse have one gene name H2AP and H2ap, respectively.", evolution="Molaro et al. found that short H2A variants show greater evolution

In [74]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    f"WHERE h.id = '{ch_name}' "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
0,H2A.P,variant,Eutheria,9347,85,short_H2A_(Eutheria),85,H2A.P_(Eutheria) - is a class of testis-specif...,,Сommon ancestor of eutherian mammals encoded a...,...,,No studies.,"No specific studies have been reported, but by...",Has one of the most divergent sequences from c...,H2A.P presence has not been confirmed at prote...,,Nucleosomes incorporating H2A.P are predicted ...,In yeast two-hybrid screens was shown to inter...,No information.,H2A.P in mice was previously name H2AL3 [govin...


In [75]:
df["summary"].values[0]

"H2A.P_(Eutheria) - is a class of testis-specific short H2A variants in eutherian mammals expressed at post-meiotic stages of spermatogensis. It is not well studied. Its expression in mouse has been shown only at mRNA level [el_kennani_ms_histonedb_2017], although evolutionary analysis strongly argues that it is a protein coding gene [molaro_evolutionary_2018]. In human H2A.P gene H2ap was previously named HYPM (Huntingtin-interacting protein M) since in yeast two-hybrid experiments it was shown to interact with huntingtin, which contains an expanded polyglutamine tract in individuals with Huntington's disease [faber_huntingtin_1998]."

## Поменять поле description у H2A.B_(Homo_sapiens)

In [97]:
ch_name = "H2A.B_(Homo_sapiens)"

In [98]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    f"WHERE h.id = '{ch_name}' "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
0,H2A.B_(Homo_sapiens),variant,Homo sapiens,9606,119,H2A.B,119,"H2A.B_(Homo_sapiens), previously known as ""Bar...",,,...,,,,,,,,,,


In [99]:
desc_id = df["description"].values[0]
desc_id

119

In [100]:
df["summary"].values[0]

'H2A.B_(Homo_sapiens), previously known as "Barr body deficient" (H2A.Bbd) variant  is a group of short replication independent H2A variants in humans encoded by H2AB1, H2AB2 and H2AB3 genes. They are involved in spermiogenesis, transcription regulation, splicing, DNA synthesis. Genes: In human H2A.B is encoded by H2AB1, H2AB2 and H2AB3 genes. Genes are intronless. H2AB2 and H2AB3 encode identical proteins. H2AB1 protein which differes by ??one?? amino acid. Genes are located on X chromosome. Evolution: H2A.B is a rapildy evolving variant which is closely related to H2A.L, H2A.P, H2A.Q [molaro_evolutionary_2018]. Common ancestor of eutherian mammals encoded two or three H2A.B genes. Knock-out: H2A.B knock-out mice are viable, subfertile and display changes in splicing events [anuar_gene_2019]. Sequence: Around 50% identity with the canonical H2A, has truncated docking domain, divergent histone fold domain, altered acidic patch, arginine rich N-terminus [molaro_evolutionary_2018]. Local

In [101]:
desc_dict = {
    "summary": 'H2A.B_(Homo_sapiens), previously known as "Barr body deficient" (H2A.Bbd) variant  is a group of short replication independent H2A variants in humans encoded by H2AB1, H2AB2 and H2AB3 genes. They are involved in spermiogenesis, transcription regulation, splicing, DNA synthesis.',
    "genes": "In human H2A.B is encoded by H2AB1, H2AB2 and H2AB3 genes. Genes are intronless. H2AB2 and H2AB3 encode identical proteins. H2AB1 protein which differes by ??one?? amino acid. Genes are located on X chromosome.",
    "evolution": "H2A.B is a rapildy evolving variant which is closely related to H2A.L, H2A.P, H2A.Q [molaro_evolutionary_2018]. Common ancestor of eutherian mammals encoded two or three H2A.B genes.",
    "knock_out": "H2A.B knock-out mice are viable, subfertile and display changes in splicing events [anuar_gene_2019].",
    "sequence": "Around 50% identity with the canonical H2A, has truncated docking domain, divergent histone fold domain, altered acidic patch, arginine rich N-terminus [molaro_evolutionary_2018].",
    "localization": "H2A.B is expressed during mammalian male germ cell development and in the brain [molaro_evolutionary_2018,jiang_short_2020]. Originally, H2A.B  was characterized by its exclusion from the inactive X chromosome if overexpressed in female somatic cells [chadwick_novel_2001]. However, experiments in mouse testis revealed that H2A.B is in fact present on the inactive X chromosome  [soboleva_unique_2011]. Short H2A variants localize to sites of open chromatin and potentiate DNA synthesis, transcription, and splicing [molaro_evolutionary_2018]. In mouse this histone variant was shown to bind to RNA directly in vitro and in vivo, and associate with mRNA at intron—exon boundaries [soboleva_new_2017].",
    "deposition": "It was suggested that H2A.B is incorporated into DNA sites that are transiently exposed, for instance, during DNA replication [jiang_short_2020]. H2A.B-H2A dimers in nucleosomes can spontaneously be replaced by H2A-H2B dimers [hirano_histone_2021].",
    "structure": "H2A.B containing nucleosomes wrap less DNA (~120-130 bp instead of ~150 bp) [sugiyama_distinct_2014,doyen_dissection_2006], form loosely packed chromatin.",
    "interactions": "RNA processing factors, proteins involved in the piRNA pathway [jiang_short_2020].",
    "disease": "H2A.B is upregulated in cancer as other short H2A variants [chew_short_2021].",
    "caveats": "Due to rapid evolution H2A.B function in different species may vary. For example, human H2A.B is retained during spermiogenesis, while is mouse it disappears and H2A.L is retained instead [molaro_evolutionary_2018]. Mouse H2A.B has additional negative residue in acidic patch, which is thought to increase its propensity to compact nucleosomal arrays relative to human H2A.B.",
}
desc_dict

{'summary': 'H2A.B_(Homo_sapiens), previously known as "Barr body deficient" (H2A.Bbd) variant  is a group of short replication independent H2A variants in humans encoded by H2AB1, H2AB2 and H2AB3 genes. They are involved in spermiogenesis, transcription regulation, splicing, DNA synthesis.',
 'genes': 'In human H2A.B is encoded by H2AB1, H2AB2 and H2AB3 genes. Genes are intronless. H2AB2 and H2AB3 encode identical proteins. H2AB1 protein which differes by ??one?? amino acid. Genes are located on X chromosome.',
 'evolution': 'H2A.B is a rapildy evolving variant which is closely related to H2A.L, H2A.P, H2A.Q [molaro_evolutionary_2018]. Common ancestor of eutherian mammals encoded two or three H2A.B genes.',
 'knock_out': 'H2A.B knock-out mice are viable, subfertile and display changes in splicing events [anuar_gene_2019].',
 'sequence': 'Around 50% identity with the canonical H2A, has truncated docking domain, divergent histone fold domain, altered acidic patch, arginine rich N-termin

In [104]:
desk_str = ', '.join([f"{k}='{v}'" for k, v in desc_dict.items()])
query = (
    f"UPDATE histone_description SET {desk_str} "
    f"WHERE id = {desc_id}"
)
print(query)
cursor.execute(query)

UPDATE histone_description SET summary='H2A.B_(Homo_sapiens), previously known as "Barr body deficient" (H2A.Bbd) variant  is a group of short replication independent H2A variants in humans encoded by H2AB1, H2AB2 and H2AB3 genes. They are involved in spermiogenesis, transcription regulation, splicing, DNA synthesis.', genes='In human H2A.B is encoded by H2AB1, H2AB2 and H2AB3 genes. Genes are intronless. H2AB2 and H2AB3 encode identical proteins. H2AB1 protein which differes by ??one?? amino acid. Genes are located on X chromosome.', evolution='H2A.B is a rapildy evolving variant which is closely related to H2A.L, H2A.P, H2A.Q [molaro_evolutionary_2018]. Common ancestor of eutherian mammals encoded two or three H2A.B genes.', knock_out='H2A.B knock-out mice are viable, subfertile and display changes in splicing events [anuar_gene_2019].', sequence='Around 50% identity with the canonical H2A, has truncated docking domain, divergent histone fold domain, altered acidic patch, arginine ri

In [105]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    f"WHERE h.id = '{ch_name}' "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
0,H2A.B_(Homo_sapiens),variant,Homo sapiens,9606,119,H2A.B,119,"H2A.B_(Homo_sapiens), previously known as ""Bar...",,"In human H2A.B is encoded by H2AB1, H2AB2 and ...",...,,"H2A.B knock-out mice are viable, subfertile an...",,"Around 50% identity with the canonical H2A, ha...",H2A.B is expressed during mammalian male germ ...,It was suggested that H2A.B is incorporated in...,H2A.B containing nucleosomes wrap less DNA (~1...,"RNA processing factors, proteins involved in t...",H2A.B is upregulated in cancer as other short ...,Due to rapid evolution H2A.B function in diffe...


In [106]:
df["summary"].values[0]

'H2A.B_(Homo_sapiens), previously known as "Barr body deficient" (H2A.Bbd) variant  is a group of short replication independent H2A variants in humans encoded by H2AB1, H2AB2 and H2AB3 genes. They are involved in spermiogenesis, transcription regulation, splicing, DNA synthesis.'

## Поменять поле description у H2A.B_(Mus_musculus)

In [107]:
ch_name = "H2A.B_(Mus_musculus)"

In [108]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    f"WHERE h.id = '{ch_name}' "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
0,H2A.B_(Mus_musculus),variant,Mus musculus,10090,120,H2A.B,120,H2A.B_(Mus_musculus) is a group of three isofo...,,,...,,,,,,,,,,


In [109]:
desc_id = df["description"].values[0]
desc_id

120

In [110]:
df["summary"].values[0]

'H2A.B_(Mus_musculus) is a group of three isoforms of H2A.B variant in mouse encoded by H2ab1, H2ab2 and H2ab3 genes. H2ab1 is the most studied gene and corresponds to H2A.B.3 variant. Evolution: Moreover, the ancestral loci encoding H2A.B genes relocated away from the X Chromosome to autosomes in mouse (Chr 3 and Chr 16) and rat genomes (Chr 20 and Chr 18) as determined by flanking genes, and the encoded H2A.B genes have now been deleted or have decayed beyond recognition. However, the rat-mouse common ancestor acquired an intact H2A.B gene in a new X-linked locus [molaro_evolutionary_2018]. Caveats: Unfortunately there is some confusion with respect to the number suffixes in the literature and between the gene names and variant names. The H2ab1 gene variant was named initially H2A.Lap1 [soboleva_unique_2011], but renamed afterwards to H2A.B.3 [soboleva_new_2017], and this name has been used afterwards [anuar_gene_2019,jiang_short_2020]. Hence we follow the established convention.'

In [111]:
desc_dict = {
    "summary": "H2A.B_(Mus_musculus) is a group of three isoforms of H2A.B variant in mouse encoded by H2ab1, H2ab2 and H2ab3 genes. H2ab1 is the most studied gene and corresponds to H2A.B.3 variant.",
    "evolution": "Moreover, the ancestral loci encoding H2A.B genes relocated away from the X Chromosome to autosomes in mouse (Chr 3 and Chr 16) and rat genomes (Chr 20 and Chr 18) as determined by flanking genes, and the encoded H2A.B genes have now been deleted or have decayed beyond recognition. However, the rat-mouse common ancestor acquired an intact H2A.B gene in a new X-linked locus [molaro_evolutionary_2018].",
    "caveats": "Unfortunately there is some confusion with respect to the number suffixes in the literature and between the gene names and variant names. The H2ab1 gene variant was named initially H2A.Lap1 [soboleva_unique_2011], but renamed afterwards to H2A.B.3 [soboleva_new_2017], and this name has been used afterwards [anuar_gene_2019,jiang_short_2020]. Hence we follow the established convention.",
}
desc_dict

{'summary': 'H2A.B_(Mus_musculus) is a group of three isoforms of H2A.B variant in mouse encoded by H2ab1, H2ab2 and H2ab3 genes. H2ab1 is the most studied gene and corresponds to H2A.B.3 variant.',
 'evolution': 'Moreover, the ancestral loci encoding H2A.B genes relocated away from the X Chromosome to autosomes in mouse (Chr 3 and Chr 16) and rat genomes (Chr 20 and Chr 18) as determined by flanking genes, and the encoded H2A.B genes have now been deleted or have decayed beyond recognition. However, the rat-mouse common ancestor acquired an intact H2A.B gene in a new X-linked locus [molaro_evolutionary_2018].',
 'caveats': 'Unfortunately there is some confusion with respect to the number suffixes in the literature and between the gene names and variant names. The H2ab1 gene variant was named initially H2A.Lap1 [soboleva_unique_2011], but renamed afterwards to H2A.B.3 [soboleva_new_2017], and this name has been used afterwards [anuar_gene_2019,jiang_short_2020]. Hence we follow the est

In [113]:
desk_str = ', '.join([f"{k}='{v}'" for k, v in desc_dict.items()])
query = (
    f"UPDATE histone_description SET {desk_str} "
    f"WHERE id = {desc_id}"
)
print(query)
cursor.execute(query)

UPDATE histone_description SET summary='H2A.B_(Mus_musculus) is a group of three isoforms of H2A.B variant in mouse encoded by H2ab1, H2ab2 and H2ab3 genes. H2ab1 is the most studied gene and corresponds to H2A.B.3 variant.', evolution='Moreover, the ancestral loci encoding H2A.B genes relocated away from the X Chromosome to autosomes in mouse (Chr 3 and Chr 16) and rat genomes (Chr 20 and Chr 18) as determined by flanking genes, and the encoded H2A.B genes have now been deleted or have decayed beyond recognition. However, the rat-mouse common ancestor acquired an intact H2A.B gene in a new X-linked locus [molaro_evolutionary_2018].', caveats='Unfortunately there is some confusion with respect to the number suffixes in the literature and between the gene names and variant names. The H2ab1 gene variant was named initially H2A.Lap1 [soboleva_unique_2011], but renamed afterwards to H2A.B.3 [soboleva_new_2017], and this name has been used afterwards [anuar_gene_2019,jiang_short_2020]. Henc

In [114]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    f"WHERE h.id = '{ch_name}' "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
0,H2A.B_(Mus_musculus),variant,Mus musculus,10090,120,H2A.B,120,H2A.B_(Mus_musculus) is a group of three isofo...,,,...,,,,,,,,,,Unfortunately there is some confusion with res...


In [115]:
df["summary"].values[0]

'H2A.B_(Mus_musculus) is a group of three isoforms of H2A.B variant in mouse encoded by H2ab1, H2ab2 and H2ab3 genes. H2ab1 is the most studied gene and corresponds to H2A.B.3 variant.'

## Поменять поле description у H2A.L_(Mus_musculus)

In [116]:
ch_name = "H2A.L_(Mus_musculus)"

In [117]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    f"WHERE h.id = '{ch_name}' "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
0,H2A.L_(Mus_musculus),variant,Mus musculus,10090,122,H2A.L,122,H2A.L_(Mus_musculus) - is a group of H2A.L his...,,,...,,,,,,,,,,


In [118]:
desc_id = df["description"].values[0]
desc_id

122

In [119]:
df["summary"].values[0]

'H2A.L_(Mus_musculus) - is a group of H2A.L histone variants in mouse. The single autosomal H2al2a (H2A.L.2 variant) gene in mouse located on chromosome 2 is expressed at much higher levels than the sex-linked copies; knockout of this single gene is sufficient to cause male sterility [molaro_evolutionary_2018]. See H2A.L description for more information. Caveats: there is some confusion with the variant names in the literature. Govin et al. initially identified and named three variants H2AL1, H2AL2, H2AL3 [govin_pericentric_2007]. These are encoded by H2al1a, H2al2a and H2ap genes, respectively. The H2Al2 endoded variant was later renamed in the literature as H2A.L.2 [jiang_short_2020]. The H2ap gencoded variant is in fact H2A.P in the current nomenclature. Alternatively, Molaro et al. have named three evolutionary conserved loci on X chromosome that harbor H2A.L genes, these loci are H2A.L.1, H2A.L.2 and H2A.L.3 [molaro_evolutionary_2018]. While human gene and variant naming is in lin

In [120]:
desc_dict = {
    "summary": "H2A.L_(Mus_musculus) - is a group of H2A.L histone variants in mouse. The single autosomal H2al2a (H2A.L.2 variant) gene in mouse located on chromosome 2 is expressed at much higher levels than the sex-linked copies; knockout of this single gene is sufficient to cause male sterility [molaro_evolutionary_2018]. See H2A.L description for more information.",
    "caveats": "There is some confusion with the variant names in the literature. Govin et al. initially identified and named three variants H2AL1, H2AL2, H2AL3 [govin_pericentric_2007]. These are encoded by H2al1a, H2al2a and H2ap genes, respectively. The H2Al2 endoded variant was later renamed in the literature as H2A.L.2 [jiang_short_2020]. The H2ap gencoded variant is in fact H2A.P in the current nomenclature. Alternatively, Molaro et al. have named three evolutionary conserved loci on X chromosome that harbor H2A.L genes, these loci are H2A.L.1, H2A.L.2 and H2A.L.3 [molaro_evolutionary_2018]. While human gene and variant naming is in line with the names of these loci, in mouse names of these loci do not correspond exactly to the originally proposed gene names except for H2A.L.3 locus which harbors H2al3 gene (not H2ap gene(!)). In mouse at H2A.L.1 locus a pseudogene H2al1q-ps is located, at H2A.L.2 locus mouse has H2al1m gene (H2al1a-H2al1o genes located within 5 megabases from this locus have similar protein sequences(?)), while H2A.L.2 variant encoded by H2al2a is located on chromosome 2 (two similar genes H2al2b and H2al2c are located on Y chromosome). In this classification we follow the numbering used in gene names which also corresponds to the one established earlier in the literature. Hence, H2A.L.2 variant in mouse encompasses H2al2a gene and similar H2al2b and H2al2c genes, while H2A.L.1 variant encompasses 14 H2al1a-H2al1o genes including H2al1m gene located at a syntenic location of the H2A.L.2 locus.",
}
desc_dict

{'summary': 'H2A.L_(Mus_musculus) - is a group of H2A.L histone variants in mouse. The single autosomal H2al2a (H2A.L.2 variant) gene in mouse located on chromosome 2 is expressed at much higher levels than the sex-linked copies; knockout of this single gene is sufficient to cause male sterility [molaro_evolutionary_2018]. See H2A.L description for more information.',
 'caveats': 'There is some confusion with the variant names in the literature. Govin et al. initially identified and named three variants H2AL1, H2AL2, H2AL3 [govin_pericentric_2007]. These are encoded by H2al1a, H2al2a and H2ap genes, respectively. The H2Al2 endoded variant was later renamed in the literature as H2A.L.2 [jiang_short_2020]. The H2ap gencoded variant is in fact H2A.P in the current nomenclature. Alternatively, Molaro et al. have named three evolutionary conserved loci on X chromosome that harbor H2A.L genes, these loci are H2A.L.1, H2A.L.2 and H2A.L.3 [molaro_evolutionary_2018]. While human gene and varian

In [122]:
desk_str = ', '.join([f"{k}='{v}'" for k, v in desc_dict.items()])
query = (
    f"UPDATE histone_description SET {desk_str} "
    f"WHERE id = {desc_id}"
)
print(query)
cursor.execute(query)

UPDATE histone_description SET summary='H2A.L_(Mus_musculus) - is a group of H2A.L histone variants in mouse. The single autosomal H2al2a (H2A.L.2 variant) gene in mouse located on chromosome 2 is expressed at much higher levels than the sex-linked copies; knockout of this single gene is sufficient to cause male sterility [molaro_evolutionary_2018]. See H2A.L description for more information.', caveats='There is some confusion with the variant names in the literature. Govin et al. initially identified and named three variants H2AL1, H2AL2, H2AL3 [govin_pericentric_2007]. These are encoded by H2al1a, H2al2a and H2ap genes, respectively. The H2Al2 endoded variant was later renamed in the literature as H2A.L.2 [jiang_short_2020]. The H2ap gencoded variant is in fact H2A.P in the current nomenclature. Alternatively, Molaro et al. have named three evolutionary conserved loci on X chromosome that harbor H2A.L genes, these loci are H2A.L.1, H2A.L.2 and H2A.L.3 [molaro_evolutionary_2018]. Whil

In [123]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    f"WHERE h.id = '{ch_name}' "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
0,H2A.L_(Mus_musculus),variant,Mus musculus,10090,122,H2A.L,122,H2A.L_(Mus_musculus) - is a group of H2A.L his...,,,...,,,,,,,,,,There is some confusion with the variant names...


In [124]:
df["summary"].values[0]

'H2A.L_(Mus_musculus) - is a group of H2A.L histone variants in mouse. The single autosomal H2al2a (H2A.L.2 variant) gene in mouse located on chromosome 2 is expressed at much higher levels than the sex-linked copies; knockout of this single gene is sufficient to cause male sterility [molaro_evolutionary_2018]. See H2A.L description for more information.'

## Поменять поле description у H2A.B.3_(Mus_musculus)

In [125]:
ch_name = "H2A.B.3_(Mus_musculus)"

In [126]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    f"WHERE h.id = '{ch_name}' "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
0,H2A.B.3_(Mus_musculus),variant,Mus musculus,10090,151,H2A.B_(Mus_musculus),151,H2A.B.3_(Mus_musculus) is one of the three iso...,,,...,,,,,,,,,,


In [127]:
desc_id = df["description"].values[0]
desc_id

151

In [128]:
df["summary"].values[0]

'H2A.B.3_(Mus_musculus) is one of the three isoforms of H2A.B variant in mouse encoded by H2ab1 gene. This gene has been characterized the most experimentally among all H2A.B. variants. Caveats: Unfortunately there is some confusion with respect to the number suffixes in the literature and between the gene names and variant names. This variant was named initially H2A.Lap1 [soboleva_unique_2011], but renamed afterwards to H2A.B.3 [soboleva_new_2017], and this name has been used afterwards [anuar_gene_2019,jiang_short_2020]. Hence we follow the established convention. Caveat: Moralo et al. suggests that "rat-mouse common ancestor acquired an intact H2A.B gene in a new X-linked locus (H2a.b.ratMouse1, also historically named H2A.B.3) [molaro_evolutionary_2018]". Our analysis of supplementary information suggests that H2a.b.ratMouse1 is H2ab2 gene in the current nomenclature. Our analysis of sequences published by Soboleva et al. suggests that H2A.B.3 (H2A.Lap1) is H2ab1 gene and not H2ab2

In [129]:
desc_dict = {
    "summary": "H2A.B.3_(Mus_musculus) is one of the three isoforms of H2A.B variant in mouse encoded by H2ab1 gene. This gene has been characterized the most experimentally among all H2A.B. variants.",
    "caveats": "Unfortunately there is some confusion with respect to the number suffixes in the literature and between the gene names and variant names. This variant was named initially H2A.Lap1 [soboleva_unique_2011], but renamed afterwards to H2A.B.3 [soboleva_new_2017], and this name has been used afterwards [anuar_gene_2019,jiang_short_2020]. Hence we follow the established convention. Moralo et al. suggests that rat-mouse common ancestor acquired an intact H2A.B gene in a new X-linked locus (H2a.b.ratMouse1, also historically named H2A.B.3) [molaro_evolutionary_2018]. Our analysis of supplementary information suggests that H2a.b.ratMouse1 is H2ab2 gene in the current nomenclature. Our analysis of sequences published by Soboleva et al. suggests that H2A.B.3 (H2A.Lap1) is H2ab1 gene and not H2ab2 [soboleva_unique_2011].",
}
desc_dict

{'summary': 'H2A.B.3_(Mus_musculus) is one of the three isoforms of H2A.B variant in mouse encoded by H2ab1 gene. This gene has been characterized the most experimentally among all H2A.B. variants.',
 'caveats': 'Unfortunately there is some confusion with respect to the number suffixes in the literature and between the gene names and variant names. This variant was named initially H2A.Lap1 [soboleva_unique_2011], but renamed afterwards to H2A.B.3 [soboleva_new_2017], and this name has been used afterwards [anuar_gene_2019,jiang_short_2020]. Hence we follow the established convention. Moralo et al. suggests that rat-mouse common ancestor acquired an intact H2A.B gene in a new X-linked locus (H2a.b.ratMouse1, also historically named H2A.B.3) [molaro_evolutionary_2018]. Our analysis of supplementary information suggests that H2a.b.ratMouse1 is H2ab2 gene in the current nomenclature. Our analysis of sequences published by Soboleva et al. suggests that H2A.B.3 (H2A.Lap1) is H2ab1 gene and n

In [131]:
desk_str = ', '.join([f"{k}='{v}'" for k, v in desc_dict.items()])
query = (
    f"UPDATE histone_description SET {desk_str} "
    f"WHERE id = {desc_id}"
)
print(query)
cursor.execute(query)

UPDATE histone_description SET summary='H2A.B.3_(Mus_musculus) is one of the three isoforms of H2A.B variant in mouse encoded by H2ab1 gene. This gene has been characterized the most experimentally among all H2A.B. variants.', caveats='Unfortunately there is some confusion with respect to the number suffixes in the literature and between the gene names and variant names. This variant was named initially H2A.Lap1 [soboleva_unique_2011], but renamed afterwards to H2A.B.3 [soboleva_new_2017], and this name has been used afterwards [anuar_gene_2019,jiang_short_2020]. Hence we follow the established convention. Moralo et al. suggests that rat-mouse common ancestor acquired an intact H2A.B gene in a new X-linked locus (H2a.b.ratMouse1, also historically named H2A.B.3) [molaro_evolutionary_2018]. Our analysis of supplementary information suggests that H2a.b.ratMouse1 is H2ab2 gene in the current nomenclature. Our analysis of sequences published by Soboleva et al. suggests that H2A.B.3 (H2A.La

In [132]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    f"WHERE h.id = '{ch_name}' "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
0,H2A.B.3_(Mus_musculus),variant,Mus musculus,10090,151,H2A.B_(Mus_musculus),151,H2A.B.3_(Mus_musculus) is one of the three iso...,,,...,,,,,,,,,,Unfortunately there is some confusion with res...


In [133]:
df["summary"].values[0]

'H2A.B.3_(Mus_musculus) is one of the three isoforms of H2A.B variant in mouse encoded by H2ab1 gene. This gene has been characterized the most experimentally among all H2A.B. variants.'

In [134]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    f"WHERE h.parent = '{new_name}' "
)
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
0,H2A.B,variant,Eutheria,9347,83,short_H2A_(Eutheria),83,"H2A.B_(Eutheria), previously known as ""Barr bo...",,Common ancestor of eutherian mammals encoded t...,...,,"H2A.B knock-out mice are viable, subfertile an...",,"round 50% identity with the canonical H2A, has...",H2A.B is expressed during mammalian male germ ...,,,,H2A.B is upregulated in cancer as other short ...,Due to rapid evolution H2A.B function in diffe...
1,H2A.L,variant,Eutheria,9347,84,short_H2A_(Eutheria),84,H2A.L_(Eutheria) - is a class of short H2A var...,,Сommon ancestor of eutherian mammals encoded t...,...,,H2A.L.2 knock out mice are infertile because t...,Required for the histone–protamine exchange pr...,Sequence is divergent from cH2A. Identity with...,Accumulates in spermatid nuclei until the end ...,Likely mediated by interaction with RNA [hogho...,,H2A.L.2 preferentially dimerizes with H2B.1 (T...,"No information, since no H2A.L have been so fa...",There is some confusion in literature with res...
2,H2A.P,variant,Eutheria,9347,85,short_H2A_(Eutheria),85,H2A.P_(Eutheria) - is a class of testis-specif...,,Сommon ancestor of eutherian mammals encoded a...,...,,No studies.,"No specific studies have been reported, but by...",Has one of the most divergent sequences from c...,H2A.P presence has not been confirmed at prote...,,Nucleosomes incorporating H2A.P are predicted ...,In yeast two-hybrid screens was shown to inter...,No information.,H2A.P in mice was previously name H2AL3 [govin...
3,H2A.Q,variant,Eutheria,9347,86,short_H2A_(Eutheria),86,H2A.Q - is a short H2A variant present in many...,,,...,,,,,,,,,,


In [135]:
# Make sure data is committed to the database
conn.commit()

## Change name from H2A.B → H2A.B_(Eutheria)

In [136]:
prev_name, new_name = "H2A.B", "H2A.B_(Eutheria)"

In [137]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_has_publication hp "
    "ON h.id = hp.histone_id "
    f"WHERE h.id='{prev_name}'"
)
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
0,H2A.B,variant,Eutheria,9347,83,short_H2A_(Eutheria),H2A.B,anuar_gene_2019
1,H2A.B,variant,Eutheria,9347,83,short_H2A_(Eutheria),H2A.B,chadwick_novel_2001
2,H2A.B,variant,Eutheria,9347,83,short_H2A_(Eutheria),H2A.B,chew_short_2021
3,H2A.B,variant,Eutheria,9347,83,short_H2A_(Eutheria),H2A.B,doyen_dissection_2006
4,H2A.B,variant,Eutheria,9347,83,short_H2A_(Eutheria),H2A.B,hirano_histone_2021
5,H2A.B,variant,Eutheria,9347,83,short_H2A_(Eutheria),H2A.B,jiang_short_2020
6,H2A.B,variant,Eutheria,9347,83,short_H2A_(Eutheria),H2A.B,molaro_evolutionary_2018
7,H2A.B,variant,Eutheria,9347,83,short_H2A_(Eutheria),H2A.B,soboleva_new_2017
8,H2A.B,variant,Eutheria,9347,83,short_H2A_(Eutheria),H2A.B,soboleva_unique_2011
9,H2A.B,variant,Eutheria,9347,83,short_H2A_(Eutheria),H2A.B,sugiyama_distinct_2014


In [138]:
query = (
    "SELECT * FROM histone "
    f"WHERE parent='{prev_name}'"
)
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
0,H2A.B_(Homo_sapiens),variant,Homo sapiens,9606,119,H2A.B
1,H2A.B_(Mus_musculus),variant,Mus musculus,10090,120,H2A.B


### Save children and publications

In [139]:
query = (
    "SELECT * FROM histone "
    f"WHERE parent='{prev_name}'"
)
cursor.execute(query)
children = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])["id"].values
children

array(['H2A.B_(Homo_sapiens)', 'H2A.B_(Mus_musculus)'], dtype=object)

In [140]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_has_publication hp "
    "ON h.id = hp.histone_id "
    f"WHERE h.id='{prev_name}'"
)
cursor.execute(query)
publications = pd.DataFrame(
    cursor.fetchall(), columns=[i[0] for i in cursor.description]
)["publication_id"].values
publications

array(['anuar_gene_2019', 'chadwick_novel_2001', 'chew_short_2021',
       'doyen_dissection_2006', 'hirano_histone_2021', 'jiang_short_2020',
       'molaro_evolutionary_2018', 'soboleva_new_2017',
       'soboleva_unique_2011', 'sugiyama_distinct_2014'], dtype=object)

### Delete relations

In [141]:
query = f"DELETE FROM histone_has_publication WHERE histone_id = '{prev_name}'" 
print(query) 
cursor.execute(query)

DELETE FROM histone_has_publication WHERE histone_id = 'H2A.B'


In [142]:
query = f"UPDATE histone SET parent=null WHERE parent = '{prev_name}'" 
print(query) 
cursor.execute(query) 

UPDATE histone SET parent=null WHERE parent = 'H2A.B'


In [143]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_has_publication hp "
    "ON h.id = hp.histone_id "
    f"WHERE h.id='{prev_name}'"
)
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
0,H2A.B,variant,Eutheria,9347,83,short_H2A_(Eutheria),,


In [144]:
query = (
    "SELECT * FROM histone "
    f"WHERE parent='{prev_name}'"
)
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent


### Update name

In [145]:
query = f"UPDATE histone SET id='{new_name}' WHERE id = '{prev_name}'" 
print(query) 
cursor.execute(query) 

UPDATE histone SET id='H2A.B_(Eutheria)' WHERE id = 'H2A.B'


In [146]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_has_publication hp "
    "ON h.id = hp.histone_id "
    f"WHERE h.id='{new_name}'"
)
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
0,H2A.B_(Eutheria),variant,Eutheria,9347,83,short_H2A_(Eutheria),,


### Return relations

In [147]:
query = (
    "SELECT * FROM histone "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["id"].isin(children)]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
104,H2A.B_(Homo_sapiens),variant,Homo sapiens,9606,119,
105,H2A.B_(Mus_musculus),variant,Mus musculus,10090,120,


In [148]:
for ch in children:
    query = f"UPDATE histone SET parent='{new_name}' WHERE id = '{ch}'" 
    print(query)
    cursor.execute(query)

UPDATE histone SET parent='H2A.B_(Eutheria)' WHERE id = 'H2A.B_(Homo_sapiens)'
UPDATE histone SET parent='H2A.B_(Eutheria)' WHERE id = 'H2A.B_(Mus_musculus)'


In [149]:
query = (
    "SELECT * FROM histone "
    f"WHERE parent='{new_name}'"
)
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
0,H2A.B_(Homo_sapiens),variant,Homo sapiens,9606,119,H2A.B_(Eutheria)
1,H2A.B_(Mus_musculus),variant,Mus musculus,10090,120,H2A.B_(Eutheria)


In [150]:
query = "SELECT id FROM publication"
cursor.execute(query)
exist_pubs = [i[0] for i in cursor.fetchall()]
for pid in publications:
    if pid not in exist_pubs:
        print(f"Strange {pid}")
    cursor.execute(add_histone_has_publication, (new_name, pid))

In [151]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_has_publication hp "
    "ON h.id = hp.histone_id "
    f"WHERE h.id='{new_name}'"
)
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
0,H2A.B_(Eutheria),variant,Eutheria,9347,83,short_H2A_(Eutheria),H2A.B_(Eutheria),anuar_gene_2019
1,H2A.B_(Eutheria),variant,Eutheria,9347,83,short_H2A_(Eutheria),H2A.B_(Eutheria),chadwick_novel_2001
2,H2A.B_(Eutheria),variant,Eutheria,9347,83,short_H2A_(Eutheria),H2A.B_(Eutheria),chew_short_2021
3,H2A.B_(Eutheria),variant,Eutheria,9347,83,short_H2A_(Eutheria),H2A.B_(Eutheria),doyen_dissection_2006
4,H2A.B_(Eutheria),variant,Eutheria,9347,83,short_H2A_(Eutheria),H2A.B_(Eutheria),hirano_histone_2021
5,H2A.B_(Eutheria),variant,Eutheria,9347,83,short_H2A_(Eutheria),H2A.B_(Eutheria),jiang_short_2020
6,H2A.B_(Eutheria),variant,Eutheria,9347,83,short_H2A_(Eutheria),H2A.B_(Eutheria),molaro_evolutionary_2018
7,H2A.B_(Eutheria),variant,Eutheria,9347,83,short_H2A_(Eutheria),H2A.B_(Eutheria),soboleva_new_2017
8,H2A.B_(Eutheria),variant,Eutheria,9347,83,short_H2A_(Eutheria),H2A.B_(Eutheria),soboleva_unique_2011
9,H2A.B_(Eutheria),variant,Eutheria,9347,83,short_H2A_(Eutheria),H2A.B_(Eutheria),sugiyama_distinct_2014


In [152]:
# Make sure data is committed to the database
conn.commit()

## Change name from H2A.L → H2A.L_(Eutheria)

In [153]:
prev_name, new_name = "H2A.L", "H2A.L_(Eutheria)"

In [154]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_has_publication hp "
    "ON h.id = hp.histone_id "
    f"WHERE h.id='{prev_name}'"
)
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
0,H2A.L,variant,Eutheria,9347,84,short_H2A_(Eutheria),H2A.L,govin_pericentric_2007
1,H2A.L,variant,Eutheria,9347,84,short_H2A_(Eutheria),H2A.L,17261847
2,H2A.L,variant,Eutheria,9347,84,short_H2A_(Eutheria),H2A.L,18703863
3,H2A.L,variant,Eutheria,9347,84,short_H2A_(Eutheria),H2A.L,19506029
4,H2A.L,variant,Eutheria,9347,84,short_H2A_(Eutheria),H2A.L,22650316
5,H2A.L,variant,Eutheria,9347,84,short_H2A_(Eutheria),H2A.L,25731851
6,H2A.L,variant,Eutheria,9347,84,short_H2A_(Eutheria),H2A.L,barral_histone_2017
7,H2A.L,variant,Eutheria,9347,84,short_H2A_(Eutheria),H2A.L,govin_pericentric_2007
8,H2A.L,variant,Eutheria,9347,84,short_H2A_(Eutheria),H2A.L,hoghoughi_rna-guided_2020
9,H2A.L,variant,Eutheria,9347,84,short_H2A_(Eutheria),H2A.L,molaro_evolutionary_2018


In [155]:
query = (
    "SELECT * FROM histone "
    f"WHERE parent='{prev_name}'"
)
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
0,H2A.L_(Homo_sapiens),variant,Homo sapiens,9606,121,H2A.L
1,H2A.L_(Mus_musculus),variant,Mus musculus,10090,122,H2A.L


### Save children and publications

In [156]:
query = (
    "SELECT * FROM histone "
    f"WHERE parent='{prev_name}'"
)
cursor.execute(query)
children = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])["id"].values
children

array(['H2A.L_(Homo_sapiens)', 'H2A.L_(Mus_musculus)'], dtype=object)

In [157]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_has_publication hp "
    "ON h.id = hp.histone_id "
    f"WHERE h.id='{prev_name}'"
)
cursor.execute(query)
publications = pd.DataFrame(
    cursor.fetchall(), columns=[i[0] for i in cursor.description]
)["publication_id"].values
publications

array([' govin_pericentric_2007', '17261847', '18703863', '19506029',
       '22650316', '25731851', 'barral_histone_2017',
       'govin_pericentric_2007', 'hoghoughi_rna-guided_2020',
       'molaro_evolutionary_2018', 'Seal et al. unpublished',
       'soboleva_unique_2011'], dtype=object)

### Delete relations

In [158]:
query = f"DELETE FROM histone_has_publication WHERE histone_id = '{prev_name}'" 
print(query) 
cursor.execute(query)

DELETE FROM histone_has_publication WHERE histone_id = 'H2A.L'


In [159]:
query = f"UPDATE histone SET parent=null WHERE parent = '{prev_name}'" 
print(query) 
cursor.execute(query) 

UPDATE histone SET parent=null WHERE parent = 'H2A.L'


In [160]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_has_publication hp "
    "ON h.id = hp.histone_id "
    f"WHERE h.id='{prev_name}'"
)
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
0,H2A.L,variant,Eutheria,9347,84,short_H2A_(Eutheria),,


In [161]:
query = (
    "SELECT * FROM histone "
    f"WHERE parent='{prev_name}'"
)
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent


### Update name

In [162]:
query = f"UPDATE histone SET id='{new_name}' WHERE id = '{prev_name}'" 
print(query) 
cursor.execute(query) 

UPDATE histone SET id='H2A.L_(Eutheria)' WHERE id = 'H2A.L'


In [163]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_has_publication hp "
    "ON h.id = hp.histone_id "
    f"WHERE h.id='{new_name}'"
)
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
0,H2A.L_(Eutheria),variant,Eutheria,9347,84,short_H2A_(Eutheria),,


### Return relations

In [164]:
query = (
    "SELECT * FROM histone "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["id"].isin(children)]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
114,H2A.L_(Homo_sapiens),variant,Homo sapiens,9606,121,
115,H2A.L_(Mus_musculus),variant,Mus musculus,10090,122,


In [165]:
for ch in children:
    query = f"UPDATE histone SET parent='{new_name}' WHERE id = '{ch}'" 
    print(query)
    cursor.execute(query)

UPDATE histone SET parent='H2A.L_(Eutheria)' WHERE id = 'H2A.L_(Homo_sapiens)'
UPDATE histone SET parent='H2A.L_(Eutheria)' WHERE id = 'H2A.L_(Mus_musculus)'


In [166]:
query = (
    "SELECT * FROM histone "
    f"WHERE parent='{new_name}'"
)
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
0,H2A.L_(Homo_sapiens),variant,Homo sapiens,9606,121,H2A.L_(Eutheria)
1,H2A.L_(Mus_musculus),variant,Mus musculus,10090,122,H2A.L_(Eutheria)


In [167]:
query = "SELECT id FROM publication"
cursor.execute(query)
exist_pubs = [i[0] for i in cursor.fetchall()]
for pid in publications:
    if pid not in exist_pubs:
        print(f"Strange {pid}")
    cursor.execute(add_histone_has_publication, (new_name, pid))

In [168]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_has_publication hp "
    "ON h.id = hp.histone_id "
    f"WHERE h.id='{new_name}'"
)
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
0,H2A.L_(Eutheria),variant,Eutheria,9347,84,short_H2A_(Eutheria),H2A.L_(Eutheria),govin_pericentric_2007
1,H2A.L_(Eutheria),variant,Eutheria,9347,84,short_H2A_(Eutheria),H2A.L_(Eutheria),17261847
2,H2A.L_(Eutheria),variant,Eutheria,9347,84,short_H2A_(Eutheria),H2A.L_(Eutheria),18703863
3,H2A.L_(Eutheria),variant,Eutheria,9347,84,short_H2A_(Eutheria),H2A.L_(Eutheria),19506029
4,H2A.L_(Eutheria),variant,Eutheria,9347,84,short_H2A_(Eutheria),H2A.L_(Eutheria),22650316
5,H2A.L_(Eutheria),variant,Eutheria,9347,84,short_H2A_(Eutheria),H2A.L_(Eutheria),25731851
6,H2A.L_(Eutheria),variant,Eutheria,9347,84,short_H2A_(Eutheria),H2A.L_(Eutheria),barral_histone_2017
7,H2A.L_(Eutheria),variant,Eutheria,9347,84,short_H2A_(Eutheria),H2A.L_(Eutheria),govin_pericentric_2007
8,H2A.L_(Eutheria),variant,Eutheria,9347,84,short_H2A_(Eutheria),H2A.L_(Eutheria),hoghoughi_rna-guided_2020
9,H2A.L_(Eutheria),variant,Eutheria,9347,84,short_H2A_(Eutheria),H2A.L_(Eutheria),molaro_evolutionary_2018


In [169]:
# Make sure data is committed to the database
conn.commit()

## Change name from H2A.P → H2A.P_(Eutheria)

In [170]:
prev_name, new_name = "H2A.P", "H2A.P_(Eutheria)"

In [171]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_has_publication hp "
    "ON h.id = hp.histone_id "
    f"WHERE h.id='{prev_name}'"
)
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
0,H2A.P,variant,Eutheria,9347,85,short_H2A_(Eutheria),H2A.P,9700202
1,H2A.P,variant,Eutheria,9347,85,short_H2A_(Eutheria),H2A.P,el_kennani_ms_histonedb_2017
2,H2A.P,variant,Eutheria,9347,85,short_H2A_(Eutheria),H2A.P,faber_huntingtin_1998
3,H2A.P,variant,Eutheria,9347,85,short_H2A_(Eutheria),H2A.P,govin_pericentric_2007
4,H2A.P,variant,Eutheria,9347,85,short_H2A_(Eutheria),H2A.P,molaro_evolutionary_2018
5,H2A.P,variant,Eutheria,9347,85,short_H2A_(Eutheria),H2A.P,soboleva_unique_2011


In [172]:
query = (
    "SELECT * FROM histone "
    f"WHERE parent='{prev_name}'"
)
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
0,H2A.P_(Homo_sapiens),variant,Homo sapiens,9606,123,H2A.P
1,H2A.P_(Mus_musculus),variant,Mus musculus,10090,124,H2A.P


### Save children and publications

In [173]:
query = (
    "SELECT * FROM histone "
    f"WHERE parent='{prev_name}'"
)
cursor.execute(query)
children = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])["id"].values
children

array(['H2A.P_(Homo_sapiens)', 'H2A.P_(Mus_musculus)'], dtype=object)

In [174]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_has_publication hp "
    "ON h.id = hp.histone_id "
    f"WHERE h.id='{prev_name}'"
)
cursor.execute(query)
publications = pd.DataFrame(
    cursor.fetchall(), columns=[i[0] for i in cursor.description]
)["publication_id"].values
publications

array(['9700202', 'el_kennani_ms_histonedb_2017', 'faber_huntingtin_1998',
       'govin_pericentric_2007', 'molaro_evolutionary_2018',
       'soboleva_unique_2011'], dtype=object)

### Delete relations

In [175]:
query = f"DELETE FROM histone_has_publication WHERE histone_id = '{prev_name}'" 
print(query) 
cursor.execute(query)

DELETE FROM histone_has_publication WHERE histone_id = 'H2A.P'


In [176]:
query = f"UPDATE histone SET parent=null WHERE parent = '{prev_name}'" 
print(query) 
cursor.execute(query) 

UPDATE histone SET parent=null WHERE parent = 'H2A.P'


In [177]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_has_publication hp "
    "ON h.id = hp.histone_id "
    f"WHERE h.id='{prev_name}'"
)
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
0,H2A.P,variant,Eutheria,9347,85,short_H2A_(Eutheria),,


In [178]:
query = (
    "SELECT * FROM histone "
    f"WHERE parent='{prev_name}'"
)
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent


### Update name

In [179]:
query = f"UPDATE histone SET id='{new_name}' WHERE id = '{prev_name}'" 
print(query) 
cursor.execute(query) 

UPDATE histone SET id='H2A.P_(Eutheria)' WHERE id = 'H2A.P'


In [180]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_has_publication hp "
    "ON h.id = hp.histone_id "
    f"WHERE h.id='{new_name}'"
)
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
0,H2A.P_(Eutheria),variant,Eutheria,9347,85,short_H2A_(Eutheria),,


### Return relations

In [181]:
query = (
    "SELECT * FROM histone "
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["id"].isin(children)]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
118,H2A.P_(Homo_sapiens),variant,Homo sapiens,9606,123,
119,H2A.P_(Mus_musculus),variant,Mus musculus,10090,124,


In [182]:
for ch in children:
    query = f"UPDATE histone SET parent='{new_name}' WHERE id = '{ch}'" 
    print(query)
    cursor.execute(query)

UPDATE histone SET parent='H2A.P_(Eutheria)' WHERE id = 'H2A.P_(Homo_sapiens)'
UPDATE histone SET parent='H2A.P_(Eutheria)' WHERE id = 'H2A.P_(Mus_musculus)'


In [183]:
query = (
    "SELECT * FROM histone "
    f"WHERE parent='{new_name}'"
)
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
0,H2A.P_(Homo_sapiens),variant,Homo sapiens,9606,123,H2A.P_(Eutheria)
1,H2A.P_(Mus_musculus),variant,Mus musculus,10090,124,H2A.P_(Eutheria)


In [184]:
query = "SELECT id FROM publication"
cursor.execute(query)
exist_pubs = [i[0] for i in cursor.fetchall()]
for pid in publications:
    if pid not in exist_pubs:
        print(f"Strange {pid}")
    cursor.execute(add_histone_has_publication, (new_name, pid))

In [185]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_has_publication hp "
    "ON h.id = hp.histone_id "
    f"WHERE h.id='{new_name}'"
)
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
0,H2A.P_(Eutheria),variant,Eutheria,9347,85,short_H2A_(Eutheria),H2A.P_(Eutheria),9700202
1,H2A.P_(Eutheria),variant,Eutheria,9347,85,short_H2A_(Eutheria),H2A.P_(Eutheria),el_kennani_ms_histonedb_2017
2,H2A.P_(Eutheria),variant,Eutheria,9347,85,short_H2A_(Eutheria),H2A.P_(Eutheria),faber_huntingtin_1998
3,H2A.P_(Eutheria),variant,Eutheria,9347,85,short_H2A_(Eutheria),H2A.P_(Eutheria),govin_pericentric_2007
4,H2A.P_(Eutheria),variant,Eutheria,9347,85,short_H2A_(Eutheria),H2A.P_(Eutheria),molaro_evolutionary_2018
5,H2A.P_(Eutheria),variant,Eutheria,9347,85,short_H2A_(Eutheria),H2A.P_(Eutheria),soboleva_unique_2011


In [186]:
# Make sure data is committed to the database
conn.commit()

## Change name from H2A.Q → H2A.Q_(Eutheria)

In [187]:
prev_name, new_name = "H2A.Q", "H2A.Q_(Eutheria)"

In [188]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_has_publication hp "
    "ON h.id = hp.histone_id "
    f"WHERE h.id='{prev_name}'"
)
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
0,H2A.Q,variant,Eutheria,9347,86,short_H2A_(Eutheria),H2A.Q,jiang_short_2020
1,H2A.Q,variant,Eutheria,9347,86,short_H2A_(Eutheria),H2A.Q,molaro_evolutionary_2018


In [189]:
query = (
    "SELECT * FROM histone "
    f"WHERE parent='{prev_name}'"
)
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent


### Save publications

In [190]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_has_publication hp "
    "ON h.id = hp.histone_id "
    f"WHERE h.id='{prev_name}'"
)
cursor.execute(query)
publications = pd.DataFrame(
    cursor.fetchall(), columns=[i[0] for i in cursor.description]
)["publication_id"].values
publications

array(['jiang_short_2020', 'molaro_evolutionary_2018'], dtype=object)

### Delete relations

In [191]:
query = f"DELETE FROM histone_has_publication WHERE histone_id = '{prev_name}'" 
print(query) 
cursor.execute(query)

DELETE FROM histone_has_publication WHERE histone_id = 'H2A.Q'


In [192]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_has_publication hp "
    "ON h.id = hp.histone_id "
    f"WHERE h.id='{prev_name}'"
)
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
0,H2A.Q,variant,Eutheria,9347,86,short_H2A_(Eutheria),,


### Update name

In [193]:
query = f"UPDATE histone SET id='{new_name}' WHERE id = '{prev_name}'" 
print(query) 
cursor.execute(query) 

UPDATE histone SET id='H2A.Q_(Eutheria)' WHERE id = 'H2A.Q'


In [194]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_has_publication hp "
    "ON h.id = hp.histone_id "
    f"WHERE h.id='{new_name}'"
)
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
0,H2A.Q_(Eutheria),variant,Eutheria,9347,86,short_H2A_(Eutheria),,


### Return relations

In [195]:
query = "SELECT id FROM publication"
cursor.execute(query)
exist_pubs = [i[0] for i in cursor.fetchall()]
for pid in publications:
    if pid not in exist_pubs:
        print(f"Strange {pid}")
    cursor.execute(add_histone_has_publication, (new_name, pid))

In [196]:
query = (
    "SELECT * FROM histone h LEFT JOIN histone_has_publication hp "
    "ON h.id = hp.histone_id "
    f"WHERE h.id='{new_name}'"
)
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
0,H2A.Q_(Eutheria),variant,Eutheria,9347,86,short_H2A_(Eutheria),H2A.Q_(Eutheria),jiang_short_2020
1,H2A.Q_(Eutheria),variant,Eutheria,9347,86,short_H2A_(Eutheria),H2A.Q_(Eutheria),molaro_evolutionary_2018


In [197]:
# Make sure data is committed to the database
conn.commit()

In [None]:
# desc_dict = {
#     "summary": None,
#     "taxonomy": None,
#     "genes": None,
#     "evolution": None,
#     "expression": None,
#     "knock_out": None,
#     "function": None,
#     "sequence": None,
#     "localization": None,
#     "deposition": None,
#     "structure": None,
#     "interactions": None,
#     "disease": None,
#     "caveats": None,
# }

# Close connections

In [198]:
cursor.close()
conn.close()
tunnel.stop()