In [1]:
import pandas as pd
from mysql.connector import connection
from sshtunnel import SSHTunnelForwarder

In [5]:
with open('db_curated_server_info.txt', 'r') as file:
    lines = file.readlines()

config = {}

for line in lines:
    line = line.strip()  
    if line and not line.startswith('#'):  
        key, value = line.split('=', 1)  
        config[key] = value.strip()  

server_name = config.get('server_name')
srever_port = int(config.get('srever_port'))
ssh_password = config.get('ssh_password')
ssh_username = config.get('ssh_username')
db_adress = config.get('db_adress')
db_port = int(config.get('db_port'))

In [6]:
tunnel = SSHTunnelForwarder(
    (server_name, srever_port),
    ssh_password=ssh_password,
    ssh_username=ssh_username,
    remote_bind_address=(db_adress, db_port),
)
tunnel.start()
print(tunnel.local_bind_port)

32771


In [7]:
conn = connection.MySQLConnection(
    user="db_user",
    password="db_password",
    host="localhost",
    port=tunnel.local_bind_port,
    database="db_name",
)
cursor = conn.cursor()

In [4]:
query = "SHOW TABLES;"
cursor.execute(query)
cursor.fetchall()

[('alternative_name',),
 ('histone',),
 ('histone_description',),
 ('histone_has_publication',),
 ('publication',),
 ('sequence',),
 ('sequence_has_publication',)]

In [5]:
query = "SELECT * FROM histone"
cursor.execute(query)
cursor.fetchall()
", ".join([i[0] for i in cursor.description])

'id, level, taxonomic_span, taxonomic_span_id, description, parent'

In [33]:
query = "SELECT * FROM alternative_name"
cursor.execute(query)
cursor.fetchall()
", ".join([i[0] for i in cursor.description])

'id, name, taxonomy, gene, splice, histone'

In [37]:
query = "SELECT * FROM publication"
cursor.execute(query)
cursor.fetchall()
", ".join([i[0] for i in cursor.description])

'id, title, doi, author, year'

In [38]:
query = "SELECT * FROM histone_has_publication"
cursor.execute(query)
cursor.fetchall()
", ".join([i[0] for i in cursor.description])

'histone_id, publication_id'

In [30]:
add_histone_description = (
    "INSERT INTO histone_description "
    "(summary, taxonomy, genes, evolution, expression, knock_out, function, sequence, localization, deposition, structure, interactions, disease, caveats) "
    "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
)
add_histone = (
    "INSERT INTO histone "
    "(id, level, taxonomic_span, taxonomic_span_id, description, parent) "
    "VALUES (%(id)s, %(level)s, %(taxonomic_span)s, %(taxonomic_span_id)s, %(description)s, %(parent)s)"
)
add_alternate_names = (
    "INSERT INTO alternative_name "
    "(name, taxonomy, gene, splice, histone) "
    "VALUES (%(name)s, %(taxonomy)s, %(gene)s, %(splice)s, %(histone)s)"
)
add_publication = (
    "INSERT INTO publication "
    "(id, title, doi, author, year) "
    "VALUES (%(id)s, %(title)s, %(doi)s, %(author)s, %(year)s)"
)
add_histone_has_publication = (
    "INSERT INTO histone_has_publication "
    "(histone_id, publication_id) "
    "VALUES (%s, %s)"
)

## Add histone variant cH2B.15_(Homo_sapiens)

In [10]:
query = "SELECT * FROM histone"
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
histone_df[histone_df["id"] == "cH2B.15_(Homo_sapiens)"]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent


In [21]:
histone_description_summary = "cH2B.15_(Homo_sapiens)  is an isoform (variant) of clustered (canonical) H2B histones in human endoded by H2BC12L gene. H2BC12L is represented by a human-specific duplication of the H2BC12 gene from the chromosome 6 onto chromosome 21, the gene appears to be expressed, its protein sequence is expected to have two nonsynonymous substitutions with respect to H2BC12 gene."
data_histone_description = (histone_description_summary,) + ("null",) * 13
cursor.execute(add_histone_description, data_histone_description)

histone_description_id = cursor.lastrowid
data_histone = {
    "id": "cH2B.15_(Homo_sapiens)",
    "level": "variant",
    "taxonomic_span": "Homo sapiens",
    "taxonomic_span_id": "9606",
    "description": histone_description_id,
    "parent": "cH2B_(Homo_sapiens)",
}
cursor.execute(add_histone, data_histone)

# Make sure data is committed to the database
conn.commit()

### Checking added data

In [26]:
query = (
    "SELECT h.*, d.summary, d.taxonomy, d.genes, d.evolution, d.expression, d.knock_out, d.function, d.sequence, d.localization, d.deposition, d.structure, d.interactions, d.disease, d.caveats "
    "FROM histone h LEFT JOIN histone_description d ON h.description = d.id"
)
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
histone_df[histone_df["id"] == "cH2B.15_(Homo_sapiens)"]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,summary,taxonomy,genes,evolution,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
32,cH2B.15_(Homo_sapiens),variant,Homo sapiens,9606,156.0,cH2B_(Homo_sapiens),cH2B.15_(Homo_sapiens) is an isoform (variant...,,,,,,,,,,,,,


## Add histone variant cenH3_(Plants)

In [27]:
query = "SELECT * FROM histone"
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
histone_df[histone_df["id"] == "cenH3_(Plants)"]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent


In [29]:
histone_df[histone_df["id"] == "cenH3_(Eukarya)"]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
1,cenH3_(Eukarya),variant_group,,,,H3


In [30]:
query = (
    "SELECT h.*, d.summary, d.taxonomy, d.genes, d.evolution, d.expression, d.knock_out, d.function, d.sequence, d.localization, d.deposition, d.structure, d.interactions, d.disease, d.caveats "
    "FROM histone h LEFT JOIN histone_description d ON h.description = d.id"
)
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
histone_df[histone_df["id"] == "cenH3_(Eukarya)"]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,summary,taxonomy,genes,evolution,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
1,cenH3_(Eukarya),variant_group,,,,H3,,,,,,,,,,,,,,


In [31]:
histone_description_summary = "cenH3 is a centromere-specific histone variant, which replaces canonical H3 in centromeric nucleosomes. It is required for kinetochore formation, mitotic progression and chromosome segregation. cenH3 has an extended L1-loop and its N-terminal tail is very different from other H3 variants. cenH3s have an extended L1-loop and usually replace Phe84 in canonical H3 with Trp, and Thr 107 with Ala, Cys, or Ser. cenH3s ususally lack a conserved glutamine in the alpha1 helix of the histone fold. cenH3s typically have only about 50-60% amino acid identity to canonical H3 in the histone fold domain and no conservation of the N-terminus."
data_histone_description = (histone_description_summary,) + ("null",) * 13
cursor.execute(add_histone_description, data_histone_description)

histone_description_id = cursor.lastrowid
data_histone = {
    "id": "cenH3_(Plants)",
    "level": "variant",
    "taxonomic_span": "Eukaryotes",
    "taxonomic_span_id": "2759",
    "description": histone_description_id,
    "parent": "cenH3_(Eukarya)",
}
cursor.execute(add_histone, data_histone)

# Make sure data is committed to the database
conn.commit()

### Add alternate names

In [35]:
data_alternative_names = [
    {
        "name": "CENP-A",
        "taxonomy": "null",
        "gene": "null",
        "splice": "null",
        "histone": "cenH3_(Plants)",
    },
    {
        "name": "Cse4",
        "taxonomy": "null",
        "gene": "null",
        "splice": "null",
        "histone": "cenH3_(Plants)",
    },
    {
        "name": "HCP-3",
        "taxonomy": "null",
        "gene": "null",
        "splice": "null",
        "histone": "cenH3_(Plants)",
    },
    {
        "name": "CNP1",
        "taxonomy": "null",
        "gene": "null",
        "splice": "null",
        "histone": "cenH3_(Plants)",
    },
    {
        "name": "HTR12",
        "taxonomy": "null",
        "gene": "null",
        "splice": "null",
        "histone": "cenH3_(Plants)",
    },
    {
        "name": "CNA1",
        "taxonomy": "null",
        "gene": "null",
        "splice": "null",
        "histone": "cenH3_(Plants)",
    },
    {
        "name": "cid",
        "taxonomy": "null",
        "gene": "null",
        "splice": "null",
        "histone": "cenH3_(Plants)",
    },
]
for data_an in data_alternative_names:
    cursor.execute(add_alternate_names, data_an)

# Make sure data is committed to the database
conn.commit()

In [22]:
query = "SELECT * FROM alternative_name"
cursor.execute(query)
alternative_name_df = pd.DataFrame(
    cursor.fetchall(), columns=[i[0] for i in cursor.description]
)
alternative_name_df[alternative_name_df["histone"] == "cenH3_(Plants)"]

Unnamed: 0,id,name,taxonomy,gene,splice,histone
81,82,CENP-A,,,,cenH3_(Plants)
82,83,Cse4,,,,cenH3_(Plants)
83,84,HCP-3,,,,cenH3_(Plants)
84,85,CNP1,,,,cenH3_(Plants)
85,86,HTR12,,,,cenH3_(Plants)
86,87,CNA1,,,,cenH3_(Plants)
87,88,cid,,,,cenH3_(Plants)


In [10]:
for i in alternative_name_df[alternative_name_df["histone"] == "cenH3_(Plants)"]["id"]:
    query = f"UPDATE alternative_name SET taxonomy=null WHERE id={i}"
    cursor.execute(query)
# Make sure data is committed to the database
conn.commit()

In [12]:
for i in alternative_name_df[alternative_name_df["histone"] == "cenH3_(Plants)"]["id"]:
    query = f"UPDATE alternative_name SET gene=null WHERE id={i}"
    cursor.execute(query)
# Make sure data is committed to the database
conn.commit()

In [13]:
for i in alternative_name_df[alternative_name_df["histone"] == "cenH3_(Plants)"]["id"]:
    query = f"UPDATE alternative_name SET splice=null WHERE id={i}"
    cursor.execute(query)
# Make sure data is committed to the database
conn.commit()

### Add publications

In [57]:
publication_list = [
    "22650316",
    "19766562",
    "25956076",
    "23324462",
    "21743476",
    "14583738",
]
query = "SELECT id FROM publication"
cursor.execute(query)
for pub in set(publication_list) - set([i[0] for i in cursor]):
    data_publication = {
        "id": pub,
        "title": "null",
        "doi": "null",
        "author": "null",
        "year": "null",
    }
    cursor.execute(add_publication, data_publication)
    cursor.execute(add_histone_has_publication, ("cenH3_(Plants)", pub))

# Make sure data is committed to the database
conn.commit()

In [50]:
query = "SELECT publication_id FROM histone h LEFT JOIN histone_has_publication p ON h.id = p.histone_id"
cursor.execute(query)
for pub in set(publication_list) - set([i[0] for i in cursor]):
    cursor.execute(add_histone_has_publication, ("cenH3_(Plants)", pub))

# Make sure data is committed to the database
conn.commit()

In [60]:
cursor.execute(add_histone_has_publication, ("cenH3_(Plants)", "22650316"))

# Make sure data is committed to the database
conn.commit()

### Checking added data

In [32]:
query = (
    "SELECT h.*, d.summary, d.taxonomy, d.genes, d.evolution, d.expression, d.knock_out, d.function, d.sequence, d.localization, d.deposition, d.structure, d.interactions, d.disease, d.caveats "
    "FROM histone h LEFT JOIN histone_description d ON h.description = d.id"
)
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
histone_df[histone_df["id"] == "cenH3_(Plants)"]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,summary,taxonomy,genes,evolution,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
2,cenH3_(Plants),variant,Eukaryotes,2759,157.0,cenH3_(Eukarya),cenH3 is a centromere-specific histone variant...,,,,,,,,,,,,,


In [15]:
query = "SELECT * FROM alternative_name"
cursor.execute(query)
alternative_name_df = pd.DataFrame(
    cursor.fetchall(), columns=[i[0] for i in cursor.description]
)
alternative_name_df[alternative_name_df["histone"] == "cenH3_(Plants)"]

Unnamed: 0,id,name,taxonomy,gene,splice,histone
81,82,CENP-A,,,,cenH3_(Plants)
82,83,Cse4,,,,cenH3_(Plants)
83,84,HCP-3,,,,cenH3_(Plants)
84,85,CNP1,,,,cenH3_(Plants)
85,86,HTR12,,,,cenH3_(Plants)
86,87,CNA1,,,,cenH3_(Plants)
87,88,cid,,,,cenH3_(Plants)


In [61]:
query = (
    "SELECT * "
    "FROM histone h LEFT JOIN histone_has_publication p ON h.id = p.histone_id"
)
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
histone_df[histone_df["id"] == "cenH3_(Plants)"]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
2,cenH3_(Plants),variant,Eukaryotes,2759,157.0,cenH3_(Eukarya),cenH3_(Plants),14583738
3,cenH3_(Plants),variant,Eukaryotes,2759,157.0,cenH3_(Eukarya),cenH3_(Plants),19766562
4,cenH3_(Plants),variant,Eukaryotes,2759,157.0,cenH3_(Eukarya),cenH3_(Plants),21743476
5,cenH3_(Plants),variant,Eukaryotes,2759,157.0,cenH3_(Eukarya),cenH3_(Plants),22650316
6,cenH3_(Plants),variant,Eukaryotes,2759,157.0,cenH3_(Eukarya),cenH3_(Plants),23324462
7,cenH3_(Plants),variant,Eukaryotes,2759,157.0,cenH3_(Eukarya),cenH3_(Plants),25956076


## Add histone variant TS_H3.4

Эта запись дублиует запись `H3.4_(Mammalia)`. Поэтому просто совместим информацию из обоих записей. Для эотого нужно добавить в запись `H3.4_(Mammalia)` еще одно альтенативное имя ("H3.1t"), а также добавить публикации: ["22650316", "8986613"].

In [66]:
query = "SELECT * FROM histone"
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
histone_df[histone_df["id"] == "TS_H3.4"]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent


In [67]:
histone_df[histone_df["id"] == "H3.4_(Mammalia)"]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
153,H3.4_(Mammalia),variant_group,Mammalia,40674,32.0,H3


In [68]:
query = "SELECT * FROM alternative_name"
cursor.execute(query)
alternative_name_df = pd.DataFrame(
    cursor.fetchall(), columns=[i[0] for i in cursor.description]
)
alternative_name_df[alternative_name_df["histone"] == "H3.4_(Mammalia)"]

Unnamed: 0,id,name,taxonomy,gene,splice,histone
32,33,H3T,,,,H3.4_(Mammalia)


In [69]:
query = (
    "SELECT * "
    "FROM histone h LEFT JOIN histone_has_publication p ON h.id = p.histone_id"
)
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
histone_df[histone_df["id"] == "H3.4_(Mammalia)"]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
336,H3.4_(Mammalia),variant_group,Mammalia,40674,32.0,H3,H3.4_(Mammalia),kycia_tudor_2014
337,H3.4_(Mammalia),variant_group,Mammalia,40674,32.0,H3,H3.4_(Mammalia),dong_structural_2020
338,H3.4_(Mammalia),variant_group,Mammalia,40674,32.0,H3,H3.4_(Mammalia),kycia_tudor_2014
339,H3.4_(Mammalia),variant_group,Mammalia,40674,32.0,H3,H3.4_(Mammalia),tachiwana_nucleosome_2008
340,H3.4_(Mammalia),variant_group,Mammalia,40674,32.0,H3,H3.4_(Mammalia),tachiwana_structural_2010
341,H3.4_(Mammalia),variant_group,Mammalia,40674,32.0,H3,H3.4_(Mammalia),ueda_testis-specific_2017


### Add alternate names

In [75]:
data_alternative_name = {
    "name": "H3.1t",
    "taxonomy": "null",
    "gene": "null",
    "splice": "null",
    "histone": "H3.4_(Mammalia)",
}
cursor.execute(add_alternate_names, data_alternative_name)

# Make sure data is committed to the database
conn.commit()

### Add publications

In [77]:
publication_list = ["22650316", "8986613"]
query = "SELECT id FROM publication"
cursor.execute(query)
for pub in set(publication_list) - set([i[0] for i in cursor]):
    data_publication = {
        "id": pub,
        "title": "null",
        "doi": "null",
        "author": "null",
        "year": "null",
    }
    cursor.execute(add_publication, data_publication)
    cursor.execute(add_histone_has_publication, ("H3.4_(Mammalia)", pub))

# Make sure data is committed to the database
conn.commit()

In [79]:
query = "SELECT publication_id FROM histone h LEFT JOIN histone_has_publication p ON h.id = p.histone_id"
cursor.execute(query)
for pub in set(publication_list) - set([i[0] for i in cursor]):
    cursor.execute(add_histone_has_publication, ("H3.4_(Mammalia)", pub))

# Make sure data is committed to the database
conn.commit()

In [81]:
cursor.execute(add_histone_has_publication, ("H3.4_(Mammalia)", "22650316"))

# Make sure data is committed to the database
conn.commit()

### Checking added data

In [76]:
query = "SELECT * FROM alternative_name"
cursor.execute(query)
alternative_name_df = pd.DataFrame(
    cursor.fetchall(), columns=[i[0] for i in cursor.description]
)
alternative_name_df[alternative_name_df["histone"] == "H3.4_(Mammalia)"]

Unnamed: 0,id,name,taxonomy,gene,splice,histone
32,33,H3T,,,,H3.4_(Mammalia)
88,90,H3.1t,,,,H3.4_(Mammalia)


In [None]:
query = (
    "SELECT * "
    "FROM histone h LEFT JOIN histone_has_publication p ON h.id = p.histone_id"
)
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
histone_df[histone_df["id"] == "H3.4_(Mammalia)"]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,histone_id,publication_id
336,H3.4_(Mammalia),variant_group,Mammalia,40674,32.0,H3,H3.4_(Mammalia),kycia_tudor_2014
337,H3.4_(Mammalia),variant_group,Mammalia,40674,32.0,H3,H3.4_(Mammalia),22650316
338,H3.4_(Mammalia),variant_group,Mammalia,40674,32.0,H3,H3.4_(Mammalia),8986613
339,H3.4_(Mammalia),variant_group,Mammalia,40674,32.0,H3,H3.4_(Mammalia),dong_structural_2020
340,H3.4_(Mammalia),variant_group,Mammalia,40674,32.0,H3,H3.4_(Mammalia),kycia_tudor_2014
341,H3.4_(Mammalia),variant_group,Mammalia,40674,32.0,H3,H3.4_(Mammalia),tachiwana_nucleosome_2008
342,H3.4_(Mammalia),variant_group,Mammalia,40674,32.0,H3,H3.4_(Mammalia),tachiwana_structural_2010
343,H3.4_(Mammalia),variant_group,Mammalia,40674,32.0,H3,H3.4_(Mammalia),ueda_testis-specific_2017


## Edit histone variant H2A.W

In [86]:
query = (
    "SELECT h.*, d.summary, d.taxonomy, d.genes, d.evolution, d.expression, d.knock_out, d.function, d.sequence, d.localization, d.deposition, d.structure, d.interactions, d.disease, d.caveats "
    "FROM histone h LEFT JOIN histone_description d ON h.description = d.id"
)
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
histone_df[histone_df["id"] == "H2A.W"]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,summary,taxonomy,genes,evolution,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
107,H2A.W,variant_group,Magnoliopsida,3398,13.0,H2A,H2A.W is a plant specific variant found in ang...,H2A.W variant is found exclusively in angiospe...,"Arabidopsis has three H2A.W genes (HTA6, HTA7,...",H2A.W is a plant-lineage-specific variant that...,In Arabidopsis HTA6 and HTA7 were found to hav...,,H2A.W participates in constitutive heterochrom...,A characteristic feature of H2A.W sequences is...,Genome-wide analysis showed that H2A.W variant...,Specific deposition mechanism are not known. C...,The extended C-terminal tail of H2A.W interact...,It was hypothesized that C-terminal tail of H2...,,


In [90]:
knock_out_text = "In Arabidopsis single mutants of H2A.W genes do not dispay any phenotype; double mutants, HtA6 HTA7 and HTA6 HTA12, and triple mutants result in growth defects that were even more severe in the triple mutant. This suggests that the three H2A.W paralogs in Arabidopsis are functionally redundant [alvarez-venegas_canonical_2019,yelagandula_histone_2014]. HTA7 knock-out in Arabidopsis results in sensitivity to genotoxic agents [lorkovic_compartmentalization_2017]."
query = f"UPDATE histone_description SET knock_out='{knock_out_text}' WHERE id=13"
cursor.execute(query)
# Make sure data is committed to the database
conn.commit()

### Checking added data

In [91]:
query = (
    "SELECT h.*, d.summary, d.taxonomy, d.genes, d.evolution, d.expression, d.knock_out, d.function, d.sequence, d.localization, d.deposition, d.structure, d.interactions, d.disease, d.caveats "
    "FROM histone h LEFT JOIN histone_description d ON h.description = d.id"
)
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
histone_df[histone_df["id"] == "H2A.W"]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,summary,taxonomy,genes,evolution,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
107,H2A.W,variant_group,Magnoliopsida,3398,13.0,H2A,H2A.W is a plant specific variant found in ang...,H2A.W variant is found exclusively in angiospe...,"Arabidopsis has three H2A.W genes (HTA6, HTA7,...",H2A.W is a plant-lineage-specific variant that...,In Arabidopsis HTA6 and HTA7 were found to hav...,In Arabidopsis single mutants of H2A.W genes d...,H2A.W participates in constitutive heterochrom...,A characteristic feature of H2A.W sequences is...,Genome-wide analysis showed that H2A.W variant...,Specific deposition mechanism are not known. C...,The extended C-terminal tail of H2A.W interact...,It was hypothesized that C-terminal tail of H2...,,


## Edit histone variant H3.3_(Animals)

In [92]:
query = (
    "SELECT h.*, d.summary, d.taxonomy, d.genes, d.evolution, d.expression, d.knock_out, d.function, d.sequence, d.localization, d.deposition, d.structure, d.interactions, d.disease, d.caveats "
    "FROM histone h LEFT JOIN histone_description d ON h.description = d.id"
)
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
histone_df[histone_df["id"] == "H3.3_(Animals)"]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,summary,taxonomy,genes,evolution,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
147,H3.3_(Animals),variant,Metazoa,33208,70.0,H3.3,H3.3_(Animals) is a class of replication-inde...,Animals (Metazoa),In human H3.3 is encoded by H3-3A and H3-3B ge...,,H3.3 mRNAs are polyadenilated. Expressed indep...,,H3.3 has been implicated in a variety of biolo...,,,In humans and other animals specialized chaper...,,,Mutations in histone genes can affect sites of...,


In [93]:
knock_out_text = "In mice complete depletion of H3.3 leads to developmental retardation and early embryonic lethality. At the cellular level, H3.3 loss triggers cell cycle suppression and cell death. Surprisingly, H3.3 depletion does not dramatically disrupt gene regulation in the developing embryo. Instead, H3.3 depletion causes dysfunction of heterochromatin structures at telomeres, centromeres, and pericentromeric regions of chromosomes, leading to mitotic defects [jang_histone_2015]. In C. elegans Despite these specific expression patterns, we find that neither loss of individual H3.3 homologs nor the knockout of all five H3.3-coding genes causes sterility or lethality. However, we demonstrate an essential role for the conserved histone chaperone HIRA in the nucleosomal loading of all H3.3 variants. This requirement can be bypassed by mutation of the H3.3-specific residues to those found in H3. While even removal of all H3.3 homologs does not result in lethality, it leads to reduced fertility and viability in response to high-temperature stress [delaney_differential_2018]. Flies that lack both H3.3 genes have reduced viability and individuals that survive to adulthood are completely sterile in both sexes [sakai_transcriptional_2009]. Targeted disruption of one gene (H3f3b) results in a number of phenotypic abnormalities, including a reduction in H3.3 histone levels, leading to male infertility, as well as abnormal sperm and testes morphology. Additionally, null germ cell populations at specific stages in spermatogenesis, in particular spermatocytes and spermatogonia, exhibited increased rates of apoptosis. Disruption of H3f3b also altered histone post-translational modifications and gene expression in the testes, with the most prominent changes occurring at genes involved in spermatogenesis. Finally, H3f3b null testes also exhibited abnormal germ cell chromatin reorganization and reduced protamine incorporation [yuen_histone_2014]."
query = f"UPDATE histone_description SET knock_out='{knock_out_text}' WHERE id=70"
cursor.execute(query)
# Make sure data is committed to the database
conn.commit()

### Checking added data

In [94]:
query = (
    "SELECT h.*, d.summary, d.taxonomy, d.genes, d.evolution, d.expression, d.knock_out, d.function, d.sequence, d.localization, d.deposition, d.structure, d.interactions, d.disease, d.caveats "
    "FROM histone h LEFT JOIN histone_description d ON h.description = d.id"
)
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
histone_df[histone_df["id"] == "H3.3_(Animals)"]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,summary,taxonomy,genes,evolution,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
147,H3.3_(Animals),variant,Metazoa,33208,70.0,H3.3,H3.3_(Animals) is a class of replication-inde...,Animals (Metazoa),In human H3.3 is encoded by H3-3A and H3-3B ge...,,H3.3 mRNAs are polyadenilated. Expressed indep...,In mice complete depletion of H3.3 leads to de...,H3.3 has been implicated in a variety of biolo...,,,In humans and other animals specialized chaper...,,,Mutations in histone genes can affect sites of...,


## Edit histone variant H3.3_(Plants)

In [95]:
query = (
    "SELECT h.*, d.summary, d.taxonomy, d.genes, d.evolution, d.expression, d.knock_out, d.function, d.sequence, d.localization, d.deposition, d.structure, d.interactions, d.disease, d.caveats "
    "FROM histone h LEFT JOIN histone_description d ON h.description = d.id"
)
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
histone_df[histone_df["id"] == "H3.3_(Plants)"]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,summary,taxonomy,genes,evolution,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
151,H3.3_(Plants),variant,Viridiplantae,33090,71.0,H3.3,H3.3_(Plants) is a class of replication-indep...,Green plants (Viridiplantae),"In contrast to intronless cH3s, H3.3 in plants...",,"In Arabidopsis, H3.3 genes do not show replica...",,,Characteristic changes between cH3 and H3.3 in...,,"Similar to animals, plants have a specialized ...",,,,


In [96]:
knock_out_text = "In Arabidopsis, removal of three H3.3 genes (HTR4, HTR5,and HTR8) causes defects in male gametogenesis and results in embryonic lethality [wollmann_histone_2017]."
query = f"UPDATE histone_description SET knock_out='{knock_out_text}' WHERE id=71"
cursor.execute(query)
# Make sure data is committed to the database
conn.commit()

### Checking added data

In [97]:
query = (
    "SELECT h.*, d.summary, d.taxonomy, d.genes, d.evolution, d.expression, d.knock_out, d.function, d.sequence, d.localization, d.deposition, d.structure, d.interactions, d.disease, d.caveats "
    "FROM histone h LEFT JOIN histone_description d ON h.description = d.id"
)
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
histone_df[histone_df["id"] == "H3.3_(Plants)"]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,summary,taxonomy,genes,evolution,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
151,H3.3_(Plants),variant,Viridiplantae,33090,71.0,H3.3,H3.3_(Plants) is a class of replication-indep...,Green plants (Viridiplantae),"In contrast to intronless cH3s, H3.3 in plants...",,"In Arabidopsis, H3.3 genes do not show replica...","In Arabidopsis, removal of three H3.3 genes (H...",,Characteristic changes between cH3 and H3.3 in...,,"Similar to animals, plants have a specialized ...",,,,


## Edit histone variant H3.4_(Mammalia)

In [98]:
query = (
    "SELECT h.*, d.summary, d.taxonomy, d.genes, d.evolution, d.expression, d.knock_out, d.function, d.sequence, d.localization, d.deposition, d.structure, d.interactions, d.disease, d.caveats "
    "FROM histone h LEFT JOIN histone_description d ON h.description = d.id"
)
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
histone_df[histone_df["id"] == "H3.4_(Mammalia)"]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,summary,taxonomy,genes,evolution,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
153,H3.4_(Mammalia),variant_group,Mammalia,40674,32.0,H3,H3.4_(Mammalia) is a mammal-specific H3 histo...,Mammals,In human is encoded by H3-4 gene (formerly HIS...,,,,,,,,PDB structure of human H3.4 containing nucleos...,,The single‑nucleotide polymorphism c190C>T (Ar...,


In [99]:
knock_out_text = "Knockout mice for H3.4 were first generated in 2017; both male and female H3t null mice were viable and healthy, but the male mice were sterile. H3.4 deficiency leads to azoospermia because of the loss of haploid germ cells[ueda_testis-specific_2017]."
query = f"UPDATE histone_description SET knock_out='{knock_out_text}' WHERE id=32"
cursor.execute(query)
# Make sure data is committed to the database
conn.commit()

### Checking added data

In [100]:
query = (
    "SELECT h.*, d.summary, d.taxonomy, d.genes, d.evolution, d.expression, d.knock_out, d.function, d.sequence, d.localization, d.deposition, d.structure, d.interactions, d.disease, d.caveats "
    "FROM histone h LEFT JOIN histone_description d ON h.description = d.id"
)
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
histone_df[histone_df["id"] == "H3.4_(Mammalia)"]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,summary,taxonomy,genes,evolution,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
153,H3.4_(Mammalia),variant_group,Mammalia,40674,32.0,H3,H3.4_(Mammalia) is a mammal-specific H3 histo...,Mammals,In human is encoded by H3-4 gene (formerly HIS...,,,Knockout mice for H3.4 were first generated in...,,,,,PDB structure of human H3.4 containing nucleos...,,The single‑nucleotide polymorphism c190C>T (Ar...,


# For csv

In [8]:
query = "SELECT * FROM sequence"
cursor.execute(query)
sequence_df = pd.DataFrame(
    cursor.fetchall(), columns=[i[0] for i in cursor.description]
).fillna("")
pattern = r"__\?\?\?"  # Шаблон для поиска '__???'
sequence_df[sequence_df["variant"].str.contains(pattern, regex=True)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration


In [10]:
sequence_df[
    sequence_df["variant_under_consideration"].str.contains(pattern, regex=True)
].shape

(53, 13)

In [14]:
sequence_df = sequence_df[sequence_df["variant_under_consideration"] != ""][
    ["accession", "variant", "variant_under_consideration"]
]

## Remove histone sequences classified as clustered H2B(?), H3.6(?), H3.7(?), H3.8(?), clustered H4(?) (these histones are putative)

In [19]:
sequence_df[sequence_df["variant_under_consideration"] == "cH2B(?)_(Homo_sapiens)__???"]

Unnamed: 0,accession,variant,variant_under_consideration
245,NP_059141.1,,cH2B(?)_(Homo_sapiens)__???


In [20]:
query = "DELETE FROM sequence WHERE variant_under_consideration='cH2B(?)_(Homo_sapiens)__???'"
cursor.execute(query)
# Make sure data is committed to the database
conn.commit()

In [22]:
query = "SELECT * FROM sequence"
cursor.execute(query)
sequence_df = pd.DataFrame(
    cursor.fetchall(), columns=[i[0] for i in cursor.description]
).fillna("")
sequence_df = sequence_df[sequence_df["variant_under_consideration"] != ""][
    ["accession", "variant", "variant_under_consideration"]
]
sequence_df[sequence_df["variant_under_consideration"] == "cH2B(?)_(Homo_sapiens)__???"]

Unnamed: 0,accession,variant,variant_under_consideration


In [23]:
sequence_df[sequence_df["variant_under_consideration"] == "H3.7(?)_(Homo_sapiens)__???"]

Unnamed: 0,accession,variant,variant_under_consideration
167,NP_001342338.1,,H3.7(?)_(Homo_sapiens)__???


In [25]:
query = "SELECT * FROM sequence_has_publication"
cursor.execute(query)
sp_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
sp_df[sp_df["sequence_accession"] == "NP_001342338.1"]

Unnamed: 0,sequence_accession,publication_id
9,NP_001342338.1,12408966


In [27]:
query = (
    "DELETE FROM sequence_has_publication WHERE sequence_accession='NP_001342338.1'; "
    "DELETE FROM sequence WHERE variant_under_consideration='H3.7(?)_(Homo_sapiens)__???'"
)
cursor.execute(query, multi=True)
# Make sure data is committed to the database
conn.commit()

In [6]:
query = "SELECT * FROM sequence"
cursor.execute(query)
sequence_df = pd.DataFrame(
    cursor.fetchall(), columns=[i[0] for i in cursor.description]
).fillna("")
sequence_df = sequence_df[sequence_df["variant_under_consideration"] != ""][
    ["accession", "variant", "variant_under_consideration"]
]
sequence_df[sequence_df["variant_under_consideration"] == "H3.7(?)_(Homo_sapiens)__???"]

Unnamed: 0,accession,variant,variant_under_consideration


In [7]:
query = "SELECT * FROM sequence_has_publication"
cursor.execute(query)
sp_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
sp_df[sp_df["sequence_accession"] == "NP_001342338.1"]

Unnamed: 0,sequence_accession,publication_id


In [8]:
sequence_df[sequence_df["variant_under_consideration"] == "cH4(?)_(Homo_sapiens)__???"]

Unnamed: 0,accession,variant,variant_under_consideration
214,NP_003538.1,,cH4(?)_(Homo_sapiens)__???


In [9]:
query = "SELECT * FROM sequence_has_publication"
cursor.execute(query)
sp_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
sp_df[sp_df["sequence_accession"] == "NP_003538.1"]

Unnamed: 0,sequence_accession,publication_id
48,NP_003538.1,12408966


In [19]:
query = "DELETE FROM sequence_has_publication WHERE sequence_accession='NP_003538.1'"
cursor.execute(query)
# Make sure data is committed to the database
conn.commit()

DELETE FROM sequence_has_publication WHERE sequence_accession='NP_003538.1'


In [26]:
query = "DELETE FROM sequence WHERE variant_under_consideration='cH4(?)_(Homo_sapiens)__???'"
cursor.execute(query)
# Make sure data is committed to the database
conn.commit()

In [27]:
query = "SELECT * FROM sequence"
cursor.execute(query)
sequence_df = pd.DataFrame(
    cursor.fetchall(), columns=[i[0] for i in cursor.description]
).fillna("")
sequence_df = sequence_df[sequence_df["variant_under_consideration"] != ""][
    ["accession", "variant", "variant_under_consideration"]
]
sequence_df[sequence_df["variant_under_consideration"] == "cH4(?)_(Homo_sapiens)__???"]

Unnamed: 0,accession,variant,variant_under_consideration


In [25]:
query = "SELECT * FROM sequence_has_publication"
cursor.execute(query)
sp_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
sp_df[sp_df["sequence_accession"] == "NP_003538.1"]

Unnamed: 0,sequence_accession,publication_id


## Update other sequences

In [29]:
sequence_df = sequence_df.sort_values(by=["variant_under_consideration"])
sequence_df

Unnamed: 0,accession,variant,variant_under_consideration
215,NP_005309.1,,H1.0_(Homo_sapiens)__???
223,NP_006017.1,,H1.10_(Homo_sapiens)__???
222,NP_005316.1,,H1.1_(Homo_sapiens)__???
216,NP_005310.1,,H1.2_(Homo_sapiens)__???
217,NP_005311.1,,H1.3_(Homo_sapiens)__???
218,NP_005312.1,,H1.4_(Homo_sapiens)__???
219,NP_005313.1,,H1.5_(Homo_sapiens)__???
220,NP_005314.2,,H1.6_(Homo_sapiens)__???
320,NP_861453.1,,H1.7_(Homo_sapiens)__???
306,NP_722575.1,,H1.8_(Homo_sapiens)__???


In [31]:
data_histone = {
    "id": "H1.0_(Homo_sapiens)",
    "level": "variant",
    "taxonomic_span": "Homo sapiens",
    "taxonomic_span_id": "9606",
    "description": None,
    "parent": "H1.0",
}
cursor.execute(add_histone, data_histone)

query = f"UPDATE sequence SET variant='H1.0_(Homo_sapiens)', variant_under_consideration=null WHERE accession='NP_005309.1'"
cursor.execute(query)
# Make sure data is committed to the database
conn.commit()

In [32]:
query = "SELECT * FROM sequence WHERE accession='NP_005309.1'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description]).fillna("")

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
0,NP_005309.1,H1.0_(Homo_sapiens),,3005,H1-0,9606,Homo sapiens,Chordata,Mammalia,Mammalia,,MTENSTSAPAAKPKRAKASKKSTDHPKYSDMIVAAIQAEKNRAGSS...,


In [37]:
query = "SELECT * FROM histone WHERE id='H1.0_(Homo_sapiens)'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description]).fillna("")

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
0,H1.0_(Homo_sapiens),variant,Homo sapiens,9606,,H1.0


In [40]:
query = "SELECT * FROM sequence"
cursor.execute(query)
sequence_df = pd.DataFrame(
    cursor.fetchall(), columns=[i[0] for i in cursor.description]
).fillna("")
sequence_df = sequence_df[sequence_df["variant_under_consideration"] != ""][
    ["accession", "variant", "variant_under_consideration"]
].sort_values(by=["variant_under_consideration"])
sequence_df.head()

Unnamed: 0,accession,variant,variant_under_consideration
223,NP_006017.1,,H1.10_(Homo_sapiens)__???
222,NP_005316.1,,H1.1_(Homo_sapiens)__???
216,NP_005310.1,,H1.2_(Homo_sapiens)__???
217,NP_005311.1,,H1.3_(Homo_sapiens)__???
218,NP_005312.1,,H1.4_(Homo_sapiens)__???


In [41]:
data_histone = {
    "id": "H1.10_(Homo_sapiens)",
    "level": "variant",
    "taxonomic_span": "Homo sapiens",
    "taxonomic_span_id": "9606",
    "description": None,
    "parent": "H1.10",
}
cursor.execute(add_histone, data_histone)

query = f"UPDATE sequence SET variant='H1.10_(Homo_sapiens)', variant_under_consideration=null WHERE accession='NP_006017.1'"
cursor.execute(query)
# Make sure data is committed to the database
conn.commit()

In [42]:
query = "SELECT * FROM sequence WHERE accession='NP_006017.1'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description]).fillna("")

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
0,NP_006017.1,H1.10_(Homo_sapiens),,8971,H1-10,9606,Homo sapiens,Chordata,Mammalia,Mammalia,,MSVELEEALPVTTAEGMAKKVTKAGGSAALSPSKKRKNSKKKNQPG...,


In [43]:
query = "SELECT * FROM histone WHERE id='H1.10_(Homo_sapiens)'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description]).fillna("")

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
0,H1.10_(Homo_sapiens),variant,Homo sapiens,9606,,H1.10


In [44]:
query = "SELECT * FROM sequence"
cursor.execute(query)
sequence_df = pd.DataFrame(
    cursor.fetchall(), columns=[i[0] for i in cursor.description]
).fillna("")
sequence_df = sequence_df[sequence_df["variant_under_consideration"] != ""][
    ["accession", "variant", "variant_under_consideration"]
].sort_values(by=["variant_under_consideration"])
sequence_df.head()

Unnamed: 0,accession,variant,variant_under_consideration
222,NP_005316.1,,H1.1_(Homo_sapiens)__???
216,NP_005310.1,,H1.2_(Homo_sapiens)__???
217,NP_005311.1,,H1.3_(Homo_sapiens)__???
218,NP_005312.1,,H1.4_(Homo_sapiens)__???
219,NP_005313.1,,H1.5_(Homo_sapiens)__???


In [45]:
data_histone = {
    "id": "H1.1_(Homo_sapiens)",
    "level": "variant",
    "taxonomic_span": "Homo sapiens",
    "taxonomic_span_id": "9606",
    "description": None,
    "parent": "H1.1",
}
cursor.execute(add_histone, data_histone)

query = f"UPDATE sequence SET variant='H1.1_(Homo_sapiens)', variant_under_consideration=null WHERE accession='NP_005316.1'"
cursor.execute(query)
# Make sure data is committed to the database
conn.commit()

In [46]:
query = "SELECT * FROM sequence WHERE accession='NP_005316.1'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description]).fillna("")

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
0,NP_005316.1,H1.1_(Homo_sapiens),,3024,H1-1,9606,Homo sapiens,Chordata,Mammalia,Mammalia,,MSETVPPAPAASAAPEKPLAGKKAKKPAKAAAASKKKPAGPSVSEL...,


In [47]:
query = "SELECT * FROM histone WHERE id='H1.1_(Homo_sapiens)'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description]).fillna("")

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
0,H1.1_(Homo_sapiens),variant,Homo sapiens,9606,,H1.1


In [48]:
query = "SELECT * FROM sequence"
cursor.execute(query)
sequence_df = pd.DataFrame(
    cursor.fetchall(), columns=[i[0] for i in cursor.description]
).fillna("")
sequence_df = sequence_df[sequence_df["variant_under_consideration"] != ""][
    ["accession", "variant", "variant_under_consideration"]
].sort_values(by=["variant_under_consideration"])
sequence_df.head()

Unnamed: 0,accession,variant,variant_under_consideration
216,NP_005310.1,,H1.2_(Homo_sapiens)__???
217,NP_005311.1,,H1.3_(Homo_sapiens)__???
218,NP_005312.1,,H1.4_(Homo_sapiens)__???
219,NP_005313.1,,H1.5_(Homo_sapiens)__???
220,NP_005314.2,,H1.6_(Homo_sapiens)__???


In [49]:
data_histone = {
    "id": "H1.2_(Homo_sapiens)",
    "level": "variant",
    "taxonomic_span": "Homo sapiens",
    "taxonomic_span_id": "9606",
    "description": None,
    "parent": "H1.2",
}
cursor.execute(add_histone, data_histone)

query = f"UPDATE sequence SET variant='H1.2_(Homo_sapiens)', variant_under_consideration=null WHERE accession='NP_005310.1'"
cursor.execute(query)
# Make sure data is committed to the database
conn.commit()

In [50]:
query = "SELECT * FROM sequence WHERE accession='NP_005310.1'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description]).fillna("")

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
0,NP_005310.1,H1.2_(Homo_sapiens),,3006,H1-2,9606,Homo sapiens,Chordata,Mammalia,Mammalia,,MSETAPAAPAAAPPAEKAPVKKKAAKKAGGTPRKASGPPVSELITK...,


In [51]:
query = "SELECT * FROM histone WHERE id='H1.2_(Homo_sapiens)'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description]).fillna("")

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
0,H1.2_(Homo_sapiens),variant,Homo sapiens,9606,,H1.2


In [52]:
query = "SELECT * FROM sequence"
cursor.execute(query)
sequence_df = pd.DataFrame(
    cursor.fetchall(), columns=[i[0] for i in cursor.description]
).fillna("")
sequence_df = sequence_df[sequence_df["variant_under_consideration"] != ""][
    ["accession", "variant", "variant_under_consideration"]
].sort_values(by=["variant_under_consideration"])
sequence_df.head()

Unnamed: 0,accession,variant,variant_under_consideration
217,NP_005311.1,,H1.3_(Homo_sapiens)__???
218,NP_005312.1,,H1.4_(Homo_sapiens)__???
219,NP_005313.1,,H1.5_(Homo_sapiens)__???
220,NP_005314.2,,H1.6_(Homo_sapiens)__???
320,NP_861453.1,,H1.7_(Homo_sapiens)__???


In [53]:
data_histone = {
    "id": "H1.3_(Homo_sapiens)",
    "level": "variant",
    "taxonomic_span": "Homo sapiens",
    "taxonomic_span_id": "9606",
    "description": None,
    "parent": "H1.3",
}
cursor.execute(add_histone, data_histone)

query = f"UPDATE sequence SET variant='H1.3_(Homo_sapiens)', variant_under_consideration=null WHERE accession='NP_005311.1'"
cursor.execute(query)
# Make sure data is committed to the database
conn.commit()

In [54]:
query = "SELECT * FROM sequence WHERE accession='NP_005311.1'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description]).fillna("")

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
0,NP_005311.1,H1.3_(Homo_sapiens),,3007,H1-3,9606,Homo sapiens,Chordata,Mammalia,Mammalia,,MSETAPLAPTIPAPAEKTPVKKKAKKAGATAGKRKASGPPVSELIT...,


In [55]:
query = "SELECT * FROM histone WHERE id='H1.3_(Homo_sapiens)'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description]).fillna("")

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
0,H1.3_(Homo_sapiens),variant,Homo sapiens,9606,,H1.3


In [56]:
query = "SELECT * FROM sequence"
cursor.execute(query)
sequence_df = pd.DataFrame(
    cursor.fetchall(), columns=[i[0] for i in cursor.description]
).fillna("")
sequence_df = sequence_df[sequence_df["variant_under_consideration"] != ""][
    ["accession", "variant", "variant_under_consideration"]
].sort_values(by=["variant_under_consideration"])
sequence_df.head()

Unnamed: 0,accession,variant,variant_under_consideration
218,NP_005312.1,,H1.4_(Homo_sapiens)__???
219,NP_005313.1,,H1.5_(Homo_sapiens)__???
220,NP_005314.2,,H1.6_(Homo_sapiens)__???
320,NP_861453.1,,H1.7_(Homo_sapiens)__???
306,NP_722575.1,,H1.8_(Homo_sapiens)__???


In [57]:
data_histone = {
    "id": "H1.4_(Homo_sapiens)",
    "level": "variant",
    "taxonomic_span": "Homo sapiens",
    "taxonomic_span_id": "9606",
    "description": None,
    "parent": "H1.4",
}
cursor.execute(add_histone, data_histone)

query = f"UPDATE sequence SET variant='H1.4_(Homo_sapiens)', variant_under_consideration=null WHERE accession='NP_005312.1'"
cursor.execute(query)
# Make sure data is committed to the database
conn.commit()

In [58]:
query = "SELECT * FROM sequence WHERE accession='NP_005312.1'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description]).fillna("")

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
0,NP_005312.1,H1.4_(Homo_sapiens),,3008,H1-4,9606,Homo sapiens,Chordata,Mammalia,Mammalia,,MSETAPAAPAAPAPAEKTPVKKKARKSAGAAKRKASGPPVSELITK...,


In [61]:
query = "SELECT * FROM histone WHERE id='H1.4_(Homo_sapiens)'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description]).fillna("")

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
0,H1.4_(Homo_sapiens),variant,Homo sapiens,9606,,H1.4


In [62]:
query = "SELECT * FROM sequence"
cursor.execute(query)
sequence_df = pd.DataFrame(
    cursor.fetchall(), columns=[i[0] for i in cursor.description]
).fillna("")
sequence_df = sequence_df[sequence_df["variant_under_consideration"] != ""][
    ["accession", "variant", "variant_under_consideration"]
].sort_values(by=["variant_under_consideration"])
sequence_df.head()

Unnamed: 0,accession,variant,variant_under_consideration
219,NP_005313.1,,H1.5_(Homo_sapiens)__???
220,NP_005314.2,,H1.6_(Homo_sapiens)__???
320,NP_861453.1,,H1.7_(Homo_sapiens)__???
306,NP_722575.1,,H1.8_(Homo_sapiens)__???
164,NP_001295191.1,,H1.8_(Homo_sapiens)__???


In [63]:
data_histone = {
    "id": "H1.5_(Homo_sapiens)",
    "level": "variant",
    "taxonomic_span": "Homo sapiens",
    "taxonomic_span_id": "9606",
    "description": None,
    "parent": "H1.5",
}
cursor.execute(add_histone, data_histone)

query = f"UPDATE sequence SET variant='H1.5_(Homo_sapiens)', variant_under_consideration=null WHERE accession='NP_005313.1'"
cursor.execute(query)
# Make sure data is committed to the database
conn.commit()

In [64]:
query = "SELECT * FROM sequence WHERE accession='NP_005313.1'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description]).fillna("")

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
0,NP_005313.1,H1.5_(Homo_sapiens),,3009,H1-5,9606,Homo sapiens,Chordata,Mammalia,Mammalia,,MSETAPAETATPAPVEKSPAKKKATKKAAGAGAAKRKATGPPVSEL...,


In [65]:
query = "SELECT * FROM histone WHERE id='H1.5_(Homo_sapiens)'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description]).fillna("")

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
0,H1.5_(Homo_sapiens),variant,Homo sapiens,9606,,H1.5


In [66]:
query = "SELECT * FROM sequence"
cursor.execute(query)
sequence_df = pd.DataFrame(
    cursor.fetchall(), columns=[i[0] for i in cursor.description]
).fillna("")
sequence_df = sequence_df[sequence_df["variant_under_consideration"] != ""][
    ["accession", "variant", "variant_under_consideration"]
].sort_values(by=["variant_under_consideration"])
sequence_df.head()

Unnamed: 0,accession,variant,variant_under_consideration
220,NP_005314.2,,H1.6_(Homo_sapiens)__???
320,NP_861453.1,,H1.7_(Homo_sapiens)__???
306,NP_722575.1,,H1.8_(Homo_sapiens)__???
164,NP_001295191.1,,H1.8_(Homo_sapiens)__???
175,NP_003484.1,,H3.4_(Homo_sapiens)__???


In [67]:
data_histone = {
    "id": "H1.6_(Homo_sapiens)",
    "level": "variant",
    "taxonomic_span": "Homo sapiens",
    "taxonomic_span_id": "9606",
    "description": None,
    "parent": "TS_H1.6",
}
cursor.execute(add_histone, data_histone)

query = f"UPDATE sequence SET variant='H1.6_(Homo_sapiens)', variant_under_consideration=null WHERE accession='NP_005314.2'"
cursor.execute(query)
# Make sure data is committed to the database
conn.commit()

In [68]:
query = "SELECT * FROM sequence WHERE accession='NP_005314.2'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description]).fillna("")

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
0,NP_005314.2,H1.6_(Homo_sapiens),,3010,H1-6,9606,Homo sapiens,Chordata,Mammalia,Mammalia,,MSETVPAASASAGVAAMEKLPTKKRGRKPAGLISASRKVPNLSVSK...,


In [69]:
query = "SELECT * FROM histone WHERE id='H1.6_(Homo_sapiens)'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description]).fillna("")

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
0,H1.6_(Homo_sapiens),variant,Homo sapiens,9606,,TS_H1.6


In [70]:
query = "SELECT * FROM sequence"
cursor.execute(query)
sequence_df = pd.DataFrame(
    cursor.fetchall(), columns=[i[0] for i in cursor.description]
).fillna("")
sequence_df = sequence_df[sequence_df["variant_under_consideration"] != ""][
    ["accession", "variant", "variant_under_consideration"]
].sort_values(by=["variant_under_consideration"])
sequence_df.head()

Unnamed: 0,accession,variant,variant_under_consideration
320,NP_861453.1,,H1.7_(Homo_sapiens)__???
306,NP_722575.1,,H1.8_(Homo_sapiens)__???
164,NP_001295191.1,,H1.8_(Homo_sapiens)__???
175,NP_003484.1,,H3.4_(Homo_sapiens)__???
99,NP_001013721.2,,H3.5_(Homo_sapiens)__???


In [71]:
data_histone = {
    "id": "H1.7_(Homo_sapiens)",
    "level": "variant",
    "taxonomic_span": "Homo sapiens",
    "taxonomic_span_id": "9606",
    "description": None,
    "parent": "TS_H1.7",
}
cursor.execute(add_histone, data_histone)

query = f"UPDATE sequence SET variant='H1.7_(Homo_sapiens)', variant_under_consideration=null WHERE accession='NP_861453.1'"
cursor.execute(query)
# Make sure data is committed to the database
conn.commit()

In [72]:
query = "SELECT * FROM sequence WHERE accession='NP_861453.1'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description]).fillna("")

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
0,NP_861453.1,H1.7_(Homo_sapiens),,341567,H1-7,9606,Homo sapiens,Chordata,Mammalia,Mammalia,,MEQALTGEAQSRWPRRGGSGAMAEAPGPSGESRGHSATQLPAEKTV...,


In [73]:
query = "SELECT * FROM histone WHERE id='H1.7_(Homo_sapiens)'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description]).fillna("")

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
0,H1.7_(Homo_sapiens),variant,Homo sapiens,9606,,TS_H1.7


In [74]:
query = "SELECT * FROM sequence"
cursor.execute(query)
sequence_df = pd.DataFrame(
    cursor.fetchall(), columns=[i[0] for i in cursor.description]
).fillna("")
sequence_df = sequence_df[sequence_df["variant_under_consideration"] != ""][
    ["accession", "variant", "variant_under_consideration"]
].sort_values(by=["variant_under_consideration"])
sequence_df.head()

Unnamed: 0,accession,variant,variant_under_consideration
306,NP_722575.1,,H1.8_(Homo_sapiens)__???
164,NP_001295191.1,,H1.8_(Homo_sapiens)__???
175,NP_003484.1,,H3.4_(Homo_sapiens)__???
99,NP_001013721.2,,H3.5_(Homo_sapiens)__???
468,XP_003954426.1,,H3.5__???


In [75]:
data_histone = {
    "id": "H1.8_(Homo_sapiens)",
    "level": "variant",
    "taxonomic_span": "Homo sapiens",
    "taxonomic_span_id": "9606",
    "description": None,
    "parent": "OO_H1.8",
}
cursor.execute(add_histone, data_histone)

query = f"UPDATE sequence SET variant='H1.8_(Homo_sapiens)', variant_under_consideration=null WHERE accession='NP_722575.1'"
cursor.execute(query)
query = f"UPDATE sequence SET variant='H1.8_(Homo_sapiens)', variant_under_consideration=null WHERE accession='NP_001295191.1'"
cursor.execute(query)
# Make sure data is committed to the database
conn.commit()

In [76]:
query = "SELECT * FROM sequence WHERE accession='NP_722575.1'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description]).fillna("")

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
0,NP_722575.1,H1.8_(Homo_sapiens),,132243,H1-8,9606,Homo sapiens,Chordata,Mammalia,Mammalia,,MAPGSVTSDISPSSTSTAGSSRSPESEKPGPSHGGVPPGGPSHSSL...,


In [77]:
query = "SELECT * FROM sequence WHERE accession='NP_001295191.1'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description]).fillna("")

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
0,NP_001295191.1,H1.8_(Homo_sapiens),,132243,H1-8,9606,Homo sapiens,Chordata,Mammalia,Mammalia,,MAPATAPRRAGEAKGKGPKKPSEAKEDPPNVGKVKKAAKRPAKVQK...,


In [78]:
query = "SELECT * FROM histone WHERE id='H1.8_(Homo_sapiens)'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description]).fillna("")

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
0,H1.8_(Homo_sapiens),variant,Homo sapiens,9606,,OO_H1.8


In [81]:
query = "SELECT * FROM sequence"
cursor.execute(query)
sequence_df = pd.DataFrame(
    cursor.fetchall(), columns=[i[0] for i in cursor.description]
).fillna("")
sequence_df = sequence_df[sequence_df["variant_under_consideration"] != ""][
    ["accession", "variant", "variant_under_consideration"]
].sort_values(by=["variant_under_consideration"])
sequence_df.head()

Unnamed: 0,accession,variant,variant_under_consideration
175,NP_003484.1,,H3.4_(Homo_sapiens)__???
99,NP_001013721.2,,H3.5_(Homo_sapiens)__???
468,XP_003954426.1,,H3.5__???
166,NP_001342187.1,,H3.Y.1_(Homo_sapiens)__???
168,NP_001358848.1,,H3.Y.2_(Homo_sapiens)__???


In [82]:
data_histone = {
    "id": "H3.4_(Homo_sapiens)",
    "level": "variant",
    "taxonomic_span": "Homo sapiens",
    "taxonomic_span_id": "9606",
    "description": None,
    "parent": "H3.4_(Mammalia)",
}
cursor.execute(add_histone, data_histone)

query = f"UPDATE sequence SET variant='H3.4_(Homo_sapiens)', variant_under_consideration=null WHERE accession='NP_003484.1'"
cursor.execute(query)
# Make sure data is committed to the database
conn.commit()

In [83]:
query = "SELECT * FROM sequence WHERE accession='NP_003484.1'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description]).fillna("")

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
0,NP_003484.1,H3.4_(Homo_sapiens),,8290,H3-4,9606,Homo sapiens,Chordata,Mammalia,Mammalia,,MARTKQTARKSTGGKAPRKQLATKVARKSAPATGGVKKPHRYRPGT...,


In [84]:
query = "SELECT * FROM histone WHERE id='H3.4_(Homo_sapiens)'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description]).fillna("")

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
0,H3.4_(Homo_sapiens),variant,Homo sapiens,9606,,H3.4_(Mammalia)


In [85]:
query = "SELECT * FROM sequence"
cursor.execute(query)
sequence_df = pd.DataFrame(
    cursor.fetchall(), columns=[i[0] for i in cursor.description]
).fillna("")
sequence_df = sequence_df[sequence_df["variant_under_consideration"] != ""][
    ["accession", "variant", "variant_under_consideration"]
].sort_values(by=["variant_under_consideration"])
sequence_df.head()

Unnamed: 0,accession,variant,variant_under_consideration
99,NP_001013721.2,,H3.5_(Homo_sapiens)__???
468,XP_003954426.1,,H3.5__???
166,NP_001342187.1,,H3.Y.1_(Homo_sapiens)__???
168,NP_001358848.1,,H3.Y.2_(Homo_sapiens)__???
91,HISTDB_H3_Y_0,,H3.Y__???


In [86]:
data_histone = {
    "id": "H3.5_(Homo_sapiens)",
    "level": "variant",
    "taxonomic_span": "Homo sapiens",
    "taxonomic_span_id": "9606",
    "description": None,
    "parent": "H3.5_(Primates_or_Hominids?)",
}
cursor.execute(add_histone, data_histone)

query = f"UPDATE sequence SET variant='H3.5_(Homo_sapiens)', variant_under_consideration=null WHERE accession='NP_001013721.2'"
cursor.execute(query)
# Make sure data is committed to the database
conn.commit()

In [87]:
query = "SELECT * FROM sequence WHERE accession='NP_001013721.2'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description]).fillna("")

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
0,NP_001013721.2,H3.5_(Homo_sapiens),,440093,H3-5,9606,Homo sapiens,Chordata,Mammalia,Mammalia,,MARTKQTARKSTGGKAPRKQLATKAARKSTPSTCGVKPHRYRPGTV...,


In [88]:
query = "SELECT * FROM histone WHERE id='H3.5_(Homo_sapiens)'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description]).fillna("")

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
0,H3.5_(Homo_sapiens),variant,Homo sapiens,9606,,H3.5_(Primates_or_Hominids?)


In [89]:
query = "SELECT * FROM sequence"
cursor.execute(query)
sequence_df = pd.DataFrame(
    cursor.fetchall(), columns=[i[0] for i in cursor.description]
).fillna("")
sequence_df = sequence_df[sequence_df["variant_under_consideration"] != ""][
    ["accession", "variant", "variant_under_consideration"]
].sort_values(by=["variant_under_consideration"])
sequence_df.head()

Unnamed: 0,accession,variant,variant_under_consideration
468,XP_003954426.1,,H3.5__???
166,NP_001342187.1,,H3.Y.1_(Homo_sapiens)__???
168,NP_001358848.1,,H3.Y.2_(Homo_sapiens)__???
91,HISTDB_H3_Y_0,,H3.Y__???
92,HISTDB_H3_Y_1,,H3.Y__???


In [90]:
query = f"UPDATE sequence SET variant='H3.5_(Primates_or_Hominids?)', variant_under_consideration=null WHERE accession='XP_003954426.1'"
cursor.execute(query)
# Make sure data is committed to the database
conn.commit()

In [91]:
query = "SELECT * FROM sequence WHERE accession='XP_003954426.1'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description]).fillna("")

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
0,XP_003954426.1,H3.5_(Primates_or_Hominids?),410046862,,,9598,Pan troglodytes,Chordata,Mammalia,,,MARTKQTARKSTGGKAPRKQLATKAARKSTPSTXGVKKPHRYRPGT...,


In [93]:
query = "SELECT * FROM sequence"
cursor.execute(query)
sequence_df = pd.DataFrame(
    cursor.fetchall(), columns=[i[0] for i in cursor.description]
).fillna("")
sequence_df = sequence_df[sequence_df["variant_under_consideration"] != ""][
    ["accession", "variant", "variant_under_consideration"]
].sort_values(by=["variant_under_consideration"])
sequence_df.head()

Unnamed: 0,accession,variant,variant_under_consideration
166,NP_001342187.1,,H3.Y.1_(Homo_sapiens)__???
168,NP_001358848.1,,H3.Y.2_(Homo_sapiens)__???
91,HISTDB_H3_Y_0,,H3.Y__???
92,HISTDB_H3_Y_1,,H3.Y__???
93,HISTDB_H3_Y_2,,H3.Y__???


In [94]:
data_histone = {
    "id": "H3.Y.1_(Homo_sapiens)",
    "level": "variant",
    "taxonomic_span": "Homo sapiens",
    "taxonomic_span_id": "9606",
    "description": None,
    "parent": "H3.Y_(Homo_sapiens)",
}
cursor.execute(add_histone, data_histone)

query = f"UPDATE sequence SET variant='H3.Y.1_(Homo_sapiens)', variant_under_consideration=null WHERE accession='NP_001342187.1'"
cursor.execute(query)
# Make sure data is committed to the database
conn.commit()

In [95]:
query = "SELECT * FROM sequence WHERE accession='NP_001342187.1'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description]).fillna("")

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
0,NP_001342187.1,H3.Y.1_(Homo_sapiens),,391769,H3Y1,9606,Homo sapiens,Chordata,Mammalia,Mammalia,,MARTKQTARKATAWQAPRKPLATKAAGKRAPPTGGIKKPHRYKPGT...,


In [96]:
query = "SELECT * FROM histone WHERE id='H3.Y.1_(Homo_sapiens)'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description]).fillna("")

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
0,H3.Y.1_(Homo_sapiens),variant,Homo sapiens,9606,,H3.Y_(Homo_sapiens)


In [97]:
query = "SELECT * FROM sequence"
cursor.execute(query)
sequence_df = pd.DataFrame(
    cursor.fetchall(), columns=[i[0] for i in cursor.description]
).fillna("")
sequence_df = sequence_df[sequence_df["variant_under_consideration"] != ""][
    ["accession", "variant", "variant_under_consideration"]
].sort_values(by=["variant_under_consideration"])
sequence_df.head()

Unnamed: 0,accession,variant,variant_under_consideration
168,NP_001358848.1,,H3.Y.2_(Homo_sapiens)__???
91,HISTDB_H3_Y_0,,H3.Y__???
92,HISTDB_H3_Y_1,,H3.Y__???
93,HISTDB_H3_Y_2,,H3.Y__???
94,HISTDB_H3_Y_3,,H3.Y__???


In [98]:
data_histone = {
    "id": "H3.Y.2_(Homo_sapiens)",
    "level": "variant",
    "taxonomic_span": "Homo sapiens",
    "taxonomic_span_id": "9606",
    "description": None,
    "parent": "H3.Y_(Homo_sapiens)",
}
cursor.execute(add_histone, data_histone)

query = f"UPDATE sequence SET variant='H3.Y.2_(Homo_sapiens)', variant_under_consideration=null WHERE accession='NP_001358848.1'"
cursor.execute(query)
# Make sure data is committed to the database
conn.commit()

In [99]:
query = "SELECT * FROM sequence WHERE accession='NP_001358848.1'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description]).fillna("")

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
0,NP_001358848.1,H3.Y.2_(Homo_sapiens),,340096,H3Y2,9606,Homo sapiens,Chordata,Mammalia,Mammalia,,MARTKQTARKATAWQAPRKPLATKAARKRASPTGGIKKPHRYKPGT...,


In [100]:
query = "SELECT * FROM histone WHERE id='H3.Y.2_(Homo_sapiens)'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description]).fillna("")

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
0,H3.Y.2_(Homo_sapiens),variant,Homo sapiens,9606,,H3.Y_(Homo_sapiens)


In [101]:
query = "SELECT * FROM sequence"
cursor.execute(query)
sequence_df = pd.DataFrame(
    cursor.fetchall(), columns=[i[0] for i in cursor.description]
).fillna("")
sequence_df = sequence_df[sequence_df["variant_under_consideration"] != ""][
    ["accession", "variant", "variant_under_consideration"]
].sort_values(by=["variant_under_consideration"])
sequence_df.head()

Unnamed: 0,accession,variant,variant_under_consideration
91,HISTDB_H3_Y_0,,H3.Y__???
92,HISTDB_H3_Y_1,,H3.Y__???
93,HISTDB_H3_Y_2,,H3.Y__???
94,HISTDB_H3_Y_3,,H3.Y__???
467,XP_003804825.1,,TS_H3.4__???


In [102]:
for i in range(4):
    query = f"UPDATE sequence SET variant='H3.Y_(Primates?)', variant_under_consideration=null WHERE accession='HISTDB_H3_Y_{i}'"
    cursor.execute(query)
# Make sure data is committed to the database
conn.commit()

In [105]:
query = "SELECT * FROM sequence WHERE accession LIKE 'HISTDB_H3_Y_%'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description]).fillna("")

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
0,HISTDB_H3_Y_0,H3.Y_(Primates?),NOGI,,,9544,Macaca mulatta,Chordata,Mammalia,,,ARTKQTARKATNWQAPRKPLATKAAAKRAPPRGGIKKPHRYKPGTQ...,
1,HISTDB_H3_Y_1,H3.Y_(Primates?),NOGI,,,9544,Macaca mulatta,Chordata,Mammalia,,,ARTKQTARKATNWQAPRKPLATKAPGKRLPPRGGIKKPHRYRPGTQ...,
2,HISTDB_H3_Y_2,H3.Y_(Primates?),NOGI,,,9598,Pan troglodytes,Chordata,Mammalia,,,ARTKQTARKATAWQAPRKPLATKAAGKRAPPTGGIKKPHRYKPGTL...,
3,HISTDB_H3_Y_3,H3.Y_(Primates?),NOGI,,,9598,Pan troglodytes,Chordata,Mammalia,,,ARTKQTARKATAWQAPRKPLATKAARKRASPTGGIKKPHRYKPGTL...,


In [106]:
query = "SELECT * FROM sequence"
cursor.execute(query)
sequence_df = pd.DataFrame(
    cursor.fetchall(), columns=[i[0] for i in cursor.description]
).fillna("")
sequence_df = sequence_df[sequence_df["variant_under_consideration"] != ""][
    ["accession", "variant", "variant_under_consideration"]
].sort_values(by=["variant_under_consideration"])
sequence_df.head()

Unnamed: 0,accession,variant,variant_under_consideration
467,XP_003804825.1,,TS_H3.4__???
214,NP_003539.1,,cH4_(Homo_sapiens)__???
255,NP_068803.1,,cH4_(Homo_sapiens)__???
213,NP_003537.1,,cH4_(Homo_sapiens)__???
212,NP_003536.1,,cH4_(Homo_sapiens)__???


In [107]:
query = f"UPDATE sequence SET variant='H3.4_(Mammalia)', variant_under_consideration=null WHERE accession='XP_003804825.1'"
cursor.execute(query)
# Make sure data is committed to the database
conn.commit()

In [108]:
query = "SELECT * FROM sequence WHERE accession='XP_003804825.1'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description]).fillna("")

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
0,XP_003804825.1,H3.4_(Mammalia),397466137,,,9597,Pan paniscus,Chordata,Mammalia,,,MARTKQTARKSTGGKAPRKQLVTKVARKSAPATGGVKKPHRYRPGT...,


In [111]:
query = "SELECT * FROM sequence"
cursor.execute(query)
sequence_df = pd.DataFrame(
    cursor.fetchall(), columns=[i[0] for i in cursor.description]
).fillna("")
sequence_df = sequence_df[sequence_df["variant_under_consideration"] != ""][
    ["accession", "variant", "variant_under_consideration"]
].sort_values(by=["variant_under_consideration"])
sequence_df.head()

Unnamed: 0,accession,variant,variant_under_consideration
214,NP_003539.1,,cH4_(Homo_sapiens)__???
255,NP_068803.1,,cH4_(Homo_sapiens)__???
213,NP_003537.1,,cH4_(Homo_sapiens)__???
212,NP_003536.1,,cH4_(Homo_sapiens)__???
211,NP_003535.1,,cH4_(Homo_sapiens)__???


In [116]:
sequence_df[
    sequence_df["variant_under_consideration"] == "cH4_(Homo_sapiens)__???"
].shape

(14, 3)

In [113]:
histone_description_id = cursor.lastrowid
data_histone = {
    "id": "cH4_(Homo_sapiens)",
    "level": "variant",
    "taxonomic_span": "Homo sapiens",
    "taxonomic_span_id": "9606",
    "description": None,
    "parent": "cH4",
}
cursor.execute(add_histone, data_histone)
for acc in sequence_df[
    sequence_df["variant_under_consideration"] == "cH4_(Homo_sapiens)__???"
]["accession"]:
    query = f"UPDATE sequence SET variant='cH4_(Homo_sapiens)', variant_under_consideration=null WHERE accession='{acc}'"
    cursor.execute(query)
# Make sure data is committed to the database
conn.commit()

In [118]:
query = "SELECT * FROM sequence"
cursor.execute(query)
ch4df = pd.DataFrame(
    cursor.fetchall(), columns=[i[0] for i in cursor.description]
).fillna("")
ch4df[
    ch4df["accession"].isin(
        sequence_df[
            sequence_df["variant_under_consideration"] == "cH4_(Homo_sapiens)__???"
        ]["accession"]
    )
]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
107,NP_001029249.1,cH4_(Homo_sapiens),,554313,H4C15,9606.0,Homo sapiens,Chordata,Mammalia,Mammalia,,MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGVKR...,
176,NP_003486.1,cH4_(Homo_sapiens),,8294,H4C9,9606.0,Homo sapiens,Chordata,Mammalia,Mammalia,,MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGVKR...,
205,NP_003529.1,cH4_(Homo_sapiens),,8359,H4C1,9606.0,Homo sapiens,Chordata,Mammalia,Mammalia,,MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGVKR...,
206,NP_003530.1,cH4_(Homo_sapiens),,8360,H4C4,9606.0,Homo sapiens,Chordata,Mammalia,Mammalia,,MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGVKR...,
207,NP_003531.1,cH4_(Homo_sapiens),,8361,H4C6,9606.0,Homo sapiens,Chordata,Mammalia,Mammalia,,MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGVKR...,
208,NP_003532.1,cH4_(Homo_sapiens),,8362,H4C12,9606.0,Homo sapiens,Chordata,Mammalia,Mammalia,,MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGVKR...,
209,NP_003533.1,cH4_(Homo_sapiens),,8364,H4C3,9606.0,Homo sapiens,Chordata,Mammalia,Mammalia,,MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGVKR...,
210,NP_003534.1,cH4_(Homo_sapiens),,8365,H4C8,9606.0,Homo sapiens,Chordata,Mammalia,Mammalia,,MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGVKR...,
211,NP_003535.1,cH4_(Homo_sapiens),,8366,H4C2,9606.0,Homo sapiens,Chordata,Mammalia,Mammalia,,MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGVKR...,
212,NP_003536.1,cH4_(Homo_sapiens),,8367,H4C5,9606.0,Homo sapiens,Chordata,Mammalia,Mammalia,,MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGVKR...,


In [119]:
query = "SELECT * FROM histone WHERE id='cH4_(Homo_sapiens)'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description]).fillna("")

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
0,cH4_(Homo_sapiens),variant,Homo sapiens,9606,,cH4


In [120]:
query = "SELECT * FROM sequence"
cursor.execute(query)
sequence_df = pd.DataFrame(
    cursor.fetchall(), columns=[i[0] for i in cursor.description]
).fillna("")
sequence_df = sequence_df[sequence_df["variant_under_consideration"] != ""][
    ["accession", "variant", "variant_under_consideration"]
].sort_values(by=["variant_under_consideration"])
sequence_df.head()

Unnamed: 0,accession,variant,variant_under_consideration
112,NP_001035891.1,,cenH3_(Homo_sapiens)__???
171,NP_001800.1,,cenH3_(Homo_sapiens)__???
5,AAK39657.1,,cenH3__???
230,NP_012875.2,,cenH3__???
233,NP_031707.1,,cenH3__???


In [121]:
data_histone = {
    "id": "cenH3_(Animals)",
    "level": "variant",
    "taxonomic_span": "Homo sapiens",
    "taxonomic_span_id": "9606",
    "description": None,
    "parent": "cenH3_(Eukarya)",
}
cursor.execute(add_histone, data_histone)
data_histone = {
    "id": "cenH3_(Mammalia)",
    "level": "variant",
    "taxonomic_span": "Homo sapiens",
    "taxonomic_span_id": "9606",
    "description": None,
    "parent": "cenH3_(Animals)",
}
cursor.execute(add_histone, data_histone)
data_histone = {
    "id": "cenH3_(Homo_sapiens)",
    "level": "variant",
    "taxonomic_span": "Homo sapiens",
    "taxonomic_span_id": "9606",
    "description": None,
    "parent": "cenH3_(Mammalia)",
}
cursor.execute(add_histone, data_histone)

query = f"UPDATE sequence SET variant='cenH3_(Homo_sapiens)', variant_under_consideration=null WHERE accession='NP_001035891.1'"
cursor.execute(query)
query = f"UPDATE sequence SET variant='cenH3_(Homo_sapiens)', variant_under_consideration=null WHERE accession='NP_001800.1'"
cursor.execute(query)
# Make sure data is committed to the database
conn.commit()

In [122]:
query = "SELECT * FROM sequence WHERE accession='NP_001035891.1'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description]).fillna("")

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
0,NP_001035891.1,cenH3_(Homo_sapiens),,1058,CENPA,9606,Homo sapiens,Chordata,Mammalia,Mammalia,,MGPRRRSRKPEAPRRRSPSPTPTPGPSRRGPSLGASSHQHSRRRQG...,


In [123]:
query = "SELECT * FROM sequence WHERE accession='NP_001800.1'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description]).fillna("")

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
0,NP_001800.1,cenH3_(Homo_sapiens),,1058,CENPA,9606,Homo sapiens,Chordata,Mammalia,Mammalia,,MGPRRRSRKPEAPRRRSPSPTPTPGPSRRGPSLGASSHQHSRRRQG...,


In [124]:
query = "SELECT * FROM histone WHERE id='cenH3_(Homo_sapiens)'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description]).fillna("")

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
0,cenH3_(Homo_sapiens),variant,Homo sapiens,9606,,cenH3_(Mammalia)


In [126]:
query = "SELECT * FROM sequence"
cursor.execute(query)
sequence_df = pd.DataFrame(
    cursor.fetchall(), columns=[i[0] for i in cursor.description]
).fillna("")
sequence_df = sequence_df[sequence_df["variant_under_consideration"] != ""][
    ["accession", "variant", "variant_under_consideration"]
].sort_values(by=["variant_under_consideration"])
sequence_df

Unnamed: 0,accession,variant,variant_under_consideration
5,AAK39657.1,,cenH3__???
230,NP_012875.2,,cenH3__???
233,NP_031707.1,,cenH3__???
284,NP_499128.1,,cenH3__???
289,NP_523730.2,,cenH3__???
293,NP_563627.1,,cenH3__???
298,NP_596473.1,,cenH3__???
361,XP_001011273.1,,cenH3__???
369,XP_001350068.1,,cenH3__???
401,XP_002287626.1,,cenH3__???


In [128]:
sequence_df['TAX'] = [
    "Plants",
    "Fungi",
    "Mammalia",
    "Animals",
    "Animals",
    "Plants",
    "Fungi",
    "Eukarya",
    "Eukarya",
    "Eukarya",
    "Eukarya",
    "Plants",
    "Eukarya",
]
sequence_df = sequence_df.sort_values(by=["TAX"])
sequence_df

Unnamed: 0,accession,variant,variant_under_consideration,TAX
284,NP_499128.1,,cenH3__???,Animals
289,NP_523730.2,,cenH3__???,Animals
361,XP_001011273.1,,cenH3__???,Eukarya
369,XP_001350068.1,,cenH3__???,Eukarya
401,XP_002287626.1,,cenH3__???,Eukarya
416,XP_002767160.1,,cenH3__???,Eukarya
503,XP_009526809.1,,cenH3__???,Eukarya
230,NP_012875.2,,cenH3__???,Fungi
298,NP_596473.1,,cenH3__???,Fungi
233,NP_031707.1,,cenH3__???,Mammalia


In [129]:
data_histone = {
    "id": "cenH3_(Fungi)",
    "level": "variant",
    "taxonomic_span": "Homo sapiens",
    "taxonomic_span_id": "9606",
    "description": None,
    "parent": "cenH3_(Eukarya)",
}
cursor.execute(add_histone, data_histone)

for i, row in sequence_df.iterrows():
    query = f"UPDATE sequence SET variant='cenH3_({row['TAX']})', variant_under_consideration=null WHERE accession='{row['accession']}'"
    cursor.execute(query)
# Make sure data is committed to the database
conn.commit()

In [130]:
query = "SELECT * FROM sequence"
cursor.execute(query)
cenh3 = pd.DataFrame(
    cursor.fetchall(), columns=[i[0] for i in cursor.description]
).fillna("")
cenh3[cenh3["accession"].isin(sequence_df["accession"])]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
5,AAK39657.1,cenH3_(Plants),13794280,,,55529.0,Guillardia theta,,Cryptophyceae,,,MMKKQNLKRFKKSSNSLVDIRKFQKSTDLLIHRLPFARLVKEISLK...,
230,NP_012875.2,cenH3_(Fungi),27808712,,,559292.0,Saccharomyces cerevisiae S288C,Ascomycota,Saccharomycetes,,,MSSKQQWVSSAIQSDSSGRSLSNVNRLAGDQQSINDRALSLLQRTR...,
233,NP_031707.1,cenH3_(Mammalia),6680920,,,10090.0,Mus musculus,Chordata,Mammalia,,,MGPRRKPQTPRRRPSSPAPGPSRQSSSVGSQTLRRRQKFMWLKEIK...,
284,NP_499128.1,cenH3_(Animals),17553736,,,6239.0,Caenorhabditis elegans,Nematoda,Chromadorea,,,MADDTPIIEEIAEQNESVTRIMQRLKHDMQRVTSVPGFNTSAAGVN...,
289,NP_523730.2,cenH3_(Animals),22024004,,,7227.0,Drosophila melanogaster,Arthropoda,Insecta,,,MPRHSRAKRAPRPSANNSKSPNDDDTAFRSPEPEDGTDYGLEFTTS...,
293,NP_563627.1,cenH3_(Plants),18378832,,,3702.0,Arabidopsis thaliana,Streptophyta,Magnoliopsida,,,MARTKHRVTRSQPRNQTDAAGASSSQAAGPTTTPTRRGGEGGDNTQ...,
298,NP_596473.1,cenH3_(Fungi),19113265,,,4896.0,Schizosaccharomyces pombe,Ascomycota,Schizosaccharomycetes,,,MAKKSLMAEPGDPIPRPRKKRYRPGTTALREIRKYQRSTDLLIQRL...,
361,XP_001011273.1,cenH3_(Eukarya),118356028,,,312017.0,Tetrahymena thermophila SB210,Ciliophora,Oligohymenophorea,,,MARKAYQPKRRSNSNQNQQRSDSLKKNKQDNLRSKSAGNQQGNEKN...,
369,XP_001350068.1,cenH3_(Eukarya),124513424,,,36329.0,Plasmodium falciparum 3D7,Apicomplexa,Aconoidasida,,,MVRTKKNIPNHNPLNAFNRDKSFKTNKTLPNRTVHHGISSKTTNIN...,
401,XP_002287626.1,cenH3_(Eukarya),223995905,,,296543.0,Thalassiosira pseudonana CCMP1335,Bacillariophyta,Coscinodiscophyceae,,,MRPGEKALREIRQYQSSTSLLLRRLPFARLVREIQYGMTRQPYRWQ...,


In [131]:
query = "SELECT * FROM histone WHERE id='cenH3_(Fungi)'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description]).fillna("")

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
0,cenH3_(Fungi),variant,Homo sapiens,9606,,cenH3_(Eukarya)


In [132]:
query = "SELECT * FROM sequence"
cursor.execute(query)
sequence_df = pd.DataFrame(
    cursor.fetchall(), columns=[i[0] for i in cursor.description]
).fillna("")
sequence_df = sequence_df[sequence_df["variant_under_consideration"] != ""][
    ["accession", "variant", "variant_under_consideration"]
].sort_values(by=["variant_under_consideration"])
sequence_df

Unnamed: 0,accession,variant,variant_under_consideration


# Close connections

In [19]:
cursor.close()
conn.close()
tunnel.stop()

# Что осталось необработанным:

In [None]:
uresolved = {
    "H3.3_(Eukarya)": {
        "level": "variant_group",
        "description": "The major replication-independent or replacement H3, important for development, transcription, and chromosome segregation. It typically differs from the canonical H3 by only a few amino acids that are necessary for replication-independent assembly. ",
        "taxonomic_span": "Eukaryotes",
        "taxonomic_span_id": "2759",
        "alternate_names": [
            {"name": "soH3-1", "gene": 1},
            {"name": "soH3-2", "gene": 2},
            {"name": "hv2", "gene": 1},
        ],
        "publications": ["22650316", "20738881", "19412883", "24229707", "14583738"],
    },  # не разобралась
    "H3.5_(Mammals?)": {
        "level": "variant_group",
        "description": "A hominid specific variant expressed in seminiferous tubules of human testis.",
        "taxonomic_span": "Hominids",
        "taxonomic_span_id": "9604",
        "alternate_names": [],
        "publications": ["21274551"],
    },  # у меня есть какое-то описание, надо сравнить
    "H3.5_(Hominids?)": {
        "level": "variant_group",
        "description": "A hominid specific variant expressed in seminiferous tubules of human testis.",
        "taxonomic_span": "Hominids",
        "taxonomic_span_id": "9604",
        "alternate_names": [],
        "publications": ["21274551"],
    },  # у меня есть какое-то описание, надо сравнить
}