In [1]:
import pandas as pd
from mysql.connector import connection
from sshtunnel import SSHTunnelForwarder

In [2]:
# Load the SSH / database endpoint settings from a plain KEY=VALUE text file.
# Blank lines and lines starting with "#" are ignored.
with open("db_curated_server_info.txt", "r") as file:
    lines = file.readlines()

config = {}

for line_number, line in enumerate(lines, start=1):
    line = line.strip()
    if not line or line.startswith("#"):
        continue
    key, separator, value = line.partition("=")
    if not separator:
        # Report the line number only: the line itself may contain a password.
        raise ValueError(
            f"db_curated_server_info.txt line {line_number}: expected KEY=VALUE"
        )
    # Strip both sides so that "key = value" is stored under "key", not "key ".
    config[key.strip()] = value.strip()


def _config_port(key):
    """Return config[key] as an int, with a readable error if it is missing or not a number."""
    raw = config.get(key)
    try:
        return int(raw)
    except (TypeError, ValueError):
        raise ValueError(
            f"db_curated_server_info.txt: {key!r} must be an integer port, got {raw!r}"
        ) from None


# NOTE(review): the spellings "srever_port" and "db_adress" are kept on purpose: they are
# the keys this code reads from the settings file and the names used by the tunnel cell.
server_name = config.get("server_name")
srever_port = _config_port("srever_port")
ssh_password = config.get("ssh_password")
ssh_username = config.get("ssh_username")
db_adress = config.get("db_adress")
db_port = _config_port("db_port")

In [3]:
# Open an SSH tunnel through the gateway host to the database address/port.
ssh_gateway = (server_name, srever_port)
database_endpoint = (db_adress, db_port)

tunnel = SSHTunnelForwarder(
    ssh_gateway,
    ssh_username=ssh_username,
    ssh_password=ssh_password,
    remote_bind_address=database_endpoint,
)
tunnel.start()
# Show the tunnel's local bind port; the MySQL connection uses it as its port.
print(tunnel.local_bind_port)

46789


In [4]:
# Connect to MySQL through the local end of the SSH tunnel and open a cursor.
# NOTE(review): user, password and database name are literal values in this cell;
# consider reading them from the settings file like the SSH parameters.
mysql_settings = {
    "user": "db_user",
    "password": "db_password",
    "host": "localhost",
    "port": tunnel.local_bind_port,
    "database": "db_name",
}
conn = connection.MySQLConnection(**mysql_settings)
cursor = conn.cursor()

In [5]:
# List the tables of the connected database.
query = "SHOW TABLES;"
cursor.execute(query)
table_rows = cursor.fetchall()
table_rows

[('alternative_name',),
 ('histone',),
 ('histone_description',),
 ('histone_has_publication',),
 ('publication',),
 ('sequence',),
 ('sequence_has_publication',)]

In [10]:
# INSERT templates for the `histone` and `histone_description` tables.
# Column lists and placeholders are generated from the same tuples, so their counts always match.
_histone_columns = (
    "id",
    "level",
    "taxonomic_span",
    "taxonomic_span_id",
    "description",
    "parent",
)
_description_columns = (
    "summary",
    "taxonomy",
    "genes",
    "evolution",
    "expression",
    "knock_out",
    "function",
    "sequence",
    "localization",
    "deposition",
    "structure",
    "interactions",
    "disease",
    "caveats",
)

# Named placeholders: execute() is given a dict keyed by column name.
add_histone = "INSERT INTO histone ({}) VALUES ({})".format(
    ", ".join(_histone_columns),
    ", ".join(f"%({column})s" for column in _histone_columns),
)
# Positional placeholders: execute() is given one value per listed column, in order.
add_histone_description = "INSERT INTO histone_description ({}) VALUES ({})".format(
    ", ".join(_description_columns),
    ", ".join(["%s"] * len(_description_columns)),
)

# Add or update H3.V

In [7]:
# Load the whole `histone` table into a DataFrame and show the rows whose id contains "H3.V".
query = "SELECT * FROM histone"
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[column[0] for column in cursor.description])
# regex=False matches the text literally; with the default regex matching the "." would
# match any character. na=False keeps the mask boolean if an id is missing.
histone_df[histone_df["id"].str.contains("H3.V", regex=False, na=False)]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
177,H3.V_(Trypanosomes?),variant_group,Trypanosomes,93954,58,H3


In [8]:
# Show the column names of `histone_description` as one comma-separated string.
query = "SELECT * FROM histone_description"
cursor.execute(query)
# The rows are fetched and discarded (presumably so no unread result stays on the cursor);
# only the column metadata is used.
cursor.fetchall()
description_column_names = [column[0] for column in cursor.description]
", ".join(description_column_names)

'id, summary, taxonomy, genes, evolution, expression, knock_out, function, sequence, localization, deposition, structure, interactions, disease, caveats'

In [11]:
# Summary text stored in the `summary` column of the new H3.V description row.
histone_description_summary = "H3.V is a histone variant characterized so far in some trypanosomatids. It shares ~45-60% sequence identity with canonical H3. H3.V is not essential for viability. Although H3.V associated with the mitotic spindle and enriched at telomeres it is not essential for mini-chromosome segregation, telomere maintenance or transcriptional silencing at the telomere-proximal expression sites [lowell_variant_2004,  anderson_kinetoplastid-specific_2013]. While the general structure of H3.V resembles that of CenH3, it does not possess the specific sequence features required to be classified as a CenH3 variant [lowell_variant_2004]. H3.V plays a critical role in the regulation of antigenic variation. Together with H4.V they are involved in the positioning and expression of variant surface glycoprotein (VSG) genes, which the parasite Trypanosoma brucei switches to avoid immune detection [muller_genome_2018, schulz_base_2016]. H3.V is important for transcription termination and expression of downstream genes. The trypanosomatid-specific hyper-modified DNA base J  and H3.V collaboratively influence the termination process resulting in read-through transcription and increased expression of downstream genes [siegel_four_2009, reynolds_histone_2016, schulz_base_2016, mcdonald_localization_2022]."
# The INSERT template takes 14 values: the summary first, then 13 columns passed as None.
remaining_description_fields = (None,) * 13
data_histone_description = (histone_description_summary, *remaining_description_fields)
cursor.execute(add_histone_description, data_histone_description)
# Commit so the inserted row is persisted.
conn.commit()

In [12]:
# Load `histone_description` into a DataFrame and show the rows whose summary mentions "H3.V".
query = "SELECT * FROM histone_description"
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[column[0] for column in cursor.description])
# regex=False matches "H3.V" literally (the default regex would treat "." as a wildcard);
# na=False treats rows with a missing summary as non-matching instead of breaking the mask.
histone_df[histone_df["summary"].str.contains("H3.V", regex=False, na=False)]

Unnamed: 0,id,summary,taxonomy,genes,evolution,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
204,205,H3.V is a histone variant characterized so far...,,,,,,,,,,,,,


In [13]:
# Rename the H3.V histone and point it at the description row inserted earlier.
# cursor.lastrowid reflects the previous INSERT/UPDATE on this cursor. Other statements may
# have run since the insert, so if it is empty fall back to a lookup by the summary text.
histone_description_id = cursor.lastrowid
if not histone_description_id:
    cursor.execute(
        "SELECT MAX(id) FROM histone_description WHERE summary = %s",
        (histone_description_summary,),
    )
    histone_description_id = cursor.fetchall()[0][0]
if not histone_description_id:
    raise RuntimeError(
        "No histone_description row found for the current summary; run the insert cell first."
    )

# Bind the values as parameters instead of formatting them into the SQL text.
query = "UPDATE histone SET id=%s, description=%s WHERE id=%s"
cursor.execute(
    query,
    ("H3.V_(Trypanosomatidae)", histone_description_id, "H3.V_(Trypanosomes?)"),
)
# Commit so the update is persisted.
conn.commit()

In [14]:
# Join every histone with its description row (if any) and show the renamed H3.V entry.
description_fields = (
    "summary", "taxonomy", "genes", "evolution", "expression", "knock_out", "function",
    "sequence", "localization", "deposition", "structure", "interactions", "disease", "caveats",
)
query = (
    "SELECT h.*, "
    + ", ".join(f"d.{field}" for field in description_fields)
    + " FROM histone h LEFT JOIN histone_description d ON h.description = d.id"
)
cursor.execute(query)
joined_rows = cursor.fetchall()
histone_df = pd.DataFrame(joined_rows, columns=[column[0] for column in cursor.description])
is_renamed_h3v = histone_df["id"] == "H3.V_(Trypanosomatidae)"
histone_df[is_renamed_h3v]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,summary,taxonomy,genes,evolution,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
177,H3.V_(Trypanosomatidae),variant_group,Trypanosomes,93954,205,H3,H3.V is a histone variant characterized so far...,,,,,,,,,,,,,


# Add or update H3.5_(Hominidae)

In [15]:
# Load the whole `histone` table into a DataFrame and show the rows whose id contains "H3.5".
query = "SELECT * FROM histone"
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[column[0] for column in cursor.description])
# regex=False matches the text literally; with the default regex matching the "." would
# match any character. na=False keeps the mask boolean if an id is missing.
histone_df[histone_df["id"].str.contains("H3.5", regex=False, na=False)]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
170,H3.5_(Homo_sapiens),variant,Homo sapiens,9606.0,108,H3.5_(Primates_or_Hominids?)
171,H3.5_(Primates_or_Hominids?),variant_group,,,52,H3


In [16]:
# Join histones with their descriptions and inspect the H3.5 group row.
query = (
    "SELECT h.*, "
    "d.summary, d.taxonomy, d.genes, d.evolution, d.expression, d.knock_out, d.function, "
    "d.sequence, d.localization, d.deposition, d.structure, d.interactions, d.disease, d.caveats "
    "FROM histone h "
    "LEFT JOIN histone_description d ON h.description = d.id"
)
cursor.execute(query)
joined_rows = cursor.fetchall()
joined_columns = [column[0] for column in cursor.description]
histone_df = pd.DataFrame(joined_rows, columns=joined_columns)
is_h35_group = histone_df["id"] == "H3.5_(Primates_or_Hominids?)"
histone_df[is_h35_group]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,summary,taxonomy,genes,evolution,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
171,H3.5_(Primates_or_Hominids?),variant_group,,,52,H3,,,,,,,,,,,,,,


## удалим описание 52, так как все поля в нем пустые

In [17]:
# Detach description 52 from the H3.5 group, then delete that description row
# (the joined view of this histone shows all of its description fields empty).
try:
    query = "UPDATE histone SET description=NULL WHERE id=%s"
    cursor.execute(query, ("H3.5_(Primates_or_Hominids?)",))
    query = "DELETE FROM histone_description WHERE id=%s"
    cursor.execute(query, (52,))
except Exception:
    # Undo the first statement if the second one fails, so a later commit
    # cannot persist only half of this change.
    conn.rollback()
    raise
# Commit both statements together.
conn.commit()

In [18]:
# Re-run the joined view to confirm the H3.5 group no longer references a description.
description_fields = (
    "summary", "taxonomy", "genes", "evolution", "expression", "knock_out", "function",
    "sequence", "localization", "deposition", "structure", "interactions", "disease", "caveats",
)
query = (
    "SELECT h.*, "
    + ", ".join(f"d.{field}" for field in description_fields)
    + " FROM histone h LEFT JOIN histone_description d ON h.description = d.id"
)
cursor.execute(query)
joined_rows = cursor.fetchall()
histone_df = pd.DataFrame(joined_rows, columns=[column[0] for column in cursor.description])
is_h35_group = histone_df["id"] == "H3.5_(Primates_or_Hominids?)"
histone_df[is_h35_group]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,summary,taxonomy,genes,evolution,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
171,H3.5_(Primates_or_Hominids?),variant_group,,,,H3,,,,,,,,,,,,,,


## создадим новое описание для гистона

In [19]:
# Summary text stored in the `summary` column of the new H3.5 description row.
# NOTE(review): the text contains spelling slips ("primarely", "enreached", "Notable,");
# it is left unchanged here because it is the data written to the database.
histone_description_summary = "H3.5 is a hominid-specific histone H3 variant expressed in the seminiferous tubules of human testes, encoded on chromosome 12p11.21 region of human chromosome 12. H3.3 is the most similar variant to H3.5 (approximately 96% identity). H3.5 has two copies of the ARKST motif and 5 amino acids differences comparing to H3.3 [schenk_h35_2011, ederveen_human_2011]. H3.5 likely evolved from H3F3B gene duplication and is associated with euchromatin and actively transcribed genes [schenk_h35_2011]. Nucleosomes containing histone variant H3.5 are less stable than H3.1 and H3.3 nucleosomes, but more stable than H3.4 nucleosome [urahama_histone_2016]. Notable, amino acid L in position 103 of H3.5 (amino acid F at the corresponding position in H3.3) reduces the number of hydrophobic bonds with histone H4 leading to instability of nucleosome [urahama_histone_2016]. H3.5 localizes primarely at exons, 5'UTR, 3'UTR, promoters and introns. Although it enreached around transcription start sites (TSSs), H3.5 distribution is independent of gene expression levels [urahama_histone_2016]. Clinical data indicate that H3.5 mRNA levels are significantly lower in patients with nonobstructive azoospermia compared to those with obstructive azoospermia or normal individuals, suggesting H3.5's role in normal spermatogenesis and its regulation by gonadotropins [ding_primate-specific_2021]."
# The INSERT template takes 14 values: the summary first, then 13 columns passed as None.
remaining_description_fields = (None,) * 13
data_histone_description = (histone_description_summary, *remaining_description_fields)
cursor.execute(add_histone_description, data_histone_description)
# Commit so the inserted row is persisted.
conn.commit()

In [20]:
# Load `histone_description` into a DataFrame and show the rows whose summary mentions "H3.5".
query = "SELECT * FROM histone_description"
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[column[0] for column in cursor.description])
# regex=False matches "H3.5" literally (the default regex would treat "." as a wildcard);
# na=False treats rows with a missing summary as non-matching instead of breaking the mask.
histone_df[histone_df["summary"].str.contains("H3.5", regex=False, na=False)]

Unnamed: 0,id,summary,taxonomy,genes,evolution,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
204,206,H3.5 is a hominid-specific histone H3 variant ...,,,,,,,,,,,,,


In [21]:
# Look at the current row of the H3.5 group before replacing it.
query = "SELECT * FROM histone WHERE id='H3.5_(Primates_or_Hominids?)'"
cursor.execute(query)
current_group_row = cursor.fetchone()
current_group_row

('H3.5_(Primates_or_Hominids?)', 'variant_group', 'null', 'null', None, 'H3')

## создадим новую запись для H3.5_(Hominidae), остальные поля оставим те же, за исключением описания, которому присвоим id нового описания 206

In [22]:
# Create H3.5_(Hominidae) as a copy of H3.5_(Primates_or_Hominids?) that points at the
# newly inserted H3.5 description.
query = "SELECT * FROM histone WHERE id='H3.5_(Primates_or_Hominids?)'"
cursor.execute(query)
old_histone_data = cursor.fetchone()
if old_histone_data is None:
    # Without this check the indexing below fails with an unhelpful TypeError.
    raise LookupError("histone 'H3.5_(Primates_or_Hominids?)' not found; nothing to copy from.")

# Resolve the description id from the database instead of hard-coding 206: the
# auto-increment value differs whenever the notebook is re-run or the table has changed.
# Assumes histone_description_summary still holds the H3.5 summary inserted earlier.
cursor.execute(
    "SELECT MAX(id) FROM histone_description WHERE summary = %s",
    (histone_description_summary,),
)
new_description_id = cursor.fetchall()[0][0]
if new_description_id is None:
    raise LookupError("No histone_description row matches the H3.5 summary; insert it first.")

# Row layout used here: (id, level, taxonomic_span, taxonomic_span_id, description, parent).
data_histone = {
    "id": "H3.5_(Hominidae)",
    "level": old_histone_data[1],
    "taxonomic_span": old_histone_data[2],
    "taxonomic_span_id": old_histone_data[3],
    "description": new_description_id,
    "parent": old_histone_data[5],
}
cursor.execute(add_histone, data_histone)

# Commit so the inserted row is persisted.
conn.commit()

## поменяем все зависимости: поле parent в таблице histone, поле variant в таблице sequence, поле histone в таблице alternative_name и histone_id в таблице histone_has_publication

In [23]:
# Reload the `histone` table and show all rows whose id contains "H3.5".
query = "SELECT * FROM histone"
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[column[0] for column in cursor.description])
# regex=False matches the text literally; with the default regex matching the "." would
# match any character. na=False keeps the mask boolean if an id is missing.
histone_df[histone_df["id"].str.contains("H3.5", regex=False, na=False)]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
170,H3.5_(Hominidae),variant_group,,,206.0,H3
171,H3.5_(Homo_sapiens),variant,Homo sapiens,9606.0,108.0,H3.5_(Primates_or_Hominids?)
172,H3.5_(Primates_or_Hominids?),variant_group,,,,H3


In [24]:
# Move every histone whose parent is the old H3.5 group under H3.5_(Hominidae).
query = (
    "UPDATE histone SET parent='H3.5_(Hominidae)' "
    "WHERE parent='H3.5_(Primates_or_Hominids?)'"
)
cursor.execute(query)
# Commit so the update is persisted.
conn.commit()

In [25]:
# Reload the `histone` table to check the parent column of the H3.5 rows.
query = "SELECT * FROM histone"
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[column[0] for column in cursor.description])
# regex=False matches the text literally; with the default regex matching the "." would
# match any character. na=False keeps the mask boolean if an id is missing.
histone_df[histone_df["id"].str.contains("H3.5", regex=False, na=False)]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
170,H3.5_(Hominidae),variant_group,,,206.0,H3
171,H3.5_(Homo_sapiens),variant,Homo sapiens,9606.0,108.0,H3.5_(Hominidae)
172,H3.5_(Primates_or_Hominids?),variant_group,,,,H3


In [26]:
# Find the sequences still assigned to the old H3.5 group.
# (The frame is stored in `histone_df` even though it holds `sequence` rows.)
query = "SELECT * FROM sequence"
cursor.execute(query)
sequence_rows = cursor.fetchall()
sequence_columns = [column[0] for column in cursor.description]
histone_df = pd.DataFrame(sequence_rows, columns=sequence_columns)
uses_old_variant = histone_df["variant"] == "H3.5_(Primates_or_Hominids?)"
histone_df[uses_old_variant]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
468,XP_003954426.1,H3.5_(Primates_or_Hominids?),410046862,,,9598.0,Pan troglodytes,Chordata,Mammalia,,,MARTKQTARKSTGGKAPRKQLATKAARKSTPSTXGVKKPHRYRPGT...,


In [27]:
# Reassign the sequences of the old H3.5 group to H3.5_(Hominidae).
query = (
    "UPDATE sequence SET variant='H3.5_(Hominidae)' "
    "WHERE variant='H3.5_(Primates_or_Hominids?)'"
)
cursor.execute(query)
# Commit so the update is persisted.
conn.commit()

In [28]:
# Check which sequences are now assigned to H3.5_(Hominidae).
# (The frame is stored in `histone_df` even though it holds `sequence` rows.)
query = "SELECT * FROM sequence"
cursor.execute(query)
sequence_rows = cursor.fetchall()
sequence_columns = [column[0] for column in cursor.description]
histone_df = pd.DataFrame(sequence_rows, columns=sequence_columns)
uses_new_variant = histone_df["variant"] == "H3.5_(Hominidae)"
histone_df[uses_new_variant]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
468,XP_003954426.1,H3.5_(Hominidae),410046862,,,9598.0,Pan troglodytes,Chordata,Mammalia,,,MARTKQTARKSTGGKAPRKQLATKAARKSTPSTXGVKKPHRYRPGT...,


In [29]:
# Check whether any alternative name still points at the old H3.5 group.
query = "SELECT * FROM alternative_name"
cursor.execute(query)
alternative_name_rows = cursor.fetchall()
alternative_name_columns = [column[0] for column in cursor.description]
histone_df = pd.DataFrame(alternative_name_rows, columns=alternative_name_columns)
refers_to_old_group = histone_df["histone"] == "H3.5_(Primates_or_Hominids?)"
histone_df[refers_to_old_group]

Unnamed: 0,id,name,taxonomy,gene,splice,histone


In [30]:
# Check whether any publication link still points at the old H3.5 group.
query = "SELECT * FROM histone_has_publication"
cursor.execute(query)
publication_link_rows = cursor.fetchall()
publication_link_columns = [column[0] for column in cursor.description]
histone_df = pd.DataFrame(publication_link_rows, columns=publication_link_columns)
refers_to_old_group = histone_df["histone_id"] == "H3.5_(Primates_or_Hominids?)"
histone_df[refers_to_old_group]

Unnamed: 0,histone_id,publication_id


## удалим H3.5_(Primates_or_Hominids?)

In [31]:
# Remove the old H3.5 group row; the preceding cells moved or checked its dependants.
query = (
    "DELETE FROM histone "
    "WHERE id='H3.5_(Primates_or_Hominids?)'"
)
cursor.execute(query)
# Commit so the deletion is persisted.
conn.commit()

In [32]:
# Reload the `histone` table to confirm which H3.5 rows remain.
query = "SELECT * FROM histone"
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[column[0] for column in cursor.description])
# regex=False matches the text literally; with the default regex matching the "." would
# match any character. na=False keeps the mask boolean if an id is missing.
histone_df[histone_df["id"].str.contains("H3.5", regex=False, na=False)]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
170,H3.5_(Hominidae),variant_group,,,206,H3
171,H3.5_(Homo_sapiens),variant,Homo sapiens,9606.0,108,H3.5_(Hominidae)


# Add or update H3.6_(Homo_sapiens)

In [9]:
# Load the whole `histone` table into a DataFrame and show the rows whose id contains "H3.6".
query = "SELECT * FROM histone"
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[column[0] for column in cursor.description])
# regex=False matches the text literally; with the default regex matching the "." would
# match any character. na=False keeps the mask boolean if an id is missing.
histone_df[histone_df["id"].str.contains("H3.6", regex=False, na=False)]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
172,H3.6_(Mammals?)?,variant_group,,,53,H3


In [13]:
# Join histones with their descriptions and inspect the H3.6 group row.
query = (
    "SELECT h.*, "
    "d.summary, d.taxonomy, d.genes, d.evolution, d.expression, d.knock_out, d.function, "
    "d.sequence, d.localization, d.deposition, d.structure, d.interactions, d.disease, d.caveats "
    "FROM histone h "
    "LEFT JOIN histone_description d ON h.description = d.id"
)
cursor.execute(query)
joined_rows = cursor.fetchall()
joined_columns = [column[0] for column in cursor.description]
histone_df = pd.DataFrame(joined_rows, columns=joined_columns)
is_h36_group = histone_df["id"] == "H3.6_(Mammals?)?"
histone_df[is_h36_group]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,summary,taxonomy,genes,evolution,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
172,H3.6_(Mammals?)?,variant_group,,,53,H3,,,,,,,,,,,,,,


In [9]:
# Join the `histone_description` column names with "=null AND " as a scratch string.
# Note the separator only goes between names, so the last column gets no "=null" suffix.
query = "SELECT * FROM histone_description"
cursor.execute(query)
# The rows are fetched and discarded; only the column metadata is used.
cursor.fetchall()
description_column_names = [column[0] for column in cursor.description]
"=null AND ".join(description_column_names)

'id=null AND summary=null AND taxonomy=null AND genes=null AND evolution=null AND expression=null AND knock_out=null AND function=null AND sequence=null AND localization=null AND deposition=null AND structure=null AND interactions=null AND disease=null AND caveats'

In [22]:
# Count description rows in which every content column holds the literal string 'null'.
null_string_columns = (
    "summary", "taxonomy", "genes", "evolution", "expression", "knock_out", "function",
    "sequence", "localization", "deposition", "structure", "interactions", "disease", "caveats",
)
query = "SELECT * FROM histone_description WHERE " + " AND ".join(
    f"{column}='null'" for column in null_string_columns
)
cursor.execute(query)
all_null_rows = cursor.fetchall()
len(all_null_rows)

47

In [21]:
# Show the description rows whose summary is the literal string 'null'.
query = "SELECT * FROM histone_description WHERE summary='null'"
cursor.execute(query)
null_summary_rows = cursor.fetchall()
null_summary_rows

[(1,
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null'),
 (7,
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null'),
 (8,
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null'),
 (46,
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null'),
 (48,
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null'),
 (50,
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null'),
 (53,
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'null',
  'nu

# Close connections

In [23]:
# Release resources in reverse order of creation. The nested try/finally makes sure the
# connection is closed and the SSH tunnel is stopped even if an earlier close raises.
try:
    cursor.close()
finally:
    try:
        conn.close()
    finally:
        tunnel.stop()