In [1]:
import pandas as pd
from mysql.connector import connection
from sshtunnel import SSHTunnelForwarder

In [2]:
# Read the "key=value" connection settings file. Blank lines and lines that
# start with "#" are ignored.
with open("db_curated_server_info.txt", "r") as file:
    lines = file.readlines()

config = {}

for line in lines:
    line = line.strip()
    if not line or line.startswith("#"):
        continue
    key, separator, value = line.partition("=")
    if not separator:
        # Same exception type the tuple-unpacking would raise, but with a
        # message that points at the offending line.
        raise ValueError(f"Malformed config line (expected key=value): {line!r}")
    # Strip the key as well as the value so "name = value" is accepted;
    # otherwise the key keeps its trailing space and the lookups below miss it.
    config[key.strip()] = value.strip()

# The two port entries are converted with int(); fail with a clear message
# instead of "int() argument must be ... not 'NoneType'" when one is absent.
missing_ports = [name for name in ("srever_port", "db_port") if name not in config]
if missing_ports:
    raise KeyError(f"Missing required config entries: {', '.join(missing_ports)}")

# NOTE(review): "srever_port" and "db_adress" are spelled this way because
# they must match the keys used in the settings file — confirm before renaming.
server_name = config.get("server_name")
srever_port = int(config["srever_port"])
ssh_password = config.get("ssh_password")
ssh_username = config.get("ssh_username")
db_adress = config.get("db_adress")
db_port = int(config["db_port"])

In [3]:
# Forward a local port over SSH to the database address/port read from the
# settings file.
ssh_endpoint = (server_name, srever_port)
db_endpoint = (db_adress, db_port)

tunnel = SSHTunnelForwarder(
    ssh_endpoint,
    ssh_username=ssh_username,
    ssh_password=ssh_password,
    remote_bind_address=db_endpoint,
)
tunnel.start()

# Show the local port the tunnel is bound to; the MySQL connection uses it.
print(tunnel.local_bind_port)

36235


In [4]:
# Connect to MySQL through the local end of the SSH tunnel.
# NOTE(review): user, password and database are literal strings here —
# presumably redacted placeholders; confirm and load real values from config.
mysql_settings = {
    "user": "db_user",
    "password": "db_password",
    "host": "localhost",
    "port": tunnel.local_bind_port,
    "database": "db_name",
}
conn = connection.MySQLConnection(**mysql_settings)
cursor = conn.cursor()

In [5]:
# List the tables available in the connected database.
query = "SHOW TABLES;"
cursor.execute(query)
table_rows = cursor.fetchall()
table_rows

[('alternative_name',),
 ('histone',),
 ('histone_description',),
 ('histone_has_publication',),
 ('publication',),
 ('sequence',),
 ('sequence_has_publication',)]

In [6]:
# Parameterized INSERT templates used by the cells below. Keeping them in one
# cell means a fresh kernel has every statement defined before it is executed.

# Executed with a dict keyed by column name.
add_publication = (
    "INSERT INTO publication "
    "(id, title, doi, author, year) "
    "VALUES (%(id)s, %(title)s, %(doi)s, %(author)s, %(year)s)"
)

# Executed with a (sequence_accession, publication_id) tuple.
add_sequence_has_publication = (
    "INSERT INTO sequence_has_publication "
    "(sequence_accession, publication_id) "
    "VALUES (%s, %s)"
)

# Columns of `histone_description` other than `id`, in the order shown by the
# SELECT * output of that table later in this notebook.
# NOTE(review): assumes `id` is generated by the database — confirm schema.
_HISTONE_DESCRIPTION_COLUMNS = (
    "summary",
    "taxonomy",
    "genes",
    "evolution",
    "expression",
    "knock_out",
    "function",
    "sequence",
    "localization",
    "deposition",
    "structure",
    "interactions",
    "disease",
    "caveats",
)

# Executed with a 14-tuple (one value per column above). Column names are
# backtick-quoted because some of them (e.g. `function`) are MySQL keywords.
add_histone_description = (
    "INSERT INTO histone_description "
    "(" + ", ".join(f"`{column}`" for column in _HISTONE_DESCRIPTION_COLUMNS) + ") "
    "VALUES (" + ", ".join(["%s"] * len(_HISTONE_DESCRIPTION_COLUMNS)) + ")"
)

# Executed with a dict keyed by column name; columns follow the SELECT * output
# of the `histone` table shown later in this notebook.
add_histone = (
    "INSERT INTO histone "
    "(`id`, `level`, `taxonomic_span`, `taxonomic_span_id`, `description`, `parent`) "
    "VALUES (%(id)s, %(level)s, %(taxonomic_span)s, %(taxonomic_span_id)s, "
    "%(description)s, %(parent)s)"
)

# Update references of H2B.W sequences

In [7]:
# Load every sequence together with its linked publication ids (if any);
# sequences without a link get empty join columns from the LEFT JOIN.
query = "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp ON s.accession = sp.sequence_accession"
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(fetched_rows, columns=column_names)

# Show only the H2B.W rows.
is_h2bw = df["variant"] == "H2B.W"
df[is_h2bw]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
38,DAA13058.1,H2B.W,296470943,,,9913.0,Bos taurus,Chordata,Mammalia,,,MGIGGSILSETSSDSYEEDVITKETGISEIEPSEKEMAKVETSKPD...,,,
199,NP_001180847.1,H2B.W,302564969,,,9544.0,Macaca mulatta,Chordata,Mammalia,,,MLRTQVPPLLRSTTAIVWSCRVMAAASAMAEPSSETTSEEQLITQE...,,,
575,XP_002720211.1,H2B.W,291407712,,,9986.0,Oryctolagus cuniculus,Chordata,Mammalia,,,MAEPASHVASEENLSLEPKTTASSTPKEKQPRRRRRRRQGHNYSFA...,,,
592,XP_002925981.1,H2B.W,301781100,,,9646.0,Ailuropoda melanoleuca,Chordata,Mammalia,,,MAEPGCETSSEESLGTEEPSAANPKSPKQKQKQPRRQCRRRCRRCP...,,,
745,XP_548517.2,H2B.W,545560116,,,9615.0,Canis lupus familiaris,Chordata,Mammalia,,,MIPGKPEEGKGSSEGPICDTEVACDVRKCSDYKGASAPQQPPLSVL...,,,


In [8]:
def _publication_ids(ids):
    """Return the unique publication ids of one accession, or [] if all are missing."""
    if ids.dropna().empty:
        return []
    return list(ids.unique())


# One row per H2B.W accession with the list of publication ids linked to it.
h2bw_rows = df[df["variant"] == "H2B.W"]
h2bw_rows.groupby(["accession"])["publication_id"].apply(_publication_ids).reset_index()

Unnamed: 0,accession,publication_id
0,DAA13058.1,[]
1,NP_001180847.1,[]
2,XP_002720211.1,[]
3,XP_002925981.1,[]
4,XP_548517.2,[]


In [9]:
# Accessions of the H2B.W sequences (row mask and column selected in one .loc call).
df.loc[df["variant"] == "H2B.W", "accession"]

38         DAA13058.1
199    NP_001180847.1
575    XP_002720211.1
592    XP_002925981.1
745       XP_548517.2
Name: accession, dtype: object

In [10]:
# PubMed ids of the publications to attach to every H2B.W sequence.
pubmedids = ["22650316", "25731851", "19583817", "16449661"]

# Ids already present in `publication`. They are normalised to str so the
# membership test below works whether the driver returns ints or strings.
query = "SELECT id FROM publication"
cursor.execute(query)
exist_pubs = {str(i[0]) for i in cursor.fetchall()}

# Links that already exist, so re-running this cell does not insert duplicates.
cursor.execute("SELECT sequence_accession, publication_id FROM sequence_has_publication")
existing_links = {(accession, str(publication)) for accession, publication in cursor.fetchall()}

# Computed once outside the loop; .unique() guards against `df` holding
# several rows per accession (one per already-linked publication).
h2bw_accessions = list(df.loc[df["variant"] == "H2B.W", "accession"].unique())

try:
    for pid in pubmedids:
        if pid not in exist_pubs:
            # Only the id is known here; the other fields are stored as NULL.
            data_publication = {
                "id": pid,
                "title": None,
                "doi": None,
                "author": None,
                "year": None,
            }
            cursor.execute(add_publication, data_publication)
            exist_pubs.add(pid)
        for acc in h2bw_accessions:
            if (acc, pid) not in existing_links:
                cursor.execute(add_sequence_has_publication, (acc, pid))
                existing_links.add((acc, pid))
except Exception:
    # Leave the database untouched if any insert fails part-way through.
    conn.rollback()
    raise
else:
    # Make sure data is committed to the database
    conn.commit()

In [13]:
def _linked_publications(ids):
    """Return the unique publication ids of one accession, or [] if all are missing."""
    if ids.dropna().empty:
        return []
    return list(ids.unique())


# Reload the joined table and list the publication ids linked to each
# H2B.W accession.
query = "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp ON s.accession = sp.sequence_accession"
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(fetched_rows, columns=column_names)

h2bw_rows = df[df["variant"] == "H2B.W"]
h2bw_rows.groupby(["accession"])["publication_id"].apply(_linked_publications).reset_index()

Unnamed: 0,accession,publication_id
0,DAA13058.1,"[16449661, 19583817, 22650316, 25731851]"
1,NP_001180847.1,"[16449661, 19583817, 22650316, 25731851]"
2,XP_002720211.1,"[16449661, 19583817, 22650316, 25731851]"
3,XP_002925981.1,"[16449661, 19583817, 22650316, 25731851]"
4,XP_548517.2,"[16449661, 19583817, 22650316, 25731851]"


In [None]:
# Summary text stored for the H3.5 description; the bracketed tokens are
# citation keys kept verbatim inside the text.
histone_description_summary = "H3.5 is a hominid-specific histone H3 variant expressed in the seminiferous tubules of human testes, encoded on chromosome 12p11.21 region of human chromosome 12. H3.3 is the most similar variant to H3.5 (approximately 96% identity). H3.5 has two copies of the ARKST motif and 5 amino acids differences comparing to H3.3 [schenk_h35_2011, ederveen_human_2011]. H3.5 likely evolved from H3F3B gene duplication and is associated with euchromatin and actively transcribed genes [schenk_h35_2011]. Nucleosomes containing histone variant H3.5 are less stable than H3.1 and H3.3 nucleosomes, but more stable than H3.4 nucleosome [urahama_histone_2016]. Notable, amino acid L in position 103 of H3.5 (amino acid F at the corresponding position in H3.3) reduces the number of hydrophobic bonds with histone H4 leading to instability of nucleosome [urahama_histone_2016]. H3.5 localizes primarely at exons, 5'UTR, 3'UTR, promoters and introns. Although it enreached around transcription start sites (TSSs), H3.5 distribution is independent of gene expression levels [urahama_histone_2016]. Clinical data indicate that H3.5 mRNA levels are significantly lower in patients with nonobstructive azoospermia compared to those with obstructive azoospermia or normal individuals, suggesting H3.5's role in normal spermatogenesis and its regulation by gonadotropins [ding_primate-specific_2021]."

# The summary followed by 13 empty (NULL) values for the remaining fields.
empty_fields = [None] * 13
data_histone_description = (histone_description_summary, *empty_fields)

cursor.execute(add_histone_description, data_histone_description)
# Persist the new description row.
conn.commit()

In [20]:
# Load all histone descriptions and show the ones whose summary mentions H3.5.
query = "SELECT * FROM histone_description"
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
# regex=False: match the literal text "H3.5" (as a regex the "." would match
# any character, e.g. "H3x5"). na=False: rows with a NULL summary are treated
# as non-matching instead of putting NaN into the boolean mask.
histone_df[histone_df["summary"].str.contains("H3.5", regex=False, na=False)]

Unnamed: 0,id,summary,taxonomy,genes,evolution,expression,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats
204,206,H3.5 is a hominid-specific histone H3 variant ...,,,,,,,,,,,,,


In [14]:
# Look at the histone row that is about to be renamed.
query = "SELECT * FROM histone WHERE id='H3.5_(Primates_or_Hominids?)'"
cursor.execute(query)
old_group_row = cursor.fetchone()
old_group_row

('H3.5_(Primates_or_Hominids?)', 'variant_group', 'null', 'null', 52, 'H3')

In [24]:
# Create the "H3.5_(Hominidae)" histone entry as a copy of the old
# "H3.5_(Primates_or_Hominids?)" entry, pointing at the new description row.
OLD_HISTONE_ID = "H3.5_(Primates_or_Hominids?)"
NEW_HISTONE_ID = "H3.5_(Hominidae)"
# id of the H3.5 row in `histone_description`, as shown by the
# histone_description query output in this notebook.
# NOTE(review): this value changes if the description is re-inserted — confirm.
H3_5_DESCRIPTION_ID = 206

query = "SELECT * FROM histone WHERE id=%s"
cursor.execute(query, (OLD_HISTONE_ID,))
old_histone_data = cursor.fetchone()
if old_histone_data is None:
    # Happens when the cell is re-run after the old entry has been deleted.
    raise LookupError(f"No histone row with id {OLD_HISTONE_ID!r}; nothing to copy from")

# Address the fetched values by column name instead of by position so the
# copy does not depend on the column order of SELECT *.
old_histone = dict(zip((column[0] for column in cursor.description), old_histone_data))

data_histone = {
    "id": NEW_HISTONE_ID,
    "level": old_histone["level"],
    "taxonomic_span": old_histone["taxonomic_span"],
    "taxonomic_span_id": old_histone["taxonomic_span_id"],
    "description": H3_5_DESCRIPTION_ID,
    "parent": old_histone["parent"],
}
cursor.execute(add_histone, data_histone)

# Make sure data is committed to the database
conn.commit()

In [25]:
# Show every histone entry whose id mentions H3.5.
query = "SELECT * FROM histone"
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
# regex=False: match the literal text "H3.5"; as a regex the "." would match
# any character.
histone_df[histone_df["id"].str.contains("H3.5", regex=False)]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
170,H3.5_(Hominidae),variant_group,,,206.0,H3
171,H3.5_(Homo_sapiens),variant,Homo sapiens,9606.0,108.0,H3.5_(Primates_or_Hominids?)
172,H3.5_(Primates_or_Hominids?),variant_group,,,,H3


In [28]:
# Re-point every histone whose parent is the old group id to the new group id.
# The values are passed as bound parameters rather than embedded in the SQL
# text (the original f-string had no placeholders to format).
query = "UPDATE histone SET parent=%s WHERE parent=%s"
cursor.execute(query, ("H3.5_(Hominidae)", "H3.5_(Primates_or_Hominids?)"))
# Make sure data is committed to the database
conn.commit()

In [29]:
# Show the H3.5 histone entries again to check the parent update.
query = "SELECT * FROM histone"
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
# regex=False: match the literal text "H3.5"; as a regex the "." would match
# any character.
histone_df[histone_df["id"].str.contains("H3.5", regex=False)]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
170,H3.5_(Hominidae),variant_group,,,206.0,H3
171,H3.5_(Homo_sapiens),variant,Homo sapiens,9606.0,108.0,H3.5_(Hominidae)
172,H3.5_(Primates_or_Hominids?),variant_group,,,,H3


In [27]:
# Find the sequences that are still assigned to the old variant id.
query = "SELECT * FROM sequence"
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
histone_df = pd.DataFrame(fetched_rows, columns=column_names)

uses_old_variant = histone_df["variant"] == "H3.5_(Primates_or_Hominids?)"
histone_df[uses_old_variant]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
468,XP_003954426.1,H3.5_(Primates_or_Hominids?),410046862,,,9598.0,Pan troglodytes,Chordata,Mammalia,,,MARTKQTARKSTGGKAPRKQLATKAARKSTPSTXGVKKPHRYRPGT...,


In [30]:
# Move every sequence assigned to the old variant id over to the new one.
# The values are passed as bound parameters rather than embedded in the SQL
# text (the original f-string had no placeholders to format).
query = "UPDATE sequence SET variant=%s WHERE variant=%s"
cursor.execute(query, ("H3.5_(Hominidae)", "H3.5_(Primates_or_Hominids?)"))
# Make sure data is committed to the database
conn.commit()

In [32]:
# Check which sequences are now assigned to the new variant id.
query = "SELECT * FROM sequence"
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
histone_df = pd.DataFrame(fetched_rows, columns=column_names)

uses_new_variant = histone_df["variant"] == "H3.5_(Hominidae)"
histone_df[uses_new_variant]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
468,XP_003954426.1,H3.5_(Hominidae),410046862,,,9598.0,Pan troglodytes,Chordata,Mammalia,,,MARTKQTARKSTGGKAPRKQLATKAARKSTPSTXGVKKPHRYRPGT...,


In [34]:
# Check whether any alternative name still refers to the old histone id.
query = "SELECT * FROM alternative_name"
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
histone_df = pd.DataFrame(fetched_rows, columns=column_names)

refers_to_old_id = histone_df["histone"] == "H3.5_(Primates_or_Hominids?)"
histone_df[refers_to_old_id]

Unnamed: 0,id,name,taxonomy,gene,splice,histone


In [35]:
# Check whether any publication link still refers to the old histone id.
query = "SELECT * FROM histone_has_publication"
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
histone_df = pd.DataFrame(fetched_rows, columns=column_names)

refers_to_old_id = histone_df["histone_id"] == "H3.5_(Primates_or_Hominids?)"
histone_df[refers_to_old_id]

Unnamed: 0,histone_id,publication_id


In [36]:
# Remove the old histone entry now that the preceding checks of
# alternative_name and histone_has_publication returned no rows for it.
query = (
    "DELETE FROM histone WHERE id='H3.5_(Primates_or_Hominids?)'"
)
cursor.execute(query)

# Persist the deletion.
conn.commit()

In [37]:
# Final check: list the remaining histone entries whose id mentions H3.5.
query = "SELECT * FROM histone"
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
# regex=False: match the literal text "H3.5"; as a regex the "." would match
# any character.
histone_df[histone_df["id"].str.contains("H3.5", regex=False)]

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
170,H3.5_(Hominidae),variant_group,,,206,H3
171,H3.5_(Homo_sapiens),variant,Homo sapiens,9606.0,108,H3.5_(Hominidae)


# Close connections

In [14]:
# Release resources in the order cursor -> connection -> tunnel. The nested
# try/finally makes sure the SSH tunnel is stopped even if closing the cursor
# or the connection raises.
try:
    try:
        cursor.close()
    finally:
        conn.close()
finally:
    tunnel.stop()