# Recover curatedDB from files classification.json and histones.csv

In [1]:
# Move two directory levels up; the relative paths used by later cells are resolved from there.
%cd '../..'

/mnt/scratch/l_singh/hdb/project_dir/histonedb/CURATED_SET


In [10]:
import pandas as pd
import numpy as np
from mysql.connector import connection
from sshtunnel import SSHTunnelForwarder
from ete3 import Tree
from curated_set_services import dict2tree
import json

In [3]:
# Read the SSH / database connection settings from a plain KEY=VALUE text file.
# Blank lines and lines starting with "#" are ignored.
with open("curated_service/curatedDB/db_curated_server_info.txt", "r") as file:
    lines = file.readlines()

config = {}

for line_no, line in enumerate(lines, start=1):
    line = line.strip()
    if not line or line.startswith("#"):
        continue
    key, sep, value = line.partition("=")
    if not sep:
        # Fail with a locatable message. The line itself is not echoed because
        # this file holds credentials.
        raise ValueError(
            f"db_curated_server_info.txt, line {line_no}: expected KEY=VALUE"
        )
    # Strip the key too, so that "server_name = host" is stored under
    # "server_name" and not under "server_name " (which .get() would miss).
    config[key.strip()] = value.strip()

# The ports are converted with int(); report a missing key by name instead of
# letting int(None) raise an anonymous TypeError.
for required in ("srever_port", "db_port"):
    if required not in config:
        raise KeyError(f"'{required}' is missing from db_curated_server_info.txt")

# NOTE: the misspelled keys ("srever_port", "db_adress") are the names this
# notebook looks up; they have to match the spelling used in the config file.
server_name = config.get("server_name")
srever_port = int(config.get("srever_port"))
ssh_password = config.get("ssh_password")
ssh_username = config.get("ssh_username")
db_adress = config.get("db_adress")
db_port = int(config.get("db_port"))

In [4]:
# Open an SSH tunnel to (server_name, srever_port) that forwards a local port
# to the database address/port read from the config file.
tunnel = SSHTunnelForwarder(
    (server_name, srever_port),
    ssh_password=ssh_password,
    ssh_username=ssh_username,
    remote_bind_address=(db_adress, db_port),
)
tunnel.start()
# Local end of the tunnel; the MySQL connection below connects to this port.
print(tunnel.local_bind_port)

43171


In [5]:
# Connect to MySQL through the local end of the SSH tunnel.
# NOTE(review): user, password and database are string literals here. If these
# are real values rather than placeholders, read them from the config file
# like the SSH settings - TODO confirm.
conn = connection.MySQLConnection(
    user="db_user",
    password="db_password",
    host="localhost",
    port=tunnel.local_bind_port,
    database="db_name",
)
cursor = conn.cursor()

In [6]:
query = "SHOW TABLES;"
cursor.execute(query)
cursor.fetchall()

[('alternative_name',),
 ('histone',),
 ('histone_description',),
 ('histone_has_publication',),
 ('publication',),
 ('sequence',),
 ('sequence_has_publication',)]

In [7]:
# Fetch the list of all tables in the database
cursor.execute("SHOW TABLES;")
tables = cursor.fetchall()

# Temporarily disable foreign-key checks so the tables can be emptied in any order
cursor.execute("SET FOREIGN_KEY_CHECKS = 0;")
try:
    # Empty every table. An identifier cannot be passed as a query parameter,
    # so it is backtick-quoted (embedded backticks doubled) instead of being
    # pasted in raw.
    for (table,) in tables:
        quoted_table = "`" + table.replace("`", "``") + "`"
        cursor.execute(f"TRUNCATE TABLE {quoted_table};")
finally:
    # Re-enable foreign-key checks even if one of the TRUNCATEs failed;
    # otherwise the session would keep running with the checks switched off.
    cursor.execute("SET FOREIGN_KEY_CHECKS = 1;")

# Commit. NOTE(review): per the MySQL manual TRUNCATE TABLE causes an implicit
# commit, so this is presumably a formality and the truncation cannot be
# rolled back - confirm against the server version in use.
conn.commit()

In [8]:
# Re-read the list of all tables in the database
cursor.execute("SHOW TABLES;")
tables = cursor.fetchall()


def is_table_empty(table_name):
    """Return True when ``SELECT COUNT(*)`` on ``table_name`` reports zero rows."""
    cursor.execute(f"SELECT COUNT(*) FROM {table_name};")
    return cursor.fetchone()[0] == 0


# Collect the names of all tables that still contain rows
non_empty_tables = [name for (name,) in tables if not is_table_empty(name)]
all_empty = not non_empty_tables

if non_empty_tables:
    print(f"Следующие таблицы не пустые: {', '.join(non_empty_tables)}")
else:
    print("Все таблицы в базе данных пустые.")

Все таблицы в базе данных пустые.


## Add info from classification.json

In [11]:
# Load the classification file; the notebook reads its top-level keys "tree" and "info".
with open('classification.json') as json_file:
    data = json.load(json_file)

# Build an ete3 Tree from data['tree'] with the project helper dict2tree
# (presumably fills hist_tree in place - its return value is ignored) and
# print it including internal node names.
hist_tree=Tree()
dict2tree(hist_tree,data['tree'])
print(hist_tree.get_ascii(show_internal=True))


   /-Archaeal
  |
  |   /-cH1
  |  |
  |  |--generic_H1
  |  |
  |  |-H1.0-H1.0_(Homo_sapiens)
  |  |
  |  |-H1.1-H1.1_(Homo_sapiens)
  |  |
  |  |-H1.10-H1.10_(Homo_sapiens)
  |  |
  |  |-H1.2-H1.2_(Homo_sapiens)
  |  |
  |  |-H1.3-H1.3_(Homo_sapiens)
  |-H1
  |  |-H1.4-H1.4_(Homo_sapiens)
  |  |
  |  |-H1.5-H1.5_(Homo_sapiens)
  |  |
  |  |-OO_H1.8-H1.8_(Homo_sapiens)
  |  |
  |  |--scH1
  |  |
  |  |-TS_H1.6-H1.6_(Homo_sapiens)
  |  |
  |  |-TS_H1.7-H1.7_(Homo_sapiens)
  |  |
  |   \-TS_H1.9
  |
  |                                                                        /-cH2A.10_(Homo_sapiens)
  |                                                                       |
  |                                                                       |--cH2A.11_(Homo_sapiens)
  |                                                                       |
  |                                                                       |--cH2A.1_(Homo_sapiens)
  |                                         

In [12]:
# Populate histone_description, histone, publication, histone_has_publication
# and alternative_name from the classification tree (hist_tree) and data["info"].
sql_query = []  # NOTE(review): never referenced in this cell
desc_id = 0  # running primary key for histone_description rows
pubs_unique = []  # publication ids already inserted (the sequence-import cell keeps appending to it)
alt_name_id = 0  # running counter used as primary key for alternative_name rows
added_histones = []  # names of every node inserted into `histone`

# Level-order (breadth-first) traversal: a parent is visited, and therefore
# inserted, before any of its children.
for node in hist_tree.traverse("levelorder"):
    if node.is_root():
        continue
    
    added_histones.append(node.name)
    
    # insert desc
    desc = None if node.name not in data["info"].keys() else data["info"][node.name].get("description", None)
    
    # A plain-string description is either the literal "null" (no description)
    # or a bare summary text; wrap the latter so it can be read like the dict form.
    if isinstance(desc, str):
        desc = None if desc == "null" else {'summary': desc}
    
    if desc is not None:
        desc_id += 1
        # Parameterized insert into histone_description
        cursor.execute('''INSERT INTO histone_description 
            (id, summary, taxonomy, genes, evolution, expression, knock_out, function, sequence, localization, deposition, structure, interactions, disease, caveats)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);''', (
                desc_id,
                desc.get("summary", None),
                desc.get("taxonomy", None),
                desc.get("genes", None),
                desc.get("evolution", None),
                desc.get("expression", None),
                desc.get("knock-out", None),  # JSON key "knock-out" goes to column knock_out
                desc.get("function", None),
                desc.get("sequence", None),
                desc.get("localization", None),
                desc.get("deposition", None),
                desc.get("structure", None),
                desc.get("interactions", None),
                desc.get("disease", None),
                desc.get("caveats", None)
            ))

    # insert histone
    cursor.execute('''INSERT INTO histone 
        (id, level, taxonomic_span, taxonomic_span_id, description, parent) 
        VALUES (%s, %s, %s, %s, %s, %s);''', (
            node.name,
            # depth 1 -> "type", depth 2 -> "variant_group", anything deeper -> "variant"
            "type" if node.up.is_root() else "variant_group" if node.up.up.is_root() else "variant",
            data["info"].get(node.name, {}).get("taxonomic_span", None),
            data["info"].get(node.name, {}).get("taxonomic_span_id", None),
            desc_id if desc is not None else None,
            None if node.up.is_root() else node.up.name  # top-level nodes get a NULL parent
        ))

    # insert publication
    pubs = data["info"].get(node.name, {}).get("publications", None)
    
    if pubs is not None:
        if not isinstance(pubs, list):
            print(f"PUBS IS NOT A LIST:\n{pubs}")
        else:
            for p in pubs:
                if p not in pubs_unique:
                    pubs_unique.append(p)
                    # Insert the publication (only its id is known; the other columns stay NULL)
                    cursor.execute('''INSERT INTO publication (id, title, doi, author, year) 
                        VALUES (%s, %s, %s, %s, %s);''', (
                            p, None, None, None, None
                        ))

                # insert relation histone_has_publication
                cursor.execute('''INSERT INTO histone_has_publication (histone_id, publication_id) 
                    VALUES (%s, %s);''', (
                        node.name, p
                    ))

    # insert alternative name
    alts = data["info"].get(node.name, {}).get("alternate_names", None)
    
    if alts is not None:
        if not isinstance(alts, list):
            print(f"ALTS IS NOT A LIST:\n{alts}")
        else:
            for an in alts:
                # The counter advances for every entry, including entries skipped
                # below for lacking a "name" key, so the stored ids may have gaps.
                alt_name_id += 1
                if "name" not in an.keys():
                    print(f"ALT NAME DOES NOT EXIST:\n{an}")
                else:
                    # Insert the alternative name
                    cursor.execute('''INSERT INTO alternative_name 
                        (id, name, taxonomy, gene, splice, histone) 
                        VALUES (%s, %s, %s, %s, %s, %s);''', (
                            alt_name_id,
                            an["name"],
                            an.get("taxonomy", None),
                            an.get("gene", None),
                            an.get("splice", None),
                            node.name
                        ))

# Commit the changes to the database
conn.commit()

In [13]:
query = "SELECT * FROM histone"
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
histone_df

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent
0,Archaeal,type,,,1,
1,cenH3_(Animals),variant,Homo sapiens,9606,94,cenH3_(Eukarya)
2,cenH3_(Eukarya),variant_group,,,46,H3
3,cenH3_(Fungi),variant,Homo sapiens,9606,95,cenH3_(Eukarya)
4,cenH3_(Homo_sapiens),variant,Homo sapiens,9606,158,cenH3_(Mammalia)
...,...,...,...,...,...,...
199,TS H3.10,variant,,,139,H3.3-like_(Plants)
200,TS_H1.6,variant_group,Mammalia,40674,19,H1
201,TS_H1.7,variant_group,Mammalia,40674,20,H1
202,TS_H1.9,variant_group,Mammalia,40674,21,H1


## Add sequences from histones.csv

In [15]:
df_seq = pd.read_csv('histones.csv')
df_seq.index = df_seq.accession
df_seq

Unnamed: 0_level_0,accession,type,variant_group,variant,doublet,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,references,sequence
accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
AAA30025.1,AAA30025.1,H2B,cH2B,cH2B_(Animals),,161386,,,7660.0,Psammechinus miliaris,Echinodermata,Echinoidea,,,,MAPTAQVAKKGSKKAVKAPRPSGGKKRNRKRKESYGIYIYKVLKQV...
AAB48832.1,AAB48832.1,H2B,CS_H2B_(Echinoidea),CS_H2B_(Echinoidea),,,,,7660.0,Psammechinus miliaris,Echinodermata,Echinoidea,,,9032246,MPAKGAATKGEKKQAVKSKAMASSRTGDKKRRRRRLESYNIYIYKV...
AAC28846.1,AAC28846.1,H2A,macroH2A,macroH2A,,3395780,,,9031.0,Gallus gallus,Chordata,Aves,,,22650316 26989147,MSSRGGKKKSTKTSRSAKAGVIFPVGRMLRYIKKGHPKYRIGVGAP...
AAC37291.1,AAC37291.1,H2A,H2A.X,H2A.X_(Protists),,310870,,,5911.0,Tetrahymena thermophila,Ciliophora,Oligohymenophorea,,,22650316 26989147,MSTTGKGGKAKGKTASSKQVSRSARAGLQFPVGRISRFLKHGRYSE...
AAC37292.1,AAC37292.1,H2A,cH2A,cH2A_(Protists),,310872,,,5911.0,Tetrahymena thermophila,Ciliophora,Oligohymenophorea,,,22650316 26989147,MSTTGKGGKAKGKTASSKQVSRSARAGLQFPVGRISRFLKNGRYSE...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
XP_846259.1,XP_846259.1,H2A,H2A.Z,H2A.Z,,72391930,,,185431.0,Trypanosoma brucei brucei TREU927,Euglenozoa,Kinetoplastea,,,22650316 26989147,MSLTGDDAVPQAPLVGGVAMSPEQASALTGGKLGGKAVGPAHGKGK...
XP_875023.2,XP_875023.2,H2A,short_H2A,H2A.L,,119923405,,,9913.0,Bos taurus,Chordata,Mammalia,,,22650316 26989147,MSPRRHLWNCRRSRRHSLSRSTRAELQFPVSRVDRLLREGQGAYRL...
XP_951561.1,XP_951561.1,H4,cH4,cH4,,84043542,,,185431.0,Trypanosoma brucei brucei TREU927,Euglenozoa,Kinetoplastea,,,,MAKGKRVGESKGAQKRQKKVLRDNVRGITRGSIRRLARRAGVKRIS...
XP_956003.1,XP_956003.1,H3,H3.3,H3.3,,85077297,,,367110.0,Neurospora crassa OR74A,Ascomycota,Sordariomycetes,,,,MARTKQTARKSTGGKAPRKQLASKAARKSAPSTGGVKKPHRYKPGT...


In [16]:
# Populate sequence, publication and sequence_has_publication from histones.csv (df_seq).
sql_query_seq = []  # NOTE(review): never referenced in this cell

for i, row in df_seq.iterrows():
    # insert sequence
    # The CSV "variant" is stored in column `variant` when it names a histone
    # inserted from the classification tree (added_histones); otherwise it is
    # stored in `variant_under_consideration` and `variant` stays NULL.
    # Missing (NaN) CSV cells are written as NULL.
    cursor.execute('''INSERT INTO sequence 
        (accession, variant, gi, ncbi_gene_id, hgnc_gene_name, taxonomy_id, organism, phylum, class, taxonomy_group, info, sequence, variant_under_consideration)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);''', (
            row["accession"],
            row["variant"] if row["variant"] in added_histones else None,
            row["gi"] if not pd.isna(row["gi"]) else None,
            int(row["ncbi_gene_id"]) if not pd.isna(row["ncbi_gene_id"]) else None,
            row["hgnc_gene_name"] if not pd.isna(row["hgnc_gene_name"]) else None,
            int(row["taxonomy_id"]) if not pd.isna(row["taxonomy_id"]) else None,
            row["organism"] if not pd.isna(row["organism"]) else None,
            row["phylum"] if not pd.isna(row["phylum"]) else None,
            row["class"] if not pd.isna(row["class"]) else None,
            row["taxonomy_group"] if not pd.isna(row["taxonomy_group"]) else None,
            row["info"] if not pd.isna(row["info"]) else None,
            row["sequence"],
            row["variant"] if row["variant"] not in added_histones else None
        ))

    # insert publication if not exist (by default all fields will be null except PK)
    pubs_seq = row["references"]
    
    # "references" is a whitespace-separated list of publication ids; an empty
    # cell or a lone "?" means there is nothing to link.
    if not pd.isna(pubs_seq) and pubs_seq != '' and pubs_seq != '?':
        for p in pubs_seq.split():
            p = p.replace('(?)', '')  # drop the "(?)" marker from the id
            if p not in pubs_unique:
                pubs_unique.append(p)
                # Insert the publication
                cursor.execute('''INSERT INTO publication 
                    (id, title, doi, author, year) 
                    VALUES (%s, %s, %s, %s, %s);''', (
                        p, None, None, None, None
                    ))

            # insert relation sequence_has_publication
            cursor.execute('''INSERT INTO sequence_has_publication 
                (sequence_accession, publication_id) 
                VALUES (%s, %s);''', (
                    row["accession"], p
                ))

# Commit the changes to the database
conn.commit()

In [17]:
query = "SELECT * FROM sequence"
cursor.execute(query)
sequence_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
sequence_df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
0,AAA30025.1,cH2B_(Animals),161386,,,7660.0,Psammechinus miliaris,Echinodermata,Echinoidea,,,MAPTAQVAKKGSKKAVKAPRPSGGKKRNRKRKESYGIYIYKVLKQV...,
1,AAB48832.1,CS_H2B_(Echinoidea),,,,7660.0,Psammechinus miliaris,Echinodermata,Echinoidea,,,MPAKGAATKGEKKQAVKSKAMASSRTGDKKRRRRRLESYNIYIYKV...,
2,AAC28846.1,macroH2A,3395780,,,9031.0,Gallus gallus,Chordata,Aves,,,MSSRGGKKKSTKTSRSAKAGVIFPVGRMLRYIKKGHPKYRIGVGAP...,
3,AAC37291.1,H2A.X_(Protists),310870,,,5911.0,Tetrahymena thermophila,Ciliophora,Oligohymenophorea,,,MSTTGKGGKAKGKTASSKQVSRSARAGLQFPVGRISRFLKHGRYSE...,
4,AAC37292.1,cH2A_(Protists),310872,,,5911.0,Tetrahymena thermophila,Ciliophora,Oligohymenophorea,,,MSTTGKGGKAKGKTASSKQVSRSARAGLQFPVGRISRFLKNGRYSE...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,XP_846259.1,H2A.Z,72391930,,,185431.0,Trypanosoma brucei brucei TREU927,Euglenozoa,Kinetoplastea,,,MSLTGDDAVPQAPLVGGVAMSPEQASALTGGKLGGKAVGPAHGKGK...,
565,XP_875023.2,H2A.L,119923405,,,9913.0,Bos taurus,Chordata,Mammalia,,,MSPRRHLWNCRRSRRHSLSRSTRAELQFPVSRVDRLLREGQGAYRL...,
566,XP_951561.1,cH4,84043542,,,185431.0,Trypanosoma brucei brucei TREU927,Euglenozoa,Kinetoplastea,,,MAKGKRVGESKGAQKRQKKVLRDNVRGITRGSIRRLARRAGVKRIS...,
567,XP_956003.1,H3.3,85077297,,,367110.0,Neurospora crassa OR74A,Ascomycota,Sordariomycetes,,,MARTKQTARKSTGGKAPRKQLASKAARKSAPSTGGVKKPHRYKPGT...,


# Saving to files and checking

## Saving to JSON

In [18]:
def get_tree_dict(cursor, paretnt_id):
    """Recursively build the nested dict of descendants of one histone.

    Args:
        cursor: DB-API cursor on the curated database (``%s`` parameter style).
        paretnt_id: id of the histone whose children are looked up.

    Returns:
        ``{child_id: get_tree_dict(cursor, child_id), ...}`` for a histone that
        has children, or the string ``"null"`` for a leaf.
    """
    # Bind the id as a query parameter instead of pasting it between quotes:
    # an id containing a quote character would otherwise break the statement.
    cursor.execute("SELECT id FROM histone WHERE parent = %s", (paretnt_id,))
    # All rows are fetched before recursing because the recursive calls reuse
    # the same cursor.
    res = cursor.fetchall()
    if len(res) < 1:
        return "null"
    return {v: get_tree_dict(cursor, v) for v, *_ in res}

In [19]:
def dict_clean(items, default="null"):
    """Return a copy of ``items`` with ``None`` values handled.

    With ``default=None`` the keys whose value is ``None`` are dropped;
    with any other ``default`` those values are replaced by ``default``.
    All other entries are copied unchanged, in their original order.
    """
    if default is None:
        return {name: val for name, val in items.items() if val is not None}
    return {name: (default if val is None else val) for name, val in items.items()}

In [20]:
def replace_key_in_dict(data, old_key, new_key):
    """Return a copy of ``data`` in which every dict key equal to ``old_key``
    is renamed to ``new_key``.

    Dicts and lists are rebuilt recursively at any nesting depth; every other
    value is returned as is.
    """
    if isinstance(data, list):
        # Walk into each list element
        return [replace_key_in_dict(element, old_key, new_key) for element in data]
    if not isinstance(data, dict):
        return data

    renamed = {}
    for name, child in data.items():
        # Swap the key when it matches, keep it otherwise
        target = new_key if name == old_key else name
        renamed[target] = replace_key_in_dict(child, old_key, new_key)
    return renamed

In [21]:
%%time
query = "SELECT id FROM histone WHERE parent IS NULL"
cursor.execute(query)
types = cursor.fetchall()
variants_tree = {}
for t, *_ in types:
    variants_tree[t] = get_tree_dict(cursor, t)

variants_info = {}
query = (
    "SELECT hdap.*, p.title, p.doi, p.author, p.year FROM ( "
    "SELECT hda.*, publication_id FROM ( "
    "SELECT hd.*, an.name, an.taxonomy as an_taxonomy, an.gene, an.splice FROM ( "
    "SELECT h.id, "
    "h.level,"
    "h.taxonomic_span, "
    "h.taxonomic_span_id, "
    "h.parent, "
    "d.summary, "
    "d.taxonomy, "
    "d.genes, "
    "d.evolution, "
    "d.expression, "
    "d.knock_out, "
    "d.function, "
    "d.sequence, "
    "d.localization, "
    "d.deposition,"
    "d.structure,"
    "d.interactions, "
    "d.disease, "
    "d.caveats FROM histone h LEFT JOIN histone_description d ON h.description = d.id "
    ") hd LEFT JOIN alternative_name an ON hd.id = an.histone "
    ") hda LEFT JOIN histone_has_publication hp ON hda.id = hp.histone_id "
    ") hdap LEFT JOIN publication p ON hdap.publication_id = p.id;"
)
cursor.execute(query)
variants_info_df = pd.DataFrame(
    cursor.fetchall(), columns=[i[0] for i in cursor.description]
)
for var_name in variants_info_df.id.unique():
    # print(var_name)
    level_value = variants_info_df[variants_info_df["id"] == var_name].iloc[0]["level"]
    taxonomic_span_value = variants_info_df[variants_info_df["id"] == var_name].iloc[0][
        "taxonomic_span"
    ]
    taxonomic_span_id_value = variants_info_df[variants_info_df["id"] == var_name].iloc[
        0
    ]["taxonomic_span_id"]
    variants_info[var_name] = {
        "level": level_value if level_value else "null",
        "description": {},
        "taxonomic_span": taxonomic_span_value if taxonomic_span_value else "null",
        "taxonomic_span_id": taxonomic_span_id_value
        if taxonomic_span_id_value
        else "null",
        "alternate_names": [],
        "publications": [],
    }
    var_publications = (
        variants_info_df.groupby(["id"])["publication_id"]
        .apply(lambda x: [] if x.dropna().empty else list(x.unique()))
        .reset_index()
    )
    variants_info[var_name]["publications"] = var_publications[
        var_publications["id"] == var_name
    ].iloc[0]["publication_id"]
    var_names = (
        variants_info_df.groupby(["id"])[["name", "an_taxonomy", "gene", "splice"]]
        .apply(
            lambda x: []
            if x.drop_duplicates().dropna(how="all").empty
            else list(
                map(
                    lambda y: dict_clean(y, default=None),
                    x.drop_duplicates()
                    .rename(columns={"an_taxonomy": "taxonomy"})
                    .to_dict("records"),
                )
            )
        )
        .reset_index()
    )
    variants_info[var_name]["alternate_names"] = var_names[
        var_names["id"] == var_name
    ].iloc[0][0]
    default_description = {
        "summary": "null",
        "taxonomy": "null",
        "genes": "null",
        "evolution": "null",
        "expression": "null",
        "knock-out": "null",
        "function": "null",
        "sequence": "null",
        "localization": "null",
        "deposition": "null",
        "structure": "null",
        "interactions": "null",
        "disease": "null",
        "caveats": "null",
    }
    var_desc = (
        variants_info_df.groupby(["id"])[
            [
                "summary",
                "taxonomy",
                "genes",
                "evolution",
                "expression",
                "knock_out",
                "function",
                "sequence",
                "localization",
                "deposition",
                "structure",
                "interactions",
                "disease",
                "caveats",
            ]
        ]
        .apply(
            lambda x: default_description
            if x.drop_duplicates().dropna(how="all").empty
            else dict_clean(x.drop_duplicates().to_dict("records")[0])
        )
        .reset_index()
    )
    variants_info[var_name]["description"] = var_desc[var_desc["id"] == var_name].iloc[
        0
    ][0]
    # Заменяем ключи во всех словарях
    for key, value in variants_info.items():
        variants_info[key] = replace_key_in_dict(value, "knock_out", "knock-out")

with open("curated_service/curatedDB/classification_241022.json", "w") as outfile:
    json.dump({"tree": variants_tree, "info": variants_info}, outfile, indent=4)

CPU times: user 7min 40s, sys: 8.98 s, total: 7min 49s
Wall time: 7min 27s


## Сравним полученный JSON с classification.json

In [22]:
# data1: the original classification.json.
# NOTE(review): this is an absolute path, whereas the import section opened
# 'classification.json' relative to the working directory - presumably the
# same file; confirm, and prefer the relative form so the notebook is portable.
with open(
    "/home/l_singh/_scratch/hdb/project_dir/histonedb/CURATED_SET/classification.json",
    "r",
    encoding="utf-8",
) as file:
    data1 = json.load(file)

# data2: the JSON regenerated from the database by this notebook.
with open("curated_service/curatedDB/classification_241022.json", "r", encoding="utf-8") as file:
    data2 = json.load(file)

In [23]:
# Функция для приведения всех значений к строкам (рекурсивно)
def convert_values_to_str(data):
    """Recursively turn every int, float and bool inside ``data`` into ``str``.

    Dicts and lists are rebuilt with converted contents (dict keys are left
    untouched); strings, ``None`` and any other value are returned unchanged.
    """
    if isinstance(data, (int, float, bool)):
        # Numbers and booleans become their str() form
        return str(data)
    if isinstance(data, list):
        return [convert_values_to_str(entry) for entry in data]
    if isinstance(data, dict):
        return {name: convert_values_to_str(entry) for name, entry in data.items()}
    # Strings (and everything else) pass through as they are
    return data

In [24]:
# Приводим все значения к строкам для корректного сравнения
data1 = convert_values_to_str(data1)
data2 = convert_values_to_str(data2)

In [25]:
from deepdiff import DeepDiff

# Используем DeepDiff для сравнения
differences = DeepDiff(data1, data2, ignore_order=True)

In [26]:
differences.keys()

dict_keys([])

### Файлы идентичны!

In [None]:
# Print the differences found by DeepDiff, if there are any
if differences:
    print("Найдены различия:")
    print("values_changed:")
    # .get(): the "values_changed" entry is absent when the two files differ
    # only in other ways (e.g. added or removed items); indexing it directly
    # would raise KeyError in exactly the situation this cell has to report.
    for k, change in differences.get("values_changed", {}).items():
        print(f"{k}: {change}")
    # Name the remaining kinds of differences so that they are not hidden
    other_kinds = [kind for kind in differences.keys() if kind != "values_changed"]
    if other_kinds:
        print(f"Другие типы различий: {', '.join(other_kinds)}")
else:
    print("Файлы идентичны.")

## Saving to CSV

In [27]:
def find_parent_by_level(df, variant_name, target_level):
    """Find the histone of level ``target_level`` on the path from a variant to its root.

    Args:
        df: histone table with at least the columns "id", "level" and "parent".
        variant_name: id of the histone to start from.
        target_level: value of "level" to look for (e.g. "variant_group", "type").

    Returns:
        * ``""`` when ``variant_name`` is the empty string;
        * ``"HISTONE_NOT_EXIST"`` when ``variant_name`` is not an id in ``df``;
        * the id of the histone itself, or of its nearest ancestor, whose level
          equals ``target_level``;
        * ``None`` when the chain of parents ends (missing parent value, parent
          id absent from ``df``, or a cycle) before such a histone is found.
    """
    if variant_name == "":
        return ""

    # Row describing the starting histone
    current_row = df[df["id"] == variant_name]
    if current_row.empty:
        return "HISTONE_NOT_EXIST"

    if current_row["level"].values[0] == target_level:
        return current_row["id"].values[0]

    # Walk up the parent chain until the requested level is reached
    visited = {variant_name}
    while True:
        parent_id = current_row["parent"].values[0]

        if pd.isna(parent_id):
            return None  # reached a top-level histone: no parent left

        if parent_id in visited:
            return None  # cyclic parent chain: stop instead of looping forever
        visited.add(parent_id)

        parent_row = df[df["id"] == parent_id]
        if parent_row.empty:
            # The parent id does not exist in df; indexing .values[0] on the
            # empty selection would raise IndexError.
            return None

        if parent_row["level"].values[0] == target_level:
            return parent_row["id"].values[0]

        # Continue one level further up
        current_row = parent_row

In [28]:
# Rebuild histones.csv from the database.
# The LEFT JOIN yields one row per (sequence, publication) pair.
query = "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp ON s.accession = sp.sequence_accession"
cursor.execute(query)
sequence_df = pd.DataFrame(
    cursor.fetchall(), columns=[i[0] for i in cursor.description]
)

# Collapse to one row per accession: every column becomes "" when all of its
# values are missing, otherwise the unique values joined by a space. As a
# result every aggregated column holds strings.
sequence_df = (
    sequence_df.groupby(["accession"])
    .agg(lambda x: "" if x.dropna().empty else " ".join(list(map(str, x.unique()))))
    .reset_index()
)

# The join key from sequence_has_publication duplicates "accession"
sequence_df = sequence_df.drop(columns=["sequence_accession"])
# Back to a nullable integer column; values that do not parse become <NA>.
# NOTE(review): .fillna(np.nan) looks like a no-op here - confirm and drop.
sequence_df["taxonomy_id"] = (
    pd.to_numeric(sequence_df["taxonomy_id"], errors="coerce")
    .fillna(np.nan)
    .astype("Int64")
)

# Derive the "variant_group" and "type" columns by walking up the histone table
query = "SELECT * FROM histone"
cursor.execute(query)
histone_df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
sequence_df["variant_group"] = sequence_df["variant"].apply(
    lambda x: find_parent_by_level(histone_df, x, "variant_group")
)
sequence_df["type"] = sequence_df["variant"].apply(
    lambda x: find_parent_by_level(histone_df, x, "type")
)

# "doublet" is not stored in the database; it is written out as an empty column
sequence_df["doublet"] = [""] * sequence_df.shape[0]

# Select the columns in the order of histones.csv, rename publication_id to
# "references" and blank out cells that are exactly the string "None".
sequence_df = (
    sequence_df[
        [
            "accession",
            "type",
            "variant_group",
            "variant",
            # "variant_under_consideration",
            "doublet",
            "gi",
            "ncbi_gene_id",
            "hgnc_gene_name",
            "taxonomy_id",
            "organism",
            "phylum",
            "class",
            "taxonomy_group",
            "info",
            "publication_id",
            "sequence",
        ]
    ]
    .rename(columns={"publication_id": "references"})
    .replace("None", "")
)
sequence_df.to_csv('curated_service/curatedDB/histones_241022.csv', index=False)

sequence_df.head()

Unnamed: 0,accession,type,variant_group,variant,doublet,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,references,sequence
0,AAA30025.1,H2B,cH2B,cH2B_(Animals),,161386.0,,,7660,Psammechinus miliaris,Echinodermata,Echinoidea,,,,MAPTAQVAKKGSKKAVKAPRPSGGKKRNRKRKESYGIYIYKVLKQV...
1,AAB48832.1,H2B,CS_H2B_(Echinoidea),CS_H2B_(Echinoidea),,,,,7660,Psammechinus miliaris,Echinodermata,Echinoidea,,,9032246,MPAKGAATKGEKKQAVKSKAMASSRTGDKKRRRRRLESYNIYIYKV...
2,AAC28846.1,H2A,macroH2A,macroH2A,,3395780.0,,,9031,Gallus gallus,Chordata,Aves,,,22650316 26989147,MSSRGGKKKSTKTSRSAKAGVIFPVGRMLRYIKKGHPKYRIGVGAP...
3,AAC37291.1,H2A,H2A.X,H2A.X_(Protists),,310870.0,,,5911,Tetrahymena thermophila,Ciliophora,Oligohymenophorea,,,22650316 26989147,MSTTGKGGKAKGKTASSKQVSRSARAGLQFPVGRISRFLKHGRYSE...
4,AAC37292.1,H2A,cH2A,cH2A_(Protists),,310872.0,,,5911,Tetrahymena thermophila,Ciliophora,Oligohymenophorea,,,22650316 26989147,MSTTGKGGKAKGKTASSKQVSRSARAGLQFPVGRISRFLKNGRYSE...


In [29]:
sequence_df.shape

(569, 16)

In [30]:
sequence_df[sequence_df["variant_group"] == "HISTONE_NOT_EXIST"]

Unnamed: 0,accession,type,variant_group,variant,doublet,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,references,sequence


In [31]:
sequence_df.isna().sum(axis=0)

accession          0
type               0
variant_group      0
variant            0
doublet            0
gi                 0
ncbi_gene_id       0
hgnc_gene_name     0
taxonomy_id       20
organism           0
phylum             0
class              0
taxonomy_group     0
info               0
references         0
sequence           0
dtype: int64

In [32]:
sequence_df[sequence_df["phylum"] == "None"]

Unnamed: 0,accession,type,variant_group,variant,doublet,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,references,sequence


## Сравним полученную таблицу с histones.csv

In [33]:
# Reload the original histones.csv for the comparison, with missing cells as ""
# so that it matches the representation used in sequence_df.
# NOTE(review): absolute path, whereas the import section read 'histones.csv'
# relative to the working directory - presumably the same file; confirm.
df_histonescsv = pd.read_csv(
    "/home/l_singh/_scratch/hdb/project_dir/histonedb/CURATED_SET/histones.csv"
).fillna("")
# Same nullable-integer conversion as applied to sequence_df["taxonomy_id"];
# the "" cells do not parse and become <NA>.
df_histonescsv["taxonomy_id"] = (
    pd.to_numeric(df_histonescsv["taxonomy_id"], errors="coerce")
    .fillna(np.nan)
    .astype("Int64")
)

In [34]:
# Функция для преобразования строкового представления вещественных чисел в строковые целые
def convert_to_integer_string(value):
    """Render a float-like value as the string of its integer part.

    The empty string is returned unchanged; any other value goes through
    ``float`` -> ``int`` -> ``str`` (e.g. ``"12.0"`` becomes ``"12"``).
    """
    if value == "":
        # Empty cell: nothing to convert
        return value
    as_float = float(value)
    return str(int(as_float))


# Применяем функцию к колонке ncbi_gene_id
df_histonescsv["ncbi_gene_id"] = df_histonescsv["ncbi_gene_id"].apply(
    convert_to_integer_string
)

In [35]:
df_histonescsv[df_histonescsv["accession"] == "HISTDB_H2A_L_0"]

Unnamed: 0,accession,type,variant_group,variant,doublet,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,references,sequence
49,HISTDB_H2A_L_0,H2A,short_H2A,H2A.L.1_(Homo_sapiens),,,115482714,H2AL1Q,9606,Homo sapiens,Chordata,Mammalia,Mammalia,,29549088 35099534,MAGNKHSRSSCKPRRQCLSRSRRAELQFPVSHMERCLREGQYARHL...


In [36]:
df_histonescsv.columns

Index(['accession', 'type', 'variant_group', 'variant', 'doublet', 'gi',
       'ncbi_gene_id', 'hgnc_gene_name', 'taxonomy_id', 'organism', 'phylum',
       'class', 'taxonomy_group', 'info', 'references', 'sequence'],
      dtype='object')

In [37]:
sequence_df.columns

Index(['accession', 'type', 'variant_group', 'variant', 'doublet', 'gi',
       'ncbi_gene_id', 'hgnc_gene_name', 'taxonomy_id', 'organism', 'phylum',
       'class', 'taxonomy_group', 'info', 'references', 'sequence'],
      dtype='object')

In [38]:
# Сначала проверим, совпадает ли набор столбцов
set(df_histonescsv.columns) == set(sequence_df.columns)

True

In [39]:
# Функция для сортировки числовых элементов в колонке 'references', оставляя нечисловые элементы на месте
def sort_references_column(df, column_name="references"):
    """Normalise the order of the tokens in a whitespace-separated column.

    In every cell of ``df[column_name]`` the purely numeric tokens are moved to
    the front in ascending numeric order, followed by the remaining tokens in
    their original relative order; tokens are re-joined with single spaces.
    The column is overwritten in ``df`` itself, and ``df`` is returned.
    """

    def _numeric_first(cell):
        numbers, others = [], []
        for token in cell.split():
            bucket = numbers if token.isdigit() else others
            bucket.append(token)
        numbers.sort(key=int)
        return " ".join(numbers + others)

    # Rewrite each cell of the column
    df[column_name] = df[column_name].apply(_numeric_first)
    return df


# Применяем сортировку к колонке 'references' в обеих таблицах
df_histonescsv = sort_references_column(df_histonescsv)
sequence_df = sort_references_column(sequence_df)

In [40]:
# Сортируем строки по всем столбцам
df1_sorted = df_histonescsv.sort_values(by=list(df_histonescsv.columns)).reset_index(
    drop=True
)
df2_sorted = sequence_df.sort_values(by=list(sequence_df.columns)).reset_index(
    drop=True
)

# Сравниваем отсортированные DataFrame
df1_sorted.equals(df2_sorted)

True

### Таблицы идентичны!

In [None]:
df_histonescsv.dtypes

In [None]:
sequence_df.dtypes

In [66]:
def find_differences(df1, df2):
    """Split the rows of two frames into "only in df1" and "only in df2".

    The frames are outer-merged on their common columns with a ``_merge``
    indicator column; the two returned frames are the ``left_only`` and the
    ``right_only`` rows of that merge (both still carry the ``_merge`` column).
    """
    merged = df1.merge(df2, how="outer", indicator=True)
    source = merged["_merge"]
    return merged[source == "left_only"], merged[source == "right_only"]

In [67]:
# Compare the two tables prepared above. The previous arguments
# (df_histonescsv_ / sequence_df_, with a trailing underscore) are not defined
# by any cell of this notebook, so the call raised NameError on a fresh kernel.
only_in_df1, only_in_df2 = find_differences(df_histonescsv, sequence_df)

In [None]:
# Записи, которые есть только в первой таблице (df_histonescsv):
only_in_df1_ = only_in_df1.drop(columns=["_merge"])
only_in_df1_

In [None]:
# Записи, которые есть только во второй таблице (sequence_df):
only_in_df2_ = only_in_df2.drop(columns=["_merge"])
only_in_df2_

In [None]:
df_histonescsv.shape, sequence_df.shape

# Close connections

In [41]:
# Release the resources in reverse order of creation: cursor, connection, SSH tunnel.
cursor.close()
conn.close()
tunnel.stop()