In [None]:
import json
import sys

sys.path.append("..")

import numpy as np
import pandas as pd


def table_to_df(df):
    """Transform the JSON JSTree in the text report table to pandas dataframe columns

    Columns of ``df`` are read by position, so ``df`` must keep the column
    order of the report table: position 8 holds the JSTree (an iterable of
    node dicts exposing a ``"text"`` label and a ``"data"`` dict), positions
    0-7 and 9-13 hold the report metadata copied to the output as-is.

    Args:
        df (Dataframe): Dataframe version of the text report table in database

    Returns:
        Dataframe: Table as dataframe with new columns corresponding to JSTree
            nodes. Each node column holds ``float(node["data"]["presence"])``,
            or -0.25 when the node has no ``"presence"`` entry
        List: List of the node labels found in the tree of the first row
    """
    # (output column name, position of the value in the source row).
    # Position 8 is skipped here: it is the JSTree, expanded separately below.
    metadata_cols = (
        ("id", 0),
        ("patient_id", 1),
        ("expert_id", 2),
        ("biopsie_id", 3),
        ("muscle_prelev", 4),
        ("age_biopsie", 5),
        ("date_envoie", 6),
        ("gene_diag", 7),
        ("comment", 9),
        ("conclusion", 10),
        ("BOQA_prediction", 11),
        ("BOQA_prediction_score", 12),
        ("datetime", 13),
    )
    tree_as_dict = {}
    features_col = []
    # enumerate() gives the row position; the index label yielded by iterrows()
    # is not guaranteed to start at 0 (filtered or re-indexed frames).
    for position, (_, row) in enumerate(df.iterrows()):
        # .iloc makes the positional access explicit: plain row[i] on a
        # label-indexed Series is deprecated in pandas.
        for name, col_idx in metadata_cols:
            tree_as_dict.setdefault(name, []).append(row.iloc[col_idx])

        my_tree = row.iloc[8]
        for feature in my_tree:
            tree_as_dict.setdefault(feature["text"], []).append(
                float(feature["data"].get("presence", -0.25))
            )
            # Only the first row's tree defines the list of feature columns.
            if position == 0:
                features_col.append(feature["text"])
    df_return = pd.DataFrame.from_dict(tree_as_dict)
    return df_return, features_col


def db_to_df():
    """Load the text report table from the database into a pandas dataframe

    Uses the ``db`` session and the ``ReportHisto`` model, which must already
    be available in the enclosing scope.

    Returns:
        Dataframe: Result of the ``ReportHisto`` query as read by ``pd.read_sql``
    """
    report_statement = db.session.query(ReportHisto).statement
    connection = db.session.bind
    return pd.read_sql(report_statement, connection)


def process_df(df, features_col=None):
    """Process the dataframe to replace diagnosis names and values with
    corresponding values from dictionaries hardcoded in the function.

    Diagnosis labels are merged into broader classes (whole-cell matches, in
    every column), then presence scores are recoded: -0.25 becomes NaN and
    0.25 / 0.5 / 0.75 become 1.

    Args:
        df (Dataframe): Dataframe representation of the SQLite table for text reports
        features_col (list, optional): Names of the columns holding presence
            scores. When given, the numeric recoding is limited to these
            columns, so other numeric columns that happen to contain
            -0.25 / 0.25 / 0.5 / 0.75 are left untouched. Defaults to None,
            which recodes every column (historical behaviour).

    Returns:
        Dataframe: Processed DataFrame with modified values; the input
            dataframe is not modified

    Raises:
        KeyError: If ``features_col`` names a column that is not in ``df``
    """
    diagnosis_map = {
        "COM_CCD": "COM",
        "COM_MMM": "COM",
        "NM_CAP": "NM",
        "CFTD": "OTHER",
        "NON_CM": "OTHER",
        "CM": "UNCLEAR",
    }
    presence_map = {-0.25: np.nan, 0.25: 1, 0.5: 1, 0.75: 1}

    # replace() returns a new object, so the caller's dataframe stays intact
    # even when columns are reassigned below.
    df = df.replace(diagnosis_map)
    if features_col is None:
        return df.replace(presence_map)

    scoped_cols = list(features_col)
    if scoped_cols:
        df[scoped_cols] = df[scoped_cols].replace(presence_map)
    return df


# Load the reports, expand the JSTree into one column per term, then recode
# the diagnosis labels and presence values.
df, features_col = table_to_df(db_to_df())
df = process_df(df)

In [None]:
# Annotate every ontology term with the genes and conclusions found in the
# reports where that term is marked present (value 1), then save the ontology
# back to the same file.
ontology_path = "../data/ontology/ontology.json"
# Explicit UTF-8: do not depend on the platform default encoding for JSON.
with open(ontology_path, "r", encoding="utf-8") as fp:
    onto = json.load(fp)
for term in onto:
    df_temp = df[df[term["text"]] == 1]
    # value_counts() drops missing values and lists the most frequent first.
    gene_datamined_temp = list(df_temp["gene_diag"].value_counts().index)
    phenotype_datamined_temp = list(df_temp["conclusion"].value_counts().index)
    # Joining an empty list already yields "", so no special case is needed.
    term["data"]["gene_datamined"] = ",".join(gene_datamined_temp)
    term["data"]["phenotype_datamined"] = ",".join(phenotype_datamined_temp)
# Serialise before opening the file for writing: if serialisation fails, the
# existing ontology file is not truncated.
serialized_onto = json.dumps(onto, indent=4)
with open(ontology_path, "w", encoding="utf-8") as fp:
    fp.write(serialized_onto)