In [None]:
import json
import pandas as pd
import sqlite3
import numpy as np
from math import pow
import random
# Open the SQLite application database and read the entire report history
# table into a DataFrame; later cells index rows and columns of `df` by position.
con = sqlite3.connect("data/app.db")
df = pd.read_sql_query(sql="SELECT * from report_histo", con=con)

In [None]:
def boqa(alpha, beta, query, items_stat):
    """Score every disease against ``query`` using one random hidden-layer draw.

    For each disease a hidden state (0 or 1) is drawn for every query term,
    the query and hidden states are cross-tabulated with ``matrix_m``, and a
    likelihood is computed from the four counts. The likelihoods are then
    normalised so that they sum to 1 over all diseases.

    Parameters
    ----------
    alpha : float
        Per-term factor for terms that are 1 in the query but 0 in the hidden
        layer; ``1 - alpha`` is the factor for terms that are 0 in both.
    beta : float
        Per-term factor for terms that are 0 in the query but 1 in the hidden
        layer; ``1 - beta`` is the factor for terms that are 1 in both.
    query : dict
        Maps each term to its observed state (0 or 1).
    items_stat : dict
        Maps each disease to a dict whose ``"feature"`` entry maps terms to a
        frequency that is divided by 100 to obtain the probability of the
        hidden state being 1.

    Returns
    -------
    dict
        Maps each disease to its normalised likelihood. If every likelihood is
        0 (so normalisation is impossible), every disease maps to ``0.0``
        instead of raising ``ZeroDivisionError``.
    """
    likelihood = {}
    total = 0
    # For each disease
    for disease in items_stat:
        features = items_stat[disease]["feature"]
        # Initialise the hidden layer from the disease statistics. A fresh
        # dict per disease keeps one disease's draw from leaking into the next.
        hidden = {}
        for term in query:
            if term in features:
                proba = features[term] / 100
                hidden[term] = np.random.choice([1, 0], p=[proba, 1 - proba])
            else:
                # No statistic for this term: the disease never shows it.
                hidden[term] = 0
        # Count the terms for each (query state, hidden state) combination.
        m = matrix_m(query, hidden)
        likelihood[disease] = (
            pow(beta, m[0, 1])
            * pow(1 - beta, m[1, 1])
            * pow(1 - alpha, m[0, 0])
            * pow(alpha, m[1, 0])
        )
        total += likelihood[disease]
    if total == 0:
        # Nothing to normalise by (no disease, or all likelihoods are zero).
        return {disease: 0.0 for disease in items_stat}
    return {disease: likelihood[disease] / total for disease in items_stat}

def matrix_m(Q, H):
    """Cross-tabulate two term->state dicts into a 2x2 count matrix.

    Entry ``[x, y]`` of the result is the number of terms whose value in ``Q``
    equals ``x`` and whose value in ``H`` equals ``y``, for ``x, y`` in {0, 1}.
    Terms holding any other value are not counted.

    Raises
    ------
    Exception
        If ``Q`` and ``H`` do not have the same set of keys.
    """
    if Q.keys() != H.keys():
        raise Exception("Error Ontology not matching stats")
    # Pair up both states once, then tally each of the four combinations.
    state_pairs = [(Q[term], H[term]) for term in Q]
    tallies = [
        [sum(1 for q_state, h_state in state_pairs if q_state == x and h_state == y)
         for y in range(2)]
        for x in range(2)
    ]
    return np.array(tallies, dtype=float)

def subsample_query(query, max_terms=6):
    """Keep at most ``max_terms`` randomly chosen positive terms in ``query``.

    Terms whose value is 1 are candidates. If there are more than
    ``max_terms`` of them, ``max_terms`` are drawn at random with
    ``random.sample``; otherwise all of them are kept. Every other term is
    set to 0.

    Parameters
    ----------
    query : dict
        Maps each term to its state. It is modified in place.
    max_terms : int, optional
        Maximum number of terms left at 1 (default 6).

    Returns
    -------
    dict
        The same ``query`` object, after modification.

    Raises
    ------
    ValueError
        If ``max_terms`` is negative and ``query`` has at least one positive
        term (raised by ``random.sample``).
    """
    positives = [term for term, value in query.items() if value == 1]
    # Check the size explicitly rather than catching whatever sample() raises,
    # so unrelated errors and interrupts are never swallowed.
    if len(positives) > max_terms:
        kept = set(random.sample(positives, max_terms))
    else:
        kept = set(positives)

    for term in query:
        if term not in kept:
            query[term] = 0
    return query

def propagate_annotations(query):
    """Placeholder that is not implemented yet.

    Accepts ``query`` but performs no work on it and returns ``None``.
    """
    return None

def boqa_monte_carlo(query, items_stat, n_indiv=100, alpha=0.05, beta=0.05, threshold=0.5):
    """Repeat ``boqa`` ``n_indiv`` times and return the most frequent winner.

    In each run, every disease whose score from ``boqa`` is strictly greater
    than ``threshold`` receives one vote.

    Parameters
    ----------
    query : dict
        Term -> state mapping passed unchanged to ``boqa``.
    items_stat : dict
        Per-disease statistics passed unchanged to ``boqa``.
    n_indiv : int, optional
        Number of ``boqa`` runs (default 100).
    alpha, beta : float, optional
        Passed unchanged to ``boqa`` (default 0.05 each).
    threshold : float, optional
        Score a disease must exceed in a run to get a vote (default 0.5).

    Returns
    -------
    list
        ``[disease, votes]`` for the disease with the most votes; ties go to
        the disease that received its first vote earliest. ``[None, 0]`` when
        no disease ever exceeded ``threshold`` (including ``n_indiv == 0``),
        so callers can still unpack two values.
    """
    votes = {}
    for _ in range(n_indiv):
        scores = boqa(alpha, beta, query, items_stat)
        for disease, score in scores.items():
            if score > threshold:
                votes[disease] = votes.get(disease, 0) + 1
    if not votes:
        # max() on an empty mapping would raise ValueError.
        return [None, 0]
    winner = max(votes, key=votes.get)
    return [winner, votes[winner]]


In [None]:
# Load the per-diagnosis feature statistics; the context manager makes sure
# the file handle is closed once the JSON has been parsed.
with open("data/stat_per_diag.json", "r") as stat_file:
    items_stat = json.load(stat_file)
# Remove the "OTHER" entry if present so it is never scored as a diagnosis.
items_stat.pop('OTHER', None)
# Column 8 of the first report holds a JSON list of feature nodes.
my_tree = json.loads(df.iloc[0,8])
query = {}
# Map each "presence" score to a binary state; -0.25 is the value used when a
# node has no "presence" entry and it maps to 0.
replace_dict = {-0.25: 0, 0.25: 1, 0.5: 1, 0.75: 1, 1:1, 0:0}
for feature in my_tree:
    value = float(feature["data"].get("presence", -0.25))
    query[feature["text"]] = replace_dict[value]
# Last expression of the cell: displays [most voted diagnosis, vote count].
boqa_monte_carlo(query, items_stat)

In [None]:
# Collapse fine-grained labels into coarser ones (the mapping is applied to
# matching cell values in any column), then discard reports concluded "OTHER".
label_merges = {
    "COM_CCD":"COM", "COM_MMM":"COM", "NM_CAP":"NM",
    "CFTD":"OTHER", "NON_CM":"OTHER","CM":"OTHER", "UNCLEAR":"OTHER",
}
df = df.replace(label_merges)
df = df.drop(df[df["conclusion"]=="OTHER"].index)

# Presence score -> binary state; -0.25 stands for a missing "presence" entry.
replace_dict = {-0.25: 0, 0.25: 1, 0.5: 1, 0.75: 1, 1:1, 0:0}

# One [predicted class, vote count, positional row index] entry per report.
results = []
for i, tree_json in enumerate(df.iloc[:, 8]):
    my_tree = json.loads(tree_json)
    query = {
        feature["text"]: replace_dict[float(feature["data"].get("presence", -0.25))]
        for feature in my_tree
    }
    class_CM, proba = boqa_monte_carlo(query, items_stat)
    results.append([class_CM, proba, i])



In [None]:
# Sort every prediction into one of four buckets by comparing it with the
# label stored in column 10 of the same row:
#   tp: right class, vote count >= 50      fn: right class, vote count < 50
#   fp: wrong class, vote count >= 50      tn: wrong class, vote count < 50
# NOTE(review): 50 is compared with a raw vote count, so it only means "50%"
# while boqa_monte_carlo runs with its default of 100 draws.
tp, tn, fp, fn = [], [], [], []
for class_CM, proba, index in results:
    truth = df.iloc[index,10]
    record = [class_CM, proba , truth]
    if class_CM == truth:
        if proba >= 50:
            tp.append(record)
        elif proba < 50:
            fn.append(record)
    elif class_CM != truth:
        if proba < 50:
            tn.append(record)
        elif proba >= 50:
            fp.append(record)

In [None]:
# For each bucket (TP, TN, FP, FN, in that order) print its size and its
# share of the evaluated reports as a percentage.
for bucket in (tp, tn, fp, fn):
    print(len(bucket), len(bucket)/len(df)*100)

In [None]:
import seaborn as sns

# 2x2 matrix laid out as [[TN, FP], [FN, TP]], stored as floats.
cf_matrix = np.array([[len(tn), len(fp)], [len(fn), len(tp)]], dtype=float)
group_names = ["True Neg","False Pos","False Neg","True Pos"]
# Each cell is annotated with its name, its count and its share of the total.
flat_counts = cf_matrix.flatten()
group_counts = list(map("{0:0.0f}".format, flat_counts))
group_percentages = list(map("{0:.2%}".format, flat_counts/np.sum(cf_matrix)))
labels = np.asarray(
    ["\n".join(parts) for parts in zip(group_names,group_counts,group_percentages)]
).reshape(2,2)
# fmt="" because the annotations are already formatted strings.
sns.heatmap(cf_matrix, annot=labels, fmt="", cmap='Blues')