In [None]:
import pandas as pd
import sqlite3
import json
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
sns.set_style("whitegrid")

# Read the whole report_histo table into memory. A sqlite3 connection is not
# closed automatically (a `with` block on it only manages transactions), so it
# is closed explicitly as soon as the query has been materialised.
con = sqlite3.connect("data/app.db")
try:
    df = pd.read_sql_query("SELECT * from report_histo", con)
finally:
    con.close()

# NOTE(review): `excel` does not appear to be used by any later cell - confirm
# before removing this (potentially slow) read.
excel = pd.read_excel("data/Dossier_Patient_clean_myocapt_KC_v6.xlsx")

In [None]:
def table_to_df(df):
    """Flatten the report table into column lists, expanding the JSON feature tree.

    Columns are read by position: positions 0-7 and 9-11 are copied under fixed
    names, and position 8 must hold a JSON array of nodes shaped like
    ``{"text": <feature name>, "data": {"presence": <number>}}``.

    Args:
        df: DataFrame with at least 12 columns laid out as described above.

    Returns:
        A ``(tree_as_dict, features_col)`` tuple. ``tree_as_dict`` maps each
        output column name to a list holding one value per input row (suitable
        for ``pd.DataFrame.from_dict``); ``features_col`` lists the feature
        names in order of first appearance. An empty ``df`` yields ``({}, [])``.

    A node without a ``presence`` entry and a feature that is absent from a
    row's tree are both recorded as -0.25, so all columns keep the same length.
    """
    # (output column name, position in the input row); position 8 is the JSON tree.
    meta_columns = (
        ("id", 0),
        ("patient_id", 1),
        ("expert_id", 2),
        ("biopsie_id", 3),
        ("muscle_prelev", 4),
        ("age_biopsie", 5),
        ("date_envoie", 6),
        ("gene_diag", 7),
        ("comment", 9),
        ("conclusion", 10),
        ("datetime", 11),
    )
    tree_position = 8
    no_info = -0.25

    tree_as_dict = {}
    features_col = []
    known_features = set()
    # enumerate() gives the true row position; the index label is ignored so a
    # frame whose index does not start at 0 still reports its features.
    for row_number, (_, row) in enumerate(df.iterrows()):
        # .iloc is required: the row Series is labelled by column name, and
        # integer keys on such a Series rely on a deprecated positional fallback.
        for name, position in meta_columns:
            tree_as_dict.setdefault(name, []).append(row.iloc[position])

        for feature in json.loads(row.iloc[tree_position]):
            name = feature["text"]
            if name not in known_features:
                known_features.add(name)
                features_col.append(name)
            column = tree_as_dict.get(name)
            if column is None:
                # Feature first met on a later row: back-fill the rows already seen.
                column = tree_as_dict[name] = [no_info] * row_number
            column.append(float(feature["data"].get("presence", no_info)))

        # Pad the features this row's tree did not mention.
        for name in features_col:
            column = tree_as_dict[name]
            if len(column) <= row_number:
                column.append(no_info)
    return tree_as_dict, features_col
tree_as_dict, features_col = table_to_df(df)
df2 = pd.DataFrame.from_dict(tree_as_dict)

# Simplify the diagnosis by merging sub-types into five categories:
# nemaline (NM), core myopathy (COM), centronuclear (CNM), UNCLEAR or OTHER.
# Only the "conclusion" column holds diagnosis codes, so the mapping is applied
# there rather than to every cell of the frame.
df2["conclusion"] = df2["conclusion"].replace(
    {"COM_CCD": "COM", "COM_MMM": "COM", "NM_CAP": "NM",
     "CFTD": "OTHER", "NON_CM": "OTHER", "CM": "UNCLEAR"}
)

# Simplified presence encoding:
# no info (-0.25) -> NaN ; absence (0) unchanged ; weak/moderate/strong
# presence (0.25 / 0.5 / 0.75) -> 1.
# Applied to the feature columns only, so a metadata value that happens to
# equal one of these numbers (for instance a fractional age_biopsie of 0.5)
# is not rewritten.
if features_col:
    df2[features_col] = df2[features_col].replace(
        {-0.25: np.nan, 0.25: 1, 0.5: 1, 0.75: 1}
    )

In [None]:
# Number of reports per biopsied muscle.
muscle_prelev = df2["muscle_prelev"].value_counts()
# Display an empty muscle name as 'N/A'. The labels are rewritten one by one
# (instead of looking up the position of ''), so the cell no longer raises
# ValueError when no report has an empty muscle field.
as_list = ['N/A' if label == '' else label for label in muscle_prelev.index]
muscle_prelev.index = as_list
sns.barplot(x=muscle_prelev.index, y=muscle_prelev)
var = plt.xticks(rotation=25)

In [None]:
# Count the reports per age at biopsy, then pool the counts into three age groups.
age_biopsie = df2["age_biopsie"].value_counts()
ages = age_biopsie.index
bebe = age_biopsie.where(ages <= 2).sum()
enfant = age_biopsie.where((ages > 2) & (ages < 18)).sum()
adulte = age_biopsie.where(ages >= 18).sum()

group_labels = ["Bébé (<=2ans)", "Enfant (3-17ans)", "Adulte (>=18ans)"]
group_counts = [bebe, enfant, adulte]
g = sns.barplot(x=group_labels, y=group_counts)
# Print each count just above its bar.
for i, count in enumerate(group_counts):
    g.text(i, count + 0.1, int(count), color='black', ha="center")

In [None]:
# Bar chart of the four most frequent diagnosed genes plus an "Other" bar.
gene_diag = df2["gene_diag"].value_counts().head(4).copy()
# "Other" = every report that is not in the top four. It is derived from the
# data (it used to be the hard-coded 89-(14+11+8+7)) so the chart stays correct
# when the database changes. Reports with a missing gene, which value_counts()
# skips, are counted here as well.
gene_diag["Other"] = len(df2) - int(gene_diag.sum())
fig_dims = (8, 4)
fig, ax = plt.subplots(figsize=fig_dims)
g = sns.barplot(x=gene_diag.index, y=gene_diag, ax=ax)
# Iterate over the values directly: integer keys on this label-indexed Series
# would go through the deprecated positional fallback.
for i, count in enumerate(gene_diag):
    g.text(i, count + 0.1, count, color='black', ha="center")
#var = plt.xticks(rotation=90)


In [None]:
# Number of reports per diagnosis category (sub-types were merged when df2 was built).
conclusion = df2["conclusion"].value_counts()
g = sns.barplot(x=conclusion.index, y=conclusion)
# Iterate over the values directly: `conclusion[i]` with an integer on this
# label-indexed Series relies on a deprecated positional fallback.
for i, count in enumerate(conclusion):
    g.text(i, count + 0.1, count, color='black', ha="center")

In [None]:
# Ten features with the highest column sum over all reports
# (with the 0/1 encoding this is the number of reports where the feature is present).
(
    df2[features_col]
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

In [None]:
# Ten most frequent features among the nemaline myopathy (NM) reports.
is_nm = df2["conclusion"] == "NM"
(
    df2.loc[is_nm, features_col]
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

In [None]:
# Ten most frequent features among the centronuclear myopathy (CNM) reports.
is_cnm = df2["conclusion"] == "CNM"
(
    df2.loc[is_cnm, features_col]
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

In [None]:
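# For later model training: export the 0/1 feature matrix to CSV.
# These two lines are disabled. The PCA cell further down reads
# data/histo_feature.csv, so that file must already exist on disk
# (or be regenerated by uncommenting and running the lines below).
# df2 = df2.replace({"UNCLEAR":"OTHER"})
# df2.to_csv("data/histo_feature.csv", index=False)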

In [None]:
# For each diagnosed gene: the number of reports ("n") and, for every feature
# present in at least one of them, the rounded percentage of that gene's
# reports in which the feature is present ("feature", sorted high to low).
stat_per_gene = {}
# drop_duplicates() keeps first-appearance order, so the key order of the
# resulting dict/JSON is the same on every run (iterating a set of strings is
# not). Missing genes are dropped: `== NaN` never matches a row, so they could
# only produce an n=0 entry built from a division by zero.
all_genes = df2["gene_diag"].dropna().drop_duplicates().tolist()
for gene in all_genes:
    gene_rows = df2.loc[df2["gene_diag"] == gene, features_col]
    nrow = len(gene_rows)
    # NaN ("no info") cells are skipped by sum() but still count in nrow.
    ds = gene_rows.sum().sort_values(ascending=False) / nrow * 100
    stat_per_gene[gene] = {
        "n": nrow,
        "feature": ds[ds > 0].round().to_dict(),
    }

In [None]:
# For each diagnosis category: the number of reports ("n") and, for every
# feature present in at least one of them, the rounded percentage of that
# category's reports in which the feature is present ("feature", high to low).
stat_per_diag = {}
# drop_duplicates() keeps first-appearance order, so the key order of the
# resulting dict/JSON is the same on every run (iterating a set of strings is
# not). Missing conclusions are dropped: `== NaN` never matches a row, so they
# could only produce an n=0 entry built from a division by zero.
all_diag = df2["conclusion"].dropna().drop_duplicates().tolist()
for diag in all_diag:
    diag_rows = df2.loc[df2["conclusion"] == diag, features_col]
    nrow = len(diag_rows)
    # NaN ("no info") cells are skipped by sum() but still count in nrow.
    ds = diag_rows.sum().sort_values(ascending=False) / nrow * 100
    stat_per_diag[diag] = {
        "n": nrow,
        "feature": ds[ds > 0].round().to_dict(),
    }

In [None]:
import json

# Write both statistics tables as indented JSON. ensure_ascii=False emits
# non-ASCII characters verbatim, so the files are opened explicitly as UTF-8
# rather than with the platform's default encoding (which can fail on, or
# mis-encode, accented names).
with open("data/stat_per_gene.json", "w", encoding="utf-8") as f:
    json.dump(stat_per_gene, f, indent=4, ensure_ascii=False)
with open("data/stat_per_diag.json", "w", encoding="utf-8") as f:
    json.dump(stat_per_diag, f, indent=4, ensure_ascii=False)

In [None]:
import matplotlib.pyplot as plt
import plotly
import numpy as np
import seaborn as sns
import plotly.graph_objs as go

plotly.offline.init_notebook_mode()
# Feature columns only, selected by name instead of by the magic column
# offset 11, which silently breaks if a metadata column is added or removed.
onto_values = df2[features_col]
# Keep the features that have a non-NaN annotation in at least 10 reports.
onto_values = onto_values.dropna(axis=1, thresh=10)
# Recode absence 0 -> -1 (presence stays 1, "no info" stays NaN).
onto_values = onto_values.replace({0:-1})
# onto_values = onto_values.fillna(0)
corrMatrix = onto_values.corr()
corrMatrix = corrMatrix.fillna(0)
# After fillna(0), a zero on the diagonal marks a feature whose correlation
# with itself was NaN; drop those features from both axes.
zero_diagonal = np.diag(corrMatrix.to_numpy()) == 0
col_row_to_drop = corrMatrix.columns[zero_diagonal].tolist()
corrMatrix = corrMatrix.drop(index=col_row_to_drop, columns=col_row_to_drop)
# Use Seaborn to cluster data. The clustered ordering (g.data2d) is only used
# by the disabled heatmap variant below; the figure itself is closed at once.
g = sns.clustermap(corrMatrix, cmap="coolwarm", figsize=(20,20))
plt.close()
#trace_heatmap = go.Heatmap(x=g.data2d.columns, y=g.data2d.columns,
#                    z=g.data2d, colorscale="RdBu")
trace_heatmap = go.Heatmap(x=corrMatrix.columns, y=corrMatrix.columns,
                    z=corrMatrix, colorscale="RdBu")
data = [trace_heatmap]
# Title fixed to ">=10": dropna(thresh=10) keeps columns with at least 10 values.
layout = go.Layout(title="Histology ontology terms correlation matrix (threshold n>=10)", showlegend=True, width=1000, height=1000, yaxis={"scaleanchor":"x"})
figure = go.Figure(data=data, layout=layout)
figure.show()
figure.write_json("data/correlation_matrix.json")

In [None]:
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.decomposition import PCA

# Target - Data DF
# NOTE(review): this rebinds `df`, which the first cell uses for the raw SQL
# table; re-running the table_to_df cell after this one needs a fresh load.
# The CSV is expected to be the export of df2 (see the disabled export cell).
df = pd.read_csv("data/histo_feature.csv")
# Keep "conclusion", "datetime" and the feature columns.
df = df.iloc[:,9:]
# Drop OTHER and UNCLEAR for now (only 3 classes are kept)
df = df[~df["conclusion"].isin(["OTHER", "UNCLEAR"])]
df = df.drop(columns="datetime")
# Remove the columns that are all NaN or hold fewer than 5 values (annotations)
df = df.dropna(axis=1, thresh=5)
df = df.fillna(0)
df = df.replace({0.25:1, 0.5:1, 0.75:1})
# Split the features from the labels and encode the labels as integer codes
# (label encoding, not one-hot). LabelEncoder sorts the classes, so with OTHER
# and UNCLEAR removed the expected codes are CNM:0, COM:1, NM:2.
X, Y = df.iloc[:,1:],df.iloc[:,0]

label_encoder = LabelEncoder()
label_encoder = label_encoder.fit(Y)
label_encoded_y = label_encoder.transform(Y)

# Project the feature matrix on its first two principal components.
pca = PCA(2)
projected = pca.fit_transform(X)

# plt.get_cmap replaces plt.cm.get_cmap, which was deprecated in Matplotlib 3.7
# and removed in 3.9; it takes the same (name, number of colours) arguments.
plt.scatter(projected[:, 0], projected[:, 1],
            c=label_encoded_y, edgecolor='none', alpha=0.5,
            cmap=plt.get_cmap('Set1', 3))
plt.xlabel('component 1')
plt.ylabel('component 2')
plt.colorbar()

In [None]:
# Fit a PCA with all components on X and plot the cumulative share of variance
# explained as components are added.
pca = PCA()
pca.fit(X)
cumulative_variance = pca.explained_variance_ratio_.cumsum()
plt.plot(cumulative_variance)
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance');