### Import libs and raw database

In [2]:
import pandas as pd
import numpy as np
import json
import random
import sqlite3
from sklearn.model_selection import StratifiedKFold

con = sqlite3.connect("data/app.db")
df_raw = pd.read_sql_query("SELECT * from report_histo", con)
excel = pd.read_excel("data/Dossier_Patient_clean_myocapt_KC_v6.xlsx")

### Convert js-tree json to per-columns dataframe and save data as raw_dataset

In [9]:
def table_to_df(df):
    # Transformation de la table SQLite en dataframe en parsant l'arbre JSON
    # Return un dictionnaire et une liste de colonnes (les features histo)
    tree_as_dict = {}
    features_col = []
    for index, row in df.iterrows():
        tree_as_dict.setdefault("id", []).append(row[0])
        tree_as_dict.setdefault("patient_id", []).append(row[1])
        tree_as_dict.setdefault("expert_id", []).append(row[2])
        tree_as_dict.setdefault("biopsie_id", []).append(row[3])
        tree_as_dict.setdefault("muscle_prelev", []).append(row[4])
        tree_as_dict.setdefault("age_biopsie", []).append(row[5])
        tree_as_dict.setdefault("date_envoie", []).append(row[6])
        tree_as_dict.setdefault("gene_diag", []).append(row[7])
        tree_as_dict.setdefault("comment", []).append(row[9])
        tree_as_dict.setdefault("conclusion", []).append(row[10])
        tree_as_dict.setdefault("datetime", []).append(row[11])

        my_tree = json.loads(row[8])
        for feature in my_tree:
            tree_as_dict.setdefault(feature["text"], []).append(float(feature["data"].get("presence", -0.25)))
            if index==0:
                features_col.append(feature["text"])
    return tree_as_dict, features_col

tree_as_dict, features_col = table_to_df(df_raw)
df = pd.DataFrame.from_dict(tree_as_dict)

# Simplifier le diagnostic en regroupant les sous-type sous 5 catégories: Némaline, Core Myo, CentroNuc ou UNCLEAR ou OTHER
# Simplification de l'encodage:
# No Info (-0.25) -> 0 ; Absence reste ; Présence faible/modéré/forte (0.25 0.5 0.75) -> 1
df = df.replace({"COM_CCD":"COM", "COM_MMM":"COM", "NM_CAP":"NM", "CFTD":"OTHER", "NON_CM":"OTHER","CM":"UNCLEAR"})
df = df.replace({-0.25:np.nan})
df.to_csv("data_raw/raw_dataset.csv", index=False)

### Filter columns that are non-histologic data from dataset and filter conclusions outside of NM, COM, CNM. Fill N/A. Change CNM:0, COM:1, NM:2

In [28]:
df_filt = pd.read_csv("data_raw/raw_dataset.csv")
df_filt = df_filt.iloc[:,9:]
df_filt = df_filt.drop(df[df["conclusion"]=="OTHER"].index)
df_filt = df_filt.drop(df[df["conclusion"]=="UNCLEAR"].index)
del df_filt["datetime"]
df_filt = df_filt.replace({"CNM":0, "COM":1, "NM":2})
df_filt.fillna(0, inplace=True)
df_filt.to_csv("data_raw/dataset.csv", index=False)

In [29]:
df_filt

Unnamed: 0,conclusion,Coloration HE et TG,Aspect Global Prélèvement,Prélèvement Anormal,Fibre Type 1,Type 1 Hypertrophiques (grandes),Type 1 Atrophiées (petites),Type 1 Taille hétérogène,Fibre Type 2,Type 2 Hypertrophiques (grandes),...,ME: Structure sarcomérique Remanié,ME: Structure sarcomérique en Croissant,ME: Bande A,ME: Materiel Z,Materiel Z: Elargissement,Materiel Z: Streaming,ME: Vacuoles Autophagiques,ME: Citerne,Dilatation de Citerne,ME: Lésions de type CAPS
0,2,0.0,0.0,1.0,0.0,0.00,1.00,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,0.0,0.0,0.0,0.0,0.00,1.00,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,0.0,0.0,0.0,0.0,0.00,1.00,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82,1,0.0,0.0,0.0,0.0,0.25,0.25,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
85,1,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
86,1,0.0,0.0,0.0,0.0,0.00,0.00,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
87,1,0.0,0.0,0.5,0.0,0.00,0.25,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [57]:
def random_term_list(dct):
    term_list = []
    for k, v in dct.items():
        rand_val = random.random()
        if rand_val <= v/100:
            term_list.append(k)
    return term_list

def stat_per_diags(df_raw):
    stat_per_diag = {}
    feature_col = df_raw.columns.to_list()
    feature_col.pop(0) # Pop Conclusion Columns
    feature_col
    all_diag = list(set(df_raw.conclusion.to_list()))
    for i in all_diag:
        ds = df_raw[df_raw["conclusion"]==i][features_col].sum().sort_values(ascending=False)
        nrow = len(df_raw[df_raw["conclusion"]==i])
        ds = ds / nrow * 100
        stat_per_diag[i] = {}
        stat_per_diag[i]["n"] = nrow
        stat_per_diag[i]["feature"] = ds[ds>0].round().to_dict()
    return stat_per_diag

def stat_per_genes(df_raw):
    stat_per_gene = {}
    feature_col = df_raw.columns.to_list()
    feature_col.pop(0) # Pop Conclusion Columns
    feature_col
    all_genes = list(set(df_raw.gene_diag.to_list()))
    for i in all_genes:
        ds = df_raw[df_raw["gene_diag"]==i][features_col].sum().sort_values(ascending=False)
        nrow = len(df_raw[df_raw["gene_diag"]==i])
        ds = ds / nrow * 100
        stat_per_gene[i] = {}
        stat_per_gene[i]["n"] = nrow
        stat_per_gene[i]["feature"] = ds[ds>0].round().to_dict()
    return stat_per_gene


### Generate Cross-validation sets and calculate ontology frequency for each set

In [74]:
cv_partitions = 5
cv = StratifiedKFold(n_splits=cv_partitions, shuffle=True, random_state=777)
cv_generator = cv.split(df_filt.iloc[:,1:], df_filt.iloc[:,0])
train_idx_lst = []
test_idx_lst = []
for train_idx, test_idx in cv_generator:
    train_idx_lst.append(train_idx)
    test_idx_lst.append(test_idx)

train_dfs = []
test_dfs = []
for partition in range(cv_partitions):
    train_df = df_filt.iloc[train_idx_lst[partition]]
    train_df["match"] = partition
    train_df["partition"] = "train"

    train_dfs.append(train_df)

    test_df = df_filt.iloc[test_idx_lst[partition]]
    test_df["match"] = partition
    test_df["partition"] = "test"
    test_df
    test_dfs.append(test_df)

stat_per_diags_dict = []
for index, parition in enumerate(train_dfs):
    stat_per_diags_dict.append(stat_per_diags(parition.iloc[:,0:-2]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [83]:
synth_data_dict = {}
for idx_part, stats in enumerate(stat_per_diags_dict):   
    for disease in [0,1,2]:
        for patient in range(100):
            idx_name = str(idx_part)+"_"+str(disease)+"_"+str(patient)
            synth_data_dict[idx_name] = []
            synth_data_dict[idx_name].append(disease)
            patient_term_list = random_term_list(stats[disease]["feature"])
            for i,value in enumerate(df_filt.columns):
                if value in patient_term_list:
                    synth_data_dict[idx_name].append(1)
                elif value not in ["conclusion", "match", "partition"]:
                    synth_data_dict[idx_name].append(0)
            synth_data_dict[idx_name].append(idx_part)
            synth_data_dict[idx_name].append("train")

In [91]:
columns = df_filt.columns.to_list() + ["match", "partition"]
synthetic_data = pd.DataFrame.from_dict(synth_data_dict, orient='index', columns=columns)
concat_test_dfs = pd.concat(test_dfs)
final_synth_dataset = pd.concat([synthetic_data, concat_test_dfs])
final_synth_dataset.replace({0.25:1, 0.5:1, 0.75:1}, inplace=True)
final_synth_dataset.to_csv("data_raw/synthetic_dataset.csv", index=False)


# To-Do Later: add noise to data

In [None]:
normal_mean = df.sum(axis=1).mean()
normal_std = df.sum(axis=1).std()
term_number = int(np.round(np.random.normal(loc=normal_mean, scale=normal_std)))
print(term_number)

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.metrics import plot_confusion_matrix, confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import learning_curve 

df = pd.read_csv("data/histo_fake_data.csv")
X, Y = df.iloc[:,1:],df.iloc[:,0]
label_encoder = LabelEncoder()
label_encoder = label_encoder.fit(Y)
label_encoded_y = label_encoder.transform(Y)


clf = SVC(class_weight="balanced", probability=True, random_state=777)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=777)
train_sizes, train_scores, test_scores = learning_curve(clf, X, label_encoded_y, shuffle=True, cv=cv, scoring="accuracy")
clf = clf.fit(X, label_encoded_y)

print("Cross-Validation Scores:")
print(cross_val_score(clf, X, label_encoded_y, cv=cv, scoring="accuracy"))
plt.plot(range(5), cross_val_score(clf, X, label_encoded_y, cv=cv, scoring="accuracy"), 'o-', color="r",
                 label="Cross-Validation Accuracy")
ylim = plt.ylim(0,1)
xticks = plt.xticks(range(5))


In [None]:
train_scores_mean = np.mean(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
                 label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
                 label="Cross-validation score")
plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.1,
                         color="r")
plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1,
                         color="g")
plt.ylim(0.75,1.01)
plt.title("Learning Curve (SVM). Mean Accuracy +- 1 std")
plt.xlabel("Training Examples")
plt.ylabel("Accuracy Score")
plt.legend(loc="best")
plt.grid()
plt.show()



In [None]:
# Target - Data DF
df = pd.read_csv("data/histo_feature.csv")
df = df.iloc[:,9:]
# Drop les OTHER pour l'instant (que 3 classes)
df = df.drop(df[df["conclusion"]=="OTHER"].index)
df = df.drop(df[df["conclusion"]=="UNCLEAR"].index)
del df["datetime"]
# Enlever les col remplis de NaN ou avec moins de 5 valeur (annotations)
df = df.dropna(axis=1, thresh=5)
df.fillna(0, inplace=True)
df = df.replace({0.25:1, 0.5:1, 0.75:1})
# Séparer les features des labels et onehot encoding des labels
# NM:2, COM:1, UNCLEAR:4, CNM:0, OTHER:3
X_test, Y_test = df.iloc[:,1:],df.iloc[:,0]
label_encoder_test = LabelEncoder()
label_encoder_test = label_encoder_test.fit(Y_test)
label_encoded_y_test = label_encoder_test.transform(Y_test)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import plot_confusion_matrix
y_predict = clf.predict(X_test)
print(accuracy_score(label_encoded_y_test, y_predict))
plot_confusion_matrix(clf, X_test, label_encoded_y_test)