In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import json
import random

In [None]:
# Load the real annotated dataset and clean it in one pass:
#   - keep only the columns from position 9 onward,
#   - discard rows whose conclusion is OTHER or UNCLEAR,
#   - remove the datetime column,
#   - keep only columns holding at least 5 non-NaN values,
#   - treat the remaining missing values as 0,
#   - collapse the partial scores 0.25 / 0.5 / 0.75 into 1.
df = (
    pd.read_csv("data/histo_feature.csv")
    .iloc[:, 9:]
    .loc[lambda frame: ~frame["conclusion"].isin(["OTHER", "UNCLEAR"])]
    .drop(columns="datetime")
    .dropna(axis=1, thresh=5)
    .fillna(0)
    .replace({0.25: 1, 0.5: 1, 0.75: 1})
)

In [None]:
# Per-diagnosis feature statistics; each entry is later read as
# stat_per_diag[<diagnosis>]["feature"]. The context manager makes sure the
# file handle is closed once the JSON has been parsed.
with open("data/stat_per_diag.json", "rb") as stat_file:
    stat_per_diag = json.load(stat_file)

def random_term_list(dct, rng=None):
    """Randomly select keys of ``dct`` according to their percentage values.

    Each key is kept independently with probability ``value / 100``.

    Parameters
    ----------
    dct : dict
        Maps a term to a percentage (0 means never selected, 100 means always
        selected).
    rng : object with a ``random()`` method, optional
        Source of uniform draws in [0, 1), e.g. ``random.Random(seed)``.
        Defaults to the module-level ``random`` generator, so existing callers
        (and ``random.seed``) keep working unchanged.

    Returns
    -------
    list
        The selected keys, in the iteration order of ``dct``.
    """
    draw = random.random if rng is None else rng.random
    # Strict "<": draws lie in [0, 1), so P(draw < p) == p exactly and a term
    # with a percentage of 0 can never be selected (with "<=" it could be when
    # the draw is exactly 0.0).
    return [term for term, percent in dct.items() if draw() < percent / 100]

def stat_per_diags(df, features_col):
    """Summarise feature columns for every distinct ``conclusion`` value.

    For each value found in ``df["conclusion"]`` the result holds:
      - "n": the number of rows carrying that value,
      - "feature": a dict mapping each column of ``features_col`` to its
        column sum divided by "n" and multiplied by 100, rounded, keeping
        only strictly positive entries (largest first).
    """
    summary = {}
    for diagnosis in list(set(df.conclusion.to_list())):
        subset = df[df["conclusion"] == diagnosis]
        n_rows = len(subset)
        totals = subset[features_col].sum().sort_values(ascending=False)
        percentages = totals / n_rows * 100
        summary[diagnosis] = {
            "n": n_rows,
            "feature": percentages[percentages > 0].round().to_dict(),
        }
    return summary

In [None]:
# Build 100 synthetic patients per disease. Each row starts with the disease
# label, followed by one 0/1 flag per column of df (the "conclusion" column is
# skipped unless it happens to be among the sampled terms).
synth_data_dict = {}
for disease in ("COM", "NM", "CNM"):
    feature_percentages = stat_per_diag[disease]["feature"]
    for patient in range(100):
        patient_id = disease + "_" + str(patient)
        sampled_terms = set(random_term_list(feature_percentages))
        patient_row = [disease]
        patient_row.extend(
            int(column in sampled_terms)
            for column in df.columns
            if column in sampled_terms or column != "conclusion"
        )
        synth_data_dict[patient_id] = patient_row


In [None]:
# One row per synthetic patient (dict key -> index, list -> row values).
# NOTE(review): reusing df.columns assumes every row list is ordered like df's
# columns, i.e. that "conclusion" is the first column — confirm.
df2 = pd.DataFrame.from_dict(
    data=synth_data_dict,
    orient="index",
    columns=df.columns,
)
# The patient identifiers (index) are not written to the CSV.
df2.to_csv(path_or_buf="data/histo_fake_data.csv", index=False)

In [None]:
# Row-wise total of the numeric columns of df. numeric_only=True explicitly
# leaves out the string "conclusion" column: older pandas skipped it silently
# (with a warning), newer versions raise when asked to add strings and numbers.
# Presumably this total is the number of terms flagged per patient, since the
# feature columns were mapped to 0/1 above — confirm.
terms_per_patient = df.sum(axis=1, numeric_only=True)
normal_mean = terms_per_patient.mean()
normal_std = terms_per_patient.std()

# Draw a term count from N(mean, std). A normal draw can be negative, which is
# meaningless for a count, so it is clipped at 0.
term_number = max(0, int(np.round(np.random.normal(loc=normal_mean, scale=normal_std))))
print(term_number)

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.metrics import plot_confusion_matrix, confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import learning_curve 

# Load the synthetic dataset: first column is the class label, the remaining
# columns are the features.
df = pd.read_csv("data/histo_fake_data.csv")
X, Y = df.iloc[:,1:],df.iloc[:,0]

# Encode the string class labels as integers.
label_encoder = LabelEncoder()
label_encoder = label_encoder.fit(Y)
label_encoded_y = label_encoder.transform(Y)

clf = SVC(class_weight="balanced", probability=True, random_state=777)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=777)
# shuffle=True draws random training subsets; seed it like the classifier and
# the CV splitter so the learning curve is reproducible across runs.
train_sizes, train_scores, test_scores = learning_curve(
    clf, X, label_encoded_y, shuffle=True, random_state=777, cv=cv, scoring="accuracy"
)
# Final model, fitted on the whole synthetic dataset.
clf = clf.fit(X, label_encoded_y)

# Run the cross-validation once and reuse the scores for both the printout and
# the plot (the estimator and the splitter are seeded, so re-running it would
# only repeat the same work).
cv_scores = cross_val_score(clf, X, label_encoded_y, cv=cv, scoring="accuracy")
print("Cross-Validation Scores:")
print(cv_scores)

# One point per fold; derive the x positions from the scores instead of
# hard-coding the number of splits.
fold_ids = range(len(cv_scores))
plt.plot(fold_ids, cv_scores, 'o-', color="r",
                 label="Cross-Validation Accuracy")
# Assigning the return values keeps their repr out of the cell output.
ylim = plt.ylim(0,1)
xticks = plt.xticks(fold_ids)


In [None]:
# Aggregate the per-fold learning-curve scores for each training-set size.
train_scores_mean, train_scores_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_scores_mean, test_scores_std = test_scores.mean(axis=1), test_scores.std(axis=1)

# (mean, std, colour, legend label) for each curve to draw.
curve_specs = (
    (train_scores_mean, train_scores_std, "r", "Training score"),
    (test_scores_mean, test_scores_std, "g", "Cross-validation score"),
)

# Mean accuracy lines first ...
for curve_mean, curve_std, curve_color, curve_label in curve_specs:
    plt.plot(train_sizes, curve_mean, 'o-', color=curve_color, label=curve_label)

# ... then the +/- 1 standard deviation bands around them.
for curve_mean, curve_std, curve_color, curve_label in curve_specs:
    plt.fill_between(
        train_sizes,
        curve_mean - curve_std,
        curve_mean + curve_std,
        alpha=0.1,
        color=curve_color,
    )

plt.ylim(0.75, 1.01)
plt.title("Learning Curve (SVM). Mean Accuracy +- 1 std")
plt.xlabel("Training Examples")
plt.ylabel("Accuracy Score")
plt.legend(loc="best")
plt.grid()
plt.show()



In [None]:
# Target / data frame: reload the real annotated dataset, used here as the
# test set for the classifier trained on synthetic data.
df = pd.read_csv("data/histo_feature.csv")
df = df.iloc[:,9:]
# Drop the OTHER and UNCLEAR rows for now (only 3 classes are kept)
df = df.drop(df[df["conclusion"]=="OTHER"].index)
df = df.drop(df[df["conclusion"]=="UNCLEAR"].index)
del df["datetime"]
# Remove the columns that are all NaN or hold fewer than 5 values (annotations)
df = df.dropna(axis=1, thresh=5)
df.fillna(0, inplace=True)
df = df.replace({0.25:1, 0.5:1, 0.75:1})
# Split features from labels and integer-encode the labels.
X_test, Y_test = df.iloc[:,1:],df.iloc[:,0]
# Reuse the encoder fitted on the training labels instead of fitting a new one
# on the test labels: a separately fitted encoder only produces the same
# integer codes when both label sets contain exactly the same classes, and
# otherwise the accuracy / confusion matrix would silently compare mismatched
# codes. With the shared encoder an unseen test label raises instead.
# Expected mapping for the three generated classes: CNM:0, COM:1, NM:2.
label_encoder_test = label_encoder
label_encoded_y_test = label_encoder_test.transform(Y_test)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import plot_confusion_matrix
# Predict the (integer-encoded) class of every real sample with the model
# trained on synthetic data, then report the accuracy and confusion matrix.
y_predict = clf.predict(X_test)
test_accuracy = accuracy_score(y_true=label_encoded_y_test, y_pred=y_predict)
print(test_accuracy)
plot_confusion_matrix(estimator=clf, X=X_test, y_true=label_encoded_y_test)

In [None]:
# Display the predicted (integer-encoded) class of each test sample.
y_predict

In [None]:
def stat_per_genes(df, features_col):
    """Summarise feature columns for every distinct ``gene_diag`` value.

    For each value found in ``df["gene_diag"]`` the result holds:
      - "n": the number of rows carrying that value,
      - "feature": a dict mapping each column of ``features_col`` to its
        column sum divided by "n" and multiplied by 100, rounded, keeping
        only strictly positive entries (largest first).
    """
    summary = {}
    for gene in list(set(df.gene_diag.to_list())):
        subset = df[df["gene_diag"] == gene]
        n_rows = len(subset)
        totals = subset[features_col].sum().sort_values(ascending=False)
        percentages = totals / n_rows * 100
        summary[gene] = {
            "n": n_rows,
            "feature": percentages[percentages > 0].round().to_dict(),
        }
    return summary