In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.metrics import plot_confusion_matrix, confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import learning_curve 

## SVM Model (RandomForest alternative kept as a commented-out line)

In [None]:
# Load the feature table; only the columns from position 9 onward are used.
df = pd.read_csv("data/histo_feature.csv")
df = df.iloc[:, 9:]
# Drop the OTHER and UNCLEAR rows for now (only 3 classes are kept)
df = df[~df["conclusion"].isin(["OTHER", "UNCLEAR"])]
df = df.drop(columns="datetime")
# Remove columns that are all NaN or hold fewer than 5 values (annotations)
df = df.dropna(axis=1, thresh=5)
# Remaining gaps become 0 and the fractional values 0.25/0.5/0.75 are collapsed to 1
df = df.fillna(0)
df = df.replace({0.25: 1, 0.5: 1, 0.75: 1})
# Split features from labels: first remaining column is the target
# (presumably "conclusion" -- TODO confirm), everything after it is a feature.
# The labels are integer-encoded (not one-hot). LabelEncoder numbers classes in
# sorted order, so with the three classes expected here: CNM:0, COM:1, NM:2.
X, Y = df.iloc[:, 1:], df.iloc[:, 0]
label_encoder = LabelEncoder()
label_encoder = label_encoder.fit(Y)
label_encoded_y = label_encoder.transform(Y)


# Alternative model, kept for reference:
# clf = RandomForestClassifier(criterion="entropy", max_depth=4, n_jobs=8, class_weight="balanced")
clf = SVC(class_weight="balanced", probability=True, random_state=777)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=777)
train_sizes, train_scores, test_scores = learning_curve(clf, X, label_encoded_y, cv=cv, scoring="accuracy")
# Final model, fitted on every sample
clf = clf.fit(X, label_encoded_y)


# Cross-validation evaluation. The scores are computed once and reused for both
# the printout and the plot (the splitter and the model are seeded, so a second
# run would only repeat the same work).
cv_scores = cross_val_score(clf, X, label_encoded_y, cv=cv, scoring="accuracy")
print("Cross-Validation Scores:")
print(cv_scores)
# One x position per fold, derived from the scores instead of a hard-coded 5
fold_ids = range(len(cv_scores))
plt.plot(fold_ids, cv_scores, 'o-', color="r",
         label="Cross-Validation Accuracy")
ylim = plt.ylim(0, 1)
xticks = plt.xticks(fold_ids)

In [None]:
# Mean and spread of the per-fold scores at each training-set size
train_scores_mean, train_scores_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_scores_mean, test_scores_std = test_scores.mean(axis=1), test_scores.std(axis=1)

# (mean, std, colour, legend entry) for each curve
curves = [
    (train_scores_mean, train_scores_std, "r", "Training score"),
    (test_scores_mean, test_scores_std, "g", "Cross-validation score"),
]

# Mean curves first ...
for curve_mean, curve_std, curve_colour, curve_name in curves:
    plt.plot(train_sizes, curve_mean, 'o-', color=curve_colour, label=curve_name)

# ... then the shaded +/- 1 std bands
for curve_mean, curve_std, curve_colour, curve_name in curves:
    plt.fill_between(train_sizes, curve_mean - curve_std, curve_mean + curve_std,
                     alpha=0.1, color=curve_colour)

plt.ylim(0.4, 1.05)
plt.title("Learning Curve (SVM). Mean Accuracy +- 1 std")
plt.xlabel("Training Examples")
plt.ylabel("Accuracy Score")
plt.legend(loc="best")
plt.grid()
plt.show()


In [None]:
# Collect out-of-fold predictions over the same stratified splits so that one
# confusion matrix can be built from all held-out samples.
X_np = X.values  # same for every fold, so converted once
fold_predictions = []
fold_truths = []

for train_ix, test_ix in cv.split(X, label_encoded_y):
    train_x, train_y, test_x, test_y = X_np[train_ix], label_encoded_y[train_ix], X_np[test_ix], label_encoded_y[test_ix]

    # Fit a fresh classifier on this fold's training part only.
    # It is deliberately NOT assigned to `clf`: `clf` is the model fitted on the
    # whole data set and is reused further down in the notebook.
    classifier = SVC(class_weight="balanced", probability=True, random_state=777).fit(train_x, train_y)

    # Predict the labels of the held-out samples
    predicted_labels = classifier.predict(test_x)

    fold_predictions.append(predicted_labels)
    fold_truths.append(test_y)

# Joined once at the end; the arrays keep the integer dtype of the encoded labels
predicted_targets = np.concatenate(fold_predictions)
actual_targets = np.concatenate(fold_truths)




In [None]:
import itertools

# Confusion matrix of the pooled out-of-fold predictions
cnf_matrix = confusion_matrix(actual_targets, predicted_targets)
np.set_printoptions(precision=2)

# Tick labels in encoded-label order (assumes 0/1/2 map to CNM/COM/NM -- TODO confirm)
classes = ["CNM","COM","NM"]
tick_marks = np.arange(len(classes))
fmt = 'd'
# Cells above half of the largest count get black text, the others yellow
thresh = cnf_matrix.max() / 2.

# Plot non-normalized confusion matrix
plt.figure()
plt.imshow(cnf_matrix, interpolation='nearest', cmap="viridis")
plt.colorbar()
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)

n_rows, n_cols = cnf_matrix.shape
for i, j in itertools.product(range(n_rows), range(n_cols)):
    cell_count = cnf_matrix[i, j]
    text_colour = "yellow" if cell_count <= thresh else "black"
    plt.text(j, i, format(cell_count, fmt), horizontalalignment="center",
             color=text_colour)

plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')

plt.show()


In [None]:
import pickle
#pickle.dump(clf, open("model/random_forest.sav", 'wb'))
#pickle.dump(X.columns.tolist(), open("model/features_col.list", 'wb'))

In [None]:
import os  # not imported anywhere else in the notebook; needed for os.path.join below

# Load the list of feature columns.
# NOTE(review): pickle.load can execute arbitrary code -- only load a file you produced yourself.
# NOTE(review): the commented-out dump in this notebook writes "model/features_col.list",
# while this cell reads from "data/" -- confirm which location is the right one.
with open(os.path.join("data", "features_col.list"), "rb") as feature_file:
    FEATURE_LIST = pickle.load(feature_file)

df = pd.read_csv("data/histo_feature.csv")
df = df.iloc[:, 9:]
# Keep only the UNCLEAR rows (the samples left out of the 3-class training set).
# .copy() gives an independent frame so the column removal below does not act on a slice.
df = df[df["conclusion"] == "UNCLEAR"].copy()
del df["datetime"]
# Same value clean-up as in the training cell: gaps become 0, 0.25/0.5/0.75 become 1.
# (No sparse-column filtering here; the column selection below takes care of the columns.)
df = df.fillna(0)
df = df.replace({0.25: 1, 0.5: 1, 0.75: 1})
# Keep exactly the columns named in the loaded list, in that order
df = df[FEATURE_LIST]

In [None]:
# Classify every UNCLEAR sample with the trained model; a prediction is only kept
# when the highest class probability exceeds 0.5, otherwise it is "No_Pred".
X = df
pred_unclear = []       # outcome per sample, in row order
pred_unclear_dict = {}  # row position -> outcome
labels={0:"CNM",1:"COM",2:"NM"}

if len(X) > 0:  # the model cannot be called on zero samples
    # One batched call per method instead of two single-row calls per sample.
    # Plain arrays are passed, as the original row-by-row calls also did.
    unclear_features = X.to_numpy()
    class_predictions = clf.predict(unclear_features)
    # NOTE(review): for an SVC fitted with probability=True, scikit-learn documents
    # that predict_proba may be inconsistent with predict, so the probability used as
    # confidence here is not guaranteed to belong to the predicted class.
    max_probas = clf.predict_proba(unclear_features).max(axis=1)

    for i, (class_predict, proba_class) in enumerate(zip(class_predictions, max_probas)):
        outcome = labels[class_predict] if proba_class > 0.5 else "No_Pred"
        pred_unclear_dict[i] = outcome
        pred_unclear.append(outcome)


In [None]:
# Hard-coded reference labels, keyed by row position 0..22.
# Presumably the BOQA predictions for the same UNCLEAR samples (the comparison
# plot further down labels them "BOQA Prediction") -- TODO confirm provenance.
pred_unclear_dict_boqa = dict(enumerate([
    'NM',       # 0
    'No_Pred',  # 1
    'No_Pred',  # 2
    'COM',      # 3
    'NM',       # 4
    'NM',       # 5
    'COM',      # 6
    'No_Pred',  # 7
    'NM',       # 8
    'No_Pred',  # 9
    'NM',       # 10
    'No_Pred',  # 11
    'No_Pred',  # 12
    'No_Pred',  # 13
    'NM',       # 14
    'No_Pred',  # 15
    'NM',       # 16
    'COM',      # 17
    'No_Pred',  # 18
    'COM',      # 19
    'CNM',      # 20
    'COM',      # 21
    'COM',      # 22
]))

In [None]:
# Pair the two outcomes of every sample: row position -> [model outcome, hard-coded BOQA outcome]
merged_dict = {
    position: [model_outcome, pred_unclear_dict_boqa[position]]
    for position, model_outcome in pred_unclear_dict.items()
}

In [None]:
# Display the merged mapping built in the previous cell (row position -> pair of outcomes)
merged_dict

In [None]:
# No Pred, CNM, COM, NM
index_dict = {"No_Pred":0, "CNM":1,"COM":2,"NM":3}
matrix_conf = np.zeros((4,4))
for idx, value in merged_dict.items():
    matrix_conf[index_dict[value[0]], index_dict[value[1]]] += 1
matrix_conf = matrix_conf.astype(int)

In [None]:
import itertools
np.set_printoptions(precision=2)

# Tick labels in the row/column order of matrix_conf
classes = ["No_pred","CNM","COM","NM"]
tick_marks = np.arange(len(classes))
fmt = 'd'
# Cells above half of the largest count get black text, the others yellow
thresh = matrix_conf.max() / 2.

# Plot non-normalized agreement matrix (rows: SVM, columns: BOQA)
plt.figure()
plt.imshow(matrix_conf, interpolation='nearest', cmap="viridis")
plt.colorbar()
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)

n_rows, n_cols = matrix_conf.shape
for i, j in itertools.product(range(n_rows), range(n_cols)):
    cell_count = matrix_conf[i, j]
    text_colour = "yellow" if cell_count <= thresh else "black"
    plt.text(j, i, format(cell_count, fmt), horizontalalignment="center",
             color=text_colour)

plt.tight_layout()
plt.ylabel('SVM Prediction')
plt.xlabel('BOQA Prediction')

plt.show()


In [None]:
import seaborn as sns

# Number of UNCLEAR samples per outcome, most frequent first (value_counts order)
pred_series = pd.Series(pred_unclear).value_counts()
fig_dims = (6, 4)
fig, ax = plt.subplots(figsize=fig_dims)
g = sns.barplot(x=pred_series.index, y=pred_series, ax=ax, palette=["tab:green", "tab:orange", "tab:blue","tab:red"])
# Write each count just above its bar. The Series is indexed by outcome name, so the
# values are walked positionally instead of being looked up with an integer key
# (integer keys on a non-integer index rely on a deprecated positional fallback).
for i, count in enumerate(pred_series):
    g.text(i, count + 0.1, count, color='black', ha="center")