In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, plot_confusion_matrix

## XGBoost Model

In [None]:
# Load the feature table and keep only the columns from position 9 onwards.
df = pd.read_csv("histo_feature.csv").iloc[:, 9:]
# Discard the rows labelled "OTHER" for now (original note: only 3 classes are kept).
other_index = df.loc[df["conclusion"] == "OTHER"].index
df = df.drop(index=other_index)
df = df.drop(columns="datetime")
# Remove columns that are entirely NaN or hold fewer than 5 non-NaN values (annotations).
df = df.dropna(axis="columns", thresh=5)
# Separate labels (first remaining column) from features (all other columns),
# then label-encode the labels as integers.
# Original author's note on the codes: NM:2, COM:1, CNM:0, OTHER:3
Y = df.iloc[:, 0]
X = df.iloc[:, 1:]
label_encoder = LabelEncoder().fit(Y)
label_encoded_y = label_encoder.transform(Y)
# Stratified 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    label_encoded_y,
    test_size=0.2,
    random_state=777,
    shuffle=True,
    stratify=label_encoded_y,
)
model = xgb.XGBClassifier()
model.fit(X_train, y_train)
# Predict the held-out set.
y_pred = model.predict(X_test)
predictions = list(map(round, y_pred))
# Report accuracy on the held-out set.
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

## RandomForest Model

In [None]:
# Load the feature table and keep only the columns from position 9 onwards.
df = pd.read_csv("histo_feature.csv")
df = df.iloc[:, 9:]
# Drop the "OTHER" rows for now (original note: only 3 classes are kept).
df = df.drop(df[df["conclusion"] == "OTHER"].index)
del df["datetime"]
# Remove columns that are entirely NaN or hold fewer than 5 non-NaN values (annotations).
df = df.dropna(axis=1, thresh=5)
# Replace the remaining NaN with 0. Reassign instead of mutating in place so
# the cell stays idempotent when re-run.
df = df.fillna(0)
# Separate labels (first remaining column) from features (all other columns),
# then label-encode the labels as integers.
# Original author's note on the codes: NM:2, COM:1, UNCLEAR:4, CNM:0, OTHER:3
X, Y = df.iloc[:, 1:], df.iloc[:, 0]
label_encoder = LabelEncoder().fit(Y)
label_encoded_y = label_encoder.transform(Y)
# Stratified 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, label_encoded_y, test_size=0.2, random_state=777, shuffle=True,
    stratify=label_encoded_y)
# Seed the forest as well: the split is seeded, but an unseeded forest makes
# the scores below change from one run to the next.
clf = RandomForestClassifier(random_state=777)
# cross_val_score fits clones of the estimator, so it can run before the final fit.
print(np.mean(cross_val_score(clf, X_train, y_train, cv=5)))
clf.fit(X_train, y_train)
# Predict the held-out set. predict() already returns integer class codes,
# so no rounding is needed.
y_pred = clf.predict(X_test)
predictions = y_pred
# Report accuracy on the held-out set.
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit, LeaveOneOut
from sklearn.model_selection import learning_curve
def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot mean training and cross-validation scores against training-set size.

    Parameters
    ----------
    estimator : estimator object
        Passed unchanged to ``learning_curve``.
    title : str
        Title set on the plot.
    X, y : array-like
        Training data and targets, passed unchanged to ``learning_curve``.
    axes : matplotlib Axes, optional
        Single Axes to draw on. When ``None`` a new one-panel figure is created.
    ylim : tuple, optional
        ``(ymin, ymax)`` unpacked into ``axes.set_ylim``.
    cv : optional
        Cross-validation specification forwarded to ``learning_curve``.
    n_jobs : int, optional
        Parallelism setting forwarded to ``learning_curve``.
    train_sizes : array-like
        Training-set sizes forwarded to ``learning_curve``.

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module, so the caller can chain ``.show()``.
    """
    if axes is None:
        # Only one panel is drawn, so request exactly one Axes: subplots(1, 1)
        # returns a single Axes object rather than an array, which the
        # attribute calls below require.
        _, axes = plt.subplots(1, 1, figsize=(7, 7))

    axes.set_title(title)
    if ylim is not None:
        axes.set_ylim(*ylim)
    axes.set_xlabel("Training examples")
    axes.set_ylabel("Score")

    # Fit times are not plotted, so they are not requested.
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # Average the per-fold scores for each training-set size.
    train_scores_mean = np.mean(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)

    # Plot learning curve
    axes.grid()
    axes.plot(train_sizes, train_scores_mean, 'o-', color="r",
              label="Training score")
    axes.plot(train_sizes, test_scores_mean, 'o-', color="g",
              label="Cross-validation score")
    axes.legend(loc="best")
    return plt


# Re-split the features/labels prepared in the RandomForest cell above
# (X, label_encoded_y) with a different seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, label_encoded_y, test_size=0.2, random_state=107, shuffle=True,
    stratify=label_encoded_y)
# Seed the forest too, so the learning curve, confusion matrix and accuracy
# are reproducible across runs.
clf = RandomForestClassifier(criterion="entropy", max_depth=4, n_jobs=8,
                             random_state=107)
clf = clf.fit(X_train, y_train)
fig, axes = plt.subplots(1, 1, figsize=(7, 7))
title = "Learning Curves (Random Forest)"
plot_learning_curve(clf, title, X_train, y_train, axes=axes, ylim=(0, 1.05),
                    cv=LeaveOneOut())
plt.show()

# Predict the held-out set. predict() already returns integer class codes,
# so no rounding is needed.
y_pred = clf.predict(X_test)
predictions = y_pred

# Evaluate predictions.
# Take the tick labels from the fitted encoder so they always match the
# encoded classes instead of relying on a hard-coded list.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; on newer
# versions switch to ConfusionMatrixDisplay.from_estimator (and update the import).
plot_confusion_matrix(clf, X_test, y_test, display_labels=label_encoder.classes_)
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))


In [None]:
# TODO: try to predict the UNCLEAR class with the model