In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
from tabulate import tabulate
import time
# Lift pandas' display limits so printed frames are never truncated
# (None means "no limit" for both options).
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)

In [3]:
# Read the protocols CSV into the DataFrame used by the rest of the notebook.
# NOTE(review): "/content/..." looks like a Colab-style absolute path — confirm
# the file location before running in another environment.
df = pd.read_csv(filepath_or_buffer="/content/Protocols_dataset.csv")

In [4]:
# Report the dataset dimensions: first the raw (rows, columns) tuple ...
data_shape = df.shape
print("Shape of the data:", data_shape)

# ... then the same information as a sentence.
row_count, column_count = data_shape
print(f"The data contains {row_count} rows and {column_count} columns.")

Shape of the data: (4327, 13)
The data contains 4327 rows and 13 columns.


In [6]:
# Candidate models; each one is trained and scored on the same hold-out split.
classifiers = [
    ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('SVM', SVC(kernel='linear', C=1)),
    ('K-Nearest Neighbors', KNeighborsClassifier(n_neighbors=5)),
    ('Decision Tree', DecisionTreeClassifier(random_state=42))
]

# One row per model: [name, accuracy, multi-line details string].
results_table = []
# Short description of each model, shown in the "Détails" column.
advantages = {
    'Random Forest': 'Ensemble d\'arbres de décision, robuste contre le surajustement, gère bien la non-linéarité',
    'SVM': 'Efficace dans les espaces de grande dimension, adapté aux problèmes linéaires et non linéaires',
    'K-Nearest Neighbors': 'Simple et intuitif, pas de temps d\'entraînement, peut gérer des problèmes multi-classes',
    'Decision Tree': 'Interprétable, gère à la fois les données numériques et catégorielles, utile pour la sélection de fonctionnalités'
}

# ---------------------------------------------------------------------------
# Feature preparation.
# None of this depends on the classifier, so it is done once here instead of
# being repeated on every loop iteration.
# ---------------------------------------------------------------------------
X = df.drop(columns=['Protocole'])  # Features
y = df['Protocole']  # Target variable

# Map each categorical column to integer codes; the fitted encoders are kept
# in `label_encoders` so the mapping can be inspected or inverted later.
categorical_features = ['sexe', 'motif d\'admission', 'type de greffe']
label_encoders = {}
for feature in categorical_features:
    label_encoders[feature] = LabelEncoder()
    X[feature] = label_encoders[feature].fit_transform(X[feature])

# Expand the two date columns into year / month / day integer features.
# NOTE(review): pd.to_datetime infers the date format — confirm that
# day/month order is parsed as intended for this dataset.
X['date de début de conditionnement/admission'] = pd.to_datetime(X['date de début de conditionnement/admission'])
X['date de la greffe/début du protocole'] = pd.to_datetime(X['date de la greffe/début du protocole'])

X['start_year'] = X['date de début de conditionnement/admission'].dt.year
X['start_month'] = X['date de début de conditionnement/admission'].dt.month
X['start_day'] = X['date de début de conditionnement/admission'].dt.day
X['protocol_start_year'] = X['date de la greffe/début du protocole'].dt.year
X['protocol_start_month'] = X['date de la greffe/début du protocole'].dt.month
X['protocol_start_day'] = X['date de la greffe/début du protocole'].dt.day

# The raw datetime columns are no longer needed once decomposed.
X = X.drop(columns=['date de début de conditionnement/admission', 'date de la greffe/début du protocole'])

numerical_features = ['age', 'poids', 'taille', 'surface corporelle', 'IMC', 'creatinine plasmatique', 'clairance creatinine', 'start_year', 'start_month', 'start_day', 'protocol_start_year', 'protocol_start_month', 'protocol_start_day']

# Split BEFORE scaling so the scaler never sees the test rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Work on explicit copies so the column assignments below do not write into
# slices of X (avoids pandas' SettingWithCopyWarning).
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training rows only, then apply the same transform to
# the test rows. Fitting on the full dataset would leak test-set statistics
# into training. Note that X itself is left unscaled.
scaler = StandardScaler()
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

# ---------------------------------------------------------------------------
# Train and evaluate each model on the shared split.
# ---------------------------------------------------------------------------
for classifier_name, classifier in classifiers:
    # Time only fit + predict; perf_counter is a monotonic clock intended for
    # measuring short durations.
    start_time = time.perf_counter()

    classifier.fit(X_train, y_train)

    y_pred = classifier.predict(X_test)

    end_time = time.perf_counter()
    runtime = end_time - start_time

    accuracy = accuracy_score(y_test, y_pred)

    # "Number of Classes" counts the distinct labels present in the test split.
    details = f"Number of Classes: {len(np.unique(y_test))}, Model: {classifier_name}\nRuntime: {runtime:.2f} seconds\nAdvantage: {advantages[classifier_name]}"

    results_table.append([classifier_name, accuracy, details])

# Render the comparison as a text grid (the second column holds accuracy_score).
headers = ["Modèle", "Précision", "Détails"]
print(tabulate(results_table, headers=headers, tablefmt="grid"))

+---------------------+-------------+------------------------------------------------------------------------------------------------------------------------------+
| Modèle              |   Précision | Détails                                                                                                                      |
| Random Forest       |    0.839492 | Number of Classes: 8, Model: Random Forest                                                                                   |
|                     |             | Runtime: 1.01 seconds                                                                                                        |
|                     |             | Advantage: Ensemble d'arbres de décision, robuste contre le surajustement, gère bien la non-linéarité                        |
+---------------------+-------------+------------------------------------------------------------------------------------------------------------------------------+
| SVM     