In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder


# Chargement du dataset

In [2]:
url = "heart_disease_uci.csv"
df = pd.read_csv(url)

# Affichage des premières lignes du datas

In [3]:
df.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


# Vérification des valeurs manquantes

In [4]:
df.isnull().sum()

id            0
age           0
sex           0
dataset       0
cp            0
trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
num           0
dtype: int64

In [None]:
df

# Encodage des variables catégorielles

In [23]:

label_encoder = LabelEncoder()
for column in df.select_dtypes(include=['object']).columns:
    df[column] = label_encoder.fit_transform(df[column])



In [24]:
df

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,1,0,3,145.0,233.0,1,0,150.0,0,2.3,0,0.0,0,0
1,2,67,1,0,0,160.0,286.0,0,0,108.0,1,1.5,1,3.0,1,2
2,3,67,1,0,0,120.0,229.0,0,0,129.0,1,2.6,1,2.0,2,1
3,4,37,1,0,2,130.0,250.0,0,1,187.0,0,3.5,0,0.0,1,0
4,5,41,0,0,1,130.0,204.0,0,0,172.0,0,1.4,2,0.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,916,54,0,3,0,127.0,333.0,1,2,154.0,0,0.0,3,,3,1
916,917,62,1,3,3,,139.0,0,2,,2,,3,,3,0
917,918,55,1,3,0,122.0,223.0,1,2,100.0,0,0.0,3,,0,2
918,919,58,1,3,0,,385.0,1,0,,2,,3,,3,0


# Remplissage des valeurs manquantes

In [31]:
# Remplissage par la moyenne pour les colonnes numériques
df['trestbps'].fillna(df['trestbps'].mean(), inplace=True)
df['chol'].fillna(df['chol'].mean(), inplace=True)
df['thalch'].fillna(df['thalch'].mean(), inplace=True)
df['oldpeak'].fillna(df['oldpeak'].mean(), inplace=True)
df['ca'].fillna(df['ca'].median(), inplace=True)

# Remplissage par la valeur la plus fréquente pour les colonnes catégorielles
categorical_columns = ['sex', 'dataset', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal']

for column in categorical_columns:
    df[column].fillna(df[column].mode()[0], inplace=True)



# Vérification les valeurs manquantes après le remplissage

In [32]:
df.isnull().sum()

id          0
age         0
sex         0
dataset     0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalch      0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
num         0
dtype: int64

# Séparation des features et de la cible

In [34]:
X = df.drop('num', axis=1)
y = df['num']
X,y

(      id  age  sex  dataset  cp    trestbps   chol  fbs  restecg      thalch  \
 0      1   63    1        0   3  145.000000  233.0    1        0  150.000000   
 1      2   67    1        0   0  160.000000  286.0    0        0  108.000000   
 2      3   67    1        0   0  120.000000  229.0    0        0  129.000000   
 3      4   37    1        0   2  130.000000  250.0    0        1  187.000000   
 4      5   41    0        0   1  130.000000  204.0    0        0  172.000000   
 ..   ...  ...  ...      ...  ..         ...    ...  ...      ...         ...   
 915  916   54    0        3   0  127.000000  333.0    1        2  154.000000   
 916  917   62    1        3   3  132.132404  139.0    0        2  137.545665   
 917  918   55    1        3   0  122.000000  223.0    1        2  100.000000   
 918  919   58    1        3   0  132.132404  385.0    1        0  137.545665   
 919  920   62    1        3   1  120.000000  254.0    0        0   93.000000   
 
      exang   oldpeak  slo

# Normalisation des données

In [35]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Division du dataset en ensembles d'entraînement et de test

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test

(array([[ 1.58332129,  0.90122351,  0.51693097, ...,  1.20424885,
         -0.36139973,  0.82954756],
        [-0.00941333,  0.05192709,  0.51693097, ...,  1.20424885,
         -0.36139973,  0.82954756],
        [ 1.2707989 , -0.26655907,  0.51693097, ...,  1.20424885,
         -0.36139973,  0.82954756],
        ...,
        [ 1.50801469,  2.28133018,  0.51693097, ..., -1.8675874 ,
         -0.36139973, -0.22749365],
        [-0.09225059, -0.05423496, -1.93449426, ..., -0.84364199,
         -0.36139973,  0.82954756],
        [-1.3461055 ,  0.37041325, -1.93449426, ...,  0.18030343,
          1.22945077, -1.28453487]]),
 array([[-0.52902887, -1.85898985,  0.51693097, ...,  1.20424885,
         -0.36139973,  0.82954756],
        [-0.31063973, -0.90353138,  0.51693097, ...,  1.20424885,
         -0.36139973,  0.82954756],
        [ 0.29557841, -0.58504522,  0.51693097, ..., -0.84364199,
         -0.36139973,  0.82954756],
        ...,
        [ 1.68121987,  0.79506145,  0.51693097, ...,  

# Construire un arbre de décision avec scikit-learn

In [37]:
from sklearn.tree import DecisionTreeClassifier

# Création du modèle d'arbre de décision
model = DecisionTreeClassifier(criterion='entropy', max_depth=3)

# Entraînement du modèle
model.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=3)

# Évaluation du modèle

In [38]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Prédictions sur l'ensemble de test
y_pred = model.predict(X_test)

# Évaluation du modèle
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Confusion Matrix:\n{conf_matrix}")
print(f"Classification Report:\n{class_report}")

Accuracy: 0.6086956521739131
Confusion Matrix:
[[66  9  0  0  0]
 [11 40  3  0  0]
 [ 4 15  6  0  0]
 [ 3 21  2  0  0]
 [ 1  2  1  0  0]]
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.88      0.82        75
           1       0.46      0.74      0.57        54
           2       0.50      0.24      0.32        25
           3       0.00      0.00      0.00        26
           4       0.00      0.00      0.00         4

    accuracy                           0.61       184
   macro avg       0.35      0.37      0.34       184
weighted avg       0.52      0.61      0.55       184



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Optimiser les hyperparamètres

In [40]:
from sklearn.model_selection import GridSearchCV

# Définir les paramètres à tester
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [3, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 5, 10],
    'max_features': ['sqrt', 'log2'],
    
}
# Créer un objet GridSearchCV
grid_search = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=param_grid, cv=5)

# Entraîner le modèle avec GridSearch
grid_search.fit(X_train, y_train)

# Afficher les meilleurs paramètres
print(f"Best Parameters: {grid_search.best_params_}")

# Évaluer le modèle avec les meilleurs paramètres
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)
accuracy_best = accuracy_score(y_test, y_pred_best)
print(f"Best Model Accuracy: {accuracy_best}")

Best Parameters: {'class_weight': None, 'criterion': 'gini', 'max_depth': 5, 'max_features': None, 'min_samples_leaf': 5, 'min_samples_split': 20}
Best Model Accuracy: 0.5760869565217391
