In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# Convenience function to create display a progress bar.
# Source : https://stackoverflow.com/questions/3173320/text-progress-bar-in-the-console
def print_progress_bar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█', printEnd = "\r"):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
        printEnd    - Optional  : end character (e.g. "\r", "\r\n") (Str)
    """
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    print(f'\r{prefix} |{bar}| {percent}% {suffix}', end = printEnd)
    # Print New Line on Complete
    if iteration == total:
        print()

script_dir = Path.cwd() 
df = pd.read_csv(script_dir.parent / '4 - Dataset' / "dataset_classification.csv")
df = df.sample(frac=1, random_state=42).reset_index(drop=True) #shuffle le dataset
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 553 entries, 0 to 552
Data columns (total 26 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   AC or DC                                      553 non-null    float64
 1   Aluminium concentration (%)                   553 non-null    float64
 2   Boron concentration (%)                       553 non-null    float64
 3   Carbon concentration (weight%)                553 non-null    float64
 4   Chromium concentration (weight%)              553 non-null    float64
 5   Copper concentration (weight%)                553 non-null    float64
 6   Current (A)                                   553 non-null    float64
 7   Electrode positive or negative                553 non-null    float64
 8   Heat input (kJ/mm)                            553 non-null    float64
 9   Interpass temperature (deg C)                 553 non-null    flo

In [3]:
X = df.drop(columns=['cluster'])
y = df['cluster']

cluster_mapping_inv = {
        'MoindreResistance/MeilleurDuctilite' : 1,
        'MeilleirResistance/MoindreDuctilite' : 2,
        'ResistanceIntermediaire/DuctiliteIntermediaire' : 0
    }

y= y.replace(cluster_mapping_inv)


  y= y.replace(cluster_mapping_inv)


In [4]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

def scale(X,scaler=MinMaxScaler()) :
    scaler = scaler
    X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
    return X_scaled

X_norm = scale(X, StandardScaler())

In [5]:
from sklearn.model_selection import train_test_split

def trainTest(X,y) :
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test

# Implémentation des modèles

In [6]:
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb
import lightgbm as lgb
import numpy as np
import warnings
warnings.filterwarnings('ignore')


In [22]:
def evaluate_model(model, X_train, X_test, y_train, y_test):
    # Prévisions et métriques
    y_pred_train = model.predict(X_train)  # Prédictions sur l'ensemble d'entraînement
    y_pred_test = model.predict(X_test)  # Prédictions sur l'ensemble de test

    accuracy_train = accuracy_score(y_train, y_pred_train)  # Accuracy sur l'ensemble d'entraînement
    accuracy_test = accuracy_score(y_test, y_pred_test)  # Accuracy sur l'ensemble de test

    precision = precision_score(y_test, y_pred_test, average='weighted')
    recall = recall_score(y_test, y_pred_test, average='weighted')
    f1 = f1_score(y_test, y_pred_test, average='weighted')
    
    # Validation croisée avec KFold
    kfold = KFold(n_splits=5)
    cross_val_accuracy = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy').mean()

    return {
        "Model": model.__class__.__name__,
        "Train Accuracy": accuracy_train,  # Ajout de l'accuracy sur l'ensemble d'entraînement
        "Test Accuracy": accuracy_test,  # Ajout de l'accuracy sur l'ensemble de test
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        #"Cross-Validated Accuracy": cross_val_accuracy,
        "Best Hyperparameters": model.get_params()
    }


def knn_classifier(X_train, X_test, y_train, y_test):
    param_grid = {
        'n_neighbors': list(range(1, 30)),
        'weights': ['uniform', 'distance']
    }
    model = KNeighborsClassifier()
    random_search = RandomizedSearchCV(model, param_grid, cv=5, n_iter=10, scoring='accuracy', n_jobs=-1)
    random_search.fit(X_train, y_train)
    
    best_model = random_search.best_estimator_
    return evaluate_model(best_model, X_train, X_test, y_train, y_test)

def svm_classifier(X_train, X_test, y_train, y_test):
    param_grid = {
        'C': np.logspace(-3, 3, 10),
        'kernel': ['linear', 'rbf']
    }
    model = SVC()
    random_search = RandomizedSearchCV(model, param_grid, cv=5, n_iter=10, scoring='accuracy', n_jobs=-1)
    random_search.fit(X_train, y_train)
    
    best_model = random_search.best_estimator_
    return evaluate_model(best_model, X_train, X_test, y_train, y_test)

def decision_tree_classifier(X_train, X_test, y_train, y_test):
    param_grid = {
        'max_depth': list(range(1, 20)),
        'min_samples_split': list(range(2, 20))
    }
    model = DecisionTreeClassifier()
    random_search = RandomizedSearchCV(model, param_grid, cv=5, n_iter=10, scoring='accuracy', n_jobs=-1)
    random_search.fit(X_train, y_train)
    
    best_model = random_search.best_estimator_
    return evaluate_model(best_model, X_train, X_test, y_train, y_test)

def random_forest_classifier(X_train, X_test, y_train, y_test):
    param_grid = {
        'n_estimators': [100, 200, 500],
        'max_depth': list(range(1, 20)),
        'min_samples_split': list(range(2, 20))
    }
    model = RandomForestClassifier()
    random_search = RandomizedSearchCV(model, param_grid, cv=5, n_iter=10, scoring='accuracy', n_jobs=-1)
    random_search.fit(X_train, y_train)
    
    best_model = random_search.best_estimator_
    return evaluate_model(best_model, X_train, X_test, y_train, y_test)

def gbm_classifier(X_train, X_test, y_train, y_test):
    param_grid = {
        'n_estimators': [100, 200, 500],
        'learning_rate': [0.001, 0.01, 0.1],
        'max_depth': list(range(1, 20))
    }
    model = GradientBoostingClassifier()
    random_search = RandomizedSearchCV(model, param_grid, cv=5, n_iter=10, scoring='accuracy', n_jobs=-1)
    random_search.fit(X_train, y_train)
    
    best_model = random_search.best_estimator_
    return evaluate_model(best_model, X_train, X_test, y_train, y_test)


def lightgbm_classifier(X_train, X_test, y_train, y_test):
    param_grid = {
        'n_estimators': [100, 200, 500],
        'learning_rate': [0.001, 0.01, 0.1],
        'max_depth': list(range(1, 20))
    }
    model = lgb.LGBMClassifier()
    random_search = RandomizedSearchCV(model, param_grid, cv=5, n_iter=10, scoring='accuracy', n_jobs=-1)
    random_search.fit(X_train, y_train)
    
    best_model = random_search.best_estimator_
    return evaluate_model(best_model, X_train, X_test, y_train, y_test)




# Résultat modèles

In [29]:
def evaluate_all_classification_models(X_train, X_test, y_train, y_test):
    results = []
    progress = 0
    # Display a progress bar
    print_progress_bar(progress, int(9), prefix='Progress:', suffix='Complete', length=50)


    # Appel des fonctions de classification et stockage des résultats
    results.append(knn_classifier(X_train, X_test, y_train, y_test))
    progress += 1
    print_progress_bar(progress, int(6), prefix='Progress:', suffix='Complete', length=50)
    results.append(svm_classifier(X_train, X_test, y_train, y_test))
    progress += 1
    print_progress_bar(progress, int(6), prefix='Progress:', suffix='Complete', length=50)
    results.append(decision_tree_classifier(X_train, X_test, y_train, y_test))
    progress += 1
    print_progress_bar(progress, int(6), prefix='Progress:', suffix='Complete', length=50)
    results.append(random_forest_classifier(X_train, X_test, y_train, y_test))
    progress += 1
    print_progress_bar(progress, int(6), prefix='Progress:', suffix='Complete', length=50)
    results.append(gbm_classifier(X_train, X_test, y_train, y_test))
    progress += 1
    print_progress_bar(progress, int(6), prefix='Progress:', suffix='Complete', length=50)
    results.append(lightgbm_classifier(X_train, X_test, y_train, y_test))
    progress += 1
    print_progress_bar(progress, int(6), prefix='Progress:', suffix='Complete', length=50)

    # Conversion des résultats en DataFrame
    return pd.DataFrame(results)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

results_df = evaluate_all_classification_models(X_train, X_test, y_train, y_test)
print(results_df)

Progress: |████████------------------------------------------| 16.7% Complete

KeyboardInterrupt: 

In [27]:
results_df.sort_values(by='Test Accuracy', ascending=False)

Unnamed: 0,Model,Train Accuracy,Test Accuracy,Precision,Recall,F1 Score,Best Hyperparameters
4,GradientBoostingClassifier,1.0,0.753012,0.7586,0.753012,0.754183,"{'ccp_alpha': 0.0, 'criterion': 'friedman_mse'..."
3,RandomForestClassifier,0.950904,0.740964,0.754512,0.740964,0.741053,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w..."
1,SVC,0.994832,0.728916,0.732565,0.728916,0.729736,"{'C': 46.41588833612773, 'break_ties': False, ..."
5,LGBMClassifier,1.0,0.698795,0.717822,0.698795,0.70297,"{'boosting_type': 'gbdt', 'class_weight': None..."
0,KNeighborsClassifier,1.0,0.686747,0.692178,0.686747,0.688484,"{'algorithm': 'auto', 'leaf_size': 30, 'metric..."
2,DecisionTreeClassifier,0.912145,0.650602,0.657326,0.650602,0.652148,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit..."
