In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# Convenience function to create display a progress bar.
# Source : https://stackoverflow.com/questions/3173320/text-progress-bar-in-the-console
def print_progress_bar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█', printEnd = "\r"):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
        printEnd    - Optional  : end character (e.g. "\r", "\r\n") (Str)
    """
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    print(f'\r{prefix} |{bar}| {percent}% {suffix}', end = printEnd)
    # Print New Line on Complete
    if iteration == total:
        print()

script_dir = Path.cwd() 
df = pd.read_csv(script_dir.parent / '4 - Dataset' / "regression_weld_data.csv")
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1652 entries, 0 to 1651
Data columns (total 31 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   Carbon concentration (weight%)                1652 non-null   float64
 1   Silicon concentration (weight%)               1652 non-null   float64
 2   Manganese concentration (weight%)             1652 non-null   float64
 3   Sulphur concentration (weight%)               1652 non-null   float64
 4   Phosphorus concentration (weight%)            1652 non-null   float64
 5   Oxygen concentration (%)                      1652 non-null   float64
 6   Nitrogen concentration (%)                    1652 non-null   float64
 7   Current (A)                                   1652 non-null   float64
 8   Voltage (V)                                   1652 non-null   float64
 9   Heat input (kJ/mm)                            1652 non-null   f

In [17]:
df_yieldStrength = df.iloc[:,:26]
df_yieldStrength = df_yieldStrength.dropna()
df_yieldStrength = df_yieldStrength.reset_index(drop=True) #Réinitialiser l'indexation
#shuffle
df_yieldStrength = df_yieldStrength.sample(frac=1, random_state=42).reset_index(drop=True)
print(df_yieldStrength.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 780 entries, 0 to 779
Data columns (total 26 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   Carbon concentration (weight%)                780 non-null    float64
 1   Silicon concentration (weight%)               780 non-null    float64
 2   Manganese concentration (weight%)             780 non-null    float64
 3   Sulphur concentration (weight%)               780 non-null    float64
 4   Phosphorus concentration (weight%)            780 non-null    float64
 5   Oxygen concentration (%)                      780 non-null    float64
 6   Nitrogen concentration (%)                    780 non-null    float64
 7   Current (A)                                   780 non-null    float64
 8   Voltage (V)                                   780 non-null    float64
 9   Heat input (kJ/mm)                            780 non-null    flo

In [16]:
X = df.iloc[:,:25]
ys = df.iloc[:,25:]
print(ys.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 780 entries, 0 to 779
Data columns (total 1 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Yield strength (MPa)  780 non-null    float64
dtypes: float64(1)
memory usage: 6.2 KB
None


In [11]:
from sklearn.model_selection import train_test_split

def trainTest(X,y) :
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test


In [24]:
import numpy as np
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV, cross_val_predict, KFold, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import BayesianRidge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

# Fonction pour calculer le RMSE
def calculate_rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Configuration de la validation croisée (5 folds)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Fonction pour entraîner et évaluer le modèle Bayesian Ridge
def bayesian_ridge_regression(X_train, X_test, y_train, y_test):
    model = BayesianRidge()

    # Validation croisée sur l'ensemble d'entraînement
    y_train_pred_cv = cross_val_predict(model, X_train, y_train, cv=kf)

    # Ajustement du modèle et prédictions sur l'ensemble de test
    model.fit(X_train, y_train)
    y_test_pred = model.predict(X_test)

    # Calcul des scores
    rmse_train = calculate_rmse(y_train, y_train_pred_cv)
    rmse_test = calculate_rmse(y_test, y_test_pred)
    r2_train = r2_score(y_train, y_train_pred_cv)
    r2_test = r2_score(y_test, y_test_pred)

    return {"Model": "Bayesian Ridge", "RMSE_Train": rmse_train, "RMSE_Test": rmse_test, 
            "R2_Train": r2_train, "R2_Test": r2_test}

# Fonction pour entraîner et évaluer le modèle SVR avec Random Grid Search
def svr_regression(X_train, X_test, y_train, y_test):
    model = SVR()
    param_grid = {
        'kernel': ['rbf', 'poly'],
        'C': np.logspace(-3, 3, 7),
        'gamma': ['scale', 'auto'],
        'degree': [2, 3, 4]
    }
    random_search = RandomizedSearchCV(model, param_distributions=param_grid, n_iter=4, cv=kf, random_state=42)
    random_search.fit(X_train, y_train)
    best_model = random_search.best_estimator_

    # Validation croisée sur l'ensemble d'entraînement
    y_train_pred_cv = cross_val_predict(best_model, X_train, y_train, cv=kf)

    # Prédictions sur l'ensemble de test
    y_test_pred = best_model.predict(X_test)

    # Calcul des scores
    rmse_train = calculate_rmse(y_train, y_train_pred_cv)
    rmse_test = calculate_rmse(y_test, y_test_pred)
    r2_train = r2_score(y_train, y_train_pred_cv)
    r2_test = r2_score(y_test, y_test_pred)

    return {"Model": "SVR", "RMSE_Train": rmse_train, "RMSE_Test": rmse_test, 
            "R2_Train": r2_train, "R2_Test": r2_test}

# Fonction pour entraîner et évaluer le modèle de Gradient Boosting avec Random Grid Search
def gradient_boosting_regression(X_train, X_test, y_train, y_test):
    model = GradientBoostingRegressor()
    param_grid = {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.05],
        'max_depth': [3, 4, 5],
        'min_samples_split': [2, 5, 10]
    }
    random_search = RandomizedSearchCV(model, param_distributions=param_grid, n_iter=4, cv=kf, random_state=42)
    random_search.fit(X_train, y_train)
    best_model = random_search.best_estimator_

    # Validation croisée sur l'ensemble d'entraînement
    y_train_pred_cv = cross_val_predict(best_model, X_train, y_train, cv=kf)

    # Prédictions sur l'ensemble de test
    y_test_pred = best_model.predict(X_test)

    # Calcul des scores
    rmse_train = calculate_rmse(y_train, y_train_pred_cv)
    rmse_test = calculate_rmse(y_test, y_test_pred)
    r2_train = r2_score(y_train, y_train_pred_cv)
    r2_test = r2_score(y_test, y_test_pred)

    return {"Model": "Gradient Boosting", "RMSE_Train": rmse_train, "RMSE_Test": rmse_test, 
            "R2_Train": r2_train, "R2_Test": r2_test}

# Fonction pour entraîner et évaluer le modèle de Forêt Aléatoire avec Random Grid Search
def random_forest_regression(X_train, X_test, y_train, y_test):
    model = RandomForestRegressor()
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10]
    }
    random_search = RandomizedSearchCV(model, param_distributions=param_grid, n_iter=4, cv=kf, random_state=42)
    random_search.fit(X_train, y_train)
    best_model = random_search.best_estimator_

    # Validation croisée sur l'ensemble d'entraînement
    y_train_pred_cv = cross_val_predict(best_model, X_train, y_train, cv=kf)

    # Prédictions sur l'ensemble de test
    y_test_pred = best_model.predict(X_test)

    # Calcul des scores
    rmse_train = calculate_rmse(y_train, y_train_pred_cv)
    rmse_test = calculate_rmse(y_test, y_test_pred)
    r2_train = r2_score(y_train, y_train_pred_cv)
    r2_test = r2_score(y_test, y_test_pred)

    return {"Model": "Random Forest", "RMSE_Train": rmse_train, "RMSE_Test": rmse_test, 
            "R2_Train": r2_train, "R2_Test": r2_test}

# Fonction pour entraîner et évaluer la régression linéaire
def linear_regression(X_train, X_test, y_train, y_test):
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    # Validation croisée sur l'ensemble d'entraînement
    y_train_pred_cv = cross_val_predict(model, X_train, y_train, cv=kf)

    # Prédictions sur l'ensemble de test
    y_test_pred = model.predict(X_test)

    # Calcul des scores
    rmse_train = calculate_rmse(y_train, y_train_pred_cv)
    rmse_test = calculate_rmse(y_test, y_test_pred)
    r2_train = r2_score(y_train, y_train_pred_cv)
    r2_test = r2_score(y_test, y_test_pred)

    return {"Model": "Linear Regression", "RMSE_Train": rmse_train, "RMSE_Test": rmse_test, 
            "R2_Train": r2_train, "R2_Test": r2_test}

# Fonction pour entraîner et évaluer la régression Ridge avec Random Grid Search
def ridge_regression(X_train, X_test, y_train, y_test):
    model = Ridge()
    param_grid = {
        'alpha': [0.01, 0.1, 1, 10, 100]
    }
    random_search = RandomizedSearchCV(model, param_distributions=param_grid, n_iter=4, cv=kf, random_state=42)
    random_search.fit(X_train, y_train)
    best_model = random_search.best_estimator_

    # Validation croisée sur l'ensemble d'entraînement
    y_train_pred_cv = cross_val_predict(best_model, X_train, y_train, cv=kf)

    # Prédictions sur l'ensemble de test
    y_test_pred = best_model.predict(X_test)

    # Calcul des scores
    rmse_train = calculate_rmse(y_train, y_train_pred_cv)
    rmse_test = calculate_rmse(y_test, y_test_pred)
    r2_train = r2_score(y_train, y_train_pred_cv)
    r2_test = r2_score(y_test, y_test_pred)

    return {"Model": "Ridge Regression", "RMSE_Train": rmse_train, "RMSE_Test": rmse_test, 
            "R2_Train": r2_train, "R2_Test": r2_test}

# Fonction pour entraîner et évaluer la régression Lasso avec Random Grid Search
def lasso_regression(X_train, X_test, y_train, y_test):
    model = Lasso()
    param_grid = {
        'alpha': [0.01, 0.1, 1, 10, 100]
    }
    random_search = RandomizedSearchCV(model, param_distributions=param_grid, n_iter=4, cv=kf, random_state=42)
    random_search.fit(X_train, y_train)
    best_model = random_search.best_estimator_

    # Validation croisée sur l'ensemble d'entraînement
    y_train_pred_cv = cross_val_predict(best_model, X_train, y_train, cv=kf)

    # Prédictions sur l'ensemble de test
    y_test_pred = best_model.predict(X_test)

    # Calcul des scores
    rmse_train = calculate_rmse(y_train, y_train_pred_cv)
    rmse_test = calculate_rmse(y_test, y_test_pred)
    r2_train = r2_score(y_train, y_train_pred_cv)
    r2_test = r2_score(y_test, y_test_pred)

    return {"Model": "Lasso Regression", "RMSE_Train": rmse_train, "RMSE_Test": rmse_test, 
            "R2_Train": r2_train, "R2_Test": r2_test}

# Fonction principale qui exécute tous les modèles et retourne les résultats dans un DataFrame
def evaluate_all_models(X_train, X_test, y_train, y_test):
    results = []

    # Appel des fonctions de régression et stockage des résultats
    results.append(bayesian_ridge_regression(X_train, X_test, y_train, y_test))
    results.append(svr_regression(X_train, X_test, y_train, y_test))
    results.append(gradient_boosting_regression(X_train, X_test, y_train, y_test))
    results.append(random_forest_regression(X_train, X_test, y_train, y_test))
    results.append(linear_regression(X_train, X_test, y_train, y_test))
    results.append(ridge_regression(X_train, X_test, y_train, y_test))
    results.append(lasso_regression(X_train, X_test, y_train, y_test))
    # Conversion des résultats en DataFrame
    return pd.DataFrame(results)


def evaluateAllTarget(df, indexRangeFeatures , indexRangeTargets) :
    progress = 0
    # Display a progress bar
    print_progress_bar(progress, int(len(indexRangeTargets)), prefix = 'Progress:', suffix = 'Complete', length = 50)
    for i in range (len(indexRangeTargets)) :
        progress += 1
        print_progress_bar(progress, int(len(indexRangeTargets)), prefix = 'Progress:', suffix = 'Complete', length = 50)

        df_weld = df.iloc[:, indexRangeFeatures + [indexRangeTargets[i]] ]
        df_weld = df_weld.dropna()
        X = df_weld.iloc[:,indexRangeFeatures]
        y = df_weld.iloc[:,len(indexRangeFeatures)]
        col_name = df_weld.columns[len(indexRangeFeatures)]

        print(f"\n\nTaille du dataset avec target {col_name} : {df_weld.shape}")
        X_train, X_test, y_train, y_test = train_test_split(X,y)
        print(f"Résultat pour {col_name} avec pour moyenne {y.mean()}: ")
        df_results = evaluate_all_models(X_train, X_test, y_train, y_test)
        print(df_results)

#baye = bayesian_ridge_regression(X_train,  X_test, y_train, y_test)
#svr = svr_regression(X_train,  X_test, y_train, y_test)
#print(baye)
#print(svr)

evaluateAllTarget(df, list(range(25)), list(range(25,30)))

Progress: |██████████----------------------------------------| 20.0% Complete

Taille du dataset avec target Yield strength (MPa) : (780, 26)
Résultat pour Yield strength (MPa) avec pour moyenne 508.55717948717944: 
               Model  RMSE_Train  RMSE_Test  R2_Train   R2_Test
0     Bayesian Ridge   84.904131  79.159697  0.176272  0.229962
1                SVR   82.999739  80.881945  0.212810  0.196090
2  Gradient Boosting   54.248394  51.550607  0.663721  0.673433
3      Random Forest   59.242829  50.788595  0.598951  0.683017
4  Linear Regression   83.762880  77.457930  0.198268  0.262714
5   Ridge Regression   79.944565  76.394933  0.269696  0.282812
6   Lasso Regression   80.466548  76.703696  0.260128  0.277003
Progress: |████████████████████------------------------------| 40.0% Complete

Taille du dataset avec target Ultimate tensile strength (MPa) : (738, 26)
Résultat pour Ultimate tensile strength (MPa) avec pour moyenne 594.3863143631436: 
               Model  RMSE_Train  R