In [4]:

# Import all datasets except boilerplate from data folder

from data.datasets import CreditScore, StudentAddiction, Thyroid

# Import all the necessary libraries

from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from data.boilerplate import Dataset as DatasetBoilerplate


from datetime import datetime
import pandas as pd
import joblib
import os
import json

# Import json and make a small helper function to keep the code clean
def load_json_file(file_path):
    """Read a JSON file and return the decoded Python object.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's content (dict, list, ...).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Decode explicitly as UTF-8: without it open() falls back to the
    # platform's locale encoding, which can mangle non-ASCII settings values.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)
    
def dataset_log(search: GridSearchCV, X_test, y_test, pred_model, labels):
    """Build the plain-text evaluation report for one fitted grid search.

    Args:
        search: A fitted ``GridSearchCV``; its ``best_estimator_``,
            ``best_params_``, ``best_score_`` and ``score`` are used.
        X_test: Held-out features the best estimator is evaluated on.
        y_test: True targets matching ``X_test``.
        pred_model: Name of the model, used as the report heading.
        labels: Label list forwarded to ``classification_report`` and
            ``confusion_matrix`` and echoed at the end of the report.

    Returns:
        A multi-line string with the best parameters, the scores, the
        classification report and the confusion matrix.
    """

    best_model = search.best_estimator_
    best_params = search.best_params_

    y_pred = best_model.predict(X_test)

    # Build the whole report as one formatted string.
    # ``best_score_`` is the mean cross-validated score of the best parameter
    # combination, not a score on the training set, so it is labelled as such.

    log_model = f"""
    MODEL {pred_model}:
    Best parameters: {best_params}
    Best cross-validation score: {search.best_score_}
    Test score: {search.score(X_test, y_test)}
    
    Classification report:
    {classification_report(y_test, y_pred, labels=labels)}
    Accuracy: {accuracy_score(y_test, y_pred)}
    
    Confusion matrix:
    {confusion_matrix(y_test, y_pred, labels=labels)}
    
    Labels: {labels}
    """
    return log_model


# Output folder for trained models
output = 'trained_models'
# Random state for reproducibility
random_state = 42
# Number of cores to use for parallel processing
n_jobs = 4


In [5]:

# Here it is possible to adjust the datasets and models that will be used in the project
# Set any of the pred_models to None if you don't want to use it

# Read the per-dataset training configuration from training_settings.json
# into `datasets`.

datasets = load_json_file(file_path='training_settings.json')

# e.g. if you want to remove the random forest ('rf') model from the credit_score dataset
# datasets['credit_score']['pred_models']['rf'] = None

In [6]:

######################################
### Run the models on the datasets ###
######################################

# Loop through all the datasets

# Record the current date and time to the nearest minute
now_format = datetime.now().strftime("%d_%m_%Y__%H-%M")

datasets = load_json_file('training_settings.json')

for dataset in datasets:
    
    log = f"Model training evaluation for {dataset} on {now_format}"
    
    
    # Get the model name from the dataset and load it, if it exists
    dataset_model_name = datasets[dataset]['data_model']
    smote = datasets[dataset]['smote']
    
    # Smote is applied to the data if it is set to True in the datase
    
    if dataset_model_name in globals():
        
        cleaned_data = globals()[dataset_model_name](smote=smote)
        
    else:
        print(f"Dataset {dataset} ({dataset_model_name}) not found")
        break
    
    
    # Split the data into target variable and featurest
    
    X = cleaned_data['X']
    y = cleaned_data['y']
        
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = cleaned_data.train_test_split(X, y)
    
    
    # Train all the models on the dataset
    for pred_model in datasets[dataset]['pred_models']:
        
        print(f"Running {pred_model} on {dataset}")
        
        # Set the steps and parameters for the pipeline
        pipe_params = {}

        match pred_model:
            
            case 'rf':
                steps = [('rf', RandomForestClassifier())]
                pipe_params.update({
                    'rf__n_estimators': [100, 200, 300],
                    'rf__max_depth': [None, 2, 4, 8, 10],
                    'rf__min_samples_split': [2, 5, 10],
                    'rf__min_samples_leaf': [1, 2, 4]
                })
                
            case 'rbf_svm':
                steps = [
                        ('scaler', StandardScaler()),
                        ('rbf_svm', SVC())]
                pipe_params.update({
                    'rbf_svm__C': [0.1, 1, 10, 100],
                    'rbf_svm__gamma': [1, 0.1, 0.01, 0.001],
                    'rbf_svm__kernel': ['rbf']
                })
                
            case 'dnn':
                model = MLPClassifier()
                pipe_params.update({
                    'hidden_layer_sizes': [(100,)],
                    'activation': ['relu', 'tanh'],
                    'alpha': [0.0001, 0.001, 0.01],
                    'learning_rate': ['constant', 'adaptive']
                })
                    
                
            case _:
                steps = None
                print(f"Model {pred_model} not found")
                
        # Apply override parameters
        pipe_params.update(datasets[dataset]['pred_models'][pred_model])
                
        pipe = Pipeline(steps=steps)
        pipe.fit(X_train, y_train)
        
        search = GridSearchCV(pipe, pipe_params, n_jobs=n_jobs)
        search.fit(X_train, y_train)
        
        
        # Create a folder with the datetime if it doesn't exist
        base = output + '/' + now_format
        if not os.path.exists(base):
            os.makedirs(base)
        
        # Save the model to the output folder with the dataset name and the model name
        joblib.dump(search.best_estimator_, f"{base}/{dataset}_{pred_model}.joblib")
        
        # Log the evaluation metrics to a log file, including the best parameters
        log += dataset_log(search, X_test, y_test, pred_model, cleaned_data.get_labels())
    
        
    # Save the log to a file
    with open(f"{base}/{dataset}_model_comparison.txt", 'w') as file:
        file.write(log)
        file.close()
        
        
        

Running rf on credit_score
