In [5]:
# from pathlib import Path
# import nbformat

# def load_notebook(notebook_path):
#     with open(notebook_path, 'r', encoding='utf-8') as f:
#         nb = nbformat.read(f, as_version=4)
#     code_cells = [cell.source for cell in nb.cells if cell.cell_type == 'code']
#     exec('\n'.join(code_cells), globals())

# # import written function and variable

# parent_directory = Path('./')
# data_preprocessing_utils_path = parent_directory / 'data_preprocessing_utils.ipynb'
# load_notebook(data_preprocessing_utils_path)

# Obtain and log best training and model parameters

In [None]:
import ast
import math
import numpy as np

def obtain_best_train_params(model_name, record_data_df):
    """Select the best training hyper-parameters per metric and log them.

    For "AutoInt" and "MLP" the records are first restricted to one fixed
    architecture and to the final epoch ('100th/100'); records of any other
    model name are used unfiltered. For each of MSE, MAE and RMSE the (up to)
    five rows with the smallest metric value are printed and written to
    ``{model_name}_train_best_params.txt`` in the current working directory.

    Args:
        model_name: Model identifier, used as the column-name prefix
            (e.g. ``MLP_dropout``) and as the output file-name prefix.
        record_data_df: DataFrame of recorded runs. Must contain 'Batch_size',
            'DL_learning_rate', 'MSE', 'MAE', 'RMSE' and the model-specific
            dropout column ('{model_name}_dropout' for MLP,
            '{model_name}_dropouts' otherwise).

    Returns:
        dict: ``{metric: {'1'..'5': {...}}}`` where each filled slot holds
        'batch_size', 'dl_learning_rate' and 'dropout'. Slots without a
        matching row stay empty dicts.
    """
    metrics = ['MSE', 'MAE', 'RMSE']
    train_best_params = {
        metric: {str(rank): {} for rank in range(1, 6)} for metric in metrics
    }

    if model_name == "AutoInt":
        record_data_df = record_data_df[(record_data_df[f"{model_name}_atten_embed_dim"]==32) &
                                    (record_data_df[f"{model_name}_num_layers"]==3) &
                                    (record_data_df[f"{model_name}_mlp_dims"]=='(256, 256, 256)') &
                                    (record_data_df["Epoch/Epochs_num"]=='100th/100')]
    elif model_name == "MLP":
        record_data_df = record_data_df[(record_data_df[f"{model_name}_embed_dims"]=='(256, 256, 256)') &
                                    (record_data_df["Epoch/Epochs_num"]=='100th/100')]

    batch_size = 'Batch_size'
    dl_learning_rate = 'DL_learning_rate'
    dropouts = f'{model_name}_dropouts'
    dropout = f'{model_name}_dropout'

    with open(f"{model_name}_train_best_params.txt", "w") as f:
        for metric in metrics:
            # The section header is emitted once per metric, in both outputs.
            print(f'Train {metric}:\n')
            f.write(f'Train {metric}:\n')

            best_indices = record_data_df[metric].nsmallest(5).index
            for n, index in enumerate(best_indices, start=1):
                entry = train_best_params[metric][str(n)]
                entry['batch_size'] = int(record_data_df.loc[index, batch_size])
                entry['dl_learning_rate'] = float(record_data_df.loc[index, dl_learning_rate])
                if model_name == "MLP":
                    entry['dropout'] = float(record_data_df.loc[index, dropout])
                else:  # AutoInt: the cell holds a Python literal stored as text
                    entry['dropout'] = ast.literal_eval(record_data_df.loc[index, dropouts])

                mse = record_data_df.loc[index, 'MSE']
                mae = record_data_df.loc[index, 'MAE']
                rmse = record_data_df.loc[index, 'RMSE']

                print(f'{n}. ',
                      "batch_size:", entry['batch_size'],
                      "dl_learning_rate:", entry['dl_learning_rate'],
                      "dropout:", entry['dropout'],
                      "MSE", mse,
                      "MAE", mae,
                      "RMSE", rmse,
                      "Index:", index)
                # One entry per line so the file mirrors the console output.
                f.write(f"{n}. "
                        f" batch_size: {entry['batch_size']}"
                        f" dl_learning_rate: {entry['dl_learning_rate']}"
                        f" dropout: {entry['dropout']}"
                        f" MSE: {mse}"
                        f" MAE: {mae}"
                        f" RMSE: {rmse}"
                        f" Index: {index}\n")
            print("\n")
            f.write("\n")

    return train_best_params
    
def obtain_best_all_params(model_name, record_data, train_best_params=None):
    """Collect the five best parameter sets per metric for a model and log them.

    * "AutoInt" / "MLP": for each training metric, the records are restricted
      to the rank-'1' training parameters in ``train_best_params`` (batch
      size, learning rate, dropout) at the final epoch ('100th/100'); the five
      rows with the smallest value of each model metric are then collected.
    * "FM" / "XGBoost" / "RandomForest": the five rows with the smallest value
      of each metric are collected directly (FM only from the final epoch).

    The result is printed with ``print_nested_dict`` and written to
    ``{model_name}_all_best_params.txt`` with ``write_nested_dict_to_file``
    (both defined elsewhere in the notebook).

    Args:
        model_name: One of "AutoInt", "MLP", "FM", "XGBoost", "RandomForest".
            Also used as the column-name and file-name prefix.
        record_data: DataFrame of recorded runs with 'MSE', 'MAE', 'RMSE' and
            the model-specific parameter columns.
        train_best_params: Output of ``obtain_best_train_params``; required
            for "AutoInt" and "MLP", ignored otherwise.

    Returns:
        dict: nested best-parameter dictionary (rank slots without a matching
        row stay empty), or ``0`` when the model is unsupported or the
        required ``train_best_params`` is missing.
    """
    metrics = ["MSE", "MAE", "RMSE"]
    batch_size = 'Batch_size'
    dl_learning_rate = 'DL_learning_rate'

    if model_name in ["AutoInt", "MLP"]:

        if train_best_params is None:
            print("AutoInt and MLP need training params first !")
            return 0

        best_all_params = {
            f"Train_{train_metric}": {
                f"Model_{model_metric}": _empty_rank_slots() for model_metric in metrics
            }
            for train_metric in metrics
        }

        for train_metric in metrics:
            best_train = train_best_params[train_metric]['1']
            metric_batch_size = best_train['batch_size']
            metric_dl_learning_rate = best_train['dl_learning_rate']
            metric_dropout = best_train['dropout']

            candidates = record_data[(record_data[batch_size] == metric_batch_size) &
                                     (record_data[dl_learning_rate] == metric_dl_learning_rate) &
                                     (record_data["Epoch/Epochs_num"] == '100th/100')]
            if model_name == "AutoInt":
                # AutoInt dropouts are stored as text, so compare against the
                # string form of the parsed value.
                candidates = candidates[candidates[f'{model_name}_dropouts'] == str(metric_dropout)]
            else:  # MLP
                candidates = candidates[candidates[f'{model_name}_dropout'] == metric_dropout]

            for model_metric in metrics:
                slots = best_all_params[f"Train_{train_metric}"][f"Model_{model_metric}"]
                best_indices = candidates[model_metric].nsmallest(5).index
                for rank, index in enumerate(best_indices, start=1):
                    entry = slots[str(rank)]
                    entry['batch_size'] = int(candidates.loc[index, batch_size])
                    entry['dl_learning_rate'] = float(candidates.loc[index, dl_learning_rate])

                    if model_name == "AutoInt":
                        entry['dropout'] = ast.literal_eval(candidates.loc[index, f'{model_name}_dropouts'])
                        entry['atten_embed_dim'] = int(candidates.loc[index, f"{model_name}_atten_embed_dim"])
                        entry['num_layers'] = int(candidates.loc[index, f"{model_name}_num_layers"])
                        entry['mlp_dims'] = ast.literal_eval(candidates.loc[index, f"{model_name}_mlp_dims"])
                    else:  # MLP
                        entry['dropout'] = float(candidates.loc[index, f'{model_name}_dropout'])
                        entry['embed_dims'] = ast.literal_eval(candidates.loc[index, f"{model_name}_embed_dims"])

    elif model_name in ["FM", "XGBoost", "RandomForest"]:

        best_all_params = {metric: _empty_rank_slots() for metric in metrics}

        candidates = record_data
        if model_name == "FM":
            candidates = candidates[candidates["Epoch/Epochs_num"] == '100th/100']

        for metric in metrics:
            best_indices = candidates[metric].nsmallest(5).index
            for rank, index in enumerate(best_indices, start=1):
                entry = best_all_params[metric][str(rank)]

                if model_name == "FM":
                    entry['factors_num'] = int(candidates.loc[index, f"{model_name}_factors_num"])
                    entry['batch_size'] = int(candidates.loc[index, batch_size])
                    entry['dl_learning_rate'] = float(candidates.loc[index, dl_learning_rate])

                elif model_name == "XGBoost":
                    entry['colsample_bytree'] = float(candidates.loc[index, f"{model_name}_colsample_bytree"])
                    entry['lambda'] = float(candidates.loc[index, f"{model_name}_lambda"])
                    entry['learning_rate'] = float(candidates.loc[index, f"{model_name}_learning_rate"])
                    entry['max_depth'] = int(candidates.loc[index, f"{model_name}_max_depth"])
                    entry['min_child_weight'] = int(candidates.loc[index, f"{model_name}_min_child_weight"])
                    entry['n_estimators'] = int(candidates.loc[index, f"{model_name}_n_estimators"])
                    entry['subsample'] = float(candidates.loc[index, f"{model_name}_subsample"])

                else:  # RandomForest
                    max_depth = candidates.loc[index, f"{model_name}_max_depth"]
                    # A missing (NaN) depth is mapped to None.
                    entry['max_depth'] = None if math.isnan(max_depth) else int(max_depth)
                    entry['max_features'] = _parse_rf_max_features(
                        candidates.loc[index, f"{model_name}_max_features"])
                    entry['min_samples_leaf'] = int(candidates.loc[index, f"{model_name}_min_samples_leaf"])
                    entry['min_samples_split'] = int(candidates.loc[index, f"{model_name}_min_samples_split"])
                    entry['n_estimators'] = int(candidates.loc[index, f"{model_name}_n_estimators"])

    else:
        print("Currently support FM, AutoInt, MLP, XGBoost, Random Forest !")
        return 0

    print_nested_dict(best_all_params)
    with open(f"{model_name}_all_best_params.txt", "w") as f:
        write_nested_dict_to_file(best_all_params, f)

    return best_all_params


def _empty_rank_slots():
    """Return five empty rank slots keyed '1'..'5'."""
    return {str(rank): {} for rank in range(1, 6)}


def _parse_rf_max_features(value):
    """Convert a recorded RandomForest ``max_features`` cell to a Python value.

    Digit-only strings become ints and other strings are returned unchanged
    (the original behaviour). Missing cells (None / NaN) become None and
    integer scalars become plain ints instead of raising AttributeError on
    ``.isdigit()``; any other value is returned as-is.
    """
    if isinstance(value, str):
        return int(value) if value.isdigit() else value
    if value is None or (isinstance(value, float) and math.isnan(value)):
        return None
    if isinstance(value, (int, np.integer)) and not isinstance(value, bool):
        return int(value)
    return value
    

# Testing with mlflow experiment name (run id)

In [None]:
import mlflow
from mlflow.models import infer_signature
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchinfo import summary
from torch.utils.data import DataLoader, TensorDataset
import tqdm
from sklearn.metrics import mean_squared_error, root_mean_squared_error, mean_absolute_error
from sklearn.metrics import accuracy_score, auc, roc_curve, f1_score
from sklearn.metrics import precision_score, recall_score, confusion_matrix, roc_auc_score
import os
import numpy as np
import pandas as pd
from datetime import datetime

def test_with_mlflow_experiment_name(X_train, y_train, X_valid, y_valid, model_name, 
                                experiment_name, metric, sorting='ASC', top_n=1, save_records=True):
    
    if "tv" not in experiment_name:
        print("Experiment must come from validation experiment !")
        return 0
    
    experiment = mlflow.get_experiment_by_name(experiment_name)

    experiment_id = 0
    if experiment is not None:
        experiment_id = experiment.experiment_id
    else:
        print("Experiment is empty !")
        return 0
    
    search_params = {
        "order_by": [f"metrics.eval_{metric.lower()} {sorting}"],
        "max_results": top_n
    }
    results = mlflow.search_runs(experiment_ids=experiment_id, 
                                 order_by=search_params["order_by"], 
                                 max_results=search_params["max_results"])
    run_id = 0
    if not results.empty:
        run_id = results.iloc[0]["run_id"]
        run_id = str(run_id)
        print(f"Run ID from best {metric} params in {experiment_name}:", run_id)
    else:
        print("Run ID is empty !")
        return 0

    run_info = mlflow.get_run(run_id)
    params = run_info.data.params
    
    print_nested_dict(params)
    
    phase = "test"
    
    deep_learning_model_names = ["FM", "MLP", "DeepFM", "AFM", "DCN", "xDeepFM", "AutoInt", "AFN"]
    machine_learning_model_names = ["Linear", "KNN", "SVM", "DecisionTree", "RandomForest", 
                                    "AdaBoost", "XGBoost"]
    regression_loss_list = ["MSE, RMSE", "MAE"]
    classification_loss_list = ["CrossEntropy", "BinaryCrossEntropy"]
    
    if save_records:
            
        validation_column_list = [
            "Timestamp", "Model_name", *params.keys(), "RMSE", "MSE", "MAE", "Accuracy", "AUC_score", "F1", "Precision", "Recall", "Specificity"
        ]

        # Create folder if not exists
        folder_name = f"{model_name}_result_records"
        if not os.path.exists(folder_name):
            os.makedirs(folder_name)

        validation_file_path = os.path.join(folder_name, f'{model_name}_testing_result_records_with_runID.csv')

        if not os.path.exists(validation_file_path):
            empty_df = pd.DataFrame(columns=validation_column_list)
            empty_df.to_csv(validation_file_path, index=False, float_format='%.6f')

        else:
            record_df = pd.read_csv(validation_file_path)
            columns_to_compare = ["Model_name", *params.keys()]
            record_df = record_df[columns_to_compare]
            record_df = record_df.values.tolist()
            record_df = [[str(value) for value in onelist] for onelist in record_df]
            temp_df_values = [[model_name, *params.values()]]
            temp_df = pd.DataFrame(temp_df_values, columns=columns_to_compare)
            temp_df = temp_df.values.tolist()[0]
            temp_df = [str(value) for value in temp_df]

            if temp_df in record_df:
                print("Parameters already exists in test results !")
                return 0

    else:
        print("Test results are not saved to csv !")
    
    
    if model_name in deep_learning_model_names:
 
        with mlflow.start_run(run_name=f"Trained {metric.upper()} params") as run:
            
            mlflow.log_params(params) # Log training parameters
            model_dir = f"runs:/{run_id}/{model_name}_model"
            model = mlflow.pytorch.load_model(model_dir) # Load the PyTorch model from the specified directory
            
            # Log model summary.
            with open(f"{model_name}_model_summary.txt", "w") as f:
                f.write(str(summary(model)))
            mlflow.log_artifact(f"{model_name}_model_summary.txt")

            # Convert to float tensor
            X_train_tensor = torch.from_numpy(X_train).float()
            Xi_train_tensor = torch.arange(X_train_tensor.size(1)).unsqueeze(0).unsqueeze(-1).repeat(X_train_tensor.size(0), 1, 1).int()
            y_train_tensor = torch.from_numpy(y_train).float()
            X_valid_tensor = torch.from_numpy(X_valid).float()
            Xi_valid_tensor = torch.arange(X_valid_tensor.size(1)).unsqueeze(0).unsqueeze(-1).repeat(X_valid_tensor.size(0), 1, 1).int()
            y_valid_tensor = torch.from_numpy(y_valid).float()

            # Convert data to DataLoader
            train_dataset = TensorDataset(Xi_train_tensor, X_train_tensor, y_train_tensor)
            train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
            validation_dataset = TensorDataset(Xi_valid_tensor, X_valid_tensor, y_valid_tensor)
            validation_loader = DataLoader(validation_dataset, batch_size=len(validation_data), shuffle=True)
            
            y_pred = 0
            validation_results_df = []
            criterion = 0
            
            print(f"Start testing with best {metric.upper()} params ...")
            
            for t, (xi, x, y_true) in enumerate(validation_loader):

                if model_name == "DeepFM":
                    y_pred = model(xi, x)
                elif model_name == "MLP":
                    y_pred = model(x)
                    y_pred = y_pred.view(-1)
                else:
                    y_pred = model(x)
            
            # Calculate metric values
            loss = 0.0

            rmse = 0.0
            mse = 0.0
            mae = 0.0
            accuracy = 0.0
            auc_score = 0.0
            f1 = 0.0
            precision = 0.0
            recall = 0.0
            specificity = 0.0

            if params['task_type'] == "Regression":
                criterion = nn.MSELoss()
                if params['loss_type'] == "RMSE":
                    eps = 1e-6
                    if model_name == "FM":
                        loss = torch.sqrt(criterion(y_pred, y_true.view(-1, 1)) + eps)
                    else:
                        loss = torch.sqrt(criterion(y_pred, y_true.view(-1)) + eps)
                elif params['loss_type'] == "MSE":
                    if model_name == "FM":
                        loss = criterion(y_pred, y_true.view(-1, 1))
                    else:        
                        loss = criterion(y_pred, y_true.view(-1))
                else:
                    print(f"Please make sure loss type is in {regression_loss_list}")
                    return 0

                y_true = y_true.detach().numpy() # Convert PyTorch tensor to NumPy array
                y_pred = y_pred.detach().numpy() # Convert PyTorch tensor to NumPy array
                y_pred = np.nan_to_num(y_pred, nan=0) # prevent error
                predictions = y_pred
                
                rmse = root_mean_squared_error(y_true, y_pred)
                mse = mean_squared_error(y_true, y_pred)
                mae = mean_absolute_error(y_true, y_pred)

            elif params['task_type'] == "Classification":
                if params['loss_type'] == "BCE":
                        criterion = nn.BCELoss()
                        y_pred = torch.sigmoid(y_pred)
                elif params['loss_type'] == "CE":
                    criterion = nn.CrossEntropyLoss()
                else:
                    print(f"Please make sure loss type is in {classification_loss_list}")
                    return 0

                loss = criterion(y_pred, y_true)

                y_true = y_true.detach().numpy() # Convert PyTorch tensor to NumPy array
                y_pred = y_pred.detach().numpy() # Convert PyTorch tensor to NumPy array
                predictions = y_pred
                
                accuracy = accuracy_score(y_true, y_pred) # Accuracy
                auc_score = roc_auc_score(y_true, y_pred) # AUC
                f1 = f1_score(y_true, y_pred) # F1-score
                precision = precision_score(y_true, y_pred) # Precision
                recall = recall_score(y_true, y_pred) # Recall
                specificity = recall_score(y_true, y_pred, pos_label=1) # Specificity (True Negative Rate)

            else:
                print("Please make sure the task is regression or classification !")
                return 0

#             mlflow.log_metric(f"{phase}_loss", f"{loss:6f}")
            mlflow.log_metric(f"{phase}_rmse", f"{rmse:6f}")
            mlflow.log_metric(f"{phase}_mse", f"{mse:6f}")
            mlflow.log_metric(f"{phase}_mae", f"{mae:6f}")

            mlflow.log_metric(f"{phase}_accuracy", f"{accuracy:6f}")
            mlflow.log_metric(f"{phase}_auc_score", f"{auc_score:6f}")
            mlflow.log_metric(f"{phase}_f1", f"{f1:6f}")
            mlflow.log_metric(f"{phase}_precision", f"{precision:6f}")
            mlflow.log_metric(f"{phase}_recall", f"{recall:6f}")
            mlflow.log_metric(f"{phase}_specificity", f"{specificity:6f}")

            # Save the trained model to MLflow.
            input_example = X_train[0]
            signatures = infer_signature(input_example, predictions)
            mlflow.pytorch.log_model(model, f"{model_name}_model", signature=signatures)
            
            value_list = [
                datetime.now().strftime('%Y-%m-%d %H:%M:%S'), model_name, *params.values(), 
                rmse, mse, mae, accuracy, auc_score, f1, precision, recall, specificity
            ]

            validation_results_df.append(value_list)
            
            # Save training and validation results to file
            validation_records_df = pd.DataFrame(validation_results_df, columns=validation_column_list)
            validation_records_df.to_csv(validation_file_path, mode='a', header=False, index=False, float_format='%.6f')
    
    elif model_name in machine_learning_model_names:
        
        with mlflow.start_run(run_name=f"Trained {metric.upper()} params") as run:
            
            mlflow.log_params(params) # Log training parameters
            model_dir = f"runs:/{run_id}/{model_name}_model"
            model = 0
            
            if model_name in ["Linear", "KNN", "SVM", "DecisionTree", "RandomForest", "AdaBoost"]:
                model = mlflow.sklearn.load_model(model_dir)
            elif model_name == "XGBoost":
                model = mlflow.xgboost.load_model(model_dir)
            elif model_name == "CatBoost":
                model = mlflow.catboost.load_model(model_dir)
            else:
                print(f"Model not in {machine_learning_model_names}, couldn't load the model !")
                        
            validation_results_df = []

            print(f"Start testing with best {metric.upper()} params ...")

            y_pred = model.predict(X_valid)
            y_pred = np.nan_to_num(y_pred, nan=0) # prevent error
            predictions = y_pred
            
            # Calculate metric values
            rmse = 0.0
            mse = 0.0
            mae = 0.0
            accuracy = 0.0
            auc_score = 0.0
            f1 = 0.0
            precision = 0.0
            recall = 0.0
            specificity = 0.0

            if params['task_type'] == "Regression":
                rmse = root_mean_squared_error(y_valid, y_pred)
                mse = mean_squared_error(y_valid, y_pred)
                mae = mean_absolute_error(y_valid, y_pred)
            elif params['task_type'] == "Classification":
                accuracy = accuracy_score(y_valid, y_pred) # Accuracy
                auc_score = roc_auc_score(y_valid, y_pred) # AUC
                f1 = f1_score(y_valid, y_pred) # F1-score
                precision = precision_score(y_valid, y_pred) # Precision
                recall = recall_score(y_valid, y_pred) # Recall
                specificity = recall_score(y_valid, y_pred, pos_label=1) # Specificity (True Negative Rate)
            else:
                print("Please make sure the task is regression or classification !")
                return 0

            mlflow.log_metric(f"{phase}_rmse", f"{rmse:4f}")
            mlflow.log_metric(f"{phase}_mse", f"{mse:4f}")
            mlflow.log_metric(f"{phase}_mae", f"{mae:4f}")
            mlflow.log_metric(f"{phase}_accuracy", f"{accuracy:4f}")
            mlflow.log_metric(f"{phase}_auc_score", f"{auc_score:4f}")
            mlflow.log_metric(f"{phase}_f1", f"{f1:4f}")
            mlflow.log_metric(f"{phase}_precision", f"{precision:4f}")
            mlflow.log_metric(f"{phase}_recall", f"{recall:4f}")
            mlflow.log_metric(f"{phase}_specificity", f"{specificity:4f}")

            input_example = X_train[0]
            signatures = infer_signature(input_example, predictions)
            # Save the trained model to MLflow.
            if model_name in ["Linear", "KNN", "SVM", "DecisionTree", "RandomForest", "AdaBoost"]:
                mlflow.sklearn.log_model(model, f"{model_name}_model", signature=signatures)
    #                 mlflow.sklearn.log_model(model, f"{model_name}_model")
            elif model_name == "XGBoost":
                mlflow.xgboost.log_model(model, f"{model_name}_model", signature=signatures)
    #                 mlflow.xgboost.log_model(model, f"{model_name}_model")
            elif model_name == "CatBoost":
                mlflow.catboost.log_model(model, f"{model_name}_model", signature=signatures)
    #                 mlflow.catboost.log_model(model, f"{model_name}_model")
            else:
                print(f"Model not in {machine_learning_model_names}, couldn't save the model !")
            
            value_list = [
                datetime.now().strftime('%Y-%m-%d %H:%M:%S'), model_name, *params.values(), 
                rmse, mse, mae, accuracy, auc_score, f1, precision, recall, specificity
            ]

            validation_results_df.append(value_list)       
    #         print(f"RMSE on the validation set: {rmse}")

            # Save validation results to file
            validation_records_df = pd.DataFrame(validation_results_df, columns=validation_column_list)
            validation_records_df.to_csv(validation_file_path, mode='a', header=False, index=False, float_format='%.6f')
    
    else:
        return f"Please select a model in {deep_learning_model_names} and {machine_learning_model_names} !"

# Review count distribution

In [None]:
import pandas as pd
from collections import defaultdict
import nltk
# nltk.download('punkt')

def get_word_tokens(text):
    """Return the tokens produced by ``nltk.word_tokenize`` for ``text``."""
    return nltk.word_tokenize(text)

def calcuate_reviews_num(reviews_df): 
    """Aggregate review, word and token counts per user and per business.

    Each review is visited once: its whitespace-separated word count and its
    ``get_word_tokens`` token count are computed a single time and added to
    both the reviewing user's and the reviewed business's totals.

    Args:
        reviews_df: DataFrame with 'user_id', 'business_id' and 'text' columns.

    Returns:
        tuple: ``(user_df, business_df)``. Each has the id column followed by
        'reviews_num', 'words_num' and 'tokens_num', one row per id in order
        of first appearance. Empty input yields empty frames with these
        columns.
    """
    stat_columns = ['reviews_num', 'words_num', 'tokens_num']
    user_stats = {}
    business_stats = {}

    review_fields = zip(reviews_df['user_id'], reviews_df['business_id'], reviews_df['text'])
    for user_id, business_id, text in review_fields:
        words_num = len(text.split())
        tokens_num = len(get_word_tokens(text))  # tokenize once, reuse for both totals

        for stats, key in ((user_stats, user_id), (business_stats, business_id)):
            totals = stats.setdefault(key, [0, 0, 0])
            totals[0] += 1
            totals[1] += words_num
            totals[2] += tokens_num

    # Building from rows with explicit columns also works when there are no reviews.
    user_df = pd.DataFrame(
        [[key, *totals] for key, totals in user_stats.items()],
        columns=['user_id', *stat_columns])
    business_df = pd.DataFrame(
        [[key, *totals] for key, totals in business_stats.items()],
        columns=['business_id', *stat_columns])

    return user_df, business_df

In [None]:
def remove_outliers(data):
    """Keep only the values lying within two standard deviations of the mean.

    ``data`` must provide ``mean()`` / ``std()`` and support boolean-mask
    indexing; both bounds are inclusive.
    """
    center = data.mean()
    spread = data.std()
    within_bounds = (data >= center - 2 * spread) & (data <= center + 2 * spread)
    return data[within_bounds]

def calculate_distribution(data):
    """Summarise the distribution of ``data`` after removing outliers.

    Values are first filtered with ``remove_outliers``. The mode is the
    smallest of the most frequent values; it is taken from ``np.unique``
    counts, so float and negative data are supported as well as the
    non-negative integers ``np.bincount`` was limited to.

    Args:
        data: One-dimensional numeric data accepted by ``remove_outliers``.

    Returns:
        dict: 'min', 'max', 'mean', 'std', 'mode' (float) and 'percentiles',
        a dict keyed '5th', '10th', ..., '100th' in steps of five.
    """
    data = remove_outliers(data)

    unique_values, value_counts = np.unique(np.asarray(data), return_counts=True)

    distribution = {
        'min': np.min(data),
        'max': np.max(data),
        'mean': np.mean(data),
        'std': np.std(data),
        # unique_values is sorted and argmax returns the first maximum, so
        # ties resolve to the smallest value.
        'mode': float(unique_values[np.argmax(value_counts)]),
        'percentiles': {
            f'{percent}th': np.percentile(data, percent) for percent in range(5, 101, 5)
        },
    }

    return distribution

# Few reviews performance inspection

In [1]:
import mlflow
from mlflow.models import infer_signature
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchinfo import summary
from torch.utils.data import DataLoader, TensorDataset
import tqdm
from sklearn.metrics import mean_squared_error, root_mean_squared_error, mean_absolute_error
from sklearn.metrics import accuracy_score, auc, roc_curve, f1_score
from sklearn.metrics import precision_score, recall_score, confusion_matrix, roc_auc_score
import os
import numpy as np
import pandas as pd
from datetime import datetime

def _value_for_run(value, run_index):
    """Return the entry for one run: ``value[run_index]`` for a list/tuple, else ``value`` itself."""
    if isinstance(value, (list, tuple)):
        return value[run_index]
    return value


def _stack_vector_columns(test_data):
    """Concatenate, row by row, every column whose name contains "vector" into one 2-D array."""
    vector_columns = [col for col in test_data.columns if "vector" in col.lower()]

    row_vectors = []
    for row in range(len(test_data)):
        # NOTE(review): eval() executes whatever expression the cell holds. If the
        # cells are plain list literals and the data can come from an untrusted
        # source, ast.literal_eval would be the safe replacement - confirm the
        # stored format before switching.
        pieces = [np.array(eval(test_data.loc[row, col])) for col in vector_columns]
        row_vectors.append(np.concatenate(pieces))

    return np.array(row_vectors)


def _score_predictions(task_type, y_true, y_pred):
    """Compute the metric dict for one run; metrics of the other task type stay at 0.0."""
    results = {
        "rmse": 0.0,
        "mse": 0.0,
        "mae": 0.0,
        "accuracy": 0.0,
        "auc_score": 0.0,
        "f1": 0.0,
        "precision": 0.0,
        "recall": 0.0,
        "specificity": 0.0,
    }

    if task_type == "regression":
        results["rmse"] = root_mean_squared_error(y_true, y_pred)
        results["mse"] = mean_squared_error(y_true, y_pred)
        results["mae"] = mean_absolute_error(y_true, y_pred)
    else:
        # NOTE(review): these scorers are called on y_pred as-is; presumably the
        # predictions are already discrete labels for classification runs - confirm.
        results["accuracy"] = accuracy_score(y_true, y_pred)  # Accuracy
        results["auc_score"] = roc_auc_score(y_true, y_pred)  # AUC
        results["f1"] = f1_score(y_true, y_pred)  # F1-score
        results["precision"] = precision_score(y_true, y_pred)  # Precision
        results["recall"] = recall_score(y_true, y_pred)  # Recall
        # Specificity (true negative rate) = recall of the label 0.
        results["specificity"] = recall_score(y_true, y_pred, pos_label=0)

    return results


def few_reviews_performance_inspection(train_df, test_df, run_id_experiment_task_model, 
                                       user_reviews_num=3,user_comparison="equal", 
                                       user_reviews_num_range=None, business_reviews_num=100000,
                                       business_comparison="max", business_reviews_num_range=None,
                                       tracking_uri="http://127.0.0.1:8080"):
    """Evaluate logged MLflow models on test rows of users/businesses selected by review count.

    The training frame is filtered with
    ``sample_reviews_and_calculate_price_then_return_data`` (defined elsewhere).
    The test frame is then restricted to rows whose ``user_id`` and
    ``business_id`` both occur in the filtered training reviews. Each run's
    model is loaded from MLflow and scored on that subset.

    Args:
        train_df: Training reviews; passed to the filtering helper.
        test_df: Test reviews with ``user_id``, ``business_id``, ``stars`` and
            one or more columns whose name contains "vector". Each cell of
            those columns is a string that evaluates to a sequence of numbers.
        run_id_experiment_task_model: Mapping with the keys ``"run_id"`` and
            ``"experiment_name"`` (indexable per run) plus ``"task_type"`` and
            ``"model_name"``. The latter two may be a single value shared by
            all runs or a list/tuple with one entry per run.
        user_reviews_num, user_comparison, user_reviews_num_range,
        business_reviews_num, business_comparison, business_reviews_num_range:
            Forwarded unchanged to the filtering helper.
        tracking_uri: MLflow tracking server URI. Defaults to the address that
            used to be hard-coded.

    Returns:
        dict mapping each experiment name to its metric dict (``rmse``,
        ``mse``, ``mae``, ``accuracy``, ``auc_score``, ``f1``, ``precision``,
        ``recall``, ``specificity``). Returns ``0`` when the filter keeps no
        reviews/users/businesses, when no test row matches, or when a model
        name is not supported.
    """
    # filter reviews
    filtered_reviews, calculation_results = sample_reviews_and_calculate_price_then_return_data(
        train_df, 100,
        user_reviews_num=user_reviews_num,
        user_comparison=user_comparison,
        user_reviews_num_range=user_reviews_num_range,
        business_reviews_num=business_reviews_num,
        business_comparison=business_comparison,
        business_reviews_num_range=business_reviews_num_range,
        genai="GPT-3.5 Turbo",
        sampling_method='random', column='None')

    filtered_reviews_num = calculation_results["filtered_reviews_num"]
    filtered_users_count = calculation_results["filtered_users_count"]
    filtered_businesses_count = calculation_results["filtered_businesses_count"]
    if filtered_reviews_num==0 or filtered_users_count==0 or filtered_businesses_count==0:
        print("filtered_reviews_num or filtered_users_count or filtered_businesses_count = 0 !")
        return 0
    print("filtered_reviews_num:", filtered_reviews_num)
    print("filtered_users_count:", filtered_users_count)
    print("filtered_businesses_count:", filtered_businesses_count)

    unique_user_ids = filtered_reviews['user_id'].unique()
    unique_business_ids = filtered_reviews['business_id'].unique()

    # Chained (non-inplace) reset so the filtered slice is never mutated in place;
    # the fresh 0..n-1 index is what the positional .loc lookups below rely on.
    test_data = test_df[
        (test_df['user_id'].isin(unique_user_ids)) &
        (test_df['business_id'].isin(unique_business_ids))
    ].reset_index(drop=True)

    # Without matching rows there is nothing to score, and the tensor/metric code
    # below cannot handle an empty feature matrix.
    if len(test_data) == 0:
        print("No test data matches the filtered users and businesses !")
        return 0

    # get X_test and y_test
    X_test = _stack_vector_columns(test_data)
    y_test = np.array(test_data['stars'])

    deep_learning_model_names = ["FM", "MLP", "DeepFM", "AFM", "DCN", "xDeepFM", "AutoInt", "AFN"]
    machine_learning_model_names = ["Linear", "KNN", "SVM", "DecisionTree", "RandomForest", 
                                    "AdaBoost", "XGBoost", "CatBoost"]

    # The tracking server does not change between runs, so configure it once.
    mlflow.set_tracking_uri(uri=tracking_uri)

    num_experiments = len(run_id_experiment_task_model["run_id"])
    performance_results = {}
    for i in range(num_experiments):
        run_id = run_id_experiment_task_model["run_id"][i]
        experiment_name = run_id_experiment_task_model["experiment_name"][i]

        # Accept either one shared value or one value per run.
        task_type = _value_for_run(run_id_experiment_task_model["task_type"], i)
        model_name = _value_for_run(run_id_experiment_task_model["model_name"], i)

        print("run_id:", run_id)
        print("experiment_name:", experiment_name)
        print("task_type:", task_type)
        print("model_name", model_name)

        # Location of the logged model inside the MLflow run.
        model_dir = f"runs:/{run_id}/{model_name}_model"

        print(f'Start predicting with {model_name} model ...')

        if model_name in deep_learning_model_names:

            # Load the model from the specified directory
            model = mlflow.pytorch.load_model(model_dir)
            # Inference mode: disables dropout / batch-norm updates so that the
            # reported metrics are deterministic.
            model.eval()

            # Convert to float tensor
            X_test_tensor = torch.from_numpy(X_test).float()
            # Feature indices 0..n_features-1 repeated for every row, shape
            # (n_rows, n_features, 1); only the DeepFM model consumes them.
            Xi_test_tensor = (
                torch.arange(X_test_tensor.size(1))
                .unsqueeze(0)
                .unsqueeze(-1)
                .repeat(X_test_tensor.size(0), 1, 1)
                .int()
            )

            # The whole test set is scored in one forward pass; no gradients needed.
            with torch.no_grad():
                if model_name == "DeepFM":
                    y_pred = model(Xi_test_tensor, X_test_tensor)
                elif model_name == "MLP":
                    y_pred = model(X_test_tensor)
                    y_pred = y_pred.view(-1)
                else:
                    y_pred = model(X_test_tensor)

            y_pred = y_pred.detach().numpy()

        elif model_name in machine_learning_model_names:
            if model_name == "XGBoost":
                model = mlflow.xgboost.load_model(model_dir)
            elif model_name == "CatBoost":
                model = mlflow.catboost.load_model(model_dir)
            else:
                model = mlflow.sklearn.load_model(model_dir)

            y_pred = model.predict(X_test)

        else:
            print(f"Please select model in {deep_learning_model_names} or {machine_learning_model_names} !")
            return 0

        results = _score_predictions(task_type, y_test, y_pred)

        print(f"{experiment_name}:\n", results)

        performance_results[experiment_name] = results

    return performance_results

# Print nested dictionary data

In [None]:
def print_nested_dict(d, indent=0):
    """Print a possibly nested dict, one key per line.

    Args:
        d: Dictionary to print. Values that are dicts are printed recursively.
        indent: Number of leading spaces for this level; each nested level
            adds four more.
    """
    padding = ' ' * indent
    for key, value in d.items():
        if not isinstance(value, dict):
            # Leaf entry: "key: value" on a single line.
            print(f"{padding}{key!s}: {value!s}")
            continue
        # Nested dict: header line, then the children indented four more spaces.
        print(f"{padding}{key!s}:")
        print_nested_dict(value, indent + 4)

def write_nested_dict_to_file(d, f, indent=0):
    """Write a possibly nested dict to a text stream, one key per line.

    Args:
        d: Dictionary to serialise. Values that are dicts are written recursively.
        f: Object with a ``write(str)`` method (e.g. an open text file).
        indent: Nesting depth of this level; every depth step is rendered as
            two spaces.
    """
    prefix = "  " * indent
    for key, value in d.items():
        if not isinstance(value, dict):
            # Leaf entry: "key: value" followed by a newline.
            f.write(f"{prefix}{key!s}: {value!s}\n")
            continue
        # Nested dict: header line, then the children one depth level deeper.
        f.write(f"{prefix}{key!s}:\n")
        write_nested_dict_to_file(value, f, indent + 1)

# Different number of reviews performance inspection

In [None]:
import mlflow
from mlflow.models import infer_signature
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchinfo import summary
from torch.utils.data import DataLoader, TensorDataset
import tqdm
from sklearn.metrics import mean_squared_error, root_mean_squared_error, mean_absolute_error
from sklearn.metrics import accuracy_score, auc, roc_curve, f1_score
from sklearn.metrics import precision_score, recall_score, confusion_matrix, roc_auc_score
import os
import numpy as np
import pandas as pd
from datetime import datetime

def return_filtered_train_test_data(train_df, test_df, user_reviews_num=3,user_comparison="equal", 
                                    user_reviews_num_range=None, business_reviews_num=100000,
                                    business_comparison="max", business_reviews_num_range=None):
    """Filter the training reviews, restrict the test set to match, and report the sizes.

    The training frame is filtered by
    ``sample_reviews_and_calculate_price_then_return_data`` (defined elsewhere).
    The test frame is then reduced to rows whose ``user_id`` and
    ``business_id`` both occur in the filtered training reviews. Row, user and
    business counts before and after filtering are printed and collected.

    Args:
        train_df: Training reviews; must contain ``user_id`` and ``business_id``.
        test_df: Test reviews; must contain ``user_id`` and ``business_id``.
        user_reviews_num, user_comparison, user_reviews_num_range,
        business_reviews_num, business_comparison, business_reviews_num_range:
            Forwarded unchanged to the filtering helper.

    Returns:
        ``(filtered_reviews, test_data, calculation_results)`` where
        ``test_data`` has a fresh integer index (the previous index is kept as
        a column) and ``calculation_results`` holds the before/after counts.
        Returns ``0`` if the filter keeps no reviews, users or businesses.
    """
    # filter reviews
    filtered_reviews, filter_stats = sample_reviews_and_calculate_price_then_return_data(
        train_df, 100,
        user_reviews_num=user_reviews_num,
        user_comparison=user_comparison,
        user_reviews_num_range=user_reviews_num_range,
        business_reviews_num=business_reviews_num,
        business_comparison=business_comparison,
        business_reviews_num_range=business_reviews_num_range,
        genai="GPT-3.5 Turbo",
        sampling_method='random', column='None',
    )

    train_rows_after = filter_stats["filtered_reviews_num"]
    train_users_after = filter_stats["filtered_users_count"]
    train_businesses_after = filter_stats["filtered_businesses_count"]
    if train_rows_after == 0 or train_users_after == 0 or train_businesses_after == 0:
        print("filtered_reviews_num or filtered_users_count or filtered_businesses_count = 0 !")
        return 0

    print(f"User reviews: {user_reviews_num} {user_comparison} or {user_reviews_num_range}")
    print(f"Business reviews: {business_reviews_num} {business_comparison} or {business_reviews_num_range}")

    # Training-side sizes before filtering (the "after" values come from the helper).
    train_rows_before = len(train_df)
    train_users_before = len(train_df['user_id'].unique())
    train_businesses_before = len(train_df['business_id'].unique())

    size_report = {
        "Train_data_num_before": train_rows_before,
        "Train_data_num_after": train_rows_after,
        "Train_data_user_count_before": train_users_before,
        "Train_data_user_count_after": train_users_after,
        "Train_data_business_count_before": train_businesses_before,
        "Train_data_business_count_after": train_businesses_after,
    }

    print("Training data num:")
    print("Before:", train_rows_before, "After:", train_rows_after, '\n')
    print("Training data user count:")
    print("Before:", train_users_before, "After:", train_users_after, '\n')
    print("Training data business count:")
    print("Before:", train_businesses_before, "After:", train_businesses_after, '\n')

    # Keep only test rows whose user AND business both survived the training filter.
    known_user = test_df['user_id'].isin(filtered_reviews['user_id'].unique())
    known_business = test_df['business_id'].isin(filtered_reviews['business_id'].unique())
    test_data = test_df[known_user & known_business].reset_index()

    test_rows_before = len(test_df)
    test_rows_after = len(test_data)
    test_users_before = len(test_df['user_id'].unique())
    test_users_after = len(test_data['user_id'].unique())
    test_businesses_before = len(test_df['business_id'].unique())
    test_businesses_after = len(test_data['business_id'].unique())

    size_report.update({
        "Test_data_num_before": test_rows_before,
        "Test_data_num_after": test_rows_after,
        "Test_data_user_count_before": test_users_before,
        "Test_data_user_count_after": test_users_after,
        "Test_data_business_count_before": test_businesses_before,
        "Test_data_business_count_after": test_businesses_after,
    })

    print("Validation (Test) data num:")
    print("Before:", test_rows_before, "After:", test_rows_after, '\n')
    print("Validation (Test) data user count:")
    print("Before:", test_users_before, "After:", test_users_after, '\n')
    print("Validation (Test) data business count:")
    print("Before:", test_businesses_before, "After:", test_businesses_after)

    return filtered_reviews, test_data, size_report