In [5]:
# from pathlib import Path
# import nbformat

# def load_notebook(notebook_path):
#     with open(notebook_path, 'r', encoding='utf-8') as f:
#         nb = nbformat.read(f, as_version=4)
#     code_cells = [cell.source for cell in nb.cells if cell.cell_type == 'code']
#     exec('\n'.join(code_cells), globals())

# # import written function and variable

# parent_directory = Path('./')
# data_preprocessing_utils_path = parent_directory / 'data_preprocessing_utils.ipynb'
# load_notebook(data_preprocessing_utils_path)

# Obtain and log best training and model parameters

In [None]:
import ast
import math
import numpy as np

def obtain_best_train_params(model_name, record_data_df):
    """Select, print and log the five best training-parameter sets per metric.

    For "AutoInt" and "MLP" the records are first narrowed to the final epoch
    ('100th/100') of runs that used the fixed reference architecture
    (AutoInt: atten_embed_dim 32, num_layers 3, mlp_dims '(256, 256, 256)';
    MLP: embed_dims '(256, 256, 256)'). Any other model name is left
    unfiltered and its dropout is read like AutoInt's (a tuple literal).

    For each of MSE, MAE and RMSE the (up to) five rows with the smallest
    value are printed and written to ``{model_name}_train_best_params.txt``
    in the current working directory.

    Args:
        model_name: Model identifier, used as the prefix of model-specific
            column names and of the log file name.
        record_data_df: pandas DataFrame with the columns 'Batch_size',
            'DL_learning_rate', 'MSE', 'MAE', 'RMSE' and either
            '{model_name}_dropout' (MLP, numeric) or '{model_name}_dropouts'
            (otherwise, a string holding a Python literal).

    Returns:
        dict: ``{metric: {'1'..'5': {'batch_size': int,
        'dl_learning_rate': float, 'dropout': float or parsed literal}}}``.
        Rank slots stay ``{}`` when fewer than five rows are available.
    """
    metrics = ['MSE', 'MAE', 'RMSE']
    train_best_params = {
        metric: {str(rank): {} for rank in range(1, 6)} for metric in metrics
    }

    if model_name == "AutoInt":
        record_data_df = record_data_df[(record_data_df[f"{model_name}_atten_embed_dim"]==32) &
                                    (record_data_df[f"{model_name}_num_layers"]==3) &
                                    (record_data_df[f"{model_name}_mlp_dims"]=='(256, 256, 256)') &
                                    (record_data_df["Epoch/Epochs_num"]=='100th/100')]
    elif model_name == "MLP":
        record_data_df = record_data_df[(record_data_df[f"{model_name}_embed_dims"]=='(256, 256, 256)') &
                                    (record_data_df["Epoch/Epochs_num"]=='100th/100')]

    batch_size = 'Batch_size'
    dl_learning_rate = 'DL_learning_rate'
    dropouts = f'{model_name}_dropouts'
    dropout = f'{model_name}_dropout'

    with open(f"{model_name}_train_best_params.txt", "w") as f:
        for metric in metrics:
            # The section header goes out once per metric, on screen and in the file
            # (it used to be repeated in the file for every ranked row).
            print(f'Train {metric}:\n')
            f.write(f'Train {metric}:\n')

            min_metric_indices = record_data_df[metric].nsmallest(5).index
            for rank, index in enumerate(min_metric_indices, start=1):
                entry = train_best_params[metric][str(rank)]
                entry['batch_size'] = int(record_data_df.loc[index, batch_size])
                entry['dl_learning_rate'] = float(record_data_df.loc[index, dl_learning_rate])
                if model_name == "MLP":
                    entry['dropout'] = float(record_data_df.loc[index, dropout])
                else:  # AutoInt: dropouts are stored as a string such as '(0.1, 0.1, 0.1)'
                    entry['dropout'] = ast.literal_eval(record_data_df.loc[index, dropouts])

                print(f'{rank}. ',
                      "batch_size:", entry['batch_size'],
                      "dl_learning_rate:", entry['dl_learning_rate'],
                      "dropout:", entry['dropout'],
                      "MSE", record_data_df.loc[index, 'MSE'],
                      "MAE", record_data_df.loc[index, 'MAE'],
                      "RMSE", record_data_df.loc[index, 'RMSE'],
                      "Index:", index)
                # One entry per line so the log stays readable.
                f.write(f'{rank}. ' +
                        " batch_size: " + str(entry['batch_size']) +
                        " dl_learning_rate: " + str(entry['dl_learning_rate']) +
                        " dropout: " + str(entry['dropout']) +
                        " MSE: " + str(record_data_df.loc[index, 'MSE']) +
                        " MAE: " + str(record_data_df.loc[index, 'MAE']) +
                        " RMSE: " + str(record_data_df.loc[index, 'RMSE']) +
                        " Index: " + str(index) + "\n")
            print("\n")
            f.write("\n")

    return train_best_params
    
def _empty_top5_slots():
    """Return a fresh ``{'1': {}, ..., '5': {}}`` mapping of rank labels to empty dicts."""
    return {str(rank): {} for rank in range(1, 6)}


def _parse_rf_max_features(value):
    """Normalise a recorded RandomForest ``max_features`` cell.

    Digit strings and integer cells become ``int``; any other string
    (e.g. 'sqrt', 'log2') and any other value is returned unchanged.
    """
    if isinstance(value, str):
        return int(value) if value.isdigit() else value
    if isinstance(value, (int, np.integer)):
        # Numeric column (e.g. every recorded run used an integer): no .isdigit() available.
        return int(value)
    return value


def _parse_rf_max_depth(value):
    """Normalise a recorded RandomForest ``max_depth`` cell to ``int`` or ``None``.

    ``None`` and NaN mean "unlimited depth" and map to ``None``.
    """
    if value is None:
        return None
    if isinstance(value, str):
        # NOTE(review): assumes a textual column would spell a missing depth as
        # '', 'None' or 'nan' - confirm against how the records are written.
        if value.strip().lower() in ("", "none", "nan"):
            return None
        return int(float(value))
    if math.isnan(value):
        return None
    return int(value)


def obtain_best_all_params(model_name, record_data, train_best_params=None):
    """Collect, print and log the five best full parameter sets per metric.

    Two-phase models ("AutoInt", "MLP"): for each training metric, the rank-'1'
    entry of ``train_best_params[metric]`` fixes batch size, learning rate and
    dropout; the final-epoch ('100th/100') records matching those values are
    then ranked by each model metric. The result is nested as
    ``{"Train_<metric>": {"Model_<metric>": {'1'..'5': {...}}}}``.

    One-phase models ("FM", "XGBoost", "RandomForest"): records are ranked
    directly by each metric (FM is first restricted to the final epoch). The
    result is nested as ``{<metric>: {'1'..'5': {...}}}``.

    The result is passed to ``print_nested_dict`` and written with
    ``write_nested_dict_to_file`` to ``{model_name}_all_best_params.txt``
    (both helpers are defined outside this block).

    Args:
        model_name: One of "AutoInt", "MLP", "FM", "XGBoost", "RandomForest".
        record_data: pandas DataFrame of recorded runs; model-specific columns
            are prefixed with ``{model_name}_``.
        train_best_params: Output of ``obtain_best_train_params``; required for
            "AutoInt" and "MLP", ignored otherwise.

    Returns:
        The nested dict described above, or ``0`` (after printing a message)
        when the model is unsupported or ``train_best_params`` is missing.
        Rank slots stay ``{}`` when fewer than five rows qualify.
    """
    metrics = ["MSE", "MAE", "RMSE"]

    if model_name in ["AutoInt", "MLP"]:

        if train_best_params is None:
            print("AutoInt and MLP need training params first !")
            return 0

        best_all_params = {
            f"Train_{train_metric}": {
                f"Model_{model_metric}": _empty_top5_slots() for model_metric in metrics
            }
            for train_metric in metrics
        }

        batch_size = 'Batch_size'
        dl_learning_rate = 'DL_learning_rate'
        dropouts = f'{model_name}_dropouts'  # AutoInt: tuple stored as a string
        dropout = f'{model_name}_dropout'  # MLP: single float

        atten_embed_dim = f"{model_name}_atten_embed_dim"  # AutoInt
        num_layers = f"{model_name}_num_layers"  # AutoInt
        mlp_dims = f"{model_name}_mlp_dims"  # AutoInt
        embed_dims = f"{model_name}_embed_dims"  # MLP

        for train_metric in metrics:
            best_train = train_best_params[train_metric]['1']
            metric_batch_size = best_train['batch_size']
            metric_dl_learning_rate = best_train['dl_learning_rate']
            metric_dropout = best_train['dropout']

            # Keep only final-epoch runs trained with the best training parameters.
            record_data_df = record_data[(record_data[batch_size] == metric_batch_size) &
                                         (record_data[dl_learning_rate] == metric_dl_learning_rate) &
                                         (record_data["Epoch/Epochs_num"] == '100th/100')]
            if model_name == "AutoInt":
                # The column holds the tuple's string form, so compare against str(tuple).
                record_data_df = record_data_df[record_data_df[dropouts] == str(metric_dropout)]
            else:  # MLP
                record_data_df = record_data_df[record_data_df[dropout] == metric_dropout]

            for model_metric in metrics:
                slots = best_all_params[f"Train_{train_metric}"][f"Model_{model_metric}"]
                min_metric_indices = record_data_df[model_metric].nsmallest(5).index
                for rank, index in enumerate(min_metric_indices, start=1):
                    entry = slots[str(rank)]
                    entry['batch_size'] = int(record_data_df.loc[index, batch_size])
                    entry['dl_learning_rate'] = float(record_data_df.loc[index, dl_learning_rate])

                    if model_name == "AutoInt":
                        entry['dropout'] = ast.literal_eval(record_data_df.loc[index, dropouts])
                        entry['atten_embed_dim'] = int(record_data_df.loc[index, atten_embed_dim])
                        entry['num_layers'] = int(record_data_df.loc[index, num_layers])
                        entry['mlp_dims'] = ast.literal_eval(record_data_df.loc[index, mlp_dims])
                    else:  # MLP
                        entry['dropout'] = float(record_data_df.loc[index, dropout])
                        entry['embed_dims'] = ast.literal_eval(record_data_df.loc[index, embed_dims])

        print_nested_dict(best_all_params)
        with open(f"{model_name}_all_best_params.txt", "w") as f:
            write_nested_dict_to_file(best_all_params, f)

        return best_all_params

    elif model_name in ["FM", "XGBoost", "RandomForest"]:

        best_all_params = {metric: _empty_top5_slots() for metric in metrics}

        record_data_df = record_data
        if model_name == "FM":
            # FM is trained by epochs; only the final epoch is comparable.
            record_data_df = record_data_df[record_data_df["Epoch/Epochs_num"] == '100th/100']

        for metric in metrics:
            min_metric_indices = record_data_df[metric].nsmallest(5).index
            for rank, index in enumerate(min_metric_indices, start=1):
                entry = best_all_params[metric][str(rank)]

                if model_name == "FM":
                    entry['factors_num'] = int(record_data_df.loc[index, f"{model_name}_factors_num"])
                    entry['batch_size'] = int(record_data_df.loc[index, 'Batch_size'])
                    entry['dl_learning_rate'] = float(record_data_df.loc[index, 'DL_learning_rate'])

                elif model_name == "XGBoost":
                    entry['colsample_bytree'] = float(record_data_df.loc[index, f"{model_name}_colsample_bytree"])
                    entry['lambda'] = float(record_data_df.loc[index, f"{model_name}_lambda"])
                    entry['learning_rate'] = float(record_data_df.loc[index, f"{model_name}_learning_rate"])
                    entry['max_depth'] = int(record_data_df.loc[index, f"{model_name}_max_depth"])
                    entry['min_child_weight'] = int(record_data_df.loc[index, f"{model_name}_min_child_weight"])
                    entry['n_estimators'] = int(record_data_df.loc[index, f"{model_name}_n_estimators"])
                    entry['subsample'] = float(record_data_df.loc[index, f"{model_name}_subsample"])

                else:  # RandomForest
                    entry['max_depth'] = _parse_rf_max_depth(
                        record_data_df.loc[index, f"{model_name}_max_depth"])
                    entry['max_features'] = _parse_rf_max_features(
                        record_data_df.loc[index, f"{model_name}_max_features"])
                    entry['min_samples_leaf'] = int(record_data_df.loc[index, f"{model_name}_min_samples_leaf"])
                    entry['min_samples_split'] = int(record_data_df.loc[index, f"{model_name}_min_samples_split"])
                    entry['n_estimators'] = int(record_data_df.loc[index, f"{model_name}_n_estimators"])

        print_nested_dict(best_all_params)
        with open(f"{model_name}_all_best_params.txt", "w") as f:
            write_nested_dict_to_file(best_all_params, f)

        return best_all_params

    else:
        print("Currently support FM, AutoInt, MLP, XGBoost, Random Forest !")
        return 0
    

# Mediators (pass the best params to the training, validation, and testing functions)

In [None]:
from itertools import product

def generate_grid_search_combinations(model_name, is_model_params=False):
    """Enumerate the hyperparameter grid for `model_name` as a list of tuples.

    Tuple layouts (order matters to the callers that unpack them):
        FM:                  (optimizer, batch_size, learning_rate, factors_num)
        AutoInt / training:  (optimizer, batch_size, learning_rate, dropouts)
        AutoInt / model:     (atten_embed_dim, num_layers, mlp_dims)
        MLP / training:      (optimizer, batch_size, learning_rate, dropout)
        MLP / model:         (embed_dims,)
        XGBoost:             (learning_rate, n_estimators, subsample, colsample_bytree,
                              min_child_weight, max_depth, lambda)
        RandomForest:        (n_estimators, max_features, max_depth,
                              min_samples_split, min_samples_leaf)

    Args:
        model_name: "FM", "AutoInt", "MLP", "XGBoost" or "RandomForest".
        is_model_params: For AutoInt/MLP only, return the architecture grid
            instead of the training grid. Ignored for the other models.

    Returns:
        list of tuples, or 0 (after printing a message) for an unknown model.
    """
    two_phase_models = ["AutoInt", "MLP"]
    one_phase_models = ["FM", "XGBoost", "RandomForest"]

    # Axes shared by the deep-learning training grids.
    adam_only = ["Adam"]
    dl_batch_sizes = [32, 64, 128, 256, 512, 1024]
    dl_learning_rates = [0.0001, 0.0005, 0.001, 0.005]
    # Three equal-width layers: 32, 64, 128, 256, 512.
    layer_widths = [tuple([2**exponent] * 3) for exponent in range(5, 10)]

    if model_name == "FM":
        axes = (adam_only, dl_batch_sizes, dl_learning_rates, [4, 8, 16, 32, 64])

    elif model_name == "AutoInt":
        if is_model_params:
            axes = ([32, 64], [2, 3, 4, 5], layer_widths)
        else:
            # `step * 0.1` is kept as-is: its float artefacts (e.g. 0.30000000000000004)
            # are part of the values this grid has always produced.
            triple_dropouts = [tuple([step * 0.1] * 3) for step in range(1, 10, 2)]
            axes = (adam_only, dl_batch_sizes, dl_learning_rates, triple_dropouts)

    elif model_name == "MLP":
        if is_model_params:
            axes = (layer_widths,)
        else:
            single_dropouts = [step * 0.1 for step in range(1, 10, 2)]
            axes = (adam_only, dl_batch_sizes, dl_learning_rates, single_dropouts)

    elif model_name == "XGBoost":
        axes = (
            [0.001, 0.01, 0.1],  # learning_rate
            [50, 100, 200],      # n_estimators
            [0.8, 1],            # subsample
            [0.8, 1],            # colsample_bytree
            [1, 3, 5],           # min_child_weight
            [4, 5, 6],           # max_depth
            [0, 0.1, 0.5],       # lambda
        )

    elif model_name == "RandomForest":
        axes = (
            [50, 100, 200],            # n_estimators
            [3, 5, 'sqrt', 'log2'],    # max_features (None was dropped: too slow)
            [5, 10, None],             # max_depth
            [2, 4, 6],                 # min_samples_split
            [1, 3, 5],                 # min_samples_leaf
        )

    else:
        print(f"Please make sure model in {two_phase_models} and {one_phase_models} !")
        return 0

    return list(product(*axes))

In [None]:
def params_mediator(model_name, input_dim, field_num, embedding_size=768, params_input=None,
                    validating=False, training_combo=None, model_combo=None, train_best_params=None,
                    testing=True, best_all_params=None):
    """Assemble the parameter dict for one validation or testing run of `model_name`.

    Exactly one of `validating` / `testing` must be True.

    Validation mode (``validating=True, testing=False``):
        * FM / XGBoost / RandomForest: `training_combo` (one tuple from
          ``generate_grid_search_combinations``) is written into the params.
        * AutoInt / MLP, training phase: `training_combo` given and
          `model_combo` None; optimizer, batch size, learning rate and
          dropout are taken from the combo.
        * AutoInt / MLP, model phase: `model_combo` and `train_best_params`
          given and `training_combo` None; training settings come from
          ``train_best_params['1']`` and the architecture from `model_combo`.

    Testing mode (``testing=True, validating=False``):
        ``best_all_params['1']`` is written into the params.

    Args:
        model_name: "FM", "AutoInt", "MLP", "XGBoost" or "RandomForest".
        input_dim: Stored as 'input_dim' in the FM and MLP defaults.
        field_num: Number of entries in AutoInt's default 'field_dims'.
        embedding_size: Value of each entry in AutoInt's default 'field_dims'.
        params_input: Optional base params dict used instead of the built-in
            defaults. It is updated in place and returned. FM validation and
            the MLP model phase always start from the built-in defaults.
        validating: Select validation mode.
        training_combo: Training/grid tuple for validation mode.
        model_combo: Architecture tuple for the AutoInt/MLP model phase.
        train_best_params: ``{'1': {'batch_size', 'dl_learning_rate',
            'dropout'}, ...}`` for the model phase.
        testing: Select testing mode.
        best_all_params: ``{'1': {...best parameters...}, ...}`` for testing mode.

    Returns:
        The populated params dict, or ``0`` (after printing a message) when
        the arguments are inconsistent or the model is unsupported.
    """
    FM_params = {
        'hyperparameters': {
            'input_dim': input_dim,
            'factors_num': 8
        },
        'task_type': 'Regression', 'loss_type': 'MSE', 'optimizer_type': 'Adam',
        'dl_learning_rate': 0.001,
        'epochs_num': 100,
        'batch_size': 512
    }

    # AutoInt constructor arguments:
    #   field_dims, embed_dim, atten_embed_dim, num_heads, num_layers, mlp_dims, dropouts, has_residual
    # Notes carried over from the AutoInt paper:
    #   - embed_dim 16, batch size 1024, Adam optimizer
    #   - num_heads 2, 3 interacting layers, 32 hidden units per interacting layer
    #   - dropout explored over 0.1 ~ 0.9
    #   - interacting layers explored over 0 ~ 4: a large gain at 1, stable from 1 to 4
    #   - atten_embed_dim explored over 8, 16, 24, 32: MovieLens keeps improving,
    #     KDD12 peaks at 24 and then degrades
    #   - hidden-unit shapes (1, 200) or (4, 100)
    #   - the residual connection is crucial
    AutoInt_params = {
        'hyperparameters': {
            'field_dims': [embedding_size for _ in range(field_num)],
            'embed_dim': 768,
            'atten_embed_dim': 32,
            "num_heads": 2,
            "num_layers": 3,
            'mlp_dims': (256, 256, 256),
            'dropouts': (0.5, 0.5, 0.5),
            "has_residual": True
        },
        'task_type': 'Regression', 'loss_type': 'MSE', 'optimizer_type': 'Adam',
        'dl_learning_rate': 0.001,
        'epochs_num': 100,
        'batch_size': 512
    }

    # MLP constructor arguments: input_dim, embed_dims, dropout
    MLP_params = {
        'hyperparameters': {
            'input_dim': input_dim,
            'embed_dims': (256, 256, 256),
            'dropout': 0.0
        },
        'task_type': 'Regression', 'loss_type': 'MSE', 'optimizer_type': 'Adam',
        'dl_learning_rate': 0.001,
        'epochs_num': 100,
        'batch_size': 512
    }

    XGBoost_params = {
        'hyperparameters': {
            'learning_rate': 0.01,
            'n_estimators': 50,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'min_child_weight': 1,
            'max_depth': 5,
            'lambda': 0
        },
        'task_type': 'Regression', 'loss_type': None, 'optimizer_type': None,
        'dl_learning_rate': None, 'epochs_num': None, 'batch_size': None
    }

    RandomForest_params = {
        'hyperparameters': {
            'max_depth': 5,
            'max_features': 3,
            'min_samples_leaf': 5,
            'min_samples_split': 2,
            'n_estimators': 200
        },
        'task_type': 'Regression', 'loss_type': None, 'optimizer_type': None,
        'dl_learning_rate': None, 'epochs_num': None, 'batch_size': None
    }

    current_models_two_phase = ["AutoInt", "MLP"]
    current_models_one_phase = ["FM", "XGBoost", "RandomForest"]
    top_n = '1'  # rank label of the best entry in the *_best_params dicts

    if validating == True and testing == False:

        if model_name == "FM":
            if training_combo is not None:
                # NOTE(review): params_input is not consulted here, unlike the other models.
                params = FM_params
                # Bug fix: unpack the `training_combo` argument (was the undefined name `combo`).
                optimizer, batch_size, learning_rate, factor_num = training_combo
                params['optimizer_type'] = optimizer
                params['batch_size'] = batch_size
                params['dl_learning_rate'] = learning_rate
                params['hyperparameters']['factors_num'] = factor_num
            else:
                print(f"If you want to validate {model_name}, make sure training_combo not None !")
                return 0

        elif model_name == "AutoInt":
            if training_combo is not None and model_combo is None:
                # Training phase: tune optimizer / batch size / learning rate / dropouts.
                optimizer, batch_size, learning_rate, dropout = training_combo
                params = params_input if params_input is not None else AutoInt_params
                params['optimizer_type'] = optimizer
                params['batch_size'] = batch_size
                params['dl_learning_rate'] = learning_rate
                params['hyperparameters']['dropouts'] = dropout
            elif training_combo is None and model_combo is not None and train_best_params is not None:
                # Model phase: freeze the best training settings, vary the architecture.
                atten_dim, num_layer, mlp_dim = model_combo
                params = params_input if params_input is not None else AutoInt_params
                params['batch_size'] = train_best_params[top_n]['batch_size']
                params['dl_learning_rate'] = train_best_params[top_n]['dl_learning_rate']
                params['hyperparameters']['dropouts'] = train_best_params[top_n]['dropout']
                params['hyperparameters']['atten_embed_dim'] = atten_dim
                params['hyperparameters']['num_layers'] = num_layer
                params['hyperparameters']['mlp_dims'] = mlp_dim
            else:
                print(f"If you want to validate {model_name}, make sure training_combo not None or model_combo and train_best_params not None !")
                return 0

        elif model_name == "MLP":
            if training_combo is not None and model_combo is None:
                optimizer, batch_size, learning_rate, dropout = training_combo
                params = params_input if params_input is not None else MLP_params
                params['optimizer_type'] = optimizer
                params['batch_size'] = batch_size
                params['dl_learning_rate'] = learning_rate
                # Grid values come from `size * 0.1`; rounding removes float artefacts.
                params['hyperparameters']['dropout'] = round(dropout, 1)
            elif training_combo is None and model_combo is not None and train_best_params is not None:
                embed_dim = model_combo[0]
                # NOTE(review): params_input is not consulted in this phase.
                params = MLP_params
                params['batch_size'] = train_best_params[top_n]['batch_size']
                params['dl_learning_rate'] = train_best_params[top_n]['dl_learning_rate']
                params['hyperparameters']['dropout'] = round(train_best_params[top_n]['dropout'], 1)
                params['hyperparameters']['embed_dims'] = embed_dim
            else:
                print(f"If you want to validate {model_name}, make sure training_combo not None or model_combo and train_best_params not None !")
                return 0

        elif model_name == "XGBoost":
            if training_combo is not None:
                (learning_rate, n_estimator, subsample, colsample_bytree,
                 min_child_weight, max_depth, lambda_value) = training_combo
                params = params_input if params_input is not None else XGBoost_params
                # Cast to float because the validation record dtype is float.
                params['hyperparameters']['learning_rate'] = float(learning_rate)
                params['hyperparameters']['n_estimators'] = n_estimator
                params['hyperparameters']['subsample'] = float(subsample)
                params['hyperparameters']['colsample_bytree'] = float(colsample_bytree)
                params['hyperparameters']['min_child_weight'] = min_child_weight
                params['hyperparameters']['max_depth'] = max_depth
                params['hyperparameters']['lambda'] = float(lambda_value)
            else:
                print(f"If you want to validate {model_name}, make sure training_combo not None !")
                return 0

        elif model_name == "RandomForest":
            if training_combo is not None:
                n_estimator, max_feature, max_depth, min_samples_split, min_samples_leaf = training_combo
                params = params_input if params_input is not None else RandomForest_params
                params['hyperparameters']['n_estimators'] = n_estimator
                params['hyperparameters']['max_features'] = max_feature
                params['hyperparameters']['max_depth'] = max_depth
                params['hyperparameters']['min_samples_split'] = min_samples_split
                params['hyperparameters']['min_samples_leaf'] = min_samples_leaf
            else:
                print(f"If you want to validate {model_name}, make sure training_combo not None !")
                return 0

        else:
            print(f"Please make sure model in {current_models_two_phase} and {current_models_one_phase} !")
            return 0

        return params

    elif testing == True and validating == False:

        if best_all_params is None:
            print(f"Please make sure best_all_params not None if you want to use this function to test !")
            return 0

        # Only the top-ranked parameter set is used for testing.
        best_all_params = best_all_params[top_n]

        if model_name == "FM":
            test_params = params_input if params_input is not None else FM_params
            test_params["batch_size"] = best_all_params['batch_size']
            test_params["dl_learning_rate"] = best_all_params['dl_learning_rate']
            test_params['hyperparameters']["factors_num"] = best_all_params['factors_num']

        elif model_name == "AutoInt":
            test_params = params_input if params_input is not None else AutoInt_params
            test_params['batch_size'] = best_all_params['batch_size']
            test_params['dl_learning_rate'] = best_all_params['dl_learning_rate']
            test_params['hyperparameters']['dropouts'] = best_all_params['dropout']
            test_params['hyperparameters']['atten_embed_dim'] = best_all_params['atten_embed_dim']
            test_params['hyperparameters']['num_layers'] = best_all_params['num_layers']
            test_params['hyperparameters']['mlp_dims'] = best_all_params['mlp_dims']

        elif model_name == "MLP":
            test_params = params_input if params_input is not None else MLP_params
            test_params['batch_size'] = best_all_params['batch_size']
            test_params['dl_learning_rate'] = best_all_params['dl_learning_rate']
            test_params['hyperparameters']['dropout'] = best_all_params['dropout']
            test_params['hyperparameters']['embed_dims'] = best_all_params['embed_dims']

        elif model_name == "XGBoost":
            test_params = params_input if params_input is not None else XGBoost_params
            test_params['hyperparameters']['colsample_bytree'] = best_all_params['colsample_bytree']
            test_params['hyperparameters']['lambda'] = best_all_params['lambda']
            test_params['hyperparameters']['learning_rate'] = best_all_params['learning_rate']
            test_params['hyperparameters']['max_depth'] = best_all_params['max_depth']
            test_params['hyperparameters']['min_child_weight'] = best_all_params['min_child_weight']
            test_params['hyperparameters']['n_estimators'] = best_all_params['n_estimators']
            test_params['hyperparameters']['subsample'] = best_all_params['subsample']

        elif model_name == "RandomForest":
            test_params = params_input if params_input is not None else RandomForest_params
            test_params['hyperparameters']['max_depth'] = best_all_params['max_depth']
            test_params['hyperparameters']['max_features'] = best_all_params['max_features']
            test_params['hyperparameters']['min_samples_leaf'] = best_all_params['min_samples_leaf']
            test_params['hyperparameters']['min_samples_split'] = best_all_params['min_samples_split']
            test_params['hyperparameters']['n_estimators'] = best_all_params['n_estimators']

        else:
            print(f"Please make sure model in {current_models_two_phase} and {current_models_one_phase} !")
            return 0

        return test_params

    else:
        print(f"Please make sure validating True or testing True, can not be both True or both False !")
        return 0
    

# Testing with mlflow experiment name (run id)

In [None]:
import mlflow
from mlflow.models import infer_signature
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchinfo import summary
from torch.utils.data import DataLoader, TensorDataset
import tqdm
from sklearn.metrics import mean_squared_error, root_mean_squared_error, mean_absolute_error
from sklearn.metrics import accuracy_score, auc, roc_curve, f1_score
from sklearn.metrics import precision_score, recall_score, confusion_matrix, roc_auc_score
import os
import numpy as np
import pandas as pd
from datetime import datetime

def _best_params_filter_string(model_name, best_params):
    """Build the MLflow run filter that matches ``best_params`` for ``model_name``.

    Returns None when ``model_name`` has no known set of tuned parameters.
    Raises KeyError when ``best_params`` lacks one of the expected keys.
    """
    keys_by_model = {
        "FM": ["batch_size", "dl_learning_rate", "factors_num"],
        "AutoInt": ["batch_size", "dl_learning_rate", "dropout", "atten_embed_dim",
                    "num_layers", "mlp_dims"],
        "MLP": ["batch_size", "dl_learning_rate", "dropout", "embed_dims"],
        "XGBoost": ["colsample_bytree", "lambda", "learning_rate", "max_depth",
                    "min_child_weight", "n_estimators", "subsample"],
        "RandomForest": ["max_depth", "max_features", "min_samples_leaf",
                         "min_samples_split", "n_estimators"],
    }
    if model_name not in keys_by_model:
        return None

    # For AutoInt the 'dropout' value is matched against the run param named 'dropouts'.
    logged_name = {"dropout": "dropouts"} if model_name == "AutoInt" else {}
    return " and ".join(
        f"params.{logged_name.get(key, key)} = '{best_params[key]}'"
        for key in keys_by_model[model_name]
    )


def _regression_test_metrics(y_true, y_pred):
    """Return RMSE, MSE and MAE of ``y_pred`` against ``y_true`` keyed by metric name."""
    return {
        "rmse": root_mean_squared_error(y_true, y_pred),
        "mse": mean_squared_error(y_true, y_pred),
        "mae": mean_absolute_error(y_true, y_pred),
    }


def _classification_test_metrics(y_true, y_pred):
    """Return the classification metrics of ``y_pred`` against ``y_true`` keyed by metric name."""
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "auc_score": roc_auc_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred),
        "recall": recall_score(y_true, y_pred),
        # Specificity is the recall of the negative class (true negative rate).
        "specificity": recall_score(y_true, y_pred, pos_label=0),
    }


def test_with_mlflow_experiment_name(X_train, y_train, X_valid, y_valid, model_name, 
                                experiment_name, metric="MSE", train_metric=None, model_metric=None, 
                                     best_all_params=None, representation=None, sorting='ASC', top_n=1, 
                                     save_records=True, not_duplicate=False, device_name="cpu"):
    """Re-evaluate the best run of a validation experiment and log the result as a new MLflow run.

    The best run of ``experiment_name`` is looked up by ``metrics.eval_<metric>``
    (optionally restricted to the hyperparameters in ``best_all_params['1']``), its
    logged model is loaded, evaluated on ``X_valid``/``y_valid``, and the metrics and
    model are logged to a new run. Optionally one row is appended to
    ``<model_name>_result_records/<model_name>_testing_result_records_with_runID.csv``.

    Args:
        X_train: Training features; only ``X_train[0]`` is used, as the signature input example.
        y_train: Unused; kept for interface compatibility.
        X_valid: NumPy feature array to evaluate on.
        y_valid: NumPy target array to evaluate on.
        model_name: Name of the model; selects the MLflow flavor and the artifact path.
        experiment_name: MLflow experiment name; must contain ``"tv"``.
        metric: Metric name used to order the runs (``metrics.eval_<metric lowercased>``).
        train_metric: Optional metric name used in the new run's name.
        model_metric: Optional metric name used in the new run's name.
        best_all_params: Optional dict whose ``'1'`` entry holds the hyperparameters
            the selected run has to match.
        representation: Optional prefix for the run name.
        sorting: Sort direction appended to the ``order_by`` clause.
        top_n: ``max_results`` passed to the run search; the first result is used.
        save_records: Whether to append the result to the CSV record file.
        not_duplicate: When the CSV already exists, abort if the same model name and
            params are already recorded.
        device_name: Torch device string for deep-learning models and their inputs.

    Returns:
        0 when a precondition fails (the reason is printed), otherwise None.
    """
    device = torch.device(device_name)
    cpu_device = torch.device("cpu")

    if "tv" not in experiment_name:
        print("Experiment must come from validation experiment !")
        return 0

    experiment = mlflow.get_experiment_by_name(experiment_name)
    if experiment is None:
        print("Experiment is empty !")
        return 0
    experiment_id = experiment.experiment_id

    current_models_two_phase = ["AutoInt", "MLP"]
    current_models_one_phase = ["FM", "XGBoost", "RandomForest"]

    if model_name in current_models_two_phase and best_all_params is not None:
        if train_metric is None or model_metric is None:
            print(f"Model in {current_models_two_phase}. If you want to use best_all_params, please make sure train_metric and model_metric not None !")
            return 0

    filter_str = None
    if best_all_params is not None:
        filter_str = _best_params_filter_string(model_name, best_all_params['1'])
        if filter_str is None:
            print(f"Model params must in {current_models_two_phase} and {current_models_one_phase} !")
            return 0

    results = mlflow.search_runs(experiment_ids=[experiment_id],
                                 filter_string=filter_str or "",
                                 order_by=[f"metrics.eval_{metric.lower()} {sorting}"],
                                 max_results=top_n)
    if results.empty:
        print("Run ID is empty !")
        return 0
    run_id = str(results.iloc[0]["run_id"])
    print(f"Run ID from best {metric} params in {experiment_name}:", run_id)

    params = mlflow.get_run(run_id).data.params

    print_nested_dict(params)

    phase = "test"

    deep_learning_model_names = ["FM", "MLP", "DeepFM", "AFM", "DCN", "xDeepFM", "AutoInt", "AFN"]
    machine_learning_model_names = ["Linear", "KNN", "SVM", "DecisionTree", "RandomForest", 
                                    "AdaBoost", "XGBoost", "CatBoost"]
    regression_loss_list = ["MSE", "RMSE", "MAE"]
    # These are the loss_type values the classification branch actually accepts.
    classification_loss_list = ["BCE", "CE"]
    metric_names = ["rmse", "mse", "mae", "accuracy", "auc_score", "f1",
                    "precision", "recall", "specificity"]

    validation_column_list = [
        "Timestamp", "Model_name", *params.keys(), "RMSE", "MSE", "MAE", "Accuracy", "AUC_score", "F1", "Precision", "Recall", "Specificity"
    ]
    validation_file_path = None

    if save_records:
        # Create folder if not exists
        folder_name = f"{model_name}_result_records"
        os.makedirs(folder_name, exist_ok=True)

        validation_file_path = os.path.join(folder_name, f'{model_name}_testing_result_records_with_runID.csv')

        if not os.path.exists(validation_file_path):
            # Start the record file with a header row only.
            pd.DataFrame(columns=validation_column_list).to_csv(
                validation_file_path, index=False, float_format='%.6f')
        elif not_duplicate:
            # Compare as strings because read_csv re-parses the stored values.
            record_df = pd.read_csv(validation_file_path)
            columns_to_compare = ["Model_name", *params.keys()]
            recorded_rows = [[str(value) for value in row]
                             for row in record_df[columns_to_compare].values.tolist()]
            candidate_row = [str(value) for value in [model_name, *params.values()]]

            if candidate_row in recorded_rows:
                print("Parameters already exists in test results !")
                return 0
    else:
        print("Test results are not saved to csv !")

    representation_name = f"{representation} " if representation is not None else ""
    if train_metric is None and model_metric is None and best_all_params is None:
        run_name = f"Trained {representation_name}{metric.upper()} params"
    elif train_metric is not None and model_metric is None:
        run_name = f"Trained {representation_name}{train_metric.upper()} params"
    elif train_metric is not None and model_metric is not None:
        run_name = f"Trained {representation_name}{train_metric.upper()} and {model_metric.upper()} params"
    elif train_metric is None and model_metric is None:
        run_name = None
    else:
        print("If you want to set run name, please make sure train_metric is not None !")
        return 0

    is_deep_learning = model_name in deep_learning_model_names
    if not is_deep_learning and model_name not in machine_learning_model_names:
        # Same print-and-return-0 convention as every other failure path.
        print(f"Please select a model in {deep_learning_model_names} and {machine_learning_model_names} !")
        return 0

    with mlflow.start_run(run_name=run_name):

        mlflow.log_params(params) # Log training parameters
        model_dir = f"runs:/{run_id}/{model_name}_model"
        test_metrics = dict.fromkeys(metric_names, 0.0)

        if is_deep_learning:
            model = mlflow.pytorch.load_model(model_dir) # Load the PyTorch model from the specified directory
            model.to(device) # keep the model on the same device as its inputs

            # Log model summary.
            with open(f"{model_name}_model_summary.txt", "w") as f:
                f.write(str(summary(model)))
            mlflow.log_artifact(f"{model_name}_model_summary.txt")

            # Convert to float tensor
            X_valid_tensor = torch.from_numpy(X_valid).float().to(device)
            y_true = torch.from_numpy(y_valid).float().to(device)

            print(f"Start testing with best {metric.upper()} params ...")

            # The whole evaluation set is scored in one batch, without dropout or gradients.
            model.eval()
            with torch.no_grad():
                if model_name == "DeepFM":
                    # Feature-index tensor of shape (rows, features, 1), built like the feature tensor.
                    Xi_valid_tensor = (torch.arange(X_valid_tensor.size(1))
                                       .unsqueeze(0).unsqueeze(-1)
                                       .repeat(X_valid_tensor.size(0), 1, 1)
                                       .int().to(device))
                    y_pred = model(Xi_valid_tensor, X_valid_tensor)
                elif model_name == "MLP":
                    y_pred = model(X_valid_tensor).view(-1)
                else:
                    y_pred = model(X_valid_tensor)

            if params['task_type'] == "Regression":
                # The loss is not logged; computing it keeps the loss_type validation
                # and surfaces prediction/target shape mismatches early.
                target = y_true.view(-1, 1) if model_name == "FM" else y_true.view(-1)
                if params['loss_type'] == "RMSE":
                    eps = 1e-6
                    loss = torch.sqrt(nn.MSELoss()(y_pred, target) + eps)
                elif params['loss_type'] == "MSE":
                    loss = nn.MSELoss()(y_pred, target)
                elif params['loss_type'] == "MAE":
                    loss = nn.L1Loss()(y_pred, target)
                else:
                    print(f"Please make sure loss type is in {regression_loss_list}")
                    return 0

                y_true_np = y_true.to(cpu_device).detach().numpy() # Convert PyTorch tensor to NumPy array
                predictions = y_pred.to(cpu_device).detach().numpy() # Convert PyTorch tensor to NumPy array
                predictions = np.nan_to_num(predictions, nan=0) # prevent error

                test_metrics.update(_regression_test_metrics(y_true_np, predictions))

            elif params['task_type'] == "Classification":
                if params['loss_type'] == "BCE":
                    criterion = nn.BCELoss()
                    y_pred = torch.sigmoid(y_pred)
                elif params['loss_type'] == "CE":
                    criterion = nn.CrossEntropyLoss()
                else:
                    print(f"Please make sure loss type is in {classification_loss_list}")
                    return 0

                loss = criterion(y_pred, y_true)

                y_true_np = y_true.to(cpu_device).detach().numpy() # Convert PyTorch tensor to NumPy array
                predictions = y_pred.to(cpu_device).detach().numpy() # Convert PyTorch tensor to NumPy array

                # NOTE(review): the label-based metrics receive the raw model scores here;
                # they presumably need thresholded / argmax labels - confirm with a classification run.
                test_metrics.update(_classification_test_metrics(y_true_np, predictions))

            else:
                print("Please make sure the task is regression or classification !")
                return 0

        else:
            if model_name == "XGBoost":
                model = mlflow.xgboost.load_model(model_dir)
            elif model_name == "CatBoost":
                model = mlflow.catboost.load_model(model_dir)
            else:
                model = mlflow.sklearn.load_model(model_dir)

            print(f"Start testing with best {metric.upper()} params ...")

            predictions = np.nan_to_num(model.predict(X_valid), nan=0) # prevent error

            if params['task_type'] == "Regression":
                test_metrics.update(_regression_test_metrics(y_valid, predictions))
            elif params['task_type'] == "Classification":
                test_metrics.update(_classification_test_metrics(y_valid, predictions))
            else:
                print("Please make sure the task is regression or classification !")
                return 0

        # Log numeric values (rounded to the 6 decimals previously used) instead of strings.
        for metric_name, metric_value in test_metrics.items():
            mlflow.log_metric(f"{phase}_{metric_name}", round(float(metric_value), 6))

        # Save the evaluated model to MLflow.
        input_example = X_train[0]
        signatures = infer_signature(input_example, predictions)
        if is_deep_learning:
            mlflow.pytorch.log_model(model, f"{model_name}_model", signature=signatures)
        elif model_name == "XGBoost":
            mlflow.xgboost.log_model(model, f"{model_name}_model", signature=signatures)
        elif model_name == "CatBoost":
            mlflow.catboost.log_model(model, f"{model_name}_model", signature=signatures)
        else:
            mlflow.sklearn.log_model(model, f"{model_name}_model", signature=signatures)

        if save_records:
            value_list = [
                datetime.now().strftime('%Y-%m-%d %H:%M:%S'), model_name, *params.values(), 
                *test_metrics.values()
            ]

            # Append the test result to the record file
            validation_records_df = pd.DataFrame([value_list], columns=validation_column_list)
            validation_records_df.to_csv(validation_file_path, mode='a', header=False, index=False, float_format='%.6f')

# Review num distribution

In [None]:
import pandas as pd
from collections import defaultdict
import nltk
# nltk.download('punkt')

def get_word_tokens(text):
    """Return the tokens that ``nltk.word_tokenize`` produces for ``text``."""
    return nltk.word_tokenize(text)

def calcuate_reviews_num(reviews_df): 
    """Aggregate review, word and token counts per user and per business.

    Args:
        reviews_df: DataFrame with ``user_id``, ``business_id`` and ``text`` columns,
            one review per row.

    Returns:
        ``(user_df, business_df)``: one row per id, in order of first appearance,
        with columns ``<id column>, reviews_num, words_num, tokens_num``. Both
        frames are empty (but keep these columns) when ``reviews_df`` has no rows.
    """
    stat_columns = ['reviews_num', 'words_num', 'tokens_num']
    user_stats = {}
    business_stats = {}

    review_rows = zip(reviews_df['user_id'], reviews_df['business_id'], reviews_df['text'])
    for user_id, business_id, text in review_rows:
        # Count once per review and credit both the user and the business;
        # tokenizing is the expensive step.
        words_num = len(text.split())
        tokens_num = len(get_word_tokens(text))

        for stats, key in ((user_stats, user_id), (business_stats, business_id)):
            entry = stats.setdefault(key, dict.fromkeys(stat_columns, 0))
            entry['reviews_num'] += 1
            entry['words_num'] += words_num
            entry['tokens_num'] += tokens_num

    def _stats_to_frame(stats, id_column):
        # Passing the columns explicitly keeps the schema stable for empty input.
        records = [{id_column: key, **counts} for key, counts in stats.items()]
        return pd.DataFrame(records, columns=[id_column, *stat_columns])

    user_df = _stats_to_frame(user_stats, 'user_id')
    business_df = _stats_to_frame(business_stats, 'business_id')

    return user_df, business_df

In [None]:
def remove_outliers(data):
    """Keep only the values of ``data`` within two standard deviations of its mean.

    ``data`` must provide ``mean()``/``std()`` and support boolean-mask indexing.
    """
    center, spread = data.mean(), data.std()
    low = center - 2 * spread
    high = center + 2 * spread
    within_bounds = (data >= low) & (data <= high)
    return data[within_bounds]

def calculate_distribution(data):
    """Summarize the distribution of ``data`` after removing outliers.

    Outliers are dropped with ``remove_outliers`` before any statistic is computed.

    Returns:
        dict with ``min``, ``max``, ``mean``, ``std``, ``mode`` (a float; the smallest
        value among the most frequent ones) and ``percentiles``, a dict keyed
        ``'5th'``, ``'10th'``, ..., ``'100th'``.
    """
    data = remove_outliers(data)

    # np.unique returns sorted values, so argmax picks the smallest most-frequent
    # value - the same answer np.bincount gave - while also accepting floats and
    # negative numbers.
    unique_values, value_counts = np.unique(data, return_counts=True)

    distribution = {
        'min': np.min(data),
        'max': np.max(data),
        'mean': np.mean(data),
        'std': np.std(data),
        'mode': float(unique_values[np.argmax(value_counts)]),
        'percentiles': {
            f'{percent}th': np.percentile(data, percent)
            for percent in range(5, 101, 5)
        }
    }

    return distribution

# Few reviews performance inspection

In [1]:
import mlflow
from mlflow.models import infer_signature
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchinfo import summary
from torch.utils.data import DataLoader, TensorDataset
import tqdm
from sklearn.metrics import mean_squared_error, root_mean_squared_error, mean_absolute_error
from sklearn.metrics import accuracy_score, auc, roc_curve, f1_score
from sklearn.metrics import precision_score, recall_score, confusion_matrix, roc_auc_score
import os
import numpy as np
import pandas as pd
from datetime import datetime

def few_reviews_performance_inspection(train_df, test_df, run_id_experiment_task_model, 
                                       user_reviews_num=3,user_comparison="equal", 
                                       user_reviews_num_range=None, business_reviews_num=100000,
                                       business_comparison="max", business_reviews_num_range=None):
    """Evaluate logged MLflow models on test reviews of users/businesses filtered from ``train_df``.

    ``train_df`` is filtered with ``sample_reviews_and_calculate_price_then_return_data``
    using the review-count arguments; the rows of ``test_df`` whose user and business
    both appear in the filtered reviews form the evaluation set. Features are the
    concatenation of all columns whose name contains "vector"; the target is ``stars``.

    Args:
        train_df: Reviews used to select users and businesses.
        test_df: Reviews to evaluate on; needs ``user_id``, ``business_id``, ``stars``
            and the vector columns.
        run_id_experiment_task_model: dict with ``run_id`` and ``experiment_name``
            sequences (one entry per model to evaluate) plus ``task_type`` and
            ``model_name``.
        user_reviews_num, user_comparison, user_reviews_num_range,
        business_reviews_num, business_comparison, business_reviews_num_range:
            Forwarded unchanged to the filtering helper.

    Returns:
        dict mapping each experiment name to its metric dict, or 0 when nothing can
        be evaluated or the model name is unsupported (the reason is printed).
    """
    # filter reviews
    filtered_reviews, calculation_results = sample_reviews_and_calculate_price_then_return_data(train_df, 100, 
                                                        user_reviews_num=user_reviews_num,
                                                        user_comparison=user_comparison,
                                                        user_reviews_num_range=user_reviews_num_range,
                                                        business_reviews_num=business_reviews_num,
                                                        business_comparison=business_comparison,
                                                        business_reviews_num_range=business_reviews_num_range,
                                                        genai="GPT-3.5 Turbo", 
                                                        sampling_method='random', column='None')

    filtered_reviews_num = calculation_results["filtered_reviews_num"]
    filtered_users_count = calculation_results["filtered_users_count"]
    filtered_businesses_count = calculation_results["filtered_businesses_count"]
    if filtered_reviews_num==0 or filtered_users_count==0 or filtered_businesses_count==0:
        print("filtered_reviews_num or filtered_users_count or filtered_businesses_count = 0 !")
        return 0
    print("filtered_reviews_num:", filtered_reviews_num)
    print("filtered_users_count:", filtered_users_count)
    print("filtered_businesses_count:", filtered_businesses_count)

    unique_user_ids = filtered_reviews['user_id'].unique()
    unique_business_ids = filtered_reviews['business_id'].unique()

    # Build a fresh frame instead of resetting the index of a filtered slice in place.
    test_data = test_df[
        (test_df['user_id'].isin(unique_user_ids)) & 
        (test_df['business_id'].isin(unique_business_ids))
    ].reset_index(drop=True)

    if test_data.empty:
        # Without this guard the models would be called on an empty batch and crash.
        print("No test reviews match the filtered users and businesses !")
        return 0

    # get X_test and y_test
    columns_to_train = [col for col in test_data.columns if "vector" in col.lower()]

    # NOTE(review): eval() executes whatever text is stored in the vector columns; if the
    # cells are plain list literals, ast.literal_eval would be the safe replacement - confirm.
    X_test = np.array([
        np.concatenate([np.array(eval(cell)) for cell in row_cells])
        for row_cells in test_data[columns_to_train].itertuples(index=False, name=None)
    ])
    y_test = np.array(test_data['stars'])

    deep_learning_model_names = ["FM", "MLP", "DeepFM", "AFM", "DCN", "xDeepFM", "AutoInt", "AFN"]
    machine_learning_model_names = ["Linear", "KNN", "SVM", "DecisionTree", "RandomForest", 
                                    "AdaBoost", "XGBoost", "CatBoost"]
    metric_names = ["rmse", "mse", "mae", "accuracy", "auc_score", "f1",
                    "precision", "recall", "specificity"]

    num_experiments = len(run_id_experiment_task_model["run_id"])
    performance_results = {}
    for i in range(num_experiments):
        run_id = run_id_experiment_task_model["run_id"][i]
        experiment_name = run_id_experiment_task_model["experiment_name"][i]

        # NOTE(review): unlike run_id/experiment_name these two are not indexed by i,
        # i.e. one task type and one model name are shared by all runs - confirm this is intended.
        task_type = run_id_experiment_task_model["task_type"]
        model_name = run_id_experiment_task_model["model_name"]

        print("run_id:", run_id)
        print("experiment_name:", experiment_name)
        print("task_type:", task_type)
        print("model_name", model_name)

        # Specify the directory containing the MLmodel file
        mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")
        model_dir = f"runs:/{run_id}/{model_name}_model"

        print(f'Start predicting with {model_name} model ...')

        if model_name in deep_learning_model_names:

            # Load the model from the specified directory
            model = mlflow.pytorch.load_model(model_dir)
            model.to(torch.device("cpu")) # the input tensors below are created on the CPU
            model.eval() # inference mode: no dropout / batch-norm updates

            # Convert to float tensor
            X_test_tensor = torch.from_numpy(X_test).float()

            # The whole test set is scored as a single batch.
            with torch.no_grad():
                if model_name == "DeepFM":
                    Xi_test_tensor = (torch.arange(X_test_tensor.size(1))
                                      .unsqueeze(0).unsqueeze(-1)
                                      .repeat(X_test_tensor.size(0), 1, 1).int())
                    y_pred = model(Xi_test_tensor, X_test_tensor)
                elif model_name == "MLP":
                    y_pred = model(X_test_tensor).view(-1)
                else:
                    y_pred = model(X_test_tensor)

            y_pred = y_pred.detach().cpu().numpy()

        elif model_name in machine_learning_model_names:
            if model_name == "XGBoost":
                model = mlflow.xgboost.load_model(model_dir)
            elif model_name == "CatBoost":
                model = mlflow.catboost.load_model(model_dir)
            else:
                model = mlflow.sklearn.load_model(model_dir)

            y_pred = model.predict(X_test)

        else:
            print(f"Please select model in {deep_learning_model_names} or {machine_learning_model_names} !")
            return 0

        # Metrics that do not apply to the task type stay at 0.0.
        results = dict.fromkeys(metric_names, 0.0)

        if task_type=="regression":
            results["rmse"] = root_mean_squared_error(y_test, y_pred)
            results["mse"] = mean_squared_error(y_test, y_pred)
            results["mae"] = mean_absolute_error(y_test, y_pred)
        else:
            results["accuracy"] = accuracy_score(y_test, y_pred) # Accuracy
            results["auc_score"] = roc_auc_score(y_test, y_pred) # AUC
            results["f1"] = f1_score(y_test, y_pred) # F1-score
            results["precision"] = precision_score(y_test, y_pred) # Precision
            results["recall"] = recall_score(y_test, y_pred) # Recall
            results["specificity"] = recall_score(y_test, y_pred, pos_label=0) # Specificity (True Negative Rate)

        print(f"{experiment_name}:\n", results)

        performance_results[experiment_name] = results

    return performance_results

In [None]:
import mlflow
from mlflow.models import infer_signature
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchinfo import summary
from torch.utils.data import DataLoader, TensorDataset
import tqdm
from sklearn.metrics import mean_squared_error, root_mean_squared_error, mean_absolute_error
from sklearn.metrics import accuracy_score, auc, roc_curve, f1_score
from sklearn.metrics import precision_score, recall_score, confusion_matrix, roc_auc_score
import os
import numpy as np
import pandas as pd
from datetime import datetime

def _few_reviews_empty_results():
    """Return the metric dict with every score zeroed (the key order fixes the CSV column order)."""
    return {
        "rmse": 0.0,
        "mse": 0.0,
        "mae": 0.0,
        "accuracy": 0.0,
        "auc_score": 0.0,
        "f1": 0.0,
        "precision": 0.0,
        "recall": 0.0,
        "specificity": 0.0,
    }


def _few_reviews_is_combined(large_experiment):
    """Tell whether the experiment name contains 'ori' and 'gpt' but not 'concat'."""
    return ("ori" in large_experiment and "gpt" in large_experiment
            and "concat" not in large_experiment)


def _few_reviews_read_split(path, split):
    """Read the first CSV in `path` whose file name contains both `split` and 'set'."""
    matches = [file for file in os.listdir(path) if split in file and 'set' in file]
    # IndexError here means the directory holds no such file.
    return pd.read_csv(path + matches[0])


def _few_reviews_load_frames(large_experiment, format_name, representation_name):
    """Load the training frame used for filtering and the test frame(s) used for scoring.

    Returns:
        (train_df, test_frames): for a combined experiment, `test_frames` holds one
        frame per matching basic experiment and `train_df` is the first match's
        training set; otherwise `test_frames` has a single element.
    """
    if not _few_reviews_is_combined(large_experiment):
        path = f"./{large_experiment}/{format_name}/{representation_name}/"
        return _few_reviews_read_split(path, 'train'), [_few_reviews_read_split(path, 'test')]

    train_df = None
    test_frames = []
    for basic_experiment in ["original", "gpt_prompt_v1", "gpt_prompt_v3"]:
        if basic_experiment in large_experiment:
            path = f"./{basic_experiment}/{format_name}/{representation_name}/"
            # Only the first matching experiment's training set is ever used.
            if train_df is None:
                train_df = _few_reviews_read_split(path, 'train')
            test_frames.append(_few_reviews_read_split(path, 'test'))

    if train_df is None:
        raise IndexError(f"No basic experiment name found in '{large_experiment}'")
    return train_df, test_frames


def _few_reviews_parse_vector(cell):
    """Turn one CSV cell holding a serialized vector into a numpy array."""
    # NOTE(review): eval() executes arbitrary code found in the CSV. If the cells are
    # plain list literals, ast.literal_eval would be the safe drop-in -- confirm the format.
    return np.array(eval(cell))


def _few_reviews_build_test_arrays(test_frames, user_ids, business_ids, vectors_per_frame=None):
    """Filter the test frame(s) to the given users/businesses and build (X_test, y_test).

    Each row of X_test is the concatenation of the "vector" columns: first vector
    column of every frame, then the second one of every frame, and so on.
    `vectors_per_frame=None` uses all vector columns of the first frame.

    Returns:
        (X_test, y_test) as numpy arrays, or (None, None) when any filtered frame is empty.
    """
    kept_frames = []
    for frame in test_frames:
        frame = frame[frame['user_id'].isin(user_ids) & frame['business_id'].isin(business_ids)]
        if len(frame) < 1:
            return None, None
        # Non-mutating reset so positional labels 0..n-1 exist without touching a slice in place.
        kept_frames.append(frame.reset_index(drop=True))

    columns_per_frame = [[col for col in frame.columns if "vector" in col.lower()]
                         for frame in kept_frames]
    slots = len(columns_per_frame[0]) if vectors_per_frame is None else vectors_per_frame
    ordered_columns = [(frame, columns[slot])
                       for slot in range(slots)
                       for frame, columns in zip(kept_frames, columns_per_frame)]

    rows = []
    for i in range(len(kept_frames[0])):
        rows.append(np.concatenate([_few_reviews_parse_vector(frame[col][i])
                                    for frame, col in ordered_columns]))

    return np.array(rows), np.array(kept_frames[0]['stars'])


def _few_reviews_load_recorded_keys(record_file_name, key_columns, all_columns):
    """Return the set of already-recorded key tuples, creating an empty record file if missing."""
    if not os.path.exists(record_file_name):
        pd.DataFrame(columns=all_columns).to_csv(record_file_name, index=False, float_format='%.6f')
        return set()
    record_df = pd.read_csv(record_file_name)
    return {tuple(str(value) for value in row)
            for row in record_df[key_columns].values.tolist()}


def _few_reviews_predict(model_name, model_dir, X_test,
                         deep_learning_model_names, machine_learning_model_names):
    """Load the MLflow model at `model_dir` and predict on X_test; None for an unknown model name."""
    if model_name in deep_learning_model_names:
        model = mlflow.pytorch.load_model(model_dir)
        model.eval()
        x = torch.from_numpy(X_test).float()
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            if model_name == "DeepFM":
                # Per-sample feature indices 0..d-1, shaped (n_samples, d, 1).
                xi = torch.arange(x.size(1)).unsqueeze(0).unsqueeze(-1).repeat(x.size(0), 1, 1).int()
                y_pred = model(xi, x)
            elif model_name == "MLP":
                y_pred = model(x).view(-1)
            else:
                y_pred = model(x)
        return y_pred.detach().numpy()

    if model_name in machine_learning_model_names:
        if model_name == "XGBoost":
            model = mlflow.xgboost.load_model(model_dir)
        elif model_name == "CatBoost":
            model = mlflow.catboost.load_model(model_dir)
        else:
            model = mlflow.sklearn.load_model(model_dir)
        return model.predict(X_test)

    return None


def _few_reviews_score(y_test, y_pred, task_type):
    """Compute the metric dict for the task type; metrics of the other task stay 0.0."""
    results = _few_reviews_empty_results()
    if task_type == "regression":
        results["rmse"] = root_mean_squared_error(y_test, y_pred)
        results["mse"] = mean_squared_error(y_test, y_pred)
        results["mae"] = mean_absolute_error(y_test, y_pred)
    else:
        results["accuracy"] = accuracy_score(y_test, y_pred)  # Accuracy
        results["auc_score"] = roc_auc_score(y_test, y_pred)  # AUC
        results["f1"] = f1_score(y_test, y_pred)  # F1-score
        results["precision"] = precision_score(y_test, y_pred)  # Precision
        results["recall"] = recall_score(y_test, y_pred)  # Recall
        results["specificity"] = recall_score(y_test, y_pred, pos_label=0)  # Specificity (True Negative Rate)
    return results


def inspect_few_reviews_performance(experiment_model_best_run_ids, task_type="regression"):
    """Score the best runs on test reviews of users/businesses with few training reviews.

    For every experiment and representation, the training set is filtered by a
    review-count threshold (users with an "equal" count while businesses are
    unbounded, then the reverse), the test set is restricted to the users and
    businesses that survive the filter, and every model's best run is loaded
    from MLflow and scored on that subset.

    Args:
        experiment_model_best_run_ids: nested mapping
            experiment -> representation -> model name -> metric -> run id.
            A run id of 0 means "no run"; such entries are printed with zeroed
            scores but not written to the CSV. Experiments whose
            ["bbs"]["FM"]["MSE"] entry is 0 are skipped entirely.
        task_type: "regression" computes RMSE/MSE/MAE; any other value computes
            the classification metrics.

    Returns:
        Nested dict experiment -> format -> representation -> model -> metric ->
        user_comparison -> user_reviews_num -> business_comparison ->
        business_reviews_num -> results dict. Returns 0 if a model name is
        neither a known deep-learning nor machine-learning model.

    Side effects:
        Appends one row per newly evaluated combination to
        "inspect_few_reviews_performance_results.csv" in the working directory
        (combinations already present there are skipped) and loads models from
        the MLflow tracking server at http://127.0.0.1:8080.

    Note:
        Relies on `calcuate_reviews_num`, `calculate_distribution` and
        `sample_reviews_and_calculate_price_then_return_data` being defined
        elsewhere in the notebook.
    """
    deep_learning_model_names = ["FM", "MLP", "DeepFM", "AFM", "DCN", "xDeepFM", "AutoInt", "AFN"]
    machine_learning_model_names = ["Linear", "KNN", "SVM", "DecisionTree", "RandomForest",
                                    "AdaBoost", "XGBoost", "CatBoost"]
    record_file_name = "inspect_few_reviews_performance_results.csv"
    key_columns = ["experiment_name", "format_name", "representation_name", "model_name",
                   "user_comparison", "user_reviews_num", "business_comparison",
                   "business_reviews_num", "metric"]
    column_list = key_columns + ["filtered_reviews_num", "filtered_users_count",
                                 "filtered_businesses_count", "num_of_test_data",
                                 *_few_reviews_empty_results().keys()]
    # Threshold passed for the side compared with "max", i.e. effectively unbounded.
    unbounded_reviews_num = 10000000
    user_reviews_num_range = None
    business_reviews_num_range = None

    performance_results = {}
    recorded_keys = None  # loaded lazily so the record file is only touched when needed

    for large_experiment, small_experiments in experiment_model_best_run_ids.items():

        if small_experiments["bbs"]["FM"]["MSE"] == 0:
            continue

        is_combined = _few_reviews_is_combined(large_experiment)

        for representation, models in small_experiments.items():

            representation_name = "BERT_base_sentence_embedding" if representation == "bbs" else "RoBERTa_base_sentence_embedding"
            format_name = "text_format4" if "ori" in large_experiment and "gpt" not in large_experiment else "text_format_v4"

            train_df, test_frames = _few_reviews_load_frames(large_experiment, format_name, representation_name)

            user_df, business_df = calcuate_reviews_num(train_df)
            user_max_reviews_num = calculate_distribution(user_df["reviews_num"])['max']
            business_max_reviews_num = calculate_distribution(business_df["reviews_num"])['max']

            for user_comparison in ["equal", "max"]:
                for business_comparison in ["max", "equal"]:

                    if user_comparison == "equal" and business_comparison == "equal":
                        print("Skip user_comparison equal and business_comparison equal !")
                        continue
                    if user_comparison == "max" and business_comparison == "max":
                        print("Skip user_comparison max and business_comparison max !")
                        continue

                    # Only one side varies per mode; sweeping both would just repeat
                    # the same (expensive) filter call with identical arguments.
                    if user_comparison == "equal":
                        thresholds = [(count, unbounded_reviews_num)
                                      for count in range(1, user_max_reviews_num + 1)]
                    else:
                        thresholds = [(unbounded_reviews_num, count)
                                      for count in range(1, business_max_reviews_num + 1)]

                    for user_reviews_num, business_reviews_num in thresholds:

                        # filter reviews
                        filtered_reviews, calculation_results = sample_reviews_and_calculate_price_then_return_data(
                            train_df, 100,
                            user_reviews_num=user_reviews_num,
                            user_comparison=user_comparison,
                            user_reviews_num_range=user_reviews_num_range,
                            business_reviews_num=business_reviews_num,
                            business_comparison=business_comparison,
                            business_reviews_num_range=business_reviews_num_range,
                            genai="GPT-3.5 Turbo",
                            sampling_method='random', column='None')

                        filtered_reviews_num = calculation_results["filtered_reviews_num"]
                        filtered_users_count = calculation_results["filtered_users_count"]
                        filtered_businesses_count = calculation_results["filtered_businesses_count"]
                        if filtered_reviews_num == 0 or filtered_users_count == 0 or filtered_businesses_count == 0:
                            print("filtered_reviews_num or filtered_users_count or filtered_businesses_count = 0 !")
                            print("\n\n\n")
                            print("-----------------------------------------------------------")
                            print("\n\n\n")
                            continue

                        print("filtered_reviews_num:", filtered_reviews_num)
                        print("filtered_users_count:", filtered_users_count)
                        print("filtered_businesses_count:", filtered_businesses_count)

                        # obtaining X_test and y_test
                        X_test, y_test = _few_reviews_build_test_arrays(
                            test_frames,
                            filtered_reviews['user_id'].unique(),
                            filtered_reviews['business_id'].unique(),
                            vectors_per_frame=2 if is_combined else None)

                        if X_test is None:
                            print('Test data is empty ! \n\n')
                            print("-----------------------------------------------------------")
                            continue

                        # start testing
                        for model_name, metrics in models.items():
                            for metric, run_id in metrics.items():

                                if recorded_keys is None:
                                    recorded_keys = _few_reviews_load_recorded_keys(
                                        record_file_name, key_columns, column_list)

                                key_values = [large_experiment, format_name, representation_name, model_name,
                                              user_comparison, user_reviews_num,
                                              business_comparison, business_reviews_num, metric]
                                # Compared as strings, matching how values come back from the CSV.
                                record_key = tuple(str(value) for value in key_values)
                                if record_key in recorded_keys:
                                    print("This combination already exists in record results !")
                                    continue

                                if run_id == 0:
                                    results = _few_reviews_empty_results()
                                else:
                                    mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")
                                    model_dir = f"runs:/{run_id}/{model_name}_model"

                                    print(f'Start predicting with {large_experiment}/{format_name}/{representation_name}/{model_name}/{metric} ...')

                                    y_pred = _few_reviews_predict(model_name, model_dir, X_test,
                                                                  deep_learning_model_names,
                                                                  machine_learning_model_names)
                                    if y_pred is None:
                                        print(f"Please select model in {deep_learning_model_names} or {machine_learning_model_names} !")
                                        return 0

                                    results = _few_reviews_score(y_test, y_pred, task_type)

                                print(f"{large_experiment}/{format_name}/{representation_name}/{model_name}/{metric}, uc:{user_comparison}, un:{user_reviews_num}, bc:{business_comparison}, bn:{business_reviews_num} :\n", results)

                                # setdefault keeps results stored earlier under the same prefix
                                # instead of overwriting the whole sub-tree on every record.
                                node = performance_results
                                for level in (large_experiment, format_name, representation_name, model_name,
                                              metric, user_comparison, user_reviews_num, business_comparison):
                                    node = node.setdefault(level, {})
                                node[business_reviews_num] = results

                                # Save results to file
                                if run_id != 0:
                                    value_list = [*key_values, filtered_reviews_num, filtered_users_count,
                                                  filtered_businesses_count, len(X_test), *results.values()]
                                    records_df = pd.DataFrame([value_list], columns=column_list)
                                    records_df.to_csv(record_file_name, mode='a', header=False, index=False, float_format='%.6f')
                                    recorded_keys.add(record_key)

    return performance_results

# Inspect performance of different stars

In [None]:
import mlflow
from mlflow.models import infer_signature
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchinfo import summary
from torch.utils.data import DataLoader, TensorDataset
import tqdm
from sklearn.metrics import mean_squared_error, root_mean_squared_error, mean_absolute_error
from sklearn.metrics import accuracy_score, auc, roc_curve, f1_score
from sklearn.metrics import precision_score, recall_score, confusion_matrix, roc_auc_score
import os
import numpy as np
import pandas as pd
from datetime import datetime

def _stars_uses_multiple_sources(large_experiment):
    """Return True when the experiment name contains "ori" and "gpt" but not "concat"."""
    return "ori" in large_experiment and "gpt" in large_experiment and "concat" not in large_experiment


def _stars_read_split(directory, split):
    """Read the CSV in `directory` whose file name contains both `split` and 'set'.

    Raises IndexError when no such file exists (same failure mode as before).
    """
    file_name = [file for file in os.listdir(directory) if split in file and 'set' in file][0]
    return pd.read_csv(directory + file_name)


def _stars_load_train_and_test(large_experiment, format_name, representation_name):
    """Load the training frame and the list of test frames for one experiment.

    Single-source experiments return one test frame. Multi-source experiments
    return one test frame per basic experiment named inside `large_experiment`;
    the training frame is taken from the first of those sources only, which is
    the only one the star filter ever used.
    """
    if not _stars_uses_multiple_sources(large_experiment):
        path = f"./{large_experiment}/{format_name}/{representation_name}/"
        return _stars_read_split(path, 'train'), [_stars_read_split(path, 'test')]

    basic_experiments = ["original", "gpt_prompt_v1", "gpt_prompt_v3"]
    # NOTE(review): every source directory is built with the combined experiment's
    # format_name ("text_format_v4"), while a stand-alone "ori"-only experiment
    # uses "text_format4" - confirm the on-disk layout for the "original" source.
    source_dirs = [f"./{basic_experiment}/{format_name}/{representation_name}/"
                   for basic_experiment in basic_experiments
                   if basic_experiment in large_experiment]
    if not source_dirs:
        raise ValueError(f"No basic experiment of {basic_experiments} found in '{large_experiment}'")

    train_df = _stars_read_split(source_dirs[0], 'train')
    test_frames = [_stars_read_split(source_dir, 'test') for source_dir in source_dirs]
    return train_df, test_frames


def _stars_build_test_arrays(test_frames, unique_user_ids, unique_business_ids, multi_source):
    """Filter the test frames to known users/businesses and build (X_test, y_test).

    Returns (None, None) when any frame has no matching row. Each feature row is
    the concatenation of the parsed "vector" columns: all vector columns of the
    single frame, or - for multi-source experiments - the first two vector
    columns of every frame, ordered by column position and then by source.
    """
    kept_frames = []
    for frame in test_frames:
        subset = frame[frame['user_id'].isin(unique_user_ids) &
                       frame['business_id'].isin(unique_business_ids)]
        if len(subset) < 1:
            return None, None
        # Non-inplace reset so the filtered slice is never mutated in place.
        kept_frames.append(subset.reset_index(drop=True))

    vector_columns = [[col for col in frame.columns if "vector" in col.lower()]
                      for frame in kept_frames]
    if multi_source:
        selection = [(frame, cols[j])
                     for j in range(2)  # 2 vectors
                     for frame, cols in zip(kept_frames, vector_columns)]
    else:
        selection = [(kept_frames[0], col) for col in vector_columns[0]]

    # NOTE(review): multi-source rows are paired purely by position after each
    # frame is filtered independently; this presumes every source lists the same
    # reviews in the same order - confirm against how the CSVs are produced.
    concatenated_vectors = []
    for i in range(len(kept_frames[0])):
        col_vectors = []
        for frame, col in selection:
            # NOTE(security): eval() executes whatever expression the CSV cell holds.
            # Only run this on trusted files; ast.literal_eval would be safer if the
            # cells are plain list literals.
            col_vectors.append(np.array(eval(frame.loc[i, col])))
        concatenated_vectors.append(np.concatenate(col_vectors))

    return np.array(concatenated_vectors), np.array(kept_frames[0]['stars'])


def _stars_predict(model_name, run_id, X_test):
    """Load the MLflow model logged under `run_id` and predict on `X_test`.

    Returns the predictions, or None (after printing a hint) when `model_name`
    is not one of the supported deep-learning / machine-learning models.
    """
    deep_learning_model_names = ["FM", "MLP", "DeepFM", "AFM", "DCN", "xDeepFM", "AutoInt", "AFN"]
    machine_learning_model_names = ["Linear", "KNN", "SVM", "DecisionTree", "RandomForest",
                                    "AdaBoost", "XGBoost", "CatBoost"]

    mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")
    model_dir = f"runs:/{run_id}/{model_name}_model"

    if model_name in deep_learning_model_names:
        model = mlflow.pytorch.load_model(model_dir)
        # Idempotent: puts dropout / batch-norm layers in inference mode.
        model.eval()
        X_test_tensor = torch.from_numpy(X_test).float()
        # The whole test set is scored in one forward pass (the previous DataLoader
        # used batch_size=len(y_test), i.e. a single batch).
        with torch.no_grad():
            if model_name == "DeepFM":
                # Feature-index tensor: 0..n_features-1 repeated for every row.
                Xi_test_tensor = (torch.arange(X_test_tensor.size(1))
                                  .unsqueeze(0).unsqueeze(-1)
                                  .repeat(X_test_tensor.size(0), 1, 1).int())
                y_pred = model(Xi_test_tensor, X_test_tensor)
            elif model_name == "MLP":
                y_pred = model(X_test_tensor).view(-1)
            else:
                y_pred = model(X_test_tensor)
        return y_pred.detach().numpy()

    if model_name in machine_learning_model_names:
        if model_name == "XGBoost":
            model = mlflow.xgboost.load_model(model_dir)
        elif model_name == "CatBoost":
            model = mlflow.catboost.load_model(model_dir)
        else:
            model = mlflow.sklearn.load_model(model_dir)
        return model.predict(X_test)

    print(f"Please select model in {deep_learning_model_names} or {machine_learning_model_names} !")
    return None


def _stars_compute_metrics(y_test, y_pred, task_type):
    """Return the metrics for `task_type`: regression errors, or classification scores otherwise."""
    if task_type == "regression":
        return {
            "rmse": root_mean_squared_error(y_test, y_pred),
            "mse": mean_squared_error(y_test, y_pred),
            "mae": mean_absolute_error(y_test, y_pred),
        }
    return {
        "accuracy": accuracy_score(y_test, y_pred),  # Accuracy
        "auc_score": roc_auc_score(y_test, y_pred),  # AUC
        "f1": f1_score(y_test, y_pred),  # F1-score
        "precision": precision_score(y_test, y_pred),  # Precision
        "recall": recall_score(y_test, y_pred),  # Recall
        "specificity": recall_score(y_test, y_pred, pos_label=0),  # Specificity (True Negative Rate)
    }


def _stars_load_recorded_keys(record_file_name, key_columns, column_list):
    """Return the already-recorded key tuples (as strings), creating the CSV header if missing."""
    if not os.path.exists(record_file_name):
        pd.DataFrame(columns=column_list).to_csv(record_file_name, index=False, float_format='%.6f')
        return set()
    recorded_rows = pd.read_csv(record_file_name)[key_columns].values.tolist()
    return {tuple(str(value) for value in row) for row in recorded_rows}


def inspect_stars_performance(experiment_model_best_run_ids, task_type="regression"):
    """Score every best run on the test rows associated with each star rating (1-5).

    For each experiment and representation, the training reviews are split by
    their `stars` value. The test rows whose `user_id` and `business_id` both
    occur in that star's training subset are then used to evaluate every
    (model, metric) run id. Rows for real runs are appended to
    "inspect_few_reviews_performance_results.csv"; combinations already present
    in that file are skipped.

    Args:
        experiment_model_best_run_ids: nested dict
            {experiment: {representation: {model_name: {metric: run_id}}}}.
            An experiment is processed only when its ["bbs"]["FM"]["MSE"] entry
            is not 0. A run_id of 0 yields all-zero metrics and is not written
            to the CSV.
        task_type: "regression" computes rmse/mse/mae; any other value computes
            the classification scores.

    Returns:
        Nested dict
        {experiment: {format: {representation: {model: {metric: {star: results}}}}}}
        holding the metrics dict of every combination evaluated in this call,
        or 0 when an unsupported model name is encountered.
    """
    # NOTE(review): the file name says "few_reviews" although this function records
    # per-star results; confirm no other function writes a different column layout
    # to the same file.
    record_file_name = "inspect_few_reviews_performance_results.csv"
    key_columns = ["experiment_name", "format_name", "representation_name", "model_name", "metric", "star_value"]
    count_columns = ["filtered_reviews_num", "filtered_users_count",
                     "filtered_businesses_count", "num_of_test_data"]
    metric_columns = ["rmse", "mse", "mae", "accuracy", "auc_score", "f1",
                      "precision", "recall", "specificity"]
    column_list = key_columns + count_columns + metric_columns

    performance_results = {}
    recorded_keys = None  # read lazily, once, then kept in sync with appended rows

    for large_experiment, small_experiments in experiment_model_best_run_ids.items():

        # Gate on the BERT / FM / MSE entry: 0 means this experiment is skipped entirely.
        if small_experiments["bbs"]["FM"]["MSE"] == 0:
            continue

        multi_source = _stars_uses_multiple_sources(large_experiment)
        format_name = "text_format4" if "ori" in large_experiment and "gpt" not in large_experiment else "text_format_v4"

        for representation, models in small_experiments.items():

            representation_name = "BERT_base_sentence_embedding" if representation == "bbs" else "RoBERTa_base_sentence_embedding"
            train_df, test_frames = _stars_load_train_and_test(large_experiment, format_name, representation_name)

            for star_value in [1, 2, 3, 4, 5]:

                filtered_reviews = train_df[train_df['stars'] == star_value]
                unique_user_ids = filtered_reviews['user_id'].unique()
                unique_business_ids = filtered_reviews['business_id'].unique()

                filtered_reviews_num = len(filtered_reviews)
                filtered_users_count = len(unique_user_ids)
                filtered_businesses_count = len(unique_business_ids)

                if filtered_reviews_num == 0 or filtered_users_count == 0 or filtered_businesses_count == 0:
                    print("filtered_reviews_num or filtered_users_count or filtered_businesses_count = 0 !")
                    print("\n\n\n")
                    print("-----------------------------------------------------------")
                    print("\n\n\n")
                    continue

                print("filtered_reviews_num:", filtered_reviews_num)
                print("filtered_users_count:", filtered_users_count)
                print("filtered_businesses_count:", filtered_businesses_count)

                # obtaining X_test and y_test
                X_test, y_test = _stars_build_test_arrays(test_frames, unique_user_ids,
                                                          unique_business_ids, multi_source)
                if X_test is None:
                    print('Test data is empty ! \n\n')
                    print("-----------------------------------------------------------")
                    continue

                # start testing
                for model_name, metrics in models.items():
                    for metric, run_id in metrics.items():

                        if recorded_keys is None:
                            recorded_keys = _stars_load_recorded_keys(record_file_name, key_columns, column_list)

                        record_key = tuple(str(value) for value in (
                            large_experiment, format_name, representation_name,
                            model_name, metric, star_value))
                        if record_key in recorded_keys:
                            print("This combination already exists in record results !")
                            continue

                        # Every metric defaults to 0.0; only those of the task are overwritten.
                        results = dict.fromkeys(metric_columns, 0.0)

                        if run_id != 0 and len(X_test) >= 1:
                            print(f'Start predicting with {large_experiment}/{format_name}/{representation_name}/{model_name}/{metric} ...')
                            y_pred = _stars_predict(model_name, run_id, X_test)
                            if y_pred is None:
                                # Unsupported model name: abort as before.
                                return 0
                            results.update(_stars_compute_metrics(y_test, y_pred, task_type))

                        print(f"{large_experiment}/{format_name}/{representation_name}/{model_name}/{metric}, star_value:{star_value}", results)

                        # setdefault keeps results of earlier combinations instead of
                        # resetting the nested dict on every iteration.
                        leaf = (performance_results
                                .setdefault(large_experiment, {})
                                .setdefault(format_name, {})
                                .setdefault(representation_name, {})
                                .setdefault(model_name, {})
                                .setdefault(metric, {}))
                        leaf[star_value] = dict(results)

                        # Save results to file
                        if run_id != 0:
                            value_list = [large_experiment, format_name, representation_name, model_name, metric, star_value,
                                          filtered_reviews_num, filtered_users_count, filtered_businesses_count,
                                          len(X_test), *results.values()]
                            records_df = pd.DataFrame([value_list], columns=column_list)
                            records_df.to_csv(record_file_name, mode='a', header=False, index=False, float_format='%.6f')
                            recorded_keys.add(record_key)

    return performance_results

# Print nested dictionary data

In [None]:
def print_nested_dict(d, indent=0):
    """Print a (possibly nested) dict, one key per line.

    Keys are prefixed with `indent` spaces; the content of a nested dict is
    printed below its key with four more spaces of indentation.
    """
    padding = ' ' * indent
    for name, content in d.items():
        if not isinstance(content, dict):
            # Leaf: key and value share one line.
            print(f"{padding}{name!s}: {content!s}")
            continue
        # Nested dict: print the key alone, then recurse one level deeper.
        print(f"{padding}{name!s}:")
        print_nested_dict(content, indent + 4)

def write_nested_dict_to_file(d, f, indent=0):
    """Write a (possibly nested) dict to the writable object `f`, one key per line.

    `indent` is the nesting level: each level adds two spaces in front of the
    key, and the content of a nested dict is written one level deeper.
    """
    padding = "  " * indent
    for name, content in d.items():
        if not isinstance(content, dict):
            # Leaf: key and value share one line.
            f.write(f"{padding}{name!s}: {content!s}\n")
            continue
        # Nested dict: write the key alone, then recurse one level deeper.
        f.write(f"{padding}{name!s}:\n")
        write_nested_dict_to_file(content, f, indent + 1)

# Inspect performance for different numbers of reviews

In [None]:
import mlflow
from mlflow.models import infer_signature
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchinfo import summary
from torch.utils.data import DataLoader, TensorDataset
import tqdm
from sklearn.metrics import mean_squared_error, root_mean_squared_error, mean_absolute_error
from sklearn.metrics import accuracy_score, auc, roc_curve, f1_score
from sklearn.metrics import precision_score, recall_score, confusion_matrix, roc_auc_score
import os
import numpy as np
import pandas as pd
from datetime import datetime

def return_filtered_train_test_data(train_df, test_df, user_reviews_num=3, user_comparison="equal",
                                    user_reviews_num_range=None, business_reviews_num=100000,
                                    business_comparison="max", business_reviews_num_range=None):
    """Filter training reviews by review counts and keep only the matching test rows.

    The training frame is passed to `sample_reviews_and_calculate_price_then_return_data`
    together with the user / business review-count criteria. Test rows are kept
    when both their `user_id` and `business_id` occur in the filtered training
    reviews.

    Returns:
        0 when the sampler reports zero filtered reviews, users or businesses;
        otherwise a tuple `(filtered_reviews, test_data, calculation_results)`
        where `test_data` has a fresh index (the old one is kept as an `index`
        column) and `calculation_results` holds before/after counts for the
        training and test data.
    """
    # filter reviews
    sampler_options = dict(
        user_reviews_num=user_reviews_num,
        user_comparison=user_comparison,
        user_reviews_num_range=user_reviews_num_range,
        business_reviews_num=business_reviews_num,
        business_comparison=business_comparison,
        business_reviews_num_range=business_reviews_num_range,
        genai="GPT-3.5 Turbo",
        sampling_method='random',
        column='None',
    )
    filtered_reviews, sampling_summary = sample_reviews_and_calculate_price_then_return_data(
        train_df, 100, **sampler_options)

    filtered_reviews_num = sampling_summary["filtered_reviews_num"]
    filtered_users_count = sampling_summary["filtered_users_count"]
    filtered_businesses_count = sampling_summary["filtered_businesses_count"]
    if any(count == 0 for count in (filtered_reviews_num, filtered_users_count, filtered_businesses_count)):
        print("filtered_reviews_num or filtered_users_count or filtered_businesses_count = 0 !")
        return 0

    print(f"User reviews: {user_reviews_num} {user_comparison} or {user_reviews_num_range}")
    print(f"Business reviews: {business_reviews_num} {business_comparison} or {business_reviews_num_range}")

    # Counts on the unfiltered training data, computed once and reused below.
    train_rows = len(train_df)
    train_users = len(train_df['user_id'].unique())
    train_businesses = len(train_df['business_id'].unique())

    calculation_results = {
        "Train_data_num_before": train_rows,
        "Train_data_num_after": filtered_reviews_num,
        "Train_data_user_count_before": train_users,
        "Train_data_user_count_after": filtered_users_count,
        "Train_data_business_count_before": train_businesses,
        "Train_data_business_count_after": filtered_businesses_count,
    }

    print("Training data num:")
    print("Before:", train_rows, "After:", filtered_reviews_num, '\n')
    print("Training data user count:")
    print("Before:", train_users, "After:", filtered_users_count, '\n')
    print("Training data business count:")
    print("Before:", train_businesses, "After:", filtered_businesses_count, '\n')

    # Keep test rows whose user AND business both survived the training filter.
    known_users = test_df['user_id'].isin(filtered_reviews['user_id'].unique())
    known_businesses = test_df['business_id'].isin(filtered_reviews['business_id'].unique())
    test_data = test_df[known_users & known_businesses].reset_index()

    test_rows_before = len(test_df)
    test_rows_after = len(test_data)
    test_users_before = len(test_df['user_id'].unique())
    test_users_after = len(test_data['user_id'].unique())
    test_businesses_before = len(test_df['business_id'].unique())
    test_businesses_after = len(test_data['business_id'].unique())

    calculation_results.update({
        "Test_data_num_before": test_rows_before,
        "Test_data_num_after": test_rows_after,
        "Test_data_user_count_before": test_users_before,
        "Test_data_user_count_after": test_users_after,
        "Test_data_business_count_before": test_businesses_before,
        "Test_data_business_count_after": test_businesses_after,
    })

    print("Validation (Test) data num:")
    print("Before:", test_rows_before, "After:", test_rows_after, '\n')
    print("Validation (Test) data user count:")
    print("Before:", test_users_before, "After:", test_users_after, '\n')
    print("Validation (Test) data business count:")
    print("Before:", test_businesses_before, "After:", test_businesses_after)

    return filtered_reviews, test_data, calculation_results