In [5]:
# from pathlib import Path
# import nbformat

# def load_notebook(notebook_path):
#     with open(notebook_path, 'r', encoding='utf-8') as f:
#         nb = nbformat.read(f, as_version=4)
#     code_cells = [cell.source for cell in nb.cells if cell.cell_type == 'code']
#     exec('\n'.join(code_cells), globals())

# # import written function and variable

# parent_directory = Path('./')
# data_preprocessing_utils_path = parent_directory / 'data_preprocessing_utils.ipynb'
# load_notebook(data_preprocessing_utils_path)

# Review count distribution

In [None]:
import pandas as pd
from collections import defaultdict
import nltk
# nltk.download('punkt')

def get_word_tokens(text):
    """Tokenize ``text`` with NLTK's ``word_tokenize``.

    Args:
        text: The string to tokenize.

    Returns:
        The tokens exactly as produced by ``nltk.word_tokenize(text)``.

    Note:
        NLTK's tokenizer data must be installed for this call to succeed
        (presumably the 'punkt' resource -- confirm for the NLTK version used).
    """
    return nltk.word_tokenize(text)

def _review_stats_to_frame(stats, id_column):
    """Turn an ``{id: {stat: value}}`` mapping into a DataFrame, one row per id."""
    columns = [id_column, 'reviews_num', 'words_num', 'tokens_num']
    records = [{id_column: key, **values} for key, values in stats.items()]
    # Passing ``columns`` explicitly keeps the schema stable even when
    # ``records`` is empty (an empty input frame).
    return pd.DataFrame(records, columns=columns)


def calcuate_reviews_num(reviews_df):
    """Aggregate review, word and token counts per user and per business.

    The misspelled function name is kept so existing callers keep working.

    Args:
        reviews_df: DataFrame with ``user_id``, ``business_id`` and ``text``
            columns. Every ``text`` value must be a string (``.split()`` is
            called on it and it is passed to ``get_word_tokens``).

    Returns:
        A ``(user_df, business_df)`` tuple.

        * ``user_df`` has columns
          ``user_id, reviews_num, words_num, tokens_num``.
        * ``business_df`` has columns
          ``business_id, reviews_num, words_num, tokens_num``.

        Each frame has one row per distinct id, in order of first appearance.
        ``words_num`` counts whitespace-separated words and ``tokens_num``
        counts the tokens returned by ``get_word_tokens``. An empty input
        yields two empty frames with these columns.
    """
    user_stats = {}
    business_stats = {}

    # Single pass over the reviews: each text is split and tokenized once and
    # the counts are credited to both the review's user and its business.
    rows = zip(reviews_df['user_id'], reviews_df['business_id'], reviews_df['text'])
    for user_id, business_id, text in rows:
        words_num = len(text.split())
        tokens_num = len(get_word_tokens(text))

        for stats, key in ((user_stats, user_id), (business_stats, business_id)):
            entry = stats.setdefault(
                key, {'reviews_num': 0, 'words_num': 0, 'tokens_num': 0})
            entry['reviews_num'] += 1
            entry['words_num'] += words_num
            entry['tokens_num'] += tokens_num

    user_df = _review_stats_to_frame(user_stats, 'user_id')
    business_df = _review_stats_to_frame(business_stats, 'business_id')

    return user_df, business_df

In [None]:
def remove_outliers(data, num_std=2):
    """Keep only the values within ``num_std`` standard deviations of the mean.

    Args:
        data: A NumPy array or pandas Series of numbers. It must support
            ``.mean()``, ``.std()``, element-wise comparison and boolean-mask
            indexing.
        num_std: Half-width of the accepted band, in standard deviations.
            Defaults to 2, the threshold this function has always used.

    Returns:
        The elements of ``data`` satisfying
        ``mean - num_std * std <= x <= mean + num_std * std`` (bounds
        inclusive), in the same container type as ``data``.

    Note:
        The standard deviation is whatever ``data.std()`` returns, so it
        follows the container's own default (NumPy arrays and pandas Series
        use different default degrees of freedom).
    """
    mean = data.mean()
    half_width = num_std * data.std()
    within_band = (data >= mean - half_width) & (data <= mean + half_width)
    return data[within_band]

def calculate_distribution(data):
    """Summarize the distribution of ``data`` after removing outliers.

    Outliers are dropped with ``remove_outliers`` first; every statistic is
    computed on what remains.

    Args:
        data: A NumPy array or pandas Series of numbers.

    Returns:
        A dict with keys ``'min'``, ``'max'``, ``'mean'``, ``'std'``,
        ``'mode'`` and ``'percentiles'``.

        * ``'std'`` is the result of ``np.std`` on the filtered data.
        * ``'mode'`` is the most frequent value as a float; ties resolve to
          the smallest value.
        * ``'percentiles'`` maps ``'5th'``, ``'10th'``, ... ``'100th'`` (steps
          of 5) to the corresponding ``np.percentile`` value.

    Raises:
        ValueError: If no data is left after outlier removal.
    """
    # Local import: the notebook only imports numpy in a later cell, so a
    # top-to-bottom run would otherwise hit a NameError here.
    import numpy as np

    data = remove_outliers(data)
    if len(data) == 0:
        raise ValueError("calculate_distribution: no data left after outlier removal")

    # np.unique returns sorted values, so argmax picks the smallest of the most
    # frequent values. Unlike np.bincount, this also works for float and
    # negative data.
    unique_values, counts = np.unique(np.asarray(data), return_counts=True)
    mode = float(unique_values[np.argmax(counts)])

    distribution = {
        'min': np.min(data),
        'max': np.max(data),
        'mean': np.mean(data),
        'std': np.std(data),
        'mode': mode,
        'percentiles': {
            f'{pct}th': np.percentile(data, pct) for pct in range(5, 101, 5)
        },
    }

    return distribution

# Few-reviews performance inspection

In [1]:
import mlflow
from mlflow.models import infer_signature
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchinfo import summary
from torch.utils.data import DataLoader, TensorDataset
import tqdm
from sklearn.metrics import mean_squared_error, root_mean_squared_error, mean_absolute_error
from sklearn.metrics import accuracy_score, auc, roc_curve, f1_score
from sklearn.metrics import precision_score, recall_score, confusion_matrix, roc_auc_score
import os
import numpy as np
import pandas as pd
from datetime import datetime

def _per_run_setting(setting, run_index):
    """Return the entry for run ``run_index`` when ``setting`` is a sequence, else ``setting`` itself."""
    if isinstance(setting, (list, tuple, np.ndarray, pd.Series)):
        return setting[run_index]
    return setting


def _parse_vector_cell(cell):
    """Evaluate one stored vector cell (a string holding a Python expression) into a NumPy array."""
    # SECURITY NOTE(review): eval() executes whatever expression the cell
    # contains. Only use this on trusted data; if the cells are plain list
    # literals, ast.literal_eval would be a safer replacement -- confirm the
    # stored format before switching.
    return np.array(eval(cell))


def _build_feature_matrix(test_data):
    """Concatenate, row by row, every column whose name contains 'vector' into one 2-D array."""
    vector_columns = [col for col in test_data.columns if "vector" in col.lower()]
    if not vector_columns:
        raise ValueError("test data has no column whose name contains 'vector'")

    parsed_columns = [
        [_parse_vector_cell(cell) for cell in test_data[col]]
        for col in vector_columns
    ]
    # zip(*...) regroups the per-column lists into per-row tuples of vectors.
    return np.array([np.concatenate(row_parts) for row_parts in zip(*parsed_columns)])


def _predict_with_deep_model(model, model_name, X_test):
    """Run a loaded PyTorch model on ``X_test`` in one batch and return a NumPy array."""
    features = torch.from_numpy(X_test).float()
    # One integer index per feature position, repeated for every row; only the
    # DeepFM model receives it as an extra first argument.
    feature_index = (
        torch.arange(features.size(1))
        .unsqueeze(0)
        .unsqueeze(-1)
        .repeat(features.size(0), 1, 1)
        .int()
    )

    # Inference only: switch dropout / batch-norm layers to evaluation
    # behaviour and skip gradient tracking.
    model.eval()
    with torch.no_grad():
        if model_name == "DeepFM":
            y_pred = model(feature_index, features)
        else:
            y_pred = model(features)
            if model_name == "MLP":
                y_pred = y_pred.view(-1)

    return y_pred.detach().numpy()


def _score_predictions(task_type, y_test, y_pred):
    """Compute the metric dict for one run; metrics not relevant to the task stay 0.0."""
    results = {
        "rmse": 0.0,
        "mse": 0.0,
        "mae": 0.0,
        "accuracy": 0.0,
        "auc_score": 0.0,
        "f1": 0.0,
        "precision": 0.0,
        "recall": 0.0,
        "specificity": 0.0,
    }

    if task_type=="regression":
        results["rmse"] = root_mean_squared_error(y_test, y_pred)
        results["mse"] = mean_squared_error(y_test, y_pred)
        results["mae"] = mean_absolute_error(y_test, y_pred)
    else:
        results["accuracy"] = accuracy_score(y_test, y_pred)
        results["auc_score"] = roc_auc_score(y_test, y_pred)
        results["f1"] = f1_score(y_test, y_pred)
        results["precision"] = precision_score(y_test, y_pred)
        results["recall"] = recall_score(y_test, y_pred)
        # Specificity (true negative rate) is the recall of the negative class.
        results["specificity"] = recall_score(y_test, y_pred, pos_label=0)

    return results


def few_reviews_performance_inspection(train_df, test_df, run_id_experiment_task_model, 
                                       user_reviews_num=3,user_comparison="equal", 
                                       user_reviews_num_range=None, business_reviews_num=100000,
                                       business_comparison="max", business_reviews_num_range=None):
    """Evaluate logged MLflow models on test reviews of the filtered users and businesses.

    ``train_df`` is filtered by review counts through
    ``sample_reviews_and_calculate_price_then_return_data``. The test set is
    then restricted to rows whose ``user_id`` AND ``business_id`` both occur
    in the filtered reviews. Each listed run's model is loaded from the MLflow
    tracking server at ``http://127.0.0.1:8080`` and scored on those rows.

    Args:
        train_df: Training reviews, passed to the filtering helper.
        test_df: Test reviews with ``user_id``, ``business_id`` and ``stars``
            columns, plus one or more columns whose name contains "vector".
            Each such cell is a string that evaluates to a sequence of
            numbers.
        run_id_experiment_task_model: Mapping with ``"run_id"`` and
            ``"experiment_name"`` sequences (one entry per run), and
            ``"task_type"`` / ``"model_name"`` given either as one value
            shared by all runs or as a per-run sequence.
        user_reviews_num, user_comparison, user_reviews_num_range,
        business_reviews_num, business_comparison, business_reviews_num_range:
            Forwarded unchanged to the filtering helper, which defines their
            meaning.

    Returns:
        ``{experiment_name: metrics}``, where ``metrics`` has the keys
        ``rmse, mse, mae, accuracy, auc_score, f1, precision, recall,
        specificity``. When ``task_type`` is ``"regression"`` only the first
        three are computed; otherwise only the classification metrics are. The
        rest stay ``0.0``.

        Returns ``0`` (after printing a message) in three cases:

        * the filter leaves no reviews, users or businesses;
        * no test row matches the filtered users and businesses;
        * a model name is not in the supported lists.

    Raises:
        ValueError: If the test data has no "vector" column.
    """
    # The positional 100 is handed to the helper as-is; its meaning is defined
    # there (presumably a sample size -- confirm against the helper).
    filtered_reviews, filter_summary = sample_reviews_and_calculate_price_then_return_data(
        train_df, 100,
        user_reviews_num=user_reviews_num,
        user_comparison=user_comparison,
        user_reviews_num_range=user_reviews_num_range,
        business_reviews_num=business_reviews_num,
        business_comparison=business_comparison,
        business_reviews_num_range=business_reviews_num_range,
        genai="GPT-3.5 Turbo",
        sampling_method='random', column='None')

    filtered_reviews_num = filter_summary["filtered_reviews_num"]
    filtered_users_count = filter_summary["filtered_users_count"]
    filtered_businesses_count = filter_summary["filtered_businesses_count"]
    if filtered_reviews_num==0 or filtered_users_count==0 or filtered_businesses_count==0:
        print("filtered_reviews_num or filtered_users_count or filtered_businesses_count = 0 !")
        return 0
    print("filtered_reviews_num:", filtered_reviews_num)
    print("filtered_users_count:", filtered_users_count)
    print("filtered_businesses_count:", filtered_businesses_count)

    kept_user_ids = filtered_reviews['user_id'].unique()
    kept_business_ids = filtered_reviews['business_id'].unique()

    in_scope = (
        test_df['user_id'].isin(kept_user_ids)
        & test_df['business_id'].isin(kept_business_ids)
    )
    # Build a fresh frame instead of resetting the index of a slice in place.
    test_data = test_df[in_scope].reset_index(drop=True)
    if len(test_data) == 0:
        # Without this guard the deep-learning branch would build a DataLoader
        # with batch_size=0, which raises.
        print("No test reviews match the filtered users and businesses !")
        return 0

    # get X_test and y_test
    X_test = _build_feature_matrix(test_data)
    y_test = np.array(test_data['stars'])

    deep_learning_model_names = ["FM", "MLP", "DeepFM", "AFM", "DCN", "xDeepFM", "AutoInt", "AFN"]
    machine_learning_model_names = ["Linear", "KNN", "SVM", "DecisionTree", "RandomForest", 
                                    "AdaBoost", "XGBoost", "CatBoost"]

    # The tracking server is the same for every run, so set it once.
    mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")

    num_experiments = len(run_id_experiment_task_model["run_id"])
    performance_results = {}
    for i in range(num_experiments):
        run_id = run_id_experiment_task_model["run_id"][i]
        experiment_name = run_id_experiment_task_model["experiment_name"][i]

        # A single task type / model name applies to all runs; a sequence is
        # indexed per run like run_id and experiment_name.
        task_type = _per_run_setting(run_id_experiment_task_model["task_type"], i)
        model_name = _per_run_setting(run_id_experiment_task_model["model_name"], i)

        print("run_id:", run_id)
        print("experiment_name:", experiment_name)
        print("task_type:", task_type)
        print("model_name", model_name)

        model_dir = f"runs:/{run_id}/{model_name}_model"

        print(f'Start predicting with {model_name} model ...')

        if model_name in deep_learning_model_names:
            model = mlflow.pytorch.load_model(model_dir)
            y_pred = _predict_with_deep_model(model, model_name, X_test)

        elif model_name in machine_learning_model_names:
            if model_name == "XGBoost":
                model = mlflow.xgboost.load_model(model_dir)
            elif model_name == "CatBoost":
                model = mlflow.catboost.load_model(model_dir)
            else:
                model = mlflow.sklearn.load_model(model_dir)

            y_pred = model.predict(X_test)

        else:
            print(f"Please select model in {deep_learning_model_names} or {machine_learning_model_names} !")
            return 0

        results = _score_predictions(task_type, y_test, y_pred)

        print(f"{experiment_name}:\n", results)

        performance_results[experiment_name] = results

    return performance_results

# Performance inspection across different numbers of reviews

In [None]:
import mlflow
from mlflow.models import infer_signature
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchinfo import summary
from torch.utils.data import DataLoader, TensorDataset
import tqdm
from sklearn.metrics import mean_squared_error, root_mean_squared_error, mean_absolute_error
from sklearn.metrics import accuracy_score, auc, roc_curve, f1_score
from sklearn.metrics import precision_score, recall_score, confusion_matrix, roc_auc_score
import os
import numpy as np
import pandas as pd
from datetime import datetime

def return_filtered_train_test_data(train_df, test_df, user_reviews_num=3,user_comparison="equal", 
                                    user_reviews_num_range=None, business_reviews_num=100000,
                                    business_comparison="max", business_reviews_num_range=None):
    """Filter the training reviews by review counts and restrict the test set to match.

    ``train_df`` is filtered through
    ``sample_reviews_and_calculate_price_then_return_data``. ``test_df`` is
    then reduced to the rows whose ``user_id`` AND ``business_id`` both occur
    in the filtered training reviews. Before/after sizes are printed and also
    returned.

    Args:
        train_df: Training reviews with ``user_id`` and ``business_id``
            columns.
        test_df: Test reviews with ``user_id`` and ``business_id`` columns.
            It is not modified.
        user_reviews_num, user_comparison, user_reviews_num_range,
        business_reviews_num, business_comparison, business_reviews_num_range:
            Forwarded unchanged to the filtering helper, which defines their
            meaning.

    Returns:
        ``(filtered_reviews, test_data, calculation_results)`` on success.

        * ``filtered_reviews`` is the frame returned by the helper.
        * ``test_data`` is the matching test rows with a fresh 0..n-1 index;
          the previous index is kept as a column, as ``reset_index()`` does.
        * ``calculation_results`` is a dict of row, user and business counts
          for the train and test data, before and after filtering. Its keys
          follow the pattern
          ``{Train,Test}_data_{num,user_count,business_count}_{before,after}``.

        Returns the integer ``0`` (after printing a message) when the filter
        leaves no reviews, users or businesses. Callers must check for this
        before unpacking.
    """
    # The positional 100 is handed to the helper as-is; its meaning is defined
    # there (presumably a sample size -- confirm against the helper).
    filtered_reviews, filter_summary = sample_reviews_and_calculate_price_then_return_data(
        train_df, 100,
        user_reviews_num=user_reviews_num,
        user_comparison=user_comparison,
        user_reviews_num_range=user_reviews_num_range,
        business_reviews_num=business_reviews_num,
        business_comparison=business_comparison,
        business_reviews_num_range=business_reviews_num_range,
        genai="GPT-3.5 Turbo",
        sampling_method='random', column='None')

    filtered_reviews_num = filter_summary["filtered_reviews_num"]
    filtered_users_count = filter_summary["filtered_users_count"]
    filtered_businesses_count = filter_summary["filtered_businesses_count"]
    if filtered_reviews_num==0 or filtered_users_count==0 or filtered_businesses_count==0:
        print("filtered_reviews_num or filtered_users_count or filtered_businesses_count = 0 !")
        return 0

    print(f"User reviews: {user_reviews_num} {user_comparison} or {user_reviews_num_range}")
    print(f"Business reviews: {business_reviews_num} {business_comparison} or {business_reviews_num_range}")

    # Each size is computed once and reused for both the result dict and the
    # printed report.
    train_num_before = len(train_df)
    train_users_before = len(train_df['user_id'].unique())
    train_businesses_before = len(train_df['business_id'].unique())

    print("Training data num:")
    print("Before:", train_num_before, "After:", filtered_reviews_num, '\n')
    print("Training data user count:")
    print("Before:", train_users_before, "After:", filtered_users_count, '\n')
    print("Training data business count:")
    print("Before:", train_businesses_before, "After:", filtered_businesses_count, '\n')

    kept_user_ids = filtered_reviews['user_id'].unique()
    kept_business_ids = filtered_reviews['business_id'].unique()

    in_scope = (
        test_df['user_id'].isin(kept_user_ids)
        & test_df['business_id'].isin(kept_business_ids)
    )
    # Build a new frame rather than resetting the index of a slice in place;
    # the old index is kept as a column, as before.
    test_data = test_df[in_scope].reset_index()

    test_num_before = len(test_df)
    test_num_after = len(test_data)
    test_users_before = len(test_df['user_id'].unique())
    test_users_after = len(test_data['user_id'].unique())
    test_businesses_before = len(test_df['business_id'].unique())
    test_businesses_after = len(test_data['business_id'].unique())

    print("Validation (Test) data num:")
    print("Before:", test_num_before, "After:", test_num_after, '\n')
    print("Validation (Test) data user count:")
    print("Before:", test_users_before, "After:", test_users_after, '\n')
    print("Validation (Test) data business count:")
    print("Before:", test_businesses_before, "After:", test_businesses_after)

    calculation_results = {
        "Train_data_num_before": train_num_before,
        "Train_data_num_after": filtered_reviews_num,
        "Train_data_user_count_before": train_users_before,
        "Train_data_user_count_after": filtered_users_count,
        "Train_data_business_count_before": train_businesses_before,
        "Train_data_business_count_after": filtered_businesses_count,
        "Test_data_num_before": test_num_before,
        "Test_data_num_after": test_num_after,
        "Test_data_user_count_before": test_users_before,
        "Test_data_user_count_after": test_users_after,
        "Test_data_business_count_before": test_businesses_before,
        "Test_data_business_count_after": test_businesses_after,
    }

    return filtered_reviews, test_data, calculation_results