In [None]:
# import global modules
import os
import re
import sys
import time
import json
import pickle
import numpy as np
import pandas as pd
from pathlib import Path
from yaml import safe_load

# Set global vars
# Project root = the part of the working directory path that precedes 'notebooks'
pth_project = Path(os.getcwd().split('notebooks')[0])
pth_data = pth_project / 'data'
pth_utils = pth_project / 'utils'
pth_queries = pth_project / 'queries'
pth_creds = pth_project / 'conf' / 'local' / 'project_config.yaml'
# Put the project root first on sys.path so the project-local imports below resolve
sys.path.insert(0, pth_project.as_posix())
# read_text() opens AND closes the file; passing pth_creds.open() left the handle dangling
d_config = safe_load(pth_creds.read_text())

# import local modules
from utils.gcp import connect_bq_services
from utils.etl.extract import extract_bq_data
from utils.modeling import process_features, extract_stats
from utils.naive_models import naive_model_predict_proba

In [None]:
# Connect using the GCP project name stored in the project config; the returned
# object is what extract_bq_data receives as its client argument further down.
bq_client = connect_bq_services(d_config['gcp-project-name'])

In [None]:
# Enable IPython's autoreload extension in mode 2, so edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

#### Extract data

In [None]:
# Read the query text from the project's queries folder and run it through extract_bq_data
sql = (pth_queries / 'extract_aia_predictions.sql').read_text()
df_results = extract_bq_data(bq_client, sql)
# Show the dimensions of the extracted result
df_results.shape

In [None]:
# Target name -> class index. The names are listed in class-index order,
# so each name's position in the tuple is its index.
d_target_mapping = {
    target_name: class_index
    for class_index, target_name in enumerate((
        'sing_acquisition',
        'shs_acquisition',
        'tos_acquisition',
        'wifi_acquisition',
        'ttv_acquisition',
        'sws_acquisition',
        'hsic_acquisition',
        'lwc_acquisition',
        'hpro_acquisition',
        'whsia_acquisition',
    ))
}

d_target_mapping

#### Process data

In [None]:
# get predicted columns
# Keeps every column whose name contains '_predicted_score_calibrated' (substring match).
l_pred_cols = [c for c in df_results.columns if '_predicted_score_calibrated' in c]
# Number of score columns found
len(l_pred_cols)

In [None]:
# get label columns
# Keeps every column whose name contains '_label' anywhere (substring match).
# NOTE(review): assumes no unrelated column name contains '_label' - confirm against the query.
l_label_cols = [c for c in df_results.columns if '_label' in c]
# Number of label columns found
len(l_label_cols)

In [None]:
# Column-wise maximum of every label column
df_results[l_label_cols].max()

In [None]:
# Number of rows per distinct (ban, cust_id, lpds_id) combination
df_results[['ban', 'cust_id', 'lpds_id']].value_counts()

In [None]:
# aggregate labels
# For each row, build the list of '<prefix>_acquisition' names for the label columns
# that are non-null and equal to 1; rows with no such label get an empty list.
# The null check runs before the comparison, so missing labels are skipped first.
df_results['model_scenario'] = df_results.apply(
    lambda row: [
        label.replace('_label', '_acquisition') for label in l_label_cols 
        if pd.notnull(row[label]) and row[label] == 1
    ], axis=1
)

In [None]:
# Show the aggregated scenario list next to the label columns it was built from
df_results[['model_scenario'] + l_label_cols]

In [None]:
# explode labels
# One output row per element of 'model_scenario'; the original index label is repeated
# for rows that had several elements.
df_res_exploded = df_results.explode('model_scenario')

In [None]:
# First rows of the exploded frame: account key, scenario and the raw label columns
df_res_exploded[['ban', 'model_scenario'] + l_label_cols].head()

In [None]:
# Number of exploded rows per scenario name
df_res_exploded['model_scenario'].value_counts()

In [None]:
# create target indexes
# Translate each scenario name into its class index through d_target_mapping;
# names that are not keys of the mapping become NaN.
df_res_exploded['target'] = df_res_exploded['model_scenario'].map(d_target_mapping)

#### Results

In [None]:
# create the list of score columns in the same order as the label indexes
l_pred_ordered = [
    target_name.replace('_acquisition', '_predicted_score_calibrated')
    for target_name in d_target_mapping
]
# (number of ordered columns, whether they are exactly the score columns found in the data)
len(l_pred_ordered), set(l_pred_ordered) == set(l_pred_cols)

In [None]:
# Score matrix with one column per class, in class-index order
probabilities = df_res_exploded.loc[:, l_pred_ordered].to_numpy()
# Per row: column positions sorted from highest to lowest score
results_ranked = (-probabilities).argsort(axis=1)
display(extract_stats(3, results_ranked, df_res_exploded['target'], d_target_mapping))

In [None]:
# import global modules
from google.cloud import storage
from google.cloud import bigquery
from pathlib import Path
from yaml import safe_load
import sys
import os

pth = Path(os.getcwd())

pth_model_config = pth / 'model_config.yaml'
model_type = 'acquisition'

# read_text() opens AND closes the file; passing .open() to safe_load left the handle dangling
d_model_config = safe_load(pth_model_config.read_text())

# extract target name - index mapping
# Keys are target names and values are class indexes, which is the orientation
# every consumer of d_target_mapping in this notebook relies on
# (Series.map(d_target_mapping), `for name, idx in d_target_mapping.items()`).
d_target_mapping = {
    d_target_info['name']: d_target_info['class_index']
    for d_target_info in d_model_config['target_variables'][model_type]
}

print(d_target_mapping)

In [None]:
# import global modules
import sys
import os
import gc
import numpy as np
import pandas as pd
import xgboost as xgb

from datetime import datetime
from google.cloud import storage
from google.cloud import bigquery

from sklearn.model_selection import train_test_split

from typing import List, Dict, Tuple, Optional

def evaluate(df_result: pd.DataFrame, 
             file_bucket: str, 
             service_type: str, 
             model_type: str, 
             d_model_config: dict, 
             stats_file_name: str,
             n: int = 3,
             target_col: Optional[str] = None,
             ):
    """
    Evaluates the prospects NBA model based on the predictions it made on the validation set.

    The per-class scores of every row are ranked from highest to lowest; a row counts as
    captured when its true class index is among the top `n` ranked classes. The per-class
    and weighted-average capture rates are written as CSV to
    gs://{file_bucket}/{service_type}/{stats_file_name} and returned.

    Args:
        - df_result: Returned dataset from train() function. Must contain one
          'y_pred_<class_index>' score column per class configured for `model_type`,
          plus the column holding the true class index.
        - file_bucket: A GCS Bucket where the stats file is saved.
        - service_type: Service type name (used as folder inside the bucket).
        - model_type: 'acquisition' or 'tier'
        - d_model_config: A dictionary containing the metadata information for the model.
          d_model_config['target_variables'][model_type] must be a list of dicts with
          'name' and 'class_index' entries.
        - stats_file_name: The name of the file that contains df_stats.
        - n: How many top-ranked classes count as a hit (default 3).
        - target_col: Name of the column of df_result holding the true class index.
          Defaults to d_model_config['target'].
          NOTE(review): that default assumes the config's 'target' entry names the label
          column of df_result - confirm against train().

    Returns:
        - pd.DataFrame: One row per class with n_acquisitions, hits in top n, capture rate
          and product name, followed by a 'weighted_avg' summary row.

    Raises:
        - KeyError: if a required config entry or a required df_result column is missing.
    """

    def extract_stats(
        file_bucket: str, 
        service_type: str, 
        stats_file_name: str, 
        n: int, 
        predictions_ranked: np.array, 
        true_values: np.array,
        d_target_mapping: dict
    ):
        """
        Extracts statistics and metrics for evaluating predictions ranked by their probability scores,
        and saves them as CSV under gs://{file_bucket}/{service_type}/{stats_file_name}.

        Parameters:
        file_bucket (str): GCS bucket receiving the stats file.
        service_type (str): Folder inside the bucket.
        stats_file_name (str): Name of the stats file.
        n (int): The number of predictions to consider in the top N.
        predictions_ranked (np.array): Per row, class indexes ordered from best to worst score.
        true_values (np.array): An array of true class indexes corresponding to the predictions.
        d_target_mapping (dict): Target name -> class index.

        Returns:
        pd.DataFrame: A DataFrame containing statistics and metrics for evaluating the predictions.

        """

        # true_predictions - check if the true class is in the top n
        l_results = [
            1 if true_value in prediction[:n] else 0
            for prediction, true_value in zip(predictions_ranked, true_values)
        ]

        # build results dataframe
        df_results = pd.DataFrame(true_values)
        df_results = df_results.rename(columns = {df_results.columns[0]: 'label'})
        df_results[f'is_prediction_in_top_{n}'] = l_results

        # aggregate by label
        df_stats = df_results.groupby('label').agg({
            'label': 'count',
            f'is_prediction_in_top_{n}': 'sum'
        }).rename(
            columns = {
                'label': 'n_acquisitions'
            }
        )

        # capture rate
        capture_rate = df_stats[f'is_prediction_in_top_{n}'] / df_stats['n_acquisitions']
        df_stats[f'capture_rate_top_{n}'] = round(capture_rate * 100, 2)

        # add product names (a class without any true label gets a row with empty metrics)
        df_stats['product'] = ''
        for name, idx in d_target_mapping.items():
            df_stats.at[idx, 'product'] = name

        # calculate the weighted average and append to df
        w_avg = (df_stats[f'capture_rate_top_{n}'] * df_stats['n_acquisitions']).sum() / df_stats['n_acquisitions'].sum()
        total_correct_predictions = df_stats[f'is_prediction_in_top_{n}'].sum()
        df_w_avg = pd.DataFrame({
            'n_acquisitions': [df_stats['n_acquisitions'].sum()],
            f'is_prediction_in_top_{n}': [total_correct_predictions],
            f'capture_rate_top_{n}': [round(w_avg, 2)],
            'product': [f'weighted_avg']    
        })
        df_stats = pd.concat([df_stats, df_w_avg])

        df_stats.to_csv(f'gs://{file_bucket}/{service_type}/{stats_file_name}', index=False)
        
        return df_stats

    # extract target name - index mapping
    d_target_mapping = {
        d_target_info['name']: d_target_info['class_index']
        for d_target_info in d_model_config['target_variables'][model_type]
    }

    # column of df_result holding the true class index
    if target_col is None:
        target_col = d_model_config['target']

    # score columns ordered by class index, derived from the config instead of a fixed list of ten
    class_indexes = np.array(sorted(d_target_mapping.values()))
    l_pred_ordered = [f'y_pred_{class_index}' for class_index in class_indexes]
    probabilities = df_result[l_pred_ordered].to_numpy()

    # argsort yields column positions; translate them back to class indexes so the
    # comparison with the true labels stays valid even if the indexes are not 0..K-1
    results_ranked = class_indexes[np.argsort(-probabilities, axis=1)]

    return extract_stats(
        file_bucket,
        service_type,
        stats_file_name,
        n,
        results_ranked,
        df_result[target_col],
        d_target_mapping,
    )

In [None]:
# import global modules
from google.cloud import storage
from google.cloud import bigquery
from pathlib import Path
from yaml import safe_load
import sys
import os

pth = Path(os.getcwd())

pth_model_config = pth / 'model_config.yaml'

# read_text() opens AND closes the file; passing .open() to safe_load left the handle dangling
d_model_config = safe_load(pth_model_config.read_text())

In [None]:

def extract_stats(
    n: int, 
    predictions_ranked: np.array, 
    true_values: np.array,
    d_target_mapping: dict,
    file_bucket: Optional[str] = None,
    service_type: Optional[str] = None,
    stats_file_name: Optional[str] = None,
):
    """
    Extracts statistics and metrics for evaluating predictions ranked by their probability scores.

    Parameters:
    n (int): The number of predictions to consider in the top N.
    predictions_ranked (np.array): Per row, class indexes ordered from best to worst score.
    true_values (np.array): An array of true class indexes corresponding to the predictions.
    d_target_mapping (dict): Target name -> class index, used to label each row with its product.
    file_bucket (str, optional): GCS bucket for the CSV export.
    service_type (str, optional): Folder inside the bucket for the CSV export.
    stats_file_name (str, optional): File name of the CSV export.

    The stats are written to gs://{file_bucket}/{service_type}/{stats_file_name} only when
    all three export arguments are given; with none of them the function just computes and
    returns the stats.

    Returns:
    pd.DataFrame: One row per class (count, hits in top N, capture rate in percent, product
    name) followed by a 'weighted_avg' summary row.

    Raises:
    ValueError: if only some of the three export arguments are provided.

    """

    # Validate the export destination up front so a half-specified one fails before any work
    export_parts = (file_bucket, service_type, stats_file_name)
    export_requested = all(part is not None for part in export_parts)
    if not export_requested and any(part is not None for part in export_parts):
        raise ValueError(
            'file_bucket, service_type and stats_file_name must be provided together'
        )

    # true_predictions - check if the true class is in the top n
    l_results = [
        1 if true_value in prediction[:n] else 0
        for prediction, true_value in zip(predictions_ranked, true_values)
    ]

    # build results dataframe
    df_results = pd.DataFrame(true_values)
    df_results = df_results.rename(columns = {df_results.columns[0]: 'label'})
    df_results[f'is_prediction_in_top_{n}'] = l_results

    # aggregate by label
    df_stats = df_results.groupby('label').agg({
        'label': 'count',
        f'is_prediction_in_top_{n}': 'sum'
    }).rename(
        columns = {
            'label': 'n_acquisitions'
        }
    )

    # capture rate
    capture_rate = df_stats[f'is_prediction_in_top_{n}'] / df_stats['n_acquisitions']
    df_stats[f'capture_rate_top_{n}'] = round(capture_rate * 100, 2)

    # add product names (a class without any true label gets a row with empty metrics)
    df_stats['product'] = ''
    for name, idx in d_target_mapping.items():
        df_stats.at[idx, 'product'] = name

    # calculate the weighted average and append to df
    w_avg = (df_stats[f'capture_rate_top_{n}'] * df_stats['n_acquisitions']).sum() / df_stats['n_acquisitions'].sum()
    total_correct_predictions = df_stats[f'is_prediction_in_top_{n}'].sum()
    df_w_avg = pd.DataFrame({
        'n_acquisitions': [df_stats['n_acquisitions'].sum()],
        f'is_prediction_in_top_{n}': [total_correct_predictions],
        f'capture_rate_top_{n}': [round(w_avg, 2)],
        'product': [f'weighted_avg']    
    })
    df_stats = pd.concat([df_stats, df_w_avg])

    # The destination used to be read from names that are not defined anywhere in this
    # scope, so every call ended in a NameError; it is now an explicit, optional argument.
    if export_requested:
        df_stats.to_csv(f'gs://{file_bucket}/{service_type}/{stats_file_name}', index=False)

    return df_stats

# Validation predictions stored on GCS
df_result = pd.read_csv(
    'gs://divg-groovyhoon-pr-d2eab4-default/nba_product_reco_prospects/df_val_exp.csv',
    index_col=None,
)

target = d_model_config['target']

# target name -> class index, in the order the targets appear in the model config
d_target_mapping = dict(
    (d_target_info['name'], d_target_info['class_index'])
    for d_target_info in d_model_config['target_variables'][model_type]
)

# score columns listed in the same order as the label indexes
l_pred_ordered = list(d_target_mapping)
probabilities = df_result.loc[:, l_pred_ordered].to_numpy()
results_ranked = (-probabilities).argsort(axis=1)
# True labels are read from 'y_val'; the alternative df_result[target] is not used here.
display(extract_stats(3, results_ranked, df_result['y_val'], d_target_mapping))

#### ADnA approach

In [None]:
# extract training data: rows of the '1-train' split with label = 1 and label_desc = 'acquisition'
sql = """
select *
  from `wb-nba-1-pr-08a45b.features_v4.master_features_set_20240213_existing_and_new`
  where split_type = '1-train'
    and label = 1 and label_desc= 'acquisition'
"""
df_train = extract_bq_data(bq_client, sql)
print(df_train.shape)

In [None]:
# load features metadata
# read_text() opens AND closes the file; passing .open() to safe_load left the handle dangling
d_features_metadata = safe_load((pth_utils / 'parameters' / 'acquisition_features_v9.yaml').read_text())

In [None]:
# process training data
# Both frames go through the same process_features call (same metadata, same
# 'model_scenario' column, same target mapping); the 'target' column is split off
# from each result further down.
# NOTE(review): d_target_mapping here is whichever definition ran last in the kernel -
# confirm it matches the mapping used to build df_res_exploded['target'].
df_train_processed = process_features(df_train, d_features_metadata, 'model_scenario', d_target_mapping)
df_res_exploded_processed = process_features(df_res_exploded, d_features_metadata, 'model_scenario', d_target_mapping)

In [None]:
# Training set: features are every processed column except 'target'
X_train, y_train = df_train_processed.drop(columns='target'), df_train_processed['target']

# Validation set: same split applied to the processed prediction results
X_val, y_val = df_res_exploded_processed.drop(columns='target'), df_res_exploded_processed['target']

In [None]:
import xgboost as xgb
# Fit an XGBoost classifier (no hyperparameters passed, i.e. library defaults) on the training split
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train, y_train)

In [None]:
# Number of top-ranked classes that count as a hit (also read by the naive-model cell below)
n = 3
# Class scores predicted for the validation rows
probabilities = xgb_model.predict_proba(X_val)
# Per row: class positions sorted from highest to lowest score
results_ranked = (-probabilities).argsort(axis=1)
display(extract_stats(n, results_ranked, y_val, d_target_mapping))

#### Naive approach

In [None]:
# Display the exploded results frame that the naive models are scored against
df_res_exploded

In [None]:
# Score each naive strategy against the same true targets.
# The loop variable is named `score_type` on purpose: the previous name `model_type`
# overwrote the notebook-level `model_type` that the config-lookup cells read
# (d_model_config['target_variables'][model_type]), so re-running those cells after
# this one failed on a key such as 'random_weighted'.
for score_type in ('volume_only', 'random_only', 'random_weighted'):
    print(score_type)

    probabilities = naive_model_predict_proba(
        df_res_exploded, 'target', d_target_mapping, score_type = score_type,
        eligible_rule = False, existing_prod_rule = False,
    )
    # Per row: class positions sorted from highest to lowest score
    results_ranked = np.argsort(-probabilities, axis=1)
    display(extract_stats(n, results_ranked, df_res_exploded['target'], d_target_mapping))

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Toy frame with a single categorical column
data = {'color': ['red', 'green', 'blue', 'green', 'red', 'blue']}
df = pd.DataFrame(data)

print("Original DataFrame:", df, sep="\n")

# Encoder that assigns an integer code to each distinct label
le = LabelEncoder()

# Learn the codes from 'color' and overwrite the column with them
df['color'] = le.fit_transform(df['color'])

print("\nDataFrame after Label Encoding:", df, sep="\n")

In [None]:
# Print the column dtypes and non-null counts of the demo frame after encoding
df.info()