In [None]:
# import global modules
import os
import re
import sys
import time
import json
import pickle
import numpy as np
import pandas as pd
from pathlib import Path
from yaml import safe_load
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Set global vars
# Project root: the part of the current working directory that precedes the
# first 'notebooks' substring (the whole cwd when 'notebooks' does not occur).
pth_project = Path(os.getcwd().split('notebooks')[0])
pth_data = pth_project / 'data'
pth_utils = pth_project / 'utils'
pth_queries = pth_project / 'queries'
pth_creds = pth_project / 'conf' / 'local' / 'project_config.yaml'
# Put the project root first on sys.path so the `utils.*` imports that follow resolve.
sys.path.insert(0, pth_project.as_posix())
# Parse the project config inside a context manager so the file handle is
# closed as soon as the YAML has been read (a bare .open() left it dangling).
with pth_creds.open() as f_config:
    d_config = safe_load(f_config)

# import local modules
from utils.gcp import connect_bq_services, connect_pandas_bq_services
from utils.etl.extract import extract_bq_data
from utils.modeling import extract_stats, process_hs_features

In [None]:
# Scratch check of the project-root derivation. These two imports repeat ones
# already made in the first cell.
from pathlib import Path
import os

# Everything in the cwd that precedes the first 'notebooks' substring.
Path(os.getcwd().partition('notebooks')[0])

In [None]:
# Build the client via utils.gcp.connect_bq_services, passing the value stored
# under 'gcp-project-name' in the project config.
# NOTE(review): presumably returns a BigQuery client — confirm in utils/gcp.
bq_client = connect_bq_services(d_config['gcp-project-name'])

In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

#### Extract data

In [None]:
# Query text: every column of the fully-qualified source table.
sql = (
    "select * from "
    "`adna-nlp-pr-92330e.mob_nba.master_mob_features_set_test_prospects`"
)

In [None]:
# Run the query through utils.etl.extract.extract_bq_data and keep the result as `df`.
# NOTE(review): presumably a pandas DataFrame (it is used as one below) — confirm in the helper.
df = extract_bq_data(bq_client, sql)
# Show the (rows, columns) size of the extract.
df.shape

In [None]:
# Remove the cap on displayed columns (a global pandas option, so it also
# affects every later cell), then render the extracted frame.
pd.set_option('display.max_columns', None)
df

In [None]:
# Frequency of each distinct value in the 'cust_prov_state_cd' column.
df['cust_prov_state_cd'].value_counts()

In [None]:
# Frequency of each distinct value in 'model_scenario', the column later used
# as the modelling target.
df['model_scenario'].value_counts()

#### Process data

In [None]:
# load features metadata
# Parse utils/parameters/mob_features.yaml inside a context manager so the file
# handle is closed once the YAML has been read (a bare .open() left it dangling).
with (pth_utils / 'parameters' / 'mob_features.yaml').open() as f_features:
    d_model_config = safe_load(f_features)

In [None]:
# Collect the 'name' of every entry under 'target_variables' in the model
# config, preserving the config order, then display the list.
l_target_name = list(
    map(lambda spec: spec['name'], d_model_config['target_variables'])
)
l_target_name

In [None]:
# Map each configured target name to its 'class_index', then display the mapping.
d_target_mapping = dict(
    (spec['name'], spec['class_index'])
    for spec in d_model_config['target_variables']
)
d_target_mapping

In [None]:
# Keep only the rows whose 'model_scenario' is one of the configured target names.
df_target = df.loc[df['model_scenario'].isin(l_target_name)]

In [None]:
# Partition the target rows by the 'split_type' label already present in the data.
df_train, df_val, df_test = (
    df_target.loc[df_target['split_type'].eq(split_label)]
    for split_label in ('1-train', '2-val', '3-test')
)

In [None]:
# Run each split through utils.modeling.process_hs_features, in the order
# train -> validation -> test, with identical settings.
# NOTE(review): training_mode=True is passed for the validation and test frames
# as well — confirm against the helper that this is intended.
df_train_processed, df_validation_processed, df_test_processed = (
    process_hs_features(
        df_split,
        d_model_config,
        training_mode=True,
        target_name='model_scenario',
    )
    for df_split in (df_train, df_val, df_test)
)

In [None]:
# Print the (rows, columns) shape of each processed split, one per line.
print(
    df_train_processed.shape,
    df_validation_processed.shape,
    df_test_processed.shape,
    sep='\n',
)

#### Split data

In [None]:
# For every processed split, X is the frame without the 'target' column and
# y is the 'target' column on its own.
X_train, y_train = df_train_processed.drop(columns=['target']), df_train_processed['target']

X_val, y_val = df_validation_processed.drop(columns=['target']), df_validation_processed['target']

X_test, y_test = df_test_processed.drop(columns=['target']), df_test_processed['target']

#### XGBoost

In [None]:
#!pip install xgboost
import xgboost as xgb

In [None]:
# Fit an XGBoost classifier with the library's default hyperparameters.
# Per-iteration metrics are monitored on the validation split instead of the
# test split, so the held-out test data is not looked at until the final
# evaluation. No early stopping is configured here, so the choice of eval_set
# only changes the logged metrics.
xgb_model = xgb.XGBClassifier()
xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)]
)

In [None]:
# Validation split: order the predict_proba column indices of every row from
# highest to lowest probability, then summarise with extract_stats.
# NOTE(review): n is presumably the top-n cut-off used by extract_stats — confirm in utils.modeling.
n = 3
probabilities = xgb_model.predict_proba(X_val)
results_ranked = (-probabilities).argsort(axis=1)
display(extract_stats(n, results_ranked, y_val, d_target_mapping))

In [None]:
# Test split: order the predict_proba column indices of every row from
# highest to lowest probability, then summarise with extract_stats.
# NOTE(review): n is presumably the top-n cut-off used by extract_stats — confirm in utils.modeling.
n = 3
probabilities = xgb_model.predict_proba(X_test)
results_ranked = (-probabilities).argsort(axis=1)
display(extract_stats(n, results_ranked, y_test, d_target_mapping))

#### Feature importance

In [None]:
# Importance score per feature from the fitted model, plus the matching column names
feature_names = X_train.columns
importance_scores = xgb_model.feature_importances_

# argsort orders ascending, so the last 30 positions are the 30 highest-scoring
# features (all of them when there are fewer than 30), smallest first
sorted_indices = np.argsort(importance_scores)
sorted_scores = importance_scores[sorted_indices[-30:]]
sorted_names = feature_names[sorted_indices[-30:]]

# Horizontal bar chart: one bar per selected feature, labelled with its name
plt.figure(figsize=(8, 8))
plt.barh(y=range(len(sorted_scores)), width=sorted_scores)
plt.yticks(ticks=range(len(sorted_scores)), labels=sorted_names)
plt.title('Feature Importance')
plt.xlabel('Importance Score')
plt.ylabel('Features')
plt.show()