In [None]:
# !pip install tensorflow
# !pip install matplotlib
# !pip install retrying
# !pip install scikit-learn
# !pip install imblearn

In [None]:
# import global modules
import os
import re
import sys
import time
import json
import pickle
import numpy as np
import pandas as pd
from pathlib import Path
from yaml import safe_load
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from scipy.stats import uniform, randint
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.metrics import Recall
from imblearn.over_sampling import RandomOverSampler, SMOTE

# Set global vars
# Project root: the part of the current working directory that precedes the
# first 'notebooks' substring (the whole cwd when 'notebooks' does not occur).
pth_project = Path(os.getcwd().split('notebooks')[0])
pth_data = pth_project / 'data'
pth_utils = pth_project / 'utils'
pth_queries = pth_project / 'queries'
pth_creds = pth_project / 'conf' / 'local' / 'project_config.yaml'
pth_recommenders = pth_data / 'recommenders'
# Put the project root first on sys.path so the local `utils` package is importable.
sys.path.insert(0, pth_project.as_posix())
# Parse the project config; the context manager closes the file handle after
# parsing instead of leaving it open for the lifetime of the kernel.
with pth_creds.open() as config_file:
    d_config = safe_load(config_file)

# import local modules
from utils.gcp import connect_bq_services, connect_pandas_bq_services
from utils.extract import extract_bq_data
from utils.modeling import process_features, extract_stats

In [None]:
# Connect using the GCP project name stored under 'gcp-project-name' in the loaded config.
# NOTE(review): presumably returns a BigQuery client (it is later handed to
# extract_bq_data) — confirm against utils.gcp.
bq_client = connect_bq_services(d_config['gcp-project-name'])

In [None]:
# IPython autoreload, mode 2: reload all modules before each cell executes, so
# edits to local modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

#### Extract data

In [None]:
# extract training data
# Plain (non-f) string: the query has no placeholders, so nothing should be
# subject to f-string interpolation.
sql = """
  select *
    from `divg-team-v03-pr-de558a.nba_product_reco_model.nba_training_dataset_v8`
"""
df_train = extract_bq_data(bq_client, sql)
print(df_train.shape)

# extract validation data
# NOTE(review): the table is named `nba_test_dataset_v8` but is used as the
# validation set throughout this notebook — confirm the naming is intended.
sql = """
  select *
    from `divg-team-v03-pr-de558a.nba_product_reco_model.nba_test_dataset_v8` 
"""
df_validation = extract_bq_data(bq_client, sql)
print(df_validation.shape)

#### Process data

In [None]:
# Integer class id assigned to each acquisition scenario label.
d_target_mapping = {
 'sing_acquisition': 0,
 'shs_acquisition': 1,
 'tos_acquisition': 2,
 'wifi_acquisition': 3,
 'ttv_acquisition': 4,
 'sws_acquisition': 5,
 'hsic_acquisition': 6,
 'lwc_acquisition': 7,
 'hpro_acquisition': 8,
 'whsia_acquisition': 9
}

# load features metadata
# The context manager closes the YAML file once it has been parsed.
# NOTE(review): the metadata file is versioned v7 while the queried tables are
# v8 — confirm this pairing is intended.
with (pth_utils / 'parameters' / 'acquisition_features_v7.yaml').open() as features_file:
    d_features_metadata = safe_load(features_file)

# process training and validation data with the same metadata and target mapping
# NOTE(review): 'model_scenario' is presumably the name of the raw target
# column that gets encoded via d_target_mapping — confirm in utils.modeling.
df_train_processed = process_features(df_train, d_features_metadata, 'model_scenario', d_target_mapping)
df_validation_processed = process_features(df_validation, d_features_metadata, 'model_scenario', d_target_mapping)

In [None]:
# df_train_all_num = df_train[df_train.select_dtypes(exclude=['object']).columns]
# df_validation_all_num = df_validation[df_train_all_num.columns]

In [None]:
# df_train_all_num_dropped = df_train_all_num.drop(
#     columns=['cust_src_id', 'ban', 'ban_src_id', 'lpds_id',	'label']
# ).fillna(0)

# df_validation_all_num_dropped = df_validation_all_num.drop(
#     columns=['cust_src_id', 'ban', 'ban_src_id', 'lpds_id',	'label']
# ).fillna(0)

In [None]:
# Training frame: label column vs. every other column as features
y = df_train_processed['target']
X = df_train_processed.drop('target', axis=1)

# Validation frame, separated the same way
y_val = df_validation_processed['target']
X_val = df_validation_processed.drop('target', axis=1)

# Hold out 30% of the training frame as a test split (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Fit the standardization statistics on the training split only
scaler = StandardScaler().fit(X_train)

# Transform every split with those same training-derived statistics
X_train_normalized, X_test_normalized, X_val_normalized = (
    scaler.transform(split) for split in (X_train, X_test, X_val)
)

In [None]:
# Apply random oversampling to balance the dataset
# sampling = RandomOverSampler(random_state=42, sampling_strategy=sampling_strategy)
# sampling = SMOTE(random_state=42)

# X_train_resampled, y_train_resampled = sampling.fit_resample(X_train_normalized.astype('float'), y_train)

#### Model

In [None]:
# Network dimensions: one input per feature column, one output per target class
input_size = X_train.shape[1]
output_size = len(d_target_mapping)
(input_size, output_size)

In [None]:
# Baseline network: two ReLU hidden layers (64 and 200 units) feeding a softmax
# layer with one unit per target class
model = Sequential([
    Dense(64, activation='relu', input_dim=input_size),
    Dense(200, activation='relu'),
    Dense(output_size, activation='softmax'),
])

# Integer-encoded labels -> sparse categorical cross-entropy; track top-3 accuracy
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=[tf.keras.metrics.SparseTopKCategoricalAccuracy(k=3)],
)

In [None]:
# Fit on the standardized training split; the held-out test split is passed as
# validation data for the 1000 epochs
model.fit(
    x=X_train_normalized,
    y=y_train,
    validation_data=(X_test_normalized, y_test),
    batch_size=4096,
    epochs=1000,
)

In [None]:
# The predictions do not depend on n, so score and rank the validation set once
# and reuse the ranking for every cut-off instead of re-predicting per iteration.
probabilities = model.predict(X_val_normalized)
# Negating before argsort orders class indices from most to least probable per row
results_ranked = np.argsort(-probabilities, axis=1)
for n in (1, 2, 3):
    # NOTE(review): assumes extract_stats does not modify results_ranked in place — confirm in utils.modeling
    display(extract_stats(n, results_ranked, y_val, d_target_mapping))

In [None]:
# Top-3 evaluation of `model` on the validation set
n = 3
probabilities = model.predict(X_val_normalized)
# Class indices per row, most probable first
results_ranked = (-probabilities).argsort(axis=1)
display(extract_stats(n, results_ranked, y_val, d_target_mapping))

In [None]:
# Top-3 evaluation of `model` on the validation set
n = 3
probabilities = model.predict(X_val_normalized)
# Class indices per row, most probable first
results_ranked = (-probabilities).argsort(axis=1)
display(extract_stats(n, results_ranked, y_val, d_target_mapping))

In [None]:
# Top-3 evaluation of `model` on the validation set
n = 3
probabilities = model.predict(X_val_normalized)
# Class indices per row, most probable first
results_ranked = (-probabilities).argsort(axis=1)
display(extract_stats(n, results_ranked, y_val, d_target_mapping))

In [None]:
# Top-3 evaluation of `model` on the validation set
n = 3
probabilities = model.predict(X_val_normalized)
# Class indices per row, most probable first
results_ranked = (-probabilities).argsort(axis=1)
display(extract_stats(n, results_ranked, y_val, d_target_mapping))

#### Tuning

In [None]:
#!pip install tensorflow scikeras scikit-learn

In [None]:
from scikeras.wrappers import KerasClassifier
from sklearn.metrics import make_scorer

In [None]:
def create_model(input_size, output_size, hidden_layer_dim, activation):
    """Build an uncompiled feed-forward softmax classifier.

    Args:
        input_size: Number of input features, passed as `input_dim` of the first layer.
        output_size: Number of units in the softmax output layer.
        hidden_layer_dim: Number of units in the second hidden layer.
        activation: Activation used by both hidden layers.

    Returns:
        A Sequential model with layers Dense(64) -> Dense(hidden_layer_dim)
        -> Dense(output_size, softmax). The model is not compiled here.
    """
    layers = [
        Dense(64, activation=activation, input_dim=input_size),
        Dense(hidden_layer_dim, activation=activation),
        Dense(output_size, activation='softmax'),
    ]
    return Sequential(layers)

In [None]:
# Create KerasClassifier
model = KerasClassifier(
    create_model,
    input_size=input_size,
    output_size=output_size,
    hidden_layer_dim=100,
    activation='relu',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=[tf.keras.metrics.SparseTopKCategoricalAccuracy(k=3)]
)

In [None]:
from sklearn.metrics import top_k_accuracy_score

In [None]:
# Define the hyperparameters to tune
param_grid = {
    'hidden_layer_dim': [50, 100, 200, 300, 400, 500],
    'optimizer': ['adam', 'sgd'],
    'optimizer__learning_rate': [0.0001, 0.0005, 0.001],
    'activation': ['relu', 'sigmoid'],
    'batch_size': [128, 256, 512, 1024, 2048,4096]
}

# Perform grid search
grid_search = GridSearchCV(
    estimator=model, 
    param_grid=param_grid, 
    cv=5, 
    scoring=make_scorer(top_k_accuracy_score, k=3, response_method='predict_proba')
    #scoring=make_scorer(top_k_accuracy_score, k=3, labels=list(d_target_mapping.values()))
)
grid_search.fit(X_train_normalized, y_train)

In [None]:
# Show the best cross-validated score found by the grid search and the
# parameter combination that produced it.
print(grid_search.best_score_, grid_search.best_params_)

In [None]:
# Get the best parameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
best_params

In [None]:
# Top-3 evaluation of the grid-search estimator on the validation set
n = 3
probabilities = best_model.predict_proba(X_val_normalized)
# Class indices per row, most probable first
results_ranked = (-probabilities).argsort(axis=1)
display(extract_stats(n, results_ranked, y_val, d_target_mapping))

In [None]:
# Train the best model
best_model.fit(X_train_normalized, y_train, epochs=1000, validation_data=(X_test_normalized, y_test))

In [None]:
# Top-3 evaluation of `best_model` on the validation set
n = 3
probabilities = best_model.predict_proba(X_val_normalized)
# Class indices per row, most probable first
results_ranked = (-probabilities).argsort(axis=1)
display(extract_stats(n, results_ranked, y_val, d_target_mapping))

In [None]:
# Completion marker: prints 'ok' once execution reaches this cell.
print('ok')