In [None]:
import kfp
from kfp import dsl
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output,HTML,
                        OutputPath, ClassificationMetrics, Metrics, component)


# Component for Pycaret AutoML Training Pipeline
@component(
    base_image="northamerica-northeast1-docker.pkg.dev/cio-workbench-image-np-0ddefe/wb-platform/initiative/koodo-pyp/koodo-pyp-jupyter:latest",
    output_component_file="pycaret_automl.yaml",
)
def pycaret_automl(
    project_id: str,
    bucket_name: str,
    resources_bucket_name: str,
    dataset: str,
    training_dataset: str,
    utils_file_path: str,
    utils_filename: str,
    plot_utils_filename: str,
    training_perc: float,
    sample_perc_training: float,
    sample_perc_valid: float,
    model: Output[Model],
    model_metrics_report: Output[HTML]
):
    """Compare, tune and publish a PyCaret classifier trained on a BigQuery table.

    The component loads ``{project_id}.{dataset}.{training_dataset}``, orders
    it by ``partition_dt`` and slices it into train / validation / test,
    downsamples train and validation, compares rf / xgboost / lightgbm / et
    with PyCaret, tunes the best candidate for recall, and uploads the
    selected model (as ``model.pkl``) plus its HTML report to ``bucket_name``.

    Args:
        project_id: GCP project used for the BigQuery and GCS clients and as
            the first part of the training table id.
        bucket_name: GCS bucket that receives the reports and saved models.
        resources_bucket_name: GCS bucket holding the helper modules.
        dataset: BigQuery dataset containing the training table.
        training_dataset: Name of the training table inside ``dataset``.
        utils_file_path: Object prefix of the helper modules in
            ``resources_bucket_name``.
        utils_filename: Object name of the preprocessing helper module. It is
            downloaded under this name and then imported as
            ``preprocessing_utils`` (presumably this must therefore be
            ``preprocessing_utils.py`` -- confirm with the caller).
        plot_utils_filename: Object name of the plotting helper module,
            imported as ``plotly_utils`` (same caveat as above).
        training_perc: Fraction of the ordered rows used for training; the
            remainder is split evenly between validation and test.
        sample_perc_training: Passed to ``downsampling`` as
            ``true_to_false_ratio`` for the training slice.
        sample_perc_valid: Passed to ``downsampling`` as
            ``true_to_false_ratio`` for the validation slice.
        model: Output artifact; its ``uri`` is set to the GCS folder that
            holds the final ``model.pkl``.
        model_metrics_report: Output artifact; its ``path`` is set to the GCS
            location of the final model's HTML report.

    Raises:
        FileNotFoundError: If a helper module is missing from the resources
            bucket.
        ValueError: If none of the compared models matches the best row of
            the evaluation report.
    """
    # KFP lightweight components are serialized on their own, so everything
    # the body needs must be imported here rather than at module level.
    import logging
    from datetime import datetime

    from google.cloud import bigquery
    from google.cloud import storage
    from pycaret.classification import (
        setup, create_model, tune_model, get_config, compare_models, save_model,
    )

    client = bigquery.Client(project=project_id, location='northamerica-northeast1')
    storage_client = storage.Client(project=project_id)

    # Fetch the helper modules into the working directory so they can be imported.
    resources_bucket = storage_client.get_bucket(resources_bucket_name)
    for helper_filename in (utils_filename, plot_utils_filename):
        helper_blob = resources_bucket.get_blob(f"{utils_file_path}/{helper_filename}")
        if helper_blob is None:
            # get_blob returns None for a missing object; fail with a clear message.
            raise FileNotFoundError(
                f"gs://{resources_bucket_name}/{utils_file_path}/{helper_filename} not found"
            )
        helper_blob.download_to_filename(helper_filename)

    from preprocessing_utils import downsampling
    from plotly_utils import evaluate_and_save_models, create_folder_if_not_exists

    ############################# Load training data ############################
    training_table = f"{project_id}.{dataset}.{training_dataset}"
    train_query = f"SELECT * FROM `{training_table}`"
    train_all = client.query(train_query, job_config=bigquery.QueryJobConfig()).to_dataframe()

    # NOTE(review): an experimental block used to follow here. It re-read the
    # training data from a gzip CSV in a hard-coded bucket and joined
    # `target_ind` from `bq_call_to_retention_targets` on `ban`. Every input it
    # relied on (pd, file_bucket, service_type, dataset_id, score_date_dash,
    # score_date_val_dash, df_train, df_test) was undefined in this component,
    # so it could only raise NameError. It was removed; re-introduce the label
    # join with proper component parameters if it is still required.

    ##############  Split train/valid/test based on training_perc  ##############
    ordered = train_all.sort_values(["partition_dt"])
    total_rows = ordered.shape[0]
    n_train = int(total_rows * training_perc)
    n_valid = int(total_rows * ((1 - training_perc) / 2))

    train_df = ordered.iloc[:n_train]
    valid_df = ordered.iloc[n_train:n_train + n_valid]
    # Everything after train + validation is the hold-out test set. Copy so the
    # dtype cast below does not write into a slice of `ordered`.
    test_df = ordered.iloc[train_df.shape[0] + valid_df.shape[0]:].copy()

    ##############  Downsample train/valid   #######################
    train_sampled = downsampling(df=train_df, true_to_false_ratio=sample_perc_training)
    valid_sampled = downsampling(df=valid_df, true_to_false_ratio=sample_perc_valid)

    ############################# Preprocess data  ############################
    drop_cols = ['ban']
    train_sampled.drop(columns=drop_cols, inplace=True)
    valid_sampled.drop(columns=drop_cols, inplace=True)

    # NOTE(review): the label column is referred to by three different names in
    # this component: 'rpc_flag' (casts below, numeric_features, tuned-model
    # evaluation), 'target' (PyCaret setup) and 'target_ind' (base-model
    # evaluation). They are kept as found because the table schema is not
    # visible here -- reconcile them against the training table before running.
    train_sampled.rpc_flag = train_sampled.rpc_flag.astype(int)
    valid_sampled.rpc_flag = valid_sampled.rpc_flag.astype(int)
    test_df.rpc_flag = test_df.rpc_flag.astype(int)
    # TODO: Training class 0 and class 1 sample function integration
    numeric_features = [col for col in train_sampled.columns if col != 'rpc_flag']

    ################################ Pycaret Setup initialize  ############################
    classification_setup = setup(data=train_sampled,
                                 ignore_features=drop_cols,
                                 test_data=valid_sampled,
                                 target='target',
                                 fix_imbalance=False,
                                 remove_outliers=True,
                                 normalize=True,
                                 normalize_method='zscore',
                                 log_experiment=False,
                                 remove_multicollinearity=True,
                                 multicollinearity_threshold=0.95,
                                 feature_selection=True,
                                 fold=5,
                                 fold_shuffle=True,
                                 session_id=123,
                                 numeric_features=numeric_features,
                                 silent=True)

    ### Pycaret top 3 models to analyze
    candidate_models = compare_models(include=['rf', 'xgboost', 'lightgbm', 'et'],
                                      errors='raise', n_select=3)

    # Save the reports and report figures of the top models to GCS.
    todays_date = datetime.now().strftime("%Y-%m-%d")
    save_path = f'pycaret/test_{todays_date}/'
    model_reports, model_to_report_map = evaluate_and_save_models(models=candidate_models.copy(),
                                         bucket_name=bucket_name,
                                         save_path=save_path,
                                         test_df=test_df,
                                         actual_label_str='target_ind',
                                         columns=get_config('X_train').columns,
                                         save_columns=True,
                                         show_report=False)

    # Pick the best base model: highest Recall, ties broken by Precision.
    # .iloc[0] is positional; the previous `[0]` was a label lookup that only
    # worked when the best row happened to carry index label 0.
    best_report_row = model_reports.sort_values(["Recall", "Precision"], ascending=False).iloc[0]
    top_model_name = best_report_row["model_name"]
    base_recall = best_report_row["Recall"]

    top_model = next(
        (candidate for candidate in candidate_models
         if candidate.__class__.__name__ == top_model_name),
        None,
    )
    if top_model is None:
        raise ValueError(
            f"No compared model matches the best report entry {top_model_name!r}"
        )
    best_model_report = model_to_report_map[top_model_name]

    ################ Export the top model's report and figs to GCS ###############################
    bucket = storage_client.bucket(bucket_name)
    save_path = f'models/best_model/{todays_date}/'
    create_folder_if_not_exists(save_path)

    ######### Save HTML report of the best model
    filename = f"{todays_date}_{top_model_name}.html"
    best_model_report.write_html(f"{save_path}{filename}")
    blob = bucket.blob(f"{save_path}{filename}")
    blob.upload_from_filename(f"{save_path}{filename}")
    logging.info(f"{filename} sucessfully uploaded to GCS bucket!")

    ####### Save the model (PyCaret's save_model appends the .pkl extension)
    model_file_name = f"{save_path}{top_model_name}_{todays_date}"
    save_model(top_model, model_file_name)
    filename = f"{top_model_name}_{todays_date}.pkl"
    blob = bucket.blob(f"{save_path}{filename}")
    blob.upload_from_filename(f"{model_file_name}.pkl")
    logging.info(f"{filename} sucessfully uploaded to GCS bucket!")

    ###############################  Tuned Model  ##############################
    model_base = create_model(top_model)
    tuned_model, tuner = tune_model(model_base, optimize='recall', return_tuner=True, n_iter=20)
    save_path = f'models/best_model/tuned/{todays_date}/'
    # NOTE(review): the base evaluation above passes a list of models while
    # this call passes a single estimator -- confirm the helper accepts both.
    model_reports_tuned, model_to_report_map_tuned = evaluate_and_save_models(models=tuned_model,
                                         bucket_name=bucket_name,
                                         save_path=save_path,
                                         test_df=test_df,
                                         actual_label_str='rpc_flag',
                                         columns=get_config('X_train').columns,
                                         save_columns=True,
                                         show_report=False)

    ###############################   Define  Final Model     ##############################
    tuned_recall = model_reports_tuned.Recall.values[0]

    # Default to the untuned model; switch only if tuning gave a strictly
    # better, but not perfect, recall.
    final_model_class_name = top_model_name
    final_model_report = best_model_report
    final_model_file = top_model

    if tuned_recall >= 1:
        # A recall of 100% is treated as suspicious, so the tuned model is rejected.
        logging.info("CAUTION : TUNED MODEL had 100% recall. TUNED model was not selected as best model. ")
    elif tuned_recall > base_recall:
        logging.info(f"TUNED MODEL had {tuned_recall*100} recall and Base model without tuning had {base_recall*100} Recall. TUNED model was selected as best model. ")
        final_model_class_name = tuned_model.__class__.__name__
        final_model_report = model_to_report_map_tuned[final_model_class_name]
        final_model_file = tuned_model
    else:
        logging.info(f"TUNED MODEL had {tuned_recall*100} recall and Base model without tuning had {base_recall*100} Recall. BASE model was selected as best model. ")

    ###############################  Save the Report and model    ###############################
    save_path = f'models/final_selected/{todays_date}/'
    create_folder_if_not_exists(save_path)

    # HTML report of the selected model
    filename = f"{todays_date}_{final_model_class_name}.html"
    final_model_report.write_html(f"{save_path}{filename}")
    blob = bucket.blob(f"{save_path}{filename}")
    blob.upload_from_filename(f"{save_path}{filename}")
    print(f"{filename} sucessfully uploaded to GCS bucket!")

    # The selected model is saved locally under a descriptive name but uploaded
    # as `model.pkl`, the name the downstream upload-model component expects
    # according to the original comment in this file.
    model_file_name = f"{save_path}{final_model_class_name}_{todays_date}"
    save_model(final_model_file, model_file_name)
    filename = 'model.pkl'
    blob = bucket.blob(f"{save_path}{filename}")
    blob.upload_from_filename(f"{model_file_name}.pkl")
    print(f"{filename} sucessfully uploaded to GCS bucket!")

    ######################## Point the output artifacts at the uploaded files ##############
    model.uri = f'gs://{bucket_name}/models/final_selected/{todays_date}/'

    ###################### Output Final Selected Model's HTML Report View ########################
    model_metrics_report.path = f'gs://{bucket_name}/{save_path}{todays_date}_{final_model_class_name}.html'