##### Setup

In [0]:
#basic imports
from pyspark.sql.functions import col, count, when, isnan, isnull, percent_rank, monotonically_increasing_id
from pyspark.sql import functions as F
from pyspark.sql import types
from pyspark import StorageLevel

#for EDA/plots
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#for feature creation
from pyspark.ml.feature import StandardScaler, VectorAssembler, StringIndexer, OneHotEncoder, ChiSqSelector, Bucketizer
from pyspark.sql.window import Window
from pyspark.sql import Row
from pyspark.sql.functions import concat, substring, lit, udf
from pyspark.sql import DataFrame
from pyspark.sql import Window as W

#for modeling
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier, GBTClassifier, NaiveBayes
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline
import itertools
#for evaluation
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.evaluation import BinaryClassificationMetrics, MulticlassMetrics
from sklearn.metrics import classification_report
from sparkdl.xgboost import XgboostClassifier

In [0]:
blob_container = "team20fp" # The name of your container created in https://portal.azure.com
storage_account = "w261fp" # The name of your Storage account created in https://portal.azure.com
secret_scope = "team20scope" # The name of the scope created in your local computer using the Databricks CLI
secret_key = "team20key" # The name of the secret key created in your local computer using the Databricks CLI 
blob_url = f"wasbs://{blob_container}@{storage_account}.blob.core.windows.net"
mount_path = "/mnt/mids-w261"

spark.conf.set(
  f"fs.azure.sas.{blob_container}.{storage_account}.blob.core.windows.net",
  dbutils.secrets.get(scope = secret_scope, key = secret_key)
)

### Load Data

In [0]:
def load_data(prefolded = True, limited_features = True):
    
    #load the set of 5-fold data
    if prefolded:
        df_list = []
        
        #limited features
        if limited_features:
            for i in range(5):
                train = spark.read.parquet(f"{blob_url}/folds5_lim_features/fold_{i}_train")
                test = spark.read.parquet(f"{blob_url}/folds5_lim_features/fold_{i}_test")
                df_list.append((train, test))
        #all features
        else:
            for i in range(5):
                train = spark.read.parquet(f"{blob_url}/folds5_full_features/fold_{i}_train")
                test = spark.read.parquet(f"{blob_url}/folds5_full_features/fold_{i}_test")
                df_list.append((train, test))
        return(df_list)
       
    #return a single dataframe (entire feature set)    
    else:
        df = spark.read.parquet(f"{blob_url}/feature_set_full_sorted")
        
        return(df)

### Modeling Helper Functions

In [0]:
################# functions for prepping data ###################

def balance_trainset(train_df, up_or_down = 'up'):
    ''' Balance classes in training dataset'''
    
    num_delay = train_df.filter(F.col('label') == 1).count()
    num_ontime = train_df.filter(F.col('label') == 0).count()
    
    #total_obs = num_delay + num_ontime
    
    if up_or_down == 'up':
        pct = num_ontime/num_delay
        
        ontime_df = train_df.filter(F.col('label') == 0)
        delay_df = train_df.filter(F.col('label') == 1).sample(withReplacement = True, fraction = pct, seed = 1)
        
    
    elif up_or_down == 'down':
        pct = num_delay/num_ontime
        
        delay_df = train_df.filter(F.col('label') == 1)
        ontime_df = train_df.filter(F.col('label') == 0).sample(withReplacement = False, fraction = pct, seed = 1)
    
    
    train_balance = delay_df.union(ontime_df)
    print(f'Balancing factor: {round(pct,2)}')
    print(f'Num observations after balancing: {train_balance.count()}')
    return train_balance

################### functions for modeling ######################

def lr_pipeline(model, cts_features, cat_features, bucket_features):
    ''' Pipeline to scale continuous features and encode categorical features'''
    
    #scale continuous features
    vect_cts = VectorAssembler(inputCols = cts_features,
                               outputCol = 'cts_feats',
                              handleInvalid = 'skip')
    scaler = StandardScaler(inputCol = 'cts_feats',
                            outputCol = 'scaled_cts_feats',
                            withStd = True, withMean = True)
    
    ##test bucketed vars
    vect_bucket = Bucketizer(inputCols = bucket_features,
                                         splitsArray = [[0,8,25,50,100,250,500,1000,float('inf')], #orig_fpd
                                           [0,8,25,50,100,250,500,1000,float('inf')], #dest_fpd
                                           [-float('inf'), 4023, float('inf')], #vis
                                           [-float('inf'), -156, float('inf')], #air_tmp
                                           [1,2,3,4,5,7,float('inf')], #distance (group)
                                           [0, 0.20, 0.45, 0.6, 0.8, float('inf')], #pct carrier del
                                           [0, 0.20, 0.45, 0.6, 0.8, float('inf')], #pct route del
                                           [0, 0.20, 0.45, 0.6, 0.8, float('inf')], #pct orig del
                                           [0, 0.20, 0.45, 0.6, 0.8, float('inf')] #pct dest del
                                          ],
                            outputCols = [col+'_bucket' for col in bucket_features])
    
    #index string values before one hot encoding (also works on numeric categoricals, will convert to string then index)
    indexed = StringIndexer(inputCols = cat_features+[col+'_bucket' for col in bucket_features],
                            outputCols = [col+'_idx' for col in cat_features+bucket_features],
                            handleInvalid = 'keep')
    
    
    onehot_feats = OneHotEncoder(inputCols = [col+'_idx' for col in cat_features+bucket_features],
                                outputCols = [col+'_enc' for col in cat_features+bucket_features])
    
    vect_cat = VectorAssembler(inputCols = [col+'_enc' for col in cat_features+bucket_features],
                              outputCol = 'cat_feats')
    

    #combine cts and cat features
    combined_vect = VectorAssembler(inputCols = ['scaled_cts_feats', 'cat_feats'],
                                   outputCol = 'features')
    
    #combine pipeline components
    pipeline = Pipeline(stages = [vect_cts, scaler, vect_bucket, indexed, onehot_feats, vect_cat, combined_vect, model])
    return pipeline

# tree algo pipeline - no scaler, no feature selector
def tree_pipeline(model, cts_features, cat_features, bucket_features):
    
    #cts vars
    vect_cts = VectorAssembler(inputCols = cts_features,
                               outputCol = 'cts_feats',
                              handleInvalid = 'skip')
    
    #cts vars to bucket
    vect_bucket = Bucketizer(inputCols = bucket_features,
                            splitsArray = [[0,8,25,50,100,250,500,1000,float('inf')], #orig_fpd
                                           [0,8,25,50,100,250,500,1000,float('inf')], #dest_fpd
                                           [-float('inf'), 4023, float('inf')], #vis
                                           [-float('inf'), -156, float('inf')], #air_tmp
                                           [1,2,3,4,5,7,float('inf')], #distance (group)
                                           [0, 0.20, 0.4, 0.6, 0.8, float('inf')], #pct carrier del
                                           [0, 0.20, 0.4, 0.6, 0.8, float('inf')], #pct route del
                                           [0, 0.20, 0.4, 0.6, 0.8, float('inf')], #pct orig del
                                           [0, 0.20, 0.4, 0.6, 0.8, float('inf')] #pct dest del
                                          ],
                             outputCols = [col+'_bucket' for col in bucket_features])
    
    #cat vars
    indexed = StringIndexer(inputCols = cat_features+[col+'_bucket' for col in bucket_features],
                            outputCols = [col+'_idx' for col in cat_features+bucket_features],
                            handleInvalid = 'keep')
    
    
    onehot_feats = OneHotEncoder(inputCols = [col+'_idx' for col in cat_features+bucket_features],
                                outputCols = [col+'_enc' for col in cat_features+bucket_features])
    
    vect_cat = VectorAssembler(inputCols = [col+'_enc' for col in cat_features+bucket_features],
                              outputCol = 'cat_feats')
    
    combined_vect = VectorAssembler(inputCols = ['cts_feats', 'cat_feats'],
                                   outputCol = 'features')
    
    #combine pipeline components
    pipeline = Pipeline(stages = [vect_cts, vect_bucket, indexed, onehot_feats, vect_cat, combined_vect, model])
    return pipeline

    
def gen_model_pipeline(model_type, param_dict, cts_features, cat_features, bucket_features = None):
    '''Input model type and parameters, return model pipeline'''
    
    #still need to create dict with parameters for each model
    params = param_dict[model_type]
    if model_type == 'lr':
        lr = LogisticRegression(regParam = params['regParam']
                                #,weightCol = 'label_weight'
                               )
        pipeline = lr_pipeline(lr, cts_features, cat_features, bucket_features)
    
    elif model_type == 'gbt':
        gbt = GBTClassifier(maxDepth = params['maxDepth'],
                           maxBins = params['maxBins'],
                           maxIter = params['maxIter'],
                           stepSize = params['stepSize'])
        pipeline = tree_pipeline(gbt, cts_features, cat_features, bucket_features)
    
    elif model_type == 'xgb':

        xgb = XgboostClassifier(labelCol = 'label',
                                featuresCol = 'features',
                                missing = 0.0,
                                rawPredictionCol = 'probability',
                                #booster = params['booster'], #defaults to gbtree
                                max_depth = params['max_depth'],
                                n_estimators = params['n_estimators'],
                                reg_lambda = params['reg_lambda'],
                                reg_alpha = params['reg_alpha'],
                                objective = params['objective'],
                                base_score = params['base_score'],
                                gamma = params['gamma'],
                                scale_pos_weight = params['scale_pos_weight'],
                                min_child_weight = params['min_child_weight'],
                                #max_delta_step = params['max_delta_step'],
                                learning_rate = params['learning_rate'],
                                max_bin = params['max_bin']
                               )
        
        pipeline = tree_pipeline(xgb, cts_features, cat_features, bucket_features)
        
    return pipeline




def fit_model(train_df, model_type, param_dict, cts_features, cat_features, bucket_features = None, balance_type = 'up', pipeline = None):
    ''' Balance train_df, generate model pipeline using best params, train model'''
    
    #balance train_df
    if balance_type == 'up':
        train_df = balance_trainset(train_df, up_or_down = 'up').persist(StorageLevel.MEMORY_AND_DISK)
    elif balance_type == 'down':
        train_df = balance_trainset(train_df, up_or_down = 'down').persist(StorageLevel.MEMORY_AND_DISK)
    elif balance_type == 'weight':
        train_df = weight_classes(train_df).persist(StorageLevel.MEMORY_AND_DISK)
    #elif balance_type == None:
        #train_df = train_df.persist(StorageLevel.MEMORY_AND_DISK)
    #print('Train dataset balancing complete')  
    
    #gen model pipeline using best params (need to find thru CV)
    #params = param_dict[model_type]
    if pipeline == None:
        pipeline = gen_model_pipeline(model_type, param_dict, cts_features, cat_features, bucket_features)
        #print('Pipeline generation complete')
    #train model
    model = pipeline.fit(train_df)
    #print('Training complete')
    train_df.unpersist()
    return model


################# functions for evaluation ######################



def eval_p_r_f2(df, acc = True):
    pred_rdd = df.select(['prediction', 'label']).rdd
    multi_metrics = MulticlassMetrics(pred_rdd)
    precision = multi_metrics.precision(1)
    recall = multi_metrics.recall(label = 1)
    f2 = multi_metrics.fMeasure(1.0,2.0)
    if acc:
        score =  multi_metrics.accuracy
        return(precision, recall, f2, score)
    return (precision, recall, f2)
  
    
def extract_prob(v):
    '''udf to extract just the positive class probability from a transformed valedation dataframe'''
    
    try:
        return float(v[1])  
    except ValueError:
        return None
    
extract_prob_udf = udf(extract_prob, types.DoubleType())

def eval_thresholds(val_pred_list, thresholds = [.25,.5,.75]):
    '''in: a list of valedation dataframes with predictions as column 'label'
        out: dataframe that shows the prediction threshold and associated accuracy metrics
    '''
    threshold_precisions = []
    threshold_recalls = []
    threshold_f2s = []
    threshold_accuracies = []
    
    df= val_pred_list[0]
    for i in val_pred_list[1:]:
        df = df.union(i)

    df = df.withColumn('probability', extract_prob_udf('probability'))

    for i in thresholds:
        curr = df.select(col('probability').cast('float'),col('label').cast('float'))\
               .withColumn('prediction', (col('probability')>=i).cast('float'))
            
        metrics = eval_p_r_f2(curr, acc = True)
        
        threshold_precisions.append(metrics[0])
        threshold_recalls.append(metrics[1])
        threshold_f2s.append(metrics[2])
        threshold_accuracies.append(metrics[3])
    
    out = pd.DataFrame({'threshold': thresholds, 
                        'precision': threshold_precisions,
                        'recall': threshold_recalls,
                        'f2':threshold_f2s,    
                        'accuracy':threshold_accuracies
                        })
    return(out)



###CV/Tuning Helper Functions

In [0]:
def gen_param_grid(param_dict):
    ''' Create list of all param combinations '''
    param_keys = list(param_dict.keys())
    param_vals = param_dict.values()
    param_combinations = list(itertools.product(*param_vals))
    
    return (param_keys, param_combinations)


def kfold_split(df, n_splits = 5, train_size = .8, balance_type = 'down'):
    '''inputs: dataframe, number of splits (train_size), percentage of each fold that should be allocated to training (train_size), which direction to balance label proportions (balance_type)
    output: list of n_splits tuples, where each tuple consists of a trainining set and a test set sorted by datetime'''
    
    w = Window().orderBy('UTC_DEP_TIME')

    df = df.withColumn("row_num", F.row_number().over(w))
    
    kfold_dataframes = []
    n_samples = df.count()
    k_fold_size = n_samples // n_splits
    indices = np.arange(n_samples)

    margin = 0
    for i in range(n_splits):
        start = i * k_fold_size
        if i == n_splits-1:
            stop = n_samples+1
        else:
            stop = start + k_fold_size
        mid = int(train_size * (stop - start)) + start
    
        print(f'Fold: {i+1}, Train Start: {start}, Train Stop/ Test Start: {mid}, Test Stop: {stop-1}')
        print(f'Fold Size: {stop-start}')
        print(f'Train Percentage: {100*(mid-start)/(stop-start):.2f}%')        

        train_df = df.filter((col('row_num')>= start)&(col('row_num')< mid))
        #balance classes in train_df
        train_df = balance_trainset(train_df, balance_type)
        
        test_df = df.filter((col('row_num')>= mid)&(col('row_num')< stop))
        kfold_dataframes.append([train_df, test_df])
        print('=========================')

    return(kfold_dataframes)



def param_opt_cv(df_list, test_param_dict, model_type, cts_features, cat_features, bucket_features = None, balance_type = 'down'):
       
    print(f'Number of folds: {len(df_list)}')
    
    #initialize lists to track results (select best at end based on max of score list)
    precision_list = []
    recall_list = []
    f2_list = []
    test_params = []

    #call param grid function to get list of all param combinations to test
    param_grid = gen_param_grid(test_param_dict[model_type])
    print(f'Num parameter combinations to test: {len(param_grid[1])}')

    for params in param_grid[1]:
        
        #initialize lists for param set to take avg at end 
        ps_precision = []
        ps_recall = []
        ps_f2 = []

        #create param dict for input to pipeline gen function
        param_dict = {p[0]:p[1] for p in zip(param_grid[0],params)}
        print(f'Testing params: {param_dict}')
        
        #add params to full list for tracking
        test_params.append(param_dict)
        
        #create pipeline with params (set dict structure to match input to gen_model_pipeline with model type as key)
        pipeline = gen_model_pipeline(model_type, {model_type:param_dict}, cts_features, cat_features, bucket_features = bucket_features)
        
        for df in df_list: #get train df
            train_fold, val_fold = df[0], df[1]
            
            train_fold = train_fold.persist()
 
            
            model = fit_model(train_fold, model_type, param_dict, cts_features, cat_features, balance_type = None, bucket_features = bucket_features, pipeline = pipeline)
            
            val_pred = model.transform(val_fold)
            #evaluate
            metrics = eval_p_r_f2(val_pred)
            ps_precision.append(metrics[0])
            ps_recall.append(metrics[1])
            ps_f2.append(metrics[2])
            #print('Evaluation done')

        #take mean across folds for param set
        precision_list.append(np.mean(ps_precision))
        recall_list.append(np.mean(ps_recall))
        f2_list.append(np.mean(ps_f2))
            
    metrics_df = pd.DataFrame(list(zip(test_params, f2_list, precision_list, recall_list)), columns = ['Params','F2','Precision','Recall'])
        
    print(f'Optimal parameters:',metrics_df.loc[metrics_df['F2'].idxmax()])
    return metrics_df
        


## Baseline models
All with 5-fold CV

In [0]:
df_list = load_data(prefolded = True, limited_features = False)

In [0]:
cts_vars = ['PRE_FL_WINDOW','CUMAVG_WND_DIR_WEEKLY','CUMAVG_DEW_WEEKLY','CUMAVG_VIS_WEEKLY','CUMAVG_SLP_WEEKLY','CUMAVG_WND_SPEED_WEEKLY','CUMAVG_CEIL_HEIGHT_WEEKLY','CUMAVG_AIR_TMP_WEEKLY','CUMAVG_DEP_DELAY_WEEKLY','CUMAVG_DEP_DEL15_WEEKLY','CUMAVG_ARR_DELAY_WEEKLY','CUMAVG_ARR_DEL15_WEEKLY']

cat_vars = ['ORIGIN','YEAR','QUARTER','DEST','MONTH','DAY_OF_WEEK','OP_CARRIER','PRIOR_ARR_DEL','PRIOR_DEP_DEL','ORIG_DEST']

bucket_vars = ['ORIG_FPD','DEST_FPD','VIS','AIR_TMP','DISTANCE','PCT_CARRIER_DEL','PCT_ROUTE_DEL','PCT_ORIG_DEL','PCT_DEST_DEL']


##### Logistic Regression

In [0]:
param_dict1 = {'lr':{'regParam':[0.1],
                     'elasticNetParam': [0.8]}}

lr_baseline = param_opt_cv(df_list, 
                           param_dict1, 
                           model_type = 'lr',
                           cts_features = cts_vars,
                           cat_features = cat_vars,
                           bucket_features = bucket_vars,
                           balance_type = 'down')

##### GBT Classifier Baseline

In [0]:
#test performance of bucketed FPD vars (vs. continuous, above)
param_dict2 = {'gbt':{'maxDepth':[7],
                      'maxBins':[250],
                      'maxIter':[6],
                      'stepSize':[0.3]}}

gbt_baseline = param_opt_cv(df_list,
                            param_dict2,
                            model_type = 'gbt',
                            cts_features = cts_vars,
                            cat_features = cat_vars,
                            bucket_features = bucket_vars,
                            balance_type = 'down')

##### XGBoost Baseline 
[XGBoost parameter info](https://xgboost.readthedocs.io/en/stable/parameter.html)

In [0]:
#test performance of bucketed FPD vars
param_dict3 = {'xgb':{'max_depth':[6],
                      'n_estimators':[125],
                      'reg_lambda':[1],
                      'reg_alpha':[0.1],
                      'tree_method':['hist'],
                      'objective':['binary:logistic'],
                      'base_score':[0.5],
                      'gamma':[0.05],
                      'min_child_weight':[1.5],
                      'max_bin': [100],
                      'learning_rate' : [0.3]}}

xgb_baseline = param_opt_cv(df_list,
                            param_dict3,
                            model_type = 'xgb',
                            cts_features = cts_vars,
                            cat_features = cat_vars,
                            bucket_features = bucket_vars,
                            balance_type = 'down')

### Grid Search Hyperparameter Tuning with Cross Validation

##### GBT Classifier Hyperparameter Tuning

In [0]:
param_dict4 = {'gbt':{'maxDepth':[6,9],
                      'maxBins':[250,300],
                      'maxIter':[6],
                      'stepSize':[0.2]}}

gbt_opt = param_opt_cv(df_list,
                       param_dict4,
                       model_type = 'gbt',
                       cts_features = cts_vars,
                       cat_features = cat_vars,
                       bucket_features = bucket_vars,
                       balance_type = 'down')

In [0]:
gbt_opt

Unnamed: 0,Params,F2,Precision,Recall
0,"{'maxDepth': 6, 'maxBins': 250, 'maxIter': 6, ...",0.573545,0.319713,0.725539
1,"{'maxDepth': 6, 'maxBins': 300, 'maxIter': 6, ...",0.57478,0.318967,0.728463
2,"{'maxDepth': 9, 'maxBins': 250, 'maxIter': 6, ...",0.576526,0.324444,0.723345
3,"{'maxDepth': 9, 'maxBins': 300, 'maxIter': 6, ...",0.578219,0.323277,0.727845


##### XGBoost Hyperparameter Tuning
Iteration 1

In [0]:
param_dict5 = {'xgb':{#booster = params['booster'],
                                'max_depth':[6],
                                'n_estimators':[150],
                                'reg_lambda':[0.5,1],
                                'reg_alpha':[0.1,0.5],
                                'tree_method':['hist'],
                                'objective':['binary:logistic'],
                                'base_score':[0.5],
                                'gamma':[0,0.05],
                                #'scale_pos_weight':[1],
                                'min_child_weight':[1.5],
                                #'max_delta_step':[0.7],
                                'max_bin': [100],
                                'learning_rate' : [0.2,0.3]}}

xgb_opt = param_opt_cv(df_list,
                            param_dict5,
                            model_type = 'xgb',
                            cts_features = cts_vars,
                            cat_features = cat_vars,
                            bucket_features = bucket_vars,
                            balance_type = 'down')

In [0]:
xgb_opt

Unnamed: 0,Params,F2,Precision,Recall
0,"{'max_depth': 6, 'n_estimators': 150, 'reg_lam...",0.584254,0.330522,0.734255
1,"{'max_depth': 6, 'n_estimators': 150, 'reg_lam...",0.584311,0.329345,0.735267
2,"{'max_depth': 6, 'n_estimators': 150, 'reg_lam...",0.584371,0.330502,0.734658
3,"{'max_depth': 6, 'n_estimators': 150, 'reg_lam...",0.584274,0.329622,0.734908
4,"{'max_depth': 6, 'n_estimators': 150, 'reg_lam...",0.584532,0.330556,0.734892
5,"{'max_depth': 6, 'n_estimators': 150, 'reg_lam...",0.58183,0.329981,0.729399
6,"{'max_depth': 6, 'n_estimators': 150, 'reg_lam...",0.584341,0.330519,0.734371
7,"{'max_depth': 6, 'n_estimators': 150, 'reg_lam...",0.581927,0.329923,0.729943
8,"{'max_depth': 6, 'n_estimators': 150, 'reg_lam...",0.584572,0.330167,0.735825
9,"{'max_depth': 6, 'n_estimators': 150, 'reg_lam...",0.583468,0.329762,0.733065


Iteration 2

In [0]:
param_dict6 = {'xgb':{#booster = params['booster'],
                                'max_depth':[6],
                                'n_estimators':[150],
                                'reg_lambda':[1],
                                'reg_alpha':[0.1,0.2],
                                'tree_method':['hist'],
                                'objective':['binary:logistic'],
                                'base_score':[0.5],
                                'gamma':[0.05],
                                #'scale_pos_weight':[1],
                                'min_child_weight':[1.5],
                                #'max_delta_step':[0.7],
                                'max_bin': [50,100],
                                'learning_rate' : [0.2]}}

xgb_opt_2 = param_opt_cv(df_list,
                            param_dict6,
                            model_type = 'xgb',
                            cts_features = cts_vars,
                            cat_features = cat_vars,
                            bucket_features = bucket_vars,
                            balance_type = 'down')

In [0]:
xgb_opt_2

Unnamed: 0,Params,F2,Precision,Recall
0,"{'max_depth': 6, 'n_estimators': 150, 'reg_lam...",0.58393,0.330424,0.730531
1,"{'max_depth': 6, 'n_estimators': 150, 'reg_lam...",0.58393,0.330424,0.730531
2,"{'max_depth': 6, 'n_estimators': 150, 'reg_lam...",0.584068,0.331052,0.72917
3,"{'max_depth': 6, 'n_estimators': 150, 'reg_lam...",0.584068,0.331052,0.72917


### Final Model

In [0]:
train_df = load_data(prefolded = False).filter(F.col('YEAR') != 2019).withColumnRenamed('DEP_DEL15','label').withColumnRenamed('DISTANCE_GROUP','DISTANCE')
val_df = load_data(prefolded = False).filter(F.col('YEAR') == 2019).withColumnRenamed('DEP_DEL15','label').withColumnRenamed('DISTANCE_GROUP','DISTANCE')

In [0]:
params = {'xgb':{#booster = params['booster'],
                                'max_depth':6,
                                'n_estimators':150,
                                'reg_lambda':1,
                                'reg_alpha':0.2,
                                'tree_method':'hist',
                                'objective':'binary:logistic',
                                'base_score':0.5,
                                'gamma':0.05,
                                #'scale_pos_weight':[1],
                                'min_child_weight':1.5,
                                'max_bin': 50,
                                'learning_rate' : 0.2}}

xgb_model = fit_model(train_df,
                     'xgb',
                     params,
                     cts_features = cts_vars,
                     cat_features = cat_vars,
                     bucket_features = bucket_vars,
                     balance_type = 'down')

In [0]:
val_pred = xgb_model.transform(val_df)
val_pred.write.parquet(f"{blob_url}/val_pred_xgb_3")

In [0]:
metrics = eval_p_r_f2(val_pred)
print(f'Precison:{metrics[0]}')
print(f'Recall:{metrics[1]}')
print(f'F2:{metrics[2]}')
print(f'Accuracy:{metrics[3]}')