# Decision Tree Classifier
- Pipeline starts from loading data from our custom data model train/test checkpoints:
  - Encode features for model training
  - Cross validation pipeline
  - Single train / test pipeline
    - code in notebook corresponds to best model

## Imports

In [0]:
import pyspark.pandas as ps
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, ArrayType
from pyspark.sql import functions as F
from pyspark.sql.functions import col, current_date, datediff, skewness, kurtosis, max, length, coalesce
from pyspark.ml.functions import vector_to_array

from pyspark.ml.classification import DecisionTreeClassifier, DecisionTreeClassificationModel
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StringIndexer, VectorAssembler, Imputer, MinMaxScaler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline, PipelineModel

from sklearn.metrics import classification_report, precision_recall_fscore_support

In [0]:
## Place this cell in any team notebook that needs access to the team cloud storage.
blob_container  = "261-final-project"       # The name of your container created in https://portal.azure.com
storage_account = "ansonbquon"  # The name of your Storage account created in https://portal.azure.com
secret_scope    = "final_project"           # The name of the scope created in your local computer using the Databricks CLI
secret_key      = "project_key"             # The name of the secret key created in your local computer using the Databricks CLI
team_blob_url   = f"wasbs://{blob_container}@{storage_account}.blob.core.windows.net"  #points to the root of your team storage bucket

# the 261 course blob storage is mounted here.
mids261_mount_path      = "dbfs:/mnt/mids-w261"

# SAS Token: Grant the team limited access to Azure Storage resources
spark.conf.set(
  f"fs.azure.sas.{blob_container}.{storage_account}.blob.core.windows.net",
  dbutils.secrets.get(scope = secret_scope, key = secret_key)
)

# see what's in the blob storage root folder 
display(dbutils.fs.ls(f"{team_blob_url}/PRD/"))

path,name,size,modificationTime
wasbs://261-final-project@ansonbquon.blob.core.windows.net/PRD/1ymodel/,1ymodel/,0,1733581182000
wasbs://261-final-project@ansonbquon.blob.core.windows.net/PRD/parquet_prod_data/,parquet_prod_data/,0,1734171841000
wasbs://261-final-project@ansonbquon.blob.core.windows.net/PRD/parquet_prod_data_1y/,parquet_prod_data_1y/,0,1733579343000
wasbs://261-final-project@ansonbquon.blob.core.windows.net/PRD/xval/,xval/,0,1733543026000
wasbs://261-final-project@ansonbquon.blob.core.windows.net/PRD/xval_2020/,xval_2020/,0,1734024312000
wasbs://261-final-project@ansonbquon.blob.core.windows.net/PRD/xval_corrected/,xval_corrected/,0,1733964471000


## Functions

In [0]:
# RUN CELL - contains the subset of features to be considered in final model

# LABEL
label = 'depDel15'

# NUMERIC COLUMNS
# min/max scaling
num_scale_columns = ['crsElapsedTime',
                    'distance',
                    'actualElapsedTimePrevF',
                    'distancePrevF',
                    'timeBtwFlights',
                    'hourlyAltimeterSetting',
                    'hourlyDewPointTemperature',
                    'hourlyDryBulbTemperature',
                    'hourlyPrecipitation',
                    'hourlyPressureChange',
                    'hourlyPressureTendency',
                    'hourlyRelativeHumidity',
                    'hourlySeaLevelPressure',
                    'hourlyStationPressure',
                    'hourlyVisibility',
                    'hourlyWetBulbTemperature',
                    'hourlyWindDirection',
                    'hourlyWindGustSpeed',
                    'hourlyWindSpeed',
                    'daysToNearestHoliday',
                    'predictedMeanHourlyPrecipitation',
                    'predictedMeanHourlyVisibility',
                    'predictedMeanHourlyDewPointTemp',
                    'originPageRank',
                    'destPageRank',
                    ]

# no min/max scaling needed
num_columns_no_scale = ['crsDepTimeSine',
                        'monthSine',
                        'dayOfWeekSine',
                        ]

# CATEGORICAL COLUMNS
# string indexer
str_idx_columns = [ 'quarter',
                    'month',
                    'dayOfMonth',
                    'dayOfWeek',
                    'opUniqueCarrier',
                    'year',
                    'originType',
                    'destType',
                    'depDel15PrevF',
                    'depDelayGroupPrevF',
                    'depTimeBlkPrevF',
                    'arrDel15PrevF',
                    'arrDelayGroupPrevF',
                    'arrTimeBlkPrevF',
                    'originTypePrevF',
                    ]

# binary indexer
bin_idx_columns = [ 'tailNum',
                    'opCarrierFlNum',
                    'origin',
                    'dest',
                    'originIsoRegion',
                    'destIsoRegion',
                    'originPrevF',
                    ]


# not used (currently)
omit_columns = [    'crsDepTime',
                    'crsDepTimePrevF',
                    'depTimePrevF',
                    'crsArrTimePrevF',
                    'arrTimePrevF',
                    'crsElapsedTimePrevF',
                    'timeBtwFlightsPlanned',
                    'monthIndex',
                    'dayOfWeekIndex',
                    ]

In [0]:
# RUN CELL - functions associated with pre-processing for model training
def replace_scaled_columns(df, input_cols):
    """This function takes a dataframe and list of input columns that were min/max scaled and updates the
    scaled values into the original input columns.
    
    Input:
        df: dataframe
        input_cols: list of numerical scaled features
    Output:
        df: dataframe with min/max features in original source columns
    
    """
    df = df.withColumn('scaled_array', vector_to_array(col('scaled_features')))
    for idx, input_col in enumerate(input_cols):
        df = df.withColumn(input_col, F.col('scaled_array').getItem(idx))
    
    return df

def BinarizationEncoder(value):
    """
    Converts an integer index value to binary for binarization.

    Input:
        value: integer index value
    Output:
        value: binary value with 0b prefix
    """

    value = bin(value)
    return value

def BinarizationVectorizer(value, max_length):
    """
    Converts binary vector into binary array.

    Input:
        value: binary value with 0b prefix
        max_length: max length of binary values in columns
    Output:
        value: array of corresponding values
    """
    # removes prefix and create array for further binarization
    value = str(value).ljust(max_length, '0')[2:]
    value = [int(i) for i in list(value)]

    return value

def BinIndexer(df, input_cols):
    """
    Converts array of binary into separate columns for each index in the array.

    Input:
        df: dataframe
        input_cols: initial list of columns which require binarization
    Output:
        df: dataframe with binarized columns
    """

    for input_col in input_cols:
        # col_name = input_col[:-5]

        # create binary representation of index
        BinarizationEncoder_udf = udf(BinarizationEncoder, StringType())
        df = df.withColumn(f'{input_col}Bin', BinarizationEncoder_udf(df[input_col].cast(IntegerType())))

        # calculate max length to pad binary values
        max_length = df.select(max(length(col(f'{input_col}Bin'))).alias(f'{input_col}MaxLength')) \
                        .collect()[0][f'{input_col}MaxLength']

        # create binary index columns and drop intermediate columns
        BinarizationVectorization_udf = udf(lambda value: BinarizationVectorizer(value, max_length), ArrayType(IntegerType()))
        df = df.withColumn(f'{input_col}Vector', BinarizationVectorization_udf(df[f'{input_col}Bin']))
        for i in range(max_length-2):
                df = df.withColumn(f'{input_col}BinIndex{i}', col(f'{input_col}Vector').getItem(i))
        df = df.drop(f'{input_col}Bin', f'{input_col}Vector')

    return df

def df_processing_pipeline(df):
    """
    Pipeline to scale and string index columns; run prior to binarization.
    
    Input:
        df: dataframe
    Output:
        pipeline: pipeline object
    """
    # Assemble and scale numeric features
    mm_assembler = VectorAssembler(inputCols=num_scale_columns, outputCol='numeric_feat_to_scale')
    scaler = MinMaxScaler(inputCol='numeric_feat_to_scale', outputCol='scaled_features')

    # String index categorical features
    cat_idx_columns = str_idx_columns + bin_idx_columns
    cat_idx_output = [f'{i}Index' for i in cat_idx_columns]
    cat_indexer = StringIndexer(inputCols=cat_idx_columns, outputCols=cat_idx_output, handleInvalid='keep')

    # Pipeline definition
    pipeline_stages = [mm_assembler, scaler, cat_indexer]
    pipeline = Pipeline(stages=pipeline_stages)

    return pipeline

def df_processing(df, pipeline_model):
    """
    Final processing pipeline to transform df with fitted pipeline model (min/max scaling, indexing),
    binarization, and replacing scaled values into source columns.

    Input:
        df: dataframe
        pipeline_model: fitted pipeline model
    Output:
        df: dataframe with processed columns ready for training
    """
    df_processed = pipeline_model.transform(df)
    bin_idx_output = [f'{i}Index' for i in bin_idx_columns]
    df_final = BinIndexer(df_processed, input_cols=bin_idx_output)

    df_final = replace_scaled_columns(df_final, num_scale_columns)
    df_final.drop('numeric_feat_to_scale', 'scaled_features', 'scaled_array')
    
    return df_final


In [0]:
# RUN CELL - functions for model training
def df_feature_assembler(df, assembler_columns):
    """
    Assembles features and drops columns not used in trianing.
    Input:
        df: dataframe
        assemble_columns: list of features to include in model
    Output:
        df: dataframe with 'features', 'labels'
    """
    #drop omitted columns
    label = 'depDel15'
    columns = df.columns
    omit_columns = list(set(columns).symmetric_difference(set(assembler_columns + [label])))
    df = df.drop(*omit_columns)

    # Assemble Vectors
    label_indexer = StringIndexer(inputCol=label, outputCol='label')
    feature_assembler = VectorAssembler(inputCols=assembler_columns, outputCol="features")
    pipeline = Pipeline(stages=[label_indexer, feature_assembler])

    # Train Model
    df_model = pipeline.fit(df).transform(df).select('features', 'label') 

    return df_model

def train_model(df, max_depth, min_info_gain, max_bins):
    """
    Function to create and train model
    Input:
        df: dataframe
        max_depth:
        min_info_gain:
        max_bins:
    Return:
        dt_model: fitted decision tree model
    """
    dt = DecisionTreeClassifier(maxDepth=max_depth,
                                minInfoGain=min_info_gain,
                                labelCol='label',
                                featuresCol="features",
                                maxBins=max_bins)
    dt_model = dt.fit(df)

    return dt_model

def print_metrics(df, model):
    """
    Prints fbeta, recall, precision, and classification report.
    Input:
        df: dataframe
        model: fitted mode
    Prints:
        fbeta, recall, precision, classification report
    """

    predictions = model.transform(df)
    y_true = predictions.select('label').toPandas()
    y_pred = predictions.select('prediction').toPandas()    
    metrics = precision_recall_fscore_support(y_true, y_pred, beta=2.0, average='binary', pos_label=1, zero_division=1)

    print('F_beta:', metrics[2])
    print('Recall:', metrics[1])
    print(f'Precision:', metrics[0])
    print(classification_report(y_true, y_pred, zero_division=0))

def model_metrics(model, df):
    #get predictions
    predictions = model.transform(df)
    y_true = predictions.select('label').toPandas()
    y_pred = predictions.select('prediction').toPandas()
    prf = precision_recall_fscore_support(y_true, y_pred, beta=2.0, average='binary', pos_label=1)

    return prf

# Cross Validation

In [0]:
# RUN CELL - functions for crossvalidation and retrieving weighted averages
def cross_validator(df_train_names, df_test_names, features, max_depth_list, min_info_gain_list, max_bins):
    model_summary = {}

    for xval, (train, test) in enumerate(zip(df_train_names, df_test_names)):
        #load train data
        df_train = spark.read.parquet(f"{team_blob_url}/PRD/xval/{train}/")
        pipeline = df_processing_pipeline(df_train)
        pipeline_model = pipeline.fit(df_train)
        df_train = df_processing(df_train, pipeline_model)

        #load test data
        df_test = spark.read.parquet(f"{team_blob_url}/PRD/xval/{test}/")
        df_test = df_processing(df_test, pipeline_model)

        #fit and transform df's
        df_fit_train = df_feature_assembler(df_train, features)
        df_fit_test = df_feature_assembler(df_test, features)

        #train model on each param
        for i in max_depth_list:
            for j in min_info_gain_list:
                print(f'{xval}: (Max Depth: {i}, Min Info Gain: {j})')
                # create dictionary to save run info
                model_dict = {}
                model_dict['datasplit'] = f'xval{xval+1}'
                model_dict['params'] = {'max_depth': i,
                                        'min_info_gain': j}

                # train and save model
                model = train_model(df_fit_train, i, j, max_bins)
                model_dict['model'] = model

                #get train metrics
                train_metrics = model_metrics(model, df_fit_train)
                train_metrics_dict =   {'Precision': train_metrics[0],
                                        'Recall': train_metrics[1],
                                        'Fbeta': train_metrics[2]
                                        }
                model_dict['train_metrics'] = train_metrics_dict

                #get test metrics
                test_metrics = model_metrics(model, df_fit_test)
                test_metrics_dict =   { 'Precision': test_metrics[0],
                                        'Recall': test_metrics[1],
                                        'Fbeta': test_metrics[2]
                                        }
                model_dict['test_metrics'] = test_metrics_dict

                #save model dict
                model_summary[f'xval{xval+1}_param_{i}_{j}'] = model_dict

    return model_summary

def weighted_average(errors, weights):
    return sum(error * weight for error, weight in zip(errors, weights)) / sum(weights)

def print_weighted_averages(model_summary, max_depth_list, min_info_gain_list):
    xval_wavg_vals = [0.2, 0.3, 0.5]
    #loop through param iterations and get each xval metric
    for i in max_depth_list:
        for j in min_info_gain_list:
                print(f'Model Params: max_depth: {i}, min_info_gain: {j}')
                print()

                #loop through metrics
                metric_list = ['Fbeta', 'Recall', 'Precision']
                for metric in metric_list:

                    train_metric = [model_summary[f'xval1_param_{i}_{j}']['train_metrics'][metric],
                                    model_summary[f'xval2_param_{i}_{j}']['train_metrics'][metric],
                                    model_summary[f'xval3_param_{i}_{j}']['train_metrics'][metric],
                                    ]
                    
                    test_metric = [ model_summary[f'xval1_param_{i}_{j}']['test_metrics'][metric],
                                    model_summary[f'xval2_param_{i}_{j}']['test_metrics'][metric],
                                    model_summary[f'xval3_param_{i}_{j}']['test_metrics'][metric],
                                    ]
                    
                    train_wavg = weighted_average(train_metric, xval_wavg_vals)
                    test_wavg = weighted_average(test_metric, xval_wavg_vals)
                    print(f'Train {metric} Weighted Average: {train_wavg}')
                    print(f'Test {metric} Weighted Average: {test_wavg}')
                    print()
                print("*"*50)

In [0]:
# Name of train, test files
df_train_names = ["df_xval1_train", "df_xval2_train", "df_xval3_train"]
df_test_names = ["df_xval1_test", "df_xval2_test", "df_xval3_test"]

# Hyperparameters for grid search
max_depth_list = [6, 8, 10, 12, 14]
min_info_gain_list = [1e-2, 1e-4, 1e-6]
max_bins=64

# Best features
assembler_columns = [
 'timeBtwFlightsPlanned',
 'hourlyAltimeterSetting',
 'hourlyDewPointTemperature',
 'hourlyDryBulbTemperature',
 'hourlyPrecipitation',
 'hourlyWetBulbTemperature',
 'hourlyWindDirection',
 'hourlyWindGustSpeed',
 'hourlyWindSpeed',
 'crsDepTimeSine',
 'monthSine',
 'dayOfWeekSine',
 'originPageRank',
 'destPageRank',
 'dayOfMonthIndex',
 'opUniqueCarrierIndex',
 'yearIndex',
 'originTypeIndex',
 'destTypeIndex',
 'depDel15PrevFIndex',
 'arrDel15PrevFIndex',
 'originIsoRegionIndex',
 'destIsoRegionIndex',
 ]

In [0]:
model_summary = cross_validator(df_train_names,
                                df_test_names,
                                assembler_columns,
                                max_depth_list,
                                min_info_gain_list,
                                max_bins)

Downloading artifacts:   0%|          | 0/30 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

0: (Max Depth: 6, Min Info Gain: 0.01)


Downloading artifacts:   0%|          | 0/15 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/30 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

1: (Max Depth: 6, Min Info Gain: 0.01)


Downloading artifacts:   0%|          | 0/15 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/30 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

2: (Max Depth: 6, Min Info Gain: 0.01)


Downloading artifacts:   0%|          | 0/15 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

In [0]:
model_summary

{'xval1_param_6_0.01': {'datasplit': 'xval1',
  'params': {'max_depth': 6, 'min_info_gain': 0.01},
  'model': DecisionTreeClassificationModel: uid=DecisionTreeClassifier_fb686962469a, depth=6, numNodes=23, numClasses=2, numFeatures=23,
  'train_metrics': {'Precision': 0.6155078323093742,
   'Recall': 0.8360218692317796,
   'Fbeta': 0.7801238754015771},
  'test_metrics': {'Precision': 0.33952385214356035,
   'Recall': 0.49858710652559635,
   'Fbeta': 0.45587280986372486}},
 'xval2_param_6_0.01': {'datasplit': 'xval2',
  'params': {'max_depth': 6, 'min_info_gain': 0.01},
  'model': DecisionTreeClassificationModel: uid=DecisionTreeClassifier_98203166881d, depth=6, numNodes=15, numClasses=2, numFeatures=23,
  'train_metrics': {'Precision': 0.6195563854265542,
   'Recall': 0.834692021281386,
   'Fbeta': 0.78048843929446},
  'test_metrics': {'Precision': 0.5771357286033513,
   'Recall': 0.8303354780813146,
   'Fbeta': 0.7633559048886881}},
 'xval3_param_6_0.01': {'datasplit': 'xval3',
  'par

In [0]:
print_weighted_averages(model_summary, max_depth_list, min_info_gain_list)

Model Params: max_depth: 6, min_info_gain: 0.01

Train Fbeta Weighted Average: 0.7872167330403146
Test Fbeta Weighted Average: 0.5504573193485489

Train Recall Weighted Average: 0.8431356128963167
Test Recall Weighted Average: 0.5958764589915581

Train Precision Weighted Average: 0.6221698042685833
Test Precision Weighted Average: 0.4221205469523097

**************************************************


# Model Training

In [0]:
# best hyperparameters
max_depth = 6
min_info_gain = 0.01

# Best features
assembler_columns = [
 'timeBtwFlightsPlanned',
 'hourlyAltimeterSetting',
 'hourlyDewPointTemperature',
 'hourlyDryBulbTemperature',
 'hourlyPrecipitation',
 'hourlyWetBulbTemperature',
 'hourlyWindDirection',
 'hourlyWindGustSpeed',
 'hourlyWindSpeed',
 'crsDepTimeSine',
 'monthSine',
 'dayOfWeekSine',
 'originPageRank',
 'destPageRank',
 'dayOfMonthIndex',
 'opUniqueCarrierIndex',
 'yearIndex',
 'originTypeIndex',
 'destTypeIndex',
 'depDel15PrevFIndex',
 'arrDel15PrevFIndex',
 'originIsoRegionIndex',
 'destIsoRegionIndex',
 ]

In [0]:
# TRAIN
# load data
df_train = spark.read.parquet(f'{team_blob_url}/PRD/xval/df_xval1-3_train/')

# prepare features
pipeline = df_processing_pipeline(df_train)
pipeline_model = pipeline.fit(df_train)
df_train = df_processing(df_train, pipeline_model)
df_fit_train = df_feature_assembler(df_train, assembler_columns)

# train model
model = train_model(df_fit_train, max_depth, min_info_gain, max_bins=64)

# save parameters
model_dict_final_train = {}
model_dict_final_train['params'] = {
                        'max_depth': max_depth,
                        'min_info_gain': min_info_gain
                        }

# save metrics

train_metrics = model_metrics(model, df_fit_train)

train_metrics_dict =   {'Precision': train_metrics[0],
                        'Recall': train_metrics[1],
                        'Fbeta': train_metrics[2],
                        }
model_dict_final_train['train_metrics'] = train_metrics_dict

# save model dict
model_summary_final = {}
model_summary_final[f'final_train_param_{max_depth}_{min_info_gain}'] = model_dict_final_train

# save model and pipeline
# pipeline_model.write().save(f'{team_blob_url}/JD/models/dt_2015-2018_best_model_pipeline')
# model.save(f'{team_blob_url}/JD/models/dt_2015-2018_best_model/')

Downloading artifacts:   0%|          | 0/30 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/15 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

## Evaluation

In [0]:
display(dbutils.fs.ls(f"{team_blob_url}/PRD/xval_2020"))

path,name,size,modificationTime
wasbs://261-final-project@ansonbquon.blob.core.windows.net/PRD/xval_2020/df_final_val/,df_final_val/,0,1734024624000
wasbs://261-final-project@ansonbquon.blob.core.windows.net/PRD/xval_2020/df_xval1_test/,df_xval1_test/,0,1734024323000
wasbs://261-final-project@ansonbquon.blob.core.windows.net/PRD/xval_2020/df_xval1_train/,df_xval1_train/,0,1734024376000
wasbs://261-final-project@ansonbquon.blob.core.windows.net/PRD/xval_2020/df_xval2_test/,df_xval2_test/,0,1734024425000
wasbs://261-final-project@ansonbquon.blob.core.windows.net/PRD/xval_2020/df_xval2_train/,df_xval2_train/,0,1734024477000
wasbs://261-final-project@ansonbquon.blob.core.windows.net/PRD/xval_2020/df_xval3_test/,df_xval3_test/,0,1734024527000
wasbs://261-final-project@ansonbquon.blob.core.windows.net/PRD/xval_2020/df_xval3_train/,df_xval3_train/,0,1734024586000


In [0]:
# load saved model and pipeline:
model = DecisionTreeClassificationModel.load(f'{team_blob_url}/JD/models/dt_2015-2018_best_model/')
pipeline_model = PipelineModel.load(f'{team_blob_url}/JD/models/dt_2015-2018_best_model_pipeline')

In [0]:
# TEST
# load data
# df_test = spark.read.parquet(f'{team_blob_url}/PRD/xval/df_final_val/') # 2019 validation
df_test = spark.read.parquet(f'{team_blob_url}/PRD/xval_2020/df_final_val/') # 2020 validation

# prepare features
pipeline = df_processing_pipeline(df_test)
pipeline_model = pipeline.fit(df_test)
df_test = df_processing(df_test, pipeline_model)
df_fit_test = df_feature_assembler(df_test, assembler_columns)

# save parameters
model_dict_final_test = {}
model_dict_final_test['params'] = {
                        'max_depth': max_depth,
                        'min_info_gain': min_info_gain
                        }

# metrics
test_metrics = model_metrics(model, df_fit_test)
test_metrics_dict =   {'Precision': test_metrics[0],
                        'Recall': test_metrics[1],
                        'Fbeta': test_metrics[2],
                        }
model_dict_final_test['test_metrics'] = test_metrics_dict
model_summary_final = {} # uncomment if only test
model_summary_final[f'final_test_param_{max_depth}_{min_info_gain}'] = model_dict_final_test

Downloading artifacts:   0%|          | 0/30 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

In [0]:
model_summary_final

{'final_test_param_6_0.01': {'params': {'max_depth': 6, 'min_info_gain': 0.01},
  'test_metrics': {'Precision': 0.24436212617651357,
   'Recall': 0.6166692560690765,
   'Fbeta': 0.47264579276582414}}}

## Decision Tree Metadata

In [0]:
df_fit_test.schema['features'].metadata

{'ml_attr': {'attrs': {'numeric': [{'idx': 0, 'name': 'timeBtwFlightsPlanned'},
    {'idx': 1, 'name': 'hourlyAltimeterSetting'},
    {'idx': 2, 'name': 'hourlyDewPointTemperature'},
    {'idx': 3, 'name': 'hourlyDryBulbTemperature'},
    {'idx': 4, 'name': 'hourlyPrecipitation'},
    {'idx': 5, 'name': 'hourlyWetBulbTemperature'},
    {'idx': 6, 'name': 'hourlyWindDirection'},
    {'idx': 7, 'name': 'hourlyWindGustSpeed'},
    {'idx': 8, 'name': 'hourlyWindSpeed'},
    {'idx': 9, 'name': 'crsDepTimeSine'},
    {'idx': 10, 'name': 'monthSine'},
    {'idx': 11, 'name': 'dayOfWeekSine'},
    {'idx': 12, 'name': 'originPageRank'},
    {'idx': 13, 'name': 'destPageRank'}],
   'nominal': [{'vals': ['13',
      '6',
      '23',
      '7',
      '10',
      '20',
      '17',
      '3',
      '19',
      '16',
      '12',
      '9',
      '5',
      '4',
      '18',
      '14',
      '27',
      '2',
      '21',
      '24',
      '8',
      '1',
      '11',
      '22',
      '15',
      '26',


In [0]:
print('Number of Nodes:', model.numNodes)
print('Depth of Tree:', model.depth)

debug_string = model.toDebugString

for i, feature_name in enumerate(assembler_columns):
  debug_string = debug_string.replace(f'feature {i}', feature_name)

print(debug_string)

Number of Nodes: 131
Depth of Tree: 10
DecisionTreeClassificationModel: uid=DecisionTreeClassifier_86c063f077ec, depth=10, numNodes=131, numClasses=2, numFeatures=23
  If (hourlyAltimeterSetting9 in {1.0,2.0})
   If (crsDepTimeSine <= 0.585)
    If (hourlyAltimeterSetting2 <= 0.12321908821488532)
     If (hourlyAltimeterSetting9 in {2.0})
      If (timeBtwFlightsPlanned <= 100.5)
       Predict: 0.0
      Else (timeBtwFlightsPlanned > 100.5)
       If (hourlyDewPointTemperature0 in {0.0})
        If (hourlyDewPointTemperature1 in {0.0,1.0,2.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,12.0,13.0,16.0,18.0,21.0,22.0,23.0,25.0,26.0,27.0,28.0,29.0,30.0,31.0,32.0,33.0,34.0,35.0,36.0,37.0,38.0,39.0,40.0,41.0,43.0,44.0,45.0,46.0,47.0,48.0})
         If (timeBtwFlightsPlanned <= 161.5)
          If (hourlyDewPointTemperature1 in {4.0,12.0,21.0,39.0,40.0,43.0,47.0,48.0})
           Predict: 0.0
          Else (hourlyDewPointTemperature1 not in {4.0,12.0,21.0,39.0,40.0,43.0,47.0,48.0})
           If (hou

# Save predictions for evaluation

In [0]:

def df_evaluation_assembler(df, assembler_columns):
    """
    Assembles features and drops columns not used in trianing.
    Input:
        df: dataframe
        assemble_columns: list of features to include in model
    Output:
        df: dataframe with 'features', 'labels'
    """
    #drop omitted columns
    label = 'depDel15'

    # Assemble Vectors
    label_indexer = StringIndexer(inputCol=label, outputCol='label')
    feature_assembler = VectorAssembler(inputCols=assembler_columns, outputCol="features")
    pipeline = Pipeline(stages=[label_indexer, feature_assembler])

    # Train Model
    df_model = pipeline.fit(df).transform(df)

    return df_model

df_fit_test = df_evaluation_assembler(df_test, assembler_columns)
predictions = model.transform(df_fit_test)

df_fit_test = df_evaluation_assembler(df_test, assembler_columns)
predictions = model.transform(df_fit_test)

Downloading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

In [0]:
predictions.limit(5).display()

quarter,month,dayOfMonth,dayOfWeek,opUniqueCarrier,tailNum,opCarrierFlNum,origin,dest,crsDepTime,depDel15,crsElapsedTime,distance,year,originType,originIsoRegion,destType,destIsoRegion,originPrevF,crsDepTimePrevF,depTimePrevF,depDel15PrevF,depDelayGroupPrevF,depTimeBlkPrevF,crsArrTimePrevF,arrTimePrevF,arrDel15PrevF,arrDelayGroupPrevF,arrTimeBlkPrevF,crsElapsedTimePrevF,actualElapsedTimePrevF,distancePrevF,originTypePrevF,timeBtwFlights,timeBtwFlightsPlanned,hourlyAltimeterSetting,hourlyDewPointTemperature,hourlyDryBulbTemperature,hourlyPrecipitation,hourlyPressureChange,hourlyPressureTendency,hourlyRelativeHumidity,hourlySeaLevelPressure,hourlyStationPressure,hourlyVisibility,hourlyWetBulbTemperature,hourlyWindDirection,hourlyWindGustSpeed,hourlyWindSpeed,crsDepTimeSine,monthSine,dayOfWeekSine,daysToNearestHoliday,originPageRank,destPageRank,predictedMeanHourlyPrecipitation,predictedMeanHourlyVisibility,predictedMeanHourlyDewPointTemp,numeric_feat_to_scale,scaled_features,quarterIndex,monthIndex,dayOfMonthIndex,dayOfWeekIndex,opUniqueCarrierIndex,yearIndex,originTypeIndex,destTypeIndex,depDel15PrevFIndex,depDelayGroupPrevFIndex,depTimeBlkPrevFIndex,arrDel15PrevFIndex,arrDelayGroupPrevFIndex,arrTimeBlkPrevFIndex,originTypePrevFIndex,tailNumIndex,opCarrierFlNumIndex,originIndex,destIndex,originIsoRegionIndex,destIsoRegionIndex,originPrevFIndex,tailNumIndexBinIndex0,tailNumIndexBinIndex1,tailNumIndexBinIndex2,tailNumIndexBinIndex3,tailNumIndexBinIndex4,tailNumIndexBinIndex5,tailNumIndexBinIndex6,tailNumIndexBinIndex7,tailNumIndexBinIndex8,tailNumIndexBinIndex9,tailNumIndexBinIndex10,tailNumIndexBinIndex11,tailNumIndexBinIndex12,opCarrierFlNumIndexBinIndex0,opCarrierFlNumIndexBinIndex1,opCarrierFlNumIndexBinIndex2,opCarrierFlNumIndexBinIndex3,opCarrierFlNumIndexBinIndex4,opCarrierFlNumIndexBinIndex5,opCarrierFlNumIndexBinIndex6,opCarrierFlNumIndexBinIndex7,opCarrierFlNumIndexBinIndex8,opCarrierFlNumIndexBinIndex9,opCarrierFlNumIndexBinIndex10,opCarrierFlNumIndexBinIndex11,opCarrierFlNumIndexBinIndex12,originIndexBinIndex0,originIndexBinIndex1,originIndexBinIndex2,originIndexBinIndex3,originIndexBinIndex4,originIndexBinIndex5,originIndexBinIndex6,originIndexBinIndex7,originIndexBinIndex8,destIndexBinIndex0,destIndexBinIndex1,destIndexBinIndex2,destIndexBinIndex3,destIndexBinIndex4,destIndexBinIndex5,destIndexBinIndex6,destIndexBinIndex7,destIndexBinIndex8,originIsoRegionIndexBinIndex0,originIsoRegionIndexBinIndex1,originIsoRegionIndexBinIndex2,originIsoRegionIndexBinIndex3,originIsoRegionIndexBinIndex4,originIsoRegionIndexBinIndex5,destIsoRegionIndexBinIndex0,destIsoRegionIndexBinIndex1,destIsoRegionIndexBinIndex2,destIsoRegionIndexBinIndex3,destIsoRegionIndexBinIndex4,destIsoRegionIndexBinIndex5,originPrevFIndexBinIndex0,originPrevFIndexBinIndex1,originPrevFIndexBinIndex2,originPrevFIndexBinIndex3,originPrevFIndexBinIndex4,originPrevFIndexBinIndex5,originPrevFIndexBinIndex6,originPrevFIndexBinIndex7,originPrevFIndexBinIndex8,scaled_array,label,features,rawPrediction,probability,prediction
1,2,15,6,NK,N611NK,1146,EWR,ATL,1426,0.0,0.296010296010296,0.1415317804974338,2020,large_airport,US-NJ,large_airport,US-GA,RSW,1045,1102,1.0,1,1000-1059,1335,1335,-999.0,-999,-999,170.0,0.229901269393512,0.2050927753651796,large_airport,0.0096063288754944,51.0,0.8582681718206401,0.4186602870813397,0.4285714285714285,0.0,0.6560846582121419,0.8888888888888888,0.393939393939394,0.7802616975253539,0.971590906627907,0.5,0.4961832061068702,0.6111111111111112,0.1194579212099294,0.0031291908806437,-0.59,0.87,-0.78,0.99500998003992,0.3785579418141344,1.0,0.4166085924383306,0.6047946762285185,0.5132087908363137,"Map(vectorType -> dense, length -> 25, values -> List(153.0, 746.0, 170.0, 1068.0, 51.0, 30.6200008392334, 5.0, 26.0, 0.0, -0.02666666607062022, 8.0, 40.0, 30.610000610351562, 30.59000015258789, 10.0, 21.0, 220.0, 22.40823147554826, 7.0, -2.0, 0.004858673004573397, 6.615879808463694, 35.48859550319644, 4.708322525024414, 12.103297233581543))","Map(vectorType -> dense, length -> 25, values -> List(0.296010296010296, 0.14153178049743387, 0.229901269393512, 0.2050927753651796, 0.009606328875494443, 0.8582681718206401, 0.4186602870813397, 0.42857142857142855, 0.0, 0.6560846582121419, 0.8888888888888888, 0.393939393939394, 0.7802616975253539, 0.971590906627907, 0.5, 0.4961832061068702, 0.6111111111111112, 0.11945792120992943, 0.0031291908806437194, 0.99500998003992, 0.4166085924383306, 0.6047946762285185, 0.5132087908363137, 0.37855794181413444, 1.0))",0.0,1.0,24.0,5.0,9.0,0.0,0.0,0.0,2.0,3.0,6.0,0.0,0.0,0.0,0.0,1742.0,34.0,17.0,0.0,18.0,3.0,38.0,1,1,0,1,1,0,0,1,1,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,1,0,0,0,0,1,0,0,1,1,0,0,0,0,"List(0.296010296010296, 0.14153178049743387, 0.229901269393512, 0.2050927753651796, 0.009606328875494443, 0.8582681718206401, 0.4186602870813397, 0.42857142857142855, 0.0, 0.6560846582121419, 0.8888888888888888, 0.393939393939394, 0.7802616975253539, 0.971590906627907, 0.5, 0.4961832061068702, 0.6111111111111112, 0.11945792120992943, 0.0031291908806437194, 0.99500998003992, 0.4166085924383306, 0.6047946762285185, 0.5132087908363137, 0.37855794181413444, 1.0)",0.0,"Map(vectorType -> dense, length -> 23, values -> List(51.0, 0.8582681718206401, 0.4186602870813397, 0.42857142857142855, 0.0, 0.4961832061068702, 0.6111111111111112, 0.11945792120992943, 0.0031291908806437194, -0.59, 0.87, -0.78, 0.37855794181413444, 1.0, 24.0, 9.0, 0.0, 0.0, 0.0, 2.0, 0.0, 18.0, 3.0))","Map(vectorType -> dense, length -> 2, values -> List(417399.0, 92809.0))","Map(vectorType -> dense, length -> 2, values -> List(0.818095757024586, 0.18190424297541394))",0.0
1,2,15,6,OO,N108SY,5492,COS,LAX,1227,1.0,0.3178893178893179,0.1587050927753651,2020,large_airport,US-CO,large_airport,US-CA,IAH,953,1111,1.0,5,0900-0959,1132,1132,-999.0,-999,-999,159.0,0.2143864598025388,0.1539676273193841,large_airport,0.0103597664343567,55.0,0.649606402686509,0.5263157894736842,0.4583333333333333,0.0,0.6349206373609864,0.0,0.6363636363636365,0.5999343909081789,0.2181816290232597,0.5,0.5419847328244275,0.4722222222222222,0.1692844677137871,0.0035762181493071,-0.12,0.87,-0.78,0.99500998003992,0.0448585044664391,0.385984486457127,0.4166085924383306,0.6662550994962397,0.5394046350839161,"Map(vectorType -> dense, length -> 25, values -> List(170.0, 833.0, 159.0, 809.0, 55.0, 30.09000015258789, 20.0, 31.0, 0.0, -0.03999999910593033, 0.0, 64.0, 30.09000015258789, 23.959999084472656, 10.0, 27.0, 170.0, 27.166666666666668, 8.0, -2.0, 0.004858673004573397, 9.265577094177448, 48.038006499521586, 0.7373989820480347, 4.796696186065674))","Map(vectorType -> dense, length -> 25, values -> List(0.3178893178893179, 0.15870509277536518, 0.2143864598025388, 0.15396762731938413, 0.010359766434356753, 0.649606402686509, 0.5263157894736842, 0.4583333333333333, 0.0, 0.6349206373609864, 0.0, 0.6363636363636365, 0.5999343909081789, 0.2181816290232597, 0.5, 0.5419847328244275, 0.47222222222222227, 0.1692844677137871, 0.003576218149307108, 0.99500998003992, 0.4166085924383306, 0.6662550994962397, 0.5394046350839161, 0.04485850446643913, 0.385984486457127))",0.0,1.0,24.0,5.0,1.0,0.0,0.0,0.0,2.0,8.0,7.0,0.0,0.0,0.0,0.0,1266.0,2368.0,101.0,5.0,6.0,1.0,10.0,1,0,0,1,1,1,1,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,1,1,0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,"List(0.3178893178893179, 0.15870509277536518, 0.2143864598025388, 0.15396762731938413, 0.010359766434356753, 0.649606402686509, 0.5263157894736842, 0.4583333333333333, 0.0, 0.6349206373609864, 0.0, 0.6363636363636365, 0.5999343909081789, 0.2181816290232597, 0.5, 0.5419847328244275, 0.47222222222222227, 0.1692844677137871, 0.003576218149307108, 0.99500998003992, 0.4166085924383306, 0.6662550994962397, 0.5394046350839161, 0.04485850446643913, 0.385984486457127)",1.0,"Map(vectorType -> dense, length -> 23, values -> List(55.0, 0.649606402686509, 0.5263157894736842, 0.4583333333333333, 0.0, 0.5419847328244275, 0.47222222222222227, 0.1692844677137871, 0.003576218149307108, -0.12, 0.87, -0.78, 0.04485850446643913, 0.385984486457127, 24.0, 1.0, 0.0, 0.0, 0.0, 2.0, 0.0, 6.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(22815.0, 3287.0))","Map(vectorType -> dense, length -> 2, values -> List(0.8740709524174393, 0.1259290475825607))",0.0
1,2,15,6,MQ,N283NN,4238,AUS,ORD,1328,0.0,0.3063063063063063,0.1871298855112514,2020,large_airport,US-TX,large_airport,US-IL,ORD,945,943,0.0,-1,0900-0959,1250,1250,-999.0,-999,-999,185.0,0.2510578279266572,0.1871298855112514,large_airport,0.0071576568091919,38.0,0.7125984931939917,0.7200956937799043,0.6071428571428571,0.0,0.685185186882481,0.8888888888888888,0.7171717171717172,0.6588875234008693,0.871590776581398,0.5,0.7251908396946565,0.5,0.1151832460732984,0.0031291908806437,-0.37,0.87,-0.78,0.99500998003992,0.1808875850209459,0.9619974478578336,0.416713039908119,0.6662550994962397,0.5182013536403316,"Map(vectorType -> dense, length -> 25, values -> List(161.0, 977.0, 185.0, 977.0, 38.0, 30.25, 47.0, 56.0, 0.0, -0.008333333147068819, 8.0, 72.0, 30.260000228881836, 29.709999084472656, 10.0, 51.0, 180.0, 22.0, 7.0, -2.0, 0.00538844989164457, 9.265577094177448, 37.880338329286936, 2.3561043739318848, 11.651078224182129))","Map(vectorType -> dense, length -> 25, values -> List(0.3063063063063063, 0.18712988551125148, 0.25105782792665726, 0.18712988551125148, 0.007157656809191938, 0.7125984931939917, 0.7200956937799043, 0.6071428571428571, 0.0, 0.685185186882481, 0.8888888888888888, 0.7171717171717172, 0.6588875234008693, 0.871590776581398, 0.5, 0.7251908396946565, 0.5, 0.11518324607329844, 0.0031291908806437194, 0.99500998003992, 0.416713039908119, 0.6662550994962397, 0.5182013536403316, 0.18088758502094593, 0.9619974478578335))",0.0,1.0,24.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,1073.0,4000.0,32.0,3.0,0.0,4.0,3.0,1,0,0,0,0,1,1,0,0,0,1,0,0,1,1,1,1,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,"List(0.3063063063063063, 0.18712988551125148, 0.25105782792665726, 0.18712988551125148, 0.007157656809191938, 0.7125984931939917, 0.7200956937799043, 0.6071428571428571, 0.0, 0.685185186882481, 0.8888888888888888, 0.7171717171717172, 0.6588875234008693, 0.871590776581398, 0.5, 0.7251908396946565, 0.5, 0.11518324607329844, 0.0031291908806437194, 0.99500998003992, 0.416713039908119, 0.6662550994962397, 0.5182013536403316, 0.18088758502094593, 0.9619974478578335)",0.0,"Map(vectorType -> dense, length -> 23, values -> List(38.0, 0.7125984931939917, 0.7200956937799043, 0.6071428571428571, 0.0, 0.7251908396946565, 0.5, 0.11518324607329844, 0.0031291908806437194, -0.37, 0.87, -0.78, 0.18088758502094593, 0.9619974478578335, 24.0, 5.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.0))","Map(vectorType -> dense, length -> 2, values -> List(483660.0, 765524.0))","Map(vectorType -> dense, length -> 2, values -> List(0.387180751594641, 0.6128192484053591))",1.0
1,2,15,6,YV,N921FJ,5767,VPS,DFW,1328,1.0,0.2805662805662806,0.1208053691275167,2020,large_airport,US-FL,large_airport,US-TX,DFW,1050,1044,0.0,-1,1000-1059,1248,1248,-999.0,-999,-999,118.0,0.156558533145275,0.1208053691275167,large_airport,0.007534375588623,40.0,0.7598429365370796,0.5980861244019139,0.6369047619047619,0.0,0.6942355905359356,0.8888888888888888,0.303030303030303,0.7039690193371448,0.9352273017906972,0.5,0.6946564885496183,0.2777777777777778,0.0981568977993445,0.0031291908806437,-0.37,0.87,-0.78,0.99500998003992,0.0173390673824384,0.9568083675262736,0.4166085924383306,0.6662550994962397,0.5558233893602912,"Map(vectorType -> dense, length -> 25, values -> List(141.0, 641.0, 118.0, 641.0, 40.0, 30.3700008392334, 30.0, 61.0, 0.0, -0.002631578888548048, 8.0, 31.0, 30.389999389648438, 30.270000457763672, 10.0, 47.0, 100.0, 20.3739837398374, 7.0, -2.0, 0.004858673004573397, 9.265577094177448, 55.903593632899025, 0.4099259078502655, 11.589329719543457))","Map(vectorType -> dense, length -> 25, values -> List(0.2805662805662806, 0.12080536912751677, 0.15655853314527504, 0.12080536912751677, 0.007534375588623093, 0.7598429365370796, 0.5980861244019139, 0.6369047619047619, 0.0, 0.6942355905359356, 0.8888888888888888, 0.30303030303030304, 0.7039690193371448, 0.9352273017906971, 0.5, 0.6946564885496183, 0.2777777777777778, 0.0981568977993445, 0.0031291908806437194, 0.99500998003992, 0.4166085924383306, 0.6662550994962397, 0.5558233893602912, 0.017339067382438428, 0.9568083675262737))",0.0,1.0,24.0,5.0,12.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,875.0,4137.0,90.0,1.0,2.0,0.0,1.0,1,1,0,1,1,0,1,0,1,1,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,1,1,0,1,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,"List(0.2805662805662806, 0.12080536912751677, 0.15655853314527504, 0.12080536912751677, 0.007534375588623093, 0.7598429365370796, 0.5980861244019139, 0.6369047619047619, 0.0, 0.6942355905359356, 0.8888888888888888, 0.30303030303030304, 0.7039690193371448, 0.9352273017906971, 0.5, 0.6946564885496183, 0.2777777777777778, 0.0981568977993445, 0.0031291908806437194, 0.99500998003992, 0.4166085924383306, 0.6662550994962397, 0.5558233893602912, 0.017339067382438428, 0.9568083675262737)",1.0,"Map(vectorType -> dense, length -> 23, values -> List(40.0, 0.7598429365370796, 0.5980861244019139, 0.6369047619047619, 0.0, 0.6946564885496183, 0.2777777777777778, 0.0981568977993445, 0.0031291908806437194, -0.37, 0.87, -0.78, 0.017339067382438428, 0.9568083675262737, 24.0, 12.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0))","Map(vectorType -> dense, length -> 2, values -> List(483660.0, 765524.0))","Map(vectorType -> dense, length -> 2, values -> List(0.387180751594641, 0.6128192484053591))",1.0
1,2,15,6,OH,N705PS,5619,SRQ,PHL,1428,1.0,0.3075933075933076,0.1831819976312672,2020,large_airport,US-FL,large_airport,US-PA,PHL,1100,1057,0.0,-1,1100-1159,1357,1357,-999.0,-999,-999,177.0,0.2397743300423131,0.1831819976312672,large_airport,0.0058391410811828,31.0,0.7007871946269818,0.8277511961722488,0.7142857142857142,0.0,0.6666666686377197,0.8888888888888888,0.6666666666666667,0.6450158868444544,0.9261364005813948,0.5,0.8473282442748091,0.25,0.1256544502617801,0.0067054090299508,-0.6,0.87,-0.78,0.99500998003992,0.0271969290354528,0.1877131795362712,0.4166085924383306,0.6596153624572066,0.578476585555421,"Map(vectorType -> dense, length -> 25, values -> List(162.0, 957.0, 177.0, 957.0, 31.0, 30.219999313354492, 62.0, 74.0, 0.0, -0.019999999552965164, 8.0, 67.0, 30.219999313354492, 30.190000534057617, 10.0, 67.0, 90.0, 23.0, 15.0, -2.0, 0.004858673004573397, 8.979323074520883, 66.7558596162369, 0.5272315144538879, 2.4373269081115723))","Map(vectorType -> dense, length -> 25, values -> List(0.3075933075933076, 0.18318199763126727, 0.2397743300423131, 0.18318199763126727, 0.005839141081182897, 0.7007871946269818, 0.8277511961722488, 0.7142857142857142, 0.0, 0.6666666686377197, 0.8888888888888888, 0.6666666666666667, 0.6450158868444544, 0.9261364005813947, 0.5, 0.8473282442748091, 0.25, 0.12565445026178013, 0.006705409029950827, 0.99500998003992, 0.4166085924383306, 0.6596153624572066, 0.578476585555421, 0.02719692903545282, 0.18771317953627123))",0.0,1.0,24.0,5.0,7.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,1176.0,1629.0,83.0,18.0,2.0,12.0,18.0,1,0,0,1,0,0,1,1,0,0,0,0,0,1,1,0,0,1,0,1,1,1,0,1,0,0,1,0,1,0,0,1,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,1,0,0,1,0,0,0,0,0,"List(0.3075933075933076, 0.18318199763126727, 0.2397743300423131, 0.18318199763126727, 0.005839141081182897, 0.7007871946269818, 0.8277511961722488, 0.7142857142857142, 0.0, 0.6666666686377197, 0.8888888888888888, 0.6666666666666667, 0.6450158868444544, 0.9261364005813947, 0.5, 0.8473282442748091, 0.25, 0.12565445026178013, 0.006705409029950827, 0.99500998003992, 0.4166085924383306, 0.6596153624572066, 0.578476585555421, 0.02719692903545282, 0.18771317953627123)",1.0,"Map(vectorType -> dense, length -> 23, values -> List(31.0, 0.7007871946269818, 0.8277511961722488, 0.7142857142857142, 0.0, 0.8473282442748091, 0.25, 0.12565445026178013, 0.006705409029950827, -0.6, 0.87, -0.78, 0.02719692903545282, 0.18771317953627123, 24.0, 7.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 12.0))","Map(vectorType -> dense, length -> 2, values -> List(483660.0, 765524.0))","Map(vectorType -> dense, length -> 2, values -> List(0.387180751594641, 0.6128192484053591))",1.0


In [0]:
predictions.write.parquet(f'{team_blob_url}/JD/predictions/dt_2020_best_model_predictions', mode='overwrite')