In [None]:
# Height (as a string key) whose dataset this run uses: '1000', '500' or '50'.
# For an interactive prompt, replace the literal with:
#   input("Please enter the required height - 1000, 500 or 50. Enter only numbers --> ")
selected_height = "1000"

In [None]:
import os
import re

import pandas as pd
from pycaret.regression import *
from pandas_profiling import ProfileReport
from pycaret.utils import enable_colab, check_metric
enable_colab()

%matplotlib inline

In [None]:
# CSV file paths for the three supported heights - 1000, 500 and 50 feet.
# Paths are assembled with os.path.join rather than a hard-coded '\\'
# separator, so the same notebook works on Windows and on POSIX hosts.
# An empty data_path resolves to the current working directory.
data_path = ''
different_height_data_paths = {
    height: os.path.join(data_path, 'consolidated_data_' + height + '.csv')
    for height in ('1000', '500', '50')
}


In [None]:
# Resolve the CSV path for the selected height. Fail early with a readable
# message when the height is not one of the configured keys.
if selected_height not in different_height_data_paths:
    raise KeyError(
        "Unknown height " + repr(selected_height) + "; expected one of "
        + ", ".join(different_height_data_paths)
    )
current_height = different_height_data_paths[selected_height]

# Read the CSV file for that height. reset_index returns a new frame, so its
# result has to be assigned - calling it on its own leaves the data unchanged.
consolidated_df = pd.read_csv(current_height).reset_index(drop=True)

# Preview the first rows only instead of rendering the whole frame.
consolidated_df.head()

In [None]:
# Keep only the columns needed downstream. The real feature names are
# company proprietary, so they are blanked out here.
vars_to_keep = ['', '', '']
cleaned_df = consolidated_df.loc[:, vars_to_keep]

In [None]:
# Clean the column names: lower-case them and collapse every run of
# non-word characters into a single underscore. The pattern is a raw string
# so that \W reaches the regex engine instead of being parsed as an
# (invalid) string escape.
cleaned_df.columns = [re.sub(r'\W+', '_', c.lower()) for c in cleaned_df.columns]

# Generate the final Pandas Profiling report for the cleaned features and
# write it as HTML under data_path. An earlier profiling report was used for
# feature (independent variable) selection.
title = "Cleaned Profiling Report - " + selected_height + " ft"
profile_filename = 'cleaned_profile' + selected_height + '.html'
cleaned_profile = ProfileReport(cleaned_df, minimal=True, title=title)
# os.path.join keeps the output location consistent across operating systems;
# with an empty data_path the report lands in the current working directory.
cleaned_profile.to_file(os.path.join(data_path, profile_filename))

In [None]:
# 70/30 train/test split: draw a seeded 70% sample for training and use
# every row that was not sampled as the test set. Both frames get a fresh
# 0..n-1 index.
train_sample = cleaned_df.sample(frac=0.7, random_state=88798)
test_data = cleaned_df.drop(train_sample.index).reset_index(drop=True)
train_data = train_sample.reset_index(drop=True)

In [None]:
# Set up the PyCaret regression experiment.
# - train_data / test_data come from the manual split above, so PyCaret does
#   not create its own hold-out set.
# - session_id seeds the experiment (fold shuffling, model training) so a
#   Restart & Run All reproduces the same results; it reuses the seed of the
#   train/test split.
# - normalize / transformation switch on PyCaret's feature scaling and
#   transformation steps.
# NOTE(review): `silent` appears to be a PyCaret 2.x argument - confirm it is
# still accepted by the installed version.
experiment_regression = setup(data = train_data,
                            target = 'touchdown_distance',
                            test_data = test_data,
                            session_id = 88798,
                            use_gpu= True,
                            normalize = True,
                            transformation = True,
                            fold = 10,
                            fold_shuffle = True,
                            silent = True
                            )

In [None]:
# Train the candidate models and keep the top-ranked one. The ranking
# metric is written out explicitly; R2 is the one used by default.
best_model = compare_models(sort = 'R2')
print(best_model)

In [None]:
# Hyper-parameter tuning of the best model: 5-fold CV, optimised for MAPE
# with the optuna search backend. choose_better=True is passed so the
# result is compared against the untuned model.
tuning_options = {
    'fold': 5,
    'optimize': 'MAPE',
    'search_library': 'optuna',
    'choose_better': True,
}
tuned_model = tune_model(estimator = best_model, **tuning_options)

In [None]:
# Ensemble of the tuned model, scored on MAPE. No `method` is passed, so
# the library's default ensembling method applies.
ensembled_bagging_model = ensemble_model(tuned_model, optimize = 'MAPE')

In [None]:
# Boosting ensemble of the tuned model, scored on MAPE.
ensembled_boosting_model = ensemble_model(
    tuned_model,
    method = 'Boosting',
    optimize = 'MAPE',
)

In [None]:
# Ask PyCaret for the best model of this session by MAPE, then finalize it.
session_best_model = automl(optimize = 'MAPE')
best_finalized_model = finalize_model(session_best_model)
print(best_finalized_model)

In [None]:
# Inspect the finalized model (e.g. learning curve, feature importances),
# then persist it under a name that carries the selected height.
# NOTE(review): evaluation runs on the model after finalize_model - confirm
# that hold-out based plots are still meaningful at this point.
evaluate_model(best_finalized_model)
model_name = 'best_model_' + selected_height
saved_model = save_model(best_finalized_model, model_name)