In [None]:
import datetime
import logging
import os

import pandas as pd
import yaml
from IPython.display import HTML
from IPython.display import Markdown as md
from IPython.display import display
from pyfiglet import Figlet

# Load the run configuration from the YAML file sitting next to the notebook.
with open("classification_config.yml", 'r') as config_file:
    config = yaml.safe_load(config_file)

# The config spells "no value" as the text 'None'; replace those top-level
# entries with a real None (nested values are left untouched).
for option in [key for key, value in config.items() if value == 'None']:
    config[option] = None
               
def _say(template, *values):
    """Print `template` after filling its `{}` placeholders with `values`."""
    print(template.format(*values))

# ASCII-art banner followed by a short summary of the run.
f = Figlet(font='slant')
_say("{}", f.renderText('SimpleML'))
_say("Type : Classification")
_say("User : {}", config['user_name'])
_say(
    "Analysis name: {}_{}_{}",
    config['analysis_name'],
    config['user_selected_model'],
    datetime.datetime.now().strftime("%Y%m%d"),
)
_say("Model selection : {}", config['user_selected_model'])

# Timestamp of this run; it is also used to name the output folder.
start_date = datetime.datetime.now()
_say("Date time : {}", start_date)

# Create analysis folder: <user>/<analysis>/<model>/outputs/<run timestamp>/
# The trailing slash matters: the model-saving cell appends a file name
# directly to this string.
# NOTE(review): start_date is rendered with str(), so the folder name contains
# a space and colons, which are not valid in Windows paths - confirm the
# platforms this notebook has to run on.
output_folder = "{}/{}/{}/outputs/{}/".format(
    config['user_name'],
    config['analysis_name'],
    config['user_selected_model'],
    start_date,
)
# exist_ok=True makes re-creating an existing directory a no-op, which is what
# the former try/except FileExistsError was emulating.
os.makedirs(output_folder, exist_ok=True)

# default demo dataset target variable mappings
# NOTE(review): 'wine' -> 'traffic_volume' and 'parkinsons' -> 'PPE' do not look
# like classification targets for those datasets - verify every entry against
# the columns of the corresponding demo dataset.
_default_target_variables = {'credit':'default', 'blood':'Class',
'cancer':'Class', 'diabetes':'Class', 'heart':'DEATH', 'hepatitis':'Class',
'parkinsons':'PPE', 'wine':'traffic_volume', 'heart_disease':'type'}

# Caret config
# Bind both input names up front: a later cell reads _demo_dataset regardless
# of the run mode, which used to raise NameError on non-demo runs.
_demo_dataset = None
_input_file = None
if config['demo_run']:
    _demo_dataset = config['demo_dataset']
    if _demo_dataset not in _default_target_variables:
        # Same exception type as the plain dict lookup, but with an actionable message.
        raise KeyError(
            "No default target variable for demo dataset {!r}; known datasets: {}".format(
                _demo_dataset, sorted(_default_target_variables)
            )
        )
    _target = _default_target_variables[_demo_dataset]
    _target_class = _default_target_variables[_demo_dataset]
    print("Input data: {}".format(config['demo_dataset']))
else:
    _input_file = config['input_file']
    # Target processing: the user names the target column in the config.
    _target = config['target_variable']
    _target_class = config['target_variable']
    print("Input data : {}".format(config['input_file']))
# Whether a profiling report is requested for the run (taken from the config).
_pandas_profiling = config['profile']

# Silent preprocessing should be True unfortunately
_silent_preprocessing = True

# Temp args
# The model choice doubles as the switch for automatic best-model selection.
_autoselect = config['user_selected_model'] == 'Auto select best model'

_kfold = 10  # config['Kfolds']
_round = 4
# Rank by the configured metric when auto-selecting, otherwise by accuracy.
_sort_best_model_by = config['auto_select_best_model_by'] if _autoselect else "Accuracy"
_turbo = True
_blacklist = None
# NOTE(review): the user's model choice from the config is not forwarded here,
# so the selection cell always falls back to the best model - confirm intent.
_user_selected_model = None
_unseen_predictions = False
# Now create model
number_of_models = 1
_auto_tune = config['auto_tune']


In [None]:
from pycaret.datasets import get_data
# Fetch the 'credit' dataset through pycaret's get_data, regardless of the
# dataset selected in the config.
# NOTE(review): `dataset` is not read by any other visible cell; the modeling
# data is loaded separately below - confirm whether this cell is still needed.
dataset = get_data('credit')

In [None]:
# Configure Inputs & Test sets
def _split_modeling_unseen(frame, frac=0.9, seed=786):
    """Split `frame` into a modeling sample and the remaining unseen rows.

    Parameters:
        frame: DataFrame to split; its index labels are used to tell the two
            parts apart, so they are assumed to be unique.
        frac: fraction of rows sampled for modeling.
        seed: random_state passed to DataFrame.sample for reproducibility.

    Returns:
        (modeling, unseen) DataFrames, both re-indexed from 0.
    """
    modeling = frame.sample(frac=frac, random_state=seed)
    # Drop by the ORIGINAL index labels, and only then reset both indexes.
    # Resetting first would make drop() remove the first rows of the frame
    # instead of the sampled ones, leaking modeling rows into the unseen set.
    unseen = frame.drop(modeling.index)
    return modeling.reset_index(drop=True), unseen.reset_index(drop=True)


# Branch on the same flag the configuration cell uses, so a non-demo run does
# not depend on _demo_dataset being defined.
if config['demo_run']:
    from pycaret.datasets import get_data
    input_data = get_data(_demo_dataset)
else:
    input_data = pd.read_csv(_input_file)
    logging.info('Succesfully read the input csv')

if _unseen_predictions:
    data, data_unseen = _split_modeling_unseen(input_data)
    print('Data for Modeling: ' + str(data.shape))
    print('Unseen Data For Predictions: ' + str(data_unseen.shape))
else:
    # No hold-back requested: model on the full input.
    data = input_data

In [None]:
from pycaret.classification import *

## Preprocessing and EDA results

In [None]:
# Configure the pycaret classification experiment from the YAML config.
# Numeric options are cast explicitly before being handed to setup().

# Config values equal to the text 'None' are turned into None when the config
# is loaded, and float(None) raises TypeError, so only cast a present value.
# NOTE(review): presumably setup() accepts pca_components=None - confirm for
# the installed pycaret version.
_pca_components = config['pca_components']
if _pca_components is not None:
    _pca_components = float(_pca_components)

exp_classification = setup(
    data=data,
    target=_target,
    train_size=float(config['train_size']),
    sampling=config['sampling'],
    sample_estimator=config['sample_estimator'],
    categorical_features=config['categorical_features'],
    categorical_imputation=config['categorical_imputation'],
    ordinal_features=config['ordinal_features'],
    high_cardinality_features=config['high_cardinality_features'],
    high_cardinality_method=config['high_cardinality_method'],
    numeric_features=config['numeric_features'],
    numeric_imputation=config['numeric_imputation'],
    date_features=config['date_features'],
    ignore_features=config['ignore_features'],
    normalize=config['normalize'],
    normalize_method=config['normalize_method'],
    handle_unknown_categorical=config['handle_unknown_categorical'],
    unknown_categorical_method=config['unknown_categorical_method'],
    pca=config['pca'],
    pca_method=config['pca_method'],
    pca_components=_pca_components,
    ignore_low_variance=config['ignore_low_variance'],
    combine_rare_levels=config['combine_rare_levels'],
    rare_level_threshold=float(config['rare_level_threshold']),
    bin_numeric_features=config['bin_numeric_features'],
    remove_outliers=config['remove_outliers'],
    outliers_threshold=float(config['outliers_threshold']),
    remove_multicollinearity=config['remove_multicollinearity'],
    multicollinearity_threshold=float(config['multicollinearity_threshold']),
    create_clusters=config['create_clusters'],
    cluster_iter=int(config['cluster_iter']),
    polynomial_features=config['polynomial_features'],
    polynomial_degree=int(config['polynomial_degree']),
    trigonometry_features=config['trigonometry_features'],
    polynomial_threshold=float(config['polynomial_threshold']),
    group_features=config['group_features'],
    group_names=config['group_names'],
    feature_selection=config['feature_selection'],
    feature_selection_threshold=float(config['feature_selection_threshold']),
    feature_interaction=config['feature_interaction'],
    feature_ratio=config['feature_ratio'],
    interaction_threshold=float(config['interaction_threshold']),
    session_id=config['session_id'],
    silent=_silent_preprocessing,
    # Honour the 'profile' option read in the configuration cell instead of
    # always forcing a profiling report; bool() maps a missing value to False.
    profile=bool(_pandas_profiling),
)

## Model performance

In [None]:
# Compare model performance using metrics available
models_comparison = compare_models(blacklist = _blacklist, fold = config['Kfolds'],  round = _round,  sort = _sort_best_model_by, turbo = _turbo)

# Round-trip the comparison grid through Excel to obtain a plain DataFrame
# (the file is also left in the output folder).
# NOTE(review): presumably compare_models returns a styled table rather than a
# DataFrame, hence the round-trip - confirm for the installed pycaret version.
_comparison_path = '{}/tmp_models.xlsx'.format(output_folder)
models_comparison.to_excel(_comparison_path)
# `index` is not a read_excel parameter (recent pandas rejects it), so it is
# no longer passed.
models_comparison_df = pd.read_excel(_comparison_path)
display(models_comparison)

# Rows are ordered by the requested metric, so the first row is the best model.
best_model = models_comparison_df['Model'][0]
top_five_best_models = models_comparison_df['Model'][0:5]
top_three_best_models = models_comparison_df['Model'][0:3]
# Metric first, model second - the two arguments used to be swapped.
print("Best suggested model based on best value for {} is {}".format(_sort_best_model_by, best_model))
if _user_selected_model is not None:
    print('Model of interest is selected {}'.format(_user_selected_model))

# Fall back to the best model unless the user explicitly picked one.
if _autoselect or _user_selected_model is None:
    model_selected = best_model
    print("Selected {} based on {}".format(best_model, _sort_best_model_by))
else:
    model_selected = _user_selected_model
    print("Selected {} based on user choice".format(model_selected))


## Selected models training results & plots

In [None]:
# Now create model
# The same value is already assigned in the configuration cell, and no visible
# cell reads this variable.
number_of_models = 1

# Human-readable model names -> pycaret estimator abbreviations.
_MODEL_ABBREVIATIONS = {
    'Logistic Regression': 'lr',
    'K Nearest Neighbour': 'knn',
    'Naives Bayes': 'nb',
    'Decision Tree': 'dt',
    'SVM (Linear)': 'svm',
    'SVM (RBF)': 'rbfsvm',
    'Gaussian Process': 'gpc',
    'Multi Level Perceptron': 'mlp',
    'Ridge Classifier': 'ridge',
    'Random Forest': 'rf',
    'Quadratic Disc. Analysis': 'qda',
    'AdaBoost': 'ada',
    'Gradient Boosting Classifier': 'gbc',
    'Linear Discriminant Analysis': 'lda',
    'Extra Trees Classifier': 'et',
    'Extreme Gradient Boosting': 'xgboost',
    'Light Gradient Boosting': 'lightgbm',
    'Cat Boost Classifier': 'catboost',
}

# Additional spellings, as the model-comparison grid appears to report them.
# NOTE(review): verify these display names against the installed pycaret version.
_MODEL_NAME_ALIASES = {
    'K Neighbors Classifier': 'knn',
    'Naive Bayes': 'nb',
    'Decision Tree Classifier': 'dt',
    'SVM - Linear Kernel': 'svm',
    'SVM - Radial Kernel': 'rbfsvm',
    'Gaussian Process Classifier': 'gpc',
    'MLP Classifier': 'mlp',
    'Random Forest Classifier': 'rf',
    'Quadratic Discriminant Analysis': 'qda',
    'Ada Boost Classifier': 'ada',
    'Light Gradient Boosting Machine': 'lightgbm',
    'CatBoost Classifier': 'catboost',
}


def model_to_abv(model):
    """Translate a model's display name into its estimator abbreviation.

    Parameters:
        model: display name (e.g. 'Random Forest') or an abbreviation that is
            already in its short form (e.g. 'rf'), which is returned unchanged.

    Returns:
        The abbreviation string, e.g. 'rf'.

    Raises:
        KeyError: if `model` is neither a known name nor a known abbreviation.
    """
    if model in _MODEL_ABBREVIATIONS:
        return _MODEL_ABBREVIATIONS[model]
    if model in _MODEL_NAME_ALIASES:
        return _MODEL_NAME_ALIASES[model]
    # Accept an abbreviation as-is so callers may pass either form.
    if model in _MODEL_ABBREVIATIONS.values():
        return model
    raise KeyError(
        "Unknown model name {!r}; expected one of: {}".format(
            model, ", ".join(sorted(_MODEL_ABBREVIATIONS))
        )
    )

def train_and_plot_model(model, _auto_tune, plots = True):
    """Train the named model, tuning its hyperparameters when requested.

    Parameters:
        model: display name of the model; translated with model_to_abv.
        _auto_tune: when truthy the model is built with tune_model, otherwise
            with create_model (default hyperparameters).
        plots: currently unused; kept so existing callers keep working.

    Returns:
        The estimator returned by tune_model / create_model.

    Raises:
        KeyError: propagated from model_to_abv for an unknown model name.
    """
    print("Training {}".format(model))
    estimator_id = model_to_abv(model)
    # The progress messages used to be attached to the opposite branches.
    if _auto_tune:
        print("Hyperparameter tuning... {}".format(_auto_tune))
        trained_model = tune_model(estimator_id)
        print("Completed training with hyperparameters tuned {}".format(trained_model))
    else:
        print('Training with default hyper parameters')
        trained_model = create_model(estimator_id)
        print("Completed training with default hyperparameters {}".format(trained_model))
    return trained_model
# Honour the 'auto_tune' option read in the configuration cell instead of
# always forcing hyperparameter tuning.
trained_model = train_and_plot_model(model_selected, _auto_tune)

In [None]:
# The AUC plot is skipped for the 'svm' and 'ridge' estimators.
if model_to_abv(model_selected) not in ['svm', 'ridge']:
    # A Markdown object only renders when it is the cell's last expression,
    # so display the heading explicitly.
    display(md("AUC Curve"))
    plot_model(trained_model, plot = 'auc')

In [None]:
# A Markdown object only renders when it is the cell's last expression,
# so display the heading explicitly before plotting.
display(md("Precision Recall"))
plot_model(trained_model, plot = 'pr')

In [None]:
# A Markdown object only renders when it is the cell's last expression,
# so display the heading explicitly before plotting.
display(md('Confusion Matrix'))
plot_model(trained_model, plot = 'confusion_matrix')

In [None]:
# A Markdown object only renders when it is the cell's last expression,
# so display the heading explicitly before plotting.
display(md('Discrimination Threshold'))
plot_model(trained_model, plot = 'threshold')

In [None]:
# A Markdown object only renders when it is the cell's last expression,
# so display the heading explicitly before plotting.
display(md('Class Prediction Error'))
plot_model(trained_model, plot = 'error')

In [None]:
# A Markdown object only renders when it is the cell's last expression,
# so display the heading explicitly before plotting.
display(md('Classification Report'))
plot_model(trained_model, plot = 'class_report')

In [None]:
# A Markdown object only renders when it is the cell's last expression,
# so display the heading explicitly before plotting.
display(md('Decision Boundary'))
plot_model(trained_model, plot = 'boundary')

In [None]:
# TODO NEED TO FIGURE OUT SUPPORTED ALGOs
# print('Recursive Feature Selection')
# plot_model(trained_model, 'rfe')

In [None]:
# A Markdown object only renders when it is the cell's last expression,
# so display the heading explicitly before plotting.
display(md('Learning Curve'))
plot_model(trained_model, plot = 'learning')

In [None]:
# TODO NEED TO FIGURE OUT SUPPORTED ALGOs
# print('Manifold Learning')
# plot_model(trained_model, plot ='manifold')

In [None]:
# TODO NEED TO FIGURE OUT SUPPORTED ALGOs
# print('Calibration Curve')
# plot_model(trained_model, plot = 'calibration')

In [None]:
# A Markdown object only renders when it is the cell's last expression,
# so display the heading explicitly before plotting.
display(md('Validation Curve'))
plot_model(trained_model, plot ='vc')

In [None]:
# A Markdown object only renders when it is the cell's last expression,
# so display the heading explicitly before plotting.
display(md('Dimension Learning'))
plot_model(trained_model, plot ='dimension')

In [None]:
# A Markdown object only renders when it is the cell's last expression,
# so display the heading explicitly before plotting.
display(md('Feature Importance'))
plot_model(trained_model, plot ='feature')

In [None]:
# A Markdown object only renders when it is the cell's last expression,
# so display the heading explicitly before plotting.
display(md('Model Hyperparameter'))
plot_model(trained_model, plot ='parameter')

In [None]:
# Best-effort interpretation: not every estimator is supported (see TODO), so a
# failure must not stop the notebook - but say why the step was skipped.
try:
    # TODO NEED TO FIGURE OUT SUPPORTED ALGOs
    interpret_model(trained_model)
except (Exception, SystemExit) as exc:
    # SystemExit is caught as well because the former bare `except` swallowed
    # it too. NOTE(review): presumably pycaret signals unsupported estimators
    # via sys.exit - confirm for the installed version.
    print("Skipping model interpretation: {}".format(exc))

In [None]:
# Run predictions
# Scores the trained model without passing `data=`, and keeps the result in `pm`.
# NOTE(review): presumably this evaluates on the hold-out split created by
# setup() - confirm against the pycaret documentation.
pm = predict_model(trained_model)

In [None]:
# Finalize model
# `final_model` is what the later cells use for unseen-data predictions and
# for saving to the output folder.
# NOTE(review): presumably finalize_model refits on the complete dataset - confirm.
final_model = finalize_model(trained_model)

In [None]:
# Predictions on the rows held back from modeling.
# data_unseen is only created when _unseen_predictions is enabled, so guard the
# call instead of failing with NameError on the default configuration.
if _unseen_predictions:
    unseen_predictions = predict_model(final_model, data=data_unseen)
    # Inside a block the value is no longer the cell's last expression, so
    # show the preview explicitly.
    display(unseen_predictions.head())
else:
    print("Unseen-data predictions skipped: _unseen_predictions is disabled")

In [None]:
# Persist the finalized model inside this run's output folder; the folder
# string already ends with a slash, so the file name is appended directly.
_final_model_path = '{}FinalModel'.format(output_folder)
print("Pickling model to {}".format(output_folder))
save_model(final_model, _final_model_path)