In [None]:
from IPython.display import HTML
from IPython.display import Markdown as md
from pyfiglet import Figlet
import datetime
import yaml
import os

# Read config YAML file
with open("clustering_config.yml", 'r') as stream:
    config = yaml.safe_load(stream)
    
# Convert string 'None' to None variables.
for i,j in config.items():
    if j == 'None':
        config[i] = None
               
f = Figlet(font='slant')
print (f.renderText('SimpleML'))
print("Type : Clustering")
print("User : {}".format(config['user_name']))
print("Analysis name: {}_{}_{}".format(config['analysis_name'], config['user_selected_model'], datetime.datetime.now().strftime ("%Y%m%d")))
print("Model selection : {}".format(config['user_selected_model']))
start_date = datetime.datetime.now()
print("Date time : {}".format(start_date))

# Create analysis folder
try:
    output_folder = "{}/{}/{}/outputs/{}/".format(config['user_name'], config['analysis_name'], config['user_selected_model'], start_date)
    os.makedirs(output_folder)
except FileExistsError:
    # directory already exists
    pass

# #default dataset target variable mappings for demo datasets
# _default_target_variables = {'automobile':'price',
# 'bike':'cnt',
# 'boston':'medv',
# 'concrete':'strength',
# 'diamond':'Price',
# 'forest':'area',
# 'gold':'Gold_T',
# 'house':'SalePrice',
# 'insurance':'charges',
# 'parkinsons','PPE',
# 'traffic':'traffic_volume'}

# SimpleML config
if config['demo_run']:
    _demo_dataset = config['demo_dataset']
    _target = _default_target_variables[_demo_dataset]
    _target_class = _default_target_variables[_demo_dataset]
    print("Input data: {}".format(config['demo_dataset']))
else:
    _input_file = config['input_file']
    # Target processing
    _target = config['target_variable'] # _target = 'medv' # default Target cclass for Boston data
    _target_class = config['target_variable'] # _target_class = 'medv'
    print("Input data : {}".format(config['input_file']))
_pandas_profiling = config['profile'] # _pandas_profiling = True # Default is True which gives detailed 

# Silent preprocessing should be True
_silent_preprocessing = True # silent_preproccessing = True

# Temp args
if config['user_selected_model'] == 'Auto select best model':
    _autoselect = True
else:
    _autoselect = False
    
_kfold = 10 # config['Kfolds'] # 10
_round = 4
if _autoselect:
    _sort_best_model_by = config['auto_select_best_model_by'] # 'R2' #['R2']
else:
    _sort_best_model_by = "R2"
_turbo = True
_turbo = True
_blacklist = None
_user_selected_model = None
_unseen_predictions = False
# Now create model config
number_of_models = 1
_auto_tune = config['auto_tune'] 


In [None]:
# Configure Inputs & Test sets
if _demo_dataset in ['boston']:
    from pycaret.datasets import get_data
    input_data = get_data(_demo_dataset)
    if _unseen_predictions:
        data = input_data.sample(frac=0.9, random_state=786).reset_index(drop=True)
        data_unseen = input_data.drop(data.index).reset_index(drop=True)

        print('Data for Modeling: ' + str(data.shape))
        print('Unseen Data For Predictions: ' + str(data_unseen.shape))
    else:
        data = input_data
else:
    input_data = pd.read_csv(_input_file)
    # print('No proper input was given, So running regression on Boston dataset')
    logging.info('Succesfully read the input csv')
    if _unseen_predictions:
        data = input_data.sample(frac=0.9, random_state=786).reset_index(drop=True)
        data_unseen = input_data.drop(data.index).reset_index(drop=True)

        print('Data for Modeling: ' + str(data.shape))
        print('Unseen Data For Predictions: ' + str(data_unseen.shape))
    else:
        data = input_data


### Preprocessing and EDA

pre-processing steps (other than those that are imperative for machine learning experiments which were performed 
automatically). 

In [None]:
# Preprocessing & ED
from pycaret.clustering import *

exp_clu101 = setup(data, normalize = True, 
                   ignore_features = ['MouseID'],
                   session_id = 123, profile = True)

##### ANALYZE RESULTS
Now that we have created a model, we would like to assign the cluster labels to our dataset (1080 samples) to analyze the results. We will achieve this by using the assign_model() function. See an example below:

In [None]:
kmean_results = assign_model(kmeans)
kmean_results.head()

The score grid printed above highlights the highest performing metric for comparison purposes only. The grid by default is sorted using R2 (highest to lowest) which can be changed by passing sort parameter. For example compare_models(sort = 'RMSLE') will sort the grid by RMSLE (lower to higher since lower is better). 

## Model


In [3]:
# Now create model
number_of_models = 1

# Train User selected or auto selected best model
def model_to_abv(model):
    abvs = {'K-Means Clustering':'kmeans',
'Affinity Propagation':'ap',
'Mean shift Clustering':'meanshift',
'Spectral Clustering':'sc',
'Agglomerative Clustering':'hclust',
'Density-Based Spatial Clustering':'dbscan',
'OPTICS Clustering':'optics',
'Birch Clustering':'birch',
'K-Modes Clustering':'kmodes'}
    return abvs[model]

def train_and_plot_model(model, _auto_tune, plots = True):
    print("Training {}".format(model))
    if _auto_tune:
        print('Training with default hyper parameters')
        trained_model = tune_model(model_to_abv(model))
        print("Completed trained with default hyperparameters tuned {}".format(trained_model))
    else:
        print("Hyperparameter tuning... {}".format(_auto_tune))
        trained_model = create_model(model_to_abv(model))
        print("Completed training with hyperparameters tuned {}".format(trained_model))
    return trained_model 
trained_model = train_and_plot_model(model_selected,True)

NameError: name 'model_selected' is not defined

In [None]:
#Cluster PCA Plot
plot_model(kmeans)

In [None]:
# Elbow
plot_model(kmeans, plot = 'elbow')

In [None]:
# sillouhete
plot_model(kmeans, plot = 'silhouette')

In [None]:
# Distribution plit
plot_model(kmeans, plot = 'distribution') #to see size of clusters
# plot_model(kmeans, plot = 'distribution', feature = 'class')
# plot_model(kmeans, plot = 'distribution', feature = 'CaNA_N')

In [None]:
# unseens predictioins
unseen_predictions = predict_model(kmeans, data=data_unseen)
unseen_predictions.head()

In [None]:
save_model(kmeans,'Final Kmeans Model 08Feb2020')

# saved_kmeans = load_model('Final Kmeans Model 08Feb2020')
# new_prediction = predict_model(saved_kmeans, data=data_unseen)