# Overview

This example notebook will demonstrate Howso Engine’s ability to use case MDA to identify centroids in the data, then use the influential cases of each centroid to conduct matching of the data. 

Definitions:
 
    - case_mda: mean decrease in accuracy for the local model as if each individual case were not in the model 

    - influential_cases: returns the most influential cases and their influence weights based on generalized distances as probability mass 

Note: case_mda is available for the "targeted" workflow only, whereas influential_cases are available for both the "targeted" and "targetless" workflows. 

In [None]:
from pprint import pprint
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.special import boxcox, inv_boxcox
import seaborn as sns
from sklearn.datasets import make_blobs
from sklearn.metrics import accuracy_score, pairwise_distances

from howso import engine
from howso.utilities import infer_feature_attributes

# warnings.filterwarnings() accepts exactly one Warning subclass as `category`
# (a list is rejected), so register one "ignore" filter per category for
# warnings attributed to the seaborn module.
for warning_category in (DeprecationWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=warning_category, module="seaborn")

# Generate a simple dataset

For this example, we will create a simple data set which consists of three features. 


In [None]:
# Build a toy data set: 1,000 samples with three features drawn around 50 blob centers
blobs = make_blobs(n_samples=1000, n_features=3, centers=50, random_state=0)
df = pd.DataFrame(data=blobs[0], columns=['x', 'y', 'price'])

# Shift the price feature upward so that every value is strictly positive
df['price'] = df['price'].add(abs(df['price'].min())).add(1)

# (Optional) Inverse Box-Cox transform to create a Pareto-like distribution
# df['x'] = inv_boxcox(df['x'], .5)
# df['price'] = inv_boxcox(df['price'], .25)

# Keep the blob label of every sample so the clustering can be evaluated later
target = blobs[1]
df.shape

Visualization of the dataset to show the clusters. Note that the plot below shows only two of the three dimensions.

In [None]:
# Colour each point by its ground-truth blob label using the "icefire" colormap
icefire_cmap = sns.color_palette("icefire", as_cmap=True)
ax = sns.scatterplot(
    data=df,
    x='x',
    y='y',
    hue=target,
    palette=icefire_cmap,
    alpha=0.5,
)
plt.show()

In [None]:
# Side-by-side histograms of the 'x' and 'price' features
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))
for panel, column in zip(ax, ('x', 'price')):
    sns.histplot(df[column], ax=panel).set_title(f"histogram of '{column}'")
plt.show()

# Build Howso Engine model

In this section, we will build and analyze the Howso Engine model.

In [None]:
# Infer the feature attributes from the data
features = infer_feature_attributes(df)

# Create the Trainee (replacing any existing one with the same name) and train it on every case
t = engine.Trainee(
    name="Engine - Fuzzy Matching Recipe",
    features=features,
    overwrite_existing=True,
)
t.train(cases=df)

# 'price' is the action (target) feature; every other feature is a context (independent) feature
action_features = 'price'
context_features = features.get_names(without=['price'])

# Analyze the model for predicting the action feature from the context features
t.analyze(context_features=context_features, action_features=[action_features])

# Keep the ID of the first session so the training cases can be retrieved for evaluation
sessions = t.get_sessions()
session_id = sessions[0]['id']
session_id

In [None]:
# Pretty-print the feature attributes inferred by infer_feature_attributes()
pprint(features)

In [None]:
# Pretty-print the model's hyperparameters as returned by the Trainee's get_params()
pprint(t.get_params())

In [None]:
# Fetch the ('.session', '.session_training_index') pair of every case trained in
# `session_id`; the resulting array is passed to react() as `case_indices` later on.
case_inds = t.get_cases(session=session_id, features=['.session', '.session_training_index']).values

# Identify centroids - case_mda

Compute the case_mda for each of the training cases. The cases with the highest case_mda will be used as centroids during clustering.

In [None]:
# React to each training case (leaving the case itself out of the model) and request
# the case_mda detail. The 'robust_influences' flag is the same one used by the
# identical react() call inside cluster_eval(), so both code paths rank centroids
# the same way.
# NOTE(review): 'robust_computation' appears to be the deprecated spelling of this
# option - confirm against the installed Howso Engine version.
results = t.react(
    case_indices=case_inds,
    preserve_feature_values=context_features,
    action_features=[action_features],
    leave_case_out=True,
    details={'case_mda': True, 'robust_influences': True},
)

In [None]:
# Combine the case_mda of every reacted case into a single DataFrame.
# All frames are built first and concatenated once: calling pd.concat inside the
# loop re-copies the accumulated frame on every iteration (quadratic cost).
case_mda_frames = [pd.DataFrame(c) for c in results['details']['case_mda']]

# pd.concat raises on an empty list, so fall back to an empty frame in that case
case_mdas = pd.concat(case_mda_frames) if case_mda_frames else pd.DataFrame()

In [None]:
# Number of centroids to keep (tunable)
num_centers = 50

# Order the cases from highest to lowest mda, renumber the rows by rank, and keep
# only the first (highest-ranked) row of every training index.
case_mdas = (
    case_mdas
    .sort_values('mda', ascending=False)
    .reset_index(drop=True)
    .loc[lambda ranked: ~ranked['.session_training_index'].duplicated()]
)

# The top `num_centers` unique cases become the centroids
centers = case_mdas.iloc[:num_centers]

In [None]:
# Preview the selected centroids. Their '.session' and '.session_training_index'
# columns identify the cases and will be used to find the influential cases in the next section.
centers.head()

In [None]:
# Retrieve the cases of the centroids (looked up by session / training index)
# so that they can be plotted on top of the full data set.
center_cases = t.get_cases(case_indices=centers[['.session', '.session_training_index']].values)
center_cases.head()

Two-dimensional representation of the identified centroids (dark orange). 

In [None]:
fig, ax = plt.subplots()

# Faint background layer: every sample, coloured by its ground-truth blob label
icefire_cmap = sns.color_palette("icefire", as_cmap=True)
ax = sns.scatterplot(
    data=df,
    x='x',
    y='y',
    hue=target,
    palette=icefire_cmap,
    alpha=0.1,
    ax=ax,
)

# Foreground layer: the selected centroid cases
ax = sns.scatterplot(data=center_cases, x='x', y='y', ax=ax)

# Replace the legend with an empty, frameless one
plt.legend([], [], frameon=False)
plt.show()

# Cluster around centroids - using influential cases

Use the influential cases of each centroid to cluster the data. More specifically, tag the influential cases as the same cluster as the centroid.

Note: most similar cases is an alternative method to cluster data points. 

In [None]:
# React to every centroid and request both its influential cases and its most similar cases
num_similar_cases = 5
centroid_details = {
    'most_similar_cases': True,
    'num_most_similar_cases': num_similar_cases,
    'influential_cases': True,
}
results = t.react(
    case_indices=centers[[".session", ".session_training_index"]].values,
    leave_case_out=True,
    preserve_feature_values=context_features,
    action_features=[action_features],
    details=centroid_details,
)

In [None]:
# Tag each influential case with the same cluster as its centroid.
# The cluster label of a centroid is its current index value (its mda rank),
# exposed as a 'target' column. The guard keeps the cell re-runnable: a second
# reset_index()/rename() would otherwise create a duplicate 'target' column.
if 'target' not in centers.columns:
    centers = centers.reset_index().rename(columns={'index': 'target'})

# One frame per centroid, built up front and concatenated once (pd.concat inside
# the loop would re-copy the accumulated frame on every iteration).
clustered_frames = [
    pd.DataFrame(cases).assign(target=center_label)
    for center_label, cases in zip(centers['target'], results['details']['influential_cases'])
]

# pd.concat raises on an empty list, so fall back to an empty frame in that case
clustered_df = (
    pd.concat(clustered_frames, ignore_index=True)
    if clustered_frames
    else pd.DataFrame()
)

In [None]:
# Retrieve the trained cases, including their training index, for evaluation purposes
train_features = df.columns.tolist() + ['.session_training_index']
df_train = t.get_cases(session=session_id, features=train_features)

# Attach the ground-truth cluster labels; the join aligns on the row index
# (assumes get_cases returns the rows in the original training order - TODO confirm)
df_train = df_train.join(pd.Series(target, name='target'))

df_train

In [None]:
clustered_df

In [None]:
# Map each predicted cluster to the most commonly occurring original cluster among its members
label_map = {}

for cluster_id, members in clustered_df.groupby('target', sort=False):
    # Training rows that belong to this predicted cluster
    in_cluster = df_train['.session_training_index'].isin(members['.session_training_index'])
    # Majority vote over the ground-truth labels of those rows
    label_map[cluster_id] = df_train.loc[in_cluster, 'target'].value_counts().idxmax()

# Every predicted cluster id is a key of label_map, so Series.map relabels the whole
# column in a single lookup pass. Unlike DataFrame.replace it is unaffected by the
# fact that predicted ids and original labels share the same integer values.
clustered_df = clustered_df.assign(target=clustered_df['target'].map(label_map))

In [None]:
# Create a single table with predicted and correct target results.
# Build a lookup from training index to ground-truth label once, instead of
# filtering df_train again for every row. drop_duplicates keeps the first match per
# training index, which is what the row-wise `.iloc[0]` lookup returned.
true_label_by_index = (
    df_train
    .drop_duplicates('.session_training_index')
    .set_index('.session_training_index')['target']
)

correct_target = clustered_df['.session_training_index'].map(true_label_by_index)

# A training index missing from df_train must fail loudly rather than produce NaN labels
if correct_target.isna().any():
    raise IndexError("some clustered cases have no matching training case in df_train")

clustered_df['correct_target'] = correct_target
clustered_df.head()

In [None]:
# Share of influential cases whose mapped cluster label equals the ground-truth label
raw_accuracy = accuracy_score(clustered_df['correct_target'], clustered_df['target'])
acc = round(raw_accuracy, 3)
print(f'accuracy: {acc}')

Two-dimensional representation of the identified clusters.

In [None]:
fig, ax = plt.subplots()

# Faint background layer: every sample, coloured by its ground-truth blob label
icefire_cmap = sns.color_palette("icefire", as_cmap=True)
ax = sns.scatterplot(
    data=df,
    x='x',
    y='y',
    hue=target,
    palette=icefire_cmap,
    alpha=0.1,
    ax=ax,
)

# Foreground layer: the clustered influential cases, coloured by their assigned cluster
tab10_cmap = sns.color_palette("tab10", as_cmap=True)
ax = sns.scatterplot(
    data=clustered_df,
    x='x',
    y='y',
    hue='target',
    palette=tab10_cmap,
    ax=ax,
)

# Replace the legend with an empty, frameless one
plt.legend([], [], frameon=False)
plt.show()

# Impact of number of centroids

Howso Engine starts clustering with the most confident centroids (highest case_mda). Therefore, the number of specified centroids can have an impact on clustering performance. 

A relatively low ratio of specified clusters to the actual number of clusters is expected to yield higher accuracy. In this example, we will demonstrate the impact of the number of clusters using 10 to 50 clusters. Expressed as a ratio of specified clusters to actual clusters, these represent ratios of 1/5 up to 1. 

In [None]:
# Numbers of centroids to evaluate: 10, 20, ..., 50
num_centers = list(range(10, 51, 10))

In [None]:
# A function which finds the centroids (using case_mda), then clusters the data (using influential cases), followed by accuracy evaluation
def cluster_eval(num_centers):
    """
    Cluster the training cases around ``num_centers`` centroids and score the result.

    The function relies on notebook-level names defined in earlier cells: the
    trained Trainee ``t``, ``case_inds``, ``context_features``,
    ``action_features``, ``session_id``, the source DataFrame ``df`` and the
    ground-truth label array ``target``.

    Steps:
      1. React to every training case and collect the case_mda details.
      2. Rank the cases by ``mda`` and keep the top ``num_centers`` unique
         training indices as centroids.
      3. React to the centroids and tag each of their influential cases with
         the centroid's cluster id.
      4. Map every cluster id to the most common ground-truth label among its
         members, then compare against the ground-truth label of each case.

    Parameters
    ----------
    num_centers : int
        Number of highest-ranked unique cases to use as centroids.

    Returns
    -------
    float
        Result of ``accuracy_score`` between the ground-truth labels and the
        mapped cluster labels of the clustered (influential) cases.
    """
    # --- 1. case_mda for every training case ---------------------------------
    mda_results = t.react(case_indices=case_inds, preserve_feature_values=context_features,
                          action_features=[action_features],
                          leave_case_out=True,
                          details={'case_mda': True, 'robust_influences': True})

    # Build all frames first and concatenate once; concatenating inside the loop
    # re-copies the accumulated frame on every iteration.
    mda_frames = [pd.DataFrame(case_mda) for case_mda in mda_results['details']['case_mda']]
    case_mdas = pd.concat(mda_frames) if mda_frames else pd.DataFrame()

    # --- 2. rank and pick the centroids --------------------------------------
    case_mdas = case_mdas.sort_values('mda', ascending=False).reset_index(drop=True)
    case_mdas = case_mdas[~case_mdas['.session_training_index'].duplicated()]
    # The index of `centers` holds each centroid's mda rank; it doubles as the cluster id
    centers = case_mdas[0:num_centers]

    # --- 3. influential cases of every centroid ------------------------------
    num_similar_cases = 5
    influence_results = t.react(case_indices=centers[['.session', '.session_training_index']].values,
                                leave_case_out=True,
                                preserve_feature_values=context_features, action_features=[action_features],
                                details={'most_similar_cases': True,
                                         'num_most_similar_cases': num_similar_cases,
                                         'influential_cases': True})

    member_frames = [
        pd.DataFrame(cases).assign(target=center_label)
        for center_label, cases in zip(centers.index, influence_results['details']['influential_cases'])
    ]
    clustered_df = pd.concat(member_frames, ignore_index=True) if member_frames else pd.DataFrame()

    # --- 4. evaluate against the ground truth --------------------------------
    df_train = t.get_cases(session=session_id, features=df.columns.tolist() + ['.session_training_index'])
    # The join aligns on the row index
    # (assumes get_cases returns the rows in the original training order - TODO confirm)
    df_train = df_train.join(pd.Series(target, name='target'))

    # Majority vote: each cluster id -> most common ground-truth label among its members
    label_map = {}
    for center_label, members in clustered_df.groupby('target', sort=False):
        is_member = df_train['.session_training_index'].isin(members['.session_training_index'])
        label_map[center_label] = df_train.loc[is_member, 'target'].value_counts().idxmax()

    # Every cluster id is a key of label_map, so Series.map relabels the column in a
    # single lookup pass, regardless of cluster ids and labels sharing integer values.
    clustered_df['target'] = clustered_df['target'].map(label_map)

    # Ground-truth label of each clustered case, looked up by training index
    # (first match per index, as a row-wise `.iloc[0]` lookup would return).
    true_label_by_index = (
        df_train
        .drop_duplicates('.session_training_index')
        .set_index('.session_training_index')['target']
    )
    correct_target = clustered_df['.session_training_index'].map(true_label_by_index)
    if correct_target.isna().any():
        raise IndexError("some clustered cases have no matching training case in df_train")
    clustered_df['correct_target'] = correct_target

    return accuracy_score(clustered_df['correct_target'], clustered_df['target'])

In [None]:
# Run the full cluster-and-evaluate pipeline once per requested number of centroids
acc = [cluster_eval(n) for n in num_centers]

Visualize the accuracy vs. the number of specified centroids.

In [None]:
# Accuracy as a function of the number of specified centroids
ax = sns.lineplot(x=num_centers, y=acc)
ax.set_xlabel('number of centroids')
ax.set_ylabel('accuracy')
plt.show()