In [1]:
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


import pandas as pd
from pmlb import fetch_data

from howso.engine import (
    load_trainee,
    Trainee,
)
from howso.utilities import infer_feature_attributes

In [2]:
# Fetch the PMLB 'adult' dataset, using ../data/adult as the local cache
# directory. The result is used as a DataFrame by the cells below.
df = fetch_data(
    "adult",
    local_cache_dir="../data/adult",
)



In [3]:
sample_size = 2000
# Fixed seed so the subsample and the train/test split are the same on every
# run (Restart Kernel -> Run All); without it the scores shown in the outputs
# cannot be reproduced.
random_seed = 42

# Subsample the data to ensure the example runs quickly.
# The sample is stored under its own name instead of overwriting `df`, so this
# cell can be re-run safely: it always draws from the full dataset rather than
# from an already reduced frame (which fails once it holds fewer than
# `sample_size` rows). `min(...)` guards against a dataset smaller than the
# requested sample.
sampled_df = df.sample(min(sample_size, len(df)), random_state=random_seed)

# Hold out a third of the sampled rows for evaluation.
train_df, test_df = train_test_split(
    sampled_df,
    test_size=0.33,
    random_state=random_seed,
)

# Infer feature attributes from the training split only
features = infer_feature_attributes(train_df)

# Specify Context and Action Features
action_features = ['target']
context_features = features.get_names(without=action_features)

# Create the Trainee
t = Trainee(features=features)

# Train on the training split
t.train(train_df)

# Analyze the Trainee
# (By specifying action_features, this becomes a Targeted analysis)
t.analyze(context_features=context_features, action_features=action_features)

Version 26.0.0 of Howso Engine™ is available. You are using version 25.1.1.dev0+gd82adab.d20240729.
The following parameters from configuration file will override the Amalgam parameters set in the code: {'trace'}


In [4]:
# Predict the action feature(s) for the held-out test rows from their context
# feature values.
predictions = t.predict(
    test_df[context_features],
    action_features=action_features,
    context_features=context_features,
)

# Score the predicted 'target' column against the true values of the first
# action feature in the test split.
accuracy = accuracy_score(
    y_true=test_df[action_features[0]],
    y_pred=predictions['target'],
)

# Display the test-set accuracy
accuracy

0.8212121212121212

In [5]:
# Recommended metrics
stats = t.react_aggregate(
    action_feature=action_features[0],
    details = {
        "prediction_stats": True,
        "selected_prediction_stats": ["all"]
    }
)

stats

Unnamed: 0,sex,education-num,hours-per-week,age,relationship,fnlwgt,capital-gain,marital-status,native-country,race,education,target,capital-loss,occupation,workclass
precision,0.65968,,,,0.343917,,,0.451035,0.047694,0.272946,0.415706,0.766703,,0.147563,0.150279
accuracy,0.699,,,,0.566,,,0.658,0.905,0.864,0.618,0.823,,0.23,0.696
confusion_matrix,"{'leftover_incorrect': 0, 'leftover_correct': ...",,,,"{'leftover_incorrect': 0, 'leftover_correct': ...",,,"{'leftover_incorrect': 70, 'leftover_correct':...","{'leftover_incorrect': 22, 'leftover_correct':...","{'leftover_incorrect': 99, 'leftover_correct':...","{'leftover_incorrect': 203, 'leftover_correct'...","{'leftover_incorrect': 0, 'leftover_correct': ...",,"{'leftover_incorrect': 188, 'leftover_correct'...","{'leftover_incorrect': 75, 'leftover_correct':..."
mcc,0.276964,,,,0.376964,,,0.457239,0.080628,0.049068,0.512258,0.464457,,0.135315,0.094792
mae,0.363812,1.794467,8.326609,10.066452,0.569633,76036.955347,1566.0264,0.486553,0.161479,0.222156,0.512877,0.251742,194.552871,0.838331,0.472629
recall,0.620098,,,,0.331133,,,0.325056,0.040091,0.201769,0.248208,0.702211,,0.151036,0.132272
rmse,,2.422032,11.883946,12.566454,,101979.322084,6819.244104,,,,,,426.484095,,
spearman_coeff,,0.402244,0.335259,0.388114,,0.071125,0.59617,,,,,,0.476296,,
r2,,0.146226,0.068971,0.130937,,-0.052911,-0.022981,,,,,,-0.000539,,


In [6]:
# Retrieve, for the Trainee's active session, each case's training index
# within the session together with the session identifier.
cases = t.get_cases(
    features=['.session_training_index', '.session'],
    session=t.active_session,
)

# The query was restricted to the active session, so the session value of the
# row labelled 0 is taken as the session id for all later case lookups.
session_id = cases.loc[0, '.session']

In [7]:
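# Disabled single-case example: react to the case at training index 1 of
# `session_id` with the case left out. Kept for reference only; not executed.
# results = t.react(
#     case_indices=[(session_id, 1)],
#     preserve_feature_values=context_features,
#     action_features=action_features,
#     leave_case_out=True,
#     details = {
#         "case_mda_robust": True,
#         "prediction_stats": True,
#         "local_case_feature_residual_convictions_full": True}
# )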



In [8]:
# A reacted case only contributes to the tally when the residual conviction
# reported for 'target' exceeds this value.
residual_conviction_threshold = 0.4

# Tally of `.session_training_index` -> number of reactions in which that
# training case appeared with a negative 'mda' value in `case_mda_robust`.
# NOTE(review): presumably a negative value marks a case whose presence hurt
# the prediction - confirm against the Howso documentation.
to_remove = {}
for case in cases[".session_training_index"]:
    # React to the stored case itself: its context values are preserved, the
    # case is left out of the model, and the details needed below are requested.
    results = t.react(
        case_indices=[(session_id, case)],
        preserve_feature_values=context_features,
        action_features=action_features,
        leave_case_out=True,
        details = {
            "case_mda_robust": True,
            "prediction_stats": True,
            "local_case_feature_residual_convictions_full": True}
    )
    case_mdas = results['details']['case_mda_robust'][0]
    # Currently only referenced by the disabled accuracy condition below.
    local_accuracy = results['details']['prediction_stats']['target']['accuracy']
    residual_conviction = results['details']["local_case_feature_residual_convictions_full"][0]['target']
    # if local_accuracy > 0.3 or local_accuracy < 0.7:
    if residual_conviction > residual_conviction_threshold:
        for mda in case_mdas:
            if mda['mda'] < 0:
                training_index = mda['.session_training_index']
                to_remove[training_index] = to_remove.get(training_index, 0) + 1

# Order the tally by count, highest first. This runs once after the loop:
# sorting inside the loop re-sorted the whole dictionary for every case without
# changing the final counts (only the relative order of equal counts can differ).
to_remove = dict(sorted(to_remove.items(), key=lambda item: item[1], reverse=True))

# to_remove


In [9]:
# Display the subsample size that was set when the data was subsampled.
sample_size

2000

In [10]:
# Keep only the training indices that were tallied at least 5 times; the
# order of `to_remove` is preserved.
filtered_data = dict(
    (training_index, count)
    for training_index, count in to_remove.items()
    if count >= 5
)

# Number of cases selected for removal
len(filtered_data)

309

In [11]:
# Recommended metrics
stats = t.react_aggregate(
    action_feature=action_features[0],
    details = {
        "prediction_stats": True,
        "selected_prediction_stats": ["all"]
    }
)

stats['target']

precision                                                     0.78323
accuracy                                                        0.833
confusion_matrix    {'leftover_incorrect': 0, 'leftover_correct': ...
mcc                                                          0.503551
mae                                                          0.247953
recall                                                       0.723814
rmse                                                              NaN
spearman_coeff                                                    NaN
r2                                                                NaN
Name: target, dtype: object

In [12]:
# remove cases using ".session_training_index"
for key in filtered_data.keys():
    t.remove_cases(num_cases=1, case_indices=[(session_id, key)])

In [13]:
# Recommended metrics
stats = t.react_aggregate(
    action_feature=action_features[0],
    details = {
        "prediction_stats": True,
        "selected_prediction_stats": ["all"]
    }
)

stats['target']

precision                                                    0.914435
accuracy                                                        0.948
confusion_matrix    {'leftover_incorrect': 0, 'leftover_correct': ...
mcc                                                          0.777643
mae                                                          0.070089
recall                                                       0.864791
rmse                                                              NaN
spearman_coeff                                                    NaN
r2                                                                NaN
Name: target, dtype: object

In [14]:
# NOTE: the Trainee is not re-analyzed before predicting again; the call is
# left disabled.
# t.analyze(context_features=context_features, action_features=action_features)

# Predict the held-out test rows again with the Trainee in its current state.
predictions = t.predict(
    test_df[context_features],
    action_features=action_features,
    context_features=context_features,
)

post_trim_accuracy = accuracy_score(
    y_true=test_df[action_features[0]],
    y_pred=predictions['target'],
)

# First line: the accuracy computed earlier; second line: the accuracy now.
print(accuracy, post_trim_accuracy, sep="\n")

0.8212121212121212
0.8166666666666667
