In [1]:
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
from scipy import stats

import pandas as pd
from pmlb import fetch_data

from howso.engine import (
    load_trainee,
    Trainee,
)
from howso.utilities import infer_feature_attributes

In [2]:
# Fetch the PMLB "adult" dataset; `local_cache_dir` points pmlb at
# ../data/adult (presumably so the download is reused on later runs).
df = fetch_data(
    "adult",
    local_cache_dir="../data/adult",
)

In [3]:
# Experiment: does removing training cases that repeatedly hurt their
# neighbours' predictions improve accuracy on a held-out test set?
#
# For every run the cell
#   1. splits `df` into train/test and trains + analyzes a Trainee,
#   2. records the test accuracy ("pre-trim"),
#   3. reacts to every training case with `leave_case_out=True` and counts,
#      per influential case, how often its robust case MDA falls below
#      MDA_THRESHOLD,
#   4. removes the cases that collected at least MIN_HARMFUL_VOTES such votes,
#   5. records the test accuracy again ("post-trim").
#
# NOTE: step 3 issues one `react` call per training case, so this cell is slow
# on the full dataset; subsample `df` before splitting for a quick run.

# One accuracy value is appended to each tracker per completed run.
post_trim_accuracy_tracker = []
pre_trim_accuracy_tracker = []

# An influential case gets a "harmful" vote when its MDA is below this value.
MDA_THRESHOLD = -0.01
# A case is removed once it has collected at least this many harmful votes.
MIN_HARMFUL_VOTES = 3

for run in range(1):
    # Seed the split with the run number: every run is reproducible, while
    # different runs still see different train/test partitions.
    train_df, test_df = train_test_split(df, test_size=0.20, random_state=run)

    # Infer feature attributes from the training data only
    features = infer_feature_attributes(train_df)

    # Specify Context and Action Features
    action_features = ['target']
    context_features = features.get_names(without=action_features)

    # Create and train the Trainee
    t = Trainee(features=features)
    t.train(train_df)

    # Analyze the Trainee
    # (By specifying action_features, this becomes a Targeted analysis)
    t.analyze(context_features=context_features, action_features=action_features)

    # Baseline accuracy on the held-out test set, before any case is removed
    pre_predictions = t.predict(
        test_df[context_features],
        context_features=context_features,
        action_features=action_features,
    )
    pre_trim_accuracy = accuracy_score(
        test_df[action_features[0]],
        pre_predictions[action_features[0]],
    )
    pre_trim_accuracy_tracker.append(pre_trim_accuracy)

    # All cases trained in the active session, with the identifiers needed to
    # address each one individually.
    cases = t.get_cases(session=t.active_session, features=['.session_training_index', '.session'])
    # Positional lookup: take the first row regardless of the frame's index labels.
    session_id = cases['.session'].iloc[0]

    # Maps '.session_training_index' -> number of harmful votes received.
    to_remove = {}
    for case in cases[".session_training_index"]:
        # Only the robust case MDA is requested; it is the only detail this
        # loop consumes, which keeps each of the many react calls as cheap as
        # possible.
        results = t.react(
            case_indices=[(session_id, case)],
            preserve_feature_values=context_features,
            action_features=action_features,
            leave_case_out=True,
            details={"case_mda_robust": True},
        )
        # One react case was submitted, so its MDA entries are at position 0.
        case_mdas = results['details']['case_mda_robust'][0]
        for mda in case_mdas:
            if mda['mda'] < MDA_THRESHOLD:
                harmful_index = mda['.session_training_index']
                to_remove[harmful_index] = to_remove.get(harmful_index, 0) + 1

    # Order the vote counts from most- to least-voted. Done once, after all
    # votes are in, instead of re-sorting after every reacted case.
    to_remove = dict(sorted(to_remove.items(), key=lambda item: item[1], reverse=True))

    # Keep only the cases with enough harmful votes to be trimmed
    filtered_data = {key: value for key, value in to_remove.items() if value >= MIN_HARMFUL_VOTES}

    # Remove the selected cases, addressed by (session id, '.session_training_index')
    for key in filtered_data:
        t.remove_cases(num_cases=1, case_indices=[(session_id, key)])

    # Accuracy on the same test set after trimming
    post_predictions = t.predict(
        test_df[context_features],
        context_features=context_features,
        action_features=action_features,
    )
    post_trim_accuracy = accuracy_score(
        test_df[action_features[0]],
        post_predictions[action_features[0]],
    )
    post_trim_accuracy_tracker.append(post_trim_accuracy)

Version 28.0.1 of Howso Engine™ is available. You are using version 27.0.0.


KeyboardInterrupt: 

In [None]:
# Mean pre-trim test accuracy over the recorded runs.
# Evaluates to NaN instead of raising ZeroDivisionError when no run has been
# recorded yet (e.g. the experiment cell was interrupted).
(
    sum(pre_trim_accuracy_tracker) / len(pre_trim_accuracy_tracker)
    if pre_trim_accuracy_tracker
    else float("nan")
)

In [None]:
# Mean post-trim test accuracy over the recorded runs.
# Evaluates to NaN instead of raising ZeroDivisionError when no run has been
# recorded yet -- the post-trim value is appended last in each run, so an
# interrupted experiment cell leaves this list empty.
(
    sum(post_trim_accuracy_tracker) / len(post_trim_accuracy_tracker)
    if post_trim_accuracy_tracker
    else float("nan")
)

In [None]:
# Paired one-sided t-test: is post-trim accuracy greater than pre-trim accuracy?
#
# Each run contributes one pre-trim and one post-trim accuracy measured on the
# same train/test split, so the two samples are paired rather than independent;
# a dependent-samples test (ttest_rel) is used instead of ttest_ind.
#
# Within a run the pre-trim value is appended before the post-trim value, so an
# interrupted run can leave the pre-trim list one element longer. Only the runs
# that recorded both values are compared.
#
# At least two completed runs are required for a finite result; with fewer,
# p_value is NaN.
n_paired_runs = min(len(post_trim_accuracy_tracker), len(pre_trim_accuracy_tracker))
t_stat, p_value = stats.ttest_rel(
    post_trim_accuracy_tracker[:n_paired_runs],
    pre_trim_accuracy_tracker[:n_paired_runs],
    alternative='greater',
)
p_value