In [1]:
%load_ext autoreload
%autoreload 2

# unit testing

In [2]:
from rule_optimisation.rule_optimiser import RuleOptimiser
import pandas as pd
import numpy as np
from rule_optimisation.optimisation_functions import FScore

In [3]:
def _create_data():
    np.random.seed(0)
    X = pd.DataFrame({
        'A': np.random.randint(0, 10, 10000),
        'B': np.random.randint(0, 100, 10000),
        'C': np.random.uniform(0, 1, 10000),
        'D': [True, False] * 5000,
        'E': ['yes', 'no'] * 5000,
        'AllNa': [np.nan] * 10000,
        'ZeroVar': [1] * 10000
    })
    X.loc[10000] = [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan]
    X['A'] = X['A'].astype('Int64')
    X['B'] = X['B'].astype('Int64')
    X['D'] = X['D'].astype('boolean')
    y = pd.Series(np.random.randint(0, 2, 10001))
    sample_weight = np.where((X['A']>7).fillna(False)&(y==0), 100, 1)#y * 5
    return X, y, sample_weight


def _create_inputs():
    rule_lambdas = {
        'integer': lambda **kwargs: "(X['A']>{A})".format(**kwargs),
        'float': lambda **kwargs: "(X['C']>{C})".format(**kwargs),
        'categoric': lambda **kwargs: "(X['E']=='yes')".format(**kwargs),
        'boolean': lambda **kwargs: "(X['D']==True)".format(**kwargs),
        'is_na': lambda **kwargs: "(X['A']>{A})|(X['A'].isna())".format(**kwargs),
        'mixed': lambda **kwargs: "((X['A']>{A})&(X['C']>{C})&(X['E']=='yes')&(X['D']==True))|(X['C']>{C%0})".format(**kwargs),
        'missing_col': lambda **kwargs: "(X['Z']>{Z})".format(**kwargs),
        'all_na': lambda **kwargs: "(X['AllNa']>{AllNa})".format(**kwargs),
        'zero_var': lambda **kwargs: "(X['ZeroVar']>{ZeroVar})".format(**kwargs),
        'already_optimal': lambda **kwargs: "(X['A']>{A})".format(**kwargs),
    }
    lambda_kwargs = {
        'integer': {'A': 9},
        'float': {'C': 1.5},
        'categoric': {},
        'boolean': {},
        'is_na': {'A': 9},
        'mixed': {'A': 1, 'C': 1.5, 'C%0': 2.5},
        'missing_col': {'Z': 1},
        'all_na': {'AllNa': 5},
        'zero_var': {'ZeroVar': 1},
        'already_optimal': {'A': 0}
    }
    return rule_lambdas, lambda_kwargs

In [4]:
X, y, sample_weight = _create_data()

In [5]:
X.shape, y.shape

((10001, 7), (10001,))

In [6]:
rule_lambdas, lambda_kwargs = _create_inputs()

In [7]:
from rule_optimisation.rule_optimiser import RuleOptimiser

In [8]:
# Labelled case: optimise each rule against the F1 score (FScore with beta=1).
f1 = FScore(beta=1)
ro = RuleOptimiser(
    rule_lambdas=rule_lambdas,
    lambda_kwargs=lambda_kwargs,
    opt_func=f1.fit,
    n_iter=30,
)

In [141]:
ro.fit(X=X, y=y, sample_weight=sample_weight)

100%|██████████| 30/30 [00:00<00:00, 181.56trial/s, best loss: -0.07737844641675759]

  f'Rules `{"`, `".join(rule_names_missing_features)}` use features that are missing from `X` - unable to optimise or apply these rules')
  f'Rules `{"`, `".join(rule_names_no_opt_conditions)}` have no optimisable conditions - unable to optimise these rules')
  f'Rules `{"`, `".join(rule_names_zero_var_features)}` have all zero variance features based on the dataset `X` - unable to optimise these rules')



100%|██████████| 30/30 [00:00<00:00, 195.00trial/s, best loss: -0.0864948723631455]
100%|██████████| 30/30 [00:00<00:00, 201.41trial/s, best loss: -0.07737778159635708]
100%|██████████| 30/30 [00:00<00:00, 107.04trial/s, best loss: -0.0864948723631455] 
100%|██████████| 30/30 [00:00<00:00, 187.56trial/s, best loss: -0.07737844641675759]


{'integer': "(X['A']>0)",
 'float': "(X['C']>0.14437974242018892)",
 'is_na': "(X['A']>0)|(X['A'].isna())",
 'mixed': "((X['A']>3)&(X['C']>0.3449413915707924)&(X['E']=='yes')&(X['D']==True))|(X['C']>0.14437974242018892)",
 'already_optimal': "(X['A']>0.0)",
 'categoric': "(X['E']=='yes')",
 'boolean': "(X['D']==True)",
 'all_na': "(X['AllNa']>5.0)",
 'zero_var': "(X['ZeroVar']>1.0)"}

In [144]:
# for i in range(0, 10):
#     print(i, f1.fit(y_pred=(X['A']>i).fillna(False).astype(int), y_true=y, sample_weight=sample_weight))

In [152]:
orig_rule_performances = {
        'Rule1': 0.1,
        'Rule2': 0.2,
        'Rule3': 0.3
    }    
opt_rule_performances = {
    'Rule1': 0.2,
    'Rule2': 0.4,
    'Rule3': 0.3
}

In [159]:
pd.DataFrame([orig_rule_performances, opt_rule_performances]).T.values

array([[0.1, 0.2],
       [0.2, 0.4],
       [0.3, 0.3]])

In [145]:
ro.orig_rule_performances

{'already_optimal': 0.07737844641675759,
 'integer': 0.0,
 'float': 0.0,
 'is_na': 0.0,
 'mixed': 0.0}

In [147]:
ro.opt_rule_performances

{'float': 0.0864948723631455,
 'mixed': 0.0864948723631455,
 'integer': 0.07737844641675759,
 'already_optimal': 0.07737844641675759,
 'is_na': 0.07737778159635708}

In [65]:
ro.rule_names_missing_features

['missing_col']

In [66]:
ro.rule_names_no_opt_conditions

['categoric', 'boolean', 'all_na']

In [67]:
ro.rule_names_zero_var_features

['zero_var']

In [148]:
ro._return_all_optimisable_rule_features(lambda_kwargs=lambda_kwargs, X=X)

  f'Rules `{"`, `".join(rule_names_no_opt_conditions)}` have no optimisable conditions - unable to optimise these rules')


(['Z', 'A', 'ZeroVar', 'C%0', 'C'], ['categoric', 'boolean', 'all_na'])

In [149]:
exp_all_features = ['Z', 'A', 'ZeroVar', 'C%0', 'C']
exp_rule_name_no_opt_conds = ['categoric', 'boolean', 'all_na']
exp_all_features.sort()
exp_rule_name_no_opt_conds.sort()

In [151]:
# NOTE(review): this call is incomplete - `all_space_funcs=` has no value, so
# the cell is a SyntaxError as written. The output displayed beneath it
# presumably comes from an earlier version of the cell. Supply the intended
# `all_space_funcs` argument before re-running.
ro._return_rules_with_zero_var_features(lambda_kwargs=lambda_kwargs, all_space_funcs=)

['all_na', 'boolean', 'categoric']

In [32]:
from rules.rules import Rules
from rule_application.argo_rule_applier import ArgoRuleApplier

In [30]:
r_ = Rules(rule_lambdas=rule_lambdas, lambda_kwargs=lambda_kwargs)

In [37]:
r_.filter_rules(exclude=['missing_col'])

In [38]:
rule_strings = r_.as_rule_strings(as_numpy=False)

In [41]:
ara = ArgoRuleApplier(rule_strings=rule_strings, opt_func=f1.fit)

In [42]:
ara.apply(X, y)

Unnamed: 0,categoric,boolean,integer,float,is_na,mixed,all_na,zero_var
0,1,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0
2,1,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0
4,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
9996,1,1,0,0,0,0,0,0
9997,0,0,0,0,0,0,0,0
9998,1,1,0,0,0,0,0,0
9999,0,0,0,0,0,0,0,0


In [43]:
ara.rule_descriptions

Unnamed: 0_level_0,Precision,Recall,PercDataFlagged,OptMetric,Logic,nConditions
Rule,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
categoric,0.5054,0.506413,0.49995,0.505906,(X['E']=='yes'),1
boolean,0.5054,0.506413,0.49995,0.505906,(X['D']==True),1
integer,0.0,0.0,0.0,0.0,(X['A']>9.0),1
float,0.0,0.0,0.0,0.0,(X['C']>1.5),1
is_na,0.0,0.0,0.0001,0.0,(X['A']>9.0)|(X['A'].isna()),2
mixed,0.0,0.0,0.0,0.0,((X['A']>1.0)&(X['C']>1.5)&(X['E']=='yes')&(X[...,5
all_na,0.0,0.0,0.0,0.0,(X['AllNa']>5.0),1
zero_var,0.0,0.0,0.0,0.0,(X['ZeroVar']>1.0),1


In [49]:
# Score each rule string directly with the F-score function (unweighted).
# `eval` resolves `X` from the notebook's global namespace, so it is only
# suitable for the locally defined rule strings used here.
for rule_name, rule_string in rule_strings.items():
    # Rows where the rule evaluates to missing count as not flagged.
    rule_flags = eval(rule_string).fillna(False).astype(int)
    print(rule_name, f1.fit(y_pred=rule_flags, y_true=y))

integer 0
float 0
categoric 0.5059059059059059
boolean 0.5059059059059059
is_na 0
mixed 0
all_na 0
zero_var 0


In [44]:
ro.orig_rule_performances

{'integer': 0, 'float': 0, 'is_na': 0, 'mixed': 0}

In [27]:
ro.opt_rule_performances

{'float': 0.6642155224279698,
 'mixed': 0.6642155224279698,
 'integer': 0.6422306211224418,
 'is_na': 0.6421848260125499}

# Unlabelled

In [9]:
from rule_optimisation.optimisation_functions import AlertsPerDay

In [12]:
# Unlabelled case: optimise against an alerts-per-day target instead of a
# label-based metric. Note that this rebinds `ro`, replacing the optimiser
# created for the labelled case.
apd = AlertsPerDay(n_alerts_expected_per_day=10, no_of_days_in_file=10)
ro = RuleOptimiser(
    rule_lambdas=rule_lambdas,
    lambda_kwargs=lambda_kwargs,
    opt_func=apd.fit,
    n_iter=30,
)

In [13]:
ro.fit(X=X)

100%|██████████| 30/30 [00:00<00:00, 267.96trial/s, best loss: 8892.49]
  0%|          | 0/30 [00:00<?, ?trial/s, best loss=?]

  f'Rules `{"`, `".join(rule_names_missing_features)}` use features that are missing from `X` - unable to optimise or apply these rules')
  f'Rules `{"`, `".join(rule_names_no_opt_conditions)}` have no optimisable conditions - unable to optimise these rules')
  f'Rules `{"`, `".join(rule_names_zero_var_features)}` have all zero variance features based on the dataset `X` - unable to optimise these rules')


100%|██████████| 30/30 [00:00<00:00, 348.18trial/s, best loss: 16.0]
100%|██████████| 30/30 [00:00<00:00, 288.36trial/s, best loss: 8911.36]
100%|██████████| 30/30 [00:00<00:00, 138.79trial/s, best loss: 985.9599999999999]
100%|██████████| 30/30 [00:00<00:00, 364.31trial/s, best loss: 8892.49]


{'integer': "(X['A']>9.0)",
 'float': "(X['C']>0.9934712038306385)",
 'is_na': "(X['A']>9.0)|(X['A'].isna())",
 'mixed': "((X['A']>1.0)&(X['C']>1.5)&(X['E']=='yes')&(X['D']==True))|(X['C']>2.5)",
 'already_optimal': "(X['A']>8)",
 'categoric': "(X['E']=='yes')",
 'boolean': "(X['D']==True)",
 'all_na': "(X['AllNa']>5.0)",
 'zero_var': "(X['ZeroVar']>1.0)"}

In [15]:
ro.opt_rule_performances

{'float': -16.0,
 'mixed': -100.0,
 'integer': -100.0,
 'already_optimal': -8892.49,
 'is_na': -98.01}