In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from rule_scoring.rule_scorer import RuleScorer
import cProfile, pstats

In [3]:
# Read the pre-processed profiling dataset, then separate the 'chargeback'
# target column from the remaining feature columns.
# NOTE(review): the path points at a personal Downloads folder and the file is
# a pickle - only load pickles that come from a trusted source.
om = pd.read_pickle('~/Downloads/omnyex_processed_for_profiling.pkl')
y = om['chargeback']
X = om.drop(columns='chargeback')

In [4]:
# Cast every column of the feature matrix to float before it is thresholded.
X = X.astype(float)

In [5]:
# Binarise the features: 1 where the value is >= 1, otherwise 0.
# The resulting columns are what gets passed to FilterRules as the rule flags.
X_rules = (X >= 1).astype(int)

In [6]:
from rule_filtering.rule_filters import FilterRules
from rule_optimisation.optimisation_functions import Precision

In [7]:
X_rules.shape

(1780373, 50)

In [9]:
# Metric object; its bound `fit` method is handed to FilterRules as `opt_func`.
precision = Precision()

In [10]:
# Filter specification for FilterRules: condition 'OptMetric' >= 0.30.
filters = {'OptMetric': dict(Operator='>=', Value=0.30)}

In [11]:
# Constructor keyword arguments shared by the FilterRules runs that use an
# explicit optimisation function.
params = dict(filters=filters, opt_func=precision.fit)

## Old

In [12]:
# Base name only: the profiling and stats cells append the '.dat' extension
# themselves via f'{filename}.dat', so including it here yields '*.dat.dat'.
filename = 'FilterRules_old'

In [13]:
# Profile a single FilterRules.fit call (built from `params`) and dump the raw
# stats to '<filename>.dat'.
# NOTE(review): cProfile.run appears to use `sort` only when it prints a report;
# with `filename` set the stats go to disk instead - confirm against the docs.
rs = FilterRules(**params)
cProfile.run('rs.fit(X_rules, y)', sort='cumtime', filename=f'{filename}.dat')

Note: No rules remaining after filtering


In [14]:
# Load the dumped profile and print every entry ordered by cumulative time.
p = pstats.Stats(f'{filename}.dat')
p.sort_stats('cumtime').print_stats()

Tue Jan 12 14:30:59 2021    FilterRules_old.dat.dat

         128610 function calls (126668 primitive calls) in 4.239 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    4.239    4.239 {built-in method builtins.exec}
        1    0.000    0.000    4.239    4.239 <string>:1(<module>)
        1    0.119    0.119    4.239    4.239 /Users/jlaidler/Documents/tigress/tigress/argo/argo/rule_filtering/rule_filtering/rule_filters.py:26(fit)
     51/1    0.707    0.014    4.119    4.119 /Users/jlaidler/Documents/tigress/tigress/argo/argo/argo_utils/argo_utils/argo_utils.py:139(return_binary_pred_perf_of_set_numpy)
    676/9    1.341    0.002    2.443    0.271 {built-in method numpy.core._multiarray_umath.implement_array_function}
        1    0.000    0.000    2.141    2.141 /Users/jlaidler/venvs/argo/lib/python3.7/site-packages/pandas/core/frame.py:1311(to_numpy)
        2    0.000    0.000    2.141    1

<pstats.Stats at 0x12ce27278>

## New (with `.values` instead of `.to_numpy()` in `return_binary_pred_perf_of_set_numpy`)

In [19]:
# Base name only: the profiling and stats cells append the '.dat' extension
# themselves via f'{filename}.dat', so including it here yields '*.dat.dat'.
filename = 'FilterRules_new'

In [20]:
# Re-run the same profiled fit (same `params`) so the dump can be compared with
# the earlier run; stats are written to '<filename>.dat'.
rs = FilterRules(**params)
cProfile.run('rs.fit(X_rules, y)', sort='cumtime', filename=f'{filename}.dat')

Note: No rules remaining after filtering


In [21]:
# Load the dumped profile and print every entry ordered by cumulative time.
p = pstats.Stats(f'{filename}.dat')
p.sort_stats('cumtime').print_stats()

Tue Jan 12 14:34:24 2021    FilterRules_new.dat.dat

         127861 function calls (126216 primitive calls) in 2.663 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    2.663    2.663 {built-in method builtins.exec}
        1    0.000    0.000    2.663    2.663 <string>:1(<module>)
        1    0.178    0.178    2.663    2.663 /Users/jlaidler/Documents/tigress/tigress/argo/argo/rule_filtering/rule_filtering/rule_filters.py:26(fit)
     51/1    1.092    0.021    2.484    2.484 /Users/jlaidler/Documents/tigress/tigress/argo/argo/argo_utils/argo_utils/argo_utils.py:139(return_binary_pred_perf_of_set_numpy)
    671/6    0.003    0.000    1.243    0.207 {built-in method numpy.core._multiarray_umath.implement_array_function}
      208    0.714    0.003    0.714    0.003 {method 'reduce' of 'numpy.ufunc' objects}
        1    0.000    0.000    0.673    0.673 <__array_function__ internals>:2(apply_alon

<pstats.Stats at 0x12d93b8d0>

## New (with `.values` instead of `.to_numpy()` in `return_binary_pred_perf_of_set_numpy`, and without `precision` passed as `opt_func`)

In [22]:
# Base name only: the profiling and stats cells append the '.dat' extension
# themselves via f'{filename}.dat', so including it here yields '*.dat.dat'.
filename = 'FilterRules_new_no_opt_func'

In [24]:
# Filter specification for FilterRules: condition 'Precision' >= 0.30
# (replaces the earlier 'OptMetric' filter, so no opt_func is needed).
filters = {'Precision': dict(Operator='>=', Value=0.30)}

In [25]:
# Profile fit with only `filters` supplied (no opt_func); stats are written to
# '<filename>.dat'.
rs = FilterRules(filters=filters)
cProfile.run('rs.fit(X_rules, y)', sort='cumtime', filename=f'{filename}.dat')

Note: No rules remaining after filtering


In [26]:
# Load the dumped profile and print every entry ordered by cumulative time.
p = pstats.Stats(f'{filename}.dat')
p.sort_stats('cumtime').print_stats()

Tue Jan 12 14:36:16 2021    FilterRules_new_no_opt_func.dat.dat

         2485 function calls (2446 primitive calls) in 1.747 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    1.747    1.747 {built-in method builtins.exec}
        1    0.143    0.143    1.747    1.747 /Users/jlaidler/Documents/tigress/tigress/argo/argo/rule_filtering/rule_filtering/rule_filters.py:26(fit)
        1    0.657    0.657    1.603    1.603 /Users/jlaidler/Documents/tigress/tigress/argo/argo/argo_utils/argo_utils/argo_utils.py:139(return_binary_pred_perf_of_set_numpy)
     15/5    0.000    0.000    0.591    0.118 {built-in method numpy.core._multiarray_umath.implement_array_function}
        1    0.000    0.000    0.591    0.591 <__array_function__ internals>:2(tile)
        1    0.000    0.000    0.591    0.591 /Users/jlaidler/venvs/argo/lib/python3.7/site-packages/numpy/lib/shape_base.py:1171(tile)
        1    0.5

<pstats.Stats at 0x1294c9400>

## New (with rule_descriptions given in input)

In [33]:
# Base name only: the profiling and stats cells append the '.dat' extension
# themselves via f'{filename}.dat', so including it here yields '*.dat.dat'.
filename = 'FilterRules_new_rule_descriptions_given'

In [34]:
# Filter specification for FilterRules: condition 'Precision' >= 0.30.
filters = {'Precision': dict(Operator='>=', Value=0.30)}

In [35]:
from argo_utils.argo_utils import return_binary_pred_perf_of_set_numpy

In [36]:
# Compute the per-rule performance table once, outside the profiled call, so it
# can be passed straight to the FilterRules constructor.
rule_descriptions = return_binary_pred_perf_of_set_numpy(y_true=y, y_preds=X_rules, y_preds_columns=X_rules.columns)

In [37]:
# Profile fit when the pre-computed `rule_descriptions` are supplied to the
# constructor; stats are written to '<filename>.dat'.
rs = FilterRules(filters=filters, rule_descriptions=rule_descriptions)
cProfile.run('rs.fit(X_rules, y)', sort='cumtime', filename=f'{filename}.dat')

Note: No rules remaining after filtering


In [38]:
# Load the dumped profile and print every entry ordered by cumulative time.
p = pstats.Stats(f'{filename}.dat')
p.sort_stats('cumtime').print_stats()

Tue Jan 12 14:42:06 2021    FilterRules_new_rule_descriptions_given.dat.dat

         959 function calls (940 primitive calls) in 0.001 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.001    0.001 {built-in method builtins.exec}
        1    0.000    0.000    0.001    0.001 /Users/jlaidler/Documents/tigress/tigress/argo/argo/rule_filtering/rule_filtering/rule_filters.py:26(fit)
        1    0.000    0.000    0.001    0.001 /Users/jlaidler/Documents/tigress/tigress/argo/argo/rule_filtering/rule_filtering/rule_filters.py:77(_iterate_rule_descriptions)
        2    0.000    0.000    0.001    0.000 /Users/jlaidler/venvs/argo/lib/python3.7/site-packages/pandas/core/frame.py:2869(__getitem__)
        1    0.000    0.000    0.001    0.001 {built-in method builtins.eval}
        1    0.000    0.000    0.001    0.001 <string>:1(<module>)
        1    0.000    0.000    0.000    0.000 /Users/jlaidler

<pstats.Stats at 0x12d56d6a0>

In [None]:
# Leftover, never-executed cell: a bare reference to the helper, which would
# only display the function object if run.
return_binary_pred_perf_of_set_numpy

In [17]:
%timeit X_rules.to_numpy()

5.02 µs ± 61 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [18]:
%timeit X_rules.values

4.48 µs ± 78 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


# Unit testing

In [22]:
import random
import pandas as pd
import numpy as np
from rule_optimisation.optimisation_functions import FScore
import argo_utils.argo_utils as argo_utils
from rule_filtering.rule_filters import FilterRules

In [34]:
def create_data():
    """
    Build a small, deterministic dataset of binary rule flags.

    Returns
    -------
    X_rules : pd.DataFrame
        1000 x 5 frame of rule flags (columns 'Rule1' .. 'Rule5'). Rules 1-3
        each flag six rows among the positive labels; Rules 4-5 are produced by
        rounding uniform random draws whose range depends on the row's label.
    y : pd.Series
        Target: 980 zeros followed by 20 ones.
    weights : pd.Series
        Sample weights: 10 where y == 1, otherwise 1.
    """
    n_rows = 1000

    def draw_per_label(labels, fraud_min, fraud_max, nonfraud_min, nonfraud_max, rand_func):
        # One draw per row, in row order, so the seeded stream is consumed in a
        # fixed sequence.
        draws = []
        for label in labels:
            if label == 1:
                draws.append(rand_func(fraud_min, fraud_max))
            else:
                draws.append(rand_func(nonfraud_min, nonfraud_max))
        return draws

    def to_flags(draws):
        # Floor negatives at zero, then round each draw to an integer flag.
        return [round(max(value, 0)) for value in draws]

    # Seed both generators so repeated calls return identical data.
    random.seed(0)
    np.random.seed(0)

    row_index = list(range(n_rows))
    y = pd.Series(data=[0] * 980 + [1] * 20, index=row_index)

    # Rule4 must be drawn before Rule5 to keep the random sequence unchanged.
    rule4_flags = to_flags(draw_per_label(y, 0.4, 1, 0.5, 0.6, np.random.uniform))
    rule5_flags = to_flags(draw_per_label(y, 0.2, 1, 0, 0.6, np.random.uniform))

    X_rules = pd.DataFrame(
        data={
            "Rule1": [0] * 980 + [1] * 6 + [0] * 14,
            "Rule2": [0] * 987 + [1] * 6 + [0] * 7,
            "Rule3": [0] * 993 + [1] * 6 + [0] * 1,
            "Rule4": rule4_flags,
            "Rule5": rule5_flags,
        },
        index=row_index,
    )
    weights = y.apply(lambda label: 10 if label == 1 else 1)
    return X_rules, y, weights


def return_rule_descriptions(create_data):
    """
    Compute rule-performance tables for the synthetic data, without and with
    sample weights.

    Parameters
    ----------
    create_data : callable
        Zero-argument factory returning ``(X_rules, y, weights)``.

    Returns
    -------
    tuple
        ``(rd_no_weight, rd_weight)`` as returned by
        ``argo_utils.return_binary_pred_perf_of_set_numpy``, both using an
        F-score with beta=4 as the optimisation function.
    """
    X_rules, y, weights = create_data()
    f4 = FScore(beta=4)
    # Arguments common to both calls; only `sample_weight` differs.
    shared_kwargs = dict(
        y_true=y,
        y_preds=X_rules,
        y_preds_columns=X_rules.columns,
        opt_func=f4.fit,
    )
    rd_no_weight = argo_utils.return_binary_pred_perf_of_set_numpy(**shared_kwargs)
    rd_weight = argo_utils.return_binary_pred_perf_of_set_numpy(
        sample_weight=weights, **shared_kwargs)
    return rd_no_weight, rd_weight


def instantiate_FilterRules(return_rule_descriptions, create_data):
    """
    Create the four FilterRules instances used in the checks.

    Parameters
    ----------
    return_rule_descriptions : callable
        Called with `create_data`; returns ``(rd_no_weight, rd_weight)``.
    create_data : callable
        Data factory forwarded to `return_rule_descriptions`.

    Returns
    -------
    tuple
        ``(fr_w_rd_no_weight, fr_w_rd_weight, fr_wo_rd_no_weight,
        fr_wo_rd_weight)``. The first two are given pre-computed rule
        descriptions; the last two are constructed identically, with an
        F-score (beta=4) optimisation function instead.
    """
    rd_no_weight, rd_weight = return_rule_descriptions(create_data)
    filters = {
        'Precision': dict(Operator='>=', Value=0.2),
        'OptMetric': dict(Operator='>=', Value=0.3),
    }
    f4 = FScore(beta=4)
    # Variants that receive pre-computed rule descriptions.
    fr_w_rd_no_weight, fr_w_rd_weight = (
        FilterRules(filters=filters, rule_descriptions=descriptions)
        for descriptions in (rd_no_weight, rd_weight)
    )
    # Variants that receive the optimisation function instead.
    fr_wo_rd_no_weight, fr_wo_rd_weight = (
        FilterRules(filters=filters, opt_func=f4.fit) for _ in range(2)
    )
    return fr_w_rd_no_weight, fr_w_rd_weight, fr_wo_rd_no_weight, fr_wo_rd_weight

In [35]:
# Build the synthetic rule flags, target and sample weights; this rebinds the
# X_rules and y names used by the profiling cells earlier in the notebook.
X_rules, y, weights = create_data()

In [36]:
# Rule-performance tables computed without and with sample weights
# (the helper calls create_data() again internally).
rd_no_weight, rd_weight = return_rule_descriptions(create_data)

In [37]:
# The four FilterRules variants: with pre-computed rule descriptions
# (unweighted / weighted) and with an opt_func only.
fr_w_rd_no_weight, fr_w_rd_weight, fr_wo_rd_no_weight, fr_wo_rd_weight = instantiate_FilterRules(return_rule_descriptions, create_data)

## No weight

In [38]:
# Fit the instance that was constructed with the unweighted rule descriptions.
fr_w_rd_no_weight.fit(X_rules, y)

In [39]:
fr_w_rd_no_weight.rules_to_keep

['Rule1', 'Rule2', 'Rule3']

In [40]:
rd_no_weight

Unnamed: 0,Precision,Recall,PercDataFlagged,OptMetric
Rule1,1.0,0.3,0.006,0.312883
Rule2,1.0,0.3,0.006,0.312883
Rule3,1.0,0.3,0.006,0.312883
Rule4,0.018036,0.9,0.998,0.23217
Rule5,0.075377,0.75,0.199,0.491329


## With weight

In [41]:
# Fit the instance that was constructed with the weighted rule descriptions.
fr_w_rd_weight.fit(X_rules, y)

In [42]:
fr_w_rd_weight.rules_to_keep

['Rule1', 'Rule2', 'Rule3', 'Rule5']

In [43]:
rd_weight

Unnamed: 0,Precision,Recall,PercDataFlagged,OptMetric
Rule1,1.0,0.3,0.006,0.312883
Rule2,1.0,0.3,0.006,0.312883
Rule3,1.0,0.3,0.006,0.312883
Rule4,0.155172,0.9,0.998,0.701835
Rule5,0.449102,0.75,0.199,0.721562
