In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to the imported
# project modules are picked up without restarting the kernel between profiling runs.
%load_ext autoreload
%autoreload 2

In [16]:
import pandas as pd
import numpy as np
from sklearn.metrics import precision_score
import rule_optimisation.optimisation_functions as opt_funcs
import cProfile, pstats

In [48]:
from rule_generation.rule_generator_opt import RuleGeneratorOpt

In [8]:
np.random.seed(0)
def create_y(X):
    """Return the binary target for X as an integer Series.

    A row is labelled 1 when at least one of these holds, otherwise 0:
    A > 9, B > 0.98, C <= 0, or D == 'CN'.
    """
    rule_hits = [
        X['A'] > 9,
        X['B'] > 0.98,
        X['C'] <= 0,
        X['D'] == 'CN',
    ]
    # OR the individual conditions together left to right.
    flagged = rule_hits[0]
    for hit in rule_hits[1:]:
        flagged = flagged | hit
    return flagged.astype(int)
        
# Synthetic 100,000-row frame: three numeric columns plus one string column.
# The order of the np.random calls matters for reproducibility under the seed set above.
X = pd.DataFrame({
    # np.random.randint excludes the upper bound, so A lies in 0..9 and the
    # `A > 9` condition in create_y never fires on this data.
    'A': np.random.randint(0, 10, 100000),
    'B': np.random.uniform(0, 1, 100000),
    'C': np.random.randint(-1, 20, 100000),
    # The five labels repeat in a fixed cycle (not random) to give 100,000 values.
    'D': ['US', 'GB', 'FR', 'CN', 'missing'] * 20000}
)
# Binary target derived from X by the create_y conditions.
y = create_y(X)

In [49]:
# One-hot encode the string column D; the numeric columns A, B, C pass through unchanged.
X_processed = pd.get_dummies(X)

In [50]:
# Sanity check: expect 3 numeric columns plus 5 indicator columns for D.
X_processed.shape

(100000, 8)

In [51]:
# Baseline configuration: sklearn's precision_score is passed as the optimisation function.
rg = RuleGeneratorOpt(opt_func=precision_score, n_total_conditions=4, num_rules_keep=50)

In [52]:
# Fit the generator; the value displayed below is what fit() returns
# (a frame of 0/1 columns, one per generated rule, judging by the saved output).
rg.fit(X_processed, y)

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,RGO_Rule67,RGO_Rule58,RGO_Rule57,RGO_Rule55,RGO_Rule62,RGO_Rule63,RGO_Rule64,RGO_Rule61,RGO_Rule74,RGO_Rule75,RGO_Rule72,RGO_Rule65,RGO_Rule66,RGO_Rule69,RGO_Rule71,RGO_Rule5,RGO_Rule1
0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0
1,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1
3,1,0,0,0,0,0,0,0,1,1,1,1,1,1,1,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,0,1,0,1,1,0,0,1,0,0,0,1,0,0,0,0
99996,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0
99997,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1
99998,1,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,0


In [53]:
# X_rules.columns[X_rules.var() == 0].tolist()

In [11]:
import cProfile, pstats
# cProfile.run('rg.fit(X_processed, y)', sort='cumtime', filename='rule_gen_dt_profile_opt.dat')

# Old version

In [55]:
# Profile a full fit with the precision_score-based generator and dump the raw stats to
# '<filename>.dat' for inspection in the next cell.
# NOTE(review): with `filename` set, cProfile.run writes the stats to disk instead of
# printing them, so `sort` presumably has no effect here; ordering is applied later via
# sort_stats() - confirm against the cProfile docs.
filename = 'rule_gen_opt_profile'
cProfile.run('rg.fit(X_processed, y)', sort='cumtime', filename=f'{filename}.dat')

  _warn_prf(average, modifier, msg_start, len(result))


In [56]:
# Load the dumped profile and print it ordered by cumulative time.
# print_stats() is the cell's last expression, so the Stats object's repr is also
# echoed after the table; a trailing `;` would suppress that.
p = pstats.Stats(f'{filename}.dat')
p.sort_stats('cumtime').print_stats()

Thu Jan  7 12:45:54 2021    rule_gen_opt_profile.dat

         930959 function calls (916502 primitive calls) in 2.997 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    3.001    3.001 {built-in method builtins.exec}
        1    0.001    0.001    3.001    3.001 <string>:1(<module>)
        1    0.002    0.002    3.000    3.000 /Users/jlaidler/Documents/tigress/tigress/argo/argo/rule_generation/rule_generation/rule_generator_opt.py:72(fit)
        1    0.010    0.010    2.242    2.242 /Users/jlaidler/Documents/tigress/tigress/argo/argo/rule_generation/rule_generation/rule_generator_opt.py:263(_generate_n_order_pairwise_rules)
7485/1971    0.076    0.000    2.050    0.001 {built-in method numpy.core._multiarray_umath.implement_array_function}
   879/88    0.006    0.000    2.035    0.023 /Users/jlaidler/venvs/argo/lib/python3.7/site-packages/sklearn/utils/validation.py:59(inner_f)
    83/73    0

<pstats.Stats at 0x121676358>

# New version using the Precision class instead of precision_score (the Precision class is much faster)

In [57]:
# Same profiling run as before, but with the project's Precision().fit as the
# optimisation function in place of sklearn's precision_score.
filename = 'rule_gen_opt_profile_opt_new_prec_class'
precision = opt_funcs.Precision()
# Rebinds `rg`, so later cells use this Precision-based generator.
rg = RuleGeneratorOpt(opt_func=precision.fit, n_total_conditions=4, num_rules_keep=50)
cProfile.run('rg.fit(X_processed, y)', sort='cumtime', filename=f'{filename}.dat')

In [58]:
# Print the Precision-class profile ordered by cumulative time, for comparison with the
# precision_score run above.
p = pstats.Stats(f'{filename}.dat')
p.sort_stats('cumtime').print_stats()

Thu Jan  7 12:45:56 2021    rule_gen_opt_profile_opt_new_prec_class.dat

         1036325 function calls (1024749 primitive calls) in 1.188 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    1.194    1.194 {built-in method builtins.exec}
        1    0.002    0.002    1.193    1.193 /Users/jlaidler/Documents/tigress/tigress/argo/argo/rule_generation/rule_generation/rule_generator_opt.py:72(fit)
        1    0.011    0.011    0.729    0.729 /Users/jlaidler/Documents/tigress/tigress/argo/argo/rule_generation/rule_generation/rule_generator_opt.py:263(_generate_n_order_pairwise_rules)
   154/78    0.035    0.000    0.355    0.005 /Users/jlaidler/Documents/tigress/tigress/argo/argo/argo_utils/argo_utils/argo_utils.py:141(return_binary_pred_perf_of_set_numpy)
        2    0.000    0.000    0.290    0.145 /Users/jlaidler/Documents/tigress/tigress/argo/argo/correlation_reduction/correlation_reduction/c

<pstats.Stats at 0x12160fef0>

# Try newest version with larger dataset

In [9]:
# Re-seed and rebuild the synthetic data at 1,000,000 rows (10x the earlier frame),
# using the same column definitions. This overwrites X, y and X_processed.
np.random.seed(0)
X = pd.DataFrame({
    'A': np.random.randint(0, 10, 1000000),
    'B': np.random.uniform(0, 1, 1000000),
    'C': np.random.randint(-1, 20, 1000000),
    # The five labels repeat in a fixed cycle to give 1,000,000 values.
    'D': ['US', 'GB', 'FR', 'CN', 'missing'] * 200000}
)
y = create_y(X)
# One-hot encode D, as for the smaller frame.
X_processed = pd.get_dummies(X)

In [10]:
# Profile the Precision-based generator on the 1M-row synthetic data; this configuration
# additionally sets n_points and ratio_window.
# NOTE(review): the saved output for this cell is a NameError for cProfile, i.e. it was
# last executed in a kernel where the import cell had not been run. The .dat file read
# by the next cell must come from an earlier successful run - re-execute after imports.
filename = 'rule_gen_opt_profile_opt_1m_records'
precision = opt_funcs.Precision()
rg = RuleGeneratorOpt(opt_func=precision.fit, n_total_conditions=4, num_rules_keep=50, n_points=30, ratio_window=1)
cProfile.run('rg.fit(X_processed, y)', sort='cumtime', filename=f'{filename}.dat')

NameError: name 'cProfile' is not defined

In [66]:
# Print the 1M-row profile ordered by cumulative time.
p = pstats.Stats(f'{filename}.dat')
p.sort_stats('cumtime').print_stats()

Thu Jan  7 12:47:02 2021    rule_gen_opt_profile_opt_1m_records.dat

         975560 function calls (965138 primitive calls) in 3.751 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    3.826    3.826 {built-in method builtins.exec}
        1    0.007    0.007    3.826    3.826 <string>:1(<module>)
        1    0.010    0.010    3.819    3.819 /Users/jlaidler/Documents/tigress/tigress/argo/argo/rule_generation/rule_generation/rule_generator_opt.py:72(fit)
  175/140    0.387    0.002    1.683    0.012 /Users/jlaidler/Documents/tigress/tigress/argo/argo/argo_utils/argo_utils/argo_utils.py:141(return_binary_pred_perf_of_set_numpy)
        1    0.004    0.004    1.624    1.624 /Users/jlaidler/Documents/tigress/tigress/argo/argo/rule_generation/rule_generation/rule_generator_opt.py:129(_generate_numeric_one_condition_rules)
        1    0.038    0.038    1.482    1.482 /Users/jlaidler/Documents/tigre

<pstats.Stats at 0x11fcc0ac8>

In [67]:
# Fit once more outside the profiler so the returned rule frame is displayed.
rg.fit(X_processed, y)

Unnamed: 0,RGO_Rule39,RGO_Rule69,RGO_Rule66,RGO_Rule67,RGO_Rule68
0,1,0,0,0,1
1,0,0,1,0,0
2,1,0,0,1,0
3,0,1,1,1,1
4,0,0,0,0,0
...,...,...,...,...,...
999995,0,0,0,0,1
999996,0,0,1,0,0
999997,0,0,0,1,0
999998,0,1,1,1,1


In [68]:
# Show the per-rule summary held on the fitted generator
# (rule logic plus the performance columns visible in the saved output).
rg.rule_descriptions

Unnamed: 0,Logic,Precision,Recall,nConditions,PercDataFlagged,OptMetric
RGO_Rule39,(X['C']<=0),1.0,0.326694,1,0.09483,1.0
RGO_Rule69,(X['D_FR']==False)&(X['D_GB']==False)&(X['D_US...,1.0,0.689009,4,0.2,1.0
RGO_Rule66,(X['D_FR']==False)&(X['D_US']==False)&(X['D_mi...,0.556688,0.767125,3,0.4,0.556688
RGO_Rule67,(X['D_GB']==False)&(X['D_US']==False)&(X['D_mi...,0.55646,0.766812,3,0.4,0.55646
RGO_Rule68,(X['D_FR']==False)&(X['D_GB']==False)&(X['D_mi...,0.556373,0.766691,3,0.4,0.556373


# Run on Omnyex data

In [4]:
# om = pd.read_pickle('/Users/jlaidler/Downloads/omnyex_data.pkl')

# om.shape

# om.drop(om.filter(regex='sim_').columns, axis=1, inplace=True)

# aggs = [col for col in om.columns if '_per_' in col]
# cats = [col for col in om.columns if 'order_' in col]
# cats.remove('order_no')
# cats.remove('order_year')

# X = om[aggs + cats]
# y = om['chargeback']

# X.loc[:, aggs] = X.loc[:, aggs].fillna(-100)
# X.loc[:, cats] = X.loc[:, cats].fillna('missing')

# X = pd.get_dummies(X)

# X.shape

# X = pd.concat([X.filter(regex='order_').iloc[:, :25], X.filter(regex='_per_').iloc[:, :25]], axis=1)

In [31]:
# pd.concat([X, y], axis=1).to_pickle('~/Downloads/omnyex_processed_for_profiling.pkl')

In [30]:
# Load the pre-processed Omnyex frame saved by the (commented-out) to_pickle cell above.
# NOTE(review): this path lives in the author's home directory, so the cell is not
# reproducible elsewhere. Unpickling can execute arbitrary code - only load trusted files.
om = pd.read_pickle('~/Downloads/omnyex_processed_for_profiling.pkl')
# Split into features and the 'chargeback' target column.
X = om.drop('chargeback', axis=1)
y = om['chargeback']

# Original

In [13]:
# Restrict profiling to the first 1,000,000 rows of the features and target.
X_subset = X[:1000000]
y_subset = y[:1000000]

In [216]:
# Only the filename is set here: the profiling run itself is commented out, so the next
# cell reads a .dat file produced by an earlier execution of the lines below.
filename = 'rule_gen_opt_profile_orig_omnyex_1m_records'
# precision = opt_funcs.Precision()
# rg = RuleGeneratorOpt(opt_func=precision.fit, n_total_conditions=4, num_rules_keep=50, n_points=10, ratio_window=2, remove_corr_rules=False)
# cProfile.run('rg.fit(X_subset, y_subset)', sort='cumtime', filename=f'{filename}.dat')

In [217]:
# Print the previously saved profile of the original implementation, ordered by cumulative time.
p = pstats.Stats(f'{filename}.dat')
p.sort_stats('cumtime').print_stats()

Thu Jan  7 17:05:50 2021    rule_gen_opt_profile_orig_omnyex_1m_records.dat

         11626529 function calls (11511028 primitive calls) in 506.892 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.001    0.001  507.205  507.205 {built-in method builtins.exec}
        1    0.228    0.228  507.204  507.204 <string>:1(<module>)
        1    0.146    0.146  506.976  506.976 /Users/jlaidler/Documents/tigress/tigress/argo/argo/rule_generation/rule_generation/rule_generator_opt.py:72(fit)
        1    1.770    1.770  494.517  494.517 /Users/jlaidler/Documents/tigress/tigress/argo/argo/rule_generation/rule_generation/rule_generator_opt.py:263(_generate_n_order_pairwise_rules)
        2    1.710    0.855  237.035  118.517 /Users/jlaidler/Documents/tigress/tigress/argo/argo/rule_generation/rule_generation/rule_generator_opt.py:210(_generate_pairwise_rules)
        2    1.538    0.769  194.393   97.196 /Users/jlaidler/

<pstats.Stats at 0x12edf5d30>

# Optimised

## 1m records

In [32]:
# Profile the optimised implementation on the 1M-row Omnyex subset, with the same
# generator arguments as the commented-out original run, so the two profiles are comparable.
filename = 'rule_gen_opt_profile_opt_omnyex_1m_records'
precision = opt_funcs.Precision()
rg = RuleGeneratorOpt(opt_func=precision.fit, n_total_conditions=4, num_rules_keep=50, n_points=10, ratio_window=2, remove_corr_rules=False)
cProfile.run('rg.fit(X_subset, y_subset)', sort='cumtime', filename=f'{filename}.dat')

In [33]:
# rg = RuleGeneratorOpt(opt_func=precision.fit, n_total_conditions=4, num_rules_keep=50, n_points=10, ratio_window=2, remove_corr_rules=False)
# rg.fit(X_subset, y_subset)

In [34]:
# Print the optimised 1M-row profile ordered by cumulative time.
p = pstats.Stats(f'{filename}.dat')
p.sort_stats('cumtime').print_stats()

Fri Jan  8 09:33:08 2021    rule_gen_opt_profile_opt_omnyex_1m_records.dat

         12558481 function calls (12438636 primitive calls) in 402.002 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.001    0.001  402.523  402.523 {built-in method builtins.exec}
        1    0.136    0.136  402.309  402.309 /Users/jlaidler/Documents/tigress/tigress/argo/argo/rule_generation/rule_generation/rule_generator_opt.py:72(fit)
        1    1.997    1.997  389.182  389.182 /Users/jlaidler/Documents/tigress/tigress/argo/argo/rule_generation/rule_generation/rule_generator_opt.py:257(_generate_n_order_pairwise_rules)
        2    1.728    0.864  217.607  108.803 /Users/jlaidler/Documents/tigress/tigress/argo/argo/rule_generation/rule_generation/rule_generator_opt.py:204(_generate_pairwise_rules)
 2836/578   50.467    0.018  135.309    0.234 /Users/jlaidler/Documents/tigress/tigress/argo/argo/argo_utils/argo_utils/argo_utils

<pstats.Stats at 0x12857efd0>

In [25]:
# Time converting the feature frame to a NumPy array and taking the per-column standard deviation.
%timeit X.to_numpy().std(axis=0)

974 ms ± 33.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## All records

In [51]:
# Use every record: the "subset" names now alias the full X and y (no copy is made).
X_subset = X
y_subset = y

In [52]:
# Profile a fit on all Omnyex records. The arguments differ from the 1M-row runs
# (num_rules_keep=10, n_points=20, remove_corr_rules=True), so timings are not directly comparable.
# NOTE(review): this cell sits under the "Optimised" heading but the filename says 'orig' -
# looks like a copy-paste leftover; confirm and rename to '..._opt_omnyex_all_records'.
filename = 'rule_gen_opt_profile_orig_omnyex_all_records'
precision = opt_funcs.Precision()
rg = RuleGeneratorOpt(opt_func=precision.fit, n_total_conditions=4, num_rules_keep=10, n_points=20, ratio_window=2, remove_corr_rules=True)
cProfile.run('rg.fit(X_subset, y_subset)', sort='cumtime', filename=f'{filename}.dat')

In [53]:
# Print the all-records profile ordered by cumulative time.
p = pstats.Stats(f'{filename}.dat')
p.sort_stats('cumtime').print_stats()

Fri Jan  8 14:23:22 2021    rule_gen_opt_profile_orig_omnyex_all_records.dat

         5796885 function calls (5742090 primitive calls) in 39.366 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000   40.464   40.464 {built-in method builtins.exec}
        1    0.182    0.182   40.389   40.389 /Users/jlaidler/Documents/tigress/tigress/argo/argo/rule_generation/rule_generation/rule_generator_opt.py:72(fit)
        1    0.054    0.054   16.424   16.424 /Users/jlaidler/Documents/tigress/tigress/argo/argo/rule_generation/rule_generation/rule_generator_opt.py:129(_generate_numeric_one_condition_rules)
        1    0.184    0.184   14.802   14.802 /Users/jlaidler/Documents/tigress/tigress/argo/argo/rule_generation/rule_generation/rule_generator_opt.py:257(_generate_n_order_pairwise_rules)
1260/1078    4.399    0.003   14.669    0.014 /Users/jlaidler/Documents/tigress/tigress/argo/argo/argo_utils/argo_util

<pstats.Stats at 0x121466470>

# Profile return_binary_pred_perf_of_set_numpy

In [63]:
from argo_utils.argo_utils import return_binary_pred_perf_of_set_numpy 

In [64]:
# Profile the helper on the full data: alias the complete X and y again.
X_subset = X
y_subset = y

In [65]:
# rg = RuleGeneratorOpt(opt_func=precision.fit, n_total_conditions=4, num_rules_keep=10, n_points=30, ratio_window=2, remove_corr_rules=True)
# X_rules = rg.fit(X_subset, y_subset)

In [66]:
# rg.rule_descriptions

In [67]:
# Boolean frame (True where a feature value is >= 1), passed as `y_preds` in the
# profiling calls below as a stand-in for real rule predictions.
preds = X_subset >= 1 

## With opt_func given

In [None]:
# Profile return_binary_pred_perf_of_set_numpy with an explicit opt_func (precision.fit).
# Relies on `precision` still being bound from an earlier cell.
filename = 'return_binary_pred_perf_of_set_numpy_opt_func_given'
cProfile.run('return_binary_pred_perf_of_set_numpy(y_true=y_subset, y_preds=preds, y_preds_columns=preds.columns, opt_func=precision.fit)', sort='cumtime', filename=f'{filename}.dat')
# cProfile.run('return_binary_pred_perf_of_set_numpy(y_true=y_subset, y_preds=X_rules, y_preds_columns=X_rules.columns)', sort='cumtime', filename=f'{filename}.dat')

In [73]:
# Print the opt_func-given profile ordered by cumulative time.
p = pstats.Stats(f'{filename}.dat')
p.sort_stats('cumtime').print_stats()

Fri Jan  8 17:20:36 2021    return_binary_pred_perf_of_set_numpy_opt_func_given.dat

         127253 function calls (125618 primitive calls) in 2.263 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    2.263    2.263 {built-in method builtins.exec}
        1    0.098    0.098    2.263    2.263 <string>:1(<module>)
     51/1    0.778    0.015    2.165    2.165 /Users/jlaidler/Documents/tigress/tigress/argo/argo/argo_utils/argo_utils/argo_utils.py:141(return_binary_pred_perf_of_set_numpy)
    675/8    0.058    0.000    1.185    0.148 {built-in method numpy.core._multiarray_umath.implement_array_function}
        1    0.000    0.000    0.717    0.717 <__array_function__ internals>:2(apply_along_axis)
        1    0.001    0.001    0.717    0.717 /Users/jlaidler/venvs/argo/lib/python3.7/site-packages/numpy/lib/shape_base.py:267(apply_along_axis)
       50    0.000    0.000    0.715    0.014 /Users/j

<pstats.Stats at 0x128247400>

## Without opt_func

In [74]:
# Profile the same helper call with opt_func omitted, to compare against the run above.
filename = 'return_binary_pred_perf_of_set_numpy_opt_func_omitted'
# cProfile.run('return_binary_pred_perf_of_set_numpy(y_true=y_subset, y_preds=preds, y_preds_columns=preds.columns, opt_func=precision.fit)', sort='cumtime', filename=f'{filename}.dat')
cProfile.run('return_binary_pred_perf_of_set_numpy(y_true=y_subset, y_preds=preds, y_preds_columns=preds.columns)', sort='cumtime', filename=f'{filename}.dat')

In [75]:
# Print the opt_func-omitted profile ordered by cumulative time.
p = pstats.Stats(f'{filename}.dat')
p.sort_stats('cumtime').print_stats()

Fri Jan  8 17:21:46 2021    return_binary_pred_perf_of_set_numpy_opt_func_omitted.dat

         1543 function calls (1523 primitive calls) in 1.470 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    1.470    1.470 {built-in method builtins.exec}
        1    0.114    0.114    1.470    1.470 <string>:1(<module>)
        1    0.583    0.583    1.356    1.356 /Users/jlaidler/Documents/tigress/tigress/argo/argo/argo_utils/argo_utils/argo_utils.py:141(return_binary_pred_perf_of_set_numpy)
     14/4    0.000    0.000    0.434    0.108 {built-in method numpy.core._multiarray_umath.implement_array_function}
        1    0.000    0.000    0.433    0.433 <__array_function__ internals>:2(tile)
        1    0.000    0.000    0.433    0.433 /Users/jlaidler/venvs/argo/lib/python3.7/site-packages/numpy/lib/shape_base.py:1171(tile)
        1    0.433    0.433    0.433    0.433 {method 'repeat' of 'numpy.ndarra

<pstats.Stats at 0x1213a8400>

In [65]:
# %timeit np.asarray(y_preds)

In [66]:
# %timeit test.to_numpy()

In [105]:
# Convert the predictions and target to NumPy arrays for the broadcasting experiments below.
# NOTE(review): `y_preds` and `y_true` are not defined anywhere in the visible notebook -
# presumably leftovers from a deleted cell or an earlier kernel session. The shapes shown
# further down (200000 x 1565) do not match `preds`, so confirm which data was intended.
y_preds_np = y_preds.to_numpy()
y_true_np = y_true.to_numpy()

In [106]:
%timeit y_true_rs = np.reshape(np.repeat(y_true_np, y_preds_np.shape[1]), (y_preds_np.shape[0], y_preds_np.shape[1]))   

11.3 s ± 406 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [108]:
%timeit y_true_tiled = np.tile(y_true_np, (y_preds_np.shape[1], 1)).T

8 s ± 211 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [101]:
y_true_np.shape

(200000,)

In [91]:
y_true_tiled.shape

(200000, 1565)

In [92]:
y_true_rs.shape

(200000, 1565)

In [95]:
# Fraction of matching elements between the two constructions; 1.0 means they are identical.
(y_true_tiled == y_true_rs).mean()

1.0

In [73]:
# Sum of the target vector, used as the reference for the column sums checked next.
np.sum(y_true_np)

439

In [76]:
# Every column of the expanded target should sum to the same value as y_true_np.
np.sum(y_true_rs, axis=0)

array([439, 439, 439, ..., 439, 439, 439])