In [2]:
# Enable IPython's autoreload extension in mode 2 so that imported modules are
# reloaded before each cell executes (edits to the library are picked up
# without restarting the kernel).
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import numpy as np
from correlation_reduction.correlation_reduction_methods import AgglomerativeClusteringFeatureReduction
from correlation_reduction.similarity_functions import CosineSimilarity, JaccardSimilarity
import cProfile, pstats

In [4]:
# Load the pickled dataset, then separate the 'chargeback' column (target)
# from every other column (features).
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
om = pd.read_pickle('~/Downloads/omnyex_processed_for_profiling.pkl')
y = om['chargeback']
X = om.drop(columns='chargeback')

In [5]:
# Cast every feature column to 64-bit float (what the builtin `float` maps to).
X = X.astype(np.float64)

In [6]:
# Binarise the features: 1 where the value is >= 1, otherwise 0.
X_rules = X.ge(1).astype(int)

In [7]:
# Display the dimensions of the binarised matrix as (rows, columns).
X_rules.shape

(1780373, 50)

In [8]:
# One instance of each similarity measure; their bound `.fit` methods are
# passed as `similarity_function` in the cells below.
cs, js = CosineSimilarity(), JaccardSimilarity()

In [9]:
from argo_utils.argo_utils import return_binary_pred_perf_of_set_numpy

In [13]:
# Evaluate each binarised column against the target and keep only the
# 'Precision' entry of the helper's result; it is supplied below as
# `columns_performance`.
X_rules_precisions = return_binary_pred_perf_of_set_numpy(
    y_true=y,
    y_preds=X_rules,
    y_preds_columns=X_rules.columns,
)['Precision']

## Testing

In [20]:
# Smoke test: fit the reducer (top_down strategy, cosine similarity) and
# transform the binarised matrix in one call.
rs = AgglomerativeClusteringFeatureReduction(
    threshold=0.5,
    strategy='top_down',
    similarity_function=cs.fit,
    columns_performance=X_rules_precisions,
)
X_rules_decorr = rs.fit_transform(X_rules)

In [21]:
# Display the dimensions of the reduced matrix as (rows, columns).
X_rules_decorr.shape

(1780373, 25)

# top_down with CosineSimilarity

## Old

In [15]:
# Profile `rs.fit` for the top_down strategy with cosine similarity and dump
# the raw profile to '<filename>.dat', which the following cell loads.
# The base name carries no extension: '.dat' is appended where the file is
# written and read, so keeping it here as well produced '*.dat.dat'.
filename = 'top_down_CosineSimilarity_old'
rs = AgglomerativeClusteringFeatureReduction(
    threshold=0.5,
    strategy='top_down',
    similarity_function=cs.fit,
    columns_performance=X_rules_precisions,
)
# `sort` is omitted: cProfile.run ignores it when `filename` is given, and the
# ordering is chosen when the stats are loaded.
cProfile.run('rs.fit(X_rules)', filename=f'{filename}.dat')

In [16]:
# Load the dumped profile and list every entry ordered by cumulative time.
# sort_stats() reorders the Stats object in place, so the calls need not be chained.
p = pstats.Stats(f'{filename}.dat')
p.sort_stats('cumtime')
p.print_stats()

Mon Jan 11 15:34:05 2021    top_down_CosineSimilarity_old.dat.dat

         39614 function calls (38917 primitive calls) in 3.999 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    3.999    3.999 {built-in method builtins.exec}
        1    0.000    0.000    3.999    3.999 <string>:1(<module>)
        1    0.000    0.000    3.999    3.999 /Users/jlaidler/Documents/tigress/tigress/argo/argo/correlation_reduction/correlation_reduction/correlation_reduction_methods.py:34(fit)
        1    0.000    0.000    2.310    2.310 /Users/jlaidler/venvs/argo/lib/python3.7/site-packages/pandas/core/generic.py:11478(stat_func)
        1    0.000    0.000    2.310    2.310 /Users/jlaidler/venvs/argo/lib/python3.7/site-packages/pandas/core/frame.py:8531(_reduce)
        1    0.000    0.000    2.309    2.309 /Users/jlaidler/venvs/argo/lib/python3.7/site-packages/pandas/core/frame.py:8571(func)
        1    0.000 

<pstats.Stats at 0x13542f1d0>

~4 sec runtime (3.999 s measured) - pretty quick already

# top_down with JaccardSimilarity

## Old

In [22]:
# Profile `rs.fit` for the top_down strategy with Jaccard similarity and dump
# the raw profile to '<filename>.dat', which the following cell loads.
# The base name carries no extension: '.dat' is appended where the file is
# written and read, so keeping it here as well produced '*.dat.dat'.
filename = 'top_down_JaccardSimilarity_old'
rs = AgglomerativeClusteringFeatureReduction(
    threshold=0.5,
    strategy='top_down',
    similarity_function=js.fit,
    columns_performance=X_rules_precisions,
)
# `sort` is omitted: cProfile.run ignores it when `filename` is given, and the
# ordering is chosen when the stats are loaded.
cProfile.run('rs.fit(X_rules)', filename=f'{filename}.dat')

In [23]:
# Load the dumped profile and list every entry ordered by cumulative time.
# sort_stats() reorders the Stats object in place, so the calls need not be chained.
p = pstats.Stats(f'{filename}.dat')
p.sort_stats('cumtime')
p.print_stats()

Mon Jan 11 15:45:20 2021    top_down_JaccardSimilarity_old.dat.dat

         39788 function calls (39198 primitive calls) in 4.745 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    4.745    4.745 {built-in method builtins.exec}
        1    0.000    0.000    4.745    4.745 <string>:1(<module>)
        1    0.000    0.000    4.745    4.745 /Users/jlaidler/Documents/tigress/tigress/argo/argo/correlation_reduction/correlation_reduction/correlation_reduction_methods.py:34(fit)
        1    0.007    0.007    2.816    2.816 /Users/jlaidler/Documents/tigress/tigress/argo/argo/correlation_reduction/correlation_reduction/similarity_functions.py:41(fit)
    12/10    0.000    0.000    2.656    0.266 /Users/jlaidler/venvs/argo/lib/python3.7/site-packages/sklearn/utils/validation.py:59(inner_f)
        1    0.000    0.000    2.643    2.643 /Users/jlaidler/venvs/argo/lib/python3.7/site-packages/sklearn/metr

<pstats.Stats at 0x135b00470>

~5sec runtime - pretty quick already

# bottom_up with CosineSimilarity

## Old

In [24]:
# Profile `rs.fit` for the bottom_up strategy with cosine similarity and dump
# the raw profile to '<filename>.dat', which the following cell loads.
# The base name carries no extension: '.dat' is appended where the file is
# written and read, so keeping it here as well produced '*.dat.dat'.
filename = 'bottom_up_CosineSimilarity_old'
rs = AgglomerativeClusteringFeatureReduction(
    threshold=0.5,
    strategy='bottom_up',
    similarity_function=cs.fit,
    columns_performance=X_rules_precisions,
)
# `sort` is omitted: cProfile.run ignores it when `filename` is given, and the
# ordering is chosen when the stats are loaded.
cProfile.run('rs.fit(X_rules)', filename=f'{filename}.dat')

In [25]:
# Load the dumped profile and list every entry ordered by cumulative time.
# sort_stats() reorders the Stats object in place, so the calls need not be chained.
p = pstats.Stats(f'{filename}.dat')
p.sort_stats('cumtime')
p.print_stats()

Mon Jan 11 15:47:37 2021    bottom_up_CosineSimilarity_old.dat.dat

         135594 function calls (133498 primitive calls) in 3.945 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    3.945    3.945 {built-in method builtins.exec}
        1    0.000    0.000    3.945    3.945 <string>:1(<module>)
        1    0.000    0.000    3.945    3.945 /Users/jlaidler/Documents/tigress/tigress/argo/argo/correlation_reduction/correlation_reduction/correlation_reduction_methods.py:34(fit)
        1    0.000    0.000    2.189    2.189 /Users/jlaidler/venvs/argo/lib/python3.7/site-packages/pandas/core/generic.py:11478(stat_func)
        1    0.000    0.000    2.189    2.189 /Users/jlaidler/venvs/argo/lib/python3.7/site-packages/pandas/core/frame.py:8531(_reduce)
        1    0.000    0.000    2.189    2.189 /Users/jlaidler/venvs/argo/lib/python3.7/site-packages/pandas/core/frame.py:8571(func)
        1    0.0

<pstats.Stats at 0x135a6f550>

## New (replace `X.var()` with `X.values.var(axis=0)`)

In [40]:
# Profile `rs.fit` for the bottom_up strategy with cosine similarity and dump
# the raw profile to '<filename>.dat', which the following cell loads.
# NOTE(review): apart from the output name this cell runs the same code as the
# "Old" bottom_up/cosine cell; the difference being measured presumably lives
# in the library source picked up by autoreload - confirm before re-running.
# The base name carries no extension: '.dat' is appended where the file is
# written and read, so keeping it here as well produced '*.dat.dat'.
filename = 'bottom_up_CosineSimilarity_new'
rs = AgglomerativeClusteringFeatureReduction(
    threshold=0.5,
    strategy='bottom_up',
    similarity_function=cs.fit,
    columns_performance=X_rules_precisions,
)
# `sort` is omitted: cProfile.run ignores it when `filename` is given, and the
# ordering is chosen when the stats are loaded.
cProfile.run('rs.fit(X_rules)', filename=f'{filename}.dat')

In [41]:
# Load the dumped profile and list every entry ordered by cumulative time.
# sort_stats() reorders the Stats object in place, so the calls need not be chained.
p = pstats.Stats(f'{filename}.dat')
p.sort_stats('cumtime')
p.print_stats()

Mon Jan 11 15:53:58 2021    bottom_up_CosineSimilarity_new.dat.dat

         134375 function calls (132290 primitive calls) in 2.907 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    2.907    2.907 {built-in method builtins.exec}
        1    0.000    0.000    2.907    2.907 <string>:1(<module>)
        1    0.000    0.000    2.907    2.907 /Users/jlaidler/Documents/tigress/tigress/argo/argo/correlation_reduction/correlation_reduction/correlation_reduction_methods.py:34(fit)
    15/10    0.000    0.000    1.919    0.192 /Users/jlaidler/venvs/argo/lib/python3.7/site-packages/sklearn/utils/validation.py:59(inner_f)
        1    0.000    0.000    1.909    1.909 /Users/jlaidler/Documents/tigress/tigress/argo/argo/correlation_reduction/correlation_reduction/similarity_functions.py:17(fit)
        1    0.000    0.000    1.909    1.909 /Users/jlaidler/venvs/argo/lib/python3.7/site-packages/sklearn/me

<pstats.Stats at 0x1356f10f0>

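Shaved ~1 sec off (3.945 s → 2.907 s). Note: `DataFrame.var()` defaults to `ddof=1` while `ndarray.var()` defaults to `ddof=0`, so the two only return the same values if `ddof` is passed explicitly.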

# bottom_up with JaccardSimilarity

## Old

In [11]:
# Only the profile name is set here; the profiling run itself is commented out.
# The next cell therefore loads f'{filename}.dat' (i.e.
# 'bottom_up_JaccardSimilarity_old.dat.dat') from disk, so a profile written by
# an earlier run must already exist. Uncomment the two lines below to
# regenerate it.
filename = 'bottom_up_JaccardSimilarity_old.dat'
# rs = AgglomerativeClusteringFeatureReduction(threshold=0.5, strategy='bottom_up', similarity_function=js.fit, columns_performance=X_rules_precisions)
# cProfile.run('rs.fit(X_rules)', sort='cumtime', filename=f'{filename}.dat')

In [12]:
# Load the dumped profile and list every entry ordered by cumulative time.
# sort_stats() reorders the Stats object in place, so the calls need not be chained.
p = pstats.Stats(f'{filename}.dat')
p.sort_stats('cumtime')
p.print_stats()

Mon Jan 11 15:58:07 2021    bottom_up_JaccardSimilarity_old.dat.dat

         99324 function calls (97796 primitive calls) in 3.764 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    3.764    3.764 {built-in method builtins.exec}
        1    0.000    0.000    3.764    3.764 <string>:1(<module>)
        1    0.000    0.000    3.764    3.764 /Users/jlaidler/Documents/tigress/tigress/argo/argo/correlation_reduction/correlation_reduction/correlation_reduction_methods.py:34(fit)
        1    0.007    0.007    2.817    2.817 /Users/jlaidler/Documents/tigress/tigress/argo/argo/correlation_reduction/correlation_reduction/similarity_functions.py:41(fit)
      9/7    0.000    0.000    2.706    0.387 /Users/jlaidler/venvs/argo/lib/python3.7/site-packages/sklearn/utils/validation.py:59(inner_f)
        1    0.000    0.000    2.699    2.699 /Users/jlaidler/venvs/argo/lib/python3.7/site-packages/sklearn/met

<pstats.Stats at 0x133510eb8>