In [1]:
# Enable the autoreload extension so that edits to imported local modules are
# picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import random
import numpy as np
import pandas as pd

import argo_utils.argo_utils as argo_utils
import math

In [3]:
def return_random_num(y, fraud_min, fraud_max, nonfraud_min, nonfraud_max, rand_func):
    """Draw one value per label in `y` using `rand_func`.

    Labels equal to 1 are drawn with ``rand_func(fraud_min, fraud_max)``;
    every other label is drawn with ``rand_func(nonfraud_min, nonfraud_max)``.
    Both arguments are passed positionally, so what they mean depends on the
    `rand_func` supplied by the caller.

    Returns a list holding one drawn value per element of `y`, in order.
    """
    draws = []
    for label in y:
        if label == 1:
            first_arg, second_arg = fraud_min, fraud_max
        else:
            first_arg, second_arg = nonfraud_min, nonfraud_max
        draws.append(rand_func(first_arg, second_arg))
    return draws

# Seed both random number generators so the synthetic dataset is reproducible.
random.seed(0)
np.random.seed(0)

# Binary labels: 980 rows labelled 0 followed by 20 rows labelled 1.
y = pd.Series(
    data=[0] * 980 + [1] * 20,
    index=list(range(0, 1000)),
)

# Synthetic features. Every column is drawn through return_random_num with
# np.random.normal, using one argument pair for rows labelled 1 and another for
# the remaining rows. Columns are built in dict order, which fixes the order in
# which the seeded generator is consumed.
X = pd.DataFrame(
    data={
        # Counts: floor negative draws at 0, then round to a whole number.
        "num_distinct_txn_per_email_1day": [
            round(max(draw, 0))
            for draw in return_random_num(y, 2, 1, 1, 2, np.random.normal)
        ],
        "num_distinct_txn_per_email_7day": [
            round(max(draw, 0))
            for draw in return_random_num(y, 4, 2, 2, 3, np.random.normal)
        ],
        # Flag: clip the draw to [0, 1], then round.
        "ip_country_us": [
            round(min(max(draw, 0), 1))
            for draw in return_random_num(y, 0.3, 0.4, 0.5, 0.5, np.random.normal)
        ],
        # Ratios: clip the draw to [0, 1] and keep it unrounded.
        "email_kb_distance": [
            min(max(draw, 0), 1)
            for draw in return_random_num(y, 0.2, 0.5, 0.6, 0.4, np.random.normal)
        ],
        "email_alpharatio": [
            min(max(draw, 0), 1)
            for draw in return_random_num(y, 0.33, 0.1, 0.5, 0.2, np.random.normal)
        ],
    },
    index=list(range(0, 1000)),
)

# Hand-maintained groupings of the feature columns.
columns_int = [
    'num_distinct_txn_per_email_1day',
    'num_distinct_txn_per_email_7day',
    'ip_country_us',
]
columns_cat = ['ip_country_us']
columns_num = [
    'num_distinct_txn_per_email_1day',
    'num_distinct_txn_per_email_7day',
    'email_kb_distance',
    'email_alpharatio',
]

# Per-row weights: 1000 for rows labelled 1, 1 for every other row.
weights = y.apply(lambda label: 1000 if label == 1 else 1)

In [4]:
# Summary statistics for every column of the synthetic feature frame.
X.describe()

Unnamed: 0,num_distinct_txn_per_email_1day,num_distinct_txn_per_email_7day,ip_country_us,email_kb_distance,email_alpharatio
count,1000.0,1000.0,1000.0,1000.0,1000.0
mean,1.34,2.491,0.471,0.563383,0.50184
std,1.494869,2.354881,0.499408,0.326518,0.197752
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.306039,0.366063
50%,1.0,2.0,0.0,0.585855,0.499346
75%,2.0,4.0,1.0,0.862649,0.638693
max,7.0,12.0,1.0,1.0,1.0


In [5]:
from rule_generation.rule_generator_opt import RuleGeneratorOpt
from rule_generation.rule_generator_dt import RuleGeneratorDT
from sklearn.ensemble import RandomForestClassifier
# from rule_generation.rule_generator_opt_old import RuleGeneratorOpt as RuleGeneratorOptOld
from rule_optimisation.optimisation_functions import FScore

In [44]:
# Build the optimisation metric object.
# NOTE(review): 0.5 is presumably the F-score beta — confirm against FScore.
f = FScore(0.5)
# Keep a reference to the bound `fit` method; it is supplied to the rule
# generator as `opt_func` in the parameter dict below.
f0dot5 = f.fit

In [56]:
# Keyword arguments that are unpacked into RuleGeneratorDT in the next cell.
params = dict(
    opt_func=f0dot5,
    n_total_conditions=4,
    tree_ensemble=RandomForestClassifier(n_estimators=10, random_state=0),
    precision_threshold=0,
    num_cores=4,
)

In [57]:
# Instantiate RuleGeneratorDT from the keyword arguments collected in `params`.
rg = RuleGeneratorDT(**params)

In [58]:
# Fit the generator on the synthetic features and labels, passing `weights` as
# the third argument, and keep the returned frame. The recorded run printed two
# shapes, (14, 6) and (1000, 14).
X_rules = rg.fit(X, y, weights)

(14, 6) (1000, 14)


In [59]:
# Inspect the per-rule summary table held by the fitted generator.
rg.rule_descriptions

Unnamed: 0_level_0,Logic,Precision,Recall,nConditions,PercDataFlagged,OptMetric
Rule,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
RGDT_Rule4,(X['email_alpharatio']<=0.57456)&(X['num_disti...,0.989707,1.0,4,0.228,0.991749
RGDT_Rule11,(X['email_alpharatio']<=0.52117)&(X['email_alp...,0.989583,0.95,4,0.219,0.981405
RGDT_Rule3,(X['email_alpharatio']<=0.57456)&(X['email_kb_...,0.988631,0.9,4,0.225,0.969535
RGDT_Rule12,(X['email_alpharatio']<=0.52125)&(X['ip_countr...,0.993687,0.85,4,0.125,0.961191
RGDT_Rule1,(X['email_alpharatio']<=0.52376)&(X['ip_countr...,0.989062,0.85,4,0.205,0.957725
RGDT_Rule0,(X['email_alpharatio']<=0.52123)&(X['num_disti...,0.990712,0.8,4,0.166,0.945626
RGDT_Rule5,(X['email_alpharatio']<=0.57495)&(X['email_alp...,0.992594,0.65,4,0.11,0.897939
RGDT_Rule7,(X['email_alpharatio']<=0.57456)&(X['email_kb_...,0.990854,0.65,4,0.133,0.896799
RGDT_Rule9,(X['email_kb_distance']<=0.23452)&(X['ip_count...,0.996236,0.45,4,0.043,0.801625
RGDT_Rule10,(X['email_kb_distance']>0.23452)&(X['ip_countr...,0.980152,0.4,4,0.17,0.759763


In [60]:
# Display the frame returned by fit: one row per record, one 0/1 column per rule.
X_rules

Unnamed: 0,RGDT_Rule4,RGDT_Rule11,RGDT_Rule3,RGDT_Rule12,RGDT_Rule1,RGDT_Rule0,RGDT_Rule5,RGDT_Rule7,RGDT_Rule9,RGDT_Rule10,RGDT_Rule8,RGDT_Rule6,RGDT_Rule2,RGDT_Rule13
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,1,0,1,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,0,1,1,1,1,1,0,1,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1,1,1,0,0,1,1,1,0,0,0,0,1,1
996,1,1,0,1,1,0,0,0,0,1,1,0,0,0
997,1,1,1,1,1,1,1,1,1,0,0,0,0,0
998,1,1,1,1,1,1,0,0,0,1,1,1,0,0


In [17]:
# Pass the summary table and the rule frame to the shared sorting helper
# (presumably orders both by OptMetric — confirm against argo_utils).
# NOTE(review): the recorded run of this cell raised
# "ValueError: cannot reindex from a duplicate axis".
argo_utils.sort_rule_dfs_by_opt_metric(rg.rule_descriptions, X_rules)

ValueError: cannot reindex from a duplicate axis

In [21]:
# Bind a shorter name to the generator's summary table. This is the same
# object, not a copy.
rule_descriptions = rg.rule_descriptions

In [22]:
# Order the summary table by OptMetric, highest first.
# `rule_descriptions` was bound to rg.rule_descriptions itself, so an in-place
# sort would silently reorder the fitted generator's own table. Sort into a
# new frame and rebind the local name instead; the generator stays untouched.
rule_descriptions = rule_descriptions.sort_values(
    by=['OptMetric'], ascending=False)

In [33]:
# Display the summary table after sorting by OptMetric.
rule_descriptions

Unnamed: 0_level_0,Logic,Precision,Recall,nConditions,PercDataFlagged,OptMetric
Rule,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
RGDT_Rule5,(X['email_alpharatio']<=0.57456)&(X['num_disti...,0.989707,1.0,4,0.228,0.991749
RGDT_Rule12,(X['email_alpharatio']<=0.52117)&(X['email_alp...,0.989583,0.95,4,0.219,0.981405
RGDT_Rule3,(X['email_alpharatio']<=0.57456)&(X['email_kb_...,0.988631,0.9,4,0.225,0.969535
RGDT_Rule13,(X['email_alpharatio']<=0.52125)&(X['ip_countr...,0.993687,0.85,4,0.125,0.961191
RGDT_Rule1,(X['email_alpharatio']<=0.52376)&(X['ip_countr...,0.989062,0.85,4,0.205,0.957725
RGDT_Rule0,(X['email_alpharatio']<=0.52123)&(X['num_disti...,0.990712,0.8,4,0.166,0.945626
RGDT_Rule6,(X['email_alpharatio']<=0.57495)&(X['email_alp...,0.992594,0.65,4,0.11,0.897939
RGDT_Rule8,(X['email_alpharatio']<=0.57456)&(X['email_kb_...,0.990854,0.65,4,0.133,0.896799
RGDT_Rule10,(X['email_kb_distance']<=0.23452)&(X['ip_count...,0.996236,0.45,4,0.043,0.801625
RGDT_Rule11,(X['email_kb_distance']>0.23452)&(X['ip_countr...,0.980152,0.4,4,0.17,0.759763


In [34]:
# Display the rule frame again.
# NOTE(review): the recorded header lists both 'RGDT_Rule5' and 'RGDT_Rule5.1',
# which suggests two columns share the label 'RGDT_Rule5' — TODO confirm.
X_rules

Unnamed: 0,RGDT_Rule0,RGDT_Rule1,RGDT_Rule2,RGDT_Rule3,RGDT_Rule5,RGDT_Rule5.1,RGDT_Rule6,RGDT_Rule7,RGDT_Rule8,RGDT_Rule9,RGDT_Rule10,RGDT_Rule11,RGDT_Rule12,RGDT_Rule13,RGDT_Rule14
0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
2,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,1,1,0,1,0,0,0,0,1,0,1,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1,0,1,1,1,1,1,0,1,0,0,0,1,0,1
996,0,1,0,0,1,1,0,0,0,1,0,1,1,1,0
997,1,1,0,1,1,1,1,0,1,0,1,0,1,1,0
998,1,1,0,1,1,1,0,1,0,1,0,1,1,1,0


In [30]:
# Rule names from the summary table's index that have no matching column in
# X_rules. An empty list means every described rule has a column.
list(filter(lambda name: name not in X_rules.columns,
            rule_descriptions.index.tolist()))

[]

In [27]:
# Reorder X_rules' columns (axis=1) to follow the summary table's index order.
# NOTE(review): the recorded run of this cell raised
# "ValueError: cannot reindex from a duplicate axis".
X_rules.reindex(rule_descriptions.index.tolist(), axis=1)

ValueError: cannot reindex from a duplicate axis

In [45]:
# rg.rule_descriptions

In [46]:
# Re-bind the short name to the generator's summary table. This is the same
# object, not a copy.
rule_descriptions = rg.rule_descriptions

In [47]:
# Bind `self` to the generator so that the expression below, which calls
# self._generate_rule_name(), can be run directly at notebook level.
self = rg

In [49]:
# Map every current index label of the summary table to a freshly generated
# rule name. The name generator is called once per label.
logic_to_name_dict = {
    rule_logic: self._generate_rule_name()
    for rule_logic in rule_descriptions.index
}

In [51]:
# Relabel the summary table's index using the mapping built above.
# `rule_descriptions` is the very same object as rg.rule_descriptions, so an
# in-place rename would silently relabel the fitted generator's own table.
# Rename into a new frame and rebind the local name instead.
rule_descriptions = rule_descriptions.rename(index=logic_to_name_dict)

In [54]:
# Display the summary table with its relabelled index.
rule_descriptions

Unnamed: 0_level_0,Logic,Precision,Recall,nConditions,PercDataFlagged,OptMetric
Rule,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ARGORule40,(X['email_alpharatio']<=0.37964)&(X['email_kb_...,0.5,0.25,4,0.01,0.416667
ARGORule41,(X['email_kb_distance']<=0.00092)&(X['num_dist...,0.5,0.2,4,0.008,0.384615
ARGORule42,(X['email_alpharatio']<=0.43844)&(X['email_kb_...,0.296296,0.4,4,0.027,0.3125
ARGORule43,(X['email_alpharatio']<=0.42545)&(X['email_alp...,0.5,0.1,4,0.004,0.277778
ARGORule44,(X['email_alpharatio']<=0.43888)&(X['email_kb_...,0.4,0.1,4,0.005,0.25
ARGORule45,(X['email_alpharatio']<=0.52347)&(X['email_alp...,0.4,0.1,4,0.005,0.25
ARGORule46,(X['email_kb_distance']<=0.00404)&(X['num_dist...,0.21875,0.35,2,0.032,0.236486
ARGORule47,(X['email_alpharatio']<=0.42318)&(X['email_kb_...,0.2,0.35,3,0.035,0.21875
ARGORule48,(X['email_kb_distance']<=0.18498)&(X['email_kb...,1.0,0.05,3,0.001,0.208333
ARGORule49,(X['email_alpharatio']<=0.43844)&(X['email_alp...,1.0,0.05,4,0.001,0.208333


In [53]:
# Apply the same label mapping to X_rules' column labels, modifying the frame
# in place.
X_rules.rename(columns=logic_to_name_dict, inplace=True)

In [21]:
import argo_utils.argo_utils as argo_utils

In [22]:
# Ask the helper to classify X's columns. The recorded result is a tuple of
# three lists of column names: the first two match the hand-written
# columns_int and columns_cat, while the third holds only 'email_kb_distance'
# and 'email_alpharatio', unlike the hand-written columns_num.
# NOTE(review): presumably (int, categorical, float) — confirm against argo_utils.
argo_utils.return_columns_types(X)

(['num_distinct_txn_per_email_1day',
  'num_distinct_txn_per_email_7day',
  'ip_country_us'],
 ['ip_country_us'],
 ['email_kb_distance', 'email_alpharatio'])