In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import random
import pytest
from sklearn.ensemble import RandomForestClassifier
from rule_optimisation.optimisation_functions import FScore
from rule_application.argo_rule_applier import ArgoRuleApplier
from sklearn.metrics import precision_score, recall_score


def create_data():
    def return_random_num(y, fraud_min, fraud_max, nonfraud_min, nonfraud_max, rand_func):
        data = [rand_func(fraud_min, fraud_max) if i == 1 else rand_func(
            nonfraud_min, nonfraud_max) for i in y]
        return data

    random.seed(0)
    np.random.seed(0)
    y = pd.Series(data=[0]*980 + [1]*20, index=list(range(0, 1000)))
    X = pd.DataFrame(data={
        "num_distinct_txn_per_email_1day": [round(max(i, 0)) for i in return_random_num(y, 2, 1, 1, 2, np.random.normal)],
        "num_distinct_txn_per_email_7day": [round(max(i, 0)) for i in return_random_num(y, 4, 2, 2, 3, np.random.normal)],
        "ip_country_us": [round(min(i, 1)) for i in [max(i, 0) for i in return_random_num(y, 0.3, 0.4, 0.5, 0.5, np.random.normal)]],
        "email_kb_distance": [min(i, 1) for i in [max(i, 0) for i in return_random_num(y, 0.2, 0.5, 0.6, 0.4, np.random.normal)]],
        "email_alpharatio":  [min(i, 1) for i in [max(i, 0) for i in return_random_num(y, 0.33, 0.1, 0.5, 0.2, np.random.normal)]],
    },
        index=list(range(0, 1000))
    )
    X = X.astype(float)
    weights = y.apply(lambda x: 1000 if x == 1 else 1)
    return [X, y, weights]


def return_dummy_rules():
    rules = {
        'Rule1': "X['num_distinct_txn_per_email_7day']>=7",
        'Rule2': "X['email_alpharatio']<=0.5",
        'Rule3': "X['num_distinct_txn_per_email_1day']>=1",
        'Rule4': "X['email_kb_distance']<=0.5",
        'Rule5': "X['ip_country_us']==False",
        'Rule6': "X['num_distinct_txn_per_email_1day']<=3",
        'Rule7': "X['num_distinct_txn_per_email_7day']<=5",
        'Rule8': "X['email_kb_distance']>=0.61",
        'Rule9': "X['email_alpharatio']>=0.5"
    }
    return rules


def return_dummy_rules_np():
    rules = {
        'Rule1': "(X['num_distinct_txn_per_email_7day'].to_numpy(na_value=np.nan)>=7.0)",
        'Rule2': "(X['email_alpharatio'].to_numpy(na_value=np.nan)<=0.5)",
        'Rule3': "(X['num_distinct_txn_per_email_1day'].to_numpy(na_value=np.nan)>=1.0)",
        'Rule4': "(X['email_kb_distance'].to_numpy(na_value=np.nan)<=0.5)",
        'Rule5': "(X['ip_country_us'].to_numpy(na_value=np.nan)==False)",
        'Rule6': "(X['num_distinct_txn_per_email_1day'].to_numpy(na_value=np.nan)<=3.0)",
        'Rule7': "(X['num_distinct_txn_per_email_7day'].to_numpy(na_value=np.nan)<=5.0)",
        'Rule8': "(X['email_kb_distance'].to_numpy(na_value=np.nan)>=0.61)",
        'Rule9': "(X['email_alpharatio'].to_numpy(na_value=np.nan)>=0.5)"
    }
    return rules


def fs_instantiated():
    fs = FScore(1)
    return fs


def ara_instantiated(return_dummy_rules, fs_instantiated):
    fs = fs_instantiated
    rules = return_dummy_rules
    ara = ArgoRuleApplier(rules, fs.fit)
    return ara


def ara_instantiated_np(return_dummy_rules_np, fs_instantiated):
    fs = fs_instantiated
    rules = return_dummy_rules_np
    ara = ArgoRuleApplier(rules, fs.fit)
    return ara

In [70]:
X, y, weights = create_data()

In [71]:
f1 = fs_instantiated()

In [72]:
rule_strings = return_dummy_rules()

In [73]:
precision_score(y_true=y, y_pred=X['email_alpharatio']<=0.5)

0.03578528827037773

In [74]:
from rule_optimisation.optimisation_functions import AlertsPerDay

In [80]:
apd = AlertsPerDay(n_alerts_expected_per_day=5, no_of_days_in_file=10)

In [84]:
ara = ArgoRuleApplier(rule_strings=rule_strings, opt_func=f1.fit)#apd.fit)

In [85]:
X_rules = ara.apply(X, y, weights)

In [87]:
ara.rule_descriptions.index

Index(['Rule3', 'Rule6', 'Rule2', 'Rule5', 'Rule7', 'Rule4', 'Rule8', 'Rule1',
       'Rule9'],
      dtype='object', name='Rule')

In [54]:
# # X_rules_ = pd.DataFrame(columns = [f'Rule{i}' for i in range(1, 11)])
# for rule_name, rule_string in rule_strings.items():
#     print(rule_string)
#     X_rules_[rule_name] = eval(rule_string)

In [55]:
# y_preds = X_rules_
# opt_func = apd.fit

In [56]:
X_rules.sum().to_dict()

{'Rule6': 902,
 'Rule7': 882,
 'Rule3': 584,
 'Rule5': 529,
 'Rule2': 503,
 'Rule9': 497,
 'Rule8': 467,
 'Rule4': 424,
 'Rule1': 71}

In [57]:
from sklearn.metrics import fbeta_score

In [58]:
# for rule_name, row in ara.rule_descriptions.iterrows():
#     assert precision_score(y_true=y, y_pred=eval(row['Logic'])) == row['Precision']
#     assert recall_score(y_true=y, y_pred=eval(row['Logic'])) == row['Recall']
#     assert fbeta_score(y_true=y, y_pred=eval(row['Logic']), beta=1) == row['OptMetric']
#     assert eval(row['Logic']).mean() == row['PercDataFlagged']

In [61]:
for rule_name, row in ara.rule_descriptions.iterrows():
#     assert precision_score(y_true=y, y_pred=eval(row['Logic']), sample_weight=weights) == row['Precision']
#     assert recall_score(y_true=y, y_pred=eval(row['Logic']),sample_weight=weights) == row['Recall']
    assert apd.fit(y_pred=eval(row['Logic'])) == row['OptMetric']
    assert eval(row['Logic']).mean() == row['PercDataFlagged']

In [36]:
ara.rule_descriptions.values

array([[0.9725734292939117, 1.0, 0.584, 0.9860960457548565,
        "X['num_distinct_txn_per_email_1day']>=1", 1],
       [0.9577626664112633, 1.0, 0.902, 0.9784257130277384,
        "X['num_distinct_txn_per_email_1day']<=3", 1],
       [0.9737625101433595, 0.9, 0.503, 0.9354293880732754,
        "X['email_alpharatio']<=0.5", 1],
       [0.9707629054362723, 0.85, 0.529, 0.9063766261462998,
        "X['ip_country_us']==False", 1],
       [0.9515813042261405, 0.85, 0.882, 0.8979268453717154,
        "X['num_distinct_txn_per_email_7day']<=5", 1],
       [0.9715475364330326, 0.7, 0.424, 0.8137169427492007,
        "X['email_kb_distance']<=0.5", 1],
       [0.9286488159727596, 0.3, 0.467, 0.4534976002418654,
        "X['email_kb_distance']>=0.61", 1],
       [0.9778357235984355, 0.15, 0.071, 0.2601005722212589,
        "X['num_distinct_txn_per_email_7day']>=7", 1],
       [0.8016032064128257, 0.1, 0.497, 0.17781729273171817,
        "X['email_alpharatio']>=0.5", 1]], dtype=object)

In [24]:
test = pd.DataFrame(
        np.array([[0.03578528827037773, 0.9, 0.503, 0.06883365200764818,
                   "X['email_alpharatio']<=0.5", 1],
                  [0.03424657534246575, 1.0, 0.584, 0.06622516556291391,
                   "X['num_distinct_txn_per_email_1day']>=1", 1],
                  [0.04225352112676056, 0.15, 0.071, 0.06593406593406594,
                   "X['num_distinct_txn_per_email_7day']>=7", 1],
                  [0.0330188679245283, 0.7, 0.424, 0.06306306306306306,
                   "X['email_kb_distance']<=0.5", 1],
                  [0.03213610586011342, 0.85, 0.529, 0.06193078324225866,
                   "X['ip_country_us']==False", 1],
                  [0.022172949002217297, 1.0, 0.902, 0.04338394793926247,
                   "X['num_distinct_txn_per_email_1day']<=3", 1],
                  [0.01927437641723356, 0.85, 0.882, 0.037694013303769404,
                   "X['num_distinct_txn_per_email_7day']<=5", 1],
                  [0.01284796573875803, 0.3, 0.467, 0.024640657084188913,
                   "X['email_kb_distance']>=0.61", 1],
                  [0.004024144869215292, 0.1, 0.497, 0.007736943907156674,
                   "X['email_alpharatio']>=0.5", 1]], dtype=object),
        columns=['Precision', 'Recall', 'PercDataFlagged', 'OptMetric', 'Logic',
                 'nConditions'],
        index=['Rule2', 'Rule3', 'Rule1', 'Rule4', 'Rule5', 'Rule6', 'Rule7', 'Rule8',
               'Rule9'],
    )


In [25]:
test.index.name = 'Rule'

In [26]:
test

Unnamed: 0_level_0,Precision,Recall,PercDataFlagged,OptMetric,Logic,nConditions
Rule,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Rule2,0.0357853,0.9,0.503,0.0688337,X['email_alpharatio']<=0.5,1
Rule3,0.0342466,1.0,0.584,0.0662252,X['num_distinct_txn_per_email_1day']>=1,1
Rule1,0.0422535,0.15,0.071,0.0659341,X['num_distinct_txn_per_email_7day']>=7,1
Rule4,0.0330189,0.7,0.424,0.0630631,X['email_kb_distance']<=0.5,1
Rule5,0.0321361,0.85,0.529,0.0619308,X['ip_country_us']==False,1
Rule6,0.0221729,1.0,0.902,0.0433839,X['num_distinct_txn_per_email_1day']<=3,1
Rule7,0.0192744,0.85,0.882,0.037694,X['num_distinct_txn_per_email_7day']<=5,1
Rule8,0.012848,0.3,0.467,0.0246407,X['email_kb_distance']>=0.61,1
Rule9,0.00402414,0.1,0.497,0.00773694,X['email_alpharatio']>=0.5,1


# Unlabelled

In [95]:
import json
from rule_application.sim_rule_applier import SimRuleApplier
from rule_optimisation.optimisation_functions import AlertsPerDay

In [91]:
def create_data():
    np.random.seed(0)
    X = pd.DataFrame({
        'eid': list(range(0, 10)),
        'sim_ll': [
            json.dumps({'A': 10, 'B': -1}),
            json.dumps({'A': 10, 'C': -2}),
            json.dumps({'B': -1, 'D': -1}),
            json.dumps({'A': 10, 'B': -1}),
            json.dumps({'A': 10, 'D': -1}),
            json.dumps({'B': -1, 'E': 2}),
            json.dumps({'A': 10, 'B': -1, 'D': -1}),
            json.dumps({'A': 10, 'B': -1}),
            json.dumps({'A': 10, 'B': -1}),
            json.dumps({'A': 10, 'B': -1}),
        ]
    })
    X.set_index('eid', inplace=True)
    y = pd.Series(np.random.randint(0, 2, 10), list(
        range(0, 10)), name='sim_is_fraud')
    weights = y * 10
    return X, y, weights

def expected_results():
    X_rules = pd.DataFrame({
        "A": [1, 1, 0, 1, 1, 0, 1, 1, 1, 1],
        "B": [1, 0, 1, 1, 0, 1, 1, 1, 1, 1],
        "D": [0, 0, 1, 0, 1, 0, 1, 0, 0, 0],
        "C": [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        "E": [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]
    }
    )
    return X_rules

In [92]:
X, y, weights = create_data()

In [93]:
X_rules = expected_results()

In [96]:
apd = AlertsPerDay(n_alerts_expected_per_day=10, no_of_days_in_file=10)

In [121]:
sra = SimRuleApplier(opt_func=f1.fit)

In [122]:
sra.apply(X=X,y=y,sample_weight=weights)

Unnamed: 0,A,B,D,C,E
0,1,1,0,0,0
1,1,0,0,1,0
2,0,1,1,0,0
3,1,1,0,0,0
4,1,0,1,0,0
5,0,1,0,0,1
6,1,1,1,0,0
7,1,1,0,0,0
8,1,1,0,0,0
9,1,1,0,0,0


In [124]:
sra.rule_descriptions.values

array([[1.        , 0.75      , 0.8       , 0.85714286],
       [1.        , 0.75      , 0.8       , 0.85714286],
       [1.        , 0.375     , 0.3       , 0.54545455],
       [1.        , 0.125     , 0.1       , 0.22222222],
       [1.        , 0.125     , 0.1       , 0.22222222]])

In [110]:
rd, xr = sra._get_rule_descriptions_unlabelled(X_rules)

TypeError: fit() missing 1 required positional argument: 'y_true'

In [108]:
xr

Unnamed: 0,A,B,D,C,E
0,1,1,0,0,0
1,1,0,0,1,0
2,0,1,1,0,0
3,1,1,0,0,0
4,1,0,1,0,0
5,0,1,0,0,1
6,1,1,1,0,0
7,1,1,0,0,0
8,1,1,0,0,0
9,1,1,0,0,0
