In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports all loaded modules before each
# cell runs, so edits to project source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

Rule optimisation - would need to convert string rules to some sort of f string (replacing values with those from BO)
Inputs: Rule logic (without values)
Outputs: Optimised rule

Rule applier (best via eval-string implementation)
Inputs: Rule logic
Outputs: X_rules, rule_descriptions

Rule filtering
Inputs: X_rules, y
Outputs: X_rules_filtered

Rule Generation
Inputs: X, y
Outputs: Rules logic

Rule scoring
Inputs: X_rules, y
Outputs: X_scores, Rule scores

Bayesian Opt inputs
* python string format of rule
* optimisation function

BO then does the following:
* Extracts features from python string format
* Calculates space to optimise for each feature
* For the optimisation function within the BO class, it:
    * Enters the value for each feature
    * Evaluates string
    * Calculates user defined optimisation metric (via provided function)

# Suggested parsers/formats

1) Start off with the raw JSON format (same as used in the system):

{
  "condition": "AND",
  "rules": [
    {
      "condition": "OR",
      "rules": [
        {
          "id": "payer_id.sum_approved_txn_amt_per_paypalid_1day",
          "field": "payer_id.sum_approved_txn_amt_per_paypalid_1day",
          "type": "double",
          "operator": "greater_or_equal",
          "value": "value.60"
        },
        {
          "id": "payer_id.sum_approved_txn_amt_per_paypalid_7day",
          "field": "payer_id.sum_approved_txn_amt_per_paypalid_7day",
          "type": "double",
          "operator": "greater_or_equal",
          "value": "value.120"
        },
        {
          "id": "payer_id.sum_approved_txn_amt_per_paypalid_30day",
          "field": "payer_id.sum_approved_txn_amt_per_paypalid_30day",
          "type": "double",
          "operator": "greater_or_equal",
          "value": "value.500"
        }
      ],
      "data": {}
    },
    {
      "id": "num_items",
      "field": "num_items",
      "type": "integer",
      "operator": "equal",
      "value": "value.1"
    },
    {
      "id": "total_num_items_ordered",
      "field": "total_num_items_ordered",
      "type": "integer",
      "operator": "greater_or_equal",
      "value": "value.2"
    }
  ]
}

2) Then clean up to ARGO standard "rule dictionary" format (easier to read and removes unnecessary k-v pairs):

{
  "condition": "&",
  "rules": [
    {
      "condition": "|",
      "rules": [
        {
          "field": "account_number_num_fraud_transactions_per_account_number_7day",
          "operator": ">=",
          "value": 60
        },
        {
          "field": "payer_id_sum_approved_txn_amt_per_paypalid_7day",
          "operator": ">=",
          "value": 120
        },
        {
          "field": "payer_id_sum_approved_txn_amt_per_paypalid_30day",
          "operator": ">=",
          "value": 500
        }
      ]
    },
    {    
      "field": "num_items",      
      "operator": "==",
      "value": 1
    },
    {    
      "field": "total_num_items_ordered",      
      "operator": ">=",
      "value": 2
    }
  ]
}

Can use the above format when creating rules using ArgoDT/ArgoOpt

3) Then have a function to convert to string format. Useful for:
* Applying rules to datasets (can just use the eval() function)
* More readable

In [3]:
# String form of the example rule, one condition per source line. Adjacent literals are
# joined by the parser, so the resulting value contains no newlines or extra spaces.
rule_str = (
    "((X['account_number_num_fraud_transactions_per_account_number_7day']>=60)|"
    "(X['payer_id_sum_approved_txn_amt_per_paypalid_7day']>=120)|"
    "(X['payer_id_sum_approved_txn_amt_per_paypalid_30day']>=500))&"
    "(X['num_items']==1)&"
    "(X['total_num_items_ordered']>=2)"
)

4) Have a separate function to convert to a lambda expression (used for Bayesian Optimisation):

In [4]:
# Template version of the rule: every threshold is a `{}` placeholder that is filled
# positionally, in order of appearance, from the arguments passed to the lambda.
rule_lambda = lambda *args: (
    "((X['account_number_num_fraud_transactions_per_account_number_7day']>={})|"
    "(X['payer_id_sum_approved_txn_amt_per_paypalid_7day']>={})|"
    "(X['payer_id_sum_approved_txn_amt_per_paypalid_30day']>={}))&"
    "(X['num_items']=={})&"
    "(X['total_num_items_ordered']>={})"
).format(*args)

In [5]:
# Fill the five `{}` placeholders positionally, in the order they appear in the template.
rule_lambda(60, 120, 500, 1, 2)

"((X['account_number_num_fraud_transactions_per_account_number_7day']>=60)|(X['payer_id_sum_approved_txn_amt_per_paypalid_7day']>=120)|(X['payer_id_sum_approved_txn_amt_per_paypalid_30day']>=500))&(X['num_items']==1)&(X['total_num_items_ordered']>=2)"

In [6]:
import pandas as pd
import numpy as np

In [7]:
# Synthetic feature set: 100 rows of random integer "distinct email" counts.
# A seeded local generator is used so the frame (and every output derived from it)
# is identical after Restart Kernel -> Run All; 42 is an arbitrary fixed seed.
rng = np.random.default_rng(42)
X = pd.DataFrame({
    'num_distinct_email_1day': rng.integers(0, 10, 100),   # values in [0, 10)
    'num_distinct_email_7day': rng.integers(0, 15, 100),   # values in [0, 15)
    'num_distinct_email_30day': rng.integers(0, 20, 100),  # values in [0, 20)
})

In [8]:
# Three-condition template over the synthetic email-count columns; thresholds are
# supplied positionally and substituted into the `{}` slots in order.
rule_lambda = lambda *args: (
    "(X['num_distinct_email_1day']>{})&"
    "(X['num_distinct_email_7day']>{})&"
    "(X['num_distinct_email_30day']>{})"
).format(*args)

In [9]:
# eval() executes the generated string as Python code against the DataFrame `X` from the
# earlier cell, giving one boolean flag per row. Only eval() rule strings from a trusted source.
eval(rule_lambda(1, 2, 3))

0     False
1     False
2     False
3     False
4      True
      ...  
95    False
96    False
97     True
98    False
99     True
Length: 100, dtype: bool

In [10]:
# Display the rule string produced for thresholds 1, 2 and 3.
rule_lambda(1, 2, 3)

"(X['num_distinct_email_1day']>1)&(X['num_distinct_email_7day']>2)&(X['num_distinct_email_30day']>3)"

---

# Development

In [176]:
import pandas as pd
import numpy as np

In [177]:
# Synthetic development data: 10,000 rows, seven integer columns.
# A seeded local generator keeps the data reproducible across kernel restarts
# (42 is an arbitrary fixed seed). Column D is binary (0/1); all others lie in [0, 1000).
rng = np.random.default_rng(42)
X = pd.DataFrame(
    {
        'A': rng.integers(0, 1000, 10000),
        'B': rng.integers(0, 1000, 10000),
        'C': rng.integers(0, 1000, 10000),
        'D': rng.integers(0, 2, 10000),
        'E': rng.integers(0, 1000, 10000),
        'F': rng.integers(0, 1000, 10000),
        'G': rng.integers(0, 1000, 10000),
    }
)

In [209]:
# Nested rule in dictionary form. Each node is either a group
# ({"condition": <operator>, "rules": [...]}) or a leaf
# ({"field": ..., "operator": ..., "value": ...}).
# Encoded logic: ((A>=60 | B>=120 | C>=500) & F>=120 & G>=500) | D==1 | E>=2
rule_dict = {
    "condition": "|",
    "rules": [
        {
            "condition": "&",
            "rules": [
                {
                    "condition": "|",
                    "rules": [
                        {"field": "A", "operator": ">=", "value": 60},
                        {"field": "B", "operator": ">=", "value": 120},
                        {"field": "C", "operator": ">=", "value": 500},
                    ],
                },
                {"field": "F", "operator": ">=", "value": 120},
                {"field": "G", "operator": ">=", "value": 500},
            ],
        },
        {"field": "D", "operator": "==", "value": 1},
        {"field": "E", "operator": ">=", "value": 2},
    ],
}

In [210]:
# Build a Rule object from the dictionary representation.
# NOTE(review): `Rule` is not imported or defined in any cell visible here — TODO add the
# import to the imports cell so the notebook survives Restart Kernel -> Run All.
r = Rule(rule_dict=rule_dict)

In [211]:
# Render the rule as a single expression string that indexes columns as X['<field>'].
# Presumably as_numpy=True emits a NumPy-based variant instead — TODO confirm.
r.as_string(as_numpy=False)

"(((X['A']>=60)|(X['B']>=120)|(X['C']>=500))&(X['F']>=120)&(X['G']>=500))|(X['D']==1)|(X['E']>=2)"

In [214]:
# Round-trip check: the dictionary form returned here should equal `rule_dict`.
r.as_dict()

{'condition': '|',
 'rules': [{'condition': '&',
   'rules': [{'condition': '|',
     'rules': [{'field': 'A', 'operator': '>=', 'value': 60},
      {'field': 'B', 'operator': '>=', 'value': 120},
      {'field': 'C', 'operator': '>=', 'value': 500}]},
    {'field': 'F', 'operator': '>=', 'value': 120},
    {'field': 'G', 'operator': '>=', 'value': 500}]},
  {'field': 'D', 'operator': '==', 'value': 1},
  {'field': 'E', 'operator': '>=', 'value': 2}]}

In [225]:
# Get a callable that returns the rule string with new threshold values substituted;
# with_kwargs=True means the values are passed by field name (see the call in the next cell).
rule_lambda = r.as_lambda(as_numpy=False, with_kwargs=True)

In [226]:
# Keyword names are the rule's field names; each value replaces that field's threshold.
rule_lambda(**{'A':1,'B':2,'C':3,'D':4,'E':5,'F':6,'G':7})
# Positional alternative, presumably for a lambda built with with_kwargs=False (TODO confirm):
# rule_lambda(1,2,3,4,5,6,7)

"(((X['A']>=1)|(X['B']>=2)|(X['C']>=3))&(X['F']>=6)&(X['G']>=7))|(X['D']==4)|(X['E']>=5)"

In [239]:
# Rule string to parse. Uncomment one of the alternatives below to try a flat
# multi-condition rule or a rule with nested OR groups.
rule_string = "(X['F']>1)"
# rule_string = "(X['A']>1)&(X['C']>1)&(X['F']>1)"
# rule_string = "((X['A']>1)|(X['B']>1))&((X['C']>1)|(X['D']>1))&((X['E']>1)|(X['F']>1))"

In [240]:
# Build the Rule from its string form this time, rather than from a dictionary.
r = Rule(rule_string=rule_string)

In [241]:
# Dictionary form of the parsed string.
# NOTE(review): the recorded output shows condition 'AND', whereas the hand-written
# dictionaries in this notebook use '&' / '|' — confirm which spelling is intended.
r.as_dict()

{'condition': 'AND', 'rules': [{'field': 'F', 'operator': '>', 'value': 1}]}

In [242]:
# Round-trip check: the string rendered from the parsed rule should equal `rule_string`.
r.as_string(as_numpy=False)

"(X['F']>1)"

In [243]:
# Keep the lambda form of the parsed rule; `test` is not used by any later cell visible here.
test=r.as_lambda(as_numpy=False)

In [31]:
import numpy as np
import pandas as pd
# Gotcha being explored: NumPy coerces this mixed list to a fixed-width unicode array, so
# np.nan is stored as the text 'nan' (see the `arr` output further down) and stops being a missing value.
arr = np.array(['ab', 'ac', 'ad', np.nan])

In [99]:
# Element-wise "does not end with 'ab'"; the coerced 'nan' string is a non-match, so it comes out True.
~np.char.endswith(arr, 'ab')

array([False,  True,  True,  True])

In [33]:
# Element-wise "not in the list"; the 'nan' string is likewise reported as True.
~np.isin(arr, ['ab', 'ac'])

array([False, False,  True,  True])

In [35]:
# Inspect the array: the dtype is <U3 and the missing value has become the text 'nan'.
arr

array(['ab', 'ac', 'ad', 'nan'], dtype='<U3')

In [34]:
# pd.isna finds nothing missing here, because the NaN is already stored as a string.
pd.isna(arr)

array([False, False, False, False])

In [86]:
# Single string column containing one genuine missing value (row 3), used to compare
# pandas' NaN handling with the NumPy behaviour explored in the preceding cells.
X = pd.DataFrame(data={'A': ['ab', 'ac', 'ad', np.nan, 'bc', 'bac']})

In [87]:
# .str.endswith propagates the missing value: row 3 stays NaN, so the result is object dtype rather than bool.
X['A'].str.endswith('ab')

0     True
1    False
2    False
3      NaN
4    False
5    False
Name: A, dtype: object

In [88]:
# .str.startswith shows the same NaN propagation: the missing row yields NaN, not False.
X['A'].str.startswith('a')

0     True
1     True
2     True
3      NaN
4    False
5    False
Name: A, dtype: object

In [89]:
# In the pandas Series the missing value is still a real NaN, so pd.isna flags row 3.
pd.isna(X['A'])

0    False
1    False
2    False
3     True
4    False
5    False
Name: A, dtype: bool

In [102]:
# isin yields a plain boolean Series (no NaN in the result), so `~` works directly;
# the missing row is treated as "not in the list" and comes out True.
~X['A'].isin(['ab', 'ac'])

0    False
1    False
2     True
3     True
4     True
5     True
Name: A, dtype: bool