In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.preprocessing import StandardScaler

In [3]:
from weight_of_evidence import weight_of_evidence

In [4]:
# Single-feature tree shared by every demo in this notebook, configured with
# max_depth=2 and min_samples_per_node=1.
single_var_decision_tree = weight_of_evidence.SingleVariableDecisionTreeClassifier(
    max_depth=2,
    min_samples_per_node=1,
)

### Gini

In [5]:
# Element-wise inputs for the `_gini` call in the next cell.
# NOTE(review): judging by the stored output, y_1 / y_c is the positive rate of
# each node (e.g. 20 / 50 = 0.4 for the third entry) — confirm against the library.
y_1 = np.array([1, 5, 20, 10, 0])
y_c = np.array([2, 20, 50, 10, 10])

In [6]:
# NOTE(review): `_gini` is a private helper, called here directly with the two
# count arrays to inspect the impurity it returns for each element.
single_var_decision_tree._gini(y_1, y_c)

array([0.5  , 0.375, 0.48 , 0.   , 0.   ])

In [7]:
# Hand-computed binary Gini impurity, 1 - p**2 - (1 - p)**2, for p = 20 / 50 = 0.4;
# it should agree with the third entry of the `_gini` output.
1.0 - (0.4**2) - (0.6**2)

0.48

### Gini Decrease

In [8]:
# Binary target with 12 observations; reused by the Gini-decrease, best-split
# and logit demos in this notebook.
Y = pd.Series([0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,])

In [9]:
# `_find_gini_decreases` (a private helper) returns three values; only the
# first is kept, the other two are discarded.
gini_decreases, _, _ = single_var_decision_tree._find_gini_decreases(Y)
    

In [10]:
# One value per position of Y; in the stored output position 0 is NaN.
gini_decreases

0          NaN
1     0.061869
2     0.002778
3     0.041667
4     0.111111
5     0.209921
6     0.125000
7     0.067063
8     0.027778
9     0.004630
10    0.069444
11    0.031566
dtype: float64

### Best Split

In [11]:
# Toy feature values in ascending order, one per element of Y (12 in total).
X_SORTED = pd.Series([1, 1, 2, 2, 3, 3, 3, 3, 3, 5, 10, 20,])

In [12]:
# NOTE(review): calls the private `_best_split` helper directly on the sorted
# feature and the target; the stored result for this toy data is 3.
single_var_decision_tree._best_split(X_SORTED, Y)

3

### Fit

In [13]:
def non_mono_fn(company_age):
    """Return the simulated risk for a single (scalar) company age.

    The relationship is deliberately non-monotonic: the risk is 0.75 for
    ages in the half-open band (25, 75] and 0.25 for every other age.
    """
    in_high_risk_band = 25 < company_age <= 75
    return 0.75 if in_high_risk_band else 0.25

In [14]:
# Evenly spaced ages from 1.00 up to 99.99 in steps of 0.01 (the stop value of
# `np.arange` is exclusive).
company_age = np.arange(1, 100, 0.01)

In [15]:
# `np.vectorize` applies the scalar `non_mono_fn` to every age; it is a
# convenience wrapper around a Python-level loop, not a performance optimisation.
non_mono_risk = np.vectorize(non_mono_fn)(company_age)

In [16]:
def simulate_outcome(risk_vector, rng=None):
    """Draw one Bernoulli outcome per entry of ``risk_vector``.

    Parameters
    ----------
    risk_vector : array-like of float
        Probability of a positive outcome for each observation. NumPy arrays,
        pandas Series and plain sequences are accepted.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of the uniform draws, for reproducible simulations. When
        omitted, NumPy's global random state is used exactly as before, so
        ``np.random.seed`` still controls the result.

    Returns
    -------
    Array of int with the same shape as ``risk_vector``: 1 where the risk
    exceeds the corresponding uniform draw on [0, 1), otherwise 0.
    """
    # Both the `np.random` module and Generator/RandomState expose `.uniform`.
    source = np.random if rng is None else rng
    # `np.shape` also works for inputs without a `.shape` attribute (e.g. lists).
    random_draws = source.uniform(size=np.shape(risk_vector))
    return (risk_vector > random_draws).astype(int)

In [17]:
# Seed NumPy's global RNG so the simulated outcomes, and the tree splits fitted
# on them later, are identical on every Restart & Run All.
np.random.seed(0)
Y_non_mono = simulate_outcome(non_mono_risk)

In [18]:
# Quick look at the simulated ages (NumPy abbreviates the long array).
company_age

array([ 1.  ,  1.01,  1.02, ..., 99.97, 99.98, 99.99])

In [19]:
# Simulated 0/1 outcomes, one per entry of `company_age`.
Y_non_mono

array([0, 1, 0, ..., 0, 0, 0])

In [20]:
# Fit the tree on the simulated ages and outcomes; both arrays are wrapped in
# pandas Series before being passed in.
single_var_decision_tree.fit(pd.Series(company_age), pd.Series(Y_non_mono))

In [21]:
# Split points stored by `fit`. The simulated risk changes at ages 25 and 75
# (see `non_mono_fn`), so the interior values should land close to those.
single_var_decision_tree.splits_

[-inf, 24.600000000000023, 74.93000000000006, inf]

### Logit

In [22]:
# Twelve toy company-age categories, row-aligned with the 12-element target `Y`.
COMPANY_AGES = [
    "young", "medium", "young", "medium",
    "young", "medium", "old", "medium",
    "old", "old", "young", "young",
]

# Single-column demo frame holding the categorical feature.
DEMO_DF = pd.DataFrame({"company_age": COMPANY_AGES})


In [23]:
# Scaler under test, created with its default arguments.
logit_scaler = weight_of_evidence.LogitScaler()

In [24]:
# Re-display the 12-element binary target that is paired with DEMO_DF below.
Y

0     0
1     1
2     0
3     0
4     0
5     1
6     1
7     1
8     1
9     0
10    1
11    1
dtype: int64

In [25]:
# Frame before any derived columns are added.
DEMO_DF

Unnamed: 0,company_age
0,young
1,medium
2,young
3,medium
4,young
5,medium
6,old
7,medium
8,old
9,old


In [26]:
# Fit on the categorical column only, so this cell stays idempotent: passing the
# whole frame would feed the derived `company_age_logit` column (and any column
# added later) back into the scaler when the cell is re-run.
# NOTE(review): assumes `fit_transform` returns one output column per input
# column — confirm against the library.
DEMO_DF['company_age_logit'] = logit_scaler.fit_transform(DEMO_DF[['company_age']], Y)


In [27]:
# Frame with the per-category logit value attached to each row.
DEMO_DF

Unnamed: 0,company_age,company_age_logit
0,young,-0.405465
1,medium,1.098612
2,young,-0.405465
3,medium,1.098612
4,young,-0.405465
5,medium,1.098612
6,old,0.693147
7,medium,1.098612
8,old,0.693147
9,old,0.693147


In [28]:
# scikit-learn scaler with default settings (centre to mean 0, scale to unit variance).
ss = StandardScaler()

In [29]:
# Standardise the logit column; the double brackets keep the input 2-D, as
# scikit-learn transformers expect.
DEMO_DF['company_age_woe'] = ss.fit_transform(DEMO_DF[['company_age_logit']])

In [30]:
# numeric_only=True skips the string `company_age` column explicitly; pandas >= 2.0
# no longer drops non-numeric columns silently and raises TypeError instead.
DEMO_DF.mean(numeric_only=True)

company_age_logit    0.370547
company_age_woe      0.000000
dtype: float64

In [31]:
# numeric_only=True skips the string `company_age` column explicitly; pandas >= 2.0
# raises TypeError on non-numeric columns instead of dropping them.
# The standardised column shows ~1.044 rather than 1 because pandas' `.std()`
# uses the sample estimate (ddof=1) while StandardScaler scales by the
# population std (ddof=0): sqrt(12 / 11) ≈ 1.0445 for these 12 rows.
DEMO_DF.std(numeric_only=True)

company_age_logit    0.703466
company_age_woe      1.044466
dtype: float64