In [5]:
import numpy as np
import pandas as pd
import torch
import torch.autograd as autograd
import torch.optim as optim
import torch.nn.functional as F
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_boston
import seaborn as sns
from ipywidgets import interact

import warnings
warnings.filterwarnings('ignore')

import custom_gbm_lib

# Probability - Logodds relationship

Since we will be tackling a classification problem, we want to predict probabilities (left chart in the cell output below).
We nevertheless want to operate in a linear space and take linear steps. As such, the algorithm will work with logodds (right chart in the cell output below).

(The theory behind this stems from Generalized Linear Models)

We will only visit the probability space to calculate the loss (binary cross entropy loss in our case).

In [6]:
import altair as alt

# Sample the logodds axis on an integer grid and map every point to a
# probability with the sigmoid function.
x = np.arange(-10, 10, 1)
y_logodds = x
y_prob = 1./(1. + np.exp(-y_logodds))

df = pd.DataFrame({"x" : x, "y_logodds" : y_logodds, "y_prob" : y_prob})

# A single mouseover selection; the same object is attached to both charts.
point_selection = alt.selection_single(on='mouseover', nearest=True)


def _selectable_scatter(field, title):
    """Scatter the `field` column of `df` against x, coloured by `point_selection`."""
    highlight = alt.condition(point_selection, alt.value("blue"), alt.value('lightgray'))
    chart = alt.Chart(df).mark_circle().encode(
        x="x:Q",
        y=f"{field}:Q",
        tooltip=field,
        color=highlight,
        size=alt.value(100),
    )
    chart = chart.properties(width=600, height=300, title=title)
    return chart.add_selection(point_selection)


r1c1 = _selectable_scatter("y_prob", "Probability")
r1c2 = _selectable_scatter("y_logodds", "Logodds")
alt.hconcat(r1c1, r1c2)

# Load the data

In [7]:
# Read the preprocessed Titanic table. X keeps every column of the file,
# including "Survived", which is also pulled out separately as the target y.
titanic_csv = "./titanic_preprocessed.csv"
X = pd.read_csv(titanic_csv)
y = X.loc[:, "Survived"]

# Starting point
### If we were given the data and somebody asked us:
#### - What would your prediction be if you were given a new row?

#### We can pose the problem in another way.
#### We are trying to find a single value that we would predict for every row, by minimizing the loss between that single value and the target values (binary cross entropy summed across rows).

In [8]:
# Minimize the loss function with a PyTorch optimizer: one shared logodds
# value is tuned so that the summed binary cross entropy over all rows is minimal.
logodds_value = torch.tensor(0.0, requires_grad = True)
learning_rate = 0.005
optimizer = optim.Adam([logodds_value], lr = learning_rate)
y_tensor = torch.tensor(y.to_numpy(), dtype=torch.float32)
for _ in range(2000):
    # Binary cross entropy written out term by term (label 1 / label 0).
    positive_term = y_tensor * torch.log(torch.sigmoid(logodds_value))
    negative_term = (1. - y_tensor) * torch.log(1. - torch.sigmoid(logodds_value))
    loss = -torch.sum(positive_term + negative_term)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
print(f"The logodds parameter value (pytorch optimizer) is : {logodds_value}")
print(f"The probability parameter value (pytorch optimizer) is : {torch.sigmoid(logodds_value)}")

# The same minimisation seen as a logistic regression: no intercept, no penalty,
# and a single constant feature equal to 1, so the one fitted coefficient is the logodds value.
y_tensor = torch.tensor(y.to_numpy())
constant_feature = torch.ones_like(y_tensor).reshape(-1, 1)
model = LogisticRegression(fit_intercept=False, max_iter = 1000, penalty = "none")
model.fit(constant_feature, y_tensor)
print(f"The logodds parameter value (LogisticRegression Model fitting) is {model.coef_[0][0]}")
print(f"The probability parameter value (LogisticRegression Model fitting) is {torch.sigmoid(torch.tensor(model.coef_))[0][0]}")

The logodds parameter value (pytorch optimizer) is : -0.6359884142875671
The probability parameter value (pytorch optimizer) is : 0.3461539149284363
The logodds parameter value (LogisticRegression Model fitting) is -0.635988769668029
The probability parameter value (LogisticRegression Model fitting) is 0.34615384548661404


## The probability value as derived above is basically the same as the ratio in the dataset of (SURVIVED) / ALL

In [9]:
# As one would expect, the probability derived from the optimization problem above
# equals the share of rows whose target is 1:
(y == 1).mean()

0.34615384615384615

# Run the custom algorithm which will help understand the evolution of the algorithm

In [10]:
# Re-read the same file that was loaded earlier in the notebook.
X = pd.read_csv("./titanic_preprocessed.csv")
y = X["Survived"]

# Definition of Hyper-Parameters
NUM_CLASSIFIERS = 3
MAX_DEPTH = 4
GRADIENT_BOOST_LEARNING_RATE = 0.1
MINIMIZER_LEARNING_RATE = 0.01
MINIMIZER_TRAINING_EPOCHS = 1000

# Keyword settings forwarded unchanged to the library constructor.
gbm_settings = dict(
    GRADIENT_BOOST_LEARNING_RATE=GRADIENT_BOOST_LEARNING_RATE,
    MINIMIZER_LEARNING_RATE=MINIMIZER_LEARNING_RATE,
    MINIMIZER_TRAINING_EPOCHS=MINIMIZER_TRAINING_EPOCHS,
)

# Running the custom algorithm
# NOTE(review): X still contains the "Survived" target column when it is passed to
# fit(). Unless custom_gbm_lib drops it internally, the trees can split on the label
# itself (target leakage) — TODO confirm against the library.
custom = custom_gbm_lib.PytorchBasedGenericGradientBoost("classifier", NUM_CLASSIFIERS, MAX_DEPTH, **gbm_settings)
df_result = custom.fit(X, y)
# fit() returns a pair; it is unpacked into the X-side and y-side result frames.
df_result_X, df_result_y = df_result

FileNotFoundError: [Errno 2] No such file or directory: '../input/preprocessed-titanic-dataset/titanic_preprocessed.csv'

In [7]:
def highlight_cols(color):
    """Build a cell-styling callback that paints every cell with `color`.

    The returned callable ignores the cell value it is given and always
    returns the CSS declaration ``background-color: <color>``.
    """
    return lambda value: 'background-color: %s' % color

# Alternative kept for reference: flat colours per column family through applymap, e.g.
# styled_df = df_result_X.style.applymap(highlight_cols("red"), subset=pd.IndexSlice[:, [col for col in df_result_X if col.startswith("Logodds")]])
# (and likewise "yellow" for the "Probs" columns and "green" for the "Gradients" columns).


def _columns_starting_with(prefix):
    """Slice selecting all rows and every df_result_X column whose name starts with `prefix`."""
    return pd.IndexSlice[:, [col for col in df_result_X if col.startswith(prefix)]]


# One colour gradient per family of result columns.
styled_df = df_result_X.style
for column_prefix, base_color in (("Logodds", "red"), ("Probs", "green"), ("Clusters", "blue")):
    styled_df = styled_df.background_gradient(
        sns.light_palette(base_color, as_cmap=True),
        subset=_columns_starting_with(column_prefix),
    )
# The return value is not used here; presumably the bar style is registered on
# styled_df itself — confirm against the pandas Styler documentation.
styled_df.bar(subset=['Gradients_0_1'], align='mid', color=['#d65f5f', '#5fba7d'])
style = styled_df.export()

# Apply the exported style to the first 20 rows only.
df_result_X.head(20).style.use(style)

Unnamed: 0.1,Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Survived,Logodds_0,Probs_0,Gradients_0_1,Clusters_1,Logodds_1,Probs_1,Leaf_Prediction_1,Gradients_1_2,Clusters_2,Logodds_2,Probs_2,Leaf_Prediction_2,Gradients_2_3,Clusters_3,Logodds_3,Probs_3,Leaf_Prediction_3
0,0,3.0,22.0,1.0,0.0,7.25,1.0,0,-0.635989,0.346154,0.346154,0,-1.007355,0.267498,-3.713656,0.267498,0,-1.37221,0.202263,-3.648554,0.202263,0,-1.732227,0.150303,-3.60017
1,1,1.0,38.0,1.0,0.0,71.2833,0.0,1,-0.635989,0.346154,-0.653846,1,-0.226803,0.443541,4.091866,-0.556459,1,0.167374,0.541746,3.941761,-0.458254,1,0.549686,0.634063,3.823123
2,2,3.0,26.0,0.0,0.0,7.925,0.0,1,-0.635989,0.346154,-0.653846,1,-0.226803,0.443541,4.091866,-0.556459,1,0.167374,0.541746,3.941761,-0.458254,1,0.549686,0.634063,3.823123
3,3,1.0,35.0,1.0,0.0,53.1,0.0,1,-0.635989,0.346154,-0.653846,1,-0.226803,0.443541,4.091866,-0.556459,1,0.167374,0.541746,3.941761,-0.458254,1,0.549686,0.634063,3.823123
4,4,3.0,35.0,0.0,0.0,8.05,1.0,0,-0.635989,0.346154,0.346154,0,-1.007355,0.267498,-3.713656,0.267498,0,-1.37221,0.202263,-3.648554,0.202263,0,-1.732227,0.150303,-3.60017
5,5,3.0,29.0,0.0,0.0,8.4583,1.0,0,-0.635989,0.346154,0.346154,0,-1.007355,0.267498,-3.713656,0.267498,0,-1.37221,0.202263,-3.648554,0.202263,0,-1.732227,0.150303,-3.60017
6,6,1.0,54.0,0.0,0.0,51.8625,1.0,0,-0.635989,0.346154,0.346154,0,-1.007355,0.267498,-3.713656,0.267498,0,-1.37221,0.202263,-3.648554,0.202263,0,-1.732227,0.150303,-3.60017
7,7,3.0,2.0,3.0,1.0,21.075,1.0,0,-0.635989,0.346154,0.346154,0,-1.007355,0.267498,-3.713656,0.267498,0,-1.37221,0.202263,-3.648554,0.202263,0,-1.732227,0.150303,-3.60017
8,8,3.0,27.0,0.0,2.0,11.1333,0.0,1,-0.635989,0.346154,-0.653846,1,-0.226803,0.443541,4.091866,-0.556459,1,0.167374,0.541746,3.941761,-0.458254,1,0.549686,0.634063,3.823123
9,9,2.0,14.0,1.0,0.0,30.0708,0.0,1,-0.635989,0.346154,-0.653846,1,-0.226803,0.443541,4.091866,-0.556459,1,0.167374,0.541746,3.941761,-0.458254,1,0.549686,0.634063,3.823123


# Step by step algorithm evolution

In [8]:
feature_target_columns = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_male', 'Survived']

# Result columns shown in addition to the features/target and the two
# "initial_*" columns at each step of the walkthrough.
_STEP_RESULT_COLUMNS = {
    1: [],
    2: [],
    3: ["Gradients_0_1"],
    4: ["Gradients_0_1", "Clusters_1"],
    5: ["Gradients_0_1", "Clusters_1"],
    6: ["Gradients_0_1", "Clusters_1", "Leaf_Prediction_1"],
    7: ["Gradients_0_1", "Clusters_1", "Leaf_Prediction_1", "Logodds_1", "Probs_1"],
}
# From this step onwards the rows are returned sorted by "Clusters_1".
_FIRST_SORTED_STEP = 5


def display_df(step=0):
    """Return the view of ``df_result_X`` that illustrates one walkthrough step.

    Parameters
    ----------
    step : int
        Steps 1-7 progressively reveal the columns produced by the first tree:
        step 1 shows "???" placeholders for the initial prediction, step 2 the
        actual initial logodds/probability, steps 3-7 add the gradient, cluster,
        leaf prediction and updated prediction columns. Steps 5-7 are sorted by
        "Clusters_1". Any other value returns the full frame sorted by
        "Clusters_1".

    Returns
    -------
    pandas.DataFrame
        A new frame; the global ``df_result_X`` is never modified.

    Notes
    -----
    Every step is computed from ``df_result_X`` alone. Earlier versions wrote the
    "initial_logodds"/"initial_prob" columns into the global frame in steps 1 and
    2, so steps 3-7 failed with a KeyError unless step 2 had been shown first, and
    displayed the "???" placeholders when step 1 had been shown last.
    """
    if step not in _STEP_RESULT_COLUMNS:
        return df_result_X.sort_values("Clusters_1", inplace=False)

    if step == 1:
        # The initial prediction is still "unknown" at the first step.
        initial = {"initial_logodds": "???", "initial_prob": "???"}
    else:
        initial = {
            "initial_logodds": df_result_X.loc[:, "Logodds_0"],
            "initial_prob": df_result_X.loc[:, "Probs_0"],
        }
    # assign() returns a new frame, leaving df_result_X untouched.
    frame = df_result_X.assign(**initial)

    if step >= _FIRST_SORTED_STEP:
        frame = frame.sort_values("Clusters_1", inplace=False)

    columns = feature_target_columns + ["initial_logodds", "initial_prob"] + _STEP_RESULT_COLUMNS[step]
    return frame.loc[:, columns]
# Expose display_df through ipywidgets' interact, passing (1, 7, 1) for `step`;
# the recorded output below shows this rendered as an IntSlider with min=1 and max=7.
interact(display_df, step = (1, 7, 1))

interactive(children=(IntSlider(value=1, description='step', max=7, min=1), Output()), _dom_classes=('widget-i…

<function __main__.display_df(step=0)>

# Explanation of the steps taken by the algorithm
#### 1. Step 1 asks the question : What could be an initial logodds or probability prediction? We have covered this, it is the logodds value which when translated to probability via the sigmoid function, minimizes the binary cross entropy loss function.

#### 2. Step 2 has the initial logodds and probability prediction set.

#### 3. Step 3 visits ***EACH and EVERY row respectively and independently***  and considers the prediction of each row as the single parameter to the binary cross entropy loss function. 
#### It calculates the gradient of the loss with respect to the logodds prediction (the loss is the binary cross entropy loss function)

#### 4. Step 4 creates a decision tree whereby the ***target values are the GRADIENTS calculated in step 3***. 
#### The rationale behind this is that rows whose predictions have a similar impact on the loss should be grouped together.

#### 5. Step 5 sorts the rows based on cluster/leaf ids (the cluster id is the id of the decision tree leaf in which the rows end up being present).

#### 6. Step 6 is performed ***for EACH and EVERY cluster/leaf***. 
#### It keeps only the rows for a specific cluster index and finds the single best logodds delta value for the rows of the cluster/leaf that would minimize the binary cross entropy loss. ***It is a single value and since it is a logodds delta value it is being added to the previous logodds values of the rows present in the cluster/leaf.*** 
#### ***Linear operation in the logodds space!***

#### 7. Step 7 updates the initial prediction with the one obtained after the construction of the first decision tree.
#### The logodds value is updated linearly from the previous one: new logodds = previous logodds + gradient-boost learning rate × leaf prediction. The probability is then obtained from the new logodds via the sigmoid function.

### The process repeats for subsequent trees.

### Step 3 breakdown

In [9]:
# Reproduce one row's gradient by hand: differentiate that row's binary cross
# entropy with respect to its current logodds prediction.
index = 0
row = df_result_X.iloc[index, :]
logodds_value = torch.tensor(row["Logodds_0"], requires_grad = True)
y_tensor = row["Survived"]  # the row's label as read from the frame, not a torch tensor
survived_term = y_tensor * torch.log(torch.sigmoid(logodds_value))
not_survived_term = (1. - y_tensor) * torch.log(1. - torch.sigmoid(logodds_value))
loss = -torch.sum(survived_term + not_survived_term)
grad = autograd.grad([loss], [logodds_value])
# The value printed here is the gradient of the loss w.r.t. the logodds.
print(f"The parameter value (by using pytorch autograd) is : {grad[0]}") 

The parameter value (by using pytorch autograd) is : 0.3461537775259567


### Step 6 breakdown 

In [10]:
# Keep only the rows whose "Clusters_1" value is 0.
in_cluster_0 = df_result_X["Clusters_1"] == 0
df_clusters_1_0 = df_result_X.loc[in_cluster_0, :]

# The previous logodds and the labels do not change during the optimisation,
# so they are converted to tensors once, before the loop.
previous_logodds = torch.tensor(df_clusters_1_0["Logodds_0"].to_numpy())
targets = torch.tensor(df_clusters_1_0["Survived"].to_numpy(), dtype=torch.float32)

# Minimize the loss function with a PyTorch optimizer: a single delta is added
# to the previous logodds of every selected row.
leaf_delta_value = torch.tensor(0.0, requires_grad = True)
learning_rate = 0.01
optimizer = optim.Adam([leaf_delta_value], lr = learning_rate)
for _ in range(1000):
    loss = F.binary_cross_entropy_with_logits(previous_logodds + leaf_delta_value, targets)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
print(f"The parameter value is : {leaf_delta_value}")

The parameter value is : -3.713656187057495


In [11]:
# Second run of the leaf-delta optimisation over the rows with Clusters_1 == 0.
df_clusters_1_0 = df_result_X.loc[df_result_X["Clusters_1"] == 0 , :]
# Minimize the loss function by using a pytorch optimizer
# Only the parameter's value is reset to 0 here. `leaf_delta_value` and `optimizer`
# are not created in this cell: they must already exist in the kernel, because the
# line that would build a new optimizer is commented out below.
# NOTE(review): presumably the optimizer's accumulated Adam state carries over from
# the earlier run, which would explain why the recorded result of this cell differs
# from the previous one — confirm whether this is intentional.
leaf_delta_value.data = torch.tensor(0.0)
# Not used in this cell while the optimizer line below stays commented out.
learning_rate = 0.01
# optimizer = optim.Adam([leaf_delta_value], lr = learning_rate)
for _ in range(1000):
    # Loss of (previous logodds + shared delta) against the labels of the selected rows.
    loss = F.binary_cross_entropy_with_logits(
        torch.tensor(df_clusters_1_0["Logodds_0"].to_numpy(), ) + leaf_delta_value,
        torch.tensor(df_clusters_1_0["Survived"].to_numpy(), dtype=torch.float32))   
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
print(f"The parameter value is : {leaf_delta_value}")

The parameter value is : -4.18042516708374


### Step 7 breakdown

In [12]:
# The predictions are updated based on the following formula : 
previous_prediction_column = "Logodds_0"
leaf_prediction_column = "Leaf_Prediction_1"
df_result_X[previous_prediction_column] + GRADIENT_BOOST_LEARNING_RATE * df_result_X[leaf_prediction_column]
# Probability column
# F.sigmoid(torch.tensor(df_result_X[previous_prediction_column] + GRADIENT_BOOST_LEARNING_RATE * df_result_X[leaf_prediction_column])).numpy()

0     -1.007355
1     -0.226803
2     -0.226803
3     -0.226803
4     -1.007355
         ...   
151   -0.226803
152   -1.007355
153   -1.007355
154   -1.007355
155   -1.007355
Length: 156, dtype: float64