## Test the Similarity of Train/Val Dataset

### 1. Test Based on Targets
Here we check whether the target distributions of the train and validation sets differ.

In [None]:
from scipy.stats import chi2_contingency, fisher_exact
import numpy as np
import pandas as pd
import os

In [None]:
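# Read data
# NOTE: the loading expressions for the four assignments below are missing;
# fill them in before running this cell.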
y_train_a 
y_val_a = 
y_train_na = 
y_val_na = 

In [None]:
# One class-count contingency table per population: rows are the TARGET
# classes, columns are the Train / Val splits.
# fillna(0): a class that is absent from one split has no entry in that split's
# group sizes and would otherwise show up as NaN after index alignment instead
# of a zero count; astype(int) keeps the cells as integer counts.
contigency_non_alnc = (
    pd.DataFrame.from_dict(
        {
            "Train": y_train_na.groupby(TARGET).size(),
            "Val": y_val_na.groupby(TARGET).size(),
        }
    )
    .fillna(0)
    .astype(int)
)

contigency_alnc = (
    pd.DataFrame.from_dict(
        {
            "Train": y_train_a.groupby(TARGET).size(),
            "Val": y_val_a.groupby(TARGET).size(),
        }
    )
    .fillna(0)
    .astype(int)
)


In [None]:
# Chi-square test on each contingency table (non-alnc first, then alnc),
# printed one after the other with a dashed separator in between.
separator = "\n----------------------------------------\n"
chi2_non_alnc = chi2_contingency(contigency_non_alnc.values)
chi2_alnc = chi2_contingency(contigency_alnc.values)
print(chi2_non_alnc, separator, chi2_alnc)

In [None]:
# Fisher's exact test on the same two contingency tables (non-alnc first,
# then alnc), printed with a dashed separator in between.
separator = "\n----------------------------------------\n"
fisher_non_alnc = fisher_exact(contigency_non_alnc.values)
fisher_alnc = fisher_exact(contigency_alnc.values)
print(fisher_non_alnc, separator, fisher_alnc)

### 2. Test Based on Train

In [None]:
from optbinning.binning.metrics import jeffrey as psi

In [None]:

def calculate_psi(expected, actual, buckettype='bins', buckets=10, axis=0):
    '''Calculate the PSI (population stability index) for one or more variables.

    Args:
       expected: 1-D or 2-D array of original (reference) values
       actual: array of new values with the same layout as expected
       buckettype: bucketing strategy. 'bins' splits the range of the expected
           values into equal-width buckets, 'quantiles' uses percentiles of the
           expected values as bucket edges. Any other value leaves the raw
           0..100 percentage grid as the bucket edges.
       buckets: number of buckets
       axis: for 2-D input, the axis along which observations run:
           0 means every column is a variable, 1 means every row is a variable

    Returns:
       psi_values: a scalar for 1-D input, otherwise an ndarray holding one
           PSI value per variable

    Raises:
       ValueError: if the input is 2-D and axis is neither 0 nor 1

    Author:
       Matthew Burke
       github.com/mwburke
       worksofchart.com
    '''

    def _sub_psi(e_perc, a_perc):
        '''PSI contribution of one bucket; a zero share is floored at 0.0001
        so that the logarithm stays finite.'''
        if a_perc == 0:
            a_perc = 0.0001
        if e_perc == 0:
            e_perc = 0.0001
        return (e_perc - a_perc) * np.log(e_perc / a_perc)

    def _single_psi(expected_array, actual_array):
        '''Calculate the PSI for a single variable.'''
        # Evenly spaced percentages 0, 100/buckets, ..., 100.
        grid = np.arange(0, buckets + 1) / buckets * 100

        if buckettype == 'bins':
            # Rescale the percentage grid onto [min, max] of the expected
            # values. Done out of place so no array is mutated.
            lo, hi = np.min(expected_array), np.max(expected_array)
            shifted = grid - np.min(grid)
            edges = shifted / (np.max(shifted) / (hi - lo)) + lo
        elif buckettype == 'quantiles':
            edges = np.stack([np.percentile(expected_array, b) for b in grid])
        else:
            edges = grid

        expected_percents = np.histogram(expected_array, edges)[0] / len(expected_array)
        actual_percents = np.histogram(actual_array, edges)[0] / len(actual_array)

        # Built-in sum: passing a generator to np.sum is deprecated.
        return sum(
            _sub_psi(e_perc, a_perc)
            for e_perc, a_perc in zip(expected_percents, actual_percents)
        )

    expected = np.asarray(expected)
    actual = np.asarray(actual)

    if expected.ndim == 1:
        return _single_psi(expected, actual)

    if axis not in (0, 1):
        raise ValueError("axis must be 0 or 1, got {!r}".format(axis))

    # Observations run along `axis`, so the variables are counted along the
    # other axis.
    n_variables = expected.shape[1 - axis]
    psi_values = np.empty(n_variables)
    for i in range(n_variables):
        if axis == 0:
            psi_values[i] = _single_psi(expected[:, i], actual[:, i])
        else:
            psi_values[i] = _single_psi(expected[i, :], actual[i, :])

    return psi_values

In [None]:
# Load the train / validation feature frames from the "m" and "f"
# sub-directories of `path` (`path` is not defined in this cell).
dir_a = os.path.join(path, "m")
dir_na = os.path.join(path, "f")

x_train_a = pd.read_parquet(os.path.join(dir_a, "X_train.parquet"))
x_val_a = pd.read_parquet(os.path.join(dir_a, "X_val.parquet"))
x_train_na = pd.read_parquet(os.path.join(dir_na, "X_train.parquet"))
x_val_na = pd.read_parquet(os.path.join(dir_na, "X_val.parquet"))

In [None]:
# Preview the first two rows of the training features loaded from "m".
x_train_a.head(2)

In [None]:
# Preview the first two rows of the validation features loaded from "m".
x_val_a.head(2)

In [None]:
# Show the first entry of non_alnc_model_features (the list is not defined in
# any cell visible here).
non_alnc_model_features[0]

In [None]:
# Try calculate_psi (default bucketing arguments) on a single feature:
# train vs. validation values of the first non-alnc model feature.
demo_feature = non_alnc_model_features[0]
calculate_psi(x_train_a[demo_feature].values, x_val_a[demo_feature].values)

In [None]:
class PSI:
    """Population stability index (PSI) between an expected and an actual sample.

    Args:
       expected: 1-D or 2-D numpy array of original (reference) values
       actual: numpy array of new values with the same layout as expected
       bucket_type: bucketing strategy. "bins" splits the range of the expected
           values into equal-width buckets, "quantiles" uses percentiles of the
           expected values as bucket edges. Any other value leaves the raw
           0..100 percentage grid as the bucket edges.
       buckets: number of buckets
       axis: for 2-D input, the axis along which observations run:
           0 means every column is a variable, 1 means every row is a variable

    Use ``get_psi()`` to obtain the result: a scalar for 1-D input, otherwise
    an ndarray with one PSI value per variable.

    Author:
       jnsofini
       Adapted from: https://github.com/mwburke/population-stability-index/blob/master/psi.py
    """

    def __init__(self, expected, actual, bucket_type="bins", buckets=10, axis=0):
        self.expected = expected
        self.actual = actual
        self.bucket_type = bucket_type
        self.buckets = buckets
        self.axis = axis

    def psi(self):
        """Calculate the PSI of the stored arrays treated as a single variable.

        Returns:
           psi_value: calculated PSI value
        """
        return self._psi_single(self.expected, self.actual)

    def _psi_single(self, expected_array, actual_array):
        """Calculate the PSI for one pair of value arrays.

        Args:
           expected_array: numpy array of original values
           actual_array: numpy array of new values

        Returns:
           psi_value: calculated PSI value
        """
        # Evenly spaced percentages 0, 100/buckets, ..., 100.
        breakpoints = np.arange(0, self.buckets + 1) / (self.buckets) * 100

        if self.bucket_type == "bins":
            # `breakpoints` is a fresh float array, so the in-place rescaling
            # done by scale_range does not touch any caller data.
            breakpoints = self.scale_range(
                breakpoints, np.min(expected_array), np.max(expected_array)
            )
        elif self.bucket_type == "quantiles":
            breakpoints = np.stack(
                [np.percentile(expected_array, b) for b in breakpoints]
            )

        expected_percents = np.histogram(expected_array, breakpoints)[0] / len(
            expected_array
        )
        actual_percents = np.histogram(actual_array, breakpoints)[0] / len(
            actual_array
        )

        return sum(
            self.sub_psi(e_perc, a_perc)
            for e_perc, a_perc in zip(expected_percents, actual_percents)
        )

    def get_psi(self):
        """Return the PSI: a scalar for 1-D input, one value per variable for 2-D.

        Raises:
           ValueError: if the input is 2-D and axis is neither 0 nor 1
        """
        if len(self.expected.shape) == 1:
            return self.psi()

        if self.axis not in (0, 1):
            raise ValueError("axis must be 0 or 1, got {!r}".format(self.axis))

        expected = np.asarray(self.expected)
        actual = np.asarray(self.actual)

        # Observations run along `axis`, so the variables are counted along
        # the other axis; each variable gets its own buckets and PSI value.
        n_variables = expected.shape[1 - self.axis]
        psi_values = np.empty(n_variables)
        for i in range(n_variables):
            if self.axis == 0:
                psi_values[i] = self._psi_single(expected[:, i], actual[:, i])
            else:
                psi_values[i] = self._psi_single(expected[i, :], actual[i, :])

        return psi_values

    @staticmethod
    def scale_range(input, min, max):
        """Linearly rescale `input` so that it spans [min, max].

        The array is modified in place and also returned. The parameter names
        shadow builtins; they are kept so existing keyword calls still work.
        """
        input += -(np.min(input))
        input /= np.max(input) / (max - min)
        input += min
        return input

    @staticmethod
    def sub_psi(e_perc, a_perc):
        """Calculate the PSI contribution of one bucket.

        A share equal to zero is replaced by 0.0001 so the logarithm stays
        finite.
        """
        if a_perc == 0:
            a_perc = 0.0001
        if e_perc == 0:
            e_perc = 0.0001

        value = (e_perc - a_perc) * np.log(e_perc / a_perc)
        return value

In [None]:
# Same single-feature comparison as above, this time through the PSI class.
demo_feature = non_alnc_model_features[0]
demo_psi = PSI(x_train_a[demo_feature].values, x_val_a[demo_feature].values)
demo_psi.get_psi()

According to the source, the common interpretations of the PSI result are:

- PSI < 0.1: no significant population change
- 0.1 <= PSI < 0.2: moderate population change
- PSI >= 0.2: significant population change

In [None]:
# PSI (train vs. validation, non-alnc frames) for every model feature whose
# dtype is not object / category / string; those columns are skipped here.
skipped_dtypes = ["object", "category", "string"]
numeric_features = [
    name
    for name in non_alnc_model_features
    if x_train_na[name].dtype not in skipped_dtypes
]
psi_data = {
    "feature": numeric_features,
    "psi_SB_Core": [
        PSI(x_train_na[name].values, x_val_na[name].values).get_psi()
        for name in numeric_features
    ],
}

In [None]:
# Display the collected feature / PSI pairs as a table.
pd.DataFrame.from_dict(psi_data)

In [None]:
# Names of the object / category / string columns of the non-alnc training
# frame, as a numpy array.
categorical_dtypes = ["object", "category", "string"]
categorical_frame = x_train_na.select_dtypes(include=categorical_dtypes)
cat_cols = categorical_frame.columns.values

In [None]:
def categorical_psi(actual, expected):
    """Calculate the PSI between two samples of a categorical variable.

    Every distinct value is its own bucket. The per-bucket share of each sample
    is compared with (a - e) * ln(a / e) and the contributions are summed; the
    formula is symmetric, so swapping the two arguments gives the same result.

    Args:
       actual: array-like of category values from one sample
       expected: array-like of category values from the other sample

    Returns:
       The summed PSI over all categories seen in either sample.
    """
    actual_levels, actual_counts = np.unique(actual, return_counts=True)
    expected_levels, expected_counts = np.unique(expected, return_counts=True)

    # Outer join: a category that occurs in only one sample must stay in the
    # table with a zero count on the other side. An inner join would drop it
    # and renormalise the shares over the shared categories only.
    data = pd.merge(
        pd.DataFrame({"index": actual_levels, "actual": actual_counts}),
        pd.DataFrame({"index": expected_levels, "expected": expected_counts}),
        on="index",
        how="outer",
    ).fillna({"actual": 0, "expected": 0})

    actual_share = data["actual"] / data["actual"].sum()
    expected_share = data["expected"] / data["expected"].sum()

    # Floor zero shares at a very small number so the logarithm stays finite.
    floor = 0.0001
    actual_share = actual_share.where(actual_share != 0, floor)
    expected_share = expected_share.where(expected_share != 0, floor)

    contributions = (actual_share - expected_share) * np.log(
        actual_share / expected_share
    )
    return contributions.sum()

In [None]:
# Try categorical_psi on one column of the non-alnc frames (train vs. validation).
categorical_psi(x_train_na['B1_FARM_LOCN_PROV_CD'].values, x_val_na['B1_FARM_LOCN_PROV_CD'].values)

In [None]:
# PSI of every alnc model feature between the alnc train and validation frames:
# categorical_psi for object / category / string columns, PSI otherwise.
# The feature name is appended together with its score so the two lists always
# stay aligned, and the dtype test reads the same frame the values come from.
# The score key stays "psi_SB_Core" because a later cell renames it to "psi_alnc".
psi_data = {"feature": [], "psi_SB_Core": []}
for col in alnc_model_features:
    if x_train_a[col].dtype in ["object", "category", "string"]:
        psi_score = categorical_psi(x_train_a[col].values, x_val_a[col].values)
    else:
        psi_score = PSI(x_train_a[col].values, x_val_a[col].values).get_psi()
    psi_data["feature"].append(col)
    psi_data["psi_SB_Core"].append(psi_score)

In [None]:
# NOTE(review): psi_data is also the source of psi_table_a in the next cell,
# and the cell that fills it loops over alnc_model_features with
# x_train_a / x_val_a. Presumably that cell was first run with the non-alnc
# inputs before being edited; on a top-to-bottom run both tables hold the
# same scores. TODO confirm.
psi_table_na = pd.DataFrame.from_dict(psi_data)

In [None]:
# Table built from psi_data, with the score column relabelled to "psi_alnc" so
# it does not collide with "psi_SB_Core" when the tables are merged.
psi_table_a = pd.DataFrame.from_dict(psi_data)
psi_table_a = psi_table_a.rename(columns={"psi_SB_Core": "psi_alnc"})

In [None]:
# Side-by-side PSI table: one row per feature found in either table
# (outer join on "feature").
psi_table_all = psi_table_na.merge(psi_table_a, on="feature", how="outer")

In [None]:
# Write the merged PSI table to a CSV file; the path is relative, so the file
# lands in the current working directory.
psi_table_all.to_csv("stability-test-train-val_split.csv")

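Train a random forest to tell training rows (target = 0) from validation rows (target = 1). If the classifier cannot separate them, the two sets are similar.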

In [None]:
# Stack train (target=0) on top of validation (target=1) for each population.
# ignore_index=True gives the stacked frame a fresh 0..n-1 index; without it the
# row labels of both inputs are kept as they are, so labels that occur in both
# frames would be duplicated and label and position would no longer agree.
X_train_direct_lending = pd.concat(
    [x_train_na.assign(target=0), x_val_na.assign(target=1)],
    axis=0,
    ignore_index=True,
)
X_train_alnc = pd.concat(
    [x_train_a.assign(target=0), x_val_a.assign(target=1)],
    axis=0,
    ignore_index=True,
)

In [None]:
# Features and train-vs-validation labels for the non-alnc population.
# X_train_SB_core is not defined in any visible cell; the stacked non-alnc frame
# built in the previous cell is X_train_direct_lending, so that frame is used.
X = X_train_direct_lending[non_alnc_model_features]
y = X_train_direct_lending['target'].values

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

# Shallow forest; n_jobs=-1 uses all available cores.
model = RandomForestClassifier(n_jobs=-1, max_depth=5, min_samples_leaf=5)
# Out-of-fold probabilities, filled in fold by fold during cross-validation.
predictions = np.zeros(y.shape)

In [None]:
# Debug aid: print the training-row positions of every fold; `t` ends up
# holding the indices of the last fold.
# The splitter is created here (same settings as in the cross-validation cell)
# because on a top-to-bottom run this cell executes before `skf` is defined.
skf = StratifiedKFold(n_splits=20, shuffle=True, random_state=42)
t = None
for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
    t = train_idx
    print(fold, t)

In [None]:
# 20-fold stratified cross-validation: fit on the training part of each fold
# and store the out-of-fold probability of class 1 for the held-out rows.
skf = StratifiedKFold(n_splits=20, shuffle=True, random_state=42)
for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
    # split() yields integer positions, so rows are taken with .iloc.
    # DataFrame.filter(items=...) selects by index label, which is not the
    # same thing once train and validation rows have been stacked.
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    model.fit(X_train, y_train)
    # Column 1 of predict_proba is the probability of the second class.
    probs = model.predict_proba(X_test)[:, 1]
    predictions[test_idx] = probs