In [None]:
import numpy as np
from src.helper import get_split_data

In [None]:
# Load the pre-split data through the project helper and unpack its six
# return values in order.
# NOTE(review): presumably train / validation / test feature frames (X_*) and
# label vectors (y_*) -- confirm against src.helper.get_split_data.
X_trn, y_trn, X_val, y_val, X_tst, y_tst = get_split_data.split_data_for_training()

In [None]:
def _binary_gini(count_a, count_b):
    """Gini impurity of a group holding ``count_a`` and ``count_b`` rows of two classes (0.0 if empty)."""
    size = count_a + count_b
    if size == 0:
        # An empty group contributes nothing; return a float (not the int 0)
        # so callers can always format the value with a precision spec.
        return 0.0
    return 1 - (count_a / size) ** 2 - (count_b / size) ** 2


def calc_gini(df, val_col, label_col, pos_val, split_point, debug=False):
    """
    Calculate the weighted Gini impurity of a binary threshold split.

    Rows where ``df[val_col] >= split_point`` form the "positive" side of the
    split; every other row (including rows where the comparison is False
    because the value is missing) forms the "negative" side. The Gini
    impurity of each side is computed from the counts of rows whose label
    equals ``pos_val`` versus rows whose label does not, and the two
    impurities are averaged, weighted by the share of rows on each side.
    The lower the result, the better the split separates the classes.

    Parameters:
    df (pd.DataFrame): The dataframe containing the data
    val_col (str): The column name of the feature used to split the data
    label_col (str): The column name of the target variable
    pos_val (str or int): The value of the target variable that represents
    the positive class
    split_point (float): The threshold used to split the data.
    debug (bool): optional, when set to True, prints the Gini impurity of
    each side and the final weighted average
    Returns:
    float: The weighted average of Gini impurity for the positive and
    negative subsets. An empty side has impurity 0.0, and an empty
    dataframe yields 0.0.
    """
    ge_split = df[val_col] >= split_point
    eq_pos = df[label_col] == pos_val

    # Count rows directly from the boolean masks instead of materialising
    # four filtered copies of the dataframe.
    tp = int((ge_split & eq_pos).sum())
    fp = int((ge_split & ~eq_pos).sum())
    tn = int((~ge_split & ~eq_pos).sum())
    fn = int((~ge_split & eq_pos).sum())

    pos_size = tp + fp
    neg_size = tn + fn
    total_size = len(df)

    gini_pos = _binary_gini(tp, fp)
    gini_neg = _binary_gini(tn, fn)

    if total_size == 0:
        # No rows at all: avoid dividing by zero and follow the same
        # convention as an empty side (impurity 0.0).
        weighted_avg = 0.0
    else:
        weighted_avg = gini_pos * (pos_size / total_size) + \
                       gini_neg * (neg_size / total_size)
    if debug:
        print(f'{gini_pos=:.3} {gini_neg=:.3} {weighted_avg=:.3}')

    return weighted_avg


In [None]:
# Weighted Gini impurity of the training data when split at
# points_difference >= 0, treating result_match == 1 as the positive class.
# The labels are attached as a temporary 'result_match' column, and debug=True
# also prints the per-side impurities.
calc_gini(X_trn.assign(result_match=y_trn), val_col='points_difference', label_col='result_match', pos_val=1, split_point=0, debug=True)

In [None]:
import matplotlib.pyplot as plt

# Sweep candidate thresholds on points_difference and record the weighted
# Gini impurity that calc_gini reports for each one.
values = np.arange(-70, 90, .1)
# Attach the labels once: the labelled frame is identical for every
# threshold, so rebuilding it inside the loop only repeats the copy.
trn_labeled = X_trn.assign(result_match=y_trn)
ginis = [
    calc_gini(trn_labeled, val_col='points_difference',
              label_col='result_match', pos_val=1, split_point=v, debug=False)
    for v in values
]
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(values, ginis)
# The plotted quantity is the Gini impurity computed by calc_gini, not the
# (unrelated) Gini coefficient, so label it accordingly.
ax.set_title('Gini Impurity')
ax.set_ylabel('Gini Impurity')
ax.set_xlabel('Split Point')

In [None]:
import pandas as pd

# Pair every candidate threshold with its impurity, then print the row(s)
# whose impurity equals the minimum over the whole sweep.
gini_df = pd.DataFrame(dict(gini=ginis, value=values))
print(gini_df.loc[gini_df['gini'] <= gini_df['gini'].min()])

In [None]:
from sklearn import tree
import dtreeviz

# Fit a depth-1 decision tree (a single split, i.e. a "stump") on the
# training data.
# NOTE(review): no random_state is passed; consider fixing one so repeated
# runs are guaranteed to pick the same split -- confirm against the
# scikit-learn documentation.
stump_dt = tree.DecisionTreeClassifier(max_depth=1)
stump_dt.fit(X_trn, y_trn)

In [None]:
# Feature names in column order, used to label the split in the drawing.
features = list(X_trn.columns)
# Readable class labels in the same order as the fitted classes:
# class 1 -> 'home_win', any other class -> 'home_not_win'.
class_names_str = [
    'home_win' if label == 1 else 'home_not_win'
    for label in stump_dt.classes_
]

fig, ax = plt.subplots(figsize=(8, 4))
tree.plot_tree(
    stump_dt,
    feature_names=features,
    class_names=class_names_str,
    filled=True,
    ax=ax,
)

In [None]:
# Evaluate the decision-tree stump on the validation split; the returned
# score is displayed as the cell output.
stump_dt.score(X_val, y_val)

In [None]:
from sklearn import dummy
# Baseline for comparison: a DummyClassifier with default settings, fitted on
# the training split and scored on the same validation split as the stump.
dummy_model = dummy.DummyClassifier()
dummy_model.fit(X_trn, y_trn)
dummy_model.score(X_val, y_val)

In [None]:
import xgboost as xgb
# XGBoost counterpart of the stump: a single boosting round (n_estimators=1)
# of a depth-1 tree, fitted on the training split.
kag_stump = xgb.XGBClassifier(n_estimators=1, max_depth=1)
kag_stump.fit(X_trn, y_trn)

In [None]:
# Score the XGBoost stump on the validation split. The model was already
# created and fitted with the same arguments in the preceding cell, so it is
# reused here instead of being re-instantiated and retrained.
kag_stump.score(X_val, y_val)

In [None]:
from src.helper.plot_tree import my_dot_export

# Export tree 0 of the XGBoost stump (its only tree, since n_estimators=1)
# via the project helper, titled 'XGBoost Stump'.
# NOTE(review): presumably writes a Graphviz .dot file and needs an existing
# img/ directory under the working directory -- confirm against
# src.helper.plot_tree.
my_dot_export(kag_stump, num_trees=0, filename='img/stump_xg_kag.dot', title='XGBoost Stump')

In [None]:
import numpy as np
def inv_logit(p: float) -> float:
    """
    Compute the inverse logit (logistic sigmoid) of a given value.

    The inverse logit function is defined as:
        f(p) = exp(p) / (1 + exp(p))

    It is evaluated in a numerically stable form that only ever
    exponentiates a non-positive number, so very large positive or negative
    inputs return 1.0 or 0.0 instead of overflowing to ``inf / inf = nan``.

    Parameters
    ----------
    p : float
        The input value to the inverse logit function. Array-like input is
        also accepted and handled element-wise.

    Returns
    -------
    float
        The output of the inverse logit function, in the range [0, 1].
    """
    # exp(-|p|) lies in (0, 1], so it can never overflow.
    decay = np.exp(-np.abs(p))
    # For p >= 0: 1 / (1 + exp(-p)).  For p < 0: exp(p) / (1 + exp(p)).
    # Both are algebraically equal to exp(p) / (1 + exp(p)).
    numerator = np.where(np.asarray(p) >= 0, 1, decay)
    return numerator / (1 + decay)

In [None]:
# Map a hard-coded value through the inverse logit.
# NOTE(review): presumably a leaf value (log-odds) read from the exported
# XGBoost stump -- confirm where this constant comes from.
inv_logit(-0.012807931)

In [None]:
# Map a second hard-coded value through the inverse logit.
# NOTE(review): presumably the other leaf value (log-odds) of the exported
# XGBoost stump -- confirm where this constant comes from.
inv_logit(.182794467)