In [None]:
# Plot text on a graph

def plot_text(ax: plt.Axes):
    """
    Label every bar of a barplot with its height written as a percentage.

    Each patch in ``ax.patches`` receives a text such as ``12.3%`` anchored at
    the horizontal middle of the bar, at the bar's height, and shifted
    10 points upwards.
    """
    for bar in ax.patches:
        bar_height = bar.get_height()
        # anchor point: middle of the bar along x, bar height along y
        anchor = (bar.get_x() + bar.get_width() / 2., bar_height)
        ax.annotate(
            '{:.1f}%'.format(bar_height),
            anchor,
            ha='center',
            va='center',
            # the label is offset from the anchor, measured in points
            xytext=(0, 10),
            textcoords='offset points',
            fontsize=14)




In [None]:
def barplot_group_by_category(df_data: pd.DataFrame, col_main: str, col_group: str,
                              title: str) -> None:
    """
    Draw a grouped barplot of normalized shares with a label on every bar.

    For each value of ``col_main`` the distribution of ``col_group`` is
    computed as percentages (normalized within the ``col_main`` value).
    ``col_main`` is placed on the x axis and ``col_group`` is used as hue.
    The figure is shown immediately; nothing is returned.
    """
    plt.figure(figsize=(15, 6))

    # share of every col_group value inside each col_main value, in percent
    shares = df_data.groupby([col_main])[col_group].value_counts(normalize=True)
    shares = shares.rename('percentage').mul(100)
    plot_data = shares.reset_index().sort_values(col_main)

    ax = sns.barplot(data=plot_data,
                     x=col_main,
                     y="percentage",
                     hue=col_group,
                     palette='rocket')

    # write the percentage slightly above the anchor point of each bar
    for bar in ax.patches:
        bar_height = bar.get_height()
        anchor = (bar.get_x() + bar.get_width() / 2., bar_height)
        ax.annotate(
            '{:.1f}%'.format(bar_height),
            anchor,
            ha='center',
            va='center',
            xytext=(0, 7),
            textcoords='offset points',
            fontsize=12)

    plt.title(title, fontsize=16)
    plt.xlabel(col_main, fontsize=14)
    plt.ylabel('Percentage', fontsize=14)
    plt.show()


def barplot_group_within_targetvar(df_data: pd.DataFrame, col_main: str, col_group: str,
                                   title: str) -> None:
    """
    Draw a grouped barplot of how ``col_main`` is distributed inside each
    ``col_group`` value, with a percentage label on every bar.

    Percentages are normalized within each ``col_group`` value. ``col_main``
    is placed on the x axis and ``col_group`` is used as hue. The figure is
    shown immediately; nothing is returned.
    """
    plt.figure(figsize=(15, 6))

    # share of every col_main value inside each col_group value, in percent
    shares = df_data.groupby([col_group])[col_main].value_counts(normalize=True)
    plot_data = (shares.rename('percentage')
                 .mul(100)
                 .reset_index()
                 .sort_values(col_group))

    ax = sns.barplot(data=plot_data,
                     x=col_main,
                     y="percentage",
                     hue=col_group,
                     palette='rocket')

    for bar in ax.patches:
        bar_height = bar.get_height()
        bar_middle = bar.get_x() + bar.get_width() / 2.
        ax.annotate(
            '{:.1f}%'.format(bar_height),
            (bar_middle, bar_height),
            ha='center',
            va='center',
            xytext=(0, 7),
            textcoords='offset points',
            fontsize=12)

    plt.title(title, fontsize=16)
    plt.xlabel(col_main, fontsize=14)
    plt.ylabel('Percentage', fontsize=14)
    plt.show()

In [None]:
def barplot_group_within_targetvar(df_data: pd.DataFrame, col_main: str, col_group: str,
                  title: str) -> None:
    """
    Plot the percentage split of ``col_main`` within every ``col_group`` value
    as a grouped barplot and annotate each bar with its percentage.

    NOTE: a function with this same name is also defined earlier in this
    file; being defined later, this definition is the one that stays bound.
    """
    plt.figure(figsize=(15, 6))

    # normalized counts of col_main inside each col_group value, as percent
    counts = df_data.groupby([col_group])[col_main].value_counts(normalize=True)
    percent = counts.rename('percentage').mul(100)
    table = percent.reset_index().sort_values(col_group)

    ax = sns.barplot(x=col_main, y="percentage", hue=col_group,
                     data=table, palette='rocket')

    label_style = dict(ha='center',
                       va='center',
                       xytext=(0, 7),
                       textcoords='offset points',
                       fontsize=12)
    for patch in ax.patches:
        value = patch.get_height()
        position = (patch.get_x() + patch.get_width() / 2., value)
        ax.annotate('{:.1f}%'.format(value), position, **label_style)

    plt.title(title, fontsize=16)
    plt.ylabel('Percentage', fontsize=14)
    plt.xlabel(col_main, fontsize=14)
    plt.show()

In [None]:
# checking for multicollinearity - VIF (Variance Inflation Factor)
# tells you how much the variance of a regression coefficient is inflated due to multicollinearity

def compute_vif(df):
    """
    Compute the Variance Inflation Factor (VIF) for every column of ``df``.

    Rows with missing values are dropped first. Each column is then regressed
    on all remaining columns with ``LinearRegression`` and its VIF is
    ``1 / (1 - R^2)``; when ``R^2`` is not below 1 the VIF is reported as
    infinity.

    Returns a DataFrame with the columns ``Variable`` and ``VIF``.
    """
    features = df.dropna()
    rows = []

    for position, column in enumerate(features.columns):
        target = features.iloc[:, position]
        predictors = features.drop(column, axis=1)

        regression = LinearRegression().fit(predictors, target)
        r2 = regression.score(predictors, target)

        if r2 < 1:
            vif = 1 / (1 - r2)
        else:
            vif = float('inf')

        rows.append({'Variable': column, 'VIF': vif})

    return pd.DataFrame(rows)

In [None]:
# chi-square test and Cramer's V functions for categorical variables association

def cramers_v(confusion_matrix):
    """Compute the bias-corrected Cramér's V from a contingency table.

    Parameters
    ----------
    confusion_matrix : 2-D array-like (e.g. the result of ``pd.crosstab``)
        Observed frequencies of two categorical variables.

    Returns
    -------
    float
        Strength of association. ``0.0`` is returned for degenerate tables
        where the statistic is undefined: a single row or column, at most one
        observation, or so few observations that a corrected dimension is
        not positive. Without the guard these cases divide by zero and
        produce NaN.
    """
    table = np.asarray(confusion_matrix)
    r, k = table.shape
    n = table.sum()

    # one row/column or a single observation: no association can be measured
    if min(r, k) < 2 or n <= 1:
        return 0.0

    chi2 = stats.chi2_contingency(table)[0]
    phi2 = chi2 / n

    # bias correction of phi^2 and of the table dimensions
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)

    denominator = min(kcorr - 1, rcorr - 1)
    # with very few observations the corrected dimension can collapse to <= 0
    if denominator <= 0:
        return 0.0

    return np.sqrt(phi2corr / denominator)


def categorical_assoc_with_churn(df, churn_col='Churn'):
    """
    Measure the association of every categorical feature with the target.

    For each ``object`` / ``category`` column of ``df`` (except ``churn_col``)
    a contingency table against ``churn_col`` is built, and the Chi-square
    test plus Cramér's V are computed on it.

    Parameters
    ----------
    df : pd.DataFrame
        Data containing the features and the target column.
    churn_col : str
        Name of the target column.

    Returns
    -------
    pd.DataFrame
        Columns ``Feature``, ``Chi2``, ``p_value``, ``Cramers_V``, sorted by
        ``Cramers_V`` in descending order. When ``df`` has no categorical
        feature an empty frame with these columns is returned.
    """
    result_columns = ['Feature', 'Chi2', 'p_value', 'Cramers_V']
    results = []

    for col in df.select_dtypes(include=['object', 'category']).columns:
        if col == churn_col:
            continue  # the target is not compared with itself

        contingency_table = pd.crosstab(df[churn_col], df[col])

        # only the statistic and the p-value are reported
        chi2, p, _dof, _expected = stats.chi2_contingency(contingency_table)

        results.append({
            'Feature': col,
            'Chi2': chi2,
            'p_value': p,
            'Cramers_V': cramers_v(contingency_table)
        })

    # explicit columns keep sort_values working when no feature was processed
    summary = pd.DataFrame(results, columns=result_columns)
    return summary.sort_values('Cramers_V', ascending=False)


In [None]:
# evaluation metrics : Regression & Classification problems



def r2_adjusted(y_true: np.ndarray, y_pred: np.ndarray,
                X_test: np.ndarray) -> float:
    """
    Adjusted R^2: the R^2 score penalized by the number of features.

    The number of objects is ``len(y_true)`` and the number of features is
    ``X_test.shape[1]``.
    """
    n_objects = len(y_true)
    n_features = X_test.shape[1]
    unexplained = 1 - r2_score(y_true, y_pred)
    residual_dof = n_objects - n_features - 1
    return 1 - unexplained * (n_objects - 1) / residual_dof


def mpe(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Mean percentage error: mean of (true - predicted) / true, in percent."""
    relative_error = (y_true - y_pred) / y_true
    return np.mean(relative_error) * 100


def mape(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Mean absolute percentage error: mean of |predicted - true| / |true|, in percent."""
    relative_error = (y_pred - y_true) / y_true
    absolute_share = np.abs(relative_error)
    return np.mean(absolute_share) * 100


def wape(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Weighted absolute percent error: total absolute error over the sum of true values, in percent."""
    total_abs_error = np.sum(np.abs(y_pred - y_true))
    total_true = np.sum(y_true)
    return total_abs_error / total_true * 100


def huber_loss(y_true: np.ndarray, y_pred: np.ndarray, delta: float = 1.345):
    """Mean Huber loss.

    Residuals with absolute value up to ``delta`` contribute
    ``0.5 * residual**2``; larger residuals contribute
    ``delta * (|residual| - 0.5 * delta)``.

    Inputs are converted to flat numpy arrays and paired by position, so a
    pandas Series with a non-default index (e.g. after a train/test split) is
    handled correctly instead of being looked up by label.

    Parameters
    ----------
    y_true, y_pred : array-like of equal size
        True and predicted values.
    delta : float
        Threshold between the quadratic and the linear part of the loss.

    Returns
    -------
    float
        Average loss over all observations.

    Raises
    ------
    ValueError
        If the inputs differ in size or are empty.
    """
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()

    if actual.size != predicted.size:
        raise ValueError('difference size of data')
    if actual.size == 0:
        raise ValueError('empty data')

    residual = np.abs(actual - predicted)
    quadratic = 0.5 * residual ** 2
    linear = delta * (residual - 0.5 * delta)
    return float(np.mean(np.where(residual <= delta, quadratic, linear)))


def logcosh(y_true: np.ndarray, y_pred: np.ndarray):
    """Log-Cosh loss: sum of ``log(cosh(y_true - y_pred))`` over all observations.

    The value is computed through the identity
    ``log(cosh(x)) = |x| + log1p(exp(-2|x|)) - log(2)``,
    which stays finite for large residuals where ``np.cosh`` itself
    overflows to infinity.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted values, paired by position.

    Returns
    -------
    numpy float
        Sum (not mean) of the per-observation losses.
    """
    residual = np.abs(
        np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float))
    return np.sum(residual + np.log1p(np.exp(-2.0 * residual)) - np.log(2.0))


def rmsle(y_true: np.ndarray, y_pred: np.ndarray) -> np.float64:
    """
    The Root Mean Squared Log Error (RMSLE) metric.

    Returns the square root of ``mean_squared_log_error(y_true, y_pred)``.
    When the metric rejects the inputs with a ``ValueError`` (presumably the
    case of negative values - confirm against the scikit-learn docs),
    ``None`` is returned instead so that a metrics table can still be built.
    Any other error is propagated rather than silently hidden.
    """
    try:
        return np.sqrt(mean_squared_log_error(y_true, y_pred))
    except ValueError:
        # metric not applicable to these inputs: report it as missing
        return None


def get_metrics_regression(y_test: np.ndarray,
                           y_pred: np.ndarray,
                           X_test: np.ndarray,
                           name: str = None,
                           delta: float = 1.345):
    """
    Build a one-row table of regression metrics for a model.

    The row holds the model ``name`` followed by MAE, MSE, RMSE, RMSLE,
    adjusted R2, MPE, MAPE and WAPE. ``X_test`` is only needed for the
    adjusted R2. ``delta`` is currently unused: the Huber and Log-Cosh
    columns are disabled.
    """
    mse = mean_squared_error(y_test, y_pred)

    # insertion order defines the column order of the resulting table
    metric_values = {
        'MAE': mean_absolute_error(y_test, y_pred),
        'MSE': mse,
        'RMSE': np.sqrt(mse),
        'RMSLE': rmsle(y_test, y_pred),
        'R2 adjusted': r2_adjusted(y_test, y_pred, X_test),
        # disabled: 'Huber_loss': huber_loss(y_test, y_pred, delta)
        # disabled: 'Logcosh': logcosh(y_test, y_pred)
        'MPE_%': mpe(y_test, y_pred),
        'MAPE_%': mape(y_test, y_pred),
        'WAPE_%': wape(y_test, y_pred),
    }

    df_metrics = pd.DataFrame()
    df_metrics['model'] = [name]
    for column, value in metric_values.items():
        df_metrics[column] = value

    return df_metrics


def get_metrics_classification(y_test, y_pred, y_score, name):
    """
    Build a one-row table of classification metrics for a model.

    ``y_pred`` feeds Accuracy, Precision, Recall and f1; ``y_score`` is
    passed as-is (no column is selected from it) to ROC_AUC and Logloss.
    """
    # insertion order defines the column order of the resulting table
    metric_values = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'ROC_AUC': roc_auc_score(y_test, y_score),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred),
        'Logloss': log_loss(y_test, y_score),
    }

    df_metrics = pd.DataFrame()
    df_metrics['model'] = [name]
    for column, value in metric_values.items():
        df_metrics[column] = value

    return df_metrics

In [None]:
def check_overfitting_classification(model, X_train, y_train, X_test, y_test, metric_fun):
    """
    Print a metric on the train and test sets and their relative gap.

    Parameters:
    - model: fitted model exposing ``predict``.
    - X_train, y_train: training features and labels.
    - X_test, y_test: test features and labels.
    - metric_fun: callable ``metric_fun(y_true, y_pred)``; its ``__name__``
      is used in the printed lines.

    The gap is ``|train - test| / test`` expressed in percent. Nothing is
    returned.
    """
    train_predictions = model.predict(X_train)
    test_predictions = model.predict(X_test)

    value_train = metric_fun(y_train, train_predictions)
    value_test = metric_fun(y_test, test_predictions)

    # relative difference with respect to the test value
    gap_percent = abs(value_train - value_test) / value_test * 100

    print(f'{metric_fun.__name__} train: %.3f' % value_train)
    print(f'{metric_fun.__name__} test: %.3f' % value_test)
    print(f'delta = {gap_percent:.1f} %')


In [None]:
# Catboost for Classification


def objective_lgb(trial, X, y, N_FOLDS, random_state, cat_feat):
    """
    Optuna objective: mean cross-validated ROC AUC of a CatBoostClassifier.

    Despite its name the function tunes CatBoost. Hyperparameters are sampled
    from ``trial``, a model is trained on every fold of a shuffled K-fold
    split with early stopping on the held-out fold, and the fold ROC AUC
    values are averaged.

    Parameters
    ----------
    trial : optuna trial used to sample hyperparameters.
    X : pd.DataFrame with the features (indexed positionally via ``iloc``).
    y : pd.Series with the target (binary target assumed, since the
        probability of the second class is scored).
    N_FOLDS : int, number of cross-validation folds.
    random_state : int, seed used for both the model and the fold split.
    cat_feat : categorical features passed to ``Pool``.

    Returns
    -------
    float
        Mean ROC AUC over the folds.
    """
    params = {
        # fixed values are registered as single-choice categoricals so they
        # are still recorded among the trial parameters
        "n_estimators": trial.suggest_categorical("n_estimators", [300]),
        "learning_rate": trial.suggest_categorical("learning_rate", [0.08797829241393999]),
        # alternative: trial.suggest_float("learning_rate", 0.001, 0.3, log=True)
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.5, 1.0),
        # suggest_float replaces the deprecated suggest_uniform
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-5, 1e2),
        'random_strength': trial.suggest_float('random_strength', 10, 50),
        "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS", "No"]),
        'border_count': trial.suggest_categorical('border_count', [128, 254]),
        'grow_policy': trial.suggest_categorical('grow_policy', ["SymmetricTree", "Depthwise", "Lossguide"]),

        'od_wait': trial.suggest_int('od_wait', 500, 2000),
        'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations', 1, 15),
        "use_best_model": trial.suggest_categorical("use_best_model", [True]),
        "eval_metric": trial.suggest_categorical("eval_metric", ["AUC"]),
        "random_state": random_state
    }

    # parameters that are only sampled for specific bootstrap types
    if params["bootstrap_type"] == "Bayesian":
        params["bagging_temperature"] = trial.suggest_float(
            "bagging_temperature", 0, 100)
    elif params["bootstrap_type"] == "Bernoulli":
        params["subsample"] = trial.suggest_float(
            "subsample", 0.1, 1, log=True)

    # the split is seeded with the argument, not with a module-level constant
    cv = KFold(n_splits=N_FOLDS, shuffle=True, random_state=random_state)

    fold_scores = np.empty(N_FOLDS)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        train_data = Pool(data=X_train, label=y_train, cat_features=cat_feat)
        eval_data = Pool(data=X_test, label=y_test, cat_features=cat_feat)

        model = CatBoostClassifier(**params)
        model.fit(train_data,
                  eval_set=[eval_data],
                  early_stopping_rounds=100,
                  verbose=0)

        # ROC AUC needs scores, not hard labels: use the positive-class probability
        positive_proba = model.predict_proba(eval_data)[:, 1]
        fold_scores[idx] = roc_auc_score(y_test, positive_proba)

    return np.mean(fold_scores)

In [None]:
def cross_validation_cat(X_train: pd.DataFrame,
                         y_train: pd.Series,
                         X_test: pd.DataFrame,
                         y_test: pd.Series,
                         clf,
                         params: dict,
                         cat_features: list = None,
                         eval_metric: str = None,
                         early_stop: bool = False,
                         early_stopping_rounds: int = 100,
                         num_folds: int = 3,
                         random_state: int = 10,
                         shuffle: bool = True):
    """
    K-fold cross-validation of a CatBoost classifier (no tuning).

    For every fold a fresh ``clf(**params)`` model is trained on the training
    part, scored with ROC AUC on the validation part and used to predict
    ``X_test``.

    Parameters
    ----------
    X_train, y_train : data that is split into folds.
    X_test : hold-out features predicted by every fold model.
    y_test : accepted for interface compatibility; not used here.
    clf : model class, instantiated as ``clf(**params)``.
    params : dict of constructor parameters (not modified).
    cat_features : categorical features passed to ``fit``.
    eval_metric : optional metric name; when early stopping is enabled it is
        passed to the model constructor, because CatBoost takes
        ``eval_metric`` there rather than in ``fit``.
    early_stop : whether to train with an evaluation set and early stopping.
    early_stopping_rounds : patience used when ``early_stop`` is set.
    num_folds : number of folds.
    random_state : seed of the split; only applied when ``shuffle`` is true.
    shuffle : whether to shuffle the data before splitting.

    Returns
    -------
    (score_oof, predictions_test)
        ``score_oof`` is the list of per-fold validation ROC AUC values
        (computed from positive-class probabilities; binary target assumed);
        ``predictions_test`` is the list of per-fold ``predict`` outputs
        for ``X_test``.
    """
    # KFold rejects a random_state when shuffle is disabled
    folds = KFold(n_splits=num_folds,
                  random_state=random_state if shuffle else None,
                  shuffle=shuffle)
    score_oof = []
    predictions_test = []

    model_params = dict(params)
    if early_stop and eval_metric is not None:
        model_params['eval_metric'] = eval_metric

    for fold, (train_index,
               test_index) in enumerate(folds.split(X_train, y_train)):
        X_train_, X_val = X_train.iloc[train_index], X_train.iloc[test_index]
        y_train_, y_val = y_train.iloc[train_index], y_train.iloc[test_index]

        model = clf(**model_params)

        fit_kwargs = {'cat_features': cat_features}
        if early_stop:
            fit_kwargs.update(eval_set=[(X_val, y_val)],
                              silent=True,
                              early_stopping_rounds=early_stopping_rounds)
        model.fit(X_train_, y_train_, **fit_kwargs)

        # ROC AUC needs scores, not hard labels: use the positive-class probability
        fold_score = roc_auc_score(y_val, model.predict_proba(X_val)[:, 1])
        y_pred = model.predict(X_test)

        print(
            "Fold:", fold + 1,
            "ROC-AUC SCORE %.3f" % fold_score)
        print("---")

        # oof list
        score_oof.append(fold_score)
        # holdout list
        predictions_test.append(y_pred)

    return score_oof, predictions_test