In [1]:
import pandas as pd
import numpy as np


In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

import kagglehub


def prepare_titanic_data():
    # Download latest version
    path = kagglehub.dataset_download("yasserh/titanic-dataset")

    df = pd.read_csv(path + "/Titanic-Dataset.csv")
    display(df)

    numeric_features = ["Age", "SibSp", "Parch", "Fare"]
    categorical_features = ["Pclass", "Sex", "Embarked"]

    numeric_transformer = Pipeline(
        steps=[("imputer", SimpleImputer(strategy="median"))]
    )

    categorical_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(drop="first")),
        ]
    )

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features),
        ]
    )

    processed_df = df.drop(["PassengerId", "Name", "Ticket", "Cabin"], axis=1)

    Y = processed_df["Survived"].to_numpy()
    X = processed_df.drop(["Survived"], axis=1)

    X_transformed = preprocessor.fit_transform(X)

    return (
        X_transformed,
        Y,
    )


# Вызываем функцию и получаем данные
X, y = prepare_titanic_data()

# Для отображения результата
X




Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


array([[22.,  1.,  0., ...,  1.,  0.,  1.],
       [38.,  1.,  0., ...,  0.,  0.,  0.],
       [26.,  0.,  0., ...,  0.,  0.,  1.],
       ...,
       [28.,  1.,  2., ...,  0.,  0.,  1.],
       [26.,  0.,  0., ...,  1.,  0.,  0.],
       [32.,  0.,  0., ...,  1.,  1.,  0.]])

In [3]:
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_test


array([[28.,  0.,  0., ...,  1.,  0.,  0.],
       [28.,  0.,  0., ...,  1.,  0.,  1.],
       [ 7.,  4.,  1., ...,  1.,  1.,  0.],
       ...,
       [31.,  1.,  0., ...,  0.,  0.,  0.],
       [23.,  0.,  0., ...,  1.,  0.,  1.],
       [19.,  0.,  0., ...,  1.,  0.,  1.]])

In [4]:
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(random_state=0, criterion="gini")
clf.fit(X_train, y_train)

print(classification_report(y_test, clf.predict(X_test)))


              precision    recall  f1-score   support

           0       0.81      0.87      0.84       110
           1       0.77      0.68      0.72        69

    accuracy                           0.80       179
   macro avg       0.79      0.78      0.78       179
weighted avg       0.80      0.80      0.80       179



In [5]:
from sklearn.tree import DecisionTreeRegressor
from scipy.optimize import minimize


class CustomGradientBoosting:
    def sigmoid(self, x: np.ndarray):
        return 1 / (1 + np.exp(-x))

    def log_loss(self, y_true: np.ndarray, y_pred: np.ndarray):
        eps = 1e-15
        y_pred = np.clip(y_pred, eps, 1 - eps)
        return -np.sum(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))

    def log_loss_gradient(self, y_true: np.ndarray, y_pred: np.ndarray):
        eps = 1e-15
        y_pred = np.clip(y_pred, eps, 1 - eps)
        return -(y_true / y_pred - (1 - y_true) / (1 - y_pred))

    def __init__(
        self,
        n_estimators=100,
        BaseAlgorithmType: type[DecisionTreeRegressor] = DecisionTreeRegressor,
        learning_rate=0.02,
        max_depth=5,
    ):
        self.n_estimators = n_estimators
        self.BaseAlgorithmType = BaseAlgorithmType
        self.learning_rate = learning_rate
        self.max_depth = max_depth

    def fit(self, X, y):
        self._base_algorithms = []
        self._alphas = []
        pred_y = np.zeros_like(y, dtype=np.float64)

        for t in range(0, self.n_estimators):
            base_algorithm = DecisionTreeRegressor(
                criterion="squared_error", max_depth=5
            )
            base_algorithm.fit(X, -self.log_loss_gradient(y, self.sigmoid(pred_y)))
            alpha = minimize(
                lambda alpha: self.log_loss(
                    y,
                    self.sigmoid(pred_y + alpha * base_algorithm.predict(X)),
                ),
                0.01,
                method="L-BFGS-B",
                bounds=[(0.001, 10)],
                options={"ftol": 1e-6, "gtol": 1e-6},
            ).x[0]

            pred_y += self.learning_rate * alpha * base_algorithm.predict(X)
            self._base_algorithms.append(base_algorithm)
            self._alphas.append(alpha)

    def predict_proba(self, X):
        predictions = np.zeros(len(X))
        for base_algorithm, alpha in zip(self._base_algorithms, self._alphas):
            predictions += self.learning_rate * alpha * base_algorithm.predict(X)
        return self.sigmoid(predictions)

    def predict(self, X):
        return np.where(self.predict_proba(X) >= 0.5, 1, 0)


In [6]:
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import KFold
import sklearn


def perform_cross_validation(
    X: np.ndarray,
    y: np.ndarray,
    model: sklearn.base.ClassifierMixin,
    n_splits: int = 5,
    random_state: int = 0,
) -> dict:
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    cv_scores = []

    for train_index, test_index in kf.split(X):
        X_cv_train, X_cv_test = X[train_index], X[test_index]
        y_cv_train, y_cv_test = y[train_index], y[test_index]

        model.fit(X_cv_train, y_cv_train)

        y_cv_pred = model.predict(X_cv_test)
        f1 = f1_score(y_cv_test, y_cv_pred)
        cv_scores.append(f1)

    results = {"scores": cv_scores, "mean_score": np.mean(cv_scores)}

    return results


custom_gradient_boosting = CustomGradientBoosting(
    n_estimators=50,
)
custom_gradient_boosting.fit(X_train, y_train)
cv_results = perform_cross_validation(X, y, custom_gradient_boosting)

# Вывод результатов кросс-валидации
print("Результаты кросс-валидации (F1-score):")
print(f"Отдельные оценки: {cv_results['scores']}")
print(f"Средняя F1-мера: {cv_results['mean_score']:.4f}")


Результаты кросс-валидации (F1-score):
Отдельные оценки: [0.7666666666666667, 0.7022900763358778, 0.7394957983193278, 0.746031746031746, 0.762589928057554]
Средняя F1-мера: 0.7434


In [7]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score

# Создаем модель градиентного бустинга из sklearn
sklearn_gradient_boosting = GradientBoostingClassifier(
    n_estimators=50,
    learning_rate=0.02,
    max_depth=3,
    random_state=0,
)
sklearn_gradient_boosting.fit(X_train, y_train)
cv_results = perform_cross_validation(X, y, sklearn_gradient_boosting)

# Вывод результатов кросс-валидации
print("Результаты кросс-валидации (F1-score):")
print(f"Отдельные оценки: {cv_results['scores']}")
print(f"Средняя F1-мера: {cv_results['mean_score']:.4f}")


Результаты кросс-валидации (F1-score):
Отдельные оценки: [0.7563025210084033, 0.6788990825688074, 0.7226890756302521, 0.7068965517241379, 0.6976744186046512]
Средняя F1-мера: 0.7125


In [8]:
%timeit sklearn_gradient_boosting.fit(X_train, y_train)


30.4 ms ± 726 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [9]:
%timeit custom_gradient_boosting.fit(X_train, y_train)


154 ms ± 2.6 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
