In [46]:
from catboost import CatBoostClassifier, Pool
import pandas as pd
import os
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, TimeSeriesSplit
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np
import glob
import os
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier
import lightgbm as lgb
import optuna

In [47]:
# Train on a single dataset folder and predict its test set.
def _find_parquet(file_names, suffix):
    """Return the first name in `file_names` that ends with `suffix`, or None."""
    return next((name for name in file_names if name.endswith(suffix)), None)


def _select_features(X, y):
    """Return the column names kept by SelectFromModel(threshold="mean").

    A default LightGBM classifier is fitted on all of `X` and its feature
    importances drive the selection.
    """
    importance_model = lgb.LGBMClassifier(random_state=10, n_jobs=16, verbose=-1)
    importance_model.fit(X, y)
    selector = SelectFromModel(importance_model, threshold="mean", prefit=True)
    return X.columns[selector.get_support()].tolist()


def _warm_start_for(num_rows):
    """Return `(n_trials, warm_start_params)` chosen by training-set size.

    Smaller datasets get more Optuna trials; each size bucket has its own
    hand-picked starting point that is enqueued as the first trial.
    """
    warm_start_small = {
        'learning_rate': 0.01290806719885743,
        'max_depth': 3,
        'num_leaves': 156,
        'n_estimators': 500,
        'min_child_samples': 32,
        'subsample': 0.9052108095701373,
        'colsample_bytree': 0.9153415375057271,
        'lambda_l1': 0.5216384377631789,
        'lambda_l2': 0.5141215730887658,
    }

    # NOTE(review): 'max_depth': -1 in the two dicts below is outside the
    # suggest_int('max_depth', 3, 13) range used by the objective; Optuna is
    # expected to accept enqueued out-of-range values with a warning - confirm
    # this is intended.
    warm_start_large = {
        'learning_rate': 0.005,
        'max_depth': -1,
        'num_leaves': 512,
        'n_estimators': 450,
        'min_child_samples': 32,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'lambda_l1': 1,
        'lambda_l2': 1,
    }

    warm_start_middle = {
        'learning_rate': 0.04,
        'max_depth': -1,
        'num_leaves': 128,
        'n_estimators': 400,
        'min_child_samples': 20,
        'subsample': 0.65,
        'colsample_bytree': 0.9,
        'lambda_l1': 1,
        # Was 0, which cannot lie inside the log-scale range [1e-6, 1.0]
        # declared for 'lambda_l2' in the objective; use the lower bound.
        'lambda_l2': 1e-6,
    }

    if num_rows < 70000:
        return 30, warm_start_small
    if num_rows < 300000:
        return 20, warm_start_middle
    return 10, warm_start_large


def fitting(path):
    """Train a tuned LightGBM model on one dataset folder and predict its test set.

    The folder at `path` must contain one file ending in ``train.parquet``
    (with columns ``target``, ``id``, ``smpl`` plus features) and one ending in
    ``test.parquet`` (with ``id`` and the same features).

    Steps: feature selection via LightGBM importances, a baseline
    cross-validated ROC-AUC printout, Optuna hyper-parameter search
    (warm-started, capped at 1200 s), final fit on the full training set.

    Parameters
    ----------
    path : str
        Path to the dataset folder.

    Returns
    -------
    pandas.DataFrame or str
        A frame with columns ``id`` and ``target`` (predicted probability of
        class 1) sorted by ``id``; or a message string when `path` cannot be
        listed or does not contain both parquet files. Callers distinguish the
        two cases by checking for ``str``.
    """
    try:
        file_names = os.listdir(path)
    except OSError:
        # Not a directory, missing, or unreadable: signal it to the caller.
        return "Не папка"

    train_name = _find_parquet(file_names, 'train.parquet')
    test_name = _find_parquet(file_names, 'test.parquet')
    if train_name is None or test_name is None:
        # E.g. a stray `.ipynb_checkpoints` folder: skip instead of crashing
        # the whole run with an IndexError.
        return "Нет train/test parquet файлов"

    print("reading data", path)
    train_data = pd.read_parquet(os.path.join(path, train_name))
    test_data = pd.read_parquet(os.path.join(path, test_name))

    # Split into features and target.
    X = train_data.drop(columns=['target', 'id', 'smpl'])
    y = train_data['target']

    # Keep only the features that pass the importance threshold.
    im_f = _select_features(X, y)
    print("Количество выбранных признаков:", len(im_f))
    print("Выбранные признаки:", im_f)

    X_selected = train_data[im_f].copy()
    X_test_selected = test_data[im_f].copy()

    # Baseline: default LightGBM, stratified 7-fold CV on the selected features.
    skf = StratifiedKFold(n_splits=7, shuffle=True, random_state=10)
    baseline_scores = cross_val_score(
        lgb.LGBMClassifier(verbose=-1), X_selected, y, cv=skf, scoring='roc_auc'
    )
    print("ROC-AUC на LightGBM с кросс-валидацией:", baseline_scores.mean())

    def objective(trial):
        """Mean cross-validated ROC-AUC for one sampled hyper-parameter set."""
        # NOTE(review): per the LightGBM docs `subsample` (bagging_fraction)
        # only takes effect when `subsample_freq` > 0, which is not set here -
        # confirm whether row subsampling was meant to be tuned.
        param = {
            'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.15),
            'max_depth': trial.suggest_int('max_depth', 3, 13),
            'num_leaves': trial.suggest_int('num_leaves', 50, 1000),
            'n_estimators': trial.suggest_int('n_estimators', 200, 700),
            'min_child_samples': trial.suggest_int('min_child_samples', 10, 90),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'lambda_l1': trial.suggest_float('lambda_l1', 1e-6, 1.0, log=True),
            'lambda_l2': trial.suggest_float('lambda_l2', 1e-6, 1.0, log=True),
        }
        candidate = lgb.LGBMClassifier(**param, verbose=-1, n_jobs=16)
        return cross_val_score(candidate, X_selected, y, cv=skf, scoring='roc_auc').mean()

    # Seed the sampler (same seed as the CV splitter) so the search is repeatable.
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=10),
    )
    num_iterations, warm_start = _warm_start_for(train_data.shape[0])
    # The warm-start point is evaluated as the first trial.
    study.enqueue_trial(dict(warm_start))
    study.optimize(objective, n_trials=num_iterations, timeout=1200)

    best_params = study.best_params
    print("Лучшие параметры:", best_params)

    # Final fit on the full training set with the best parameters found.
    final_model = lgb.LGBMClassifier(**best_params, n_jobs=16, verbose=-1)
    final_model.fit(X_selected, y)

    # Probability of the positive class for every test row.
    y_test_pred = final_model.predict_proba(X_test_selected)[:, 1]

    # Pair predictions with ids without mutating the loaded test frame.
    prediction = (
        test_data[['id']]
        .assign(target=y_test_pred)
        .sort_values(by='id', ascending=True)
    )
    return prediction

In [48]:
# Walks the data directory and writes one prediction file per dataset folder.
def model(data_dir='data', predictions_dir='predictions'):
    """Run `fitting` on every entry of `data_dir` and save the predictions.

    Parameters
    ----------
    data_dir : str, default 'data'
        Directory whose entries are dataset folders.
    predictions_dir : str, default 'predictions'
        Directory that receives ``<folder name>.csv`` for every dataset that
        produced a prediction. It is created if it does not exist.

    Returns
    -------
    None
    """
    # Make sure the output directory exists before any training starts, so a
    # long run cannot fail at the very end on a missing folder.
    os.makedirs(predictions_dir, exist_ok=True)

    # Sorted for a deterministic processing order across platforms.
    for fold in sorted(os.listdir(data_dir)):
        print("Training on", fold)
        prediction = fitting(path=os.path.join(data_dir, fold))

        # `fitting` returns a message string instead of a frame on failure.
        if isinstance(prediction, str):
            print("Невозможно создать предсказание!")
            continue

        prediction.to_csv(os.path.join(predictions_dir, f"{fold}.csv"), index=False)
        print("Предсказание создано!")

In [49]:
# Entry point: the body runs only when this code executes as __main__
if __name__ == "__main__":
    # Run the full pipeline via model()
    model()

Training on pd_ul_9
reading data C:\Users\79168\Desktop\FINAL\pd_ul_9




Количество выбранных признаков: 109
Выбранные признаки: ['feature_1', 'feature_2', 'feature_4', 'feature_6', 'feature_8', 'feature_13', 'feature_18', 'feature_22', 'feature_27', 'feature_31', 'feature_32', 'feature_33', 'feature_34', 'feature_35', 'feature_36', 'feature_39', 'feature_47', 'feature_50', 'feature_51', 'feature_53', 'feature_56', 'feature_57', 'feature_67', 'feature_72', 'feature_74', 'feature_75', 'feature_78', 'feature_86', 'feature_87', 'feature_90', 'feature_93', 'feature_99', 'feature_102', 'feature_111', 'feature_114', 'feature_115', 'feature_118', 'feature_123', 'feature_125', 'feature_129', 'feature_130', 'feature_132', 'feature_140', 'feature_143', 'feature_144', 'feature_146', 'feature_149', 'feature_150', 'feature_154', 'feature_156', 'feature_158', 'feature_167', 'feature_169', 'feature_173', 'feature_177', 'feature_178', 'feature_182', 'feature_183', 'feature_184', 'feature_186', 'feature_188', 'feature_192', 'feature_194', 'feature_207', 'feature_208', 'feat

[I 2024-12-07 18:37:23,087] A new study created in memory with name: no-name-4704d372-c210-4104-a12e-c595b0e6b0a8


ROC-AUC на LightGBM с кросс-валидацией: 0.7103090438053156


[I 2024-12-07 18:37:27,229] Trial 0 finished with value: 0.7428119041043647 and parameters: {'learning_rate': 0.01290806719885743, 'max_depth': 3, 'num_leaves': 156, 'n_estimators': 500, 'min_child_samples': 32, 'subsample': 0.9052108095701373, 'colsample_bytree': 0.9153415375057271, 'lambda_l1': 0.5216384377631789, 'lambda_l2': 0.5141215730887658}. Best is trial 0 with value: 0.7428119041043647.




[I 2024-12-07 18:37:29,715] Trial 1 finished with value: 0.7977710708696624 and parameters: {'learning_rate': 0.07496070519323274, 'max_depth': 5, 'num_leaves': 740, 'n_estimators': 451, 'min_child_samples': 47, 'subsample': 0.5628524663806997, 'colsample_bytree': 0.6967002940408294, 'lambda_l1': 0.0034973396909666805, 'lambda_l2': 1.238280249551073e-06}. Best is trial 1 with value: 0.7977710708696624.




[I 2024-12-07 18:37:32,598] Trial 2 finished with value: 0.8189115082652779 and parameters: {'learning_rate': 0.06497122311580714, 'max_depth': 5, 'num_leaves': 400, 'n_estimators': 331, 'min_child_samples': 82, 'subsample': 0.6949491220774011, 'colsample_bytree': 0.7229942999372205, 'lambda_l1': 0.00010051157263392181, 'lambda_l2': 0.0015270686778521313}. Best is trial 2 with value: 0.8189115082652779.




[I 2024-12-07 18:37:34,980] Trial 3 finished with value: 0.776054233900133 and parameters: {'learning_rate': 0.10496366569746643, 'max_depth': 13, 'num_leaves': 490, 'n_estimators': 420, 'min_child_samples': 56, 'subsample': 0.9683504499256039, 'colsample_bytree': 0.6368186561523589, 'lambda_l1': 6.780744913473282e-06, 'lambda_l2': 8.355758601427117e-05}. Best is trial 2 with value: 0.8189115082652779.




[I 2024-12-07 18:37:35,840] Trial 4 finished with value: 0.7992027984902881 and parameters: {'learning_rate': 0.08556724820043946, 'max_depth': 3, 'num_leaves': 456, 'n_estimators': 246, 'min_child_samples': 41, 'subsample': 0.5103323655531723, 'colsample_bytree': 0.5609018550837122, 'lambda_l1': 0.9181443643096244, 'lambda_l2': 0.006523504097523804}. Best is trial 2 with value: 0.8189115082652779.




[I 2024-12-07 18:37:45,499] Trial 5 finished with value: 0.7688161649636379 and parameters: {'learning_rate': 0.018735508004291506, 'max_depth': 13, 'num_leaves': 725, 'n_estimators': 459, 'min_child_samples': 48, 'subsample': 0.7827366050510294, 'colsample_bytree': 0.9867416596094993, 'lambda_l1': 6.0148064885157774e-05, 'lambda_l2': 0.00026504351079845395}. Best is trial 2 with value: 0.8189115082652779.




[I 2024-12-07 18:37:50,312] Trial 6 finished with value: 0.81893662629371 and parameters: {'learning_rate': 0.12435532486815258, 'max_depth': 6, 'num_leaves': 613, 'n_estimators': 699, 'min_child_samples': 61, 'subsample': 0.9615712409832657, 'colsample_bytree': 0.8359501959235971, 'lambda_l1': 2.07254496629882e-05, 'lambda_l2': 0.01917498210466046}. Best is trial 6 with value: 0.81893662629371.




[I 2024-12-07 18:37:55,332] Trial 7 finished with value: 0.7982788232663959 and parameters: {'learning_rate': 0.0754985794147639, 'max_depth': 9, 'num_leaves': 987, 'n_estimators': 395, 'min_child_samples': 33, 'subsample': 0.6542033136247525, 'colsample_bytree': 0.8094991131982983, 'lambda_l1': 3.541566839633794e-06, 'lambda_l2': 0.051344831845408584}. Best is trial 6 with value: 0.81893662629371.




[I 2024-12-07 18:37:58,705] Trial 8 finished with value: 0.7622978393235228 and parameters: {'learning_rate': 0.13178203105004185, 'max_depth': 10, 'num_leaves': 700, 'n_estimators': 572, 'min_child_samples': 16, 'subsample': 0.6912382055224302, 'colsample_bytree': 0.7837582422063823, 'lambda_l1': 3.664892781960188e-05, 'lambda_l2': 3.6626963016585983e-06}. Best is trial 6 with value: 0.81893662629371.




[I 2024-12-07 18:38:02,799] Trial 9 finished with value: 0.7944257703081233 and parameters: {'learning_rate': 0.0653024284901108, 'max_depth': 8, 'num_leaves': 407, 'n_estimators': 250, 'min_child_samples': 24, 'subsample': 0.7072035323562745, 'colsample_bytree': 0.7271709525970221, 'lambda_l1': 5.87891738215397e-06, 'lambda_l2': 9.484743633277157e-05}. Best is trial 6 with value: 0.81893662629371.




[I 2024-12-07 18:38:11,691] Trial 10 finished with value: 0.7705170894648941 and parameters: {'learning_rate': 0.14434315134826992, 'max_depth': 7, 'num_leaves': 58, 'n_estimators': 677, 'min_child_samples': 70, 'subsample': 0.8424672616337232, 'colsample_bytree': 0.8803799878306873, 'lambda_l1': 0.003274746447676777, 'lambda_l2': 0.8877175051167041}. Best is trial 6 with value: 0.81893662629371.




[I 2024-12-07 18:38:16,452] Trial 11 finished with value: 0.8091855709419918 and parameters: {'learning_rate': 0.0424928426271777, 'max_depth': 6, 'num_leaves': 286, 'n_estimators': 334, 'min_child_samples': 87, 'subsample': 0.9896682175277339, 'colsample_bytree': 0.8408337113669889, 'lambda_l1': 0.0002729359514723115, 'lambda_l2': 0.004235049803011655}. Best is trial 6 with value: 0.81893662629371.




[I 2024-12-07 18:38:21,443] Trial 12 finished with value: 0.8444074906958088 and parameters: {'learning_rate': 0.11925672858247591, 'max_depth': 5, 'num_leaves': 604, 'n_estimators': 684, 'min_child_samples': 89, 'subsample': 0.8025002175649082, 'colsample_bytree': 0.6450098090385614, 'lambda_l1': 0.00039884849396440116, 'lambda_l2': 0.028273646749462818}. Best is trial 12 with value: 0.8444074906958088.




[I 2024-12-07 18:38:23,632] Trial 13 finished with value: 0.8316712036927447 and parameters: {'learning_rate': 0.11546425504172861, 'max_depth': 5, 'num_leaves': 619, 'n_estimators': 700, 'min_child_samples': 68, 'subsample': 0.8363030426906544, 'colsample_bytree': 0.5088013291911327, 'lambda_l1': 0.03423555453718298, 'lambda_l2': 0.045435479213337245}. Best is trial 12 with value: 0.8444074906958088.




[I 2024-12-07 18:38:25,722] Trial 14 finished with value: 0.8473030338896123 and parameters: {'learning_rate': 0.10696883335343621, 'max_depth': 4, 'num_leaves': 933, 'n_estimators': 609, 'min_child_samples': 73, 'subsample': 0.8263374541164522, 'colsample_bytree': 0.5271035377802693, 'lambda_l1': 0.043995181994689214, 'lambda_l2': 0.08959455853014209}. Best is trial 14 with value: 0.8473030338896123.




[I 2024-12-07 18:38:28,149] Trial 15 finished with value: 0.8129593245749004 and parameters: {'learning_rate': 0.0993349761104139, 'max_depth': 3, 'num_leaves': 949, 'n_estimators': 597, 'min_child_samples': 77, 'subsample': 0.7924504736692948, 'colsample_bytree': 0.6218639322889282, 'lambda_l1': 0.03929182128939772, 'lambda_l2': 0.18443499484975556}. Best is trial 14 with value: 0.8473030338896123.




[I 2024-12-07 18:38:30,101] Trial 16 finished with value: 0.8183293223392641 and parameters: {'learning_rate': 0.14151552791472669, 'max_depth': 4, 'num_leaves': 858, 'n_estimators': 609, 'min_child_samples': 89, 'subsample': 0.8769829117586343, 'colsample_bytree': 0.5006276143222754, 'lambda_l1': 0.03971888778588942, 'lambda_l2': 0.10186800818937469}. Best is trial 14 with value: 0.8473030338896123.




[I 2024-12-07 18:38:34,666] Trial 17 finished with value: 0.8214160781683565 and parameters: {'learning_rate': 0.09966036424885873, 'max_depth': 11, 'num_leaves': 842, 'n_estimators': 531, 'min_child_samples': 76, 'subsample': 0.7689378619602234, 'colsample_bytree': 0.5962386144007847, 'lambda_l1': 0.0007482844278592469, 'lambda_l2': 0.011755403431953516}. Best is trial 14 with value: 0.8473030338896123.




[I 2024-12-07 18:38:37,353] Trial 18 finished with value: 0.8056032929603766 and parameters: {'learning_rate': 0.11404597329905201, 'max_depth': 7, 'num_leaves': 835, 'n_estimators': 647, 'min_child_samples': 69, 'subsample': 0.6148618259550485, 'colsample_bytree': 0.6810950522584998, 'lambda_l1': 0.004758642917063039, 'lambda_l2': 0.0011183213003218074}. Best is trial 14 with value: 0.8473030338896123.




[I 2024-12-07 18:38:40,119] Trial 19 finished with value: 0.7882057048171383 and parameters: {'learning_rate': 0.04520672836197548, 'max_depth': 4, 'num_leaves': 290, 'n_estimators': 640, 'min_child_samples': 61, 'subsample': 0.9199346253448498, 'colsample_bytree': 0.5638262518836179, 'lambda_l1': 0.22334037815725397, 'lambda_l2': 0.28234219387785336}. Best is trial 14 with value: 0.8473030338896123.




[I 2024-12-07 18:38:43,182] Trial 20 finished with value: 0.8006203232466695 and parameters: {'learning_rate': 0.08969996114926405, 'max_depth': 7, 'num_leaves': 633, 'n_estimators': 549, 'min_child_samples': 90, 'subsample': 0.8337369480400921, 'colsample_bytree': 0.6618199374340129, 'lambda_l1': 0.0093596420662575, 'lambda_l2': 0.028348012910053174}. Best is trial 14 with value: 0.8473030338896123.




[I 2024-12-07 18:38:45,307] Trial 21 finished with value: 0.8266423376862483 and parameters: {'learning_rate': 0.12095199974173061, 'max_depth': 5, 'num_leaves': 614, 'n_estimators': 700, 'min_child_samples': 68, 'subsample': 0.8269942762842298, 'colsample_bytree': 0.5041473604308714, 'lambda_l1': 0.05434920887916685, 'lambda_l2': 0.07210515777596116}. Best is trial 14 with value: 0.8473030338896123.




[I 2024-12-07 18:38:48,231] Trial 22 finished with value: 0.81407674807012 and parameters: {'learning_rate': 0.11592394932121669, 'max_depth': 4, 'num_leaves': 552, 'n_estimators': 639, 'min_child_samples': 78, 'subsample': 0.7442507703995447, 'colsample_bytree': 0.5715957453929739, 'lambda_l1': 0.0006399220600306889, 'lambda_l2': 0.0036722168440030374}. Best is trial 14 with value: 0.8473030338896123.




[I 2024-12-07 18:38:50,405] Trial 23 finished with value: 0.8440458436895886 and parameters: {'learning_rate': 0.13281793818579934, 'max_depth': 6, 'num_leaves': 914, 'n_estimators': 659, 'min_child_samples': 83, 'subsample': 0.8748372598812126, 'colsample_bytree': 0.5300296579757036, 'lambda_l1': 0.016770305613308226, 'lambda_l2': 0.025551457686043593}. Best is trial 14 with value: 0.8473030338896123.




[I 2024-12-07 18:38:52,294] Trial 24 finished with value: 0.7785657737273315 and parameters: {'learning_rate': 0.1308115481099176, 'max_depth': 6, 'num_leaves': 914, 'n_estimators': 603, 'min_child_samples': 81, 'subsample': 0.8921364102726158, 'colsample_bytree': 0.5379366490758488, 'lambda_l1': 0.16562147610097344, 'lambda_l2': 0.22917762179932682}. Best is trial 14 with value: 0.8473030338896123.




[I 2024-12-07 18:38:54,639] Trial 25 finished with value: 0.8245718756986362 and parameters: {'learning_rate': 0.14765875075035384, 'max_depth': 8, 'num_leaves': 791, 'n_estimators': 650, 'min_child_samples': 83, 'subsample': 0.7296891006821875, 'colsample_bytree': 0.6115777665774254, 'lambda_l1': 0.011660792061286071, 'lambda_l2': 0.014041253241994504}. Best is trial 14 with value: 0.8473030338896123.




[I 2024-12-07 18:38:56,623] Trial 26 finished with value: 0.7887686905748215 and parameters: {'learning_rate': 0.13761364986959643, 'max_depth': 4, 'num_leaves': 891, 'n_estimators': 505, 'min_child_samples': 74, 'subsample': 0.9358024161490326, 'colsample_bytree': 0.5907699826031847, 'lambda_l1': 0.0017842487119946045, 'lambda_l2': 0.0002813162694963901}. Best is trial 14 with value: 0.8473030338896123.




[I 2024-12-07 18:39:02,389] Trial 27 finished with value: 0.8447878118383504 and parameters: {'learning_rate': 0.10710608351704745, 'max_depth': 6, 'num_leaves': 985, 'n_estimators': 578, 'min_child_samples': 61, 'subsample': 0.8659450625311218, 'colsample_bytree': 0.5412692504311626, 'lambda_l1': 0.00025748007369392226, 'lambda_l2': 0.1171143877312783}. Best is trial 14 with value: 0.8473030338896123.




[I 2024-12-07 18:39:09,609] Trial 28 finished with value: 0.8204554122118332 and parameters: {'learning_rate': 0.08787989666226396, 'max_depth': 7, 'num_leaves': 975, 'n_estimators': 573, 'min_child_samples': 60, 'subsample': 0.7958063401217346, 'colsample_bytree': 0.6514987002703406, 'lambda_l1': 0.00016723466011908268, 'lambda_l2': 0.13270460315610808}. Best is trial 14 with value: 0.8473030338896123.




[I 2024-12-07 18:39:12,844] Trial 29 finished with value: 0.8080586788706093 and parameters: {'learning_rate': 0.10563327401816913, 'max_depth': 3, 'num_leaves': 285, 'n_estimators': 489, 'min_child_samples': 55, 'subsample': 0.8608591618486459, 'colsample_bytree': 0.552672815109823, 'lambda_l1': 0.00031166314116054674, 'lambda_l2': 0.6011004821628853}. Best is trial 14 with value: 0.8473030338896123.


Лучшие параметры: {'learning_rate': 0.10696883335343621, 'max_depth': 4, 'num_leaves': 933, 'n_estimators': 609, 'min_child_samples': 73, 'subsample': 0.8263374541164522, 'colsample_bytree': 0.5271035377802693, 'lambda_l1': 0.043995181994689214, 'lambda_l2': 0.08959455853014209}
Предсказание создано!
