In [None]:
import glob
from catboost.utils import get_gpu_device_count
import pandas as pd
import numpy as np

In [2]:
# Choose the compute device from CatBoost's GPU probe: a truthy
# (non-zero) result selects 'GPU', anything else falls back to 'CPU'.
is_gpu_available = get_gpu_device_count()
if is_gpu_available:
    device = 'GPU'
else:
    device = 'CPU'

device

'GPU'

In [None]:
# Directories whose *.csv files make up the train and test sets.
# NOTE(review): absolute, machine-specific paths — adjust for your environment.
TRAIN_PATH = r'/data/Кейс-3. Отток юридических лиц из расчетно-кассового обслуживания/train'
TEST_PATH = r'/data/Кейс-3. Отток юридических лиц из расчетно-кассового обслуживания/test'

In [4]:
def get_data(path):
    """Load every CSV file directly inside ``path`` into one DataFrame.

    Files are read in sorted filename order so that the row order of the
    result is reproducible; ``glob.glob`` on its own returns matches in an
    arbitrary, file-system dependent order.

    Parameters
    ----------
    path : str
        Directory that contains the ``*.csv`` files.

    Returns
    -------
    pandas.DataFrame
        All files stacked row-wise, re-indexed from 0.

    Raises
    ------
    ValueError
        If ``path`` contains no ``*.csv`` file.
    """
    filenames = sorted(glob.glob(path + '/*.csv'))

    if not filenames:
        # pd.concat would also raise ValueError here, but with the opaque
        # message "No objects to concatenate"; name the directory instead.
        raise ValueError(f'No CSV files found in {path!r}')

    frames = [pd.read_csv(filename) for filename in filenames]
    return pd.concat(frames, ignore_index=True)

# Read all CSV files of each split into a single DataFrame per split.
train_df = get_data(TRAIN_PATH)
test_df = get_data(TEST_PATH)

In [5]:
# Keep the test-set ids as a plain list; they are paired with the
# predictions when the submission file is written.
ids = test_df['id'].tolist()

In [None]:
# Columns kept for modelling; both train_df and test_df are subset to this list.
# NOTE(review): 'id' is part of the list, so the row identifier is passed to
# the model as a feature — confirm this is intentional.
f = [
       'feature_168', 'feature_87', 'feature_72', 'feature_124', 'feature_141',
       'feature_29', 'feature_55', 'feature_142', 'feature_78', 'feature_183',
       'feature_84', 'feature_146', 'feature_134', 'feature_26', 'feature_12',
       'feature_127', 'feature_59', 'feature_100', 'feature_96', 'feature_112',
       'feature_169', 'feature_16', 'feature_76', 'feature_81', 'feature_79',
       'feature_22', 'feature_152', 'feature_43', 'feature_20', 'feature_18',
       'feature_44', 'id', 'feature_177', 'feature_50', 'feature_6',
       'feature_66', 'feature_9', 'feature_46', 'feature_103', 'feature_75',
       'feature_8', 'feature_36', 'feature_41', 'feature_184', 'feature_62',
       'feature_95', 'feature_133', 'feature_28', 'feature_108', 'feature_128',
       'feature_117', 'feature_161', 'feature_157', 'feature_107', 'feature_147'
]

In [7]:
# Restrict both frames to the selected columns; only the training frame
# additionally keeps the 'target' label.
train_df = train_df.loc[:, [*f, 'target']]
test_df = test_df.loc[:, f]

In [8]:
# Derived columns: new column name -> (source column, element-wise transform).
feature_operations = {
    'atan_feature_112': ('feature_112', np.arctan),
    'log_feature_26': ('feature_26', np.log1p),
    'sin_feature_26': ('feature_26', np.sin),
    'tan_feature_26': ('feature_26', np.tan)
}

# Apply every transform to train and test alike so both frames gain the same
# extra columns in the same order.
# NOTE(review): the recorded run emitted RuntimeWarnings from these ufuncs —
# presumably invalid inputs for log1p / sin / tan; confirm the resulting
# NaN/inf values are acceptable for the models.
for frame in (train_df, test_df):
    for target_col, (source_col, transform) in feature_operations.items():
        frame[target_col] = transform(frame[source_col])

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [None]:
# Use the %pip magic rather than !pip so the package is installed into the
# environment of the running kernel, not whichever `pip` is first on PATH.
%pip install "autogluon==1.1.1"

In [None]:
# Use the %pip magic rather than !pip so the pinned version is installed into
# the environment of the running kernel, not whichever `pip` is first on PATH.
%pip install "scikit_learn==1.2.2"

In [11]:
from autogluon.tabular import TabularDataset, TabularPredictor

In [12]:
# Wrap the pandas frames in AutoGluon's TabularDataset; train_tab is passed to
# fit() and test_tab to predict_proba().
train_tab = TabularDataset(train_df)
test_tab = TabularDataset(test_df)

In [13]:
# Binary classifier for the 'target' column, evaluated with ROC AUC.
predictor = TabularPredictor(label='target', eval_metric='roc_auc', problem_type='binary')

# Request a GPU only when one was detected earlier in the notebook (`device`).
# A hard-coded 1 asks AutoGluon for a GPU that may not exist on the machine.
num_gpus = 1 if device == 'GPU' else 0

predictor.fit(
    train_tab,
    time_limit=3600*11,  # 11 hours, expressed in seconds
    hyperparameters={
        # CatBoost with balanced class weights.
        'CAT': {'auto_class_weights': 'Balanced'},
        # Three LightGBM variants: extra-trees (suffix "XT"), defaults, and the large preset.
        'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],
        # XGBoost with default settings.
        'XGB': {}
    },
    ag_args_fit={'num_gpus': num_gpus},
    num_gpus=num_gpus,
    presets='best_quality'
)

No path specified. Models will be saved in: "AutogluonModels/ag-20241104_104436"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.10.14
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Thu Jun 27 20:43:36 UTC 2024
CPU Count:          4
Memory Avail:       29.83 GB / 31.36 GB (95.1%)
Disk Space Avail:   19.50 GB / 19.52 GB (99.9%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout validation data is used to 

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x798a03c03eb0>

In [14]:
# Predicted probabilities for the test rows, converted to a plain Python list.
# NOTE(review): `[1]` picks the predict_proba column labelled 1 — assumes the
# target classes are 0/1 with 1 as the positive class; confirm.
pred = predictor.predict_proba(test_tab)[1].tolist()

In [None]:
import csv

csv_filename = 'submission_autogluon.csv'
fieldnames = ['id', 'target']

# One record per test row: its id paired with the predicted probability.
data = [dict(zip(fieldnames, pair)) for pair in zip(ids, pred)]

# newline='' lets the csv module control line endings itself.
with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(data)