## Импорт библиотек

In [1]:
# Для построения пайплайна обучения используется lightautoml, который конфликтуает с новой версией pandas. Поэтому
!pip uninstall pandas -y
!pip install --upgrade pip > installations.txt
!pip uninstall torch -y > installations.txt # конфликтует 
!pip install torch==2.0.0 > installations.txt
!pip install pandas==1.4.3 pyarrow yellowbrick polars transformers nltk gensim lightautoml > installations.txt
!pip install --upgrade -q wandb > installations.txt

Found existing installation: pandas 2.2.0
Uninstalling pandas-2.2.0:
  Successfully uninstalled pandas-2.2.0
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchaudio 2.1.0 requires torch==2.1.0, but you have torch 2.0.0 which is incompatible.
torchdata 0.7.0 requires torch==2.1.0, but you have torch 2.0.0 which is incompatible.
torchtext 0.16.0 requires torch==2.1.0, but you have torch 2.0.0 which is incompatible.
torchvision 0.16.0 requires torch==2.1.0, but you have torch 2.0.0 which is incompatible.[0m[31m
[0m

In [2]:
# for dataframe
import polars as pl
import numpy as np
import pyarrow as pa
import pandas as pd

# for system
import os
import time
import sys
import warnings

# for metric
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

# for dimensionality reduction and clustering
from sklearn.cluster import KMeans
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
from yellowbrick.cluster import KElbowVisualizer

# for common functions
from collections import OrderedDict
from collections import Counter
from kaggle_secrets import UserSecretsClient
from copy import deepcopy as copy
from typing import Tuple, List

# for monitoring of models
import wandb

# for machine learning
from lightautoml.reader.base import PandasToPandasReader
from lightautoml.tasks import Task
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.report.report_deco import ReportDeco

# my functions
sys.path.append('/kaggle/input/next-orders')
from my_functions import MarketDataProcessor

  from .autonotebook import tqdm as notebook_tqdm


## Препроцессинг данных

____________________________________________________
Использовался ноутбук с TPU и 300 ГБ RAM, чтобы не испытывать проблем с памятью во время обработки датасета.

In [3]:
# Temporary load into pandas; the 'cart' column is cast to int right after reading.
raw = pd.read_csv(
    '../input/sbermarket-internship-competition/train.csv'
).astype({'cart': int})
sub = pd.read_csv(
    '../input/sbermarket-internship-competition/sample_submission.csv',
    sep=",",
)

mk_data = MarketDataProcessor(raw, sub)

# Drop outliers: users with only a few items in their order history.
filtered_raw, filtered_sub, proportion = mk_data.filter_train_data()
# NOTE(review): the recorded output reads "0.97%", so `proportion` is presumably
# a 0..1 fraction rather than a percentage — confirm in my_functions and, if so,
# format it with ':.2%'.
print(f"Процент наблюдений, используемый для тренировки: {proportion:.2f}%")

Процент наблюдений, используемый для тренировки: 0.97%


### Polars ускоряет обработку данных
__________________________________________________
В итоге мы получаем pandas-датафрейм с усреднённым рейтингом каждого id по всему датасету и набором временных переменных, основанных на этой метрике (рейтинге).

In [4]:
%%time
# Feature-building stage; all three steps are methods of MarketDataProcessor
# (imported from my_functions, whose source is not part of this notebook).
# NOTE(review): the recorded run of this cell failed with
# "DuplicateError: column with name 't_rating_per_weekday_mean' has more than one
# occurrences", so Train/Test were never created in that session. The duplicate
# column is presumably produced inside MarketDataProcessor — TODO confirm there.
mk_data.get_dummies_matrix()
mk_data.generate_time_features()
# Result is stored as Train and used below to derive the feature lists.
Train = mk_data.compile_dataset()

# Same call with history_flag = 1; presumably builds the frame to predict on —
# verify against my_functions.
Test = mk_data.compile_dataset(history_flag = 1)

DuplicateError: column with name 't_rating_per_weekday_mean' has more than one occurrences

Error originated just after this operation:
DF ["user_id", "category", "id", "weekday_mean"]; PROJECT */35 COLUMNS; SELECTION: "None"

### X / Y 

In [5]:
# Columns of these numeric dtypes are treated as candidate features.
_feature_dtypes = ['int', 'int8', 'int32', 'uint32', 'uint8', 'float']
_numeric_part = Train.select_dtypes(include=_feature_dtypes)

# Every numeric column except the target.
x_cols = _numeric_part.drop(columns=['target']).columns.tolist()
# Same list without user_id; this is the set that gets scaled in the PCA cell.
x_cols_pca = _numeric_part.drop(columns=['user_id', 'target']).columns.tolist()
y_cols = ['target']

print(x_cols)
print(y_cols)

NameError: name 'Train' is not defined

### Добавляем главные компоненты

In [None]:
# Scale the numeric features, then append 10 principal components (pc1..pc10)
# to copies of Train / Test. The originals are left untouched.
scaler = RobustScaler()

s_Train = Train.copy()
s_Test = Test.copy()
# The scaler is fitted on train only and re-applied to test.
s_Train[x_cols_pca] = scaler.fit_transform(s_Train[x_cols_pca])
s_Test[x_cols_pca] = scaler.transform(s_Test[x_cols_pca])

# random_state makes the decomposition reproducible if a randomized solver is picked.
pca = PCA(n_components=10, random_state=42)

# PCA is fitted on the scaled columns (x_cols_pca). The previous version used
# x_cols, which also contains the unscaled user_id identifier.
s_Train_pca = pca.fit_transform(s_Train[x_cols_pca])
s_Test_pca = pca.transform(s_Test[x_cols_pca])

pc_names = [f'pc{i+1}' for i in range(s_Train_pca.shape[1])]
# Reuse the source indexes: DataFrame column assignment aligns on index, so a
# fresh RangeIndex would yield NaNs whenever Train/Test are not 0..n-1 indexed.
Train_pca = pd.DataFrame(s_Train_pca, columns=pc_names, index=s_Train.index)
Test_pca = pd.DataFrame(s_Test_pca, columns=pc_names, index=s_Test.index)

s_Train[pc_names] = Train_pca
s_Test[pc_names] = Test_pca

### Кластеризация id (юзер-категория)

In [None]:
# Elbow method: KElbowVisualizer fits KMeans for a range of cluster counts on
# the principal components and plots the result.
print('Elbow Method to determine the number of clusters to be formed:')
# Silence matplotlib "findfont" warnings.
warnings.filterwarnings("ignore", message="findfont:.*")
Elbow_M = KElbowVisualizer(KMeans(), k=21)
# Train_pca as built in this notebook holds only the pc* columns, so an
# unconditional drop of 'id'/'target' raises KeyError. errors='ignore' drops
# them only when they are actually present.
Elbow_M.fit(Train_pca.drop(columns=['id', 'target'], errors='ignore'))
Elbow_M.show()

### Сохранение данных в parquet

In [None]:
# Train.to_parquet('Train.parquet', index=False)
# s_Train.to_parquet('s_Train.parquet', index=False)
# s_Test.to_parquet('s_Test.parquet', index=False)

In [None]:
# Reload the previously saved scaled + PCA frames from the attached dataset.
s_Train, s_Test = map(
    pd.read_parquet,
    (
        '/kaggle/input/next-orders/s_Train.parquet',
        '/kaggle/input/next-orders/s_Test.parquet',
    ),
)

## LightAutoML пайплайн


In [None]:
# Hold-out split used for validation and for the threshold search.
# TEST_SIZE is declared here as well: the configuration cell that defines it
# comes later in the notebook, so a fresh "Run All" would otherwise stop with
# a NameError on this line.
TEST_SIZE = 0.2

# s_Train (not Train) is split because the model cell selects the pc* columns,
# which exist only in the scaled/PCA-enriched frame.
Train_set, Valid_set = train_test_split(
    s_Train, test_size=TEST_SIZE, stratify=None, random_state=23
)
# Fresh 0..n-1 indexes; rebinding instead of inplace keeps the cell re-runnable.
Train_set = Train_set.reset_index(drop=True)
Valid_set = Valid_set.reset_index(drop=True)

In [None]:
def f1 (real, pred, *, threshold = 0.2, **kwargs):
    """F1 score of predictions binarised at a cut-off.

    Args:
        real: ground-truth binary labels.
        pred: array-like of predicted scores supporting ``>`` and ``.astype``.
        threshold: scores strictly greater than this value become class 1.
            Keyword-only; defaults to 0.2, the cut-off that was previously
            hard-coded, so existing two-argument calls behave the same.
        **kwargs: forwarded unchanged to ``sklearn.metrics.f1_score``.

    Returns:
        Whatever ``f1_score`` returns for the binarised predictions.
    """
    return f1_score(real, (pred > threshold).astype(int), **kwargs)

# Run-wide constants.
N_THREADS = 2          # used as cpu_limit and as the reader's n_jobs
N_FOLDS = 5            # cv folds passed to the reader
RANDOM_STATE = 42
TEST_SIZE = 0.2
TARGET_NAME = 'target'

# Column roles handed to LightAutoML.
# NOTE(review): 'category' is listed under 'drop' although the model cell
# passes a 'category' column, and no cell in this notebook creates a
# 'Clusters' column — confirm both against the saved s_Train parquet.
ROLES = {
    'target': TARGET_NAME,
    'drop': ['id', 'user_id', 'category'],
    'category': ['Clusters'],
}
TASK = Task('binary', metric=f1)

reader = PandasToPandasReader(TASK, cv=N_FOLDS, random_state=RANDOM_STATE)

# Shared keyword arguments; later merged into the logged wandb config.
lama_params = dict(
    task=TASK,
    cpu_limit=N_THREADS,
    reader_params=dict(n_jobs=N_THREADS, cv=N_FOLDS, random_state=RANDOM_STATE),
)
default_nn_params = dict(
    bs=128,
    num_workers=0,
    path_to_save=None,
    n_epochs=1,
    freeze_defaults=True,
)


### Мониторинг моделей в Weights & Biases

In [None]:
# The W&B API key is read from Kaggle secrets, never hard-coded.
user_secrets = UserSecretsClient()
wandb_api = user_secrets.get_secret("wandb_api")

# One login with the key is enough; the former second bare wandb.login()
# call was redundant.
wandb.login(key=wandb_api)

# Configuration logged with the run.
# NOTE(review): this dict is only sent to W&B. The AutoML object in the model
# cell is built without general_params / tuning_params / nn_params, so these
# values may not describe the model that is actually trained — confirm.
CONFIG = dict (
    lama_params,
    # Algorithm keys follow LightAutoML naming ("linear_l2", "lgb");
    # the previous "linear_12" / "lgbm" spellings are not valid keys.
    general_params = {"use_algos": [["linear_l2", "lgb", "denselight"]]},
    tuning_params = {'max_tuning_iter': 20},
    lgb_params = {'default_params': {'num_threads': N_THREADS}},
    nn_params={**default_nn_params,'lr': 0.03},
    infra = "Kaggle",
    # Was 'plant-pathology', a leftover from another notebook; this notebook
    # reads its data from the sbermarket-internship-competition input.
    competition = 'sbermarket-internship-competition',
    _wandb_kernel = 'ayut'
)
CONFIG['model_name'] = 'lightAutoML-experiments_w_features_1'
run = wandb.init(project='sber-inter',
                 config=CONFIG,
                 group='lightAutoML',
                 job_type='train')

wandb.config.type = 'lightAutoML'
wandb.config.kaggle_competition = 'SberMarket Competition'

In [None]:
%%time 

# наиболее оптимальный на данный момент пайплайн
automl = TabularUtilizedAutoML(
    task = TASK,
    timeout = 3600*3,
    cpu_limit = N_THREADS,
    reader_params = {'n_jobs': N_THREADS, 'cv': N_FOLDS, 'random_state': RANDOM_STATE},
)

# выбранные среди всех наиболее значимые переменные
train_pred = automl.fit_predict(Train_set[['total_order_in_cat_max',
 'ordered_mean',
 'pc2',
 'weekday_std',
 'pc9',
 'pc10',
 'category',
 'pc4',
 'pc5',
 'pc6',
 'month_std',
 'pc3',
 'week_std',
 'hour_std',
 'hour_mean',
 'pc1',
 'pc7', 'target']], roles = ROLES, verbose = 2)

print('Score', "%.5f" % f1(Train_set.target, train_pred.data))
valid_pred = automl.predict(Valid_set[['total_order_in_cat_max',
 'ordered_mean',
 'pc2',
 'weekday_std',
 'pc9',
 'pc10',
 'category',
 'pc4',
 'pc5',
 'pc6',
 'month_std',
 'pc3',
 'week_std',
 'hour_std',
 'hour_mean',
 'pc1',
 'pc7', 'target']])
print('Score on out of folds validation', "%.5f" % f1(Valid_set.target, valid_pred.data))
# best catboost params 
# {'task_type': 'CPU', 'thread_count': 4, 'random_seed': 42, 'num_trees': 3000, 'learning_rate': 0.03, 'l2_leaf_reg': 0.009044636094268511, 'bootstrap_type': 'Bernoulli', 'grow_policy': 'SymmetricTree', 'max_depth': 4, 'min_data_in_leaf': 7, 'one_hot_max_size': 10, 'fold_permutation_block': 1, 'boosting_type': 'Plain', 'boost_from_average': True, 'od_type': 'Iter', 'od_wait': 100, 'max_bin': 32, 'feature_border_type': 'GreedyLogSum', 'nan_mode': 'Min', 'verbose': 100, 'allow_writing_files': False}
# best linear
#
# best lgbm 
# {'task': 'train', 'learning_rate': 0.05, 'num_leaves': 105, 'feature_fraction': 0.8625799184703501, 'bagging_fraction': 0.5053328530427746, 'bagging_freq': 1, 'max_depth': -1, 'verbosity': -1, 'reg_alpha': 7.265259184516205e-05, 'reg_lambda': 0.621571500507215, 'min_split_gain': 0.0, 'zero_as_missing': False, 'num_threads': 4, 'max_bin': 255, 'min_data_in_bin': 3, 'num_trees': 3000, 'early_stopping_rounds': 100, 'random_state': 42, 'min_sum_hessian_in_leaf': 4.636375055852895}
# best denselight
# {'num_workers': 0, 'pin_memory': False, 'max_length': 256, 'is_snap': False, 'input_bn': False, 'max_emb_size': 256, 'bert_name': None, 'pooling': 'cls', 'device': 'cpu', 'use_cont': True, 'use_cat': True, 'use_text': False, 'lang': 'en', 'deterministic': True, 'multigpu': False, 'random_state': 42, 'model': 'denselight', 'model_with_emb': False, 'path_to_save': None, 'verbose_inside': None, 'verbose': 1, 'n_epochs': 30, 'snap_params': {'k': 3, 'early_stopping': True, 'patience': 10, 'swa': True}, 'bs': 1024, 'emb_dropout': 0.1, 'emb_ratio': 3, 'opt': 'Adam', 'opt_params': {'lr': 0.003757084358753148, 'weight_decay': 0}, 'sch': 'ReduceLROnPlateau', 'scheduler_params': {'patience': 5, 'factor': 0.5, 'min_lr': 1e-05}, 'loss': None, 'loss_params': {}, 'loss_on_logits': True, 'clip_grad': False, 'clip_grad_params': {}, 'init_bias': True, 'dataset': 'UniversalDataset', 'tuned': True, 'optimization_search_space': None, 'verbose_bar': False, 'freeze_defaults': True, 'n_out': None, 'hid_factor': [2, 2], 'hidden_size': [512, 256], 'block_config': [2, 2], 'compression': 0.5, 'growth_size': 256, 'bn_factor': 2, 'drop_rate': 0.1, 'noise_std': 0.05, 'num_init_features': None, 'act_fun': 'LeakyReLU', 'use_noise': False, 'use_bn': True, 'embedding_size': 10, 'cat_embedder': 'cat', 'cont_embedder': 'cont', 'stop_by_metric': False, 'tuning_params': {'fit_on_holdout': True, 'max_tuning_iter': 50, 'max_tuning_time': 3600}}

In [None]:
# fast_fi = automl.get_feature_scores('fast', silent=False)
# fast_fi.set_index('Feature')['Importance'].plot.bar(figsize = (30, 10), grid = True)

In [None]:
import joblib

# Single source of truth for the model file. The previous version dumped
# 'automl_02.pkl' but uploaded '/kaggle/working/automl_01.pkl', i.e. a
# different (possibly missing or stale) file.
MODEL_PATH = 'automl_02.pkl'

joblib.dump(automl, MODEL_PATH)
# automl=joblib.load('/kaggle/input/next-orders/automl_rd.pkl')

# Separate W&B run that only stores the serialized model as an artifact.
run = wandb.init(project='sber-inter',
                 config=CONFIG,
                 group='lightAutoML',
                 job_type='save_experiment')

wandb.config.type = 'lightAutoML'
wandb.config.kaggle_competition = 'SberMarket Competition'

artifact = wandb.Artifact(name='automl', type='model')
artifact.add_file(MODEL_PATH)
run.log_artifact(artifact)

run.finish()

### Нахождение лучшего порогового значения для сепарации 0 и 1

In [None]:
# Grid-search the probability cut-off (0.01 .. 0.99, step 0.01) that maximises
# F1 on the hold-out set.
# best_score starts below any possible F1, so proba_split is always assigned,
# even when every threshold scores 0.
best_score = -1.0
for threshold in np.arange(0.01, 1.0, 0.01):
    # The result is stored in `score` only. The previous `score = f1 = ...`
    # rebound the name `f1` and replaced the metric function with a float.
    score = f1_score(Valid_set.target, (valid_pred.data > threshold).astype(int))
    if score > best_score:
        best_score = score
        proba_split = threshold

print('At i =', "%.2f" % proba_split, 'score is : %.5f' % best_score)

In [None]:
# `predictions` was referenced here without ever being computed, so score the
# test frame first.
# NOTE(review): the whole s_Test frame is passed on the assumption that the
# fitted reader picks the columns it was trained on — confirm, or pass the same
# column subset that was used for Valid_set.
predictions = automl.predict(s_Test)

# np.ravel flattens a possible (n, 1) prediction array before it becomes a column.
s_Test['target'] = (np.ravel(predictions.data) > proba_split).astype(int)
# NOTE(review): this is an inner join, so any sample-submission id missing from
# s_Test is dropped from `submit` — verify the row count matches `sub`.
submit = pd.merge(sub['id'], s_Test[['id', 'target']], on='id')

## Подготовка сабмита

In [None]:
# Write the submission: a header row with the column names, then one line per
# row, without the index. DataFrame.to_csv replaces the manual csv.writer loop
# over submit.values, which first coerces all columns to one common dtype.
submit.to_csv('submission02.csv', index=False)