In [33]:
# !python3 -m venv myenv
# !source myenv/bin/activate
!pip uninstall pandas -y
!pip install --upgrade pip > installations.txt
!pip uninstall torch -y > installations.txt
!pip install torch==2.0.0 > installations.txt
!pip install pandas==1.4.3 pyarrow yellowbrick polars transformers nltk gensim lightautoml > installations.txt
!pip install --upgrade -q wandb > installations.txt

Found existing installation: pandas 1.4.3
Uninstalling pandas-1.4.3:
  Successfully uninstalled pandas-1.4.3
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
lightautoml 0.3.8.1 requires pandas<2.0.0, which is not installed.
torchaudio 2.1.0 requires torch==2.1.0, but you have torch 2.0.0 which is incompatible.
torchdata 0.7.0 requires torch==2.1.0, but you have torch 2.0.0 which is incompatible.
torchtext 0.16.0 requires torch==2.1.0, but you have torch 2.0.0 which is incompatible.
torchvision 0.16.0 requires torch==2.1.0, but you have torch 2.0.0 which is incompatible.[0m[31m
[0m

In [34]:
import polars as pl
import numpy as np
import pyarrow as pa
import os
import time
import optuna
import requests
import sys
import numpy as np
import pandas as pd

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import torch
from copy import deepcopy as copy
import torch.nn as nn
from collections import OrderedDict
from collections import Counter

from lightautoml.automl.base import AutoML
from lightautoml.ml_algo.boost_lgbm import BoostLGBM
from lightautoml.ml_algo.boost_cb import BoostCB
from lightautoml.ml_algo.tuning.optuna import OptunaTuner
from lightautoml.pipelines.features.lgb_pipeline import LGBSimpleFeatures
from lightautoml.pipelines.ml.base import MLPipeline
from lightautoml.pipelines.selection.importance_based import ImportanceCutoffSelector, ModelBasedImportanceEstimator
from lightautoml.reader.base import PandasToPandasReader
from lightautoml.tasks import Task
from lightautoml.automl.blend import WeightedBlender
from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/next-orders/Train (1).parquet
/kaggle/input/sbermarket-internship-competition/sample_submission.csv
/kaggle/input/sbermarket-internship-competition/train.csv


In [35]:
print(pd.__version__)

1.4.3


In [36]:
def ohe_data(raw: pl.DataFrame) -> pl.DataFrame:
    """
    Вовращает ohe матрицу для категорий.

    Args:
        raw (pl.DataFrame): Polars DataFrame.

    Returns:
        (tpl.DataFrame).
    """

    # OHE 'cart', группировка по юзеру и дате заказа по максимальным значениям
    # Таким образом для каждой категории, если она была в этот день в заказе у пользователя, будет 1
    train_raw = raw.to_dummies(columns='cart').group_by(['user_id', 'order_completed_at']).max()
    train_raw = train_raw.sort(['user_id', 'order_completed_at'])

    return train_raw

def full_history(train_raw: pl.DataFrame) -> pl.DataFrame:
    """
    Обрабатывает датафрейм polars и возвращает весь датасет.

    Args:
        raw (pl.DataFrame): Polars DataFrame для обработки.

    Returns:
        pl.DataFrame: Обработанный DataFrame для train.
    """
    
    # Вычисление переменных времени
    train_raw = train_raw.with_columns(pl.col("order_completed_at").str.to_datetime())
    train_raw = train_raw.with_columns(
        [
            (pl.col('user_id').cum_count() - pl.lit(1)).over(['user_id']).alias('order_number'),
            pl.col("order_completed_at").dt.hour().alias("hour"),
            pl.col("order_completed_at").dt.week().alias("week"),
            pl.col("order_completed_at").dt.weekday().alias("weekday"),
            pl.col("order_completed_at").dt.day().alias("day"),
            pl.col("order_completed_at").dt.month().alias("month"),
            pl.col("order_completed_at").dt.year().alias("year"),
        ]
    )
    train_raw = train_raw.drop('order_completed_at')

    return train_raw

def create_dataset(train_data: pl.DataFrame, history_flag: int = 0)-> pd.DataFrame:
    """
    Компилирует историю заказов (train_data) в укомплектованный тренировочный pd.Dataframe "Train".

    Args:
        train_data (pl.DataFrame): история заказов (корзина на каждого юзера).
        history_flag (int): флаг 0 - Тренировочный датасет, 1 - общий (тестовый) датасет без таргета.

    Returns:
        pd.DataFrame: X + Y for model/ or just X.
    """
    
    # Преобразование в длинный формат юзера и категории товара
    if not history_flag:
        # Сепарация последнего заказа из истории заказов
        grouped = train_data.group_by('user_id').agg(pl.max('order_number').alias('max_order_number'))
        merged = train_data.join(grouped, on='user_id')
        last_order = merged['max_order_number'] == merged['order_number']

        # Разделение на train_data (заказы, кроме последнего) и valid_data (последний заказ) .group_by('user_id').sum()
        train_tmp = train_data.filter(~last_order)
        val_data = train_data.filter(last_order).drop('hour', 'week', 'weekday', 'day', 'month', 'year')
        train_melt = train_tmp.drop('order_number').melt(id_vars=['user_id', 'hour', 'week', 'weekday', 'day', 'month', 'year'], variable_name='category', value_name='ordered')
    elif history_flag == 1:
        train_melt = train_data.drop('order_number').melt(id_vars=['user_id', 'hour', 'week', 'weekday', 'day', 'month', 'year'], variable_name='category', value_name='ordered')
    else: 
        print("Invalid history flag. Exiting...")
        exit()
    # Тренировочный датасет
    Train = train_melt
    
    order_number_df = val_data.select(['user_id', 'order_number']).unique() # Количество заказов у юзера
    dl_tmp = Train.group_by('category').agg(pl.col(['ordered']).sum()) # Количество заказов по категории
    if history_flag == 1:
        order_number_df = order_number_df.with_column(pl.col("order_number").apply(lambda x: x + 1))
        dl_tmp = Train.group_by('category').agg(pl.col(['ordered']).sum()+1)
    Train = Train.join(order_number_df, on='user_id').rename({"order_number": "total_order_num"})
    Train = Train.join(dl_tmp, on='category').rename({"ordered_right": "total_order_in_cat"})
    # Вычисление рейтинга для каждой записи !!?? Как еще можно рассчитывать рейтинг
    Train = Train.with_columns(
        [
            (pl.col('ordered') / pl.col('total_order_num')).alias('total_rating'),
            (pl.col('user_id').cast(pl.Utf8) + ';' + pl.col('category')).alias('id')
        ]
    )
    Train = Train.group_by('user_id', 'category', 'id').agg(
        pl.col(['hour']).mean().name.suffix("_mean"), pl.col(['hour']).max().name.suffix("_max"),
        pl.col(['hour']).min().name.suffix("_min"), pl.col(['hour']).std().name.suffix("_std"),
        pl.col(['week']).mean().name.suffix("_mean"), pl.col(['week']).max().name.suffix("_max"),
        pl.col(['week']).min().name.suffix("_min"), pl.col(['week']).std().name.suffix("_std"),
        pl.col(['weekday']).mean().name.suffix("_mean"), pl.col(['weekday']).max().name.suffix("_max"),
        pl.col(['weekday']).min().name.suffix("_min"), pl.col(['weekday']).std().name.suffix("_std"),
        pl.col(['day']).mean().name.suffix("_mean"), pl.col(['day']).max().name.suffix("_max"),
        pl.col(['day']).min().name.suffix("_min"), pl.col(['day']).std().name.suffix("_std"),
        pl.col(['month']).mean().name.suffix("_mean"), pl.col(['month']).max().name.suffix("_max"),
        pl.col(['month']).min().name.suffix("_min"), pl.col(['month']).std().name.suffix("_std"),
        pl.col(['year']).mean().name.suffix("_mean"), pl.col(['year']).max().name.suffix("_max"),
        pl.col(['year']).min().name.suffix("_min"), pl.col(['year']).std().name.suffix("_std"),
        pl.col(['total_rating']).mean().name.suffix("_mean"), pl.col(['total_rating']).max().name.suffix("_max"),
        pl.col(['total_rating']).min().name.suffix("_min"), pl.col(['total_rating']).std().name.suffix("_std"),
        pl.col(['ordered']).mean().name.suffix("_mean"), pl.col(['ordered']).max().name.suffix("_max"),
        pl.col(['ordered']).min().name.suffix("_min"), pl.col(['ordered']).std().name.suffix("_std"),
        pl.col(['total_order_in_cat']).mean().name.suffix("_mean"), pl.col(['total_order_in_cat']).max().name.suffix("_max"),
        pl.col(['total_order_in_cat']).min().name.suffix("_min"), pl.col(['total_order_in_cat']).std().name.suffix("_std"),
        pl.col(['total_order_num']).mean().name.suffix("_mean"), pl.col(['total_order_num']).max().name.suffix("_max"),
        pl.col(['total_order_num']).min().name.suffix("_min"), pl.col(['total_order_num']).std().name.suffix("_std"),
    )
    # Вычисление рейтинга по времени
    rating_per_hour = Train.group_by(['year_mean', 'month_mean', 'hour_mean', 'user_id']).agg(
        ((pl.col("total_rating_mean").mean()) * 100).alias("rating_per_hour_mean"),
        ((pl.col("total_rating_mean").std()) * 100).alias("rating_per_hour_std"),
        ((pl.col("total_rating_mean").sum())).alias("rating_per_hour_sum"),
        ((pl.col("total_rating_mean").median()) * 100).alias("rating_per_hour_median"),
    )

    rating_per_weekday = Train.group_by(['year_mean', 'month_mean', 'weekday_mean', 'user_id']).agg(
        ((pl.col("total_rating_mean").mean()) * 100).alias("rating_per_w_mean"),
        ((pl.col("total_rating_mean").std()) * 100).alias("rating_per_w_std"),
        ((pl.col("total_rating_mean").sum())).alias("rating_per_w_sum"),
        ((pl.col("total_rating_mean").median()) * 100).alias("rating_per_w_median"),
    )
    rating_per_day = Train.group_by(['year_mean', 'month_mean', 'day_mean', 'user_id']).agg(
        ((pl.col("total_rating_mean").mean()) * 100).alias("rating_per_d_mean"),
        ((pl.col("total_rating_mean").std()) * 100).alias("rating_per_d_std"),
        ((pl.col("total_rating_mean").sum())).alias("rating_per_d_sum"),
        ((pl.col("total_rating_mean").median()) * 100).alias("rating_per_d_median"),
    )

    rating_per_month = Train.group_by(['year_mean', 'month_mean', 'user_id']).agg(
        ((pl.col("total_rating_mean").mean()) * 100).alias("rating_per_m_mean"),
        ((pl.col("total_rating_mean").std()) * 100).alias("rating_per_m_std"),
        ((pl.col("total_rating_mean").sum())).alias("rating_per_m_sum"),
        ((pl.col("total_rating_mean").median()) * 100).alias("rating_per_m_median"),
    )


    Train = Train.join(rating_per_month, on=['year_mean', 'month_mean', 'user_id'])
    Train = Train.join(rating_per_weekday, on=['year_mean', 'month_mean', 'weekday_mean', 'user_id'])
    Train = Train.join(rating_per_day, on=['year_mean', 'month_mean', 'day_mean', 'user_id'])
    Train = Train.join(rating_per_hour, on=['year_mean', 'month_mean', 'hour_mean', 'user_id'])
    
    if not history_flag:
    # Присоединение целевой переменной из valid_melt к Train
        valid_melt = val_data.drop('order_number',).melt(id_vars=['user_id', ], variable_name='category', value_name='target')
        Train = Train.join(valid_melt, on = ['user_id', 'category'])
    #     Train = Train.drop('user_id', 'category')

    # Преобразование в pandas DataFrame
    Train = Train.to_pandas()
    Train['id'] = Train['id'].str.replace('cart_', '')
    Train['category'] = Train['category'].str.replace('cart_', '')

    return Train

In [46]:
# tmp импорт в pandas

raw = pd.read_csv('../input/sbermarket-internship-competition/train.csv')
sub = pd.read_csv('../input/sbermarket-internship-competition/sample_submission.csv', sep = ",")

# Приведение столбца 'cart' к int
raw['cart'] = raw['cart'].astype(int)

def filter_raw_data(raw, sub):
    users = map(int, (x.split(';')[0] for x in sub['id']))
    user_counts = Counter(users)
    frequent_users = {user for user, count in user_counts.items() if count > 25}
    sub_users = {user for user, count in user_counts.items()}
    filtered_raw = raw[raw['user_id'].isin(frequent_users)]
    filtered_sub = raw[raw['user_id'].isin(sub_users)]
    total_count = sum(user_counts.values())
    frequent_count = sum(count for user, count in user_counts.items() if user in frequent_users)
    proportion = frequent_count / total_count
    
    return filtered_raw, filtered_sub, proportion

filtered_raw, filtered_sub, proportion = filter_raw_data(raw, sub)
print(f"Процент наблюдений, используемый для тренировки: {proportion:.2f}%")

Процент наблюдений, используемый для тренировки: 0.97%


In [50]:
%%time
# filtered_raw = pl.from_pandas(filtered_raw)
# train_raw = ohe_data(filtered_raw)
# train_data = full_history(train_raw)
# Train = create_dataset(train_data)

# filtered_sub = pl.from_pandas(filtered_sub)
# test_sub = ohe_data(filtered_sub)
# test_data= full_history(test_sub)
# Test = create_dataset(test_data, history_flag = 1)

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/usr/local/lib/python3.10/site-packages/IPython/core/magics/execution.py", line 1340, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 2, in <module>
  File "/tmp/ipykernel_13/1043687319.py", line 14, in ohe_data
    train_raw = raw.to_dummies(columns='cart').group_by(['user_id', 'order_completed_at']).max()
  File "/usr/local/lib/python3.10/site-packages/polars/dataframe/frame.py", line 9064, in to_dummies
    return self._from_pydf(self._df.to_dummies(columns, separator, drop_first))
polars.exceptions.DuplicateError: unable to hstack, column with name "cart_0" already exists

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 2144, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
  File "/usr/local/lib/python3.10/site-packages/IPython/core/ultratb.py", line 1435, in 

In [10]:
Train

Unnamed: 0,user_id,category,id,hour_mean,hour_max,hour_min,hour_std,week_mean,week_max,week_min,...,rating_per_w_median,rating_per_d_mean,rating_per_d_std,rating_per_d_sum,rating_per_d_median,rating_per_hour_mean,rating_per_hour_std,rating_per_hour_sum,rating_per_hour_median,target
0,0,0,0;0,8.500000,9,8,0.707107,32.000000,35,29,...,0.0,0.966042,5.660171,8.250000,0.0,0.966042,5.660171,8.250000,0.0,0
1,3,0,3;0,13.833333,19,10,3.488075,28.666667,48,15,...,0.0,0.152875,0.820443,1.305556,0.0,0.152875,0.820443,1.305556,0.0,0
2,4,0,4;0,9.428571,16,6,3.823486,20.571429,28,16,...,0.0,0.241361,1.196201,2.061224,0.0,0.241361,1.196201,2.061224,0.0,0
3,5,0,5;0,12.642857,20,6,4.877060,29.142857,35,23,...,0.0,0.066315,0.281990,0.566327,0.0,0.066315,0.281990,0.566327,0.0,0
4,7,0,7;0,11.800000,18,5,4.211096,12.100000,26,1,...,0.0,0.100703,0.461075,0.860000,0.0,0.100703,0.461075,0.860000,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9838929,19991,99,19991;99,9.000000,12,6,4.242641,35.000000,35,35,...,0.0,0.731850,4.387127,6.250000,0.0,0.731850,4.387127,6.250000,0.0,0
9838930,19992,99,19992;99,10.500000,12,9,2.121320,35.000000,35,35,...,0.0,1.083138,5.638909,9.250000,0.0,1.083138,5.638909,9.250000,0.0,0
9838931,19993,99,19993;99,9.666667,11,7,2.309401,35.666667,36,35,...,0.0,0.403331,2.278648,3.444444,0.0,0.403331,2.278648,3.444444,0.0,0
9838932,19994,99,19994;99,11.500000,16,8,3.696846,35.500000,36,35,...,0.0,0.219555,1.265042,1.875000,0.0,0.219555,1.265042,1.875000,0.0,0


In [13]:
x_cols = Train.select_dtypes(include=['int', 'int8', 'int32', 'uint32', 'uint8', 'float']).drop(columns = ['target'], axis = 1).columns.tolist()
x_cols_pca = Train.select_dtypes(include=['int', 'int8', 'int32', 'uint32', 'uint8', 'float']).drop(columns = ['user_id', 'target'], axis = 1).columns.tolist()
print(x_cols)
y_cols = ['target']
print(y_cols)

['user_id', 'hour_mean', 'hour_max', 'hour_min', 'hour_std', 'week_mean', 'week_max', 'week_min', 'week_std', 'weekday_mean', 'weekday_max', 'weekday_min', 'weekday_std', 'day_mean', 'day_max', 'day_min', 'day_std', 'month_mean', 'month_max', 'month_min', 'month_std', 'year_mean', 'year_max', 'year_min', 'year_std', 'total_rating_mean', 'total_rating_max', 'total_rating_min', 'total_rating_std', 'ordered_mean', 'ordered_max', 'ordered_min', 'ordered_std', 'total_order_in_cat_mean', 'total_order_in_cat_max', 'total_order_in_cat_min', 'total_order_in_cat_std', 'total_order_num_mean', 'total_order_num_max', 'total_order_num_min', 'total_order_num_std', 'rating_per_m_mean', 'rating_per_m_std', 'rating_per_m_sum', 'rating_per_m_median', 'rating_per_w_mean', 'rating_per_w_std', 'rating_per_w_sum', 'rating_per_w_median', 'rating_per_d_mean', 'rating_per_d_std', 'rating_per_d_sum', 'rating_per_d_median', 'rating_per_hour_mean', 'rating_per_hour_std', 'rating_per_hour_sum', 'rating_per_hour_med

In [5]:
import numpy as np
import pandas as pd
import datetime
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import colors
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt, numpy as np
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import AgglomerativeClustering
from matplotlib.colors import ListedColormap
from sklearn import metrics
import warnings
import sys
if not sys.warnoptions:
    warnings.simplefilter("ignore")
np.random.seed(42)

In [14]:
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
scaler = RobustScaler()

s_Train = Train.copy()
s_Train[x_cols_pca] = scaler.fit_transform(s_Train[x_cols_pca])

pca = PCA(n_components=10)

s_Train_pca = pca.fit_transform(s_Train[x_cols])
# test_X_scaled = pca.transform(test_X_scaled)

Train_pca = pd.DataFrame(s_Train_pca, columns=[f'pc{i+1}' for i in range(s_Train_pca.shape[1])])
# Train_pca['target'] = Train['target'].values
# Train_pca['id'] = Train['id'].values

In [17]:
s_Train[Train_pca.columns.to_list()] = Train_pca
s_Train

Unnamed: 0,user_id,category,id,hour_mean,hour_max,hour_min,hour_std,week_mean,week_max,week_min,...,pc1,pc2,pc3,pc4,pc5,pc6,pc7,pc8,pc9,pc10
0,0,0,0;0,-0.768116,-1.8,0.50,-1.594049,0.821429,0.0625,0.684211,...,-9814.434779,21.039675,10.066589,0.192679,0.388528,-2.988666,1.415498,-2.724929,-3.571390,2.320423
1,3,0,3;0,0.623188,0.2,1.00,-0.275924,0.345238,0.8750,-0.052632,...,-9811.440029,21.039691,1.745630,0.585644,10.410185,0.163803,1.324620,2.394956,3.260103,-0.467190
2,4,0,4;0,-0.525880,-0.4,0.00,-0.116946,-0.811224,-0.3750,0.000000,...,-9810.438093,21.039691,1.833156,-1.388269,1.424025,-0.399947,1.373414,1.256920,-0.083770,0.817486
3,5,0,5;0,0.312629,0.4,0.00,0.382428,0.413265,0.0625,0.368421,...,-9809.438700,21.039695,-0.104339,-0.549669,3.711174,0.174886,0.178057,0.744336,0.411674,-0.234346
4,7,0,7;0,0.092754,0.0,-0.25,0.066774,-2.021429,-0.5000,-0.789474,...,-9807.439341,21.039693,0.551093,-0.841710,3.246724,-0.763211,1.146164,1.633433,2.318662,-0.138792
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9838929,19991,99,19991;99,-0.637681,-1.2,0.00,0.081725,1.250000,0.0625,1.000000,...,10176.563818,16.158928,3.238045,1.610065,2.940811,-2.144490,0.719904,-6.095499,2.062425,0.459697
9838930,19992,99,19992;99,-0.246377,-1.2,0.75,-0.923739,1.250000,0.0625,1.000000,...,10177.564639,16.158922,6.779051,2.811337,2.565957,-0.782181,-1.411136,-5.722500,2.487012,-0.044950
9838931,19993,99,19993;99,-0.463768,-1.4,0.25,-0.834593,1.345238,0.1250,1.000000,...,10178.562612,16.158936,-1.272023,-0.240670,0.732097,3.056448,1.010589,3.120446,0.578490,1.438864
9838932,19994,99,19994;99,0.014493,-0.4,0.50,-0.176971,1.321429,0.1250,1.000000,...,10179.562075,16.158940,-3.478253,-0.586469,1.684106,2.565737,2.135874,0.076319,-0.809026,0.456661


In [None]:
# print('Elbow Method to determine the number of clusters to be formed:')
# warnings.filterwarnings("ignore", message="findfont:.*")
# Elbow_M = KElbowVisualizer(KMeans(), k=21)
# Elbow_M.fit(Train_pca.drop(columns = ['id', 'target'], axis = 1))
# Elbow_M.show()

In [None]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=7)
yhat_kmeans = kmeans.fit_predict(Train_pca.drop(columns=['id', 'target']))

# Добавление информации о кластерах в исходный датафрейм
# Train_pca["Clusters"] = yhat_kmeans
s_Train["Clusters"] = yhat_kmeans
# Train["Clusters"] = yhat_kmeans

In [None]:
# Train_pca = Train_pca.drop(columns = ['id', 'target'], axis = 1)
s_Train[Train_pca.columns.to_list()] = Train_pca

In [8]:
# Train.to_parquet('Train.parquet', index=False)
# Train_pca.to_parquet('s_Train_pca.parquet', index=False)

## Parquet

In [7]:
Train = pd.read_parquet('/kaggle/input/next-orders/Train (1).parquet')

In [6]:
N_THREADS = 2 # threads cnt for lgbm and linear models
N_FOLDS = 5 # folds cnt for AutoML
RANDOM_STATE = 42 # fixed random state for various reasons
TEST_SIZE = 0.2 # Test size for metric check
TARGET_NAME = 'target' # Target column name

In [7]:
Train_set, Valid_set = train_test_split(Train, test_size = TEST_SIZE,
                                        stratify = None, random_state = 23)
Train_set.reset_index(drop=True, inplace=True)
Valid_set.reset_index(drop=True, inplace=True)

In [8]:
%%time
model0 = BoostCB()
pipe0 = LGBSimpleFeatures()
mbie = ModelBasedImportanceEstimator()
selector = ImportanceCutoffSelector(pipe0, model0, mbie)

pipe = LGBSimpleFeatures()

params_tuner1 = OptunaTuner(n_trials=20, timeout=60)
model1 = BoostLGBM(
    default_params={'learning_rate': 0.05, 'num_leaves': 128, 'seed': 1, 'num_threads': N_THREADS}
)
model2 = BoostLGBM(
    default_params={'learning_rate': 0.025, 'num_leaves': 64, 'seed': 2, 'num_threads': N_THREADS}
)

pipeline_lvl1 = MLPipeline([
    (model1, params_tuner1),
    model2
], pre_selection=selector, features_pipeline=pipe, post_selection=None)

pipe1 = LGBSimpleFeatures()

model = BoostCB()

pipeline_lvl2 = MLPipeline([model], pre_selection=None, features_pipeline=pipe1, post_selection=None)

CPU times: user 0 ns, sys: 422 µs, total: 422 µs
Wall time: 429 µs


In [9]:
def f1 (real, pred, **kwargs):
    return f1_score(real, (pred > 0.2).astype(int), **kwargs)

# ROLES = {'target': TARGET_NAME, 'drop': ['id'], 'category': ['Clusters']}
ROLES = {'target': TARGET_NAME, 'drop': ['id'], 'category': ['Clusters']}
TASK = Task('binary', metric = f1)
reader = PandasToPandasReader(TASK, cv=N_FOLDS, random_state=RANDOM_STATE)

lama_params = {
    "task": TASK,
    "cpu_limit": N_THREADS,
    "reader_params": {'n_jobs': N_THREADS, 'cv': N_FOLDS, 'random_state': RANDOM_STATE}
}
default_nn_params = {
    "bs": 128, "num_workers": 0, "path_to_save": None, "n_epochs": 1, "freeze_defaults": True
}


In [10]:
from kaggle_secrets import UserSecretsClient
import wandb
from wandb.keras import WandbCallback

user_secrets = UserSecretsClient()
wandb_api = user_secrets.get_secret("wandb_api") 

wandb.login(key=wandb_api)
wandb.login()

2024-02-27 07:31:08.556825: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-27 07:31:08.556976: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-27 07:31:08.713459: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[34m[1mwandb[0m: Currently logged in as: [33mirinyakov2016[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [11]:
CONFIG = dict (
    lama_params,
    general_params = {"use_algos": [["linear_12", "lgbm", "denselight"]]},
    tuning_params = {'max_tuning_iter': 20},
    lgb_params = {'default_params': {'num_threads': N_THREADS}},
    nn_params={**default_nn_params,'lr': 0.03},
    infra = "Kaggle",
    competition = 'plant-pathology',
    _wandb_kernel = 'ayut'
)
# Update CONFIG dict with the name of the model.
CONFIG['model_name'] = 'lightAutoML-denselight'
print('Training configuration: ', CONFIG)

# Initialize W&B run
run = wandb.init(project='sber-inter', 
                 config=CONFIG,
                 group='lightAutoML', 
                 job_type='train')

wandb.config.type = 'lightAutoML'
wandb.config.kaggle_competition = 'SberMarket Competition'

Training configuration:  {'task': <lightautoml.tasks.base.Task object at 0x7c3a6c8d27a0>, 'cpu_limit': 2, 'reader_params': {'n_jobs': 2, 'cv': 5, 'random_state': 42}, 'general_params': {'use_algos': [['linear_12', 'lgbm', 'denselight']]}, 'tuning_params': {'max_tuning_iter': 20}, 'lgb_params': {'default_params': {'num_threads': 2}}, 'nn_params': {'bs': 128, 'num_workers': 0, 'path_to_save': None, 'n_epochs': 1, 'freeze_defaults': True, 'lr': 0.03}, 'infra': 'Kaggle', 'competition': 'plant-pathology', '_wandb_kernel': 'ayut', 'model_name': 'lightAutoML-denselight'}


In [26]:
%%time 
from lightautoml.report.report_deco import ReportDeco

# RD = ReportDeco(output_path = 'tabularAutoML_lld_report')

# automl = TabularAutoML(
#         **lama_params,
#         general_params = {"use_algos": ["linear_12"]},
#     #     tuning_params = {'max_tuning_iter': 20},
# #         lgb_params = {'default_params': {'num_threads': N_THREADS}},
# #         nn_params={**default_nn_params,'lr': 0.03},
#     #     linear_pipeline_params = pipe,
#     #     gbm_pipeline_params = pipe,
#     #     nn_pipeline_params = pipe
    
# )
automl = TabularAutoML(task = TASK, 
                       timeout = 300,
                       cpu_limit = 4,
                       reader_params = {'n_jobs': 4, 'cv': 5, 'random_state': 23},
                       general_params = {'use_algos': [['cb']]},
                      )

# automl = AutoML(reader, [
#     [pipeline_lvl1],
#     [pipeline_lvl2],
# ], skip_conn=False)

train_pred = automl.fit_predict(Train_set[['hour_mean', 'hour_max', 'hour_min', 'hour_std', 
                                           'weekday_mean', 'weekday_max', 'weekday_min', 'weekday_std',
                                           'month_mean', 'month_max', 'month_min', 'month_std',
                                           'total_rating_mean', 'total_rating_max', 'total_rating_min', 'total_rating_std', 
                                           'ordered_mean', 'ordered_max', 'ordered_min', 'ordered_std', 
                                           'total_order_in_cat_mean', 'total_order_in_cat_max', 'total_order_in_cat_min', 
                                           'total_order_in_cat_std', 'total_order_num_mean', 'total_order_num_max', 
                                           'total_order_num_min', 'total_order_num_std', 'rating_per_m_mean', 
                                           'rating_per_d_mean', 'rating_per_hour_mean', 'target']], roles = ROLES, verbose = 10)
# print('Score', "%.5f" % f1(Train_set.target, train_pred.data))
# valid_pred = automl.predict(Valid_set)
# print('Score on out of folds validation', "%.5f" % f1(Valid_set.target, valid_pred.data))

[07:49:47] Stdout logging level is DEBUG.
[07:49:47] Task: binary

[07:49:47] Start automl preset with listed constraints:
[07:49:47] - time: 300.00 seconds
[07:49:47] - CPU: 4 cores
[07:49:47] - memory: 16 GB

[07:49:47] [1mTrain data shape: (7871147, 17)[0m

[07:49:54] Feats was rejected during automatic roles guess: []
[07:49:55] Layer [1m1[0m train process start. Time left 292.92 secs
[07:50:03] Training until validation scores don't improve for 100 rounds
[07:53:14] [100]	valid's binary_logloss: 0.0890262	valid's Opt metric: 0
[07:53:16] Early stopping, best iteration is:
[1]	valid's binary_logloss: 0.0899184	valid's Opt metric: 0
[07:53:16] [1mSelector_LightGBM[0m fitting and predicting completed
[07:53:16] Start fitting [1mLvl_0_Pipe_0_Mod_0_CatBoost[0m ...
[07:53:16] Training params: {'task_type': 'CPU', 'thread_count': 4, 'random_seed': 42, 'num_trees': 3000, 'learning_rate': 0.05, 'l2_leaf_reg': 0.01, 'bootstrap_type': 'Bernoulli', 'grow_policy': 'SymmetricTree', 'max

In [25]:
train_pred

array([[0.02154765],
       [0.01641403],
       [0.02047052],
       ...,
       [0.01741803],
       [0.01721921],
       [0.01768548]], dtype=float32)

In [None]:
wandb.finish()

In [18]:
automl.get_feature_scores('fast')

In [19]:
# Accurate feature importances calculation with detailed info (Permutation importances) -  can take long time to calculate
accurate_fi = automl.get_feature_scores('fast')
accurate_fi.set_index('Feature')['Importance'].plot.bar(figsize = (30, 10), grid = True)

AttributeError: 'NoneType' object has no attribute 'set_index'

In [32]:
import joblib

joblib.dump(automl_rd.model, 'automl_rd.pkl')
# automl=joblib.load('/kaggle/input/next-orders/automl_rd.pkl')


# Initialize a new W&B run
run = wandb.init(project='sber-inter', 
                 config=CONFIG,
                 group='lightAutoML', 
                 job_type='save')

# Update `wandb.config`
wandb.config.type = 'lightAutoML'
wandb.config.kaggle_competition = 'SberMarket Competition'

# Save model as Model Artifact
artifact = wandb.Artifact(name='automl_rd', type='model')
artifact.add_file('/kaggle/input/next-orders/automl_rd.pkl')
run.log_artifact(artifact)

# Finish W&B run
run.finish()

PicklingError: Can't pickle <function f1 at 0x7c05cb045510>: it's not the same object as __main__.f1

In [14]:
best_score = 0
for i in np.arange(0.01, 1.0, 0.01):
    score = f1 = f1_score(Valid_set.target, (valid_pred.data > i).astype(int))
    if score > best_score:
        best_score = score
        proba_split = i

print('At i =', "%.2f" % proba_split,'score is : ' "%.5f" % best_score)

At i = 0.21 score is : 0.42834


In [None]:
Train

In [None]:
Test = Train.copy()
Test['order_number'] += 1 
Test['ordered'] = Test['ordered'] + Test['target']
test_total_ordered = Test.groupby('category')['ordered'].sum()
Test['total_ordered'] = Test['category'].map(test_total_ordered)
Test['rating'] = Test['ordered'] / Test['order_number']
Test = Test.drop('target', axis=1)
Test.head(3)

In [None]:
predictions = automl.predict(Test)
print('Train target mean:', "%.5f" % Train.target.mean())
print('Test target mean:', "%.5f" % (predictions.data > 0.5).astype(int).mean())

In [None]:
th = 0.5
train_mean = Train.target.mean()
test_mean = (predictions.data > th).astype(int).mean()

while test_mean < train_mean:
    th -= 0.005
    test_mean = (predictions.data > th).astype(int).mean()
    
print('Threshold:', "%.4f" % th)
print('Train mean:', "%.5f" % train_mean)
print('New Test mean:', "%.5f" % test_mean)

In [None]:
Test['target'] = (predictions.data > th).astype(int)
submit = pd.merge(sub['id'], Test[['id', 'target']], on='id')

In [None]:
import csv

with open('submission.csv', 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(submit.columns)
    for row in submit.values:
        csvwriter.writerow(row)

In [None]:
submit