In [1]:
# !python3 -m venv myenv
# !source myenv/bin/activate
!pip uninstall pandas -y
!pip install --upgrade pip > installations.txt
!pip uninstall torch -y > installations.txt
!pip install torch==2.0.0 > installations.txt
!pip install pandas==1.4.3 pyarrow yellowbrick polars transformers nltk gensim lightautoml > installations.txt
!pip install --upgrade -q wandb > installations.txt

Found existing installation: pandas 2.2.0
Uninstalling pandas-2.2.0:
  Successfully uninstalled pandas-2.2.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
fastai 2.7.13 requires pandas, which is not installed.[0m[31m
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
fastai 2.7.13 requires pandas, which is not installed.
stable-baselines3 2.1.0 requires pandas, which is not installed.
torchaudio 2.1.2+cpu requires torch==2.1.2, but you have torch 2.0.0 which is incompatible.
torchtext 0.16.2+cpu requires torch==2.1.2, but you have torch 2.0.0 which is incompatible.
torchvision 0.16.2+cpu requires torch==2.1.2, but you have torch 2.0.0 which is incompatible.[0m[31m
[0m[31mERROR: pip's dependency resolver does not curre

In [2]:
import polars as pl
import numpy as np
import pyarrow as pa
import os
import time
import optuna
import requests
import sys
import numpy as np
import pandas as pd

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import torch
from copy import deepcopy as copy
import torch.nn as nn
from collections import OrderedDict
from collections import Counter

from lightautoml.automl.base import AutoML
from lightautoml.ml_algo.boost_lgbm import BoostLGBM
from lightautoml.ml_algo.boost_cb import BoostCB
from lightautoml.ml_algo.tuning.optuna import OptunaTuner
from lightautoml.pipelines.features.lgb_pipeline import LGBSimpleFeatures
from lightautoml.pipelines.ml.base import MLPipeline
from lightautoml.pipelines.selection.importance_based import ImportanceCutoffSelector, ModelBasedImportanceEstimator
from lightautoml.reader.base import PandasToPandasReader
from lightautoml.tasks import Task
from lightautoml.automl.blend import WeightedBlender
from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/sbermarket-internship-competition/sample_submission.csv
/kaggle/input/sbermarket-internship-competition/train.csv
/kaggle/input/next-orders/Train (1).parquet
/kaggle/input/next-orders/s_Train.parquet
/kaggle/input/next-orders/s_Test.parquet


In [3]:
print(pd.__version__)

1.4.3


In [4]:
def ohe_data(raw: pd.DataFrame) -> pl.DataFrame:
    """
    Вовращает ohe матрицу для категорий.

    Args:
        raw (pd.DataFrame): Pandas DataFrame.

    Returns:
        (tpl.DataFrame).
    """

    # OHE 'cart', группировка по юзеру и дате заказа по максимальным значениям
    # Таким образом для каждой категории, если она была в этот день в заказе у пользователя, будет 1
    train_raw = pd.get_dummies(raw, columns=['cart'])
    train_raw = pl.from_pandas(train_raw)
    train_raw = train_raw.group_by(['user_id', 'order_completed_at']).max()
    train_raw = train_raw.sort(['user_id', 'order_completed_at'])

    return train_raw

def full_history(train_raw: pl.DataFrame) -> pl.DataFrame:
    """
    Обрабатывает датафрейм polars и возвращает весь датасет.

    Args:
        raw (pl.DataFrame): Polars DataFrame для обработки.

    Returns:
        pl.DataFrame: Обработанный DataFrame для train.
    """
    
    # Вычисление переменных времени
    train_raw = train_raw.with_columns(pl.col("order_completed_at").str.to_datetime())
    train_raw = train_raw.with_columns(
        [
            (pl.col('user_id').cum_count() - pl.lit(1)).over(['user_id']).alias('order_number'),
            pl.col("order_completed_at").dt.hour().alias("hour"),
            pl.col("order_completed_at").dt.week().alias("week"),
            pl.col("order_completed_at").dt.weekday().alias("weekday"),
            pl.col("order_completed_at").dt.day().alias("day"),
            pl.col("order_completed_at").dt.month().alias("month"),
            pl.col("order_completed_at").dt.year().alias("year"),
        ]
    )
    train_raw = train_raw.drop('order_completed_at')

    return train_raw

def create_dataset(train_data: pl.DataFrame, history_flag: int = 0)-> pd.DataFrame:
    """
    Компилирует историю заказов (train_data) в укомплектованный тренировочный pd.Dataframe "Train".

    Args:
        train_data (pl.DataFrame): история заказов (корзина на каждого юзера).
        history_flag (int): флаг 0 - Тренировочный датасет, 1 - общий (тестовый) датасет без таргета.

    Returns:
        pd.DataFrame: X + Y for model/ or just X.
    """
    # Сепарация последнего заказа из истории заказов
    grouped = train_data.group_by('user_id').agg(pl.max('order_number').alias('max_order_number'))
    merged = train_data.join(grouped, on='user_id')
    last_order = merged['max_order_number'] == merged['order_number']

    # Разделение на train_data (заказы, кроме последнего) и valid_data (последний заказ) .group_by('user_id').sum()
    train_tmp = train_data.filter(~last_order)
    val_data = train_data.filter(last_order).drop('hour', 'week', 'weekday', 'day', 'month', 'year')
    # Преобразование в длинный формат юзера и категории товара
    if not history_flag:
        train_melt = train_tmp.drop('order_number').melt(id_vars=['user_id', 'hour', 'week', 'weekday', 'day', 'month', 'year'], variable_name='category', value_name='ordered')
    elif history_flag == 1:
        train_melt = train_data.drop('order_number').melt(id_vars=['user_id', 'hour', 'week', 'weekday', 'day', 'month', 'year'], variable_name='category', value_name='ordered')
    else: 
        print("Invalid history flag. Exiting...")
        exit()
    # Тренировочный датасет
    Train = train_melt
    
    order_number_df = val_data.select(['user_id', 'order_number']).unique() # Количество заказов у юзера
    dl_tmp = Train.group_by('category').agg(pl.col(['ordered']).sum()) # Количество заказов по категории
    if history_flag == 1:
        order_number_df = order_number_df.with_columns(pl.col("order_number") + 1)
        dl_tmp = Train.group_by('category').agg(pl.col(['ordered']).sum()+1)
    Train = Train.join(order_number_df, on='user_id').rename({"order_number": "total_order_num"})
    Train = Train.join(dl_tmp, on='category').rename({"ordered_right": "total_order_in_cat"})
    # Вычисление рейтинга для каждой записи !!?? Как еще можно рассчитывать рейтинг
    Train = Train.with_columns(
        [
            (pl.col('ordered') / pl.col('total_order_num')).alias('total_rating'),
            (pl.col('user_id').cast(pl.Utf8) + ';' + pl.col('category')).alias('id')
        ]
    )
    Train = Train.group_by('user_id', 'category', 'id').agg(
        pl.col(['hour']).mean().name.suffix("_mean"), pl.col(['hour']).max().name.suffix("_max"),
        pl.col(['hour']).min().name.suffix("_min"), pl.col(['hour']).std().name.suffix("_std"),
        pl.col(['week']).mean().name.suffix("_mean"), pl.col(['week']).max().name.suffix("_max"),
        pl.col(['week']).min().name.suffix("_min"), pl.col(['week']).std().name.suffix("_std"),
        pl.col(['weekday']).mean().name.suffix("_mean"), pl.col(['weekday']).max().name.suffix("_max"),
        pl.col(['weekday']).min().name.suffix("_min"), pl.col(['weekday']).std().name.suffix("_std"),
        pl.col(['day']).mean().name.suffix("_mean"), pl.col(['day']).max().name.suffix("_max"),
        pl.col(['day']).min().name.suffix("_min"), pl.col(['day']).std().name.suffix("_std"),
        pl.col(['month']).mean().name.suffix("_mean"), pl.col(['month']).max().name.suffix("_max"),
        pl.col(['month']).min().name.suffix("_min"), pl.col(['month']).std().name.suffix("_std"),
        pl.col(['year']).mean().name.suffix("_mean"), pl.col(['year']).max().name.suffix("_max"),
        pl.col(['year']).min().name.suffix("_min"), pl.col(['year']).std().name.suffix("_std"),
        pl.col(['total_rating']).mean().name.suffix("_mean"), pl.col(['total_rating']).max().name.suffix("_max"),
        pl.col(['total_rating']).min().name.suffix("_min"), pl.col(['total_rating']).std().name.suffix("_std"),
        pl.col(['ordered']).mean().name.suffix("_mean"), pl.col(['ordered']).max().name.suffix("_max"),
        pl.col(['ordered']).min().name.suffix("_min"), pl.col(['ordered']).std().name.suffix("_std"),
        pl.col(['total_order_in_cat']).mean().name.suffix("_mean"), pl.col(['total_order_in_cat']).max().name.suffix("_max"),
        pl.col(['total_order_in_cat']).min().name.suffix("_min"), pl.col(['total_order_in_cat']).std().name.suffix("_std"),
        pl.col(['total_order_num']).mean().name.suffix("_mean"), pl.col(['total_order_num']).max().name.suffix("_max"),
        pl.col(['total_order_num']).min().name.suffix("_min"), pl.col(['total_order_num']).std().name.suffix("_std"),
    )
    # Вычисление рейтинга по времени
    rating_per_hour = Train.group_by(['year_mean', 'month_mean', 'hour_mean', 'user_id']).agg(
        ((pl.col("total_rating_mean").mean()) * 100).alias("rating_per_hour_mean"),
        ((pl.col("total_rating_mean").std()) * 100).alias("rating_per_hour_std"),
        ((pl.col("total_rating_mean").sum())).alias("rating_per_hour_sum"),
        ((pl.col("total_rating_mean").median()) * 100).alias("rating_per_hour_median"),
    )

    rating_per_weekday = Train.group_by(['year_mean', 'month_mean', 'weekday_mean', 'user_id']).agg(
        ((pl.col("total_rating_mean").mean()) * 100).alias("rating_per_w_mean"),
        ((pl.col("total_rating_mean").std()) * 100).alias("rating_per_w_std"),
        ((pl.col("total_rating_mean").sum())).alias("rating_per_w_sum"),
        ((pl.col("total_rating_mean").median()) * 100).alias("rating_per_w_median"),
    )
    rating_per_day = Train.group_by(['year_mean', 'month_mean', 'day_mean', 'user_id']).agg(
        ((pl.col("total_rating_mean").mean()) * 100).alias("rating_per_d_mean"),
        ((pl.col("total_rating_mean").std()) * 100).alias("rating_per_d_std"),
        ((pl.col("total_rating_mean").sum())).alias("rating_per_d_sum"),
        ((pl.col("total_rating_mean").median()) * 100).alias("rating_per_d_median"),
    )

    rating_per_month = Train.group_by(['year_mean', 'month_mean', 'user_id']).agg(
        ((pl.col("total_rating_mean").mean()) * 100).alias("rating_per_m_mean"),
        ((pl.col("total_rating_mean").std()) * 100).alias("rating_per_m_std"),
        ((pl.col("total_rating_mean").sum())).alias("rating_per_m_sum"),
        ((pl.col("total_rating_mean").median()) * 100).alias("rating_per_m_median"),
    )


    Train = Train.join(rating_per_month, on=['year_mean', 'month_mean', 'user_id'])
    Train = Train.join(rating_per_weekday, on=['year_mean', 'month_mean', 'weekday_mean', 'user_id'])
    Train = Train.join(rating_per_day, on=['year_mean', 'month_mean', 'day_mean', 'user_id'])
    Train = Train.join(rating_per_hour, on=['year_mean', 'month_mean', 'hour_mean', 'user_id'])
    
    if not history_flag:
    # Присоединение целевой переменной из valid_melt к Train
        valid_melt = val_data.drop('order_number',).melt(id_vars=['user_id', ], variable_name='category', value_name='target')
        Train = Train.join(valid_melt, on = ['user_id', 'category'])
    #     Train = Train.drop('user_id', 'category')

    # Преобразование в pandas DataFrame
    Train = Train.to_pandas()
    Train['id'] = Train['id'].str.replace('cart_', '')
    Train['category'] = Train['category'].str.replace('cart_', '')

    return Train

In [60]:
# tmp импорт в pandas

raw = pd.read_csv('../input/sbermarket-internship-competition/train.csv')
sub = pd.read_csv('../input/sbermarket-internship-competition/sample_submission.csv', sep = ",")

# Приведение столбца 'cart' к int
raw['cart'] = raw['cart'].astype(int)

def filter_raw_data(raw, sub):
    users = map(int, (x.split(';')[0] for x in sub['id']))
    user_counts = Counter(users)
    frequent_users = {user for user, count in user_counts.items() if count > 25}
    sub_users = {user for user, count in user_counts.items()}
    filtered_raw = raw[raw['user_id'].isin(frequent_users)]
    filtered_sub = raw[raw['user_id'].isin(sub_users)]
    total_count = sum(user_counts.values())
    frequent_count = sum(count for user, count in user_counts.items() if user in frequent_users)
    proportion = frequent_count / total_count
    
    return filtered_raw, filtered_sub, proportion

filtered_raw, filtered_sub, proportion = filter_raw_data(raw, sub)
print(f"Процент наблюдений, используемый для тренировки: {proportion:.2f}%")

In [90]:
filtered_sub

Unnamed: 0,user_id,order_completed_at,cart
16,3,2015-06-18 16:15:33,399
17,3,2015-07-04 14:05:22,399
18,4,2015-07-08 06:59:04,54
19,4,2015-07-08 06:59:04,55
20,5,2015-07-22 08:16:24,409
...,...,...,...
3123059,12702,2020-09-03 23:45:45,441
3123060,12702,2020-09-03 23:45:45,92
3123061,12702,2020-09-03 23:45:45,431
3123062,12702,2020-09-03 23:45:45,24


In [5]:
%%time
# # filtered_raw = pl.from_pandas(filtered_raw)
# train_raw = ohe_data(filtered_raw)
# train_data = full_history(train_raw)
# Train = create_dataset(train_data)

# # filtered_sub = pl.from_pandas(filtered_sub)
test_sub = ohe_data(filtered_sub)
test_data= full_history(test_sub)
Test = create_dataset(test_data, history_flag = 1)


  order_number_df = order_number_df.with_columns(pl.col("order_number").apply(lambda x: x + 1))
Expr.map_elements is significantly slower than the native expressions API.
Only use if you absolutely CANNOT implement your logic otherwise.
Replace this expression...
  - pl.col("order_number").map_elements(lambda x: ...)
with this one instead:
  + pl.col("order_number") + 1

  order_number_df = order_number_df.with_columns(pl.col("order_number").apply(lambda x: x + 1))


CPU times: user 8min 32s, sys: 2min 49s, total: 11min 22s
Wall time: 1min 4s


In [6]:
Test

Unnamed: 0,user_id,category,id,hour_mean,hour_max,hour_min,hour_std,week_mean,week_max,week_min,...,rating_per_w_sum,rating_per_w_median,rating_per_d_mean,rating_per_d_std,rating_per_d_sum,rating_per_d_median,rating_per_hour_mean,rating_per_hour_std,rating_per_hour_sum,rating_per_hour_median
0,153,0,153;0,14.666667,22,8,3.792223,29.066667,49,5,...,0.773333,0.0,0.090132,0.378244,0.773333,0.0,0.090132,0.378244,0.773333,0.0
1,284,0,284;0,14.157895,23,1,4.728908,28.315789,52,2,...,1.116343,0.0,0.130110,0.458485,1.116343,0.0,0.130110,0.458485,1.116343,0.0
2,895,0,895;0,12.470588,21,7,2.956608,26.647059,52,2,...,0.442042,0.0,0.051520,0.208794,0.442042,0.0,0.051520,0.208794,0.442042,0.0
3,957,0,957;0,13.529412,20,8,4.417545,20.470588,34,6,...,1.259516,0.0,0.146797,0.612990,1.259516,0.0,0.146797,0.612990,1.259516,0.0
4,1346,0,1346;0,12.625000,22,8,4.349329,37.500000,48,10,...,0.882812,0.0,0.102892,0.441037,0.882812,0.0,0.102892,0.441037,0.882812,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11184883,19389,880,19389;880,12.200000,16,9,3.114482,34.200000,36,31,...,3.800000,0.0,0.442890,2.263737,3.800000,0.0,0.442890,2.263737,3.800000,0.0
11184884,19477,880,19477;880,13.666667,19,7,6.110101,34.000000,36,31,...,6.000000,0.0,0.699301,3.143645,6.000000,0.0,0.699301,3.143645,6.000000,0.0
11184885,19682,880,19682;880,14.000000,17,12,2.645751,33.666667,35,32,...,2.555556,0.0,0.297850,2.022086,2.555556,0.0,0.297850,2.022086,2.555556,0.0
11184886,19767,880,19767;880,14.333333,23,6,6.708204,34.111111,36,33,...,0.296296,0.0,0.034533,0.346039,0.296296,0.0,0.034533,0.346039,0.296296,0.0


In [21]:
x_cols = Train.select_dtypes(include=['int', 'int8', 'int32', 'uint32', 'uint8', 'float']).drop(columns = ['target'], axis = 1).columns.tolist()
x_cols_pca = Train.select_dtypes(include=['int', 'int8', 'int32', 'uint32', 'uint8', 'float']).drop(columns = ['user_id', 'target'], axis = 1).columns.tolist()
print(x_cols)
y_cols = ['target']
print(y_cols)

['user_id', 'hour_mean', 'hour_max', 'hour_min', 'hour_std', 'week_mean', 'week_max', 'week_min', 'week_std', 'weekday_mean', 'weekday_max', 'weekday_min', 'weekday_std', 'day_mean', 'day_max', 'day_min', 'day_std', 'month_mean', 'month_max', 'month_min', 'month_std', 'year_mean', 'year_max', 'year_min', 'year_std', 'total_rating_mean', 'total_rating_max', 'total_rating_min', 'total_rating_std', 'ordered_mean', 'ordered_max', 'ordered_min', 'ordered_std', 'total_order_in_cat_mean', 'total_order_in_cat_max', 'total_order_in_cat_min', 'total_order_in_cat_std', 'total_order_num_mean', 'total_order_num_max', 'total_order_num_min', 'total_order_num_std', 'rating_per_m_mean', 'rating_per_m_std', 'rating_per_m_sum', 'rating_per_m_median', 'rating_per_w_mean', 'rating_per_w_std', 'rating_per_w_sum', 'rating_per_w_median', 'rating_per_d_mean', 'rating_per_d_std', 'rating_per_d_sum', 'rating_per_d_median', 'rating_per_hour_mean', 'rating_per_hour_std', 'rating_per_hour_sum', 'rating_per_hour_med

In [9]:
import numpy as np
import pandas as pd
import datetime
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import colors
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt, numpy as np
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import AgglomerativeClustering
from matplotlib.colors import ListedColormap
from sklearn import metrics
import warnings
import sys
if not sys.warnoptions:
    warnings.simplefilter("ignore")
np.random.seed(42)

In [10]:
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
scaler = RobustScaler()

s_Train = Train.copy()
s_Test = Test.copy()
s_Train[x_cols_pca] = scaler.fit_transform(s_Train[x_cols_pca])
s_Test[x_cols_pca] = scaler.transform(s_Test[x_cols_pca])

pca = PCA(n_components=10)

s_Train_pca = pca.fit_transform(s_Train[x_cols])
s_Test_pca = pca.transform(s_Test[x_cols])
# test_X_scaled = pca.transform(test_X_scaled)

Train_pca = pd.DataFrame(s_Train_pca, columns=[f'pc{i+1}' for i in range(s_Train_pca.shape[1])])
Test_pca = pd.DataFrame(s_Test_pca, columns=[f'pc{i+1}' for i in range(s_Train_pca.shape[1])])
# Train_pca['target'] = Train['target'].values
# Train_pca['id'] = Train['id'].values

In [11]:
s_Train[Train_pca.columns.to_list()] = Train_pca
s_Test[Test_pca.columns.to_list()] = Test_pca

In [14]:
s_Test

Unnamed: 0,user_id,category,id,hour_mean,hour_max,hour_min,hour_std,week_mean,week_max,week_min,...,pc1,pc2,pc3,pc4,pc5,pc6,pc7,pc8,pc9,pc10
0,153,0,153;0,0.840580,0.8,0.50,-0.131764,0.402381,0.9375,-0.578947,...,-9661.438391,24.418623,-0.362202,-0.833304,1.116625,0.871740,0.504358,0.722600,-0.394680,-0.818780
1,284,0,284;0,0.707857,1.0,-1.25,0.312207,0.295113,1.1250,-0.736842,...,-9530.438553,24.417897,-0.526686,0.055853,1.296554,0.134439,0.717725,-0.159912,-0.511948,-0.331757
2,895,0,895;0,0.267690,0.6,0.25,-0.527829,0.056723,1.1250,-0.736842,...,-8919.438727,24.417427,-1.623824,1.560442,-0.071487,0.079832,-0.287262,-0.488330,1.457815,-0.502465
3,957,0,957;0,0.543905,0.4,0.50,0.164627,-0.825630,0.0000,-0.526316,...,-8857.438018,24.418469,0.331360,-0.703815,-0.167349,-0.163138,0.208841,-1.033115,0.680857,-0.961515
4,1346,0,1346;0,0.307971,0.8,0.50,0.132294,1.607143,0.8750,-0.315789,...,-8468.438258,24.407191,-0.417748,-0.338557,2.837296,-0.251890,0.699092,0.339546,-0.423524,-0.422657
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11184883,19389,880,19389;880,0.197101,-0.4,0.75,-0.453000,1.135714,0.1250,0.789474,...,9574.562545,-5.331153,-1.134763,0.235694,0.873033,2.274734,1.436147,1.586598,0.504634,0.013380
11184884,19477,880,19477;880,0.579710,0.2,0.25,0.966866,1.107143,0.1250,0.789474,...,9662.563017,-5.331153,0.808744,1.023446,0.691884,2.552110,4.407932,0.211402,1.850834,-0.955807
11184885,19682,880,19682;880,0.666667,-0.2,1.50,-0.675169,1.059524,0.0625,0.842105,...,9867.562567,-5.331145,-1.445396,-0.493861,1.566347,2.146775,-1.682456,0.871803,-0.763452,-0.624961
11184886,19767,880,19767;880,0.753623,1.0,0.00,1.250355,1.123016,0.1250,0.894737,...,9952.561589,-5.331135,-5.199216,-0.411623,2.468933,-0.151392,-0.222515,-1.403892,-1.141549,-1.334673


In [None]:
# print('Elbow Method to determine the number of clusters to be formed:')
# warnings.filterwarnings("ignore", message="findfont:.*")
# Elbow_M = KElbowVisualizer(KMeans(), k=21)
# Elbow_M.fit(Train_pca.drop(columns = ['id', 'target'], axis = 1))
# Elbow_M.show()

In [15]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=5)
yhat_kmeans = kmeans.fit_predict(s_Train.drop(columns=['id', 'target', 'user_id', 'category']))
yhat_kmeans_test = kmeans.predict(s_Test.drop(columns=['id','user_id','category']))

# Добавление информации о кластерах в исходный датафрейм
# Train_pca["Clusters"] = yhat_kmeans
s_Train["Clusters"] = yhat_kmeans
s_Test["Clusters"] = yhat_kmeans_test
# Train["Clusters"] = yhat_kmeans

In [16]:
s_Test

Unnamed: 0,user_id,category,id,hour_mean,hour_max,hour_min,hour_std,week_mean,week_max,week_min,...,pc2,pc3,pc4,pc5,pc6,pc7,pc8,pc9,pc10,Clusters
0,153,0,153;0,0.840580,0.8,0.50,-0.131764,0.402381,0.9375,-0.578947,...,24.418623,-0.362202,-0.833304,1.116625,0.871740,0.504358,0.722600,-0.394680,-0.818780,2
1,284,0,284;0,0.707857,1.0,-1.25,0.312207,0.295113,1.1250,-0.736842,...,24.417897,-0.526686,0.055853,1.296554,0.134439,0.717725,-0.159912,-0.511948,-0.331757,2
2,895,0,895;0,0.267690,0.6,0.25,-0.527829,0.056723,1.1250,-0.736842,...,24.417427,-1.623824,1.560442,-0.071487,0.079832,-0.287262,-0.488330,1.457815,-0.502465,2
3,957,0,957;0,0.543905,0.4,0.50,0.164627,-0.825630,0.0000,-0.526316,...,24.418469,0.331360,-0.703815,-0.167349,-0.163138,0.208841,-1.033115,0.680857,-0.961515,2
4,1346,0,1346;0,0.307971,0.8,0.50,0.132294,1.607143,0.8750,-0.315789,...,24.407191,-0.417748,-0.338557,2.837296,-0.251890,0.699092,0.339546,-0.423524,-0.422657,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11184883,19389,880,19389;880,0.197101,-0.4,0.75,-0.453000,1.135714,0.1250,0.789474,...,-5.331153,-1.134763,0.235694,0.873033,2.274734,1.436147,1.586598,0.504634,0.013380,4
11184884,19477,880,19477;880,0.579710,0.2,0.25,0.966866,1.107143,0.1250,0.789474,...,-5.331153,0.808744,1.023446,0.691884,2.552110,4.407932,0.211402,1.850834,-0.955807,4
11184885,19682,880,19682;880,0.666667,-0.2,1.50,-0.675169,1.059524,0.0625,0.842105,...,-5.331145,-1.445396,-0.493861,1.566347,2.146775,-1.682456,0.871803,-0.763452,-0.624961,4
11184886,19767,880,19767;880,0.753623,1.0,0.00,1.250355,1.123016,0.1250,0.894737,...,-5.331135,-5.199216,-0.411623,2.468933,-0.151392,-0.222515,-1.403892,-1.141549,-1.334673,4


In [None]:
# Train_pca = Train_pca.drop(columns = ['id', 'target'], axis = 1)
# s_Train[Train_pca.columns.to_list()] = Train_pca

In [17]:
# Train.to_parquet('Train.parquet', index=False)
# s_Train.to_parquet('s_Train.parquet', index=False)
# s_Test.to_parquet('s_Test.parquet', index=False)

## Parquet

In [52]:
# s_Train = pd.read_parquet('/kaggle/input/next-orders/s_Train.parquet')
s_Test = pd.read_parquet('/kaggle/input/next-orders/s_Test.parquet')

In [6]:
N_THREADS = 2 # threads cnt for lgbm and linear models
N_FOLDS = 5 # folds cnt for AutoML
RANDOM_STATE = 42 # fixed random state for various reasons
TEST_SIZE = 0.2 # Test size for metric check
TARGET_NAME = 'target' # Target column name

In [7]:
Train_set, Valid_set = train_test_split(s_Train, test_size = TEST_SIZE,
                                        stratify = None, random_state = 23)
Train_set.reset_index(drop=True, inplace=True)
Valid_set.reset_index(drop=True, inplace=True)

In [8]:
%%time
model0 = BoostCB()
pipe0 = LGBSimpleFeatures()
mbie = ModelBasedImportanceEstimator()
selector = ImportanceCutoffSelector(pipe0, model0, mbie)

pipe = LGBSimpleFeatures()

params_tuner1 = OptunaTuner(n_trials=20, timeout=60)
model1 = BoostLGBM(
    default_params={'learning_rate': 0.05, 'num_leaves': 128, 'seed': 1, 'num_threads': N_THREADS}
)
model2 = BoostLGBM(
    default_params={'learning_rate': 0.025, 'num_leaves': 64, 'seed': 2, 'num_threads': N_THREADS}
)

pipeline_lvl1 = MLPipeline([
    (model1, params_tuner1),
    model2
], pre_selection=selector, features_pipeline=pipe, post_selection=None)

pipe1 = LGBSimpleFeatures()

model = BoostCB()

pipeline_lvl2 = MLPipeline([model], pre_selection=None, features_pipeline=pipe1, post_selection=None)

CPU times: user 487 µs, sys: 3 µs, total: 490 µs
Wall time: 499 µs


In [9]:
def f1 (real, pred, **kwargs):
    return f1_score(real, (pred > 0.2).astype(int), **kwargs)

# ROLES = {'target': TARGET_NAME, 'drop': ['id'], 'category': ['Clusters']}
ROLES = {'target': TARGET_NAME, 'drop': ['id', 'user_id', 'category'], 'category': ['Clusters']}
TASK = Task('binary', metric = f1)
reader = PandasToPandasReader(TASK, cv=N_FOLDS, random_state=RANDOM_STATE)

lama_params = {
    "task": TASK,
    "cpu_limit": N_THREADS,
    "reader_params": {'n_jobs': N_THREADS, 'cv': N_FOLDS, 'random_state': RANDOM_STATE}
}
default_nn_params = {
    "bs": 128, "num_workers": 0, "path_to_save": None, "n_epochs": 1, "freeze_defaults": True
}


In [10]:
from kaggle_secrets import UserSecretsClient
import wandb
from wandb.keras import WandbCallback

user_secrets = UserSecretsClient()
wandb_api = user_secrets.get_secret("wandb_api") 

wandb.login(key=wandb_api)
wandb.login()

2024-02-28 06:41:34.030900: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-28 06:41:34.031150: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-28 06:41:34.212133: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mirinyakov2016[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [11]:
CONFIG = dict (
    lama_params,
    general_params = {"use_algos": [["linear_12", "lgbm", "denselight"]]},
    tuning_params = {'max_tuning_iter': 20},
    lgb_params = {'default_params': {'num_threads': N_THREADS}},
    nn_params={**default_nn_params,'lr': 0.03},
    infra = "Kaggle",
    competition = 'plant-pathology',
    _wandb_kernel = 'ayut'
)
# Update CONFIG dict with the name of the model.
CONFIG['model_name'] = 'lightAutoML-experiments_w_features_1'
print('Training configuration: ', CONFIG)

# Initialize W&B run
run = wandb.init(project='sber-inter', 
                 config=CONFIG,
                 group='lightAutoML', 
                 job_type='train')

wandb.config.type = 'lightAutoML'
wandb.config.kaggle_competition = 'SberMarket Competition'

Training configuration:  {'task': <lightautoml.tasks.base.Task object at 0x7ec049061a50>, 'cpu_limit': 2, 'reader_params': {'n_jobs': 2, 'cv': 5, 'random_state': 42}, 'general_params': {'use_algos': [['linear_12', 'lgbm', 'denselight']]}, 'tuning_params': {'max_tuning_iter': 20}, 'lgb_params': {'default_params': {'num_threads': 2}}, 'nn_params': {'bs': 128, 'num_workers': 0, 'path_to_save': None, 'n_epochs': 1, 'freeze_defaults': True, 'lr': 0.03}, 'infra': 'Kaggle', 'competition': 'plant-pathology', '_wandb_kernel': 'ayut', 'model_name': 'lightAutoML-experiments_w_features_1'}


In [42]:
%%time 
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.report.report_deco import ReportDeco

# RD = ReportDeco(output_path = 'tabularAutoML_lld_report')

automl = TabularUtilizedAutoML(
    task = TASK,
    timeout = 3600*3,
    cpu_limit = N_THREADS,
    reader_params = {'n_jobs': N_THREADS, 'cv': N_FOLDS, 'random_state': RANDOM_STATE},
)

# automl = TabularAutoML(
#     **lama_params,
#     general_params = {"use_algos": [["cb", "lgbm", "denselight"]]},
#     timeout = (3600*1000),
#     nn_params={
#         **default_nn_params,
#         "n_epochs": 10,
#         "tuned": False,
#         "tuning_params": {
#             "max_tuning_iter": 50,
#             "fit_on_holdout": True
#         }
#     },
# #         tuning_params = {'max_tuning_iter': 20},
# #         lgb_params = {'default_params': {'num_threads': N_THREADS}},
# #         nn_params={**default_nn_params,'lr': 0.03},
#     #     linear_pipeline_params = pipe,
#     #     gbm_pipeline_params = pipe,
#     #     nn_pipeline_params = pipe
    
# )
# automl = TabularAutoML(task = TASK, 
#                        timeout = 300,
#                        cpu_limit = 4,
#                        reader_params = {'n_jobs': 4, 'cv': 5, 'random_state': 23},
#                       )

# automl = AutoML(reader, [
#     [pipeline_lvl1],
#     [pipeline_lvl2],
# ], skip_conn=False)

train_pred = automl.fit_predict(Train_set[['total_order_in_cat_max',
 'ordered_mean',
 'pc2',
 'weekday_std',
 'pc9',
 'pc10',
 'category',
 'pc4',
 'pc5',
 'pc6',
 'month_std',
 'pc3',
 'week_std',
 'hour_std',
 'hour_mean',
 'pc1',
 'pc7', 'target']], roles = ROLES, verbose = 2)

# best catboost params 
# {'task_type': 'CPU', 'thread_count': 4, 'random_seed': 42, 'num_trees': 3000, 'learning_rate': 0.03, 'l2_leaf_reg': 0.009044636094268511, 'bootstrap_type': 'Bernoulli', 'grow_policy': 'SymmetricTree', 'max_depth': 4, 'min_data_in_leaf': 7, 'one_hot_max_size': 10, 'fold_permutation_block': 1, 'boosting_type': 'Plain', 'boost_from_average': True, 'od_type': 'Iter', 'od_wait': 100, 'max_bin': 32, 'feature_border_type': 'GreedyLogSum', 'nan_mode': 'Min', 'verbose': 100, 'allow_writing_files': False}
# best linear
#
# best lgbm 
# {'task': 'train', 'learning_rate': 0.05, 'num_leaves': 105, 'feature_fraction': 0.8625799184703501, 'bagging_fraction': 0.5053328530427746, 'bagging_freq': 1, 'max_depth': -1, 'verbosity': -1, 'reg_alpha': 7.265259184516205e-05, 'reg_lambda': 0.621571500507215, 'min_split_gain': 0.0, 'zero_as_missing': False, 'num_threads': 4, 'max_bin': 255, 'min_data_in_bin': 3, 'num_trees': 3000, 'early_stopping_rounds': 100, 'random_state': 42, 'min_sum_hessian_in_leaf': 4.636375055852895}
# best denselight
# {'num_workers': 0, 'pin_memory': False, 'max_length': 256, 'is_snap': False, 'input_bn': False, 'max_emb_size': 256, 'bert_name': None, 'pooling': 'cls', 'device': 'cpu', 'use_cont': True, 'use_cat': True, 'use_text': False, 'lang': 'en', 'deterministic': True, 'multigpu': False, 'random_state': 42, 'model': 'denselight', 'model_with_emb': False, 'path_to_save': None, 'verbose_inside': None, 'verbose': 1, 'n_epochs': 30, 'snap_params': {'k': 3, 'early_stopping': True, 'patience': 10, 'swa': True}, 'bs': 1024, 'emb_dropout': 0.1, 'emb_ratio': 3, 'opt': 'Adam', 'opt_params': {'lr': 0.003757084358753148, 'weight_decay': 0}, 'sch': 'ReduceLROnPlateau', 'scheduler_params': {'patience': 5, 'factor': 0.5, 'min_lr': 1e-05}, 'loss': None, 'loss_params': {}, 'loss_on_logits': True, 'clip_grad': False, 'clip_grad_params': {}, 'init_bias': True, 'dataset': 'UniversalDataset', 'tuned': True, 'optimization_search_space': None, 'verbose_bar': False, 'freeze_defaults': True, 'n_out': None, 'hid_factor': [2, 2], 'hidden_size': [512, 256], 'block_config': [2, 2], 'compression': 0.5, 'growth_size': 256, 'bn_factor': 2, 'drop_rate': 0.1, 'noise_std': 0.05, 'num_init_features': None, 'act_fun': 'LeakyReLU', 'use_noise': False, 'use_bn': True, 'embedding_size': 10, 'cat_embedder': 'cat', 'cont_embedder': 'cont', 'stop_by_metric': False, 'tuning_params': {'fit_on_holdout': True, 'max_tuning_iter': 50, 'max_tuning_time': 3600}}

[09:42:10] Start automl [1mutilizator[0m with listed constraints:
[09:42:10] - time: 10800.00 seconds
[09:42:10] - CPU: 2 cores
[09:42:10] - memory: 16 GB

[09:42:10] [1mIf one preset completes earlier, next preset configuration will be started[0m

[09:42:10] Start 0 automl preset configuration:
[09:42:10] [1mconf_0_sel_type_0.yml[0m, random state: {'reader_params': {'random_state': 42}, 'nn_params': {'random_state': 42}, 'general_params': {'return_all_predictions': False}}
[09:42:10] Stdout logging level is INFO2.
[09:42:10] Task: binary

[09:42:10] Start automl preset with listed constraints:
[09:42:10] - time: 10799.99 seconds
[09:42:10] - CPU: 2 cores
[09:42:10] - memory: 16 GB

[09:42:10] [1mTrain data shape: (7871147, 18)[0m

[09:42:30] Layer [1m1[0m train process start. Time left 10779.81 secs
[09:56:41] Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
[09:56:42] ===== Start working with [1mfold 0[0m for [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m =====
[09:57:22] =====

In [None]:
[['total_order_in_cat_max',
 'ordered_mean',
 'pc2',
 'weekday_std',
 'pc9',
 'pc10',
 'category',
 'pc4',
 'pc5',
 'pc6',
 'month_std',
 'pc3',
 'week_std',
 'hour_std',
 'hour_mean',
 'pc1',
 'pc7', 'target']]

In [66]:
# print('Score', "%.5f" % f1(Train_set.target, train_pred.data))
valid_pred = automl.predict(Valid_set[['total_order_in_cat_max',
 'ordered_mean',
 'pc2',
 'weekday_std',
 'pc9',
 'pc10',
 'category',
 'pc4',
 'pc5',
 'pc6',
 'month_std',
 'pc3',
 'week_std',
 'hour_std',
 'hour_mean',
 'pc1',
 'pc7', 'target']])
print('Score on out of folds validation', "%.5f" % f1(Valid_set.target, valid_pred.data))

TypeError: 'numpy.float64' object is not callable

In [46]:
# fast_fi['Feature'].to_list()

In [44]:
fast_fi = automl.get_feature_scores('fast', silent=False)
fast_fi.set_index('Feature')['Importance'].plot.bar(figsize = (30, 10), grid = True)

[12:21:48] No feature importances to show. Please use another calculation method or another preset.


AttributeError: 'NoneType' object has no attribute 'set_index'

In [79]:
import joblib

joblib.dump(automl, 'automl_02.pkl')
# automl=joblib.load('/kaggle/input/next-orders/automl_rd.pkl')


# Initialize a new W&B run
run = wandb.init(project='sber-inter', 
                 config=CONFIG,
                 group='lightAutoML', 
                 job_type='save_experiment')

# Update `wandb.config`
wandb.config.type = 'lightAutoML'
wandb.config.kaggle_competition = 'SberMarket Competition'

# Save model as Model Artifact
artifact = wandb.Artifact(name='automl', type='model')
artifact.add_file('/kaggle/working/automl_01.pkl')
run.log_artifact(artifact)

# Finish W&B run
run.finish()

PicklingError: Can't pickle <function f1 at 0x7ec048a36290>: it's not the same object as __main__.f1

In [75]:
best_score = 0
for i in np.arange(0.01, 1.0, 0.01):
    score = f1 = f1_score(Valid_set.target, (valid_pred.data > i).astype(int))
    if score > best_score:
        best_score = score
        proba_split = i

print('At i =', "%.2f" % proba_split,'score is : ' "%.5f" % best_score)

At i = 0.21 score is : 0.43842


In [None]:
Train

In [53]:
s_Test.head(3)

Unnamed: 0,user_id,category,id,hour_mean,hour_max,hour_min,hour_std,week_mean,week_max,week_min,...,pc2,pc3,pc4,pc5,pc6,pc7,pc8,pc9,pc10,Clusters
0,153,0,153;0,0.84058,0.8,0.5,-0.131764,0.402381,0.9375,-0.578947,...,24.418623,-0.362202,-0.833304,1.116625,0.87174,0.504358,0.7226,-0.39468,-0.81878,2
1,284,0,284;0,0.707857,1.0,-1.25,0.312207,0.295113,1.125,-0.736842,...,24.417897,-0.526686,0.055853,1.296554,0.134439,0.717725,-0.159912,-0.511948,-0.331757,2
2,895,0,895;0,0.26769,0.6,0.25,-0.527829,0.056723,1.125,-0.736842,...,24.417427,-1.623824,1.560442,-0.071487,0.079832,-0.287262,-0.48833,1.457815,-0.502465,2


In [56]:
# predictions = automl.predict(s_Test)
print('Train target mean:', "%.5f" % s_Train.target.mean())
print('Test target mean:', "%.5f" % (predictions.data > proba_split).astype(int).mean())

Train target mean: 0.01796
Test target mean: 0.02123


In [76]:
th = proba_split
train_mean = s_Train.target.mean()
test_mean = (predictions.data > th).astype(int).mean()

while test_mean < train_mean:
    th -= 0.004
    test_mean = (predictions.data > th).astype(int).mean()
    
print('Threshold:', "%.4f" % th)
print('Train mean:', "%.5f" % train_mean)
print('New Test mean:', "%.5f" % test_mean)

Threshold: 0.2100
Train mean: 0.01796
New Test mean: 0.02123


In [77]:
s_Test['target'] = (predictions.data > th).astype(int)
submit = pd.merge(sub['id'], s_Test[['id', 'target']], on='id')

In [78]:
import csv

with open('submission02.csv', 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(submit.columns)
    for row in submit.values:
        csvwriter.writerow(row)

In [64]:
submit

Unnamed: 0,id,target
0,0;133,0
1,0;5,1
2,0;10,0
3,0;396,0
4,0;14,1
...,...,...
790444,19998;26,0
790445,19998;31,0
790446,19998;29,0
790447,19998;798,0
