In [None]:
# !python3 -m venv myenv
# !source myenv/bin/activate

!pip install --upgrade pip > installations.txt
!pip install pyarrow polars


In [None]:
import socket

# Look up this machine's hostname, then resolve that name to an address.
hostname = socket.gethostname()
local_ip = socket.gethostbyname(hostname)

# Report both values, one labelled line each.
for label, value in (("Hostname:", hostname), ("Local IP:", local_ip)):
    print(label, value)


In [None]:
import polars as pl
import numpy as np
import pyarrow as pa
import pandas as pd
import os
import time
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
# from lightautoml.automl.presets.tabular_presets import TabularAutoML
# from lightautoml.tasks import Task

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))
import sys
sys.path.append('/kaggle/input/next-orders')
import dataset_class, model, run

In [None]:
def ohe_data(raw: pl.DataFrame) -> pl.DataFrame:
    """
    Build a per-order one-hot matrix of cart categories.

    Args:
        raw (pl.DataFrame): Frame that contains at least the `user_id`,
            `order_completed_at` and `cart` columns.

    Returns:
        pl.DataFrame: One row per (`user_id`, `order_completed_at`) pair, sorted by
        those two columns, with one dummy column per distinct `cart` value.
    """
    order_keys = ['user_id', 'order_completed_at']

    # One-hot encode 'cart'. Taking the maximum inside each (user, order date)
    # group leaves a 1 for every category that appeared in that order.
    dummies = raw.to_dummies(columns='cart')
    per_order = dummies.group_by(order_keys).max()

    return per_order.sort(order_keys)
    
def sep_history(train_raw: pl.DataFrame) -> tuple[pl.DataFrame, pl.DataFrame]:
    """
    Split each user's order history into past orders and the most recent order.

    Args:
        train_raw (pl.DataFrame): One row per order with `user_id`, a string
            `order_completed_at` column and the one-hot category columns. The order
            number is assigned from row position within each user, so rows are
            expected to be sorted by user and order time.

    Returns:
        tuple[pl.DataFrame, pl.DataFrame]: `train_data` (every order except each
        user's last one, with `order_number` and the calendar features) and
        `valid_data` (each user's last order; calendar features dropped,
        `order_number` kept).
    """
    time_features = ['hour', 'week', 'weekday', 'day', 'month', 'year']

    # Parse the timestamp and derive the per-user order index plus calendar features.
    train_raw = train_raw.with_columns(pl.col("order_completed_at").str.to_datetime())
    train_raw = train_raw.with_columns(
        [
            # Intended as a zero-based order index per user
            # (assumes a polars version where cum_count starts at 1 - TODO confirm).
            (pl.col('user_id').cum_count() - pl.lit(1)).over(['user_id']).alias('order_number'),
            pl.col("order_completed_at").dt.hour().alias("hour"),
            pl.col("order_completed_at").dt.week().alias("week"),
            pl.col("order_completed_at").dt.weekday().alias("weekday"),
            pl.col("order_completed_at").dt.day().alias("day"),
            pl.col("order_completed_at").dt.month().alias("month"),
            pl.col("order_completed_at").dt.year().alias("year"),
        ]
    )
    train_raw = train_raw.drop('order_completed_at')

    # Flag each user's last order with a window expression evaluated on the frame
    # being filtered. A mask computed on a separate joined frame would only be
    # valid if the join happened to preserve the original row order.
    is_last_order = pl.col('order_number') == pl.col('order_number').max().over('user_id')

    # train_data: all orders but the last one; valid_data: the last order only.
    train_data = train_raw.filter(~is_last_order)
    valid_data = train_raw.filter(is_last_order).drop(*time_features)

    return train_data, valid_data

def create_dataset(train_data: pl.DataFrame, valid_data: pl.DataFrame)-> pd.DataFrame:
    """
    Combine the order history and the last order into one training table.

    Args:
        train_data (pl.DataFrame): Order history (one row per past order of a user)
            with `user_id`, `order_number`, the calendar columns and the one-hot
            category columns.
        valid_data (pl.DataFrame): Last order of every user with `user_id`,
            `order_number` and the one-hot category columns.

    Returns:
        pd.DataFrame: Features plus the `target` column, one row per
        (history order, category) pair. `id` is `"<user_id>;<category>"`, and the
        `cart_` prefix is stripped from `id` and `category`.
    """
    time_cols = ['hour', 'week', 'weekday', 'day', 'month', 'year']

    # Long format: one row per (order, category) for the history and one row per
    # (user, category) for the last order. Only the function arguments are read here.
    train_melt = train_data.drop('order_number').melt(
        id_vars=['user_id'] + time_cols, variable_name='category', value_name='ordered'
    )
    valid_melt = valid_data.drop('order_number').melt(
        id_vars=['user_id'], variable_name='category', value_name='target'
    )

    # `order_number` of the last order, used as the user's order count.
    order_number_df = valid_data.select(['user_id', 'order_number']).unique()
    # Total number of history orders per category; aliased up front so the join
    # does not depend on an automatically generated `_right` suffix.
    orders_per_category = train_melt.group_by('category').agg(
        pl.col('ordered').sum().alias('total_order_in_cat')
    )

    dataset = train_melt.join(order_number_df, on='user_id').rename({"order_number": "total_order_num"})
    dataset = dataset.join(orders_per_category, on='category')

    # Per-row rating and the submission-style id.
    dataset = dataset.with_columns(
        [
            (pl.col('ordered') / pl.col('total_order_num')).alias('total_rating'),
            (pl.col('user_id').cast(pl.Utf8) + ';' + pl.col('category')).alias('id')
        ]
    )

    # Mean rating (x100) per user at several time granularities. All aggregates are
    # computed from the same frame first and joined afterwards, in this order.
    rating_specs = [
        ('rating_per_month', ['year', 'month', 'user_id']),
        ('rating_per_weekday', ['year', 'month', 'weekday', 'user_id']),
        ('rating_per_day', ['year', 'month', 'day', 'user_id']),
        ('rating_per_hour', ['year', 'month', 'hour', 'user_id']),
    ]
    rating_frames = [
        (keys, dataset.group_by(keys).agg((pl.col("total_rating").mean() * 100).alias(alias)))
        for alias, keys in rating_specs
    ]
    for keys, rating_frame in rating_frames:
        dataset = dataset.join(rating_frame, on=keys)

    # Attach the target taken from the last order.
    dataset = dataset.join(valid_melt, on=['user_id', 'category'])

    # Convert to pandas and strip the one-hot prefix (literal match, not a regex).
    result = dataset.to_pandas()
    result['id'] = result['id'].str.replace('cart_', '', regex=False)
    result['category'] = result['category'].str.replace('cart_', '', regex=False)

    return result

In [None]:
# Temporary: load the competition files with pandas; 'cart' is cast to int on load.
raw = pd.read_csv('../input/sbermarket-internship-competition/train.csv').astype({'cart': int})
sub = pd.read_csv('../input/sbermarket-internship-competition/sample_submission.csv')

def filter_raw_data(raw, sub, min_rows=25):
    """
    Keep only users that have more than `min_rows` rows in the submission file.

    Args:
        raw (pd.DataFrame): Order data with a `user_id` column.
        sub (pd.DataFrame): Submission frame whose `id` values look like
            `"<user_id>;<category>"`.
        min_rows (int): A user is kept when the number of its submission rows is
            strictly greater than this value. Defaults to 25.

    Returns:
        tuple: `(filtered_raw, filtered_sub, proportion)` where `proportion` is the
        fraction (0..1) of submission rows that belong to the kept users; it is
        0.0 when `sub` has no rows.
    """
    # Local import: the notebook's import cells do not bring `Counter` into scope.
    from collections import Counter

    # Parse the user part of every id once and reuse it for counting and filtering.
    sub_users = [int(row_id.split(';')[0]) for row_id in sub['id']]
    user_counts = Counter(sub_users)
    frequent_users = {user for user, count in user_counts.items() if count > min_rows}

    filtered_raw = raw[raw['user_id'].isin(frequent_users)]
    # A positional boolean mask avoids any index alignment on `sub`.
    keep_mask = pd.Series(sub_users, index=sub.index, dtype='int64').isin(frequent_users).to_numpy()
    filtered_sub = sub[keep_mask]

    total_count = len(sub_users)
    frequent_count = sum(user_counts[user] for user in frequent_users)
    proportion = frequent_count / total_count if total_count else 0.0

    return filtered_raw, filtered_sub, proportion

filtered_raw, filtered_sub, proportion = filter_raw_data(raw, sub)
# `proportion` is a fraction between 0 and 1; the `%` format spec scales it to a percentage.
print(f"Процент наблюдений, используемый для тренировки: {proportion:.2%}")

In [None]:
%%time
# Convert to polars under a new name so `filtered_raw` stays a pandas frame
# and this cell can be re-run without re-running the filtering cell.
filtered_pl = pl.from_pandas(filtered_raw)
train_raw = ohe_data(filtered_pl)
train_data, val_data = sep_history(train_raw)
Train = create_dataset(train_data, val_data)

In [None]:
Train

In [None]:
!pip install lightautoml

In [None]:
# Notebook-wide configuration used by the split and AutoML cells.
N_THREADS = 8 # threads cnt for lgbm and linear models
N_FOLDS = 5 # folds cnt for AutoML
RANDOM_STATE = 42 # fixed random state for various reasons
TEST_SIZE = 0.2 # Test size for metric check
TARGET_NAME = 'target' # Target column name
TIMEOUT = 300 # passed as `timeout` to TabularAutoML; the run log reports it in seconds

In [None]:
# Hold out TEST_SIZE of the rows for a metric check; the split is not stratified.
Train_set, Valid_set = train_test_split(
    Train,
    test_size=TEST_SIZE,
    random_state=23,
)

In [None]:
from tqdm import tqdm as notebook_tqdm
from lightautoml.automl.base import AutoML
from lightautoml.ml_algo.boost_lgbm import BoostLGBM
from lightautoml.ml_algo.tuning.optuna import OptunaTuner
from lightautoml.pipelines.features.lgb_pipeline import LGBSimpleFeatures
from lightautoml.pipelines.ml.base import MLPipeline
from lightautoml.pipelines.selection.importance_based import ImportanceCutoffSelector, ModelBasedImportanceEstimator
from lightautoml.reader.base import PandasToPandasReader
from lightautoml.tasks import Task
from lightautoml.automl.blend import WeightedBlender

In [None]:
%%time
# Hand-built two-level LightAutoML pipeline.
# NOTE(review): in the visible notebook `pipeline_lvl1` / `pipeline_lvl2` are only
# referenced from a commented-out AutoML(...) call - confirm whether this cell is still needed.

# Feature selection: a LightGBM model and its feature pipeline feed an importance-based selector.
model0 = BoostLGBM(
    default_params={'learning_rate': 0.05, 'num_leaves': 64, 'seed': 42, 'num_threads': N_THREADS}
)
pipe0 = LGBSimpleFeatures()
mbie = ModelBasedImportanceEstimator()
selector = ImportanceCutoffSelector(pipe0, model0, mbie, cutoff=0)

# Level 1: two LightGBM models sharing one feature pipeline; only model1 is paired with a tuner.
pipe = LGBSimpleFeatures()

params_tuner1 = OptunaTuner(n_trials=20, timeout=30) # stop after 20 iterations or after 30 seconds
model1 = BoostLGBM(
    default_params={'learning_rate': 0.05, 'num_leaves': 128, 'seed': 1, 'num_threads': N_THREADS}
)
model2 = BoostLGBM(
    default_params={'learning_rate': 0.025, 'num_leaves': 64, 'seed': 2, 'num_threads': N_THREADS}
)

pipeline_lvl1 = MLPipeline([
    (model1, params_tuner1),
    model2
], pre_selection=selector, features_pipeline=pipe, post_selection=None)

# Level 2: a single LightGBM model with its own feature pipeline and no selection step.
pipe1 = LGBSimpleFeatures()

# NOTE(review): `model` rebinds the name of the `model` module imported earlier
# in the notebook (`import dataset_class, model, run`).
model = BoostLGBM(
    default_params={'learning_rate': 0.05, 'num_leaves': 64, 'max_bin': 1024, 'seed': 3, 'num_threads': N_THREADS},
    freeze_defaults=True
)

pipeline_lvl2 = MLPipeline([model], pre_selection=None, features_pipeline=pipe1, post_selection=None)

CPU times: user 71 µs, sys: 20 µs, total: 91 µs
Wall time: 104 µs


In [None]:
import optuna
import requests
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import torch
from copy import deepcopy as copy
import torch.nn as nn
from collections import OrderedDict

# LightAutoML presets, task and report generation
from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task

In [61]:
%%time 
def f1(real, pred, **kwargs):
    """Return the F1 score of `pred` binarised at a 0.5 threshold against `real`.

    Extra keyword arguments are forwarded unchanged to `f1_score`.
    """
    hard_labels = (pred > 0.5).astype(int)
    return f1_score(real, hard_labels, **kwargs)

# Column roles: the target column, plus identifier-like columns that are dropped before training.
ROLES = {'target': TARGET_NAME, 'drop': ['id', 'user_id', 'category'], 'category': []}
# Binary task scored with the thresholded F1 helper defined above.
TASK = Task('binary', metric = f1)
# NOTE(review): `reader` is only referenced by the commented-out AutoML(...) call below.
reader = PandasToPandasReader(TASK, cv=N_FOLDS, random_state=RANDOM_STATE)

# Earlier variants kept for reference (custom two-level AutoML and a linear-only preset).
# automl = AutoML(reader, [
#     [pipeline_lvl1],
#     [pipeline_lvl2],
# ], skip_conn=False)

# automl = TabularAutoML(task = task, 
#                        timeout = 300,
#                        cpu_limit = 4,
#                        reader_params = {'n_jobs': 4, 'cv': 5, 'random_state': 23},
#                        general_params = {'use_algos': [['linear_l2']]},
#                       )
# Shared preset arguments, all taken from the configuration constants.
default_lama_params = {
    "task": TASK,
    "timeout": TIMEOUT,
    "cpu_limit": N_THREADS,
    "reader_params": {'n_jobs': N_THREADS, 'cv': N_FOLDS, 'random_state': RANDOM_STATE}
}
# Base settings for the neural-network models; "n_epochs" is overridden per model below.
default_nn_params = {
    "bs": 512, "num_workers": 8, "path_to_save": None, "n_epochs": 10, "freeze_defaults": True
}
# Single-level preset that requests the "lgb", "mlp" and "dense" algorithms.
automl = TabularAutoML(
    **default_lama_params,
    general_params = {"use_algos": [["lgb", "mlp", "dense"]]},
    nn_params = {"0": {**default_nn_params, "n_epochs": 2},
                 "1": {**default_nn_params, "n_epochs": 5}},
)

# Fit on the training split and score the predictions returned for that same split.
train_pred = automl.fit_predict(Train_set, roles = ROLES, verbose = 1)
print('Score', "%.5f" % f1(Train_set.target, train_pred.data))

# NOTE(review): Valid_set comes from train_test_split, so this looks like a hold-out
# score even though the label says "out of folds" - confirm the wording.
valid_pred = automl.predict(Valid_set)
print('Score on out of folds validation', "%.5f" % f1(Valid_set.target, valid_pred.data))

[08:53:45] Stdout logging level is INFO.
[08:53:45] Task: binary

[08:53:45] Start automl preset with listed constraints:
[08:53:45] - time: 300.00 seconds
[08:53:45] - CPU: 8 cores
[08:53:45] - memory: 16 GB

[08:53:45] [1mTrain data shape: (96959060, 18)[0m

[08:54:59] Layer [1m1[0m train process start. Time left 226.05 secs


KeyboardInterrupt: 

In [108]:
# Scan thresholds 0.01..0.99 and keep the one with the best F1 on the hold-out split.
best_score = 0
proba_split = 0.5  # fallback when no threshold scores above 0 (0.5 is the cut-off used by `f1`)
for threshold in np.arange(0.01, 1.0, 0.01):
    # Assign to `score` only, so the `f1` metric function used by the AutoML
    # task is not overwritten with a float.
    score = f1_score(Valid_set.target, (valid_pred.data > threshold).astype(int))
    if score > best_score:
        best_score = score
        proba_split = threshold

print('At i =', "%.2f" % proba_split,'score is : ' "%.5f" % best_score)

At i = 0.12 score is : 0.40421


In [49]:
Train

Unnamed: 0,user_id,category,ordered,order_number,rating,id,target,total_ordered
0,0,0,0,2,0.0,0;0,0,12922
1,1,0,0,8,0.0,1;0,0,12922
2,3,0,0,6,0.0,3;0,0,12922
3,4,0,0,7,0.0,4;0,0,12922
4,5,0,0,14,0.0,5;0,0,12922
...,...,...,...,...,...,...,...,...
11184883,19994,99,0,4,0.0,19994;99,0,10373
11184884,19995,99,0,2,0.0,19995;99,0,10373
11184885,19996,99,0,2,0.0,19996;99,0,10373
11184886,19997,99,1,2,0.5,19997;99,0,10373


In [65]:
# Build the scoring frame by folding the known last order (`target`) into the history:
# one more order per user, updated per-category totals and a recomputed rating.
# NOTE(review): this cell reads `order_number`, `rating` and `total_ordered`, while
# create_dataset in this notebook emits `total_order_num`, `total_rating` and
# `total_order_in_cat` - it looks written for an earlier Train schema; confirm before re-running.
Test = Train.copy()
Test['order_number'] += 1 
Test['ordered'] = Test['ordered'] + Test['target']
# Per-category sum of the updated `ordered` values, mapped back onto every row.
test_total_ordered = Test.groupby('category')['ordered'].sum()
Test['total_ordered'] = Test['category'].map(test_total_ordered)
Test['rating'] = Test['ordered'] / Test['order_number']
# The target is not a feature at prediction time.
Test = Test.drop('target', axis=1)
Test.head(3)

Unnamed: 0,user_id,category,ordered,order_number,rating,id,total_ordered
0,0,0,0,3,0.0,0;0,18387
1,1,0,0,9,0.0,1;0,18387
2,2,0,1,15,0.066667,2;0,18387


In [66]:
# Score the scoring frame and compare the share of positives with the training target.
predictions = automl.predict(Test)
train_positive_rate = Train.target.mean()
test_positive_rate = (predictions.data > 0.5).astype(int).mean()
print('Train target mean:', "%.5f" % train_positive_rate)
print('Test target mean:', "%.5f" % test_positive_rate)

Train target mean: 0.01595
Test target mean: 0.00681


In [67]:
# Lower the decision threshold in 0.005 steps until the predicted positive rate
# reaches the positive rate of the training target.
th = 0.5
train_mean = Train.target.mean()
test_mean = (predictions.data > th).astype(int).mean()

# The `th > 0` bound guarantees termination even when the comparison can never
# become true (for example if the predictions contain only NaN).
while test_mean < train_mean and th > 0:
    th -= 0.005
    test_mean = (predictions.data > th).astype(int).mean()
    
print('Threshold:', "%.4f" % th)
print('Train mean:', "%.5f" % train_mean)
print('New Test mean:', "%.5f" % test_mean)

Threshold: 0.1600
Train mean: 0.01595
New Test mean: 0.01605


In [95]:
Test['target'] = (predictions.data > th).astype(int)
# Left-join on the sample submission so every requested id is kept;
# ids without a prediction default to 0 instead of being dropped.
submit = pd.merge(sub['id'], Test[['id', 'target']], on='id', how='left')
submit['target'] = submit['target'].fillna(0).astype(int)

In [94]:
import csv

# Write the submission with pandas: a header row followed by the data rows;
# index=False keeps the row index out of the file.
submit.to_csv('submission.csv', index=False)

In [91]:
submit

Unnamed: 0,id,target
0,0;133,0
1,0;5,0
2,0;10,0
3,0;396,0
4,0;14,1
...,...,...
790444,19998;26,0
790445,19998;31,0
790446,19998;29,0
790447,19998;798,0
