In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the raw train and test tables from the working directory.
train_csv = 'train_dataset_train (1).csv'
test_csv = 'test_dataset_test (1).csv'

df = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

# Transposed preview: one column per sample row, which is easier to read for a wide frame.
df.head().T

Unnamed: 0,0,1,2,3,4
id,1,2,3,4,5
ticket_id,40BD89EC85646EFB69E283F39C298E60,126727A96489CC976A8C08E5CEB00542,D28CE6A9E0E5B6D213470A97CFF32485,015DA44B523C062B5BFEFF3FB0E64B9E,95B19C6F3A504727AC3EA56EB7E3E80F
ticket_type_nm,Пропуск FacePay,СК учащегося 30 дней,БСК дружинника г.Москвы,30 дней,КОШЕЛЕК
entrance_id,2402,110,110,110,110
entrance_nm,Лефортово БКЛ,Войковская ( Южный ),Войковская ( Южный ),Войковская ( Южный ),Войковская ( Южный )
station_id,11007,2006,2006,2006,2006
station_nm,Лефортово,Войковская,Войковская,Войковская,Войковская
line_id,11,2,2,2,2
line_nm,Большая кольцевая,Замоскворецкая,Замоскворецкая,Замоскворецкая,Замоскворецкая
pass_dttm,2022-09-12 05:00:13,2022-09-12 05:00:54,2022-09-12 05:00:55,2022-09-12 05:01:13,2022-09-12 05:02:55


## Предобработка

In [4]:
# Parse the pass timestamp once and derive the calendar features from it.
pass_time = pd.to_datetime(df['pass_dttm'])
df['pass_dttm'] = pass_time
df['pass_hour'] = pass_time.dt.hour
df['pass_weekday'] = pass_time.dt.weekday
df['pass_minute'] = pass_time.dt.minute

# Identifier-like columns are stored as pandas categoricals.
for column in ('ticket_type_nm', 'line_id', 'station_id', 'entrance_id'):
    df[column] = df[column].astype('category')

# For every encoded column keep the {original value -> integer code} mapping;
# codes follow the order of first appearance in the training frame. The same
# mappings are reused later to encode the test frame and to decode predictions.
label_encoders = {}

for column in ('ticket_type_nm', 'station_id', 'line_id', 'entrance_id', 'label'):
    distinct_values = df[column].unique()
    codes = dict(zip(distinct_values, range(len(distinct_values))))
    label_encoders[column] = codes
    # Every training value is a key of `codes`, so the 0 fallback is not hit here.
    df[column] = df[column].apply(lambda value: codes.get(value, 0))

# The target is handed to the multiclass objective as 32-bit integer class ids.
df['label'] = df['label'].astype(np.int32)

In [17]:
%%time

from sklearn.model_selection import KFold, StratifiedKFold # k-фолдная валидация
from lightgbm import LGBMClassifier ,LGBMRegressor
from lightgbm import Dataset, train
import warnings

warnings.filterwarnings('ignore')
import gc 

# ---------------------------------------------------------------------------
# K-fold training of two LightGBM models per fold:
#   * a multiclass booster for the encoded `label` column   -> appended to `clfs`
#   * an L2 regression booster for `time_to_under`          -> appended to `regr`
# Both lists are consumed by the prediction cell further down.
# ---------------------------------------------------------------------------
random_state = 42
n_splits = 5 # kfolds
print('num folds', n_splits)
clfs = []
regr = []
targets = 'label'

# Per-fold base number of boosting rounds (a constant offset is added below).
n_estimators_clf = [10, 15, 20, 20, 20, 20, 20, 20, 20, 20] # best!!
n_estimators_reg = [100, 300, 400, 300, 400, 300, 300, 300, 400, 500] # best!!

# Fail early with a clear message instead of an IndexError in the middle of training.
assert n_splits <= min(len(n_estimators_clf), len(n_estimators_reg)), \
    'n_estimators_clf / n_estimators_reg need one entry per fold'

kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

feature_cols = ['ticket_type_nm','line_id','station_id','entrance_id','pass_hour','pass_minute','pass_weekday']
cat_features = ['ticket_type_nm','line_id','station_id','entrance_id']

# `n_examples` can be lowered for a quick smoke run; all three frames are
# sliced the same way so they always stay aligned.
n_examples = df.shape[0]
X = df[feature_cols][:n_examples]
y = df[targets][:n_examples]
y_reg = df['time_to_under'][:n_examples]

# Loop-invariant: number of classes for the multiclass objective.
num_class = df['label'].nunique()

# NOTE(review): per the LightGBM parameter docs, `uniform_drop` / `drop_seed`
# are only used by the dart booster, and `bagging_freq` only takes effect
# together with `bagging_fraction` < 1.0, so with 'boosting': 'gbdt' and no
# `bagging_fraction` these keys look inert. They are kept unchanged on purpose.
# NOTE(review): `verbose_eval` was removed from `lightgbm.train` in LightGBM 4.0;
# when upgrading, switch to `callbacks=[lightgbm.log_evaluation(period)]`.

for i, (train_index, test_index) in enumerate(kf.split(X)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    y_train_reg, y_test_reg = y_reg.iloc[train_index], y_reg.iloc[test_index]

    print(f'fold {i}', X_train.shape, y_train.shape)

    # ---- classifier for the encoded `label` --------------------------------
    clf_params = {
         'learning_rate' : 0.1*0.9, 'n_estimators' : n_estimators_clf[i] +10,
        'objective': 'multiclass', 'num_class' : num_class,

        'uniform_drop' : True,  'boosting': 'gbdt',   # goss
        'lambda_l2' : 1/100, 'feature_fraction': 0.65, # 0.55
        'bagging_freq': 50, 'min_split_gain': 1/1000,
        'max_bin' : 260,
        'random_seed' : 42, 'drop_seed' : 7575,
        'verbose': -1,
        'nthreads':1
    }

    train_dataset = Dataset(data = X_train, label = y_train, categorical_feature=cat_features)
    eval_dataset = Dataset(data = X_test, label = y_test, categorical_feature=cat_features)

    clf = train(params = clf_params, train_set = train_dataset,
               verbose_eval = 1, valid_sets=eval_dataset)
    clfs.append(clf)

    # ---- regressor for `time_to_under` -------------------------------------
    reg_params = {
        # Fix: the original read `n_estimators[i]`, a name that is never defined
        # in this notebook (NameError on a fresh kernel); the per-fold list for
        # the regressor is `n_estimators_reg`.
         'learning_rate' : 0.1*0.9, 'n_estimators' : n_estimators_reg[i] + 1000,
        'objective': 'l2',

        'uniform_drop' : True,  'boosting': 'gbdt',   # goss
        'lambda_l2' : 1/100, 'feature_fraction': 0.63, # 0.55
        'bagging_freq': 125, 'min_split_gain': 1/1000,

        'max_bin' : 260,
        'random_seed' : 42, 'drop_seed' : 7575,
        'verbose': -1,
    }

    train_dataset = Dataset(data = X_train, label = y_train_reg, categorical_feature=cat_features)
    eval_dataset = Dataset(data = X_test, label = y_test_reg, categorical_feature=cat_features)
    reg = train(params = reg_params, train_set = train_dataset,
               verbose_eval = 100, valid_sets=eval_dataset)

    regr.append(reg)


num folds 5
fold 0 (872816, 7) (872816,)
[1]	valid_0's multi_logloss: 5.19717
[2]	valid_0's multi_logloss: 5.13386
[3]	valid_0's multi_logloss: 5.08147
[4]	valid_0's multi_logloss: 5.03871
[5]	valid_0's multi_logloss: 5.00383
[6]	valid_0's multi_logloss: 4.97205
[7]	valid_0's multi_logloss: 4.94397
[8]	valid_0's multi_logloss: 4.91895
[9]	valid_0's multi_logloss: 4.90268
[10]	valid_0's multi_logloss: 4.87756
[11]	valid_0's multi_logloss: 4.85761
[12]	valid_0's multi_logloss: 4.84091
[13]	valid_0's multi_logloss: 4.82513
[14]	valid_0's multi_logloss: 4.81034
[15]	valid_0's multi_logloss: 4.79668
[16]	valid_0's multi_logloss: 4.78407
[17]	valid_0's multi_logloss: 4.77246
[18]	valid_0's multi_logloss: 4.76139
[19]	valid_0's multi_logloss: 4.75196
[20]	valid_0's multi_logloss: 4.74159
[100]	valid_0's l2: 25109.9
[200]	valid_0's l2: 25032.9
[300]	valid_0's l2: 25014.7
[400]	valid_0's l2: 25003.1
[500]	valid_0's l2: 24990.4
[600]	valid_0's l2: 24983.6
[700]	valid_0's l2: 24976.1
[800]	valid_

## Предсказания

In [20]:
# Reload the test table so this cell can be re-run on its own, then apply the
# same feature engineering that was applied to the training frame.
test = pd.read_csv('test_dataset_test (1).csv')

test_pass_time = pd.to_datetime(test['pass_dttm'])
test['pass_dttm'] = test_pass_time
test['pass_hour'] = test_pass_time.dt.hour
test['pass_weekday'] = test_pass_time.dt.weekday
test['pass_minute'] = test_pass_time.dt.minute

for column in ('ticket_type_nm', 'line_id', 'station_id', 'entrance_id'):
    test[column] = test[column].astype('category')

# Encode with the mappings fitted on the training frame.
# NOTE(review): a value that never occurred in training falls back to code 0,
# which is also the code of the first training value - confirm this is intended.
for column in ('ticket_type_nm', 'station_id', 'line_id', 'entrance_id'):
    train_codes = label_encoders[column]
    encoded = test[column].apply(lambda value: train_codes.get(value, 0))
    test[column] = encoded.astype('category')
    

In [23]:

from collections import Counter
from tqdm import tqdm

# ---------------------------------------------------------------------------
# Ensemble prediction over the fold models.
#   label          -> majority vote of the per-fold argmax classes
#   time_to_under  -> mean of the per-fold regression outputs
#
# The vote and the mean are computed with NumPy on whole chunks instead of a
# row-wise `DataFrame.apply(axis=1)`, and chunks are taken by position instead
# of `np.array_split` on a DataFrame, so no new columns are written into
# slices of `test`.
# ---------------------------------------------------------------------------
feature_cols = ['ticket_type_nm','line_id','station_id','entrance_id','pass_hour','pass_minute','pass_weekday']
n_chunks = 1000  # predictions are made chunk by chunk rather than on the whole frame at once

# Inverse of the training-time label encoding: integer code -> original label.
label_decoder = {code: label for label, code in label_encoders['label'].items()}

features = test[feature_cols]
n_rows = len(features)

vote_codes = np.empty(n_rows, dtype=np.int64)   # winning encoded class per row
mean_times = np.empty(n_rows, dtype=np.float64) # averaged regression output per row

for rows in tqdm(np.array_split(np.arange(n_rows), n_chunks)):
    if rows.size == 0:
        # Happens when the frame has fewer rows than `n_chunks`.
        continue
    start, stop = rows[0], rows[-1] + 1
    chunk = features.iloc[start:stop]

    # fold_codes[r, j] = class predicted for row r by fold model j.
    fold_codes = np.column_stack([clf.predict(chunk).argmax(axis=1) for clf in clfs])

    # support[r, j] = number of fold models that agree with model j on row r.
    support = (fold_codes[:, :, None] == fold_codes[:, None, :]).sum(axis=2)
    # argmax returns the FIRST position with maximal support, i.e. among tied
    # classes the one voted for by the earliest fold - the same tie-break as
    # Counter(...).most_common()[0][0] (equal counts keep first-seen order).
    winner = support.argmax(axis=1)
    vote_codes[start:stop] = fold_codes[np.arange(fold_codes.shape[0]), winner]

    # Mean over the fold regressors (same value as the row-wise np.mean, up to
    # floating-point rounding).
    fold_times = np.column_stack([reg.predict(chunk) for reg in regr])
    mean_times[start:stop] = fold_times.mean(axis=1)

# Decode class codes back to the original label values.
labels = [label_decoder[code] for code in vote_codes.tolist()]
times = mean_times.tolist()

test['label'] = labels
test['time_to_under'] = times



100%|██████████| 1000/1000 [1:07:18<00:00,  4.04s/it]


In [24]:
# Write the submission file: one row per test id with both predicted targets,
# without the DataFrame index.
submission = test[['id','time_to_under','label']]
submission.to_csv('preds11.csv', index=None, sep=',')