In [1]:
!pip install holidays_ru prophet seaborn

Collecting holidays_ru
  Downloading holidays_ru-0.1-py3-none-any.whl (4.1 kB)
Collecting prophet
  Downloading prophet-1.1.5-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.5 kB)
Collecting cmdstanpy>=1.0.4 (from prophet)
  Downloading cmdstanpy-1.2.0-py3-none-any.whl.metadata (3.9 kB)
Collecting stanio~=0.3.0 (from cmdstanpy>=1.0.4->prophet)
  Downloading stanio-0.3.0-py3-none-any.whl.metadata (963 bytes)
Downloading prophet-1.1.5-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.4 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.4/14.4 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m:01[0m
[?25hDownloading cmdstanpy-1.2.0-py3-none-any.whl (93 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.0/93.0 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading stanio-0.3.0-py3-none-any.whl (6.2 kB)
Installing collected packages: stanio, holidays_ru, cmdstanpy,

In [7]:
import holidays_ru
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import datetime

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error as MAPE
from sklearn.metrics import mean_absolute_error as MAE

from catboost import CatBoostRegressor, Pool
from catboost import EShapCalcType, EFeaturesSelectionAlgorithm
import prophet

import optuna

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

## Train data reading and features renaming

In [24]:
train = pd.read_csv('./train.csv', delimiter=';')
train = train.set_axis(['Target', 'Date', 
                        'Break_id', 'Break_start', 'Break_end', 'Break_content', 'Break_distribution',
                        'Programme_name', 'Programme_start', 'Programme_end', 'Programme_category', 
                        'Programme_genre'], axis=1)
train = train.convert_dtypes()
train.head()

Unnamed: 0,Target,Date,Break_id,Break_start,Break_end,Break_content,Break_distribution,Programme_name,Programme_start,Programme_end,Programme_category,Programme_genre
0,614692654,02.01.2023,4870830561,8:17:33,8:21:40,Commercial,Network,"Telekanal ""Dobroe utro""",8:00:13,10:00:14,Morning airplay,Entertainment programs
1,869565217,02.01.2023,4870830614,8:34:45,8:38:52,Commercial,Network,"Telekanal ""Dobroe utro""",8:00:13,10:00:14,Morning airplay,Entertainment programs
2,989505247,02.01.2023,4870830629,8:52:19,8:56:23,Commercial,Network,"Telekanal ""Dobroe utro""",8:00:13,10:00:14,Morning airplay,Entertainment programs
3,884557721,02.01.2023,4870830684,8:56:31,8:57:28,Announcement,Network,"Telekanal ""Dobroe utro""",8:00:13,10:00:14,Morning airplay,Entertainment programs
4,83958021,02.01.2023,4870830685,9:12:04,9:16:13,Commercial,Network,"Telekanal ""Dobroe utro""",8:00:13,10:00:14,Morning airplay,Entertainment programs


In [25]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30682 entries, 0 to 30681
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Target              30682 non-null  string
 1   Date                30682 non-null  string
 2   Break_id            30682 non-null  Int64 
 3   Break_start         30682 non-null  string
 4   Break_end           30682 non-null  string
 5   Break_content       30682 non-null  string
 6   Break_distribution  30682 non-null  string
 7   Programme_name      30682 non-null  string
 8   Programme_start     30682 non-null  string
 9   Programme_end       30682 non-null  string
 10  Programme_category  30682 non-null  string
 11  Programme_genre     30682 non-null  string
dtypes: Int64(1), string(11)
memory usage: 2.8 MB


## Preprocessing and feature engineering

In [26]:
def time_preproc(df, is_test=False):
    if not is_test:
        train['Target'] = train['Target'].apply(lambda x: float(x.replace(',', '.')))

    df['Date'] = pd.to_datetime(df['Date'], format='%d.%m.%Y')

    df['Break_start_hour'] = df['Break_start'].apply(lambda x: int(x.split(':')[0]))
    df['Break_start_min'] = df['Break_start'].apply(lambda x: int(x.split(':')[1]))
    df['Break_start_sec'] = df['Break_start'].apply(lambda x: int(x.split(':')[2]))
    df['Break_start_abs'] = 60 * df['Break_start_hour'] + df['Break_start_min']

    df['Break_end_hour'] = df['Break_end'].apply(lambda x: int(x.split(':')[0]))
    df['Break_end_min'] = df['Break_end'].apply(lambda x: int(x.split(':')[1]))
    df['Break_end_sec'] = df['Break_end'].apply(lambda x: int(x.split(':')[2]))
    df['Break_end_abs'] = 60 * df['Break_end_hour'] + df['Break_end_min']

    df['Programme_start_hour'] = df['Programme_start'].apply(lambda x: int(x.split(':')[0]))
    df['Programme_start_min'] = df['Programme_start'].apply(lambda x: int(x.split(':')[1]))
    df['Programme_start_sec'] = df['Programme_start'].apply(lambda x: int(x.split(':')[2]))
    df['Programme_start_abs'] = 60 * df['Programme_start_hour'] + df['Programme_start_min']

    df['Programme_end_hour'] = df['Programme_end'].apply(lambda x: int(x.split(':')[0]))
    df['Programme_end_min'] = df['Programme_end'].apply(lambda x: int(x.split(':')[1]))
    df['Programme_end_sec'] = df['Programme_end'].apply(lambda x: int(x.split(':')[2]))
    df['Programme_end_abs'] = 60 * df['Programme_end_hour'] + df['Programme_end_min']

    df['Ad_block_frequency'] = df.groupby('Programme_name')['Break_start'].rolling('1H').count().values


    df = df.drop(columns=['Break_start', 'Break_end', 'Programme_start', 'Programme_end'])
    return df

In [27]:
train = time_preproc(train)

ValueError: window must be an integer 0 or greater

In [23]:
train

Unnamed: 0,Target,Date,Break_id,Break_content,Break_distribution,Programme_name,Programme_category,Programme_genre,Break_start_hour,Break_start_min,...,Programme_end_abs,Weekday,Weekend,Holiday,Pre_holiday,Break_duration,Programme_duration,From_progamme_start,Until_progamme_end,Break_of_programme_fraction
0,0.614693,2023-01-02,4870830561,Commercial,Network,"Telekanal ""Dobroe utro""",Morning airplay,Entertainment programs,8,17,...,600,0,False,True,False,247,7201,1287,6161,0.178725
1,0.869565,2023-01-02,4870830614,Commercial,Network,"Telekanal ""Dobroe utro""",Morning airplay,Entertainment programs,8,34,...,600,0,False,True,False,247,7201,2319,5129,0.322039
2,0.989505,2023-01-02,4870830629,Commercial,Network,"Telekanal ""Dobroe utro""",Morning airplay,Entertainment programs,8,52,...,600,0,False,True,False,244,7201,3370,4075,0.467991
3,0.884558,2023-01-02,4870830684,Announcement,Network,"Telekanal ""Dobroe utro""",Morning airplay,Entertainment programs,8,56,...,600,0,False,True,False,57,7201,3435,3823,0.477017
4,0.839580,2023-01-02,4870830685,Commercial,Network,"Telekanal ""Dobroe utro""",Morning airplay,Entertainment programs,9,12,...,600,0,False,True,False,249,7201,4560,2890,0.633245
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30677,0.029985,2023-10-31,5335333196,Announcement,Network,Podkast.Lab,Entertainment talk show,Entertainment programs,27,43,...,1692,1,False,False,False,27,2679,978,1728,0.365062
30678,0.029985,2023-10-31,5335333197,Commercial,Network,Podkast.Lab,Entertainment talk show,Entertainment programs,27,44,...,1692,1,False,False,False,246,2679,1224,1701,0.456887
30679,0.014993,2023-10-31,5335333212,Commercial,Network,Podkast.Lab,Entertainment talk show,Entertainment programs,28,2,...,1692,1,False,False,False,246,2679,2307,618,0.861142
30680,0.029985,2023-10-31,5335333228,Commercial,Network,Podkast.Lab,Entertainment talk show,Entertainment programs,28,26,...,1736,1,False,False,False,245,2646,1077,1814,0.407029


### Basic features

In [21]:
def gen_base_features(df, is_test=False):
    df['Weekday'] = df['Date'].apply(lambda x: x.weekday())
    df['Weekend'] = df['Weekday'] > 4
    df['Holiday'] = df['Date'].apply(lambda x: holidays_ru.check_holiday(x, False))
    df['Pre_holiday'] = df['Date'].apply(lambda x: (not holidays_ru.check_holiday(x, False)) and holidays_ru.check_holiday(x + datetime.timedelta(days=1), False))
    
    df['Break_duration'] = (3600 * df['Break_end_hour'] + 60 * df['Break_end_min'] + df['Break_end_sec']) - \
                          (3600 * df['Break_start_hour'] + 60 * df['Break_start_min'] + df['Break_start_sec'])
                          
    df['Programme_duration'] = (3600 * df['Programme_end_hour'] + 60 * df['Programme_end_min'] + df['Programme_end_sec']) - \
                                  (3600 * df['Programme_start_hour'] + 60 * df['Programme_start_min'] + df['Programme_start_sec'])
    df['From_progamme_start'] = (3600 * df['Break_end_hour'] + 60 * df['Break_end_min'] + df['Break_end_sec']) - \
                                (3600 * df['Programme_start_hour'] + 60 * df['Programme_start_min'] + df['Programme_start_sec'])
    df['Until_progamme_end'] = (3600 * df['Programme_end_hour'] + 60 * df['Programme_end_min'] + df['Programme_end_sec']) - \
                               (3600 * df['Break_start_hour'] + 60 * df['Break_start_min'] + df['Break_start_sec'])
                               
    df['Break_of_programme_fraction'] = df['From_progamme_start'] / df['Programme_duration']
    
    
    return df

In [22]:
train = gen_base_features(train)
train.shape

  if d in MOVED_HOLIDAYS:
  if d in WORK_WEEKENDS:
  if d in MOVED_HOLIDAYS:
  if d in WORK_WEEKENDS:


KeyError: 'Column not found: Break_start'

### TV viewing feature

In [12]:
views = pd.read_csv('./views.csv', delimiter=';')
views = views.replace(np.nan, '0,0')
views['2021'] = views['2021'].apply(lambda x: float(x.replace(',', '.')))
views['2022'] = views['2022'].apply(lambda x: float(x.replace(',', '.')))
views['2023'] = views['2023'].apply(lambda x: float(x.replace(',', '.')))

#Prophet
y = np.hstack([views['2021'].values, views['2022'].values, views['2023'].iloc[:-2].values])
ds = [datetime.date(2021, i, 1) for i in range(1, 13)] + \
     [datetime.date(2022, i, 1) for i in range(1, 13)] + \
     [datetime.date(2023, i, 1) for i in range(1, 11)]
views_prop = pd.DataFrame({'ds': ds,
                           'y': y})
model_prophet = prophet.Prophet(growth='linear', yearly_seasonality=1)
model_prophet.fit(views_prop)
future = model_prophet.make_future_dataframe(periods=1, freq='m')
forecast = model_prophet.predict(future)
views_2023 = views['2023'].iloc[:10].tolist() + [forecast['yhat'].iloc[-1]] 
views_by_month_Prop = {month: views_2023[month-1] for month in range(1,12)}

01:53:07 - cmdstanpy - INFO - Chain [1] start processing
01:53:07 - cmdstanpy - INFO - Chain [1] done processing


### Aggregate statistics by Programme_genre

In [13]:
genre_stats = train.groupby('Programme_genre')['Target'].agg(['mean', 'std', 'min', 'max', 'median']).reset_index()
genre_stats.rename(columns={
    'mean': 'genre_mean_rating', 
    'std': 'genre_std_rating', 
    'min': 'genre_min_rating', 
    'max': 'genre_max_rating', 
    'median': 'genre_median_rating', 
}, inplace=True)
genre_stats

Unnamed: 0,Programme_genre,genre_mean_rating,genre_std_rating,genre_min_rating,genre_max_rating,genre_median_rating
0,Educational program,0.507373,0.225188,0.089955,2.788606,0.47976
1,Entertainment programs,1.093739,0.699036,0.0,4.587706,1.094453
2,Musical programs,0.24027,0.091633,0.044978,0.509745,0.23988
3,News,2.189251,0.782455,0.5997,5.052474,2.098951
4,Social-political programs,0.935144,0.377642,0.089955,3.793103,0.869565


### Aggregate statistics by Programme_category

In [14]:
category_stats = train.groupby('Programme_category')['Target'].agg(['mean', 'std', 'min', 'max', 'median']).reset_index()
category_stats.rename(columns={
    'mean': 'category_mean_rating', 
    'std': 'category_std_rating', 
    'min': 'category_min_rating', 
    'max': 'category_max_rating', 
    'median': 'category_median_rating', 

}, inplace=True)
category_stats

Unnamed: 0,Programme_category,category_mean_rating,category_std_rating,category_min_rating,category_max_rating,category_median_rating
0,Comic program,0.985206,0.319753,0.29985,2.293853,0.944528
1,Daily news,2.189251,0.782455,0.5997,5.052474,2.098951
2,Entertainment talk show,0.812077,0.733601,0.0,2.863568,0.65967
3,Folk music and dances,0.24027,0.091633,0.044978,0.509745,0.23988
4,Geography and countries,0.825791,0.347822,0.224888,2.788606,0.794603
5,Law and justice,1.854707,0.603826,0.569715,3.793103,1.7991
6,"Mixed concert, show, circus performance",1.481145,0.446519,0.569715,3.013493,1.454273
7,Morning airplay,1.112456,0.41889,0.05997,2.278861,1.13943
8,"Program for women, Cookery show",1.04575,0.301341,0.35982,1.829085,1.049475
9,Program on fashion and design,0.782656,0.261707,0.344828,1.754123,0.734633


### Per program features

In [15]:
# Number of ad blocks per program
ad_blocks_per_program = train.groupby('Programme_name')['Break_id'].nunique().reset_index(name='total_ad_blocks_per_program')
# Average duration of ad blocks per program (assuming 'Break_duration' is already calculated in minutes)
avg_ad_duration_per_program = train.groupby('Programme_name')['Break_duration'].mean().reset_index(name='avg_ad_block_duration')

### Average target rating by day of the week

In [16]:
avg_rating_by_day_of_week = train.groupby('Weekday')['Target'].mean().reset_index(name='avg_rating_by_day_of_week')
avg_rating_by_day_of_week

Unnamed: 0,Weekday,avg_rating_by_day_of_week
0,0,0.988667
1,1,0.99582
2,2,1.007417
3,3,0.976581
4,4,1.253812
5,5,0.760885
6,6,0.787756


### Timeslot rating

In [17]:
rating_timeslot = train.groupby('Break_start_hour')['Target'].mean().reset_index(name='avg_rating_by_timeslot')
peak_time = rating_timeslot.loc[rating_timeslot['avg_rating_by_timeslot'].idxmax()]
peak_time

Break_start_hour          21.000000
avg_rating_by_timeslot     2.647665
Name: 16, dtype: float64

### Some extra features

In [18]:
extra_feats = pd.read_csv('./tvrs_reach.csv', delimiter=';')
extra_feats = extra_feats[['Reach_4+', 'Reach_18+', 'TVR_4+', 'TVR_All_18+', 
                          'Total_Channels_TVR_4+', 'Total_Channels_TVR_All_18+']]
for col in extra_feats.columns:
    extra_feats[col] = extra_feats[col].apply(lambda x: float(str(x).replace(',', '.')))
extra_feats.convert_dtypes()
extra_feats

FileNotFoundError: [Errno 2] No such file or directory: './tvrs_reach.csv'

### Merge all features

In [18]:
def gen_more_features(df):
#     for col in extra_feats.columns:
#         df[col] = df['Date'].apply(lambda x: extra_feats.loc[x.month-1, col])
    df['Avg_views'] = df['Date'].apply(lambda x: views_by_month_Prop[x.month])
    df = df.merge(genre_stats, on='Programme_genre', how='left')
    df = df.merge(category_stats, on='Programme_category', how='left')
    df = df.merge(ad_blocks_per_program, on='Programme_name', how='left')
    df = df.merge(avg_ad_duration_per_program, on='Programme_name', how='left')
    df = df.merge(avg_rating_by_day_of_week, on='Weekday', how='left')
    df = df.merge(rating_timeslot, on='Break_start_hour', how='left')
    return df

In [19]:
train = gen_more_features(train)

In [20]:
features_to_drop = ['Break_start_sec', 'Break_end_sec', 
                    'Programme_start_sec', 'Programme_end_sec',
                    'Avg_views']   
# ['Pre_holiday', 'Break_start_sec', 'Break_end_sec', 'Holiday', 
# 'Break_end_min', 'Break_of_programme_fraction', 
# 'Programme_category', 'Break_start_min', 'Programme_genre', 'Break_content']

def drop_features(df):
    return df.drop(columns=features_to_drop)

In [21]:
train = drop_features(train)
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30682 entries, 0 to 30681
Data columns (total 43 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   Target                       30682 non-null  float64       
 1   Date                         30682 non-null  datetime64[ns]
 2   Break_id                     30682 non-null  Int64         
 3   Break_content                30682 non-null  string        
 4   Break_distribution           30682 non-null  string        
 5   Programme_name               30682 non-null  string        
 6   Programme_category           30682 non-null  string        
 7   Programme_genre              30682 non-null  string        
 8   Break_start_hour             30682 non-null  int64         
 9   Break_start_min              30682 non-null  int64         
 10  Break_start_abs              30682 non-null  int64         
 11  Break_end_hour               30682 non-nu

In [26]:
cat_features = list(set(['Break_content', 'Break_distribution', 'Programme_name', 'Programme_category', 
                         'Programme_genre', 'Weekday', 'Weekend', 'Holiday', 'Pre_holiday']) \
                    - set(features_to_drop))

In [22]:
# drop Local break dictribution
train = train.drop(index=train[train['Break_distribution'] == 'Local'].index)
# drop zero Targets
train = train.drop(index=train[train['Target'] == 0].index)
train.shape

(30571, 43)

In [23]:
# LightAutoML presets, task and report generation
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task
from lightautoml.report.report_deco import ReportDeco

In [52]:
y = train['Target']
X = train.drop(['Target', 'Date', 'Break_id'], axis=1)
X.shape, y.shape

((30571, 40), (30571,))

In [36]:
roles = {
    'target': 'Target',
    'drop': ['Date','Break_id'],
}

task = Task('reg',loss='mape')

[01:58:45] sklearn doesn't support in general case mape and will not be used.


In [64]:
automl = TabularAutoML(
    task = task,
    timeout = 600,
    cpu_limit = -1,
    reader_params = {'cv': 5, 'random_state': 42},
)

In [65]:
train_idxs.head()

Unnamed: 0,Target,Date,Break_id,Break_content,Break_distribution,Programme_name,Programme_category,Programme_genre,Break_start_hour,Break_start_min,...,genre_median_rating,category_mean_rating,category_std_rating,category_min_rating,category_max_rating,category_median_rating,total_ad_blocks_per_program,avg_ad_block_duration,avg_rating_by_day_of_week,avg_rating_by_timeslot
0,0.614693,2023-01-02,4870830561,Commercial,Network,"Telekanal ""Dobroe utro""",Morning airplay,Entertainment programs,8,17,...,1.094453,1.112456,0.41889,0.05997,2.278861,1.13943,7584,88.63067,0.988667,0.965738
1,0.869565,2023-01-02,4870830614,Commercial,Network,"Telekanal ""Dobroe utro""",Morning airplay,Entertainment programs,8,34,...,1.094453,1.112456,0.41889,0.05997,2.278861,1.13943,7584,88.63067,0.988667,0.965738
2,0.989505,2023-01-02,4870830629,Commercial,Network,"Telekanal ""Dobroe utro""",Morning airplay,Entertainment programs,8,52,...,1.094453,1.112456,0.41889,0.05997,2.278861,1.13943,7584,88.63067,0.988667,0.965738
3,0.884558,2023-01-02,4870830684,Announcement,Network,"Telekanal ""Dobroe utro""",Morning airplay,Entertainment programs,8,56,...,1.094453,1.112456,0.41889,0.05997,2.278861,1.13943,7584,88.63067,0.988667,0.965738
4,0.83958,2023-01-02,4870830685,Commercial,Network,"Telekanal ""Dobroe utro""",Morning airplay,Entertainment programs,9,12,...,1.094453,1.112456,0.41889,0.05997,2.278861,1.13943,7584,88.63067,0.988667,0.787344


In [66]:
out_of_fold_predictions = automl.fit_predict(train_idxs, roles = roles, verbose = 4)

[02:10:47] Stdout logging level is DEBUG.
[02:10:47] Task: reg

[02:10:47] Start automl preset with listed constraints:
[02:10:47] - time: 600.00 seconds
[02:10:47] - CPU: 16 cores
[02:10:47] - memory: 16 GB

[02:10:47] [1mTrain data shape: (27332, 43)[0m

[02:10:50] Feats was rejected during automatic roles guess: []
[02:10:50] Layer [1m1[0m train process start. Time left 597.05 secs


  cnts = cnts.append(Series([cnts.shape[0] + 1], index=[np.nan]))
  cnts = cnts.append(Series([cnts.shape[0] + 1], index=[np.nan]))
  cnts = cnts.append(Series([cnts.shape[0] + 1], index=[np.nan]))
  cnts = cnts.append(Series([cnts.shape[0] + 1], index=[np.nan]))
  cnts = cnts.append(Series([cnts.shape[0] + 1], index=[np.nan]))


[02:10:51] Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
[02:10:51] Training params: {'tol': 1e-06, 'max_iter': 100, 'cs': [1e-05, 5e-05, 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000, 100000], 'early_stopping': 2, 'categorical_idx': [42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57], 'embed_sizes': array([14, 14, 27, 26, 25, 25, 25, 14, 25,  6, 11, 27,  5, 11, 11,  8],
      dtype=int32), 'data_size': 58}
[02:10:51] ===== Start working with [1mfold 0[0m for [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m =====
[02:10:52] Linear model: C = 1e-05 score = -0.12589581133636868
[02:10:53] Linear model: C = 5e-05 score = -0.09242777404885773
[02:10:53] Linear model: C = 0.0001 score = -0.09242500107850218
[02:10:54] Linear model: C = 0.0005 score = -0.08209646875291723
[02:10:55] Linear model: C = 0.001 score = -0.07956112959329341
[02:10:55] Linear model: C = 0.005 score = -0.07957532814494493
[02:10:55] Linear model: C = 0

  cnts = cnts.append(Series([cnts.shape[0] + 1], index=[np.nan]))
  cnts = cnts.append(Series([cnts.shape[0] + 1], index=[np.nan]))
  cnts = cnts.append(Series([cnts.shape[0] + 1], index=[np.nan]))
  cnts = cnts.append(Series([cnts.shape[0] + 1], index=[np.nan]))
  cnts = cnts.append(Series([cnts.shape[0] + 1], index=[np.nan]))


[02:11:10] [100]	valid's l2: 0.0915508
[02:11:11] [200]	valid's l2: 0.0698161
[02:11:12] [300]	valid's l2: 0.0653052
[02:11:13] [400]	valid's l2: 0.0633415
[02:11:13] [500]	valid's l2: 0.0623859
[02:11:14] [600]	valid's l2: 0.0616259
[02:11:14] [700]	valid's l2: 0.0611049
[02:11:15] [800]	valid's l2: 0.060882
[02:11:15] [900]	valid's l2: 0.0607635
[02:11:16] [1000]	valid's l2: 0.0604116
[02:11:16] [1100]	valid's l2: 0.0601497
[02:11:17] [1200]	valid's l2: 0.0600721
[02:11:17] Did not meet early stopping. Best iteration is:
[1200]	valid's l2: 0.0600721
[02:11:17] [1mSelector_LightGBM[0m fitting and predicting completed
[02:11:18] Start fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m ...
[02:11:18] Training params: {'task': 'train', 'learning_rate': 0.03, 'num_leaves': 32, 'feature_fraction': 0.9, 'bagging_fraction': 0.9, 'bagging_freq': 1, 'max_depth': -1, 'verbosity': -1, 'reg_alpha': 0.5, 'reg_lambda': 0.0, 'min_split_gain': 0.0, 'zero_as_missing': False, 'num_threads': 16, 'max_bin': 2

  cnts = cnts.append(Series([cnts.shape[0] + 1], index=[np.nan]))


[02:11:18] [100]	valid's l2: 0.0806564
[02:11:20] [200]	valid's l2: 0.0552464
[02:11:20] [300]	valid's l2: 0.0516291
[02:11:21] [400]	valid's l2: 0.051032
[02:11:21] [500]	valid's l2: 0.0507295
[02:11:22] [600]	valid's l2: 0.0503988
[02:11:23] [700]	valid's l2: 0.0499906
[02:11:24] [800]	valid's l2: 0.0496795
[02:11:24] [900]	valid's l2: 0.0496072
[02:11:25] [1000]	valid's l2: 0.0495447
[02:11:26] [1100]	valid's l2: 0.0494732
[02:11:27] [1200]	valid's l2: 0.0493542
[02:11:27] Did not meet early stopping. Best iteration is:
[1200]	valid's l2: 0.0493542
[02:11:27] ===== Start working with [1mfold 1[0m for [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m =====
[02:11:27] Training until validation scores don't improve for 200 rounds
[02:11:28] [100]	valid's l2: 0.0789241
[02:11:29] [200]	valid's l2: 0.0539309
[02:11:30] [300]	valid's l2: 0.0517384
[02:11:32] [400]	valid's l2: 0.0509093
[02:11:33] [500]	valid's l2: 0.0504144
[02:11:34] [600]	valid's l2: 0.0502432
[02:11:35] [700]	valid's l2: 0.0500712

In [50]:
# X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
valid_idxs = train[train['Date'].apply(lambda x: x.month == 10)]
train_idxs = train[train['Date'].apply(lambda x: x.month < 10)]
#X_train = X.drop(index=train_idx)
#y_train = y.drop(index=train_idx)
#X_valid = X.drop(index=valid_idx)
#y_valid = y.drop(index=valid_idx)
#X_train.shape, y_train.shape, X_valid.shape, y_valid.shape 

## CatBoost

In [None]:
models = []

In [271]:
scale_target = 1  # 0.6482
train_pool = Pool(X_train, label=y_train * scale_target, cat_features=cat_features)
valid_pool = Pool(X_valid, label=y_valid * scale_target, cat_features=cat_features)

ctb_params = dict(iterations=7000,
                   learning_rate=0.007,  #0.0837, #0.02778,
                   depth=7,  #8,  #9
                   l2_leaf_reg=3,
                   bootstrap_type='Bernoulli',
                   subsample=0.6,  #0.9763,  #0.997,
                   colsample_bylevel=0.66,#0.66  # #0.795,
                   loss_function='MAPE',
                   eval_metric='MAPE',
                   metric_period=100,
                   random_strength=0.69, #1  # more bagging to reduce overfitting
#                    task_type='GPU',
#                    devices='0:1',
#                    use_best_model=False,
                   random_state=42,
                   allow_writing_files=False,
                  )
model = CatBoostRegressor(**ctb_params)  # loss_function=MapeObjective(), 
model.fit(train_pool, 
          eval_set=valid_pool,
          early_stopping_rounds=100,
         )



0:	learn: 0.3331300	test: 0.3778650	best: 0.3778650 (0)	total: 27.5ms	remaining: 3m 12s
100:	learn: 0.2188884	test: 0.2449452	best: 0.2449452 (100)	total: 2.57s	remaining: 2m 55s
200:	learn: 0.1769089	test: 0.1862789	best: 0.1862789 (200)	total: 5.02s	remaining: 2m 49s
300:	learn: 0.1614349	test: 0.1600923	best: 0.1600923 (300)	total: 7.47s	remaining: 2m 46s
400:	learn: 0.1541860	test: 0.1465943	best: 0.1465943 (400)	total: 10.3s	remaining: 2m 50s
500:	learn: 0.1500780	test: 0.1393916	best: 0.1393916 (500)	total: 12.9s	remaining: 2m 46s
600:	learn: 0.1474893	test: 0.1348526	best: 0.1348526 (600)	total: 15.4s	remaining: 2m 43s
700:	learn: 0.1454787	test: 0.1315811	best: 0.1315811 (700)	total: 17.8s	remaining: 2m 39s
800:	learn: 0.1436398	test: 0.1291420	best: 0.1291420 (800)	total: 20.2s	remaining: 2m 36s
900:	learn: 0.1422223	test: 0.1273951	best: 0.1273951 (900)	total: 22.6s	remaining: 2m 33s
1000:	learn: 0.1410761	test: 0.1261320	best: 0.1261320 (1000)	total: 25.1s	remaining: 2m 30s


<catboost.core.CatBoostRegressor at 0x7be9ac9b7490>

In [54]:
valid_idxs

Unnamed: 0,Target,Date,Break_id,Break_content,Break_distribution,Programme_name,Programme_category,Programme_genre,Break_start_hour,Break_start_min,...,genre_median_rating,category_mean_rating,category_std_rating,category_min_rating,category_max_rating,category_median_rating,total_ad_blocks_per_program,avg_ad_block_duration,avg_rating_by_day_of_week,avg_rating_by_timeslot
27421,0.074963,2023-10-01,5285512747,Commercial,Network,Podkast.Lab,Entertainment talk show,Entertainment programs,6,23,...,1.094453,0.812077,0.733601,0.000000,2.863568,0.65967,2738,122.560628,0.787756,1.089443
27422,0.104948,2023-10-01,5285512763,Commercial,Network,Podkast.Lab,Entertainment talk show,Entertainment programs,6,37,...,1.094453,0.812077,0.733601,0.000000,2.863568,0.65967,2738,122.560628,0.787756,1.089443
27423,0.224888,2023-10-01,5285512781,Announcement,Network,"Igray, Garmon!",Folk music and dances,Musical programs,7,3,...,0.239880,0.240270,0.091633,0.044978,0.509745,0.23988,192,127.640625,0.787756,1.272099
27424,0.239880,2023-10-01,5285512783,Commercial,Network,"Igray, Garmon!",Folk music and dances,Musical programs,7,3,...,0.239880,0.240270,0.091633,0.044978,0.509745,0.23988,192,127.640625,0.787756,1.272099
27425,0.164918,2023-10-01,5285512798,Announcement,Network,"Igray, Garmon!",Folk music and dances,Musical programs,7,18,...,0.239880,0.240270,0.091633,0.044978,0.509745,0.23988,192,127.640625,0.787756,1.272099
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30677,0.029985,2023-10-31,5335333196,Announcement,Network,Podkast.Lab,Entertainment talk show,Entertainment programs,27,43,...,1.094453,0.812077,0.733601,0.000000,2.863568,0.65967,2738,122.560628,0.995820,0.040276
30678,0.029985,2023-10-31,5335333197,Commercial,Network,Podkast.Lab,Entertainment talk show,Entertainment programs,27,44,...,1.094453,0.812077,0.733601,0.000000,2.863568,0.65967,2738,122.560628,0.995820,0.040276
30679,0.014993,2023-10-31,5335333212,Commercial,Network,Podkast.Lab,Entertainment talk show,Entertainment programs,28,2,...,1.094453,0.812077,0.733601,0.000000,2.863568,0.65967,2738,122.560628,0.995820,0.028613
30680,0.029985,2023-10-31,5335333228,Commercial,Network,Podkast.Lab,Entertainment talk show,Entertainment programs,28,26,...,1.094453,0.812077,0.733601,0.000000,2.863568,0.65967,2738,122.560628,0.995820,0.028613


In [67]:
test_predictions = automl.predict(valid_idxs)

In [70]:
MAPE(valid_idxs['Target'], np.array(test_predictions.data))

0.28893135198988557