In [30]:
# Standard python libraries
import os
import time

# Essential DS libraries
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_log_error
import matplotlib.pyplot as plt
import torch
from sklearn.model_selection import train_test_split
from tqdm import tqdm
# LightAutoML presets, task and report generation
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task
from lightautoml.dataset.roles import DatetimeRole
from lightautoml.report.report_deco import ReportDeco

In [21]:
df = pd.read_csv('../data/train.csv')
print(len(df))
df.head()


7111


Unnamed: 0,date_time,deg_C,relative_humidity,absolute_humidity,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,target_carbon_monoxide,target_benzene,target_nitrogen_oxides
0,2010-03-10 18:00:00,13.1,46.0,0.7578,1387.2,1087.8,1056.0,1742.8,1293.4,2.5,12.0,167.7
1,2010-03-10 19:00:00,13.2,45.3,0.7255,1279.1,888.2,1197.5,1449.9,1010.9,2.1,9.9,98.9
2,2010-03-10 20:00:00,12.6,56.2,0.7502,1331.9,929.6,1060.2,1586.1,1117.0,2.2,9.2,127.1
3,2010-03-10 21:00:00,11.0,62.4,0.7867,1321.0,929.0,1102.9,1536.5,1263.2,2.2,9.7,177.2
4,2010-03-10 22:00:00,11.9,59.0,0.7888,1272.0,852.7,1180.9,1415.5,1132.2,1.5,6.4,121.8


In [22]:
N_THREADS = 4 #number of vCPUs for LightAutoML model creation
N_FOLDS = 5 #number of folds in LightAutoML inner CV
RANDOM_STATE = 27
#TIMEOUT = 1800 #limit in seconds for model to train
TARGET_NAME = 'target' #target column name in dataset

In [23]:
np.random.seed(RANDOM_STATE)
torch.set_num_threads(N_THREADS)

In [25]:
df.shape

(7111, 16)

In [17]:
## Task and metrics
def rmsle_metric(y_true, y_pred, sample_weight, **kwargs):
    mask = (sample_weight > 1)
    return mean_squared_log_error(y_true[mask], np.clip(y_pred[mask], 0, None), **kwargs) ** 0.5

task = Task('reg', loss = 'rmsle', metric = rmsle_metric, greater_is_better=False)

In [28]:
targets_and_drop = {
    'target_carbon_monoxide': ['target_benzene', 'target_nitrogen_oxides'],
    'target_benzene': ['target_carbon_monoxide', 'target_nitrogen_oxides'],
    'target_nitrogen_oxides': ['target_carbon_monoxide', 'target_benzene']
}

roles = {
    # delete day of month from features
    DatetimeRole(base_date=False, base_feats=True, seasonality=('d', 'wd', 'hour')): 'date_time'
}


In [None]:
sample_sub = pd.read_csv('../data/sample_submission.csv')
test_data = pd.read_csv('../data/test.csv')
pseudolabels = pd.read_csv('../data/submission.csv')
for col in ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']:
    test_data[col] = pseudolabels[col]
df = pd.concat([df, test_data]).reset_index(drop = True)
#Small feature engineering part using lags
    
df['temperature_lag_3'] = df['deg_C'] - df['deg_C'].shift(periods=3, fill_value=0)
df['temperature_lag_6'] = df['deg_C'] - df['deg_C'].shift(periods=6, fill_value=0)

df['humidity_lag_3'] = df['absolute_humidity'] - df['absolute_humidity'].shift(periods=3, fill_value=0)
df['humidity_lag_6'] = df['absolute_humidity'] - df['absolute_humidity'].shift(periods=6, fill_value=0)
train_data, test_data = df.iloc[:(len(df) - len(test_data)), :], df.iloc[(len(df) - len(test_data)):, :]
df['weight'] = [1.001] * (len(df) - len(test_data)) + [0.999] * len(test_data)
roles['weights'] = 'weight'


importances = {}
dt = pd.to_datetime(df['date_time'])
for targ in tqdm(targets_and_drop):
    print('='*50, '='*50, sep = '\n')
    automl = TabularAutoML(task = task, 
                           timeout = 1800,
                           cpu_limit = N_THREADS,
                           reader_params = {'n_jobs': N_THREADS, 'cv': N_FOLDS, 'random_state': RANDOM_STATE},
                           general_params={'use_algos': [['lgb', 'lgb_tuned', 'cb', 'cb_tuned']]}
                          )

    roles['target'] = targ
    roles['drop'] = targets_and_drop[targ]

    if targ == 'target_nitrogen_oxides':
        oof_pred = automl.fit_predict(df[dt >= np.datetime64('2010-09-01')], roles = roles)
    else:
        oof_pred = automl.fit_predict(df, roles = roles)
    print('oof_pred:\n{}\nShape = {}'.format(oof_pred, oof_pred.shape))
    
    # Fast feature importances calculation
    fast_fi = automl.get_feature_scores('fast')
    importances[targ] = fast_fi
    
    test_pred = automl.predict(test_data)
    print('Prediction for te_data:\n{}\nShape = {}'.format(test_pred, test_pred.shape))
    
    sample_sub[targ] = np.clip(test_pred.data[:, 0], 0, None)

  0%|                                                                                            | 0/3 [00:00<?, ?it/s]Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer


Start automl preset with listed constraints:
- time: 1800 seconds
- cpus: 4 cores
- memory: 16 gb

Train data shape: (18346, 17)
Feats was rejected during automatic roles guess: []


Layer 1 ...
Train process start. Time left 1793.0227210521698 secs
Start fitting Selector_LightGBM ...

===== Start working with fold 0 for Selector_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	valid's l2: 0.011589	valid's Opt metric: 0.110749
[200]	valid's l2: 0.00685279	valid's Opt metric: 0.0865373
[300]	valid's l2: 0.00583189	valid's Opt metric: 0.0800132
[400]	valid's l2: 0.00540835	valid's Opt metric: 0.077174
[500]	valid's l2: 0.00517929	valid's Opt metric: 0.0756243
[600]	valid's l2: 0.00502173	valid's Opt metric: 0.0745556
[700]	valid's l2: 0.00489069	valid's Opt metric: 0.0736509
[800]	valid's l2: 0.00478436	valid's Opt metric: 0.0729103
[900]	valid's l2: 0.00471125	valid's Opt metric: 0.0723988
[1000]	valid's l2: 0.00463794	valid's Opt metric: 0.0718846
[11

[2700]	valid's l2: 0.00397029	valid's Opt metric: 0.0666174
[2800]	valid's l2: 0.0039573	valid's Opt metric: 0.0665179
[2900]	valid's l2: 0.00394782	valid's Opt metric: 0.0664461
[3000]	valid's l2: 0.00394043	valid's Opt metric: 0.0663916
Did not meet early stopping. Best iteration is:
[2995]	valid's l2: 0.00394018	valid's Opt metric: 0.0663892

===== Start working with fold 3 for Lvl_0_Pipe_0_Mod_0_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	valid's l2: 0.0111124	valid's Opt metric: 0.109423
[200]	valid's l2: 0.00648903	valid's Opt metric: 0.0847278
[300]	valid's l2: 0.00570737	valid's Opt metric: 0.0795833
[400]	valid's l2: 0.00535187	valid's Opt metric: 0.0771144
[500]	valid's l2: 0.0051262	valid's Opt metric: 0.0755078
[600]	valid's l2: 0.00497365	valid's Opt metric: 0.0744103
[700]	valid's l2: 0.00486659	valid's Opt metric: 0.0736339
[800]	valid's l2: 0.004782	valid's Opt metric: 0.0730255
[900]	valid's l2: 0.00473021	valid's Opt metric: 0.0

[2500]	valid's l2: 0.00404067	valid's Opt metric: 0.0678699
[2600]	valid's l2: 0.00403894	valid's Opt metric: 0.0678556
[2700]	valid's l2: 0.00403905	valid's Opt metric: 0.0678569
[2800]	valid's l2: 0.00403839	valid's Opt metric: 0.0678517
[2900]	valid's l2: 0.00403721	valid's Opt metric: 0.067842
[3000]	valid's l2: 0.00403699	valid's Opt metric: 0.0678403
Did not meet early stopping. Best iteration is:
[2960]	valid's l2: 0.00403678	valid's Opt metric: 0.0678384
Lvl_0_Pipe_0_Mod_1_LightGBM fitting and predicting completed
Start fitting Lvl_0_Pipe_0_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_1_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	valid's l2: 0.00993696	valid's Opt metric: 0.103505
[200]	valid's l2: 0.00566762	valid's Opt metric: 0.0794511
[300]	valid's l2: 0.00508856	valid's Opt metric: 0.0754441
[400]	valid's l2: 0.00483048	valid's Opt metric: 0.0735826
[500]	valid's l2: 0.00465353	valid's Opt metric: 0.072282

[2000]	valid's l2: 0.00542494	valid's Opt metric: 0.0777246
[2100]	valid's l2: 0.00541621	valid's Opt metric: 0.0776667
[2200]	valid's l2: 0.00541621	valid's Opt metric: 0.0776667
Early stopping, best iteration is:
[2041]	valid's l2: 0.00541621	valid's Opt metric: 0.0776667
Lvl_0_Pipe_0_Mod_1_LightGBM fitting and predicting completed
Start fitting Lvl_0_Pipe_0_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_1_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	valid's l2: 0.0103917	valid's Opt metric: 0.105754
[200]	valid's l2: 0.00594335	valid's Opt metric: 0.0811764
[300]	valid's l2: 0.00530082	valid's Opt metric: 0.076805
[400]	valid's l2: 0.00499617	valid's Opt metric: 0.0746329
[500]	valid's l2: 0.00482478	valid's Opt metric: 0.0734004
[600]	valid's l2: 0.00469887	valid's Opt metric: 0.0724865
[700]	valid's l2: 0.00460084	valid's Opt metric: 0.0717708
[800]	valid's l2: 0.00453154	valid's Opt metric: 0.0712719
[900]	valid's l

[400]	valid's l2: 0.00467419	valid's Opt metric: 0.0726887
[500]	valid's l2: 0.0045394	valid's Opt metric: 0.0716908
[600]	valid's l2: 0.00442663	valid's Opt metric: 0.0708378
[700]	valid's l2: 0.0043631	valid's Opt metric: 0.0703592
[800]	valid's l2: 0.00432317	valid's Opt metric: 0.0700622
[900]	valid's l2: 0.00428149	valid's Opt metric: 0.069747
[1000]	valid's l2: 0.00425744	valid's Opt metric: 0.0695682
[1100]	valid's l2: 0.00423649	valid's Opt metric: 0.0694116
[1200]	valid's l2: 0.00422181	valid's Opt metric: 0.0693015
[1300]	valid's l2: 0.00420794	valid's Opt metric: 0.0691962
[1400]	valid's l2: 0.00419994	valid's Opt metric: 0.0691382
[1500]	valid's l2: 0.00419503	valid's Opt metric: 0.0691045
[1600]	valid's l2: 0.00418873	valid's Opt metric: 0.0690576
[1700]	valid's l2: 0.00418224	valid's Opt metric: 0.0690087
[1800]	valid's l2: 0.00417746	valid's Opt metric: 0.0689726
[1900]	valid's l2: 0.00417242	valid's Opt metric: 0.068934
[2000]	valid's l2: 0.00416879	valid's Opt metric: 


===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_1_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	valid's l2: 0.00899901	valid's Opt metric: 0.0986806
[200]	valid's l2: 0.00519613	valid's Opt metric: 0.0764907
[300]	valid's l2: 0.00471615	valid's Opt metric: 0.0730442
[400]	valid's l2: 0.00449762	valid's Opt metric: 0.0714107
[500]	valid's l2: 0.00438426	valid's Opt metric: 0.0705544
[600]	valid's l2: 0.00431003	valid's Opt metric: 0.0699895
[700]	valid's l2: 0.00425798	valid's Opt metric: 0.06959
[800]	valid's l2: 0.00421574	valid's Opt metric: 0.0692597
[900]	valid's l2: 0.00418238	valid's Opt metric: 0.0689977
[1000]	valid's l2: 0.00416138	valid's Opt metric: 0.0688335
[1100]	valid's l2: 0.00414604	valid's Opt metric: 0.0687143
[1200]	valid's l2: 0.0041344	valid's Opt metric: 0.068623
[1300]	valid's l2: 0.00412324	valid's Opt metric: 0.0685348
[1400]	valid's l2: 0.00411585	valid's Opt metric: 0.0684773
[1500]	valid's l2: 0.00411358	valid's O

[900]	valid's l2: 0.00405016	valid's Opt metric: 0.0677176
[1000]	valid's l2: 0.00404104	valid's Opt metric: 0.0676522
[1100]	valid's l2: 0.00403556	valid's Opt metric: 0.0676136
[1200]	valid's l2: 0.00403205	valid's Opt metric: 0.0675894
[1300]	valid's l2: 0.00402909	valid's Opt metric: 0.0675684
[1400]	valid's l2: 0.00402522	valid's Opt metric: 0.0675389
[1500]	valid's l2: 0.00402197	valid's Opt metric: 0.0675142
[1600]	valid's l2: 0.00402068	valid's Opt metric: 0.0675049
Early stopping, best iteration is:
[1566]	valid's l2: 0.0040201	valid's Opt metric: 0.0674997

===== Start working with fold 2 for Lvl_0_Pipe_0_Mod_1_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's l2: 0.0050279	valid's Opt metric: 0.0746982
[200]	valid's l2: 0.00440293	valid's Opt metric: 0.070135
[300]	valid's l2: 0.00420452	valid's Opt metric: 0.0686508
[400]	valid's l2: 0.00411671	valid's Opt metric: 0.0680121
[500]	valid's l2: 0.00405056	valid's Opt metric: 0.0675117


In [None]:
for targ in targets_and_drop:
    plt.figure(figsize = (20, 10))
    importances[targ].set_index('Feature')['Importance'].plot.bar()
    plt.title('Feature importances for {} model'.format(targ))
    plt.grid(True)
    plt.show()

In [None]:
sample_sub.to_csv('../submission/lightautoml_submisson.csv', index = False)