In [244]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from collections import Counter
import random
import torch
from model import TimeSeriesDetector, TimeSeriesPredictor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from plotting import plot_detection, plot_ts
from catboost import CatBoostRegressor

warnings.filterwarnings("ignore")

In [2]:
def seed_everything(seed=7575):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.random.manual_seed(seed)


seed_everything()

In [169]:
train_data = pd.read_csv("Задача3. Датасет 1. Тренировочный.csv", encoding="cp1251", sep=';', skiprows=[1, 2])

### TRAIN

In [56]:
train_data.index.tolist()

[Timestamp('2019-01-01 00:00:00'),
 Timestamp('2019-01-01 00:01:00'),
 Timestamp('2019-01-01 00:02:00'),
 Timestamp('2019-01-01 00:03:00'),
 Timestamp('2019-01-01 00:04:00'),
 Timestamp('2019-01-01 00:05:00'),
 Timestamp('2019-01-01 00:06:00'),
 Timestamp('2019-01-01 00:07:00'),
 Timestamp('2019-01-01 00:08:00'),
 Timestamp('2019-01-01 00:09:00'),
 Timestamp('2019-01-01 00:10:00'),
 Timestamp('2019-01-01 00:11:00'),
 Timestamp('2019-01-01 00:12:00'),
 Timestamp('2019-01-01 00:13:00'),
 Timestamp('2019-01-01 00:14:00'),
 Timestamp('2019-01-01 00:15:00'),
 Timestamp('2019-01-01 00:16:00'),
 Timestamp('2019-01-01 00:17:00'),
 Timestamp('2019-01-01 00:18:00'),
 Timestamp('2019-01-01 00:19:00'),
 Timestamp('2019-01-01 00:20:00'),
 Timestamp('2019-01-01 00:21:00'),
 Timestamp('2019-01-01 00:22:00'),
 Timestamp('2019-01-01 00:23:00'),
 Timestamp('2019-01-01 00:24:00'),
 Timestamp('2019-01-01 00:25:00'),
 Timestamp('2019-01-01 00:26:00'),
 Timestamp('2019-01-01 00:27:00'),
 Timestamp('2019-01-

In [170]:
train_data['Параметр'] = pd.to_datetime(train_data['Параметр'], infer_datetime_format=True)

In [171]:
train_data.set_index("Параметр", inplace=True)

In [198]:
detector = TimeSeriesDetector(
    granularity='PT1M',
    num_lags=60,
    model=CatBoostRegressor,
)



In [199]:
train_data[train_data["Маркер"] == 4]

Unnamed: 0_level_0,х001,х002,х003,х004,х005,х006,х007,х008,х009,х010,...,х063,х064,х065,х066,х067,х068,х069,х070,х071,Маркер
Параметр,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-06-19 04:32:00,190.054,24.4,49.991,49.971,2998.938,49.968,75.0,60.3,62.4,58.9,...,73.3,70.8,68.1,68.8,3200.0,37.9,40.6,42.7,26.7,4
2019-06-19 04:33:00,190.054,24.4,50.008,49.988,3000.378,49.988,75.2,60.4,62.5,59.0,...,73.4,70.8,68.1,68.8,3200.0,37.9,40.6,42.7,26.7,4


In [200]:
# ts_train, ts_test = train_data.loc[:1000, 'х013'], train_data.loc[1000:1200, 'х013']
ts_train, ts_test = train_data.loc["2019-06-18 08:21:00": "2019-06-19 00:42:00", 'х013'], train_data.loc['2019-06-19 00:42:00': "2019-06-19 04:33:00", 'х013']

In [201]:
detector.fit(ts_train)

detector.fit_statistics(ts_train)

Learning rate set to 0.040421
0:	learn: 0.5057786	total: 876us	remaining: 875ms
1:	learn: 0.4899550	total: 1.65ms	remaining: 826ms
2:	learn: 0.4750127	total: 2.32ms	remaining: 770ms
3:	learn: 0.4607552	total: 3.01ms	remaining: 749ms
4:	learn: 0.4455652	total: 3.78ms	remaining: 752ms
5:	learn: 0.4318557	total: 4.79ms	remaining: 794ms
6:	learn: 0.4187493	total: 5.52ms	remaining: 783ms
7:	learn: 0.4048365	total: 6.29ms	remaining: 780ms
8:	learn: 0.3927521	total: 7.16ms	remaining: 789ms
9:	learn: 0.3807235	total: 8.05ms	remaining: 797ms
10:	learn: 0.3693973	total: 8.93ms	remaining: 803ms
11:	learn: 0.3588264	total: 9.74ms	remaining: 802ms
12:	learn: 0.3484379	total: 10.5ms	remaining: 794ms
13:	learn: 0.3380696	total: 11.1ms	remaining: 782ms
14:	learn: 0.3280573	total: 11.8ms	remaining: 772ms
15:	learn: 0.3181921	total: 12.4ms	remaining: 764ms
16:	learn: 0.3094275	total: 13ms	remaining: 754ms
17:	learn: 0.3008557	total: 13.6ms	remaining: 744ms
18:	learn: 0.2919440	total: 14.3ms	remaining: 7

In [202]:
preds = detector.predict_batch(ts_train, ts_test)

In [203]:
pred_next = detector.predict_next(ts_train, 232)

In [204]:
lower, upper = detector.get_prediction_intervals(preds)

In [205]:
anoms = detector.detect(ts_test, preds)

In [206]:
plot_detection(ts_test, upper, lower, pred_next)

In [207]:
pred_next

2019-06-19 00:43:00    53.186740
2019-06-19 00:44:00    53.213129
2019-06-19 00:45:00    53.232399
2019-06-19 00:46:00    53.211448
2019-06-19 00:47:00    53.180373
                         ...    
2019-06-19 04:30:00    53.171803
2019-06-19 04:31:00    53.171803
2019-06-19 04:32:00    53.171803
2019-06-19 04:33:00    53.171803
2019-06-19 04:34:00    53.171803
Length: 232, dtype: float64

In [193]:
ts_test

Параметр
2019-06-19 00:42:00    53.2
2019-06-19 00:43:00    53.1
2019-06-19 00:44:00    53.2
2019-06-19 00:45:00    53.2
2019-06-19 00:46:00    53.3
                       ... 
2019-06-19 04:29:00    54.0
2019-06-19 04:30:00    54.0
2019-06-19 04:31:00    54.0
2019-06-19 04:32:00    54.0
2019-06-19 04:33:00    54.3
Name: х013, Length: 232, dtype: float64

### TEST

In [209]:
test_data = pd.read_csv("Задача3. Датасет 3. Контрольный для участников.csv", encoding="cp1251", sep=';', skiprows=[1])



In [210]:
test_data

Unnamed: 0.1,Unnamed: 0,х001,х002,х003,х004,х005,х006,х007,х008,х009,...,х062,х063,х064,х065,х066,х067,х068,х069,х070,х071
0,06.10.2019 13:00,131.683,12.1,50.036,50.015,3001.731,50.014,66.0,66.1,67.6,...,60.0,70.4,68.9,66.3,66.7,3276.7,35.7,29.9,41.9,18.9
1,06.10.2019 13:01,131.683,12.1,50.035,50.014,3001.733,50.014,66.4,66.2,67.7,...,60.1,70.4,68.9,66.3,66.8,3276.7,35.8,30.0,41.9,18.9
2,06.10.2019 13:02,131.683,12.1,50.036,50.014,3001.731,50.012,66.6,66.4,67.8,...,60.1,70.4,68.9,66.3,66.7,3276.7,35.7,30.1,41.9,18.9
3,06.10.2019 13:03,131.683,12.1,50.015,49.994,3000.575,49.994,66.7,66.5,67.9,...,60.1,70.5,69.0,66.3,66.8,3276.7,35.6,30.1,41.9,18.8
4,06.10.2019 13:04,131.683,12.1,50.011,49.990,3000.561,49.991,66.7,66.4,67.9,...,60.1,70.5,69.0,66.3,66.8,3276.7,35.7,30.2,41.9,18.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36656,,,,,,,,,,,...,,,,,,,,,,
36657,,,,,,,,,,,...,,,,,,,,,,
36658,,,,,,,,,,,...,,,,,,,,,,
36659,,,,,,,,,,,...,,,,,,,,,,


In [211]:
test_data = test_data.rename(columns={'Unnamed: 0': "Параметр"})
test_data['Параметр'] = pd.to_datetime(test_data['Параметр'], infer_datetime_format=True)


In [213]:
test_data.set_index("Параметр", inplace=True)

In [264]:
# ts_train = test_data.loc[:, 'х013'].dropna()

ts_train, ts_test = test_data.loc[: , 'х021'].dropna(), test_data.loc[: , 'х021'].dropna()

In [265]:
detector.fit(ts_train)

detector.fit_statistics(ts_train)
preds = detector.predict_batch(ts_train, ts_test)
pred_next = detector.predict_next(ts_train, 232)

Learning rate set to 0.055101
0:	learn: 0.4942869	total: 1.74ms	remaining: 1.73s
1:	learn: 0.4704898	total: 3.4ms	remaining: 1.7s
2:	learn: 0.4480806	total: 4.91ms	remaining: 1.63s
3:	learn: 0.4264107	total: 6.59ms	remaining: 1.64s
4:	learn: 0.4057045	total: 8.1ms	remaining: 1.61s
5:	learn: 0.3861802	total: 9.63ms	remaining: 1.6s
6:	learn: 0.3676276	total: 11.4ms	remaining: 1.62s
7:	learn: 0.3503031	total: 12.8ms	remaining: 1.59s
8:	learn: 0.3342640	total: 14.1ms	remaining: 1.55s
9:	learn: 0.3184067	total: 15.4ms	remaining: 1.53s
10:	learn: 0.3035004	total: 16.8ms	remaining: 1.51s
11:	learn: 0.2902539	total: 18.1ms	remaining: 1.49s
12:	learn: 0.2770547	total: 19.4ms	remaining: 1.47s
13:	learn: 0.2646351	total: 20.7ms	remaining: 1.46s
14:	learn: 0.2532556	total: 23ms	remaining: 1.51s
15:	learn: 0.2423886	total: 24.3ms	remaining: 1.49s
16:	learn: 0.2322775	total: 25.9ms	remaining: 1.5s
17:	learn: 0.2229275	total: 27.2ms	remaining: 1.48s
18:	learn: 0.2136943	total: 28.5ms	remaining: 1.47s

In [242]:
plot_ts(ts_test, pred_next)

2019-11-10 03:11:00    50.719591
2019-11-10 03:12:00    50.723438
2019-11-10 03:13:00    50.763326
2019-11-10 03:14:00    50.921229
2019-11-10 03:15:00    50.935264
                         ...    
2019-11-10 06:58:00    52.099552
2019-11-10 06:59:00    52.099552
2019-11-10 07:00:00    52.099552
2019-11-10 07:01:00    52.099552
2019-11-10 07:02:00    52.099552
Length: 232, dtype: float64

In [267]:
predictor = TimeSeriesPredictor(
    granularity="PT1M",
    num_lags=92,
    model=CatBoostRegressor,
    verbose=0
    # mappers=datetime_mappers,
)

In [268]:
lags_matrix = predictor.transform_into_matrix(ts_train)
lags_matrix

Unnamed: 0_level_0,lag_92,lag_91,lag_90,lag_89,lag_88,lag_87,lag_86,lag_85,lag_84,lag_83,...,lag_9,lag_8,lag_7,lag_6,lag_5,lag_4,lag_3,lag_2,lag_1,lag_0
Параметр,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-06-10 14:32:00,47.8,47.8,47.8,47.8,47.8,47.8,47.8,47.8,47.8,47.7,...,48.9,48.9,49.0,49.0,48.9,48.9,49.0,49.0,48.9,48.9
2019-06-10 14:33:00,47.8,47.8,47.8,47.8,47.8,47.8,47.8,47.8,47.7,47.7,...,48.9,49.0,49.0,48.9,48.9,49.0,49.0,48.9,48.9,48.9
2019-06-10 14:34:00,47.8,47.8,47.8,47.8,47.8,47.8,47.8,47.7,47.7,47.7,...,49.0,49.0,48.9,48.9,49.0,49.0,48.9,48.9,48.9,48.9
2019-06-10 14:35:00,47.8,47.8,47.8,47.8,47.8,47.8,47.7,47.7,47.7,47.7,...,49.0,48.9,48.9,49.0,49.0,48.9,48.9,48.9,48.9,49.0
2019-06-10 14:36:00,47.8,47.8,47.8,47.8,47.8,47.7,47.7,47.7,47.7,47.6,...,48.9,48.9,49.0,49.0,48.9,48.9,48.9,48.9,49.0,48.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-11-10 03:06:00,50.5,50.4,50.4,50.5,50.9,50.5,50.2,50.3,50.7,50.5,...,49.3,49.1,49.1,49.1,49.1,49.0,49.1,49.0,49.0,48.9
2019-11-10 03:07:00,50.4,50.4,50.5,50.9,50.5,50.2,50.3,50.7,50.5,50.3,...,49.1,49.1,49.1,49.1,49.0,49.1,49.0,49.0,48.9,48.9
2019-11-10 03:08:00,50.4,50.5,50.9,50.5,50.2,50.3,50.7,50.5,50.3,50.4,...,49.1,49.1,49.1,49.0,49.1,49.0,49.0,48.9,48.9,48.9
2019-11-10 03:09:00,50.5,50.9,50.5,50.2,50.3,50.7,50.5,50.3,50.4,50.5,...,49.1,49.1,49.0,49.1,49.0,49.0,48.9,48.9,48.9,48.9


In [269]:
predictor.fit(ts_train)
prediction = predictor.predict_next(ts_train, n_steps=200)

In [270]:
plot_ts(ts_test, prediction)

In [257]:
from statsmodels.tsa.seasonal import seasonal_decompose
from scipy.signal import periodogram, detrend

def get_season_period(ts):
    ts = pd.Series(detrend(ts), ts.index)
    f, Pxx = periodogram(ts)
    Pxx = list(map(lambda x: x.real, Pxx))
    ziped = list(zip(f, Pxx))
    ziped.sort(key=lambda x: x[1])
    highest_freqs = [x[0] for x in ziped[-100:]]
    season_periods = [round(1/(x+0.001)) for x in highest_freqs]
    for period in reversed(season_periods):
        if 4 < period < 100:
            return int(period)

In [266]:
get_season_period(ts_train)

92

In [281]:
prediction

2019-11-10 03:11:00    48.960774
2019-11-10 03:12:00    49.012879
2019-11-10 03:13:00    48.989451
2019-11-10 03:14:00    49.109376
2019-11-10 03:15:00    49.135198
                         ...    
2019-11-10 06:26:00    50.106824
2019-11-10 06:27:00    50.106824
2019-11-10 06:28:00    50.106824
2019-11-10 06:29:00    50.106824
2019-11-10 06:30:00    50.106824
Length: 200, dtype: float64