# Настройка

In [None]:
import sys
ENV_COLAB = 'google.colab' in sys.modules

if ENV_COLAB:

    !pip install catboost
    !pip install ipywidgets
    !jupyter nbextension enable --py widgetsnbextension

    print('Environment: Google Colab')
!pip install shap

In [None]:
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_log_error

from datetime import datetime
import datetime, os

import pickle
import matplotlib.pyplot as plt
%matplotlib inline 
from tqdm import tqdm

np.random.seed(2021)
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
zfile = 'drive/MyDrive/Study/final_2022_125.zip'
!unzip $zfile

Archive:  drive/MyDrive/Study/final_2022_125.zip
replace final_2022_125.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [None]:
# Load the raw records and derive the calendar keys used for aggregation.
df0 = pd.read_csv('final_2022_125.csv')
df0['dt'] = pd.to_datetime(df0['dt'], format="%Y-%m-%d")
df0['year'] = df0['dt'].dt.year

# Zero-pad the week / month number to two digits so the string keys sort in
# calendar order (unpadded, '202010' sorts before '20202').
# Assumes `week` and `month` hold integer values - TODO confirm against the CSV.
df0['year_week'] = df0['year'].astype(str) + df0['week'].astype(str).str.zfill(2)
df0['year_month'] = df0['year'].astype(str) + df0['month'].astype(str).str.zfill(2)

In [None]:
# Wide table of daily counts: one row per `base`, one column per `dt` value.
# Each cell is the number of non-null entries of column '2' for that base/date;
# base/date pairs that have no rows are filled with 0.
group_day = df0.groupby(['base', 'dt'])['2'].count().unstack(fill_value=0)
group_day

In [None]:
# Per-base counts of non-null '2' entries at three granularities (day, month,
# week). Every table has one row per base and one column per period; periods
# with no rows for a base are filled with 0.
group_day, group_month, group_week = (
    df0.groupby(['base', period_col])['2'].count().unstack(fill_value=0)
    for period_col in ('dt', 'year_month', 'year_week')
)


In [None]:
# Continuous daily calendar spanning the first to the last date in the data,
# followed by a display of those two boundary dates.
dt_range = pd.date_range(start=df0['dt'].min(), end=df0['dt'].max(), freq="D")

(df0['dt'].min(), df0['dt'].max())

(Timestamp('2020-01-01 00:00:00'), Timestamp('2022-05-10 00:00:00'))

In [None]:
# Daily calendar from the earliest to the latest `dt`; the train / validation /
# test date ranges are cut from it by position.
range_day = pd.date_range(start=df0['dt'].min(), end=df0['dt'].max(), freq="D")

In [None]:

# Scratch check of the look-back windows for a single reference date.
# `indate` was never defined at notebook level, so this cell raised NameError
# on a fresh kernel; the last available day is used as the reference date.
indate = range_day[-1]

# Daily window: the 7 days that precede `indate`.
start_period_day = indate - pd.offsets.DateOffset(7)
end_period_day   = indate - pd.offsets.DateOffset(1)

# Monthly window: rolls back three month starts from `indate`.
start_period_month = indate - pd.offsets.MonthBegin(3)
# NOTE(review): MonthEnd(0) rolls *forward* to the end of the current month
# (see the recorded output), i.e. past `indate` - confirm this is intended.
end_period_month   = indate - pd.offsets.MonthEnd(0)
start_period_month, end_period_month

(Timestamp('2022-03-01 00:00:00', freq='D'),
 Timestamp('2022-05-31 00:00:00', freq='D'))

In [None]:
def get_features(df: pd.DataFrame, indate: pd.Timestamp) -> pd.DataFrame:
    """Build one row of model features per base for the target day ``indate``.

    Parameters
    ----------
    df : pd.DataFrame
        Wide table of daily counts: one row per base, one column per date,
        columns sorted in ascending date order.
    indate : pd.Timestamp
        Target day. It must be one of the columns of ``df``.

    Returns
    -------
    pd.DataFrame
        Indexed like ``df`` with the columns ``date``, ``month``, ``day``,
        ``weekday``, ``target``, ``day-7`` .. ``day-1`` (counts on the 7 days
        before ``indate``) and ``day2_m-7`` .. ``day2_m-1`` (mean of each of
        those days and the column preceding it).

    Raises
    ------
    KeyError
        If ``indate`` is not a column of ``df``.
    ValueError
        If ``df`` does not contain exactly 7 columns in the look-back window.
    """
    n_lags = 7
    start_period_day = indate - pd.offsets.DateOffset(n_lags)
    end_period_day = indate - pd.offsets.DateOffset(1)

    # History up to (and including) the day before the target day.
    df_day = df.loc[:, :end_period_day]

    features = pd.DataFrame([], index=df_day.index)

    features["date"] = indate
    # NOTE(review): month / day / weekday are taken from the first day of the
    # look-back window (7 days before `indate`), not from `indate` itself.
    # Left unchanged because a trained model depends on it - confirm intent.
    features["month"] = start_period_day.month
    features["day"] = start_period_day.day
    features["weekday"] = start_period_day.weekday() + 1
    features["target"] = df.loc[:, indate]

    window = df_day.loc[:, start_period_day:end_period_day]
    if window.shape[1] != n_lags:
        raise ValueError(
            f"expected {n_lags} daily columns between {start_period_day} and "
            f"{end_period_day}, found {window.shape[1]}"
        )
    features[[f"day-{i}" for i in range(n_lags, 0, -1)]] = window.copy()

    # Two-column rolling mean. Only the window plus the single column before it
    # can influence the result, so the rest of the history is not processed.
    # Rolling runs on the transposed frame because `rolling(axis=1)` is
    # deprecated in recent pandas.
    first_pos = int((df_day.columns < start_period_day).sum())
    lookback = df_day.iloc[:, max(first_pos - 1, 0):]
    rolling_mean = lookback.T.rolling(2, min_periods=1).mean().T
    features[[f"day2_m-{i}" for i in range(n_lags, 0, -1)]] = rolling_mean.loc[
        :, start_period_day:end_period_day
    ]

    return features

In [None]:
# Chronological split of the daily calendar by position:
#   train      - from the 31st calendar day up to 181 days before the end,
#   validation - the following 90 days,
#   test       - the last 90 days, extended 2 days past the last day of data.
tr_range = pd.date_range(start=range_day[30], end=range_day[-181], freq="D")
val_range = pd.date_range(start=range_day[-180], end=range_day[-91], freq="D")
ts_range = pd.date_range(
    start=range_day[-90],
    end=range_day[-1] + pd.offsets.DateOffset(2),
    freq="D",
)
tr_range

DatetimeIndex(['2020-01-31', '2020-02-01', '2020-02-02', '2020-02-03',
               '2020-02-04', '2020-02-05', '2020-02-06', '2020-02-07',
               '2020-02-08', '2020-02-09',
               ...
               '2021-11-02', '2021-11-03', '2021-11-04', '2021-11-05',
               '2021-11-06', '2021-11-07', '2021-11-08', '2021-11-09',
               '2021-11-10', '2021-11-11'],
              dtype='datetime64[ns]', length=651, freq='D')

In [None]:
# Assemble the feature table of every split: one `get_features` frame per
# target date, stacked into a single DataFrame per split.
full_features = {}

for dataset, dataset_range in zip(["tr", "val", "ts"], [tr_range, val_range, ts_range]):
    dataset_features = []
    skipped_dates = []
    for target_date in dataset_range:
        try:
            features = get_features(group_day, target_date)
        except (KeyError, ValueError):
            # Expected failures only: the target date is not a column of
            # `group_day` (KeyError) or its 7-day look-back window is
            # incomplete (ValueError). Any other error is a real bug and is
            # allowed to surface instead of being silently swallowed.
            skipped_dates.append(target_date)
            continue
        dataset_features.append(features.reset_index())

    if skipped_dates:
        print(
            f"{dataset}: skipped {len(skipped_dates)} date(s) without complete data: "
            f"{[str(d.date()) for d in skipped_dates]}"
        )

    full_features[dataset] = pd.concat(dataset_features, ignore_index=True)

In [None]:
# Columns CatBoost treats as categorical.
CAT_COLS = ['base', 'month', 'day', 'weekday']

# Model inputs as an ordered list. A set must not be used here: pandas 2.x
# rejects set indexers, and set iteration order is not stable between runs,
# which would make the feature order (and the fitted model) non-reproducible.
FTS_COLS = (
    CAT_COLS
    + [f'day-{i}' for i in range(1, 8)]
    + [f'day2_m-{i}' for i in range(1, 8)]
)

model = CatBoostRegressor(iterations=1000,
                          early_stopping_rounds=30,
                          depth=6,
                          cat_features=CAT_COLS,
                          random_state=2022,
                          verbose=10)

# The validation split is passed as the evaluation set during fitting.
model.fit(full_features["tr"][FTS_COLS], full_features["tr"]['target'],
          eval_set=(full_features["val"][FTS_COLS], full_features["val"]['target']))



Learning rate set to 0.083766
0:	learn: 6.0492155	test: 8.6630525	best: 8.6630525 (0)	total: 26.1ms	remaining: 26.1s
10:	learn: 3.5618767	test: 5.6465479	best: 5.6465479 (10)	total: 266ms	remaining: 23.9s
20:	learn: 2.8370242	test: 4.7300049	best: 4.7300049 (20)	total: 515ms	remaining: 24s
30:	learn: 2.6463279	test: 4.4792418	best: 4.4792418 (30)	total: 756ms	remaining: 23.6s
40:	learn: 2.5780079	test: 4.4265364	best: 4.4130616 (37)	total: 1.01s	remaining: 23.6s
50:	learn: 2.5402448	test: 4.3945992	best: 4.3945992 (50)	total: 1.23s	remaining: 22.9s
60:	learn: 2.5113098	test: 4.3735844	best: 4.3735844 (60)	total: 1.46s	remaining: 22.5s
70:	learn: 2.4889694	test: 4.3694618	best: 4.3694618 (70)	total: 1.72s	remaining: 22.6s
80:	learn: 2.4709863	test: 4.3588428	best: 4.3588428 (80)	total: 1.97s	remaining: 22.3s
90:	learn: 2.4548361	test: 4.3658184	best: 4.3565108 (85)	total: 2.15s	remaining: 21.5s
100:	learn: 2.4377197	test: 4.3734266	best: 4.3565108 (85)	total: 2.38s	remaining: 21.2s
110:

<catboost.core.CatBoostRegressor at 0x7f66a88b4210>

In [None]:
# Write the fitted model to 'model.gbm' in the current working directory.
model.save_model('model.gbm')    # extension not required.

# Rebind `model` to a fresh regressor whose state is loaded from the saved file.
# NOTE(review): CatBoostClassifier is imported here but not used in this cell.
from catboost import CatBoostClassifier
model = CatBoostRegressor()      # parameters not required.
model.load_model('model.gbm')

<catboost.core.CatBoostRegressor at 0x7f66a8882990>

In [None]:
# Predictions of the fitted model for the train, validation and test splits.
tr_preds, val_preds, ts_preds = (
    model.predict(full_features[split][FTS_COLS])
    for split in ("tr", "val", "ts")
)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Split to score: one of the keys of `full_features`.
col = 'ts'

# Coefficient of determination of the model's predictions on that split.
y_true = full_features[col]["target"]
y_pred = model.predict(full_features[col][FTS_COLS])
r2_score(y_true=y_true, y_pred=y_pred)



0.8010439761105692

In [None]:
# Display the assembled feature table of the test split.
full_features["ts"]

Unnamed: 0,base,date,month,day,weekday,target,day-7,day-6,day-5,day-4,day-3,day-2,day-1,day2_m-7,day2_m-6,day2_m-5,day2_m-4,day2_m-3,day2_m-2,day2_m-1
0,Ардатовская ЦРБ,2022-02-10,2,3,4,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Арзамасская РБ,2022-02-10,2,3,4,11,13,5,8,14,9,11,14,11.0,9.0,6.5,11.0,11.5,10.0,12.5
2,БСМП г. Арзамас,2022-02-10,2,3,4,4,2,8,9,4,7,12,13,4.5,5.0,8.5,6.5,5.5,9.5,12.5
3,БСМП г. Дзержинск Центральная,2022-02-10,2,3,4,10,17,2,19,12,22,4,4,14.5,9.5,10.5,15.5,17.0,13.0,4.0
4,БСМП г. Дзержинск Юго-Западная,2022-02-10,2,3,4,19,5,24,7,13,12,30,7,11.0,14.5,15.5,10.0,12.5,21.0,18.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2698,Тонкинская ЦРБ,2022-05-10,5,3,2,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2699,Уренская ЦРБ,2022-05-10,5,3,2,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2700,Чкаловская ЦРБ,2022-05-10,5,3,2,0,6,2,0,3,3,0,0,5.5,4.0,1.0,1.5,3.0,1.5,0.0
2701,Шатковская ЦРБ,2022-05-10,5,3,2,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Work on a copy: assigning `pred` / `dif` to the original frame would silently
# add those columns to full_features["ts"], which other cells also read.
ans = full_features["ts"].copy()
ans['pred'] = model.predict(ans[FTS_COLS])
# Signed error: positive where the model under-predicts, negative where it
# over-predicts.
ans['dif'] = ans['target'] - ans['pred']
ans[['target', 'pred', 'dif']]

Unnamed: 0,target,pred,dif
0,0,0.039587,-0.039587
1,11,10.745539,0.254461
2,4,6.984923,-2.984923
3,10,8.620628,1.379372
4,19,14.278589,4.721411
...,...,...,...
2698,0,0.028143,-0.028143
2699,0,0.028143,-0.028143
2700,0,1.771572,-1.771572
2701,0,0.028143,-0.028143


In [None]:
# Total under-prediction (sum of positive errors) and total over-prediction
# (sum of negative errors) on the test split.
(
    ans.loc[ans['dif'] > 0, 'dif'].sum(),
    ans.loc[ans['dif'] < 0, 'dif'].sum(),
)

(2287.5885510680655, -2784.6173006623976)

In [None]:
# The same two totals expressed as a share of the summed true target.
(
    ans.loc[ans['dif'] > 0, 'dif'].sum() / ans['target'].sum(),
    ans.loc[ans['dif'] < 0, 'dif'].sum() / ans['target'].sum(),
)

(0.17671599467501473, -0.21511141758689822)