In [1]:
from catboost import CatBoostRegressor, Pool
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
from sklearn import metrics
import numpy as np
import pickle

pd.options.display.max_columns = 200

In [2]:
def read_file(path):
    """Load the weekly workbook and normalise the columns used downstream.

    The row at position 4 of the parsed sheet is used as the header, the rows
    above it (and the header row itself) are discarded, and the last column
    is dropped.  Five columns are then renamed *by position*, so the workbook
    layout must not change.

    Parameters
    ----------
    path : str or path-like
        Location of the Excel file, passed straight to ``pandas.read_excel``.

    Returns
    -------
    pandas.DataFrame
        The data rows with:

        * columns 3, 16, 33, 35 and 51 renamed to 'продажи',
          'заболеваемость', 'ТВ реклама, руб', 'интернет реклама, руб'
          and 'wordstat';
        * week number 53 replaced by 52 in 'неделя';
        * 'Начало нед' parsed to datetimes;
        * 'ТВ реклама, руб' converted to float, with blank cells set to 0.
    """
    raw = pd.read_excel(path)

    # Promote the embedded header row to column labels, then keep only the
    # data rows and every column except the last one.
    raw.columns = raw.iloc[4]
    # Explicit copy: the assignments below must operate on an independent
    # frame rather than on the result of a chained slice.
    df = raw.iloc[5:][raw.columns[:-1]].copy()

    columns = df.columns.tolist()

    columns[3] = 'продажи'
    columns[16] = 'заболеваемость'
    columns[33] = 'ТВ реклама, руб'
    columns[35] = 'интернет реклама, руб'
    columns[51] = 'wordstat'

    df.columns = columns

    # Fold week 53 into week 52.
    df.loc[df['неделя'] == 53, 'неделя'] = 52

    df['Начало нед'] = pd.to_datetime(df['Начало нед'])

    def _blank_to_zero(value):
        # Blank cells arrive as whitespace-only strings; treat them as "no spend".
        return 0 if isinstance(value, str) and not value.strip() else value

    # Map blanks explicitly instead of `.replace(' ', 0)`: the latter depends on
    # pandas' deprecated implicit downcasting of object columns and only
    # recognises a single-space string.
    df['ТВ реклама, руб'] = df['ТВ реклама, руб'].map(_blank_to_zero).astype(float)

    return df

# Load the training workbook. `read_file` already replaces week 53 with week 52,
# so the identical remap that used to be repeated here was a no-op and is dropped.
df = read_file('data/train.xlsx')

  df['ТВ реклама, руб'] = df['ТВ реклама, руб'].replace(' ', 0).astype(float)


In [3]:
def create_features(df):
    """Build the modelling table (target + lagged features) from the weekly frame.

    The lags are plain row shifts, so the rows are assumed to already be in
    chronological order, one row per week -- the function does not sort.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'продажи', 'заболеваемость', 'wordstat',
        'ТВ реклама, руб', 'интернет реклама, руб' and 'неделя'.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only ``target`` (sales divided by the previous
        row's sales) and the engineered features, renamed to their Russian
        display labels.  Rows whose lags reach before the first row hold NaN.
        The input frame is left unmodified.
    """
    # Work on a copy so the caller's frame keeps its original columns.
    df = df.copy()

    sales = df['продажи']

    # Target: ratio of this row's sales to the previous row's sales.
    df['target'] = sales / sales.shift(1)

    # Everything below except the week number is shifted by at least one row,
    # so a feature never uses the sales of the row it is predicting.
    df['feature_illnesses'] = df['заболеваемость'].shift(1)

    # Trailing 5- and 15-row means of search interest and advertising spend.
    rolling_sources = (
        ('wordstat', 'wordstat'),
        ('ТВ реклама, руб', 'adv'),
        ('интернет реклама, руб', 'internet_adv'),
    )
    for source_col, suffix in rolling_sources:
        lagged = df[source_col].shift(1)
        for window in (5, 15):
            df[f'feature_last_{window}_{suffix}'] = lagged.rolling(window=window).mean()

    df['feature_week'] = df['неделя']

    # Four-row sales change, measured 1, 5, 9 and 13 rows back.
    for i in range(1, 16, 4):
        df[f'feature_month_diff_{i}'] = sales.shift(i) / sales.shift(i + 4)

    # Sales change between the previous row and 7 / 13 / 20 rows back.
    for i in [7, 13, 20]:
        df[f'feature_big_diff_{i}_weeks'] = sales.shift(1) / sales.shift(i)

    # Keep only the engineered columns; `str()` guards against non-string labels.
    keep_cols = [col for col in df.columns if 'feature' in str(col) or 'target' in str(col)]

    feature_names = {
        'feature_illnesses': 'к-во больных',
        'feature_last_5_wordstat': 'wordstat за 5 недель',
        'feature_last_15_wordstat': 'wordstat за 15 недель',
        'feature_last_5_adv': 'тв реклама за 5 недель',
        'feature_last_15_adv': 'тв реклама за 15 недель',
        'feature_last_5_internet_adv': 'интернет реклама за 5 недель',
        'feature_last_15_internet_adv': 'интернет реклама за 15 недель',
        'feature_week': 'номер недели',
        'feature_month_diff_1': 'изменение продаж месяц назад',
        'feature_month_diff_5': 'изменение продаж 2 месяца назад',
        'feature_month_diff_9': 'изменение продаж 3 месяца назад',
        'feature_month_diff_13': 'изменение продаж 4 месяца назад',
        'feature_big_diff_7_weeks': 'изменение продаж за 2 месяца',
        'feature_big_diff_13_weeks': 'изменение продаж за 3 месяца',
        'feature_big_diff_20_weeks': 'изменение продаж за 4 месяца',
    }

    return df[keep_cols].rename(columns=feature_names)

# Replace the raw weekly frame with the modelling table (target + renamed features).
# NOTE(review): `df` is rebound to the reduced table, so this cell cannot be
# re-run on its own -- re-run the loading cell first.
df = create_features(df)

In [4]:
# Drop rows without a target, then skip the first 20 remaining rows: the longest
# lag built in create_features reaches 20 rows back, so the earliest rows carry
# incomplete features.  Finally renumber the rows from zero.
df = (
    df.dropna(subset=['target'])
    .iloc[20:]
    .reset_index(drop=True)
)

In [6]:
# Share of rows, taken from the end of the frame, that is held out for validation.
test_size = 0.4

all_models, all_metrics, all_preds = [], [], []

# Keep rows with a known target and a positive 5-week internet-ad average.
has_target = df['target'].notna()
has_internet_adv = df['интернет реклама за 5 недель'] > 0
cur_df = df[has_target & has_internet_adv]

# Ordered split: the leading rows train the model, the trailing rows validate it.
split_at = int(len(cur_df) * (1 - test_size))
train_data = cur_df.iloc[:split_at]
valid_data = cur_df.iloc[split_at:]

X_train = train_data.drop(columns=[name for name in train_data.columns if 'target' in name])
y_train = train_data['target']
X_valid = valid_data.drop(columns=[name for name in valid_data.columns if 'target' in name])
y_valid = valid_data['target']

model_params = dict(
    depth=2,
    learning_rate=0.03,
    iterations=3000,
    loss_function='MAPE',
    eval_metric='R2',
    custom_metric=['R2'],
    random_state=42,
    thread_count=5,
)
model = CatBoostRegressor(**model_params)

# NOTE(review): the recorded log shows the model being shrunk to its best
# validation iteration, and the R2 printed below equals that best validation
# score -- the same rows both pick the iteration and score the model, so the
# reported numbers are likely optimistic.  Confirm with a separate hold-out.
model.fit(X_train, y_train, eval_set=(X_valid, y_valid), verbose=1000)
all_models.append(model)

preds = model.predict(X_valid)
all_preds.append(preds)

r2_metric = metrics.r2_score(y_valid, preds)
mape_metric = metrics.mean_absolute_percentage_error(y_valid, preds)
all_metrics.append([r2_metric, mape_metric])

print(f'R2: {round(r2_metric, 5)}')
print(f'MAPE: {round(mape_metric, 5)}')

print()

0:	learn: -0.0165471	test: -0.0303781	best: -0.0303781 (0)	total: 54.3ms	remaining: 2m 42s
1000:	learn: 0.2505797	test: 0.1152500	best: 0.1666043 (232)	total: 156ms	remaining: 312ms
2000:	learn: 0.3026822	test: 0.0574917	best: 0.1666043 (232)	total: 260ms	remaining: 130ms
2999:	learn: 0.3272152	test: 0.0295856	best: 0.1666043 (232)	total: 363ms	remaining: 0us

bestTest = 0.1666043263
bestIteration = 232

Shrink model to first 233 iterations.
R2: 0.1666
MAPE: 0.05413



In [7]:
# Persist the fitted model to disk; a later section loads 'info_model.cbm' back
# for prediction.
model.save_model("info_model.cbm")

In [8]:
# Pair every training column with the model's importance score, highest first.
pd.DataFrame(
    dict(name=X_train.columns, imp=model.get_feature_importance())
).sort_values(by='imp', ascending=False)

Unnamed: 0,name,imp
2,wordstat за 15 недель,19.391677
5,интернет реклама за 5 недель,18.364273
14,изменение продаж за 4 месяца,10.773646
9,изменение продаж 2 месяца назад,9.562077
8,изменение продаж месяц назад,8.896219
1,wordstat за 5 недель,7.937431
0,к-во больных,4.82875
7,номер недели,4.582916
12,изменение продаж за 2 месяца,3.141997
13,изменение продаж за 3 месяца,2.707544


In [9]:
# Average each metric over every entry recorded in `all_metrics`
# (each entry is a [R2, MAPE] pair).
mean_r2 = np.mean([run_scores[0] for run_scores in all_metrics])
print(f'R2: {round(mean_r2, 5)}')
mean_mape = np.mean([run_scores[1] for run_scores in all_metrics])
print(f'MAPE: {round(mean_mape, 5)}')

R2: 0.1666
MAPE: 0.05413


R2: 0.42683
MAPE: 0.10619

-----

In [10]:
from catboost import CatBoostRegressor, Pool
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
from sklearn import metrics
import numpy as np
import pickle

pd.options.display.max_columns = 200

In [11]:
# Restore the regressor from 'info_model.cbm' (the file name written by the
# training section's save_model call).
model = CatBoostRegressor()
model.load_model('info_model.cbm')

<catboost.core.CatBoostRegressor at 0x17c91e1d0>

In [12]:
def read_file(path):
    """Load the weekly workbook and normalise the columns used downstream.

    The row at position 4 of the parsed sheet is used as the header, the rows
    above it (and the header row itself) are discarded, and the last column
    is dropped.  Five columns are then renamed *by position*, so the workbook
    layout must not change.

    Parameters
    ----------
    path : str or path-like
        Location of the Excel file, passed straight to ``pandas.read_excel``.

    Returns
    -------
    pandas.DataFrame
        The data rows with:

        * columns 3, 16, 33, 35 and 51 renamed to 'продажи',
          'заболеваемость', 'ТВ реклама, руб', 'интернет реклама, руб'
          and 'wordstat';
        * week number 53 replaced by 52 in 'неделя';
        * 'Начало нед' parsed to datetimes (kept identical to the training
          notebook's loader);
        * 'ТВ реклама, руб' converted to float, with blank cells set to 0.
    """
    raw = pd.read_excel(path)

    # Promote the embedded header row to column labels, then keep only the
    # data rows and every column except the last one.
    raw.columns = raw.iloc[4]
    # Explicit copy: the assignments below must operate on an independent
    # frame rather than on the result of a chained slice.
    df = raw.iloc[5:][raw.columns[:-1]].copy()

    columns = df.columns.tolist()

    columns[3] = 'продажи'
    columns[16] = 'заболеваемость'
    columns[33] = 'ТВ реклама, руб'
    columns[35] = 'интернет реклама, руб'
    columns[51] = 'wordstat'

    df.columns = columns

    # Fold week 53 into week 52.
    df.loc[df['неделя'] == 53, 'неделя'] = 52

    df['Начало нед'] = pd.to_datetime(df['Начало нед'])

    def _blank_to_zero(value):
        # Blank cells arrive as whitespace-only strings; treat them as "no spend".
        return 0 if isinstance(value, str) and not value.strip() else value

    # Map blanks explicitly instead of `.replace(' ', 0)`: the latter depends on
    # pandas' deprecated implicit downcasting of object columns and only
    # recognises a single-space string.
    df['ТВ реклама, руб'] = df['ТВ реклама, руб'].map(_blank_to_zero).astype(float)

    return df

# Load the workbook to score. `read_file` already replaces week 53 with week 52,
# so the identical remap that used to be repeated here was a no-op and is dropped.
df = read_file('data/train.xlsx')

  df['ТВ реклама, руб'] = df['ТВ реклама, руб'].replace(' ', 0).astype(float)


In [13]:
def create_features(df):
    """Build the modelling table (target + lagged features) from the weekly frame.

    The lags are plain row shifts, so the rows are assumed to already be in
    chronological order, one row per week -- the function does not sort.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'продажи', 'заболеваемость', 'wordstat',
        'ТВ реклама, руб', 'интернет реклама, руб' and 'неделя'.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only ``target`` (sales divided by the previous
        row's sales) and the engineered features, renamed to their Russian
        display labels.  Rows whose lags reach before the first row hold NaN.
        The input frame is left unmodified.
    """
    # Work on a copy so the caller's frame keeps its original columns.
    df = df.copy()

    sales = df['продажи']

    # Target: ratio of this row's sales to the previous row's sales.
    df['target'] = sales / sales.shift(1)

    # Everything below except the week number is shifted by at least one row,
    # so a feature never uses the sales of the row it is predicting.
    df['feature_illnesses'] = df['заболеваемость'].shift(1)

    # Trailing 5- and 15-row means of search interest and advertising spend.
    rolling_sources = (
        ('wordstat', 'wordstat'),
        ('ТВ реклама, руб', 'adv'),
        ('интернет реклама, руб', 'internet_adv'),
    )
    for source_col, suffix in rolling_sources:
        lagged = df[source_col].shift(1)
        for window in (5, 15):
            df[f'feature_last_{window}_{suffix}'] = lagged.rolling(window=window).mean()

    df['feature_week'] = df['неделя']

    # Four-row sales change, measured 1, 5, 9 and 13 rows back.
    for i in range(1, 16, 4):
        df[f'feature_month_diff_{i}'] = sales.shift(i) / sales.shift(i + 4)

    # Sales change between the previous row and 7 / 13 / 20 rows back.
    for i in [7, 13, 20]:
        df[f'feature_big_diff_{i}_weeks'] = sales.shift(1) / sales.shift(i)

    # Keep only the engineered columns; `str()` guards against non-string labels.
    keep_cols = [col for col in df.columns if 'feature' in str(col) or 'target' in str(col)]

    feature_names = {
        'feature_illnesses': 'к-во больных',
        'feature_last_5_wordstat': 'wordstat за 5 недель',
        'feature_last_15_wordstat': 'wordstat за 15 недель',
        'feature_last_5_adv': 'тв реклама за 5 недель',
        'feature_last_15_adv': 'тв реклама за 15 недель',
        'feature_last_5_internet_adv': 'интернет реклама за 5 недель',
        'feature_last_15_internet_adv': 'интернет реклама за 15 недель',
        'feature_week': 'номер недели',
        'feature_month_diff_1': 'изменение продаж месяц назад',
        'feature_month_diff_5': 'изменение продаж 2 месяца назад',
        'feature_month_diff_9': 'изменение продаж 3 месяца назад',
        'feature_month_diff_13': 'изменение продаж 4 месяца назад',
        'feature_big_diff_7_weeks': 'изменение продаж за 2 месяца',
        'feature_big_diff_13_weeks': 'изменение продаж за 3 месяца',
        'feature_big_diff_20_weeks': 'изменение продаж за 4 месяца',
    }

    return df[keep_cols].rename(columns=feature_names)

# Build the modelling table and renumber its rows from zero in a single step.
df = create_features(df).reset_index(drop=True)

In [14]:
# Keep only fully populated rows.  `dropna()` removes any row with a missing
# value in any column, which already covers rows whose target is missing.
df = df.dropna()

In [15]:
# Score every remaining row.  Selecting `model.feature_names_` passes the columns
# in the order stored on the loaded model and leaves `target` out of the input.
# The predictions are sales ratios versus the previous row, matching the `target`
# defined in create_features.
model.predict(df[model.feature_names_])

array([1.02594427, 1.02671828, 1.02269647, 1.00462712, 1.00676511,
       1.00455851, 1.002644  , 1.01205664, 1.00754238, 1.0101105 ,
       1.01250213, 1.01882529, 1.02682883, 1.02372159, 1.02675534,
       1.03343361, 1.03661995, 1.0063366 , 0.97789853, 1.00470885,
       1.00568259, 1.031695  , 1.03966278, 1.04740535, 1.05225327,
       1.02644069, 1.01149522, 1.00412553, 0.99860107, 0.99537975,
       0.99310063, 0.99274977, 1.00524531, 1.00551772, 1.03498634,
       1.02521169, 1.02932401, 1.02153879, 0.97373162, 0.96878162,
       0.98832103, 1.04313752, 1.03167284, 1.01096905, 0.87337585,
       0.87074757, 0.86603803, 0.96868633, 0.97535719, 0.98528722,
       0.9992825 , 0.9786123 , 0.97576812, 0.9759422 , 1.00381779,
       1.00826757, 1.04799553, 1.016713  , 1.03409295, 1.03936388,
       1.032134  , 1.03959841, 1.02503798, 1.02484092, 1.01520938,
       1.01789751, 1.02131658, 1.05194639, 1.04733353, 1.00916092,
       0.97438845, 0.99309048, 1.00368636, 0.99956416, 0.99168