In [1]:
import os
os.chdir(os.path.dirname(os.getcwd()))

import pandas as pd
import numpy as np

import datetime
from dateutil.relativedelta import relativedelta 

import random

from catboost import CatBoostClassifier

from tqdm.auto import tqdm

from utils import add_time_series_features, add_master_data_mappings, evaluate

tqdm.pandas()

seed_value = 42
random.seed(seed_value)
np.random.seed(seed_value)

In [2]:
data_path = "./data/fact_train_test.csv"
data = pd.read_csv(data_path, sep=";", decimal=",", encoding="windows-1251")
data

Unnamed: 0,period,rps,podrod,filial,client_sap_id,freight_id,sender_station_id,recipient_station_id,sender_organisation_id,real_weight,real_wagon_count
0,2012-07-01,1,5,1,328,1193,30252,13005,10036,71.0,1
1,2012-10-01,1,5,1,328,1193,30252,11376,10036,210.0,3
2,2014-03-01,0,1,1,328,3472,30252,29548,10036,67.0,1
3,2014-03-01,0,1,1,328,3472,30252,29158,10036,67.0,1
4,2014-03-01,0,2,1,328,3472,30252,27484,10036,66.0,1
...,...,...,...,...,...,...,...,...,...,...,...
3559227,2023-03-01,1,5,2,1346,1482,31438,31482,0,69.0,1
3559228,2023-03-01,1,5,2,1346,1492,31438,36091,27275,70.0,1
3559229,2023-03-01,1,5,2,1346,1492,31438,35450,27275,70.0,1
3559230,2023-03-01,1,5,2,1346,1492,31438,31482,0,207.0,3


### Preprocessing

In [3]:
data["period"] = pd.to_datetime(data["period"], format="%Y-%m-%d", errors='coerce')

In [4]:
date_min = data["period"].min()
date_max = data["period"].max()
data["months_from_start"] = (data["period"].dt.year - date_min.year) * 12 + (data["period"].dt.month - date_min.month)
data

data = data.drop(columns=["period"])

In [5]:
accuracy_granularity = [
    # "period",
    "rps",
    # "holding_name",
    # "sender_department_name",
    # "recipient_department_name",
    
    "client_sap_id",
    "freight_id",
    "sender_station_id",
    "recipient_station_id",
    "sender_organisation_id",
    "podrod",
    "filial",
]

In [6]:
data_grouped = data.groupby(accuracy_granularity).agg(list)
data_grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,real_weight,real_wagon_count,months_from_start
rps,client_sap_id,freight_id,sender_station_id,recipient_station_id,sender_organisation_id,podrod,filial,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,-1,0,2977,38567,7152,0,2,[31.0],[1],[83]
0,-1,0,3801,26930,7152,1,2,[90.0],[4],[76]
0,-1,0,3845,28207,8535,1,2,[4.0],[1],[74]
0,-1,0,4602,31057,0,0,2,[40.0],[2],[55]
0,-1,0,4871,25250,0,1,2,"[68.0, 68.0]","[1, 1]","[62, 63]"
...,...,...,...,...,...,...,...,...,...,...
1,2415,2997,13005,33625,10441,5,2,"[136.0, 138.0]","[2, 2]","[69, 71]"
1,2415,2997,13005,37870,10441,5,2,"[70.0, 137.0]","[1, 2]","[70, 71]"
1,2415,3471,32800,12142,7178,5,2,[67.0],[1],[80]
1,2415,3471,32800,12840,7178,5,2,[67.0],[1],[76]


In [7]:
not_grouped_indexes = list(data_grouped.columns)
not_grouped_indexes

['real_weight', 'real_wagon_count', 'months_from_start']

In [8]:
for months in data_grouped["months_from_start"].tolist():
    if len(set(months)) != len(months):
        raise

In [9]:
# data_grouped = data_grouped.reset_index()
# data_grouped

In [10]:
k_ns = 3

def negative_sampling(column):
    if column.name == "real_weight":
        column = [line + [0] * k_ns for line in column]
    elif column.name == "real_wagon_count":
        column = [line + [0] * k_ns for line in column]
    elif column.name == "months_from_start":
        column = [line + random.sample(list(set(range(54, 128 + 1)) - set(line)), k_ns) for line in column]
    return column

In [11]:
data_grouped[not_grouped_indexes] = data_grouped[not_grouped_indexes].progress_apply(negative_sampling)

  0%|          | 0/3 [00:00<?, ?it/s]

NameError: name 'data_with_ns' is not defined

In [None]:
data_with_ns = data_grouped.explode(not_grouped_indexes).reset_index()

In [None]:
data_with_ns["is_history"] = ((data_with_ns["real_weight"] != 0) | (data_with_ns["real_wagon_count"] != 0)).astype(int)

In [None]:
data_with_ns = add_master_data_mappings(data_with_ns)

In [None]:
data_with_ns

In [None]:
data_with_ns["period"] = (date_min + data_with_ns["months_from_start"].progress_apply(lambda x: relativedelta(months=x)))
data_with_ns

In [None]:
data_with_ns, date_features = add_time_series_features(data_with_ns)
data_with_ns = data_with_ns.drop(columns=["period_day", "period_seconds"])

In [None]:
data_with_ns = data_with_ns.fillna(-1)

In [None]:
data_with_ns["is_test"] = (data_with_ns["months_from_start"] >= 124).astype(int)
data_with_ns = data_with_ns[data_with_ns["months_from_start"] >= 54]
data_with_ns

### CatBoost

In [None]:
train = data_with_ns[data_with_ns["is_test"] == 0]
test = data_with_ns[data_with_ns["is_test"] == 1]

X_train = train.drop(columns=["real_weight", "real_wagon_count", "is_history"])
y_train = train["is_history"].tolist()
X_test = test.drop(columns=["real_weight", "real_wagon_count", "is_history"])
y_test = test["is_history"].tolist()

In [None]:
model = CatBoostClassifier(iterations=1000,
                           eval_metric="Accuracy",
                           use_best_model=True,
                           random_seed=42,
                           max_depth=2,
                           # learning_rate=0.1
)

model.fit(X_train,
          y_train,
          eval_set=(X_test, y_test),
          verbose=100)

In [None]:
forecast = X_test.copy()
forecast["forecast_wagon_count"] = model.predict(X_test)
forecast = forecast[forecast["forecast_wagon_count"] > 0]

fact = X_test.copy()
fact["real_wagon_count"] = y_test

In [23]:
model = CatBoostClassifier(iterations=120,
                           eval_metric="Accuracy",
                           # use_best_model=True,
                           random_seed=42,
                           max_depth=2,
                           learning_rate=0.273988
)

full_X = data_with_ns.drop(columns=["real_weight", "real_wagon_count", "is_history"])
full_y = data_with_ns["is_history"].tolist()

model.fit(full_X,
          full_y,
          verbose=50
)

0:	learn: 0.5599015	total: 169ms	remaining: 20.1s
50:	learn: 0.7335760	total: 8.37s	remaining: 11.3s
100:	learn: 0.7466654	total: 16.5s	remaining: 3.1s
119:	learn: 0.7489742	total: 19.4s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x173e7a850>

### Predict

In [24]:
data_grouped = data_grouped.drop(columns=["real_weight", "real_wagon_count"])
data_grouped["months_from_start"] = [[129, 130, 131, 132, 133] for _ in range(data_grouped.shape[0])]
data_grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,months_from_start
rps,client_sap_id,freight_id,sender_station_id,recipient_station_id,sender_organisation_id,podrod,filial,Unnamed: 8_level_1
0,-1,0,2977,38567,7152,0,2,"[129, 130, 131, 132, 133]"
0,-1,0,3801,26930,7152,1,2,"[129, 130, 131, 132, 133]"
0,-1,0,3845,28207,8535,1,2,"[129, 130, 131, 132, 133]"
0,-1,0,4602,31057,0,0,2,"[129, 130, 131, 132, 133]"
0,-1,0,4871,25250,0,1,2,"[129, 130, 131, 132, 133]"
...,...,...,...,...,...,...,...,...
1,2415,2997,13005,33625,10441,5,2,"[129, 130, 131, 132, 133]"
1,2415,2997,13005,37870,10441,5,2,"[129, 130, 131, 132, 133]"
1,2415,3471,32800,12142,7178,5,2,"[129, 130, 131, 132, 133]"
1,2415,3471,32800,12840,7178,5,2,"[129, 130, 131, 132, 133]"


In [25]:
data_future_months = data_grouped.explode(["months_from_start"]).reset_index()

In [27]:
data_future_months = add_master_data_mappings(data_future_months)

In [28]:
data_future_months["period"] = (date_min + data_future_months["months_from_start"].progress_apply(lambda x: relativedelta(months=x)))
data_future_months

  0%|          | 0/5127490 [00:00<?, ?it/s]

Unnamed: 0,rps,client_sap_id,freight_id,sender_station_id,recipient_station_id,sender_organisation_id,podrod,filial,months_from_start,holding_name,freight_group_name,sender_department_name,sender_railway_name,recipient_department_name,recipient_railway_name,period
0,0,-1,0,2977,38567,7152,0,2,129,,0,27,25,76,6,2023-04-01 00:00:00
1,0,-1,0,2977,38567,7152,0,2,130,,0,27,25,76,6,2023-05-01 00:00:00
2,0,-1,0,2977,38567,7152,0,2,131,,0,27,25,76,6,2023-06-01 00:00:00
3,0,-1,0,2977,38567,7152,0,2,132,,0,27,25,76,6,2023-07-01 00:00:00
4,0,-1,0,2977,38567,7152,0,2,133,,0,27,25,76,6,2023-08-01 00:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5127485,1,2416,1691,27096,27484,10579,5,2,129,1935.0,39,86,39,94,39,2023-04-01 00:00:00
5127486,1,2416,1691,27096,27484,10579,5,2,130,1935.0,39,86,39,94,39,2023-05-01 00:00:00
5127487,1,2416,1691,27096,27484,10579,5,2,131,1935.0,39,86,39,94,39,2023-06-01 00:00:00
5127488,1,2416,1691,27096,27484,10579,5,2,132,1935.0,39,86,39,94,39,2023-07-01 00:00:00


In [29]:
data_future_months, date_features = add_time_series_features(data_future_months)
data_future_months = data_future_months.drop(columns=["period_day", "period_seconds"])

In [None]:
data_future_months = data_future_months.fillna(-1)

In [None]:
# Make prediction by cbclf

In [None]:
# Set real_wagon count like last value 