# Libs, data importing

In [1]:
import bamt.Networks as Nets
import bamt.Preprocessors as pp

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from pgmpy.estimators import K2Score
import holidays

In [2]:
data_train = pd.read_csv('/home/jerzy/Documents/IndustrialML/data/train_preprocessed.csv')
data_train.drop('Unnamed: 0', axis=1, inplace=True)


In [3]:
data_test = pd.read_csv('/home/jerzy/Documents/IndustrialML/data/test.csv')

In [4]:
data_val = pd.read_csv('/home/jerzy/Documents/IndustrialML/data/valid.csv')
# drop row 0101000020E610000000000000000000000000000000000000
data_val = data_val[data_val['point'] != '0101000020E610000000000000000000000000000000000000']

In [5]:
data_unique_points = data_train['point'].unique()
test_unique_points = data_test['point'].unique()
valid_unique_points = data_val['point'].unique()

# Split data into year, month, day, hour and adding holidays, weekends, weekdays

Add datetime column to test and valid datasets

In [6]:
data_test['datetime'] = pd.to_datetime(data_test['hour'], unit='s')
data_val['datetime'] = pd.to_datetime(data_val['hour'], unit='s')
data_train['datetime'] = pd.to_datetime(data_train['time'])

In [7]:
data_train

Unnamed: 0,point,lon,lat,time,timestamp,num_posts,datetime
0,0101000020E610000002A5EC7AB31D3E4097654065F8EA...,30.116020,59.835705,2019-11-22 11:00:00,1574420400,1,2019-11-22 11:00:00
1,0101000020E610000002A5EC7AB31D3E4097654065F8EA...,30.116020,59.835705,2019-11-22 12:00:00,1574424000,1,2019-11-22 12:00:00
2,0101000020E610000002A5EC7AB31D3E4097654065F8EA...,30.116020,59.835705,2019-11-23 13:00:00,1574514000,1,2019-11-23 13:00:00
3,0101000020E610000002A5EC7AB31D3E4097654065F8EA...,30.116020,59.835705,2019-11-25 16:00:00,1574697600,1,2019-11-25 16:00:00
4,0101000020E610000002A5EC7AB31D3E4097654065F8EA...,30.116020,59.835705,2019-11-27 10:00:00,1574848800,1,2019-11-27 10:00:00
...,...,...,...,...,...,...,...
3625651,0101000020E6100000FF9D4C0EC3853E4094782B2D0DF3...,30.522508,59.898840,2019-12-06 04:00:00,3151209600,2,2019-12-06 04:00:00
3625652,0101000020E6100000FF9D4C0EC3853E4094782B2D0DF3...,30.522508,59.898840,2019-12-23 08:00:00,1577088000,1,2019-12-23 08:00:00
3625653,0101000020E6100000FF9D4C0EC3853E40ABD94A1972EF...,30.522508,59.870670,2019-06-10 09:00:00,1560157200,1,2019-06-10 09:00:00
3625654,0101000020E6100000FF9D4C0EC3853E40ABD94A1972EF...,30.522508,59.870670,2019-10-06 14:00:00,1570370400,1,2019-10-06 14:00:00


Add yyyy, mm, dd, hh to test and valid

In [8]:
data_train["hour"] = data_train["datetime"].dt.hour
data_train["weekday"] = data_train["datetime"].dt.weekday

data_test["dayhour"] = data_test["datetime"].dt.hour
data_test["weekday"] = data_test["datetime"].dt.weekday

data_val["dayhour"] = data_val["datetime"].dt.hour
data_val["weekday"] = data_val["datetime"].dt.weekday

Add holidays

In [9]:
holidays_russia = holidays.country_holidays('RU', years = [2019, 2020])

data_train['is_holiday'] = data_train['datetime'].apply(lambda x: 1 if x in holidays_russia else 0)
data_test['is_holiday'] = data_test['datetime'].apply(lambda x: 1 if x in holidays_russia else 0)
data_val['is_holiday'] = data_val['datetime'].apply(lambda x: 1 if x in holidays_russia else 0)

In [10]:
data_train = data_train.drop(['time'], axis=1)

# Bayesian Network learning

In [11]:
# rename train columns in order to match test and validation data

data_train = data_train.rename(columns={'timestamp': 'hour', 'num_posts': 'sum', 'hour': 'dayhour'})

In [12]:
cols = ['hour', 'dayhour', 'weekday', 'is_holiday', 'sum']
data_train_bn = data_train[cols]
data_train_bn[['weekday', 'is_holiday']] = data_train_bn[['weekday', 'is_holiday']].astype(str)
data_train_bn[['hour', 'dayhour', 'sum']] = data_train_bn[['hour', 'dayhour', 'sum']].astype(float)
data_train_bn.dtypes

hour          float64
dayhour       float64
weekday        object
is_holiday     object
sum           float64
dtype: object

In [13]:
encoder = preprocessing.LabelEncoder()
discretizer = preprocessing.KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile')

p = pp.Preprocessor([('encoder', encoder), ('discretizer', discretizer)])
discretized_data, est = p.apply(data_train_bn)

In [14]:
bn = Nets.HybridBN(has_logit=True, use_mixture=True) # init BN
info = p.info
info

{'types': {'hour': 'cont',
  'dayhour': 'cont',
  'weekday': 'disc',
  'is_holiday': 'disc',
  'sum': 'cont'},
 'signs': {'hour': 'pos', 'dayhour': 'pos', 'sum': 'pos'}}

In [15]:
params = {'init_nodes': ['hour', 'dayhour', 'weekday', 'is_holiday', 'sum'],
          'init_edges':[('hour', 'sum'), ('dayhour', 'sum'), ('is_holiday', 'sum'), ('weekday', 'sum')]}
bn.add_nodes(info)
bn.add_edges(discretized_data,  scoring_function=('K2',K2Score), params=params)

In [16]:
bn.get_info()

Unnamed: 0,name,node_type,data_type,parents,parents_types
0,hour,MixtureGaussian,cont,[],[]
1,dayhour,MixtureGaussian,cont,[],[]
2,weekday,Discrete,disc,[],[]
3,is_holiday,Discrete,disc,[],[]
4,sum,ConditionalMixtureGaussian,cont,"[hour, weekday, is_holiday]","[cont, disc, disc]"


In [17]:
bn.plot('bn_posts.html')

In [18]:
def custom_metric(y_true, y_pred):
    return abs(y_true - y_pred) / y_true

def flatten(l):
    return [item for sublist in l for item in sublist]

def bn_model_fit_predict(data_train: pd.DataFrame, data_test: pd.DataFrame, bn):

    data_test_bn_error = []
    test_unique_points = data_test['point'].unique()

    data_test[['weekday', 'is_holiday']] = data_test[['weekday', 'is_holiday']].astype(str)
    data_test[['hour', 'dayhour', 'sum']] = data_test[['hour', 'dayhour', 'sum']].astype(float)

    for point in test_unique_points:
        data_train_point = data_train[data_train['point'] == point]
        data_test_point = data_test[data_test['point'] == point]

        train = data_train_point[['hour', 'dayhour', 'weekday', 'is_holiday', 'sum']]
        X_test = data_test_point[['hour', 'dayhour', 'weekday', 'is_holiday']]
        y_test = data_test_point['sum']

        print(train)
        print(X_test)

        bn.fit_parameters(train)
        y_pred = np.round(bn.predict(X_test, 4))
        data_test_bn_error.append(custom_metric(y_test.astype(float), y_pred['sum'].astype(float)))
    
    return data_test_bn_error



In [19]:
data_train_bn['point'] = data_train['point']

data_test_bn_error = bn_model_fit_predict(data_train_bn, data_test, bn)
data_test_bn_error = flatten(data_test_bn_error)
data_test['bn_error'] = data_test_bn_error
data_test['bn_error'].mean()

                 hour  dayhour weekday is_holiday  sum
2193016  3.092602e+09      0.0       1          1  2.0
2193017  7.731522e+09      1.0       1          1  5.0
2193018  7.731558e+09      3.0       1          1  5.0
2193019  1.546315e+09      4.0       1          1  1.0
2193020  1.546319e+09      5.0       1          1  1.0
...               ...      ...     ...        ...  ...
2200197  3.160980e+09     17.0       4          0  2.0
2200198  1.106346e+10     18.0       4          0  7.0
2200199  3.160994e+09     19.0       4          0  2.0
2200200  7.902504e+09     20.0       4          0  5.0
2200201  4.741513e+09     21.0       4          0  3.0

[7186 rows x 5 columns]
             hour  dayhour weekday is_holiday
0    1.582711e+09     10.0       2          0
205  1.582103e+09      9.0       2          0
233  1.581001e+09     15.0       3          0
555  1.582369e+09     11.0       5          0
572  1.582488e+09     20.0       6          1
667  1.581016e+09     19.0       3     

100%|██████████| 6/6 [00:00<00:00, 80.58it/s]
100%|██████████| 1/1 [00:00<00:00, 638.11it/s]
100%|██████████| 1/1 [00:00<00:00, 609.02it/s]
100%|██████████| 1/1 [00:00<00:00, 805.67it/s]
100%|██████████| 1/1 [00:00<00:00, 666.19it/s]
100%|██████████| 1/1 [00:00<00:00, 898.14it/s]
100%|██████████| 1/1 [00:00<00:00, 700.10it/s]


TypeError: loop of ufunc does not support argument 0 of type dict which has no callable rint method