In [44]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/db-regio-dataset/regular_travel_test.csv
/kaggle/input/db-regio-dataset/regular_route_definitions.csv
/kaggle/input/db-regio-dataset/regular_travel.csv
/kaggle/input/db-regio-dataset/wetter2019.csv
/kaggle/input/db-regio-dataset/bus_stops.csv
/kaggle/input/db-regio-dataset/regular_travel_raw.csv
/kaggle/input/db-regio-dataset/wdw_queries.csv
/kaggle/input/db-regio-dataset/on_demand_travel_raw.csv
/kaggle/input/db-regio-dataset/on_demand_travel_test.csv
/kaggle/input/db-regio-dataset/on_demand_travel.csv


# Training without Weekends / Weather features

In [1]:
from typing import Union

import numpy as np
import pandas as pd
from pandas.core.dtypes.common import is_datetime_or_timedelta_dtype
from sklearn.model_selection._split import _BaseKFold


class GroupedTimeSeriesSplit(_BaseKFold):
    """Time Series cross-validator

    Provides train/test indices to split time series data samples
    that are observed at fixed time intervals, in train/test sets.
    In each split, test indices must be higher than before, and thus shuffling
    in cross validator is inappropriate. Test splits are treated preferentially,
    i.e. if there are not enough samples the test split is filled first.
    This splitter works on a date range, which is not necessarily aligned to the
    available data if it has missing days.

    ----------

    train_window : int, default=21
        Maximum size for a single training set.

    test_window : int, default=7
        Used to set the size of the test set.

    train_gap : int, default=0
        Gap (in days) before the training set.

    test_gap : int, default=0
        Gap (in days) between the training and test set.
    """

    def __init__(self, train_window: int = 21, test_window=7, train_gap: int = 0, test_gap: int = 0):
        self.train_window = train_window
        self.test_window = test_window
        self.train_gap = train_gap
        self.test_gap = test_gap
        self.n_folds_ = None

    def split(self, X: pd.DataFrame, y, dates:Union[pd.Series, np.ndarray], *_):
        """Generate indices to split data into training and test set according to provided dates.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where `n_samples` is the number of samples
            and `n_features` is the number of features.

        y : array-like of shape (n_samples,)
            Always ignored, exists for compatibility.

        dates : array-like of shape (n_samples,)
            Dates of the samples, can be passed to sklearn via the `groups` parameter.

        Yields
        ------
        train : ndarray
            The training set indices for that split.

        test : ndarray
            The testing set indices for that split.
        """
        if not is_datetime_or_timedelta_dtype(dates):
            dates = pd.to_datetime(dates)

        start_date, end_date = dates.min(), dates.max()
        date_range = pd.date_range(start_date, end_date)
        n_dates = len(date_range)

        indices = np.arange(n_dates)
        train_starts = range(0, n_dates, self.train_window + self.test_window + self.train_gap + self.test_gap)

        self.n_folds_ = len(train_starts)

        for train_start in train_starts:
            avail_days = min(n_dates - train_start,
                             self.train_window + self.test_window + self.train_gap + self.test_gap)
            test_start = max(train_start, train_start + avail_days - self.test_window - self.train_gap)
            train_dates = date_range[indices[train_start: test_start - self.test_gap]]
            test_dates = date_range[test_start: train_start + avail_days - self.train_gap]
            train_indices = np.where(np.isin(dates, train_dates))[0]
            test_indices = np.where(np.isin(dates, test_dates))[0]
            if len(test_dates) < self.test_window:
                continue
            yield list(train_indices), list(test_indices)


In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, make_scorer
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

In [78]:

regular_travel = pd.read_csv('D:/Uni Passau/Data Science Lab/data-science-lab-team-04-main/experiments/data/db_regio/regular_travel.csv', parse_dates=['date'])

# first we remove the dummy test set rows
regular_travel_train = regular_travel.dropna(subset = ['Passengers'])

y = np.digitize(regular_travel_train.Passengers, bins=(1, 2, 3))
regular_travel_train["Passengers_bined"] = y
print(regular_travel.shape)
regular_travel = regular_travel.merge(regular_travel_train[["date", "hour", "EZone", "Passengers_bined"]], on=["date", "hour", "EZone"], how="left")
print(regular_travel.shape)
# regular_travel["Passengers"] = np.digitize(regular_travel.Passengers, bins=(1, 2, 3))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  regular_travel_train["Passengers_bined"] = y


(420000, 4)
(420000, 5)


In [None]:
# creating instance of labelencoder
labelencoder = LabelEncoder()
# Assigning numerical values and storing in another column
labelencoder = labelencoder.fit(regular_travel['EZone'])
regular_travel['EZone'] = labelencoder.transform(regular_travel['EZone'])


In [None]:
class_weights={0:1,1:3,2:6,3:4}
tscv = GroupedTimeSeriesSplit(train_window=21, test_window=7, train_gap = 0)
est = RandomForestClassifier(class_weight=class_weights)

In [None]:
majority_class=regular_travel[regular_travel['Passengers']==0]
len(majority_class)

In [None]:
predictions = []
for train_ind, test_ind in tscv.split(regular_travel, y=regular_travel.Passengers_bined, dates = regular_travel.date):
    est = est.fit(regular_travel[["hour", "EZone"]].iloc[train_ind], regular_travel.Passengers_bined.iloc[train_ind])
    prediction_df = regular_travel[["date", "hour", "EZone"]].iloc[test_ind].copy()
    prediction_df["Passengers_bined"] = est.predict(regular_travel[["hour", "EZone"]].iloc[test_ind])
    predictions.append(prediction_df)
    
predictions = pd.concat(predictions)
predictions.to_csv('regular_submission.csv', index=False)
predictions.head(

In [71]:
predictions["Passengers_bined"].value_counts()

0.0    79121
1.0    10682
3.0     9954
2.0     9443
Name: Passengers_bined, dtype: int64

In [72]:
# Decode EZone
predictions["EZone"] = labelencoder.inverse_transform(predictions["EZone"])
predictions["Passengers_bined"] = predictions["Passengers_bined"].astype(int)

In [73]:
regular_travel_test = pd.read_csv('D:/Uni Passau/Data Science Lab/data-science-lab-team-04-main/experiments/data/db_regio/regular_travel_test.csv', parse_dates=['date'])
regular_travel_test.head()

Unnamed: 0,date,EZone,hour,Passengers
0,2019-01-22,"15964 - Salzweg, Außenstelle LRA",0,0
1,2019-01-22,"15964 - Salzweg, Außenstelle LRA",1,0
2,2019-01-22,"15964 - Salzweg, Außenstelle LRA",2,0
3,2019-01-22,"15964 - Salzweg, Außenstelle LRA",3,0
4,2019-01-22,"15964 - Salzweg, Außenstelle LRA",4,0


In [74]:
regular_travel_test["Actual Passengers"] = np.digitize(regular_travel_test["Passengers"], bins=(1,2,3))
regular_travel_test.drop("Passengers", axis=1, inplace=True)

In [75]:
# merge predictions and test
merged_data = predictions.merge(regular_travel_test, on=["date", "EZone", "hour"], how="inner")
merged_data

Unnamed: 0,date,hour,EZone,Passengers_bined,Actual Passengers
0,2019-01-22,0,"15964 - Salzweg, Außenstelle LRA",0,0
1,2019-01-22,1,"15964 - Salzweg, Außenstelle LRA",0,0
2,2019-01-22,2,"15964 - Salzweg, Außenstelle LRA",0,0
3,2019-01-22,3,"15964 - Salzweg, Außenstelle LRA",0,0
4,2019-01-22,4,"15964 - Salzweg, Außenstelle LRA",0,0
...,...,...,...,...,...
109195,2019-12-16,19,"9750 - Passau, Reisebüro Niedermayer",1,0
109196,2019-12-16,20,"9750 - Passau, Reisebüro Niedermayer",0,0
109197,2019-12-16,21,"9750 - Passau, Reisebüro Niedermayer",0,0
109198,2019-12-16,22,"9750 - Passau, Reisebüro Niedermayer",0,0


In [76]:
f1_score(merged_data["Actual Passengers"], merged_data["Passengers_bined"], average='micro')

0.7247069597069598

In [77]:
from sklearn.metrics import classification_report
print(classification_report(merged_data["Actual Passengers"], merged_data["Passengers_bined"]))

              precision    recall  f1-score   support

           0       0.92      0.82      0.86     88868
           1       0.20      0.22      0.21      9726
           2       0.11      0.24      0.15      4397
           3       0.34      0.54      0.42      6209

    accuracy                           0.72    109200
   macro avg       0.39      0.45      0.41    109200
weighted avg       0.79      0.72      0.75    109200



# Training with Temperature feature
## Merge Weather data

In [13]:
df_weather = pd.read_csv('D:/Uni Passau/Data Science Lab/data-science-lab-team-04-main/experiments/data/db_regio/wetter2019.csv', parse_dates=['MESS_DATUM'])

In [14]:
df_weather

Unnamed: 0.1,Unnamed: 0,MESS_DATUM,TT_TU,RF_TU,RS_IND,R1,F
0,0,2019-01-01 00:00:00,3.2,100.0,1,0.3,2.0
1,1,2019-01-01 01:00:00,4.2,100.0,1,0.1,2.9
2,2,2019-01-01 02:00:00,4.4,100.0,1,0.1,3.3
3,3,2019-01-01 03:00:00,4.4,100.0,1,0.1,4.7
4,4,2019-01-01 04:00:00,4.2,100.0,1,0.0,4.4
...,...,...,...,...,...,...,...
8748,8748,2019-12-31 19:00:00,1.4,79.0,0,0.0,3.1
8749,8749,2019-12-31 20:00:00,0.1,85.0,0,0.0,3.4
8750,8750,2019-12-31 21:00:00,0.1,84.0,0,0.0,3.3
8751,8751,2019-12-31 22:00:00,-0.7,89.0,0,0.0,2.1


In [15]:
df_weather.drop("Unnamed: 0", axis=1, inplace=True)
df_weather["hour"] = df_weather["MESS_DATUM"].dt.hour
df_weather['date'] = df_weather["MESS_DATUM"].dt.date
df_weather.drop("MESS_DATUM", axis=1, inplace=True)
df_weather

Unnamed: 0,TT_TU,RF_TU,RS_IND,R1,F,hour,date
0,3.2,100.0,1,0.3,2.0,0,2019-01-01
1,4.2,100.0,1,0.1,2.9,1,2019-01-01
2,4.4,100.0,1,0.1,3.3,2,2019-01-01
3,4.4,100.0,1,0.1,4.7,3,2019-01-01
4,4.2,100.0,1,0.0,4.4,4,2019-01-01
...,...,...,...,...,...,...,...
8748,1.4,79.0,0,0.0,3.1,19,2019-12-31
8749,0.1,85.0,0,0.0,3.4,20,2019-12-31
8750,0.1,84.0,0,0.0,3.3,21,2019-12-31
8751,-0.7,89.0,0,0.0,2.1,22,2019-12-31


In [16]:
df_weather['date'] = pd.to_datetime(df_weather['date'])

In [38]:
regular_travel = regular_travel.merge(df_weather[["date", "hour","TT_TU"]], on=["date", "hour"], how="left")
regular_travel.head()

Unnamed: 0,date,EZone,hour,Passengers,Passengers_bined,TT_TU
0,2019-01-01,0,0,0.0,0.0,3.2
1,2019-01-01,0,1,0.0,0.0,4.2
2,2019-01-01,0,2,0.0,0.0,4.4
3,2019-01-01,0,3,0.0,0.0,4.4
4,2019-01-01,0,4,0.0,0.0,4.2


In [39]:
regular_travel

Unnamed: 0,date,EZone,hour,Passengers,Passengers_bined,TT_TU
0,2019-01-01,0,0,0.0,0.0,3.2
1,2019-01-01,0,1,0.0,0.0,4.2
2,2019-01-01,0,2,0.0,0.0,4.4
3,2019-01-01,0,3,0.0,0.0,4.4
4,2019-01-01,0,4,0.0,0.0,4.2
...,...,...,...,...,...,...
419995,2019-12-16,49,19,,,4.6
419996,2019-12-16,49,20,,,3.9
419997,2019-12-16,49,21,,,3.3
419998,2019-12-16,49,22,,,3.0


## Training

In [43]:
regular_travel.isna().sum()

date                     0
EZone                    0
hour                     0
Passengers          109200
Passengers_bined    109200
TT_TU                  350
dtype: int64

In [44]:
regular_travel[regular_travel['TT_TU'].isna()==True].index

Int64Index([68422, 68446, 68470, 68494, 68518, 68542, 68566, 68590, 68614,
            68638,
            ...
            70759, 70760, 70761, 70762, 70780, 70781, 70783, 70784, 70785,
            70786],
           dtype='int64', length=350)

In [21]:
regular_travel.index
    

Int64Index([     0,      1,      2,      3,      4,      5,      6,      7,
                 8,      9,
            ...
            419990, 419991, 419992, 419993, 419994, 419995, 419996, 419997,
            419998, 419999],
           dtype='int64', length=420000)

In [45]:
for i in regular_travel[regular_travel['TT_TU'].isna()==True].index:
    regular_travel['TT_TU'][i]=regular_travel['TT_TU'][i-24]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  regular_travel['TT_TU'][i]=regular_travel['TT_TU'][i-24]


In [46]:
regular_travel['TT_TU'].isna().sum()

0

In [47]:
regular_travel

Unnamed: 0,date,EZone,hour,Passengers,Passengers_bined,TT_TU
0,2019-01-01,0,0,0.0,0.0,3.2
1,2019-01-01,0,1,0.0,0.0,4.2
2,2019-01-01,0,2,0.0,0.0,4.4
3,2019-01-01,0,3,0.0,0.0,4.4
4,2019-01-01,0,4,0.0,0.0,4.2
...,...,...,...,...,...,...
419995,2019-12-16,49,19,,,4.6
419996,2019-12-16,49,20,,,3.9
419997,2019-12-16,49,21,,,3.3
419998,2019-12-16,49,22,,,3.0


In [49]:
predictions1 = []
for train_ind, test_ind in tscv.split(regular_travel, y=regular_travel.Passengers_bined, dates = regular_travel.date):
    est = est.fit(regular_travel[["hour", "EZone","TT_TU"]].iloc[train_ind], regular_travel.Passengers_bined.iloc[train_ind])
    prediction_df = regular_travel[["date", "hour", "EZone","TT_TU"]].iloc[test_ind].copy()
    prediction_df["Passengers_bined"] = est.predict(regular_travel[["hour", "EZone","TT_TU"]].iloc[test_ind])
    predictions1.append(prediction_df)
    
predictions1 = pd.concat(predictions1)
predictions1.to_csv('regular_submission1.csv', index=False)
predictions1.head()

Unnamed: 0,date,hour,EZone,TT_TU,Passengers_bined
25200,2019-01-22,0,0,-7.7,0.0
25201,2019-01-22,1,0,-7.3,0.0
25202,2019-01-22,2,0,-6.9,0.0
25203,2019-01-22,3,0,-7.2,0.0
25204,2019-01-22,4,0,-7.3,0.0


In [51]:
predictions1["Passengers_bined"].value_counts()

0.0    94549
1.0     6642
3.0     5269
2.0     2740
Name: Passengers_bined, dtype: int64

In [52]:
# Decode EZone
predictions1["EZone"] = labelencoder.inverse_transform(predictions1["EZone"])
predictions1["Passengers_bined"] = predictions1["Passengers_bined"].astype(int)

In [53]:
predictions1.head()

Unnamed: 0,date,hour,EZone,TT_TU,Passengers_bined
25200,2019-01-22,0,"15964 - Salzweg, Außenstelle LRA",-7.7,0
25201,2019-01-22,1,"15964 - Salzweg, Außenstelle LRA",-7.3,0
25202,2019-01-22,2,"15964 - Salzweg, Außenstelle LRA",-6.9,0
25203,2019-01-22,3,"15964 - Salzweg, Außenstelle LRA",-7.2,0
25204,2019-01-22,4,"15964 - Salzweg, Außenstelle LRA",-7.3,0


In [54]:
regular_travel_test.head()

Unnamed: 0,date,EZone,hour,Actual Passengers
0,2019-01-22,"15964 - Salzweg, Außenstelle LRA",0,0
1,2019-01-22,"15964 - Salzweg, Außenstelle LRA",1,0
2,2019-01-22,"15964 - Salzweg, Außenstelle LRA",2,0
3,2019-01-22,"15964 - Salzweg, Außenstelle LRA",3,0
4,2019-01-22,"15964 - Salzweg, Außenstelle LRA",4,0


In [55]:
# merge predictions and test
merged_data1 = predictions1.merge(regular_travel_test, on=["date", "EZone", "hour"], how="inner")
merged_data1

Unnamed: 0,date,hour,EZone,TT_TU,Passengers_bined,Actual Passengers
0,2019-01-22,0,"15964 - Salzweg, Außenstelle LRA",-7.7,0,0
1,2019-01-22,1,"15964 - Salzweg, Außenstelle LRA",-7.3,0,0
2,2019-01-22,2,"15964 - Salzweg, Außenstelle LRA",-6.9,0,0
3,2019-01-22,3,"15964 - Salzweg, Außenstelle LRA",-7.2,0,0
4,2019-01-22,4,"15964 - Salzweg, Außenstelle LRA",-7.3,0,0
...,...,...,...,...,...,...
109195,2019-12-16,19,"9750 - Passau, Reisebüro Niedermayer",4.6,0,0
109196,2019-12-16,20,"9750 - Passau, Reisebüro Niedermayer",3.9,0,0
109197,2019-12-16,21,"9750 - Passau, Reisebüro Niedermayer",3.3,0,0
109198,2019-12-16,22,"9750 - Passau, Reisebüro Niedermayer",3.0,0,0


In [61]:
print("Metrics with Temp:\n",classification_report(merged_data1["Actual Passengers"], merged_data1["Passengers_bined"]))

Metrics with Temp:
               precision    recall  f1-score   support

           0       0.86      0.91      0.88     88868
           1       0.18      0.13      0.15      9726
           2       0.10      0.06      0.08      4397
           3       0.35      0.30      0.32      6209

    accuracy                           0.77    109200
   macro avg       0.37      0.35      0.36    109200
weighted avg       0.74      0.77      0.75    109200



In [62]:
print("Metrics without Temp:\n",classification_report(merged_data["Actual Passengers"], merged_data["Passengers_bined"]))

Metrics without Temp:
               precision    recall  f1-score   support

           0       0.85      0.97      0.90     88868
           1       0.20      0.04      0.07      9726
           2       0.15      0.01      0.02      4397
           3       0.45      0.42      0.43      6209

    accuracy                           0.81    109200
   macro avg       0.41      0.36      0.36    109200
weighted avg       0.74      0.81      0.77    109200



# Training with weekend feature

Preprocessing

In [64]:
regular_travel

Unnamed: 0,date,EZone,hour,Passengers,Passengers_bined,TT_TU
0,2019-01-01,0,0,0.0,0.0,3.2
1,2019-01-01,0,1,0.0,0.0,4.2
2,2019-01-01,0,2,0.0,0.0,4.4
3,2019-01-01,0,3,0.0,0.0,4.4
4,2019-01-01,0,4,0.0,0.0,4.2
...,...,...,...,...,...,...
419995,2019-12-16,49,19,,,4.6
419996,2019-12-16,49,20,,,3.9
419997,2019-12-16,49,21,,,3.3
419998,2019-12-16,49,22,,,3.0


In [65]:
#drop Temp feature
regular_travel.drop(columns=['TT_TU'],inplace=True)
regular_travel.head()

Unnamed: 0,date,EZone,hour,Passengers,Passengers_bined
0,2019-01-01,0,0,0.0,0.0
1,2019-01-01,0,1,0.0,0.0
2,2019-01-01,0,2,0.0,0.0
3,2019-01-01,0,3,0.0,0.0
4,2019-01-01,0,4,0.0,0.0


In [66]:
#add is_weekend feature 
regular_travel['is_weekend']=regular_travel['date'].dt.dayofweek>4

In [67]:
regular_travel

Unnamed: 0,date,EZone,hour,Passengers,Passengers_bined,is_weekend
0,2019-01-01,0,0,0.0,0.0,False
1,2019-01-01,0,1,0.0,0.0,False
2,2019-01-01,0,2,0.0,0.0,False
3,2019-01-01,0,3,0.0,0.0,False
4,2019-01-01,0,4,0.0,0.0,False
...,...,...,...,...,...,...
419995,2019-12-16,49,19,,,False
419996,2019-12-16,49,20,,,False
419997,2019-12-16,49,21,,,False
419998,2019-12-16,49,22,,,False


In [77]:
labelencoder_wekend=LabelEncoder()
regular_travel['is_weekend']=labelencoder_wekend.fit_transform(regular_travel['is_weekend'])

In [78]:
regular_travel

Unnamed: 0,date,EZone,hour,Passengers,Passengers_bined,is_weekend
0,2019-01-01,0,0,0.0,0.0,0
1,2019-01-01,0,1,0.0,0.0,0
2,2019-01-01,0,2,0.0,0.0,0
3,2019-01-01,0,3,0.0,0.0,0
4,2019-01-01,0,4,0.0,0.0,0
...,...,...,...,...,...,...
419995,2019-12-16,49,19,,,0
419996,2019-12-16,49,20,,,0
419997,2019-12-16,49,21,,,0
419998,2019-12-16,49,22,,,0


# training

In [79]:
predictions2 = []
for train_ind, test_ind in tscv.split(regular_travel, y=regular_travel.Passengers_bined, dates = regular_travel.date):
    est = est.fit(regular_travel[["hour", "EZone","is_weekend"]].iloc[train_ind], regular_travel.Passengers_bined.iloc[train_ind])
    prediction_df = regular_travel[["date", "hour", "EZone","is_weekend"]].iloc[test_ind].copy()
    prediction_df["Passengers_bined"] = est.predict(regular_travel[["hour", "EZone","is_weekend"]].iloc[test_ind])
    predictions2.append(prediction_df)
    
predictions2 = pd.concat(predictions2)
predictions2.to_csv('regular_submission2.csv', index=False)
predictions2.head()

Unnamed: 0,date,hour,EZone,is_weekend,Passengers_bined
25200,2019-01-22,0,0,0,0.0
25201,2019-01-22,1,0,0,0.0
25202,2019-01-22,2,0,0,0.0
25203,2019-01-22,3,0,0,0.0
25204,2019-01-22,4,0,0,0.0


In [80]:
predictions2['Passengers_bined'].value_counts()

0.0    97117
3.0     6453
1.0     4511
2.0     1119
Name: Passengers_bined, dtype: int64

In [83]:
# Decode EZone
predictions2["EZone"] = labelencoder.inverse_transform(predictions2["EZone"])
predictions2["Passengers_bined"] = predictions2["Passengers_bined"].astype(int)
predictions2["is_weekend"] = labelencoder_wekend.inverse_transform(predictions2["is_weekend"])


In [84]:
predictions2.head()

Unnamed: 0,date,hour,EZone,is_weekend,Passengers_bined
25200,2019-01-22,0,"15964 - Salzweg, Außenstelle LRA",False,0
25201,2019-01-22,1,"15964 - Salzweg, Außenstelle LRA",False,0
25202,2019-01-22,2,"15964 - Salzweg, Außenstelle LRA",False,0
25203,2019-01-22,3,"15964 - Salzweg, Außenstelle LRA",False,0
25204,2019-01-22,4,"15964 - Salzweg, Außenstelle LRA",False,0


In [85]:
# merge predictions and test
merged_data2 = predictions2.merge(regular_travel_test, on=["date", "EZone", "hour"], how="inner")
merged_data2

Unnamed: 0,date,hour,EZone,is_weekend,Passengers_bined,Actual Passengers
0,2019-01-22,0,"15964 - Salzweg, Außenstelle LRA",False,0,0
1,2019-01-22,1,"15964 - Salzweg, Außenstelle LRA",False,0,0
2,2019-01-22,2,"15964 - Salzweg, Außenstelle LRA",False,0,0
3,2019-01-22,3,"15964 - Salzweg, Außenstelle LRA",False,0,0
4,2019-01-22,4,"15964 - Salzweg, Außenstelle LRA",False,0,0
...,...,...,...,...,...,...
109195,2019-12-16,19,"9750 - Passau, Reisebüro Niedermayer",False,1,0
109196,2019-12-16,20,"9750 - Passau, Reisebüro Niedermayer",False,0,0
109197,2019-12-16,21,"9750 - Passau, Reisebüro Niedermayer",False,0,0
109198,2019-12-16,22,"9750 - Passau, Reisebüro Niedermayer",False,0,0


In [86]:
print("Metrics with weekend feature:\n",classification_report(merged_data2["Actual Passengers"], merged_data2["Passengers_bined"]))

Metrics with weekend feature:
               precision    recall  f1-score   support

           0       0.87      0.95      0.91     88868
           1       0.24      0.11      0.15      9726
           2       0.15      0.04      0.06      4397
           3       0.47      0.49      0.48      6209

    accuracy                           0.82    109200
   macro avg       0.43      0.40      0.40    109200
weighted avg       0.76      0.82      0.78    109200



In [87]:
print("Metrics without weekend feature:\n",classification_report(merged_data["Actual Passengers"], merged_data["Passengers_bined"]))

Metrics without weekend feature:
               precision    recall  f1-score   support

           0       0.85      0.97      0.90     88868
           1       0.20      0.04      0.07      9726
           2       0.15      0.01      0.02      4397
           3       0.45      0.42      0.43      6209

    accuracy                           0.81    109200
   macro avg       0.41      0.36      0.36    109200
weighted avg       0.74      0.81      0.77    109200



# Training with holiday feature

In [88]:
regular_travel.head()

Unnamed: 0,date,EZone,hour,Passengers,Passengers_bined,is_weekend
0,2019-01-01,0,0,0.0,0.0,0
1,2019-01-01,0,1,0.0,0.0,0
2,2019-01-01,0,2,0.0,0.0,0
3,2019-01-01,0,3,0.0,0.0,0
4,2019-01-01,0,4,0.0,0.0,0


In [89]:
#drop is_weekend feature
regular_travel.drop(columns=['is_weekend'],inplace=True)

In [90]:
regular_travel.head()

Unnamed: 0,date,EZone,hour,Passengers,Passengers_bined
0,2019-01-01,0,0,0.0,0.0
1,2019-01-01,0,1,0.0,0.0
2,2019-01-01,0,2,0.0,0.0
3,2019-01-01,0,3,0.0,0.0
4,2019-01-01,0,4,0.0,0.0


In [91]:
select_dates=['2019-01-01','2019-01-06','2019-03-08','2019-04-19','2019-04-21','2019-04-22','2019-05-01','2019-05-30','2019-06-09','2019-06-10','2019-06-20','2019-08-15','2019-10-31','2019-10-03','2019-04-21','2019-11-01','2019-11-20','2019-12-25','2019-12-26']

In [95]:
#hold_reg_data=time_index_reg_data.loc[np.in1d(time_index_reg_data.index.date,pd.to_datetime(select_dates).date)]
#hold_reg_data.head()
regular_travel['is_holiday']=regular_travel['date'].isin(select_dates)    

In [97]:
regular_travel.head()

Unnamed: 0,date,EZone,hour,Passengers,Passengers_bined,is_holiday
0,2019-01-01,0,0,0.0,0.0,True
1,2019-01-01,0,1,0.0,0.0,True
2,2019-01-01,0,2,0.0,0.0,True
3,2019-01-01,0,3,0.0,0.0,True
4,2019-01-01,0,4,0.0,0.0,True


In [98]:
labelencoder_holiday=LabelEncoder()
regular_travel['is_holiday']=labelencoder_holiday.fit_transform(regular_travel['is_holiday'])
regular_travel.head()

Unnamed: 0,date,EZone,hour,Passengers,Passengers_bined,is_holiday
0,2019-01-01,0,0,0.0,0.0,1
1,2019-01-01,0,1,0.0,0.0,1
2,2019-01-01,0,2,0.0,0.0,1
3,2019-01-01,0,3,0.0,0.0,1
4,2019-01-01,0,4,0.0,0.0,1


In [99]:
predictions3 = []
for train_ind, test_ind in tscv.split(regular_travel, y=regular_travel.Passengers_bined, dates = regular_travel.date):
    est = est.fit(regular_travel[["hour", "EZone","is_holiday"]].iloc[train_ind], regular_travel.Passengers_bined.iloc[train_ind])
    prediction_df = regular_travel[["date", "hour", "EZone","is_holiday"]].iloc[test_ind].copy()
    prediction_df["Passengers_bined"] = est.predict(regular_travel[["hour", "EZone","is_holiday"]].iloc[test_ind])
    predictions3.append(prediction_df)
    
predictions3 = pd.concat(predictions3)
predictions3.to_csv('regular_submission3.csv', index=False)
predictions3.head()

Unnamed: 0,date,hour,EZone,is_holiday,Passengers_bined
25200,2019-01-22,0,0,0,0.0
25201,2019-01-22,1,0,0,0.0
25202,2019-01-22,2,0,0,0.0
25203,2019-01-22,3,0,0,0.0
25204,2019-01-22,4,0,0,0.0


In [105]:
predictions3['Passengers_bined'].value_counts()

0    100618
3      6062
1      2051
2       469
Name: Passengers_bined, dtype: int64

In [104]:
# Decode EZone
predictions3["EZone"] = labelencoder.inverse_transform(predictions3["EZone"])


ValueError: y contains previously unseen labels: ['15964 - Salzweg, Außenstelle LRA' '4000 - Passau, Hbf'
 '4001 - Passau, Kl. Exerzierplatz' '4010 - Passau, Am Schanzl(Busbucht)'
 '4016 - Passau, Römerplatz' '4028 - Passau, Nagelschmiedgasse'
 '4029 - Passau, Rathaus' '4030 - Passau, Am Schanzlturm'
 '4100 - Pocking, Bahnhof' '4111 - Pocking, Rathaus'
 '4120 - Bad Füssing, Füssinger Hof' '4146 - Ortenburg, Marktplatz'
 '4150 - Bad Griesbach, Stadtplatz'
 '4170 - Bad Griesbach-Therme,Parkhotel' '4200 - Vilshofen, Bahnhof'
 '4214 - Dommelstadl' '4220 - Neuhaus a. Inn, Kriegerdenkmal'
 '4225 - Hartkirchen, Mariensäule' '4306 - Bad Füssing, Kath. Kirche'
 '4380 - Fürstenzell, MVZ' '4382 - Fürstenzell, Marktplatz'
 '4471 - Aldersbach, Pfarrgarten' '4491 - Tiefenbach, R.-Töpfl-Str.'
 '4563 - Tittling, Marktplatz' '4607 - Ruderting, Buchbauer'
 '4614 - Neukirchen v.W., Rathaus' '4677 - Tittling, Eurospar'
 '4794 - Salzweg, Rathaus' '4805 - Straßkirchen, Gh. Girmindl'
 '4806 - Salzweg, Postgasse' '4807 - Angl, Bäckerei Vogl'
 '4809 - Kreuzstraße Tankstelle' '4810 - Büchlberg, Post'
 '4815 - Büchlberg, BBW' '4880 - Franklbach' '4881 - Schreiberfeld'
 '4883 - Hutthurm, Bayerwaldstraße' '5381 - Passau, Lindau'
 '5384 - Erlau, Ortsmitte' '5387 - Obernzell, Abzw Bf'
 '5388 - Obernzell, Marktplatz' '5396 - Untergriesbach, Marktplatz'
 '5425 - Breitenberg, Passauer-Str.' '5444 - Wegscheid, Rathaus'
 '5455 - Hauzenberg, Post' '5459 - Hauzenberg, Abzw Schulstr.'
 '6462 - Aidenbach, Marktplatz' '7601 - Pocking, Obi'
 '8766 - Thyrnau, Edlfurtner' '9750 - Passau, Reisebüro Niedermayer']

In [107]:
predictions3["Passengers_bined"] = predictions3["Passengers_bined"].astype(int)
predictions3["is_holiday"] = labelencoder_holiday.inverse_transform(predictions3["is_holiday"])


In [108]:
merged_data3 = predictions3.merge(regular_travel_test, on=["date", "EZone", "hour"], how="inner")
merged_data3

Unnamed: 0,date,hour,EZone,is_holiday,Passengers_bined,Actual Passengers
0,2019-01-22,0,"15964 - Salzweg, Außenstelle LRA",False,0,0
1,2019-01-22,1,"15964 - Salzweg, Außenstelle LRA",False,0,0
2,2019-01-22,2,"15964 - Salzweg, Außenstelle LRA",False,0,0
3,2019-01-22,3,"15964 - Salzweg, Außenstelle LRA",False,0,0
4,2019-01-22,4,"15964 - Salzweg, Außenstelle LRA",False,0,0
...,...,...,...,...,...,...
109195,2019-12-16,19,"9750 - Passau, Reisebüro Niedermayer",False,0,0
109196,2019-12-16,20,"9750 - Passau, Reisebüro Niedermayer",False,0,0
109197,2019-12-16,21,"9750 - Passau, Reisebüro Niedermayer",False,0,0
109198,2019-12-16,22,"9750 - Passau, Reisebüro Niedermayer",False,0,0


In [109]:
print("Metrics with holiday feature:\n",classification_report(merged_data3["Actual Passengers"], merged_data3["Passengers_bined"]))

Metrics with holiday feature:
               precision    recall  f1-score   support

           0       0.85      0.96      0.90     88868
           1       0.21      0.04      0.07      9726
           2       0.15      0.02      0.03      4397
           3       0.44      0.43      0.44      6209

    accuracy                           0.81    109200
   macro avg       0.41      0.36      0.36    109200
weighted avg       0.74      0.81      0.77    109200



In [110]:
print("Metrics without holiday feature:\n",classification_report(merged_data["Actual Passengers"], merged_data["Passengers_bined"]))

Metrics without holiday feature:
               precision    recall  f1-score   support

           0       0.85      0.97      0.90     88868
           1       0.20      0.04      0.07      9726
           2       0.15      0.01      0.02      4397
           3       0.45      0.42      0.43      6209

    accuracy                           0.81    109200
   macro avg       0.41      0.36      0.36    109200
weighted avg       0.74      0.81      0.77    109200

