In [48]:
import os
from argparse import ArgumentParser
import logging
from typing import NoReturn
from collections import Counter
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn.model_selection as sk
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
import evaluation_scripts.eval_passengers_up as eval

TRAIN_BUS_CSV_PATH = "data/train_bus_schedule.csv"
X_PASSENGER = "data/X_passengers_up.csv"
X_TRIP = "data/X_trip_duration.csv"
ENCODER = "windows-1255"
RANDOM_STATE = 42

train_bus = pd.read_csv(TRAIN_BUS_CSV_PATH, encoding=ENCODER)
x_passenger = pd.read_csv(X_PASSENGER, encoding=ENCODER)
sample_size = 0.05  # 5% of the data
xgboost_sample_size = 0.15
baseline = train_bus.sample(frac=sample_size, random_state=RANDOM_STATE)
remaining_data = train_bus.drop(baseline.index)

xgboost_sample = remaining_data.sample(frac=xgboost_sample_size, random_state=RANDOM_STATE)
xgboost_X = xgboost_sample[x_passenger.columns]
xgboost_y = xgboost_sample["passengers_up"]
remaining_data = remaining_data.drop(xgboost_sample.index)

In [36]:
xgboost_X

Unnamed: 0,trip_id,part,trip_id_unique_station,trip_id_unique,line_id,direction,alternative,cluster,station_index,station_id,station_name,arrival_time,door_closing_time,arrival_is_estimated,latitude,longitude,passengers_continue,mekadem_nipuach_luz,passengers_continue_menupach
13984,310854,ב,310854b39,310854b,27087,1,#,חולון עירוני ומטרופוליני+תחרות חולון,39,36060,רוטשילד/הרצל,14:43:00,14:43:00,False,32.026660,34.742650,7,2.250000,15.750000
214847,314556,ג,314556c30,314556c,23076,2,0,אונו-אלעד,30,31674,"רוטשילד/קק""ל",09:02:00,09:02:00,False,32.080040,34.884193,16,2.000000,32.000000
114203,210684,ב,210684b29,210684b,27004,2,0,תל אביב,29,21446,עירוני ה'/בן יהודה,20:22:16,,True,32.095050,34.775550,4,0.545455,2.181818
104210,118336,א,118336a17,118336a,16024,2,#,שרון חולון מרחבי,17,26831,שדרות ביאליק/עירייה,17:56:00,17:57:00,False,32.144710,34.842830,7,2.800000,19.600000
116209,218076,א,218076a11,218076a,25005,1,0,תל אביב,11,25614,היכל התרבות/דיזנגוף,19:19:00,19:20:00,False,32.074360,34.778230,27,2.300000,62.100000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96739,314002,ג,314002c12,314002c,21037,2,0,אונו-אלעד,12,31333,דרך העצמאות/יעקב בר סימנטוב,14:58:00,14:58:00,False,32.028145,34.884773,16,3.000000,48.000000
52865,415530,א,415530a8,415530a,13067,2,0,מזרחי-רמת גן,8,21319,הירדן/ אלוף שדה,18:38:00,18:39:00,False,32.059310,34.828290,16,2.500000,40.000000
190748,313824,א,313824a26,313824a,18019,1,0,"דרומי-ראשל""צ-חולון",26,36172,"כ""ט בנובמבר/אלי כהן",14:04:00,14:04:00,False,32.009040,34.757060,25,2.125000,53.125000
118327,311414,א,311414a21,311414a,12172,2,0,"דרומי-ראשל""צ-חולון",21,36387,ההסתדרות/מוהליבר,07:11:00,07:11:00,False,32.016490,34.782760,33,3.000000,99.000000


In [37]:
# Save the trip_id_unique_station column
trip_id_unique_station = xgboost_X["trip_id_unique_station"].copy()

# Convert to datetime
X = xgboost_X.copy()  # To avoid SettingWithCopyWarning
X.loc[:, 'door_closing_time'] = pd.to_datetime(X['door_closing_time'])
X.loc[:, 'arrival_time'] = pd.to_datetime(X['arrival_time'])

In [39]:
X

Unnamed: 0,trip_id,part,trip_id_unique_station,trip_id_unique,line_id,direction,alternative,cluster,station_index,station_id,station_name,arrival_time,door_closing_time,arrival_is_estimated,latitude,longitude,passengers_continue,mekadem_nipuach_luz,passengers_continue_menupach
13984,310854,ב,310854b39,310854b,27087,1,#,חולון עירוני ומטרופוליני+תחרות חולון,39,36060,רוטשילד/הרצל,2024-07-04 14:43:00,2024-07-04 14:43:00,False,32.026660,34.742650,7,2.250000,15.750000
214847,314556,ג,314556c30,314556c,23076,2,0,אונו-אלעד,30,31674,"רוטשילד/קק""ל",2024-07-04 09:02:00,2024-07-04 09:02:00,False,32.080040,34.884193,16,2.000000,32.000000
114203,210684,ב,210684b29,210684b,27004,2,0,תל אביב,29,21446,עירוני ה'/בן יהודה,2024-07-04 20:22:16,NaT,True,32.095050,34.775550,4,0.545455,2.181818
104210,118336,א,118336a17,118336a,16024,2,#,שרון חולון מרחבי,17,26831,שדרות ביאליק/עירייה,2024-07-04 17:56:00,2024-07-04 17:57:00,False,32.144710,34.842830,7,2.800000,19.600000
116209,218076,א,218076a11,218076a,25005,1,0,תל אביב,11,25614,היכל התרבות/דיזנגוף,2024-07-04 19:19:00,2024-07-04 19:20:00,False,32.074360,34.778230,27,2.300000,62.100000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96739,314002,ג,314002c12,314002c,21037,2,0,אונו-אלעד,12,31333,דרך העצמאות/יעקב בר סימנטוב,2024-07-04 14:58:00,2024-07-04 14:58:00,False,32.028145,34.884773,16,3.000000,48.000000
52865,415530,א,415530a8,415530a,13067,2,0,מזרחי-רמת גן,8,21319,הירדן/ אלוף שדה,2024-07-04 18:38:00,2024-07-04 18:39:00,False,32.059310,34.828290,16,2.500000,40.000000
190748,313824,א,313824a26,313824a,18019,1,0,"דרומי-ראשל""צ-חולון",26,36172,"כ""ט בנובמבר/אלי כהן",2024-07-04 14:04:00,2024-07-04 14:04:00,False,32.009040,34.757060,25,2.125000,53.125000
118327,311414,א,311414a21,311414a,12172,2,0,"דרומי-ראשל""צ-חולון",21,36387,ההסתדרות/מוהליבר,2024-07-04 07:11:00,2024-07-04 07:11:00,False,32.016490,34.782760,33,3.000000,99.000000


In [40]:
# Create door delta columns
X["door_close_delta"] = None
mask_notna = X["door_closing_time"].notna()
X.loc[mask_notna, 'door_close_delta'] = (
        X.loc[mask_notna, 'door_closing_time'] - X.loc[mask_notna, 'arrival_time']
).dt.total_seconds()
door_delta_mean = X["door_close_delta"].mean()
X["door_close_delta"] = X["door_close_delta"].fillna(door_delta_mean)

In [41]:
X

Unnamed: 0,trip_id,part,trip_id_unique_station,trip_id_unique,line_id,direction,alternative,cluster,station_index,station_id,station_name,arrival_time,door_closing_time,arrival_is_estimated,latitude,longitude,passengers_continue,mekadem_nipuach_luz,passengers_continue_menupach,door_close_delta
13984,310854,ב,310854b39,310854b,27087,1,#,חולון עירוני ומטרופוליני+תחרות חולון,39,36060,רוטשילד/הרצל,2024-07-04 14:43:00,2024-07-04 14:43:00,False,32.026660,34.742650,7,2.250000,15.750000,0.000000
214847,314556,ג,314556c30,314556c,23076,2,0,אונו-אלעד,30,31674,"רוטשילד/קק""ל",2024-07-04 09:02:00,2024-07-04 09:02:00,False,32.080040,34.884193,16,2.000000,32.000000,0.000000
114203,210684,ב,210684b29,210684b,27004,2,0,תל אביב,29,21446,עירוני ה'/בן יהודה,2024-07-04 20:22:16,NaT,True,32.095050,34.775550,4,0.545455,2.181818,12.362385
104210,118336,א,118336a17,118336a,16024,2,#,שרון חולון מרחבי,17,26831,שדרות ביאליק/עירייה,2024-07-04 17:56:00,2024-07-04 17:57:00,False,32.144710,34.842830,7,2.800000,19.600000,60.000000
116209,218076,א,218076a11,218076a,25005,1,0,תל אביב,11,25614,היכל התרבות/דיזנגוף,2024-07-04 19:19:00,2024-07-04 19:20:00,False,32.074360,34.778230,27,2.300000,62.100000,60.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96739,314002,ג,314002c12,314002c,21037,2,0,אונו-אלעד,12,31333,דרך העצמאות/יעקב בר סימנטוב,2024-07-04 14:58:00,2024-07-04 14:58:00,False,32.028145,34.884773,16,3.000000,48.000000,0.000000
52865,415530,א,415530a8,415530a,13067,2,0,מזרחי-רמת גן,8,21319,הירדן/ אלוף שדה,2024-07-04 18:38:00,2024-07-04 18:39:00,False,32.059310,34.828290,16,2.500000,40.000000,60.000000
190748,313824,א,313824a26,313824a,18019,1,0,"דרומי-ראשל""צ-חולון",26,36172,"כ""ט בנובמבר/אלי כהן",2024-07-04 14:04:00,2024-07-04 14:04:00,False,32.009040,34.757060,25,2.125000,53.125000,0.000000
118327,311414,א,311414a21,311414a,12172,2,0,"דרומי-ראשל""צ-חולון",21,36387,ההסתדרות/מוהליבר,2024-07-04 07:11:00,2024-07-04 07:11:00,False,32.016490,34.782760,33,3.000000,99.000000,0.000000


In [44]:
# Categorize arrival time
arrival_hours = X['arrival_time'].dt.hour
percentiles = arrival_hours.describe(percentiles=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])
percentile_values = percentiles.loc[
    ['10%', '20%', '30%', '40%', '50%', '60%', '70%', '80%', '90%']
].values
labels = [f'{int(value)}' for value in percentile_values]
labels.insert(0, '0')
X['arrival_time_label'] = pd.cut(arrival_hours,
                                 bins=[0] + list(percentile_values) + [24],
                                 labels=labels,
                                 include_lowest=True)

Index(['trip_id', 'part', 'trip_id_unique_station', 'trip_id_unique',
       'line_id', 'direction', 'alternative', 'cluster', 'station_index',
       'station_id', 'station_name', 'arrival_time', 'door_closing_time',
       'arrival_is_estimated', 'latitude', 'longitude', 'passengers_continue',
       'mekadem_nipuach_luz', 'passengers_continue_menupach',
       'door_close_delta', 'arrival_time_label'],
      dtype='object')
Index(['trip_id', 'part', 'trip_id_unique_station', 'trip_id_unique',
       'line_id', 'direction', 'alternative', 'cluster', 'station_index',
       'station_id', 'station_name', 'arrival_time', 'door_closing_time',
       'arrival_is_estimated', 'latitude', 'longitude', 'passengers_continue',
       'mekadem_nipuach_luz', 'passengers_continue_menupach',
       'door_close_delta', 'arrival_time_label'],
      dtype='object')
Index(['trip_id', 'part', 'trip_id_unique_station', 'trip_id_unique',
       'line_id', 'direction', 'alternative', 'cluster', 'station_in

In [50]:
X

Unnamed: 0,trip_id,part,trip_id_unique_station,trip_id_unique,line_id,direction,alternative,cluster,station_index,station_id,...,arrival_time,door_closing_time,arrival_is_estimated,latitude,longitude,passengers_continue,mekadem_nipuach_luz,passengers_continue_menupach,door_close_delta,arrival_time_label
13984,310854,ב,310854b39,310854b,27087,1,#,חולון עירוני ומטרופוליני+תחרות חולון,39,36060,...,2024-07-04 14:43:00,2024-07-04 14:43:00,False,32.026660,34.742650,7,2.250000,15.750000,0.000000,13
214847,314556,ג,314556c30,314556c,23076,2,0,אונו-אלעד,30,31674,...,2024-07-04 09:02:00,2024-07-04 09:02:00,False,32.080040,34.884193,16,2.000000,32.000000,0.000000,8
114203,210684,ב,210684b29,210684b,27004,2,0,תל אביב,29,21446,...,2024-07-04 20:22:16,NaT,True,32.095050,34.775550,4,0.545455,2.181818,12.362385,18
104210,118336,א,118336a17,118336a,16024,2,#,שרון חולון מרחבי,17,26831,...,2024-07-04 17:56:00,2024-07-04 17:57:00,False,32.144710,34.842830,7,2.800000,19.600000,60.000000,16
116209,218076,א,218076a11,218076a,25005,1,0,תל אביב,11,25614,...,2024-07-04 19:19:00,2024-07-04 19:20:00,False,32.074360,34.778230,27,2.300000,62.100000,60.000000,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96739,314002,ג,314002c12,314002c,21037,2,0,אונו-אלעד,12,31333,...,2024-07-04 14:58:00,2024-07-04 14:58:00,False,32.028145,34.884773,16,3.000000,48.000000,0.000000,13
52865,415530,א,415530a8,415530a,13067,2,0,מזרחי-רמת גן,8,21319,...,2024-07-04 18:38:00,2024-07-04 18:39:00,False,32.059310,34.828290,16,2.500000,40.000000,60.000000,16
190748,313824,א,313824a26,313824a,18019,1,0,"דרומי-ראשל""צ-חולון",26,36172,...,2024-07-04 14:04:00,2024-07-04 14:04:00,False,32.009040,34.757060,25,2.125000,53.125000,0.000000,13
118327,311414,א,311414a21,311414a,12172,2,0,"דרומי-ראשל""צ-חולון",21,36387,...,2024-07-04 07:11:00,2024-07-04 07:11:00,False,32.016490,34.782760,33,3.000000,99.000000,0.000000,6
