In [1]:
import pandas as pd 
import numpy as np 
import sklearn.model_selection as sk
from sklearn.preprocessing import LabelEncoder

In [2]:
TRAIN_BUS_CSV_PATH = "data/train_bus_schedule.csv"
X_PASSENGER = "data/X_passengers_up.csv"
X_TRIP = "data/X_trip_duration.csv"
ENCODER = "windows-1255"
RANDOM_STATE = 42

In [3]:
train_bus = pd.read_csv(TRAIN_BUS_CSV_PATH, encoding=ENCODER)
x_passenger = pd.read_csv(X_PASSENGER, encoding=ENCODER)
x_trip_duration = pd.read_csv(X_TRIP, encoding=ENCODER)

data splitting

In [4]:
y = train_bus["passengers_up"]

In [5]:

sample_size = 0.05  # 5% of the data
baseline = train_bus.sample(frac=sample_size, random_state=RANDOM_STATE)
remaining_data = train_bus.drop(baseline.index)

base line - linearregression

In [6]:
x_base_line = baseline[x_passenger.columns]
y_base_line = baseline["passengers_up"]

pre process data

In [None]:
import pandas as pd


columns_with_nan = [col for col in x_base_line.columns if x_base_line[col].isna().any()]
print(columns_with_nan)


In [None]:


# Assuming x_base_line is your DataFrame
# Convert relevant columns to datetime if needed
x_base_line['door_closing_time'] = pd.to_datetime(x_base_line['door_closing_time'])
x_base_line['arrival_time'] = pd.to_datetime(x_base_line['arrival_time'])


# If you need to convert the result to seconds or another format, you can further process it


In [None]:
x_base_line["door_close_delta"] = None
x_base_line.loc[x_base_line["door_closing_time"].notna(),['door_close_delta']] =(x_base_line.loc[x_base_line["door_closing_time"].notna(),'door_closing_time'] - x_base_line.loc[x_base_line["door_closing_time"].notna(),'arrival_time']).dt.total_seconds()



In [None]:
door_delta_mean = x_base_line["door_close_delta"].mean()
x_base_line["door_close_delta"] = x_base_line["door_close_delta"].fillna(door_delta_mean)

In [None]:
x_base_line['arrival_time'].dt.hour.describe(percentiles=(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9))

In [None]:


x_base_line['arrival_time'] = pd.to_datetime(x_base_line['arrival_time'])
arrival_hours = x_base_line['arrival_time'].dt.hour
percentiles = arrival_hours.describe(percentiles=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])
percentile_values = percentiles.loc[['10%', '20%', '30%', '40%', '50%', '60%', '70%', '80%', '90%']].values
labels = []
previous = 0
for value in percentile_values:
    labels.append(f'{int(previous)}-{int(value)}')
    previous = value
labels.append(f'{int(percentile_values[-1])}-24')
x_base_line['arrival_time_label'] = pd.cut(arrival_hours, 
                                           bins=[0] + list(percentile_values) + [24], 
                                           labels=labels, 
                                           include_lowest=True)



In [None]:

# Convert arrival_time to datetime
x_base_line['arrival_time'] = pd.to_datetime(x_base_line['arrival_time'])

# Extract the hour from 'arrival_time' for percentile calculation
arrival_hours = x_base_line['arrival_time'].dt.hour

# Calculate the percentiles
percentiles = arrival_hours.describe(percentiles=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])
percentile_values = percentiles.loc[['10%', '20%', '30%', '40%', '50%', '60%', '70%', '80%', '90%']].values

# Create labels for the bins based on lower bounds
labels = [f'{int(value)}' for value in percentile_values]
labels.insert(0, '0')

# Use pd.cut to categorize arrival hours into bins based on the calculated percentiles
x_base_line['arrival_time_label'] = pd.cut(arrival_hours, 
                                           bins=[0] + list(percentile_values) + [24], 
                                           labels=labels, 
                                           include_lowest=True)

# Now x_base_line['arrival_time_label'] contains the categorized labels based on lower bounds
print(x_base_line[['arrival_time', 'arrival_time_label']])


In [None]:
# Label Encoding
label_encoder = LabelEncoder()
x_base_line['part_encoded'] = label_encoder.fit_transform(x_base_line['part'])
x_base_line['alternative_encoded'] = label_encoder.fit_transform(x_base_line['alternative'])

In [None]:
del x_base_line["arrival_time"]
del x_base_line["door_closing_time"]
del x_base_line["cluster"]
del x_base_line["station_name"]
del x_base_line["part"]
del x_base_line["trip_id_unique"]
del x_base_line["trip_id_unique_station"]
del x_base_line["alternative"]

spliting

In [None]:
X_train,X_test,y_train,y_test = sk.train_test_split(x_base_line,y_base_line,test_size=0.25,random_state=RANDOM_STATE)

training

In [None]:
for col in X_train.columns:
    try:
        X_train[col].astype(float)
    except Exception as err:
        print(col)
        print (err)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Initialize and train the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate performance metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [None]:
mse

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

model_dt = DecisionTreeRegressor(random_state=RANDOM_STATE)
model_dt.fit(X_train, y_train)

# Predict on the test set
y_pred_dt = model_dt.predict(X_test)

# Calculate performance metrics
mse_dt = mean_squared_error(y_test, y_pred_dt)
r2_dt = r2_score(y_test, y_pred_dt)

print('Decision Tree Regression')
print(f'Mean Squared Error: {mse_dt}')
print(f'R^2 Score: {r2_dt}')
print(f'Predictions: {y_pred_dt}')
print(f'Actual values: {y_test.values}')


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Sample DataFrames (replace these with your actual DataFrames)
# Assuming X_train, X_test, y_train, y_test are already defined

# Range of polynomial degrees to test
degrees = range(1, 4)  # Testing degrees 1 to 5

# Initialize lists to store MSE values for each degree
mse_values = []

# Iterate over each degree
for degree in degrees:
    # Generate polynomial features
    poly = PolynomialFeatures(degree=degree)
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.transform(X_test)
    
    # Initialize and train the Polynomial Regression model
    model_poly = LinearRegression()
    model_poly.fit(X_train_poly, y_train)
    
    # Predict on the test set
    y_pred_poly = model_poly.predict(X_test_poly)
    
    # Calculate MSE
    mse_poly = mean_squared_error(y_test, y_pred_poly)
    
    # Append MSE to list
    mse_values.append(mse_poly)
    
    # Print MSE for each degree


# duration 

In [12]:
train_bus.columns

Index(['trip_id', 'part', 'trip_id_unique_station', 'trip_id_unique',
       'line_id', 'direction', 'alternative', 'cluster', 'station_index',
       'station_id', 'station_name', 'arrival_time', 'door_closing_time',
       'arrival_is_estimated', 'latitude', 'longitude', 'passengers_up',
       'passengers_continue', 'mekadem_nipuach_luz',
       'passengers_continue_menupach'],
      dtype='object')

In [79]:
lines_for_baseline = train_bus["trip_id_unique"].drop_duplicates().sample(frac = 0.05,random_state= RANDOM_STATE)

In [83]:
dur_baseline  = train_bus[train_bus["trip_id_unique"].isin(lines_for_baseline)]

In [69]:
dur_baseline.groupby("trip_id_unique")["trip_id_unique_station"].nunique().describe()

count    5793.000000
mean       39.031762
std        10.133366
min         7.000000
25%        32.000000
50%        39.000000
75%        46.000000
max        71.000000
Name: trip_id_unique_station, dtype: float64

In [85]:
min_max_time = dur_baseline.groupby("trip_id_unique")["arrival_time"].agg({"min","max"}).reset_index()

In [86]:
min_max_time["max"] = pd.to_datetime(min_max_time["max"])
min_max_time["min"] = pd.to_datetime(min_max_time["min"])

  min_max_time["max"] = pd.to_datetime(min_max_time["max"])
  min_max_time["min"] = pd.to_datetime(min_max_time["min"])


In [87]:
min_max_time["delta"] = (min_max_time["max"] - min_max_time["min"])/pd.Timedelta(1,"m")
min_max_time["delta"] =round(min_max_time["delta"],2)

In [88]:
min_max_time["delta"].describe(percentiles =(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9))

count     290.000000
mean       73.596310
std       115.318865
min        23.000000
10%        42.000000
20%        48.000000
30%        54.000000
40%        58.000000
50%        62.000000
60%        67.538000
70%        74.000000
80%        81.424000
90%        91.100000
max      1438.000000
Name: delta, dtype: float64

In [102]:
dur_baseline[x_trip_duration.columns]   

Unnamed: 0,trip_id,part,trip_id_unique_station,trip_id_unique,line_id,direction,alternative,cluster,station_index,station_id,station_name,arrival_time,door_closing_time,arrival_is_estimated,latitude,longitude,passengers_up,passengers_continue,mekadem_nipuach_luz,passengers_continue_menupach
327,114196,ג,114196c1,114196c,81001,1,0,"פ""ת-ת""א",1,36780,"ת. מרכזית פ""ת/רציפים עירוני",12:59:00,13:00:00,False,32.094936,34.886530,4,4,3.181818,12.727273
328,114196,ג,114196c2,114196c,81001,1,0,"פ""ת-ת""א",2,35336,בר כוכבא/ליברכט,13:01:00,13:02:00,False,32.091160,34.885384,1,4,3.181818,12.727273
329,114196,ג,114196c3,114196c,81001,1,0,"פ""ת-ת""א",3,31291,שוק עירוני/ברון הירש,13:04:00,13:04:00,False,32.089928,34.882760,5,9,3.181818,28.636364
330,114196,ג,114196c4,114196c,81001,1,0,"פ""ת-ת""א",4,36720,ז'בוטינסקי/רוטשילד,13:05:00,13:05:00,False,32.090057,34.879875,3,12,3.181818,38.181818
331,114196,ג,114196c5,114196c,81001,1,0,"פ""ת-ת""א",5,32263,ז'בוטינסקי/ אורלוב,13:06:14,,True,32.090560,34.876840,0,12,3.181818,38.181818
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224837,113773,ג,113773c36,113773c,15066,1,0,"פ""ת-ת""א",36,25543,טרומפלדור/פינסקר,16:22:49,,True,32.075170,34.770527,0,4,3.125000,12.500000
224838,113773,ג,113773c37,113773c,15066,1,0,"פ""ת-ת""א",37,20133,טרומפלדור/בן יהודה,16:24:14,,True,32.075516,34.767296,0,4,3.125000,12.500000
224839,113773,ג,113773c38,113773c,15066,1,0,"פ""ת-ת""א",38,20660,חוף ירושלים,16:26:00,16:26:00,False,32.073296,34.764793,2,5,3.125000,15.625000
224840,113773,ג,113773c39,113773c,15066,1,0,"פ""ת-ת""א",39,20659,'חוף בננה ביץ,16:27:00,16:27:00,False,32.070602,34.763817,0,4,3.125000,12.500000


In [99]:
dur_baseline[x_trip_duration.columns].groupby("trip_id_unique")["passengers_up"].sum().describe()

count    290.000000
mean      42.124138
std       30.535788
min        1.000000
25%       20.250000
50%       36.000000
75%       57.000000
max      228.000000
Name: passengers_up, dtype: float64

In [None]:
dur_baseline[x_trip_duration.columns].groupby("")

In [100]:
dur_baseline[x_trip_duration.columns]["passengers_up"].describe()

count    11208.000000
mean         1.089936
std          2.124385
min          0.000000
25%          0.000000
50%          0.000000
75%          1.000000
max         35.000000
Name: passengers_up, dtype: float64

In [101]:
dur_baseline[x_trip_duration.columns]

Unnamed: 0,trip_id,part,trip_id_unique_station,trip_id_unique,line_id,direction,alternative,cluster,station_index,station_id,station_name,arrival_time,door_closing_time,arrival_is_estimated,latitude,longitude,passengers_up,passengers_continue,mekadem_nipuach_luz,passengers_continue_menupach
327,114196,ג,114196c1,114196c,81001,1,0,"פ""ת-ת""א",1,36780,"ת. מרכזית פ""ת/רציפים עירוני",12:59:00,13:00:00,False,32.094936,34.886530,4,4,3.181818,12.727273
328,114196,ג,114196c2,114196c,81001,1,0,"פ""ת-ת""א",2,35336,בר כוכבא/ליברכט,13:01:00,13:02:00,False,32.091160,34.885384,1,4,3.181818,12.727273
329,114196,ג,114196c3,114196c,81001,1,0,"פ""ת-ת""א",3,31291,שוק עירוני/ברון הירש,13:04:00,13:04:00,False,32.089928,34.882760,5,9,3.181818,28.636364
330,114196,ג,114196c4,114196c,81001,1,0,"פ""ת-ת""א",4,36720,ז'בוטינסקי/רוטשילד,13:05:00,13:05:00,False,32.090057,34.879875,3,12,3.181818,38.181818
331,114196,ג,114196c5,114196c,81001,1,0,"פ""ת-ת""א",5,32263,ז'בוטינסקי/ אורלוב,13:06:14,,True,32.090560,34.876840,0,12,3.181818,38.181818
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224837,113773,ג,113773c36,113773c,15066,1,0,"פ""ת-ת""א",36,25543,טרומפלדור/פינסקר,16:22:49,,True,32.075170,34.770527,0,4,3.125000,12.500000
224838,113773,ג,113773c37,113773c,15066,1,0,"פ""ת-ת""א",37,20133,טרומפלדור/בן יהודה,16:24:14,,True,32.075516,34.767296,0,4,3.125000,12.500000
224839,113773,ג,113773c38,113773c,15066,1,0,"פ""ת-ת""א",38,20660,חוף ירושלים,16:26:00,16:26:00,False,32.073296,34.764793,2,5,3.125000,15.625000
224840,113773,ג,113773c39,113773c,15066,1,0,"פ""ת-ת""א",39,20659,'חוף בננה ביץ,16:27:00,16:27:00,False,32.070602,34.763817,0,4,3.125000,12.500000
