In [12]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.impute import KNNImputer

# Step 1: Load the datasets
bus_data = pd.read_csv("bus_bts.csv")
weather_data = pd.read_csv("jeju_weather_dataset_english_corrected.csv")

# Step 2: Drop unnecessary columns from weather data
weather_data = weather_data.drop(['Snow Depth (cm)', '3-Hour Snow Depth (cm)', 'Ground Condition Code'], axis=1)

# Step 3: Drop rows with too many missing values (less than 70% valid data)
row_threshold = int(weather_data.shape[1] * 0.7)
weather_data = weather_data.dropna(thresh=row_threshold)

# Step 4: Convert numeric columns and handle missing values
numeric_columns = [
    'Temperature (°C)', 'Precipitation (mm)', 'Wind Speed (m/s)', 
    'Wind Direction (16 Directions)', 'Humidity (%)', 'Sea Level Pressure (hPa)',
    'Dew Point Temperature (°C)', 'Visibility (10km)', 'Evaporation (10m)', 
    'Ground Temperature (°C)', 'Soil Temperature at 5cm (°C)',
    'Soil Temperature at 10cm (°C)', 'Soil Temperature at 20cm (°C)',
    'Soil Temperature at 30cm (°C)'
]

# Convert columns to numeric, coercing errors to NaN
for column in numeric_columns:
    weather_data[column] = pd.to_numeric(weather_data[column], errors='coerce')

# Step 5: Apply KNN Imputer to fill missing values
imputer = KNNImputer(n_neighbors=5)
weather_data[numeric_columns] = imputer.fit_transform(weather_data[numeric_columns])

# Step 6: Convert date and time columns to datetime in both datasets
bus_data['geton_datetime'] = pd.to_datetime(bus_data['geton_date'] + ' ' + bus_data['geton_time'])
weather_data['datetime'] = pd.to_datetime(weather_data['Date and Time'])  # Adjust if your column name is different

# Step 7: Extract date, day of the week, and hour in both datasets
bus_data['geton_date'] = bus_data['geton_datetime'].dt.date  # Extract date
bus_data['day_of_week'] = bus_data['geton_datetime'].dt.dayofweek
bus_data['hour'] = bus_data['geton_datetime'].dt.hour

weather_data['weather_date'] = weather_data['datetime'].dt.date  # Extract date
weather_data['hour'] = weather_data['datetime'].dt.hour
weather_data['day_of_week'] = weather_data['datetime'].dt.dayofweek

# Step 8: Aggregate passenger counts by station, date, day, and hour
station_data = bus_data.groupby(['geton_station_name', 'geton_date', 'day_of_week', 'hour']).size().reset_index(name='passenger_count')

# Step 9: Merge station_data with weather_data on date, hour, and day_of_week
merged_data = pd.merge(
    station_data, weather_data, 
    left_on=['geton_date', 'hour', 'day_of_week'],
    right_on=['weather_date', 'hour', 'day_of_week'],
    how='left'
)

# Drop redundant columns after the merge if needed
merged_data = merged_data.drop(columns=['weather_date', 'datetime'])

# Display the merged dataset to confirm
print(merged_data.head())


  geton_station_name  geton_date  day_of_week  hour  passenger_count  Region  \
0             (구)구판장  2019-09-02            0     7                1     184   
1             (구)구판장  2019-09-02            0     8                2     184   
2             (구)구판장  2019-09-02            0    10                2     184   
3             (구)구판장  2019-09-03            1     7                2     184   
4             (구)구판장  2019-09-03            1     8                1     184   

      Date and Time  Temperature (°C)  Precipitation (mm)  Wind Speed (m/s)  \
0  2019-09-02 07:00              23.3                 1.8               2.9   
1  2019-09-02 08:00              22.3                 3.9               6.1   
2  2019-09-02 10:00              22.5                 1.5               1.2   
3  2019-09-03 07:00              22.6                 5.1               0.7   
4  2019-09-03 08:00              23.4                 2.6               1.4   

   ...  Cloud Cover (10 Levels)  Weather Phe

In [13]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import pandas as pd

# Step 1: Drop irrelevant columns
# Define columns to keep based on relevance for prediction
columns_to_keep = [
    'geton_station_name', 'geton_date', 'day_of_week', 'hour', 'passenger_count', 
    'Temperature (°C)', 'Precipitation (mm)', 'Wind Speed (m/s)', 'Humidity (%)', 
    'Sea Level Pressure (hPa)', 'Cloud Cover (10 Levels)', 'Weather Phenomenon'
]
merged_data = merged_data[columns_to_keep]

In [14]:
# Display the shape of merged_data
print("Shape of merged_data:", merged_data.shape)

# Check for null values in each column
null_values = merged_data.isnull().sum()
print("Number of null values in each column:")
print(null_values)

# Alternatively, to see only columns with null values:
null_columns = null_values[null_values > 0]
print("Columns with null values:")
print(null_columns)


Shape of merged_data: (259591, 12)
Number of null values in each column:
geton_station_name              0
geton_date                      0
day_of_week                     0
hour                            0
passenger_count                 0
Temperature (°C)                0
Precipitation (mm)              0
Wind Speed (m/s)                0
Humidity (%)                    0
Sea Level Pressure (hPa)        0
Cloud Cover (10 Levels)         0
Weather Phenomenon          25652
dtype: int64
Columns with null values:
Weather Phenomenon    25652
dtype: int64


In [15]:
# Drop the 'Weather Phenomenon' column
merged_data.drop(columns=['Weather Phenomenon'], inplace=True)

# Confirm that the column has been removed
print("Columns in merged_data after dropping 'Weather Phenomenon':")
print(merged_data.columns)


Columns in merged_data after dropping 'Weather Phenomenon':
Index(['geton_station_name', 'geton_date', 'day_of_week', 'hour',
       'passenger_count', 'Temperature (°C)', 'Precipitation (mm)',
       'Wind Speed (m/s)', 'Humidity (%)', 'Sea Level Pressure (hPa)',
       'Cloud Cover (10 Levels)'],
      dtype='object')


In [20]:
# Save the modified merged_data into a new DataFrame named modeldata
modeldata = merged_data.copy()

# Confirm the contents of modeldata
print("Columns in modeldata:")
print(modeldata.columns)


Columns in modeldata:
Index(['geton_station_name', 'geton_date', 'day_of_week', 'hour',
       'passenger_count', 'Temperature (°C)', 'Precipitation (mm)',
       'Wind Speed (m/s)', 'Humidity (%)', 'Sea Level Pressure (hPa)',
       'Cloud Cover (10 Levels)'],
      dtype='object')


In [21]:
# Drop the `geton_date` column to avoid issues with datetime data types
modeldata = modeldata.drop(columns=['geton_date'])

# Step 1: One-hot encode categorical columns
modeldata = pd.get_dummies(modeldata, columns=['geton_station_name', 'day_of_week'], drop_first=True)

# Separate features and target
features = modeldata.drop(columns=['passenger_count']).values
target = modeldata['passenger_count'].values

In [22]:
# Step 2: Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Step 3: Scale features (StandardScaler) and target (MinMaxScaler)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [23]:
# Scale target (passenger_count) with MinMaxScaler to keep values positive
target_scaler = MinMaxScaler()
y_train = target_scaler.fit_transform(y_train.reshape(-1, 1)).flatten()
y_test = target_scaler.transform(y_test.reshape(-1, 1)).flatten()


In [24]:
# Step 4: Define a simple feedforward neural network model
model = Sequential([
    Dense(32, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(16, activation='relu'),
    Dense(1)  # Output layer for predicting passenger count
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [25]:
# Compile with SGD optimizer
model.compile(optimizer=SGD(learning_rate=0.01, momentum=0.9), loss='mse', metrics=['mae'])

# Set up early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)


In [26]:
# Train the model with early stopping and a reduced number of epochs
history = model.fit(
    X_train, y_train,
    epochs=10,           # Adjusted to fewer epochs for faster training
    batch_size=64,       # Increased batch size
    validation_split=0.2,
    callbacks=[early_stopping]
)


Epoch 1/10
[1m2596/2596[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - loss: 0.0189 - mae: 0.0469 - val_loss: 0.0026 - val_mae: 0.0259
Epoch 2/10
[1m2596/2596[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - loss: 0.0029 - mae: 0.0251 - val_loss: 0.0021 - val_mae: 0.0252
Epoch 3/10
[1m2596/2596[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - loss: 0.0021 - mae: 0.0241 - val_loss: 0.0021 - val_mae: 0.0235
Epoch 4/10
[1m2596/2596[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - loss: 0.0021 - mae: 0.0240 - val_loss: 0.0017 - val_mae: 0.0212
Epoch 5/10
[1m2596/2596[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - loss: 0.0015 - mae: 0.0204 - val_loss: 0.0013 - val_mae: 0.0186
Epoch 6/10
[1m2596/2596[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - loss: 0.0013 - mae: 0.0186 - val_loss: 0.0013 - val_mae: 0.0186
Epoch 7/10
[1m2596/2596[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[

In [27]:
# Step 5: Evaluate the model
loss, mae = model.evaluate(X_test, y_test)
print(f"Test MAE: {mae}")

# Step 6: Make predictions on the test set and inverse transform
y_pred = model.predict(X_test)
y_pred_rescaled = target_scaler.inverse_transform(y_pred)


[1m1623/1623[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 681us/step - loss: 0.0012 - mae: 0.0179
Test MAE: 0.01792076788842678
[1m1623/1623[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 713us/step


In [28]:
# Display a few predictions vs actual values in the original scale
y_test_rescaled = target_scaler.inverse_transform(y_test.reshape(-1, 1))

for i in range(10):
    print(f"Predicted: {y_pred_rescaled[i][0]}, Actual: {y_test_rescaled[i][0]}")

Predicted: 8.535279273986816, Actual: 2.0
Predicted: 5.9705424308776855, Actual: 2.0
Predicted: 8.535279273986816, Actual: 14.0
Predicted: 7.05559778213501, Actual: 6.0
Predicted: 2.971367597579956, Actual: 3.0
Predicted: 6.579662322998047, Actual: 7.0
Predicted: 8.535279273986816, Actual: 1.0
Predicted: 16.740285873413086, Actual: 16.0
Predicted: 3.962538719177246, Actual: 1.0
Predicted: 8.535279273986816, Actual: 6.0


In [29]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Predict on the test set
y_pred = model.predict(X_test)

# Inverse transform the predictions and true values to their original scale
y_pred_rescaled = target_scaler.inverse_transform(y_pred)
y_test_rescaled = target_scaler.inverse_transform(y_test.reshape(-1, 1))

# Calculate MAE
mae = mean_absolute_error(y_test_rescaled, y_pred_rescaled)
print(f"Mean Absolute Error (MAE): {mae}")

# Calculate MSE
mse = mean_squared_error(y_test_rescaled, y_pred_rescaled)
print(f"Mean Squared Error (MSE): {mse}")

# Calculate R-squared
r2 = r2_score(y_test_rescaled, y_pred_rescaled)
print(f"R-squared (R²): {r2}")


[1m1623/1623[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 691us/step
Mean Absolute Error (MAE): 6.487324098179955
Mean Squared Error (MSE): 161.32603228528353
R-squared (R²): 0.5544403357169292


In [30]:
from tensorflow.keras.optimizers import Adam

# Define a more flexible model function for easy tuning
def build_model(learning_rate=0.001, layer1_neurons=64, layer2_neurons=32, layer3_neurons=16):
    model = Sequential()
    model.add(Dense(layer1_neurons, activation='relu', input_shape=(X_train.shape[1],)))
    model.add(Dense(layer2_neurons, activation='relu'))
    if layer3_neurons > 0:
        model.add(Dense(layer3_neurons, activation='relu'))
    model.add(Dense(1))  # Output layer for predicting passenger count
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mse', metrics=['mae'])
    return model

# Experiment with different configurations
layer_configs = [
    (64, 32, 16),  # Original setup
    (128, 64, 32), # Increase complexity
    (64, 32, 0)    # Simplify (fewer layers)
]
learning_rates = [0.001, 0.0005]
batch_sizes = [32, 64]

# Testing configurations
for layers in layer_configs:
    for lr in learning_rates:
        for batch_size in batch_sizes:
            model = build_model(learning_rate=lr, layer1_neurons=layers[0], layer2_neurons=layers[1], layer3_neurons=layers[2])
            print(f"Training model with layers {layers}, learning rate {lr}, batch size {batch_size}")
            model.fit(X_train, y_train, epochs=10, batch_size=batch_size, validation_split=0.2, callbacks=[early_stopping])
            loss, mae = model.evaluate(X_test, y_test)
            print(f"Test MAE: {mae} with configuration: layers={layers}, lr={lr}, batch_size={batch_size}")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Training model with layers (64, 32, 16), learning rate 0.001, batch size 32
Epoch 1/10
[1m5192/5192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1ms/step - loss: 0.0374 - mae: 0.0719 - val_loss: 0.0012 - val_mae: 0.0171
Epoch 2/10
[1m5192/5192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1ms/step - loss: 0.0025 - mae: 0.0213 - val_loss: 8.3614e-04 - val_mae: 0.0156
Epoch 3/10
[1m5192/5192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - loss: 8.2377e-04 - mae: 0.0140 - val_loss: 7.0270e-04 - val_mae: 0.0141
Epoch 4/10
[1m5192/5192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - loss: 6.5070e-04 - mae: 0.0130 - val_loss: 5.7202e-04 - val_mae: 0.0111
Epoch 5/10
[1m5192/5192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - loss: 5.4938e-04 - mae: 0.0118 - val_loss: 4.8107e-04 - val_mae: 0.0110
Epoch 6/10
[1m5192/5192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - loss: 4.3098e-04 - mae: 0.0108 - val

In [31]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Define the final model with the best configuration
def build_final_model():
    model = Sequential([
        Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
        Dense(32, activation='relu'),
        Dense(16, activation='relu'),
        Dense(1)  # Output layer for predicting passenger count
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])
    return model

# Build the model
final_model = build_final_model()

# Set up early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the final model
history = final_model.fit(
    X_train, y_train,
    epochs=10,           # Adjusted to fewer epochs for faster training
    batch_size=32,       # Best batch size from tuning
    validation_split=0.2,
    callbacks=[early_stopping]
)

# Evaluate the model on the test set
loss, mae = final_model.evaluate(X_test, y_test)
print(f"Final Model Test MAE: {mae}")

# Predict and evaluate additional metrics on test set
y_pred = final_model.predict(X_test)
y_pred_rescaled = target_scaler.inverse_transform(y_pred)
y_test_rescaled = target_scaler.inverse_transform(y_test.reshape(-1, 1))

# Calculate final metrics
mae_final = mean_absolute_error(y_test_rescaled, y_pred_rescaled)
mse_final = mean_squared_error(y_test_rescaled, y_pred_rescaled)
r2_final = r2_score(y_test_rescaled, y_pred_rescaled)

print(f"Final Model MAE: {mae_final}")
print(f"Final Model MSE: {mse_final}")
print(f"Final Model R-squared (R²): {r2_final}")


Epoch 1/10
[1m5192/5192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - loss: 0.0266 - mae: 0.0608 - val_loss: 0.0066 - val_mae: 0.0245
Epoch 2/10
[1m5192/5192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2ms/step - loss: 0.0013 - mae: 0.0156 - val_loss: 7.2855e-04 - val_mae: 0.0127
Epoch 3/10
[1m5192/5192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - loss: 6.6115e-04 - mae: 0.0128 - val_loss: 5.8119e-04 - val_mae: 0.0123
Epoch 4/10
[1m5192/5192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - loss: 5.7307e-04 - mae: 0.0119 - val_loss: 5.2294e-04 - val_mae: 0.0125
Epoch 5/10
[1m5192/5192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - loss: 4.8040e-04 - mae: 0.0109 - val_loss: 4.3975e-04 - val_mae: 0.0102
Epoch 6/10
[1m5192/5192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - loss: 4.2513e-04 - mae: 0.0103 - val_loss: 4.1173e-04 - val_mae: 0.0103
Epoch 7/10
[1m5192/5192[0m [32m

In [37]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=5)
mae_scores = []

for train_index, test_index in kf.split(X_train):
    X_train_kf, X_test_kf = X_train[train_index], X_train[test_index]
    y_train_kf, y_test_kf = y_train[train_index], y_train[test_index]
    
    model = build_final_model()  # Use the optimized model configuration
    model.fit(X_train_kf, y_train_kf, epochs=10, batch_size=32, validation_split=0.2)
    y_pred_kf = model.predict(X_test_kf)
    mae_scores.append(mean_absolute_error(y_test_kf, y_pred_kf))

print("Average MAE across folds:", np.mean(mae_scores))


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m4154/4154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - loss: 0.0291 - mae: 0.0586 - val_loss: 0.0012 - val_mae: 0.0178
Epoch 2/10
[1m4154/4154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step - loss: 0.0012 - mae: 0.0178 - val_loss: 8.7266e-04 - val_mae: 0.0160
Epoch 3/10
[1m4154/4154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step - loss: 8.3610e-04 - mae: 0.0148 - val_loss: 6.5605e-04 - val_mae: 0.0134
Epoch 4/10
[1m4154/4154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step - loss: 6.0413e-04 - mae: 0.0125 - val_loss: 6.4228e-04 - val_mae: 0.0126
Epoch 5/10
[1m4154/4154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - loss: 5.1734e-04 - mae: 0.0115 - val_loss: 5.0430e-04 - val_mae: 0.0114
Epoch 6/10
[1m4154/4154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - loss: 4.5763e-04 - mae: 0.0109 - val_loss: 4.9011e-04 - val_mae: 0.0103
Epoch 7/10
[1m4154/4154[0m [32m━━━━━━

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m4154/4154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2ms/step - loss: 0.0303 - mae: 0.0555 - val_loss: 9.7530e-04 - val_mae: 0.0154
Epoch 2/10
[1m4154/4154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - loss: 9.1970e-04 - mae: 0.0144 - val_loss: 6.9105e-04 - val_mae: 0.0132
Epoch 3/10
[1m4154/4154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 3ms/step - loss: 6.6333e-04 - mae: 0.0129 - val_loss: 6.0787e-04 - val_mae: 0.0115
Epoch 4/10
[1m4154/4154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 3ms/step - loss: 5.6594e-04 - mae: 0.0118 - val_loss: 4.8266e-04 - val_mae: 0.0109
Epoch 5/10
[1m4154/4154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - loss: 4.8137e-04 - mae: 0.0110 - val_loss: 4.7537e-04 - val_mae: 0.0104
Epoch 6/10
[1m4154/4154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 3ms/step - loss: 4.3092e-04 - mae: 0.0103 - val_loss: 4.3040e-04 - val_mae: 0.0100
Epoch 7/10
[1m4154/4154[

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m4154/4154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 3ms/step - loss: 0.0294 - mae: 0.0616 - val_loss: 9.4647e-04 - val_mae: 0.0146
Epoch 2/10
[1m4154/4154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 3ms/step - loss: 9.4947e-04 - mae: 0.0147 - val_loss: 6.7822e-04 - val_mae: 0.0128
Epoch 3/10
[1m4154/4154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - loss: 6.7949e-04 - mae: 0.0129 - val_loss: 6.3087e-04 - val_mae: 0.0121
Epoch 4/10
[1m4154/4154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 3ms/step - loss: 5.8127e-04 - mae: 0.0121 - val_loss: 5.0817e-04 - val_mae: 0.0106
Epoch 5/10
[1m4154/4154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 3ms/step - loss: 4.9157e-04 - mae: 0.0109 - val_loss: 4.7270e-04 - val_mae: 0.0102
Epoch 6/10
[1m4154/4154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 3ms/step - loss: 4.2598e-04 - mae: 0.0100 - val_loss: 4.2785e-04 - val_mae: 0.0097
Epoch 7/10
[1m4154/4154[

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m4154/4154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 3ms/step - loss: 0.0340 - mae: 0.0765 - val_loss: 0.0019 - val_mae: 0.0200
Epoch 2/10
[1m4154/4154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 3ms/step - loss: 0.0013 - mae: 0.0175 - val_loss: 9.1644e-04 - val_mae: 0.0153
Epoch 3/10
[1m4154/4154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - loss: 9.0190e-04 - mae: 0.0154 - val_loss: 7.9918e-04 - val_mae: 0.0139
Epoch 4/10
[1m4154/4154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 3ms/step - loss: 7.9298e-04 - mae: 0.0134 - val_loss: 6.9345e-04 - val_mae: 0.0136
Epoch 5/10
[1m4154/4154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - loss: 5.6214e-04 - mae: 0.0125 - val_loss: 5.6492e-04 - val_mae: 0.0120
Epoch 6/10
[1m4154/4154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - loss: 5.1199e-04 - mae: 0.0119 - val_loss: 4.8596e-04 - val_mae: 0.0126
Epoch 7/10
[1m4154/4154[0m [32m

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m4154/4154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 3ms/step - loss: 0.0062 - mae: 0.0316 - val_loss: 7.7557e-04 - val_mae: 0.0128
Epoch 2/10
[1m4154/4154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 3ms/step - loss: 7.1989e-04 - mae: 0.0130 - val_loss: 6.0009e-04 - val_mae: 0.0129
Epoch 3/10
[1m4154/4154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - loss: 6.0908e-04 - mae: 0.0119 - val_loss: 4.9819e-04 - val_mae: 0.0106
Epoch 4/10
[1m4154/4154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - loss: 4.7739e-04 - mae: 0.0108 - val_loss: 4.0320e-04 - val_mae: 0.0100
Epoch 5/10
[1m4154/4154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 3ms/step - loss: 4.5444e-04 - mae: 0.0103 - val_loss: 4.0675e-04 - val_mae: 0.0108
Epoch 6/10
[1m4154/4154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - loss: 3.9574e-04 - mae: 0.0097 - val_loss: 3.9816e-04 - val_mae: 0.0104
Epoch 7/10
[1m4154/4154[

In [39]:
# Train the final model on the full training set
final_model = build_final_model()  # Rebuild the model to reset weights

# Train on the entire X_train and y_train
final_model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2, verbose=1)

# Evaluate the final model on the test set
y_pred_test = final_model.predict(X_test)

# Rescale predictions and test targets if they were scaled
y_pred_test_rescaled = target_scaler.inverse_transform(y_pred_test)
y_test_rescaled = target_scaler.inverse_transform(y_test.reshape(-1, 1))

# Calculate final evaluation metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

mae_final = mean_absolute_error(y_test_rescaled, y_pred_test_rescaled)
mse_final = mean_squared_error(y_test_rescaled, y_pred_test_rescaled)
r2_final = r2_score(y_test_rescaled, y_pred_test_rescaled)

print("Final Model Evaluation Results on Test Set:")
print(f"Final Model MAE: {mae_final}")
print(f"Final Model MSE: {mse_final}")
print(f"Final Model R-squared (R²): {r2_final}")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m5192/5192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1ms/step - loss: 0.0098 - mae: 0.0339 - val_loss: 7.7301e-04 - val_mae: 0.0127
Epoch 2/10
[1m5192/5192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - loss: 7.1221e-04 - mae: 0.0129 - val_loss: 5.9457e-04 - val_mae: 0.0119
Epoch 3/10
[1m5192/5192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - loss: 5.5579e-04 - mae: 0.0116 - val_loss: 5.0058e-04 - val_mae: 0.0106
Epoch 4/10
[1m5192/5192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - loss: 4.9460e-04 - mae: 0.0106 - val_loss: 4.4237e-04 - val_mae: 0.0098
Epoch 5/10
[1m5192/5192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - loss: 4.1956e-04 - mae: 0.0099 - val_loss: 3.9199e-04 - val_mae: 0.0096
Epoch 6/10
[1m5192/5192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - loss: 3.8243e-04 - mae: 0.0094 - val_loss: 3.6022e-04 - val_mae: 0.0095
Epoch 7/10
[1m5192/5192[0m [

In [40]:
# Generate predictions on the test set
y_pred_test = final_model.predict(X_test)

# Rescale predictions and test targets if they were scaled
y_pred_test_rescaled = target_scaler.inverse_transform(y_pred_test)
y_test_rescaled = target_scaler.inverse_transform(y_test.reshape(-1, 1))

# Display a few sample predictions alongside actual values
print("Sample Predicted vs. Actual Passenger Counts:")
for i in range(10):  # Show first 10 samples
    print(f"Predicted: {y_pred_test_rescaled[i][0]:.2f}, Actual: {y_test_rescaled[i][0]:.2f}")


[1m1623/1623[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 775us/step
Sample Predicted vs. Actual Passenger Counts:
Predicted: 1.55, Actual: 2.00
Predicted: 1.41, Actual: 2.00
Predicted: 13.84, Actual: 14.00
Predicted: 5.82, Actual: 6.00
Predicted: 0.74, Actual: 3.00
Predicted: 3.99, Actual: 7.00
Predicted: 3.46, Actual: 1.00
Predicted: 24.18, Actual: 16.00
Predicted: 1.17, Actual: 1.00
Predicted: 2.66, Actual: 6.00


In [41]:
# Round down to the nearest integer
y_pred_test_int = np.floor(y_pred_test_rescaled).astype(int)

# OR

# Round up to the nearest integer
y_pred_test_int = np.ceil(y_pred_test_rescaled).astype(int)


In [42]:
# Display a few integer predictions alongside actual values
print("Sample Predicted (Int) vs. Actual Passenger Counts:")
for i in range(10):  # Show first 10 samples
    print(f"Predicted: {y_pred_test_int[i][0]}, Actual: {y_test_rescaled[i][0]}")


Sample Predicted (Int) vs. Actual Passenger Counts:
Predicted: 2, Actual: 2.0
Predicted: 2, Actual: 2.0
Predicted: 14, Actual: 14.0
Predicted: 6, Actual: 6.0
Predicted: 1, Actual: 3.0
Predicted: 4, Actual: 7.0
Predicted: 4, Actual: 1.0
Predicted: 25, Actual: 16.0
Predicted: 2, Actual: 1.0
Predicted: 3, Actual: 6.0


In [47]:
# Calculate Q1 (25th percentile) and Q3 (75th percentile) for passenger_count
Q1 = modeldata["passenger_count"].quantile(0.25)
Q3 = modeldata["passenger_count"].quantile(0.75)
IQR = Q3 - Q1

# Define outlier boundaries
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Remove outliers from modeldata
modeldata_no_outliers = modeldata[(modeldata["passenger_count"] >= lower_bound) & (modeldata["passenger_count"] <= upper_bound)]
print("Shape of modeldata after removing outliers:", modeldata_no_outliers.shape)


Shape of modeldata after removing outliers: (227823, 1953)


In [48]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Separate features and target variable
X = modeldata_no_outliers.drop(columns=['passenger_count'])
y = modeldata_no_outliers['passenger_count']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features (if necessary)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [49]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Define the model
def build_final_model():
    model = Sequential([
        Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
        Dense(32, activation='relu'),
        Dense(16, activation='relu'),
        Dense(1)  # Output layer for predicting passenger count
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])
    return model

# Instantiate and train the model
final_model = build_final_model()
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = final_model.fit(
    X_train, y_train,
    epochs=10,           # Adjust epochs as needed
    batch_size=32,       # Adjust batch size as needed
    validation_split=0.2,
    callbacks=[early_stopping]
)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m4557/4557[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - loss: 10.0101 - mae: 2.2012 - val_loss: 7.2809 - val_mae: 1.8720
Epoch 2/10
[1m4557/4557[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step - loss: 6.8807 - mae: 1.8097 - val_loss: 6.1678 - val_mae: 1.7049
Epoch 3/10
[1m4557/4557[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - loss: 6.1270 - mae: 1.7005 - val_loss: 5.9159 - val_mae: 1.6885
Epoch 4/10
[1m4557/4557[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - loss: 5.5534 - mae: 1.6175 - val_loss: 5.5764 - val_mae: 1.6245
Epoch 5/10
[1m4557/4557[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - loss: 5.3348 - mae: 1.5844 - val_loss: 5.5206 - val_mae: 1.6185
Epoch 6/10
[1m4557/4557[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - loss: 5.1881 - mae: 1.5625 - val_loss: 5.3764 - val_mae: 1.5939
Epoch 7/10
[1m4557/4557[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s

In [50]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Evaluate the model
y_pred = final_model.predict(X_test)

# Calculate evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Evaluation Results on Test Set:")
print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"R-squared (R²): {r2}")


[1m1424/1424[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 695us/step
Evaluation Results on Test Set:
MAE: 1.588350353541312
MSE: 5.322190640226233
R-squared (R²): 0.6631621506627741


In [51]:
# Sample some predictions and actual values for comparison
import pandas as pd

# Make predictions on the test set
y_pred = final_model.predict(X_test)

# Convert predictions and actual values to a DataFrame for easier viewing
results_df = pd.DataFrame({
    "Predicted": y_pred.flatten(),  # Flatten if y_pred is in a 2D array
    "Actual": y_test.values  # Assuming y_test is a DataFrame or Series
})

# Show a sample of the results (you can change the number of rows displayed)
sample_results = results_df.sample(10, random_state=42)  # Sample 10 rows for comparison
print("Sample of Predicted vs Actual Passenger Counts:")
print(sample_results)


[1m1424/1424[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 765us/step
Sample of Predicted vs Actual Passenger Counts:
       Predicted  Actual
7003    1.422054       2
9667    2.435620       5
37897  14.017361      11
41751   1.619893       2
20969   4.762378       6
37385   2.869371       2
19927   3.633337       4
15673   1.507567       1
27449   9.556856      15
34967   3.181180       3


In [52]:
# Make predictions on the test set and convert to integers
y_pred = final_model.predict(X_test).flatten().astype(int)  # Flatten and convert to int

# Convert predictions and actual values to a DataFrame for easier viewing
results_df = pd.DataFrame({
    "Predicted": y_pred,
    "Actual": y_test.values.astype(int)  # Assuming y_test is a Series or array of integers
})

# Show a sample of the results (you can change the number of rows displayed)
sample_results = results_df.sample(10, random_state=42)  # Sample 10 rows for comparison
print("Sample of Predicted vs Actual Passenger Counts:")
print(sample_results)


[1m1424/1424[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 741us/step
Sample of Predicted vs Actual Passenger Counts:
       Predicted  Actual
7003           1       2
9667           2       5
37897         14      11
41751          1       2
20969          4       6
37385          2       2
19927          3       4
15673          1       1
27449          9      15
34967          3       3


In [55]:
import pandas as pd
import numpy as np

# Example input data (replace with your test values)
input_data = pd.DataFrame({
    "geton_station_name": ["1100고지휴게소"],  # Example station name
    "day_of_week": [2],               # e.g., Tuesday
    "hour": [15],                      # e.g., 3 PM
    "Temperature (°C)": [25.0],        # e.g., 25 degrees Celsius
    "Precipitation (mm)": [0.0],       # e.g., no precipitation
    "Wind Speed (m/s)": [3.5],         # e.g., wind speed of 3.5 m/s
    "Humidity (%)": [65],              # e.g., 65% humidity
    "Sea Level Pressure (hPa)": [1013],# e.g., 1013 hPa
    "Cloud Cover (10 Levels)": [5],    # e.g., cloud cover level of 5
})

# One-hot encode 'day_of_week' and 'geton_station_name' to match training
input_data_encoded = pd.get_dummies(input_data, columns=['day_of_week', 'geton_station_name'])

# Add missing columns (from training data) and set them to zero if absent
for col in modeldata.columns.drop('passenger_count'):
    if col not in input_data_encoded.columns:
        input_data_encoded[col] = 0  # Fill missing columns with 0s

# Reorder columns to match the model's input order
input_data_encoded = input_data_encoded[modeldata.columns.drop('passenger_count')]

# Scale the input data using the same scaler as the training set
scaled_input_data = scaler.transform(input_data_encoded)

# Make prediction
predicted_count = final_model.predict(scaled_input_data).flatten()[0]

# Convert to integer if desired
predicted_count_int = int(round(predicted_count))

print(f"Predicted number of people: {predicted_count_int}")


  input_data_encoded[col] = 0  # Fill missing columns with 0s
  input_data_encoded[col] = 0  # Fill missing columns with 0s
  input_data_encoded[col] = 0  # Fill missing columns with 0s
  input_data_encoded[col] = 0  # Fill missing columns with 0s
  input_data_encoded[col] = 0  # Fill missing columns with 0s
  input_data_encoded[col] = 0  # Fill missing columns with 0s
  input_data_encoded[col] = 0  # Fill missing columns with 0s
  input_data_encoded[col] = 0  # Fill missing columns with 0s
  input_data_encoded[col] = 0  # Fill missing columns with 0s
  input_data_encoded[col] = 0  # Fill missing columns with 0s
  input_data_encoded[col] = 0  # Fill missing columns with 0s
  input_data_encoded[col] = 0  # Fill missing columns with 0s
  input_data_encoded[col] = 0  # Fill missing columns with 0s
  input_data_encoded[col] = 0  # Fill missing columns with 0s
  input_data_encoded[col] = 0  # Fill missing columns with 0s
  input_data_encoded[col] = 0  # Fill missing columns with 0s
  input_

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step

  input_data_encoded[col] = 0  # Fill missing columns with 0s
  input_data_encoded[col] = 0  # Fill missing columns with 0s
  input_data_encoded[col] = 0  # Fill missing columns with 0s
  input_data_encoded[col] = 0  # Fill missing columns with 0s
  input_data_encoded[col] = 0  # Fill missing columns with 0s
  input_data_encoded[col] = 0  # Fill missing columns with 0s
  input_data_encoded[col] = 0  # Fill missing columns with 0s
  input_data_encoded[col] = 0  # Fill missing columns with 0s
  input_data_encoded[col] = 0  # Fill missing columns with 0s
  input_data_encoded[col] = 0  # Fill missing columns with 0s
  input_data_encoded[col] = 0  # Fill missing columns with 0s
  input_data_encoded[col] = 0  # Fill missing columns with 0s
  input_data_encoded[col] = 0  # Fill missing columns with 0s
  input_data_encoded[col] = 0  # Fill missing columns with 0s
  input_data_encoded[col] = 0  # Fill missing columns with 0s
  input_data_encoded[col] = 0  # Fill missing columns with 0s
  input_

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
Predicted number of people: 2
