In [1]:
# Dataset tag: selects which CSV is loaded and is reused later in the saved
# model's file name.
month = "joined"
# Colab alternative: f"/content/drive/My Drive/filtered_pred_{month}2024.csv"
file_path = f"data/filtered_pred_{month}2024.csv"

In [None]:
# Mount Google Drive (only needed on Colab; uncomment the two lines below)
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
# Spark entry point used to load the raw CSV.
from pyspark.sql import SparkSession

# Step 1: Reuse the active Spark session if one exists, otherwise create one,
# requesting 4 GB for both the executors and the driver.
spark = (
    SparkSession.builder
    .appName("BigDataProcessing")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

# Step 2: Read the CSV into a Spark DataFrame; the first row is treated as the
# header and column types are inferred from the data.
data = spark.read.options(header=True, inferSchema=True).csv(file_path)


In [4]:
from pyspark.sql import functions as F

# Join DATE and HOUR into a single space-separated DATE_HOUR key, then discard
# the two source columns together with ROUTE_NAME.
date_hour = F.concat_ws(" ", F.col("DATE"), F.col("HOUR"))
data = data.withColumn("DATE_HOUR", date_hour).drop("DATE", "HOUR", "ROUTE_NAME")
data.show(10)

In [5]:
from pyspark.sql.functions import sum as spark_sum

# One summed column per passenger category, keeping the original column names.
passenger_sums = [
    spark_sum(name).alias(name) for name in ("NO_OF_ADULT", "NO_OF_CHILD")
]

# Aggregate per DATE_HOUR, order by that key, and append TOTAL_PASSENGERS as
# the sum of the two aggregated columns (a third numeric column).
data_grouped = (
    data.groupBy("DATE_HOUR")
    .agg(*passenger_sums)
    .orderBy("DATE_HOUR")
    .withColumn("TOTAL_PASSENGERS", F.col("NO_OF_ADULT") + F.col("NO_OF_CHILD"))
)
data_grouped.show(10)

In [6]:
# Collect the aggregated Spark DataFrame to the driver as a pandas DataFrame;
# all later cells work on this in-memory copy.
df = data_grouped.toPandas()

In [7]:
import pandas as pd

# Step 7: Parse the "YYYY-MM-DD H" strings in DATE_HOUR into timestamps, then
# make that column the index and sort the rows by it.
df['DATE_HOUR'] = pd.to_datetime(df['DATE_HOUR'], format='%Y-%m-%d %H')
df = df.set_index('DATE_HOUR').sort_index()
print(df)


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit a min-max scaler on every column of df, rescale each column into [0, 1],
# and print the resulting array.
sc = MinMaxScaler(feature_range=(0, 1))
scaled_data = sc.fit(df).transform(df)
print(scaled_data)


In [None]:
# Step 7: Scale the data
from sklearn.preprocessing import MinMaxScaler

# Rescale every column of df into [0, 1]; this overwrites any earlier
# `scaled_data`.
# NOTE(review): the scaler is fit on the full frame before the train/test
# split performed later, so test-period min/max values influence the scaling.
# Consider fitting on the training rows only.
scaler = MinMaxScaler(feature_range=(0, 1)).fit(df)
scaled_data = scaler.transform(df)


# print(scaled_data)


In [None]:
# Shut down the Spark session; the aggregated data was already collected into
# pandas, and the remaining cells do not use Spark.
spark.stop()

In [None]:
import pickle  # imported here but not used by this cell

# Step 9: Split the scaled rows in order: the first 80% become the training
# set and the remainder the test set (no shuffling).
split_ratio = 0.8
split_index = int(split_ratio * len(scaled_data))
Train, Test = scaled_data[:split_index], scaled_data[split_index:]
# NOTE(review): the Test split is not written to disk in this cell.



In [None]:
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator

# Step 10: Create TimeseriesGenerator for training data
n_input = 24  # Use the last 24 hours for prediction

# Take the feature count from the training array rather than hard-coding it.
# The scaled frame carries NO_OF_ADULT, NO_OF_CHILD and TOTAL_PASSENGERS, so a
# fixed value of 2 would not match the width of the generator's batches and
# the model's input/output layers would be built with the wrong size.
n_features = Train.shape[1]

# Each sample is a window of `n_input` consecutive rows; the target is the row
# immediately following the window. One window per batch.
generator = TimeseriesGenerator(Train, Train, length=n_input, batch_size=1)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Bidirectional

# Two stacked bidirectional LSTM layers followed by a linear output layer that
# emits one value per feature.
model = Sequential()

# First recurrent layer returns the full sequence so the next LSTM can consume it.
model.add(
    Bidirectional(
        LSTM(units=100, activation='tanh', return_sequences=True),
        input_shape=(n_input, n_features),
    )
)

# Second recurrent layer returns only its final output.
model.add(Bidirectional(LSTM(units=50, activation='tanh')))

# Linear activation: unbounded regression output.
model.add(Dense(units=n_features, activation='linear'))

In [None]:
from sklearn.metrics import mean_squared_error  # imported here but not used by this cell

# Configure training: Adam optimizer, mean-squared-error loss, and mean
# absolute error reported as an extra metric.
model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['mae'],
)

In [None]:
# Import from tensorflow.keras, like the model and layers above, so the
# callback comes from the same Keras implementation as the model it is
# attached to.
from tensorflow.keras.callbacks import EarlyStopping

# Upper bound on the number of training epochs; early stopping may end sooner.
epochs = 29

# No validation data is passed to fit(), so early stopping watches the
# *training* loss: training stops after 4 consecutive epochs without
# improvement and the weights from the best epoch are restored.
early_stopping = EarlyStopping(monitor='loss', patience=4, restore_best_weights=True)

# Train the model with the EarlyStopping callback; `history` keeps the
# per-epoch loss and metric values.
history = model.fit(generator, epochs=epochs, verbose=1, callbacks=[early_stopping])


In [None]:
# Persist the trained model, named after the dataset tag.
# NOTE(review): this path is under the Colab Google Drive mount, while the
# input CSV is read from a local "data/" directory and the drive-mount cell is
# commented out. Confirm the target directory exists before running.
model.save(f'/content/drive/My Drive/{month}.keras')

In [None]:
# Echo the dataset tag used for the input file and the saved model name.
month