In [None]:
import pandas as pd
import glob
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
# Temporary: only a single order file is loaded for now.
first_order_file = './A03_data/training_data/order_data/order_data_2016-01-01'
data = pd.read_csv(first_order_file, header=None)

In [None]:
# Load Data

# Paths of everything in the order-data directory. They are only collected
# here; reading and concatenating all of them is not enabled yet.
files = glob.glob("A03_data/training_data/order_data/*")


def _split_tab_fields(frame, column_names):
    """Split column 0 of `frame` on tabs and label the resulting columns."""
    fields = frame.iloc[:, 0].str.split("\t", expand=True)
    fields.columns = column_names
    return fields


# Column 0 of the previously loaded order frame is split on tabs into named fields.
data = _split_tab_fields(
    data,
    [
        "order_id",
        "driver_id",
        "passenger_id",
        "start_region_hash",
        "dest_region_hash",
        "Price",
        "Time",
    ],
)

# The cluster map gets the same treatment and yields the hash -> id lookup table.
region = _split_tab_fields(
    pd.read_csv("A03_data/training_data/cluster_map/cluster_map", header=None),
    ["region_hash", "region_id"],
)

In [None]:
# Display the region lookup table (columns: region_hash, region_id).
region

In [None]:
# Display the parsed order records.
data

In [None]:
# Build an all-zero scaffold with one row per (region id, time slot) pair.
region_ids = region["region_id"]

# Each region id appears 144 times in a row ...
region_list = region_ids.repeat(144).tolist()
# ... paired with slots 0..143, cycled once per region.
time_slots = [slot for _ in range(len(region_ids)) for slot in range(144)]

df = pd.DataFrame(
    {
        "region_id": region_list,
        "time_slot": time_slots,
        "orders_count": 0,
        "unfulfilled_orders_count": 0,
        "gap": 0,
    }
)

df

In [None]:
# Temporary: keep just the first 10000 rows.
data = data.iloc[:10000]

In [None]:
# Attach a region id and a time slot to every order, then aggregate the orders
# per (start region, time slot).

# Lookup table: region hash -> region id.
hash_to_region_id = region.set_index("region_hash")["region_id"]

# Parse the order timestamps once so hour/minute come from vectorized accessors
# instead of a per-row Python lambda.
order_time = pd.to_datetime(data["Time"])

# `assign` returns a new frame that is rebound to `data`: the cell can be re-run
# safely and nothing is written into a slice of an earlier frame
# (which is what triggers SettingWithCopyWarning).
data = data.assign(
    start_region_id=data["start_region_hash"].map(hash_to_region_id),
    # Minutes since midnight floor-divided by 10 -> slot 0 .. 143.
    time_slot=((order_time.dt.hour * 60 + order_time.dt.minute) // 10).astype("int64"),
)

# Count, per start region and time slot, all orders and the orders without a
# driver (driver_id is the literal string "NULL").
# NOTE(review): orders whose start hash is missing from `region` get a NaN region
# id and are dropped by groupby's default dropna=True - confirm this is intended.
df = (
    data.assign(is_unfulfilled=data["driver_id"].eq("NULL"))
    .groupby(["start_region_id", "time_slot"])
    .agg(
        orders_count=("order_id", "size"),
        unfulfilled_orders_count=("is_unfulfilled", "sum"),
    )
    .reset_index()
)

# Gap = demand - supply = all orders - answered orders, i.e. exactly the number
# of orders that never got a driver.
df["gap"] = df["unfulfilled_orders_count"]

df

In [None]:
# Model Training
# Inputs: start region id and time slot; target: the gap.
# `astype` returns a new, fully numeric frame, so:
#   * X is independent of `df` (no SettingWithCopyWarning, no float-into-int
#     column assignment), and
#   * the region ids, which are strings after parsing, are converted explicitly
#     instead of relying on scikit-learn's implicit coercion.
# `time_slot` is used as-is: it is already a number, and passing it through
# pd.to_datetime would treat it as an epoch-nanosecond timestamp.
X = df[["start_region_id", "time_slot"]].astype("float64")  # input data
y = df["gap"]  # output data

# Split the data into training and testing sets (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Create a linear regression model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

In [None]:
# Predict the target for every row of the held-out test split.
y_pred = model.predict(X_test)
y_pred

In [None]:
# Print the fitted coefficient next to the name of its input column.
print('Model Coefficients:')
coefficient_pairs = zip(X_train.columns, model.coef_)
for feature, coef in coefficient_pairs:
    print(f'{feature}: {coef}')

In [None]:
# Scatter plot of the gap against the time slot.
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.scatter(df['time_slot'], df['gap'])
ax.set_xlabel('Time Slot')
ax.set_ylabel('Gap')
ax.set_title('Time Slot vs Gap')
plt.show()