In [28]:
import math
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [29]:
SEGMENT = 4

In [30]:
df = pd.read_csv(f'../preprocess/all_segments/segment{SEGMENT}.csv')

df.dropna(inplace=True)
df

Unnamed: 0.1,Unnamed: 0,trip_id,deviceid,direction,segment,date,start_time,end_time,run_time_in_seconds,length,weather,date_in_week,hour,time_interval,part_of_day
0,3,1.0,262.0,1.0,4.0,2021-10-01,06:54:04,06:57:19,195.0,1.5513,1,Friday,6,6-7,morning
1,18,8.0,274.0,1.0,4.0,2021-10-01,08:55:19,08:58:19,180.0,1.5513,1,Friday,8,8-9,morning
2,33,10.0,123.0,1.0,4.0,2021-10-01,09:25:46,09:27:59,133.0,1.5513,1,Friday,9,9-10,morning
3,42,15.0,262.0,1.0,4.0,2021-10-01,10:37:31,10:40:19,168.0,1.5513,1,Friday,10,10-11,morning
4,57,21.0,274.0,1.0,4.0,2021-10-01,12:07:59,12:12:22,263.0,1.5513,1,Friday,12,12-13,morning
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6882,171192,25364.0,1166.0,1.0,4.0,2022-11-01,16:55:45,16:59:58,253.0,1.5500,1,Tuesday,16,16-17,afternoon
6883,171207,25366.0,1358.0,1.0,4.0,2022-11-01,17:14:34,17:18:48,254.0,1.5500,1,Tuesday,17,17-18,afternoon
6884,171220,25368.0,284.0,1.0,4.0,2022-11-01,18:04:53,18:08:40,227.0,1.5500,1,Tuesday,18,18-19,afternoon
6885,171235,25369.0,513.0,1.0,4.0,2022-11-01,18:21:19,18:25:24,245.0,1.5500,1,Tuesday,18,18-19,afternoon


In [31]:

df = pd.get_dummies(df, columns=['date_in_week'])
def label_encoding(x):
    help = {
        'Monday':2,
        'Tuesday':3,
        'Wednesday': 4,
        'Thursday': 5,
        'Friday': 6,
        'Saturday': 7,
        'Sunday': 1,
        'morning': 0,
        'afternoon': 1,
        'evening': 2
    }
    return help[x]

df['part_of_day'] = df['part_of_day'].apply(label_encoding)

df

Unnamed: 0.1,Unnamed: 0,trip_id,deviceid,direction,segment,date,start_time,end_time,run_time_in_seconds,length,...,hour,time_interval,part_of_day,date_in_week_Friday,date_in_week_Monday,date_in_week_Saturday,date_in_week_Sunday,date_in_week_Thursday,date_in_week_Tuesday,date_in_week_Wednesday
0,3,1.0,262.0,1.0,4.0,2021-10-01,06:54:04,06:57:19,195.0,1.5513,...,6,6-7,0,True,False,False,False,False,False,False
1,18,8.0,274.0,1.0,4.0,2021-10-01,08:55:19,08:58:19,180.0,1.5513,...,8,8-9,0,True,False,False,False,False,False,False
2,33,10.0,123.0,1.0,4.0,2021-10-01,09:25:46,09:27:59,133.0,1.5513,...,9,9-10,0,True,False,False,False,False,False,False
3,42,15.0,262.0,1.0,4.0,2021-10-01,10:37:31,10:40:19,168.0,1.5513,...,10,10-11,0,True,False,False,False,False,False,False
4,57,21.0,274.0,1.0,4.0,2021-10-01,12:07:59,12:12:22,263.0,1.5513,...,12,12-13,0,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6882,171192,25364.0,1166.0,1.0,4.0,2022-11-01,16:55:45,16:59:58,253.0,1.5500,...,16,16-17,1,False,False,False,False,False,True,False
6883,171207,25366.0,1358.0,1.0,4.0,2022-11-01,17:14:34,17:18:48,254.0,1.5500,...,17,17-18,1,False,False,False,False,False,True,False
6884,171220,25368.0,284.0,1.0,4.0,2022-11-01,18:04:53,18:08:40,227.0,1.5500,...,18,18-19,1,False,False,False,False,False,True,False
6885,171235,25369.0,513.0,1.0,4.0,2022-11-01,18:21:19,18:25:24,245.0,1.5500,...,18,18-19,1,False,False,False,False,False,True,False


In [32]:
# Extract the training and test data
data = df.values
X = data[:,[1,2,3,10,13,14,15,16,17,18,19,20]]
y = data[:,8]
X

array([[1.0, 262.0, 1.0, ..., False, False, False],
       [8.0, 274.0, 1.0, ..., False, False, False],
       [10.0, 123.0, 1.0, ..., False, False, False],
       ...,
       [25368.0, 284.0, 1.0, ..., False, True, False],
       [25369.0, 513.0, 1.0, ..., False, True, False],
       [25370.0, 121.0, 1.0, ..., False, True, False]], dtype=object)

In [33]:
# Tính Z-score cho mỗi cột trong dữ liệu X
z_scores = np.abs((X - X.mean()) / X.std())

# Định một ngưỡng Z-score tùy ý (ví dụ: 3.0) để xác định outlier
threshold = 3.0

# Loại bỏ các dòng chứa outlier
X_no_outliers = X[(z_scores < threshold).all(axis=1)]
y_no_outliers = y[(z_scores < threshold).all(axis=1)]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train

array([[2350.0, 250.0, 1.0, ..., False, True, False],
       [9578.0, 1143.0, 1.0, ..., False, False, False],
       [8725.0, 116.0, 1.0, ..., False, False, True],
       ...,
       [21813.0, 513.0, 1.0, ..., False, False, False],
       [22156.0, 1143.0, 1.0, ..., False, False, False],
       [2127.0, 274.0, 1.0, ..., False, False, False]], dtype=object)

In [34]:
# # Scale the data to be between 0 and 1
# #scaler = StandardScaler()
# scaler = MinMaxScaler()
# scaler.fit(X_train[:,[0,1,2,3,5]])
# X_train = scaler.transform(X_train[:,[0,1,2,3,5]])
# X_test = scaler.transform(X_test[:,[0,1,2,3,5]])

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_train


array([[0.09259332, 0.08359326, 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.37750798, 0.64067374, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.34388427, 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ],
       ...,
       [0.85978951, 0.24766064, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.87330995, 0.64067374, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.08380307, 0.09856519, 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [45]:
# Khởi tạo mô hình SVM với kernel tuyến tính và siêu tham số C mặc định
svm_regressor = SVR(kernel='linear', C=200, epsilon=23)
# Khởi tạo mô hình k-NN với số lân cận k (ví dụ: k=50)
# knn_regressor = KNeighborsRegressor(n_neighbors=20)
# Khởi tạo mô hình RandomForestRegressor với số cây (n_estimators) và các tham số khác
rf_regressor = RandomForestRegressor(n_estimators=50, random_state=42)


# Huấn luyện mô hình trên tập huấn luyện
rf_regressor.fit(X_train, y_train)
svm_regressor.fit(X_train, y_train)
# knn_regressor.fit(X_train, y_train)



# Dự đoán thời gian đến trên tập kiểm tra
y_pred_rf = rf_regressor.predict(X_test)
y_pred_svm = svm_regressor.predict(X_test)


# Đánh giá mô hình
mae_svm = mean_absolute_error(y_test, y_pred_svm)
mae_rf = mean_absolute_error(y_test, y_pred_rf)

mse_svm = mean_squared_error(y_test, y_pred_svm)
mse_rf = mean_squared_error(y_test, y_pred_rf)

r2_svm = r2_score(y_test, y_pred_svm)
r2_rf = r2_score(y_test, y_pred_rf)


print(f"Mean Absolute Error (SVM): {mae_svm}")
print(f"Mean Squared Error (SVM): {mse_svm}")
print(f"Root Mean Squared Error (SVM): {math.sqrt(mse_svm)}\n")

print(f"Mean Absolute Error (RF): {mae_rf}")
print(f"Mean Squared Error (RF): {mse_rf}")
print(f"Root Mean Squared Error (RF): {math.sqrt(mse_rf)}")

Mean Absolute Error (SVM): 33.060748138952256
Mean Squared Error (SVM): 2152.7074780672765
Root Mean Squared Error (SVM): 46.397278778687834

Mean Absolute Error (RF): 32.85833091436865
Mean Squared Error (RF): 2103.7281529753263
Root Mean Squared Error (RF): 45.866416395608304


In [36]:
# # # Try to get the lowest RMSE by using different Epsilon value
# epsilons = np.arange(10,30)
# scores = []
# for e in epsilons:
#     svm_regressor.set_params(epsilon=e)
#     svm_regressor.fit(X_train, y_train)
#     scores.append(math.sqrt(mean_squared_error(y_test, svm_regressor.predict(X_test))))
# plt.plot(epsilons, scores)
# plt.title("Epsilon effect")
# plt.xlabel("epsilon")
# plt.ylabel("RMSE")
# plt.show()