Bước 1: Nhập các Thư Viện Cần Thiết

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from pytorch_tabnet.tab_model import TabNetRegressor
import torch


Bước 2: Chuẩn bị Dữ liệu

In [2]:
flights = pd.read_csv('../data/flights.csv')
flights=flights.sample(n=100000)

  flights = pd.read_csv('../data/flights.csv')


In [3]:
variables_to_remove=["YEAR","FLIGHT_NUMBER","TAIL_NUMBER","DEPARTURE_TIME","TAXI_OUT","WHEELS_OFF","ELAPSED_TIME","AIR_TIME","WHEELS_ON","TAXI_IN","ARRIVAL_TIME","DIVERTED","CANCELLED","CANCELLATION_REASON","AIR_SYSTEM_DELAY", "SECURITY_DELAY","AIRLINE_DELAY","LATE_AIRCRAFT_DELAY","WEATHER_DELAY","SCHEDULED_TIME","SCHEDULED_ARRIVAL"]
flights.drop(variables_to_remove,axis=1,inplace= True)
flights.columns

Index(['MONTH', 'DAY', 'DAY_OF_WEEK', 'AIRLINE', 'ORIGIN_AIRPORT',
       'DESTINATION_AIRPORT', 'SCHEDULED_DEPARTURE', 'DEPARTURE_DELAY',
       'DISTANCE', 'ARRIVAL_DELAY'],
      dtype='object')

In [4]:
# Đọc dữ liệu aiports.csv
airport = pd.read_csv('../data/airports.csv')

In [5]:
flights.loc[~flights.ORIGIN_AIRPORT.isin(airport.IATA_CODE.values),'ORIGIN_AIRPORT']='OTHER'
flights.loc[~flights.DESTINATION_AIRPORT.isin(airport.IATA_CODE.values),'DESTINATION_AIRPORT']='OTHER'

In [6]:
flights=flights.dropna() # xóa giá trị NaN

In [7]:
# tiền xử lý dữ liệu thay thế các ngày tương ứng
df=pd.DataFrame(flights)
df['DAY_OF_WEEK']= df['DAY_OF_WEEK'].apply(str)
df["DAY_OF_WEEK"].replace({"1":"SUNDAY", "2": "MONDAY", "3": "TUESDAY", "4":"WEDNESDAY", "5":"THURSDAY", "6":"FRIDAY", "7":"SATURDAY"},inplace=True)

In [8]:
dums = ['AIRLINE','ORIGIN_AIRPORT','DESTINATION_AIRPORT','DAY_OF_WEEK']
df_cat=pd.get_dummies(df[dums],drop_first=True,dtype=int)

In [9]:
var_to_remove=["DAY_OF_WEEK","AIRLINE","ORIGIN_AIRPORT","DESTINATION_AIRPORT"]
df.drop(var_to_remove,axis=1,inplace=True)

In [10]:
data=pd.concat([df,df_cat],axis=1)

In [11]:
X = data.drop('DEPARTURE_DELAY', axis=1)
Y = data['DEPARTURE_DELAY']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


Bước 3: Định dạng lại dữ liệu mục tiêu cho phù hợp với yêu cầu TabNet

In [12]:
y_train = y_train.values.reshape(-1, 1)
y_test = y_test.values.reshape(-1, 1)


Bước 4: Khởi tạo và Huấn luyện Mô hình TabNet

In [13]:
# Khởi tạo mô hình TabNetRegressor
model = TabNetRegressor(optimizer_fn=torch.optim.Adam,
                        optimizer_params=dict(lr=0.02),
                        scheduler_params={"step_size":10, "gamma":0.9},
                        scheduler_fn=torch.optim.lr_scheduler.StepLR)



In [14]:
model.fit(
    X_train.values, y_train,
    eval_set=[(X_test.values, y_test)],
    eval_name=['test'],
    eval_metric=['mae', 'rmse','mse'],
    max_epochs=10,
    patience=10,
    batch_size=1024, 
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False
)

epoch 0  | loss: 876.84593| test_mae: 8.15766 | test_rmse: 13.98941| test_mse: 195.7036|  0:00:37s
epoch 1  | loss: 280.47017| test_mae: 8.08657 | test_rmse: 13.02075| test_mse: 169.54001|  0:01:13s
epoch 2  | loss: 320.12166| test_mae: 8.2239  | test_rmse: 13.4835 | test_mse: 181.80477|  0:01:52s
epoch 3  | loss: 320.33221| test_mae: 7.48663 | test_rmse: 11.05019| test_mse: 122.10662|  0:02:36s
epoch 4  | loss: 221.31714| test_mae: 7.48886 | test_rmse: 10.78343| test_mse: 116.28228|  0:03:21s
epoch 5  | loss: 177.29139| test_mae: 7.22455 | test_rmse: 10.50927| test_mse: 110.44467|  0:04:05s
epoch 6  | loss: 160.6365| test_mae: 7.5044  | test_rmse: 11.27602| test_mse: 127.1486|  0:04:50s
epoch 7  | loss: 147.06896| test_mae: 7.96041 | test_rmse: 12.56783| test_mse: 157.95036|  0:05:37s
epoch 8  | loss: 150.90709| test_mae: 7.45517 | test_rmse: 10.86721| test_mse: 118.09619|  0:06:22s
epoch 9  | loss: 159.11408| test_mae: 7.66782 | test_rmse: 11.78614| test_mse: 138.91313|  0:07:07s
Sto



Bước 5: Dự đoán trên tập kiểm tra

In [15]:
p = model.predict(X_test.values)

In [16]:
r2 = r2_score(y_test, p)
rmse = mean_squared_error(y_test, p, squared=False)
mse = mean_squared_error(y_test, p, squared=True)
mae = mean_absolute_error(y_test, p)

print(f'R-squared: {r2}')
print(f'RMSE: {rmse}')
print(f'MSE: {mse}')
print(f'MAE: {mae}')


R-squared: 0.909154287605707
RMSE: 10.50926591619011
MSE: 110.44467009719513
MAE: 7.224551809469061




In [18]:
zz = pd.DataFrame({'Actual': y_test.flatten(), 'Predicted': p.flatten()})
zz


Unnamed: 0,Actual,Predicted
0,24.0,38.790401
1,3.0,0.402541
2,-2.0,0.000000
3,38.0,20.405792
4,-5.0,0.000000
...,...,...
19621,-1.0,0.000000
19622,11.0,0.000000
19623,3.0,0.000000
19624,-3.0,0.000000
