# Data Preparation

In [1]:
from data_preprocessor import DataPreprocessor
import pandas as pd

In [2]:
# Relative path to the car dataset CSV; read by load_dataframe() below.
filepath = '../car-dataset.csv'

def load_dataframe(filepath):
    """Read the CSV file at ``filepath`` and return it as a pandas DataFrame."""
    frame = pd.read_csv(filepath)
    return frame


def shuffle_rows(dataframe, random_state=2):
    """Return a copy of ``dataframe`` with its rows in shuffled order.

    Args:
        dataframe: The pandas DataFrame to shuffle. It is not modified.
        random_state: Seed forwarded to ``DataFrame.sample``. The default
            of 2 reproduces the ordering this notebook has always used.

    Returns:
        A new DataFrame holding every input row exactly once, with the
        index reset to 0..n-1 (the previous index is discarded).
    """
    # frac=1 samples all rows without replacement, i.e. a full permutation.
    shuffled = dataframe.sample(frac=1, random_state=random_state)
    return shuffled.reset_index(drop=True)

def get_car_data(ratio=(0.8, 0.1, 0.1)):
    """Load the car dataset, shuffle it, and split it into train/val/test.

    The CSV at the module-level ``filepath`` is read with ``load_dataframe``
    and shuffled with ``shuffle_rows``. The ``Price`` column is the target;
    all remaining columns are the features.

    Args:
        ratio: ``(train, val, test)`` fractions of the rows. They must be
            non-negative and sum to 1. The train and val sizes are
            ``int(len(data) * fraction)``; the test split receives every
            remaining row, so truncation never drops a row.

    Returns:
        ``(x_train, y_train, x_val, y_val, x_test, y_test)``. The ``x_*``
        values are pandas DataFrames and the ``y_*`` values are pandas Series.

    Raises:
        ValueError: If ``ratio`` does not contain exactly three non-negative
            fractions that sum to 1.
    """
    # Validate before touching the disk so a bad ratio fails fast. Without
    # this check the test fraction would be silently ignored, because the
    # test split is simply "whatever is left".
    if len(ratio) != 3:
        raise ValueError(
            f"ratio must contain exactly 3 fractions (train, val, test), got {len(ratio)}"
        )
    train_frac, val_frac, _test_frac = ratio
    if min(ratio) < 0 or abs(sum(ratio) - 1.0) > 1e-6:
        raise ValueError(
            f"ratio fractions must be non-negative and sum to 1, got {tuple(ratio)}"
        )

    # The training set is large, so an 8:1:1 split is used by default.
    data = load_dataframe(filepath)
    data = shuffle_rows(data)

    data_x = data.drop(columns=['Price'])
    data_y = data['Price']

    train_end = int(len(data) * train_frac)
    val_end = train_end + int(len(data) * val_frac)

    # Positional slicing; the three splits are contiguous and non-overlapping.
    x_train = data_x.iloc[:train_end]
    y_train = data_y.iloc[:train_end]

    x_val = data_x.iloc[train_end:val_end]
    y_val = data_y.iloc[train_end:val_end]

    x_test = data_x.iloc[val_end:]
    y_test = data_y.iloc[val_end:]

    return x_train, y_train, x_val, y_val, x_test, y_test

# Load the full dataset and shuffle it in one step.
# NOTE(review): no visible cell reads `df` afterwards; get_car_data() loads
# and shuffles the file again on its own.
df = shuffle_rows(load_dataframe(filepath))

# Build the train / validation / test splits with the default ratio.
(x_train, y_train,
 x_val, y_val,
 x_test, y_test) = get_car_data()

In [3]:
# Create the preprocessing helper imported from data_preprocessor.
Prep = DataPreprocessor()

# Run fit_transform_pipeline on the training split only.
# NOTE(review): x_train / y_train are rebound to the pipeline output, so
# re-running this cell without first re-running the split cell passes
# already-processed data back into the pipeline.
x_train, y_train = Prep.fit_transform_pipeline(x_train, y_train)

In [4]:
# Apply transform_pipeline to the validation and test splits with the same
# Prep instance that was used on the training split.
# NOTE(review): drop_nan is True for validation but False for test; confirm
# that this difference is intentional.
x_val, y_val = Prep.transform_pipeline(x_val, y_val, drop_nan=True)
x_test, y_test = Prep.transform_pipeline(x_test, y_test, drop_nan=False)

# Model

In [5]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the search. Every entry lists exactly one value,
# so the grid contains a single candidate configuration.
param_grid_xgb = dict(
    n_estimators=[750],
    max_depth=[4],
    learning_rate=[0.05],
    reg_alpha=[11],
    reg_lambda=[2],
)

# Base regressor with a fixed random_state.
xgb_model = XGBRegressor(random_state=42)

# 3-fold cross-validated search, one job at a time, with per-fit logging.
grid_search_xgb = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid_xgb,
    cv=3,
    n_jobs=1,
    verbose=2,
)
grid_search_xgb.fit(x_train, y_train)

# Keep the best estimator found by the search for the following cells.
model = grid_search_xgb.best_estimator_


Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] END learning_rate=0.05, max_depth=4, n_estimators=750, reg_alpha=11, reg_lambda=2; total time=   4.4s
[CV] END learning_rate=0.05, max_depth=4, n_estimators=750, reg_alpha=11, reg_lambda=2; total time=   4.4s
[CV] END learning_rate=0.05, max_depth=4, n_estimators=750, reg_alpha=11, reg_lambda=2; total time=   4.5s


In [6]:
# Show the selected estimator as this cell's output.
model

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [7]:
# Predict on the training and validation features with the selected model.
# NOTE(review): no later visible cell reads these predictions.
y_train_pred_xgb = model.predict(x_train)
y_val_pred_xgb = model.predict(x_val)

# Triển khai mô hình

## Lưu các lựa chọn phân loại

In [8]:
# Print the categorical choices exposed by the preprocessor.
# NOTE(review): the section heading says "save", but this cell only prints;
# confirm whether the choices should also be written to disk.
print(Prep.get_categorical_choices())

{'seat-num': [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 17.0, 'Other'], 'maker': ['Abarth', 'Alfa Romeo', 'Aston Martin', 'Audi', 'BMW', 'Bentley', 'Brooke', 'Bugatti', 'Cadillac', 'Caterham', 'Chevrolet', 'Chrysler', 'Citroen', 'Corvette', 'DAX', 'DS', 'Dacia', 'Daewoo', 'Daihatsu', 'Daimler', 'Dodge', 'Ferrari', 'Fiat', 'Ford', 'GMC', 'Great Wall', 'Honda', 'Hummer', 'Hyundai', 'Infiniti', 'Isuzu', 'Jaguar', 'Jeep', 'Kia', 'Koenigsegg', 'Lamborghini', 'Land Rover', 'Lexus', 'Lincoln', 'London Taxis International', 'Lotus', 'MEV', 'MG', 'MINI', 'Maserati', 'Maybach', 'Mazda', 'McLaren', 'Mercedes-Benz', 'Mitsubishi', 'Morgan', 'Nissan', 'Noble', 'Opel', 'Perodua', 'Peugeot', 'Pilgrim', 'Porsche', 'Proton', 'Renault', 'Reva', 'Rolls-Royce', 'Rover', 'SEAT', 'SKODA', 'Saab', 'Santana', 'Sebring', 'Smart', 'Ssangyong', 'Subaru', 'Suzuki', 'TVR', 'Tesla', 'Toyota', 'Vauxhall', 'Volkswagen', 'Volvo', 'Westfield', 'Zenos', 'Other'], 'genmodel': {'Abarth': ['595', '124 Spider', '595C', 'O

## Lưu pipeline

In [9]:
import pickle

# Persist the trained XGBoost model.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# Persist the preprocessor object next to it.
with open('preprocessor.pkl', 'wb') as preprocessor_file:
    pickle.dump(Prep, preprocessor_file)

