In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns # seaborn là thư viện được xây trên matplotlib, giúp việc visualization đỡ khổ hơn
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.ensemble import RandomForestRegressor

In [None]:
cars_df = pd.read_csv("cars_dataset.csv", sep = "\t", encoding='utf-8')

# Chuyển đổi dữ liệu từ dữ liệu thô

In [None]:
num_cols = ['price', 'length', 'height', 'width', 'weight', 'weightTotal', 'emissionsCO2', 'numberOfAxles',
            'numberOfDoors', 'numberOfForwardGears', 'seatingCapacity', 'cargoVolume', 'roofLoad', 
            'accelerationTime', 'fuelCapacity', 'fuelConsumption', 'speed', 'payload', 'trailerWeight', 
            'vEengineDisplacement', 'vEenginePower', 'torque']

In [None]:
cat_cols = []
for col in cars_df.columns:
    if not (col in num_cols):
        cat_cols.append(col)

In [None]:
print(len(num_cols), len(cat_cols))
print(num_cols, '\n', cat_cols)

### Xử lý các cột dữ liệu số

In [None]:
# copy ra df để xư lý
df = cars_df.copy()

In [None]:
# fucntion convert cac thuoc tinh khac
def cvtFloat(x):
    if type(x) == str:
        temp = x.replace(',', '.').split()[0]
    else:
        temp = x
    val = None
    try:
        val = float(temp)
    except ValueError:
        return val
    return val

In [None]:
for el in num_cols:
    if el != 'cargoVolume':
        print(el)
        df[el] = df[el].apply(cvtFloat)

In [None]:
# Hàm xử lý riêng cho cargoVolume
def cvtFloat_cargoVolume(x):
    temp = x.split()[0]
    temp = temp.replace('-', ' ')
    temp = temp.split()
    if len(temp) > 0:
        temp = temp[-1]
    else:
        temp = x
    val = None
    try:
        val = float(temp)
    except ValueError:
        return val
    return val

In [None]:
df['cargoVolume'] = df['cargoVolume'].apply(cvtFloat_cargoVolume)

In [None]:
df[num_cols].info()

### Xử lý các cột dữ liệu categorize

In [None]:
for cat in cat_cols[:]:
    print(cat, len(cars_df[cat].unique()))

* Có thể loại bỏ cột vehicleTransmission vì chỉ có 1 giá trị, không có ý nghĩa trong việc học.
* Cột fuelType và vEfuelType là giống nhau (do quá trình crawl nhóm không để ý), có thể drop cột fuelType.
* Các cột url, name, model có nhiều ý nghĩa, nên có thể loại bỏ.
* brand có thể xét vì có tới 89 giá trị (có khả năng sẽ có ý nghĩa với các brand có giá trị cao), modelDate cần xem xét.

**=> Số cột còn lại là: eLabel (9), bodyType (11), driveWheelConfiguration (6), vEengineType (4), vEfuelType (11).**

In [None]:
# chuẩn hóa cột modelDate
def norm_modelDate(x):
    if (x == 0):
        return None
    else:
        return str(x)
df['modelDate'] = df['modelDate'].apply(norm_modelDate)
df['modelDate'].unique()

In [None]:
df['driveWheelConfiguration'].unique()

In [None]:
#df[['price', 'driveWheelConfiguration']].groupby('driveWheelConfiguration').mean()

In [None]:
df['bodyType'].unique()

In [None]:
df['eLabel'].unique()

In [None]:
df['vEengineType'].unique()

In [None]:
df['vEfuelType'].unique()

* Cột driveWheelConfiguration không có giá trị lỗi ('N.A.', '-', ...)
* Các cột bodyType, vEengineType, vEfuelType có chứa nan (đã được xử lý).
* Cột eLabel có chứa các giá trị lỗi, cần được chuẩn hóa. Sau khi chuẩn hóa, dòng thiếu dữ liệu quá nhiều nên cần loại bỏ khi qua bước xử lý.

In [None]:
def norm_eLabel(x):
    if (x == 'N.A.' or x == '-'):
        return None
    else:
        return x

In [None]:
df['eLabel'] = df['eLabel'].apply(norm_eLabel)

In [None]:
df[cat_cols].info()

### Lưu ra file để tiện xử lý

In [None]:
# df.to_csv("cars_preprocessed_undrop.csv", sep = "\t", index=False, encoding='utf-8')

# Tiền xử lý dữ liệu

In [2]:
df_ori = pd.read_csv("train_data.csv", sep = "\t", encoding='utf-8')

In [3]:
df = df_ori.copy()

In [4]:
y = df['price']
X = df.drop(['price'], axis = 1)

In [5]:
# origin copy
X_ori = X.copy()
y_ori = y.copy()

In [6]:
train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2, random_state=0)

In [7]:
len(train_X), len(val_X)

(60472, 15118)

In [31]:
temp = train_X.copy()

In [38]:
temp['vEfuelType'].unique()

array(['petrol', 'diesel', 'LPG', 'bio-ethanol', nan, 'natural gas',
       'benzine', 'aardgas / petrol', 'aardgas', 'LPG / petrol',
       'petrol / bio-ethanol'], dtype=object)

In [70]:
type_list = set()

    
type_sr = temp['vEfuelType'].str.replace('[/+]', ' ').str.split()
def func(x):
    if type(x) is list:
        type_list.update(x)
type_sr.apply(func)
type_list

{'LPG',
 'aardgas',
 'benzine',
 'bio-ethanol',
 'diesel',
 'gas',
 'natural',
 'petrol'}

### Tạo pipeline

#### Multilabel Encoding

In [76]:
def f(x, l):
    if type(x) is list:
        l.update(x)
class MultilabelEncoding(BaseEstimator, TransformerMixin):
    def __init__(self, col_name):
        self.col_name = col_name
        self.type_list = set()
    def fit(self, X_df, y=None):
        types_sr = X_df[self.col_name].str.replace('[/+]', ' ').str.split()
        types_sr.apply(f, args=(self.type_list,))
        self.type_list = list(self.type_list)
        return self
    def transform(self, X_df, y=None):
        transformed_df = X_df.copy()
        for i in range(len(self.type_list)):
            transformed_df[self.col_name + '_' + self.type_list[i]] = transformed_df[self.col_name].apply(lambda x: 1 if ((type(x) is str) and (self.type_list[i] in x)) else 0)
        transformed_df.drop(self.col_name, axis=1, inplace=True)
        return transformed_df

#### col add and drop

In [72]:
class ColAdderDropper(BaseEstimator, TransformerMixin):
    def __init__(self, num_top_brands = 10):
        # TODO
        self.num_top_brands = num_top_brands
        self.dropped_cols = ['url', 'name', 'model', 'weightTotal', 'fuelType', 'vehicleTransmission', 'modelDate']
    def fit(self, X_df, y=None):
        brand_col = X_df.brand.str.extract(r'([a-zA-z]+)', expand=False)
        self.brand_counts_ = brand_col.value_counts()
        brands = list(self.brand_counts_.index)
        self.top_brands_ = brands[:max(1, min(self.num_top_brands, len(brands)))]
        return self
    def transform(self, X_df, y=None):
        df = X_df.copy()
        brand_col = df.brand.str.extract("([a-zA-z]+)", expand=False)
        brand_col[~brand_col.isin(self.top_brands_)] = 'Others'
        df["brand"] = brand_col
        df.drop(self.dropped_cols, axis=1, inplace=True)
        return df

In [10]:
cat_multi = ['vEfuelType', 'driveWheelConfiguration']
cat_single = ['brand', 'eLabel', 'bodyType', 'vEengineType']

In [11]:
col_adderdropper = ColAdderDropper()

In [12]:
num_cols = []
for col in X.columns:
    if (not col in cat_single) and (not col in cat_multi) and (not col in col_adderdropper.dropped_cols):
        num_cols.append(col)

In [13]:
len(num_cols), len(cat_single), len(cat_multi)

(20, 4, 2)

### pipeline cho preprocessing

In [77]:
imp_mean = SimpleImputer(strategy='mean') 
imp_mode = SimpleImputer(strategy='most_frequent')

encoding = OneHotEncoder(handle_unknown='ignore', sparse=False)
vEfuelType_encoder = MultilabelEncoding('vEfuelType')
driveWheelConfig_encoder = MultilabelEncoding('driveWheelConfiguration')

categorical_transformer = make_pipeline(imp_mode, encoding)

colTransform = ColumnTransformer(transformers=[('numerical', imp_mean, num_cols),\
                                               ('categorical', categorical_transformer, cat_single),
                                               ('vEfuelType', vEfuelType_encoder, ['vEfuelType']),
                                               ('driveWheelConfig', driveWheelConfig_encoder, ['driveWheelConfiguration'])])

colNormalize = StandardScaler()

#preprocessing = make_pipeline(col_adderdropper, colTransform, colNormalize)

### train và validate model

In [84]:
def train_and_val(full_pipeline, train_X, train_y, val_X, val_y):
    full_pipeline.fit(train_X, train_y);
    #print("n_brands:", len(col_adderdropper.top_brands_))
    pred_y = full_pipeline.predict(val_X)
    train_score = full_pipeline.score(train_X, train_y)
    val_score = full_pipeline.score(val_X, val_y)
    print(train_score, val_score)
    print("MSE =", np.round(np.mean((pred_y - val_y) ** 2), 3))
    print("MAE =", np.round(np.mean(np.abs(pred_y - val_y)), 3))
    return train_score, val_score

### full pipeline với MLPRegressor

In [86]:
mlpregressor = MLPRegressor(hidden_layer_sizes=(128, 256, 256, 128, ), solver='adam', learning_rate='adaptive'\
                            ,random_state=0, max_iter=500, early_stopping=False, verbose=1)

# preprocessing.set_params(coladderdropper__num_top_brands = 100)

full_pipeline = make_pipeline(col_adderdropper, colTransform, colNormalize, mlpregressor)

full_pipeline.set_params(coladderdropper__num_top_brands = 100)

train_and_val(full_pipeline, train_X, train_y, val_X, val_y)

Iteration 1, loss = 135.22053926
Iteration 2, loss = 47.06151951
Iteration 3, loss = 40.72841616
Iteration 4, loss = 37.59122228
Iteration 5, loss = 34.09809402
Iteration 6, loss = 32.26771227
Iteration 7, loss = 32.08256134
Iteration 8, loss = 29.97067353
Iteration 9, loss = 28.12729006
Iteration 10, loss = 24.30484254
Iteration 11, loss = 28.13428605
Iteration 12, loss = 22.79060809
Iteration 13, loss = 23.23534896
Iteration 14, loss = 24.95457520
Iteration 15, loss = 18.42730958
Iteration 16, loss = 23.32014577
Iteration 17, loss = 17.62998306
Iteration 18, loss = 17.64785341
Iteration 19, loss = 17.54144185
Iteration 20, loss = 16.37988419
Iteration 21, loss = 16.30041557
Iteration 22, loss = 15.92875516
Iteration 23, loss = 14.66517204
Iteration 24, loss = 14.75445453
Iteration 25, loss = 17.70168423
Iteration 26, loss = 14.82008038
Iteration 27, loss = 13.50990349
Iteration 28, loss = 12.81675808
Iteration 29, loss = 11.77376615
Iteration 30, loss = 11.90233630
Iteration 31, loss

(0.9801582900562297, 0.7180321752236754)

In [80]:
len(col_adderdropper.top_brands_)

87

In [81]:
mlpregressor = MLPRegressor(hidden_layer_sizes=(16, 32, 64, 64, 32, 16, ), solver='adam', learning_rate='adaptive'\
                            ,random_state=0, max_iter=500, early_stopping=True, verbose=0)

full_pipeline = make_pipeline(col_adderdropper, colTransform, colNormalize, mlpregressor)

In [83]:
mlpregressor = MLPRegressor(hidden_layer_sizes=(16, 32, 64, 64, 32, 16, ), solver='adam', learning_rate='adaptive'\
                            ,random_state=0, max_iter=500, early_stopping=True)
for n_brands in range(10, 95, 5):
    print('.')
    full_pipeline.set_params(coladderdropper__num_top_brands=n_brands)
    train_and_val(full_pipeline, train_X, train_y, val_X, val_y)
    print()
'Finish!'

n_brands: 10
0.9288349180654218 0.7038676272867647
MSE = 268.301
MAE = 4.181

n_brands: 15
0.9743537993693123 0.6729681857059386
MSE = 296.297
MAE = 3.327

n_brands: 20
0.9707413272311217 0.5961798633177005
MSE = 365.868
MAE = 3.479

n_brands: 25
0.9661561424118765 0.6930479393998731
MSE = 278.104
MAE = 3.573

n_brands: 30
0.9685471320405159 0.5422445951124799
MSE = 414.734
MAE = 3.506

n_brands: 35
0.9708504913781262 0.34738018365784296
MSE = 591.285
MAE = 3.48

n_brands: 40
0.9673899866376643 0.6248246909915811
MSE = 339.915
MAE = 3.537

n_brands: 45
0.9723446048487259 0.5886804806431734
MSE = 372.663
MAE = 3.559

n_brands: 50
0.9726382009578769 0.36684338926492666
MSE = 573.651
MAE = 3.441

n_brands: 55
0.9622277134541068 0.7792741241967922
MSE = 199.982
MAE = 3.455

n_brands: 60
0.9663933914430549 0.4550759592660548
MSE = 493.711
MAE = 3.58

n_brands: 65
0.9749038787644124 0.6637803958545977
MSE = 304.621
MAE = 3.208

n_brands: 70
0.9732689744269162 0.5785846686747914
MSE = 381.81


'Finish!'

In [None]:
full_pipeline.coef_

### full pipeline với RandomForestRegressor

In [None]:
rfregressor = RandomForestRegressor(n_estimators=512, random_state=0, verbose=1)
full_pipeline = make_pipeline(preprocessing, rfregressor)
train_and_val(full_pipeline, train_X, train_y, val_X, val_y)

### Chọn hyper-parameter tùy vào regressor

In [None]:
train_scores = []
val_scores = []
alphas = [0.1, 1, 10, 100, 1000]
best_val_score = -float('inf'); best_alpha = None;
for alpha in alphas:
    full_pipeline.set_params(mlpregressor__alpha=alpha)
    full_pipeline.fit(train_X, train_y)
    full_pipeline.predict(val_X)
    train_score = full_pipeline.score(train_X, train_y)
    val_score = full_pipeline.score(val_X, val_y)
    train_scores.append(train_score)
    val_scores.append(val_score)
    if best_val_score < val_score:
        best_val_score = val_score
        best_alpha = alpha
'Finish!'

### Test

In [None]:
#full_pipeline.set_params(mlpregressor__alpha=best_alpha)
pred_y = full_pipeline.predict(test_X)

In [None]:
pred_y

In [None]:
np.array(test_y)

In [None]:
np.mean(np.abs(pred_y - test_y))

In [None]:
np.mean(np.abs(pred_y - test_y))

In [None]:
np.mean((pred_y - test_y) ** 2)

In [None]:
np.mean((pred_y - test_y) ** 2)