In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns # seaborn là thư viện được xây trên matplotlib, giúp việc visualization đỡ khổ hơn
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.ensemble import RandomForestRegressor

In [78]:
cars_df = pd.read_csv("cars_dataset.csv", sep = "\t", encoding='utf-8')

# Chuyển đổi dữ liệu từ dữ liệu thô

In [79]:
num_cols = ['price', 'length', 'height', 'width', 'weight', 'weightTotal', 'emissionsCO2', 'numberOfAxles',
            'numberOfDoors', 'numberOfForwardGears', 'seatingCapacity', 'cargoVolume', 'roofLoad', 
            'accelerationTime', 'fuelCapacity', 'fuelConsumption', 'speed', 'payload', 'trailerWeight', 
            'vEengineDisplacement', 'vEenginePower', 'torque']

In [80]:
cat_cols = []
for col in cars_df.columns:
    if not (col in num_cols):
        cat_cols.append(col)

In [81]:
print(len(num_cols), len(cat_cols))
print(num_cols, '\n', cat_cols)

22 12
['price', 'length', 'height', 'width', 'weight', 'weightTotal', 'emissionsCO2', 'numberOfAxles', 'numberOfDoors', 'numberOfForwardGears', 'seatingCapacity', 'cargoVolume', 'roofLoad', 'accelerationTime', 'fuelCapacity', 'fuelConsumption', 'speed', 'payload', 'trailerWeight', 'vEengineDisplacement', 'vEenginePower', 'torque'] 
 ['url', 'name', 'model', 'brand', 'eLabel', 'bodyType', 'modelDate', 'fuelType', 'vehicleTransmission', 'driveWheelConfiguration', 'vEengineType', 'vEfuelType']


### Xử lý các cột dữ liệu số

In [56]:
# copy ra df để xư lý
df = cars_df.copy()

In [57]:
# Loại bỏ các cột không liên quan
#df.drop(['url', 'name', 'model'], axis=1, inplace=True)

In [58]:
# fucntion convert cac thuoc tinh khac
def cvtFloat(x):
    if type(x) == str:
        temp = x.replace(',', '.').split()[0]
    else:
        temp = x
    val = None
    try:
        val = float(temp)
    except ValueError:
        return val
    return val

In [59]:
for el in num_cols:
    if el != 'cargoVolume':
        print(el)
        df[el] = df[el].apply(cvtFloat)

price
length
height
width
weight
weightTotal
emissionsCO2
numberOfAxles
numberOfDoors
numberOfForwardGears
seatingCapacity
roofLoad
accelerationTime
fuelCapacity
fuelConsumption
speed
payload
trailerWeight
vEengineDisplacement
vEenginePower
torque


In [60]:
# Hàm xử lý riêng cho cargoVolume
def cvtFloat_cargoVolume(x):
    temp = x.split()[0]
    temp = temp.replace('-', ' ')
    temp = temp.split()
    if len(temp) > 0:
        temp = temp[-1]
    else:
        temp = x
    val = None
    try:
        val = float(temp)
    except ValueError:
        return val
    return val

In [61]:
df['cargoVolume'] = df['cargoVolume'].apply(cvtFloat_cargoVolume)

In [62]:
df[num_cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84174 entries, 0 to 84173
Data columns (total 22 columns):
price                   83989 non-null float64
length                  84119 non-null float64
height                  84066 non-null float64
width                   84111 non-null float64
weight                  83795 non-null float64
weightTotal             83795 non-null float64
emissionsCO2            68428 non-null float64
numberOfAxles           84174 non-null float64
numberOfDoors           84174 non-null float64
numberOfForwardGears    82037 non-null float64
seatingCapacity         82228 non-null float64
cargoVolume             82921 non-null float64
roofLoad                70042 non-null float64
accelerationTime        81646 non-null float64
fuelCapacity            83966 non-null float64
fuelConsumption         82853 non-null float64
speed                   83242 non-null float64
payload                 82819 non-null float64
trailerWeight           79262 non-null float6

### Xử lý các cột dữ liệu categorize

In [63]:
for cat in cat_cols[:]:
    print(cat, len(cars_df[cat].unique()))

url 84173
name 44332
model 1678
brand 89
eLabel 9
bodyType 11
modelDate 52
fuelType 11
vehicleTransmission 1
driveWheelConfiguration 6
vEengineType 4
vEfuelType 11


* Có thể loại bỏ cột vehicleTransmission vì chỉ có 1 giá trị, không có ý nghĩa trong việc học.
* Cột fuelType và vEfuelType là giống nhau (do quá trình crawl nhóm không để ý), có thể drop cột fuelType.
* Các cột url, name, model có nhiều ý nghĩa, nên có thể loại bỏ.
* brand có thể xét vì có tới 89 giá trị (có khả năng sẽ có ý nghĩa với các brand có giá trị cao), modelDate cần xem xét.

**=> Số cột còn lại là: eLabel (9), bodyType (11), driveWheelConfiguration (6), vEengineType (4), vEfuelType (11).**

In [64]:
# chuẩn hóa cột modelDate
def norm_modelDate(x):
    if (x == 0):
        return None
    else:
        return str(x)
df['modelDate'] = df['modelDate'].apply(norm_modelDate)
df['modelDate'].unique()

array(['1999', '2018', '2019', '2015', '2016', '2017', '1991', '1995',
       '1984', '2009', '2012', '2011', '2010', '1979', '1980', '2001',
       '2004', '2003', '2002', '1994', '1981', '2007', '2008', '1983',
       '1982', '2005', '2006', '1998', '1993', '1990', '1985', '1986',
       '2013', '2014', '1997', None, '1996', '1988', '1992', '1987',
       '1989', '2000', '1976', '1978', '1977', '1972', '1975', '1974',
       '1973', '1970', '1971', '1969'], dtype=object)

In [65]:
df['driveWheelConfiguration'].unique()

array(['front+rear', 'front', 'rear', 'voor', 'voor+achter', 'achter'],
      dtype=object)

In [66]:
#df[['price', 'driveWheelConfiguration']].groupby('driveWheelConfiguration').mean()

In [67]:
df['bodyType'].unique()

array(['hatchback', 'stationwagon', 'suv/crossover', 'sedan', 'cabriolet',
       'coupe', 'mpv', 'van', nan, 'bus', 'pick-up'], dtype=object)

In [68]:
df['eLabel'].unique()

array(['G', 'C', 'E', 'D', 'B', 'F', 'N.A.', '-', 'A'], dtype=object)

In [69]:
df['vEengineType'].unique()

array(['dohc', 'ohc', 'ohv', nan], dtype=object)

In [70]:
df['vEfuelType'].unique()

array(['petrol', 'diesel', 'LPG', 'benzine', 'bio-ethanol', 'natural gas',
       nan, 'LPG / petrol', 'aardgas', 'aardgas / petrol',
       'petrol / bio-ethanol'], dtype=object)

* Cột driveWheelConfiguration không có giá trị lỗi ('N.A.', '-', ...)
* Các cột bodyType, vEengineType, vEfuelType có chứa nan (đã được xử lý).
* Cột eLabel có chứa các giá trị lỗi, cần được chuẩn hóa. Sau khi chuẩn hóa, dòng thiếu dữ liệu quá nhiều nên cần loại bỏ khi qua bước xử lý.

In [71]:
def norm_eLabel(x):
    if (x == 'N.A.' or x == '-'):
        return None
    else:
        return x

In [72]:
df['eLabel'] = df['eLabel'].apply(norm_eLabel)

In [73]:
df[cat_cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84174 entries, 0 to 84173
Data columns (total 12 columns):
url                        84174 non-null object
name                       84174 non-null object
model                      84174 non-null object
brand                      84174 non-null object
eLabel                     68428 non-null object
bodyType                   83418 non-null object
modelDate                  83418 non-null object
fuelType                   83999 non-null object
vehicleTransmission        84174 non-null object
driveWheelConfiguration    84174 non-null object
vEengineType               84001 non-null object
vEfuelType                 83999 non-null object
dtypes: object(12)
memory usage: 7.7+ MB


### Lưu ra file để tiện xử lý

In [74]:
# df.to_csv("cars_preprocessed_undrop.csv", sep = "\t", index=False, encoding='utf-8')

# Tiền xử lý dữ liệu

In [2]:
df_ori = pd.read_csv("train_data.csv", sep = "\t", encoding='utf-8')

In [3]:
df = df_ori.copy()

In [6]:
y = df['price']
X = df.drop(['price'], axis = 1)

In [7]:
# origin copy
X_ori = X.copy()
y_ori = y.copy()

In [8]:
train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2, random_state=0)

In [9]:
len(train_X), len(val_X)

(60472, 15118)

### Tạo pipeline

#### Multilabel Encoding

In [32]:
class MultilabelEncoding(BaseEstimator, TransformerMixin):
    def __init__(self, col_name):
        self.col_name = col_name
        self.type_list = set()
    def fit(self, X_df, y=None):
        types_sr = X_df[self.col_name].str.replace('[^\w]', ' ').str.split()
        type_sr.reset_index(inplace=True, drop=True)
        for i in range(types_sr.shape[0]):
            if type(types_sr[i]) is list:
                self.type_list.add(types_sr[i][0])
        self.type_list = list(self.type_list)
        return self
    def transform(self, X_df, y=None):
        transformed_df = X_df.copy()
        for i in range(len(self.type_list)):
            transformed_df[self.col_name + '_' + self.type_list[i]] = transformed_df[self.col_name].apply(lambda x: 1 if ((type(x) is str) and (self.type_list[i] in x)) else 0)
        transformed_df.drop(self.col_name, axis = 1)
        return transformed_df

#### col add and drop

In [11]:
class ColAdderDropper(BaseEstimator, TransformerMixin):
    def __init__(self, num_top_brands = 10):
        # TODO
        self.num_top_brands = num_top_brands
        self.dropped_cols = ['url', 'name', 'model', 'weightTotal', 'fuelType', 'vehicleTransmission', 'modelDate']
    def fit(self, X_df, y=None):
        brand_col = X_df.brand.str.extract(r'([a-zA-z]+)', expand=False)
        self.brand_counts_ = brand_col.value_counts()
        brands = list(self.brand_counts_.index)
        self.top_brands_ = brands[:max(1, min(self.num_top_brands, len(brands)))]
        return self
    def transform(self, X_df, y=None):
        df = X_df.copy()
        brand_col = df.brand.str.extract("([a-zA-z]+)", expand=False)
        brand_col[~brand_col.isin(self.top_brands_)] = 'Others'
        df["brand"] = brand_col
        df.drop(self.dropped_cols, axis=1, inplace=True)
        return df

In [12]:
cat_multi = ['vEfuelType', 'driveWheelConfiguration']
cat_single = ['brand', 'eLabel', 'bodyType', 'vEengineType']

In [13]:
col_adderdropper = ColAdderDropper()

In [14]:
num_cols = []
for col in X.columns:
    if (not col in cat_single) and (not col in cat_multi) and (not col in col_adderdropper.dropped_cols):
        num_cols.append(col)

In [15]:
len(num_cols), len(cat_single), len(cat_multi)

(20, 4, 2)

### pipeline cho preprocessing

In [34]:
imp_mean = SimpleImputer(strategy='mean') 
imp_mode = SimpleImputer(strategy='most_frequent')

encoding = OneHotEncoder(handle_unknown='ignore', sparse=False)
ME_0 = MultilabelEncoding(cat_multi[0])
ME_1 = MultilabelEncoding(cat_multi[1])

categorical_transformer = make_pipeline(imp_mode, encoding)
#cat_multi0 = make_pipeline(ME_0, imp_mode)
#cat_multi1 = make_pipeline(Mmp_mode)

colTransform = ColumnTransformer(transformers=[('numerical', imp_mean, num_cols),\
                                               ('categorical', categorical_transformer, cat_single),\
                                               ('cat_multi0', ME_0, [cat_multi[0]]),\
                                               ('cat_multi1', ME_1, [cat_multi[1]])])
colNormalize = StandardScaler()

preprocessing = make_pipeline(col_adderdropper, colTransform, colNormalize)

In [35]:
p = preprocessing.fit_transform(train_X)

ValueError: could not convert string to float: 'Mercedes'

### train và validate model

In [25]:
def train_and_val(full_pipeline, train_X, train_y, val_X, val_y):
    full_pipeline.fit(train_X, train_y);
    pred_y = full_pipeline.predict(val_X)
    train_score = full_pipeline.score(train_X, train_y)
    val_score = full_pipeline.score(val_X, val_y)
    print(train_score, val_score)
    print("MSE =", np.round(np.mean((pred_y - val_y) ** 2), 3))
    print("MAE =", np.round(np.mean(np.abs(pred_y - val_y)), 3))

### full pipeline với MLPRegressor

In [26]:
mlpregressor = MLPRegressor(hidden_layer_sizes=(32, 64, 64, 32,), solver='adam', learning_rate='adaptive', random_state=0, max_iter=500, verbose=1)
# preprocessing.set_params(coladderdropper__num_top_brands = 10)
full_pipeline = make_pipeline(preprocessing, mlpregressor)
#train_and_val(full_pipeline, train_X, train_y, val_X, val_y)

In [126]:
for n_brands in range(10, 95, 5):
    print("n_brands:", n_brands)
    preprocessing.set_params(coladderdropper__num_top_brands=n_brands)
    full_pipeline = make_pipeline(preprocessing, mlpregressor)
    train_and_val(full_pipeline, train_X, train_y, val_X, val_y)
    print()
'Finish!'

n_brands: 10
0.9771374278480797 0.4963322444258407
MSE = 456.332
MAE = 3.199

n_brands: 15
0.9831557429618785 0.38240738576010547
MSE = 559.55
MAE = 3.106

n_brands: 20
0.9790336282917528 0.5545734530074317
MSE = 403.564
MAE = 3.187

n_brands: 25
0.9804818494866387 0.4662344874076559
MSE = 483.601
MAE = 3.083

n_brands: 30
0.9825669142241719 0.4117452748093833
MSE = 532.969
MAE = 3.083

n_brands: 35
0.9835857647507742 0.3748699960307987
MSE = 566.379
MAE = 3.026

n_brands: 40
0.9838878679588565 0.601941717207078
MSE = 360.648
MAE = 2.945

n_brands: 45
0.9862259862317899 0.7054175174302216
MSE = 266.897
MAE = 2.847

n_brands: 50
0.9853996651364374 0.4273621931787086
MSE = 518.82
MAE = 2.833

n_brands: 55
0.9765862893270885 -0.06991600496647954
MSE = 969.363
MAE = 3.174

n_brands: 60
0.9851617319951368 0.5778284749921256
MSE = 382.495
MAE = 2.83

n_brands: 65
0.9820386704695752 -0.1750132204802899
MSE = 1064.583
MAE = 3.042

n_brands: 70
0.9822833329808499 0.7739139957406
MSE = 204.838
M

'Finish!'

In [50]:
full_pipeline.coef_

AttributeError: 'Pipeline' object has no attribute 'coef_'

### full pipeline với RandomForestRegressor

In [81]:
rfregressor = RandomForestRegressor(n_estimators=512, random_state=0, verbose=1)
full_pipeline = make_pipeline(preprocessing, rfregressor)
train_and_val(full_pipeline, train_X, train_y, val_X, val_y)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 512 out of 512 | elapsed: 14.7min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 512 out of 512 | elapsed:   14.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 512 out of 512 | elapsed:   11.8s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 512 out of 512 | elapsed:    3.2s finished


0.992041403636497 0.9764326316382476
MSE = 21.352
MAE = 2.446


### Chọn hyper-parameter tùy vào regressor

In [89]:
train_scores = []
val_scores = []
alphas = [0.1, 1, 10, 100, 1000]
best_val_score = -float('inf'); best_alpha = None;
for alpha in alphas:
    full_pipeline.set_params(mlpregressor__alpha=alpha)
    full_pipeline.fit(train_X, train_y)
    full_pipeline.predict(val_X)
    train_score = full_pipeline.score(train_X, train_y)
    val_score = full_pipeline.score(val_X, val_y)
    train_scores.append(train_score)
    val_scores.append(val_score)
    if best_val_score < val_score:
        best_val_score = val_score
        best_alpha = alpha
'Finish!'

'Finish!'

### Test

In [79]:
#full_pipeline.set_params(mlpregressor__alpha=best_alpha)
pred_y = full_pipeline.predict(test_X)

In [80]:
pred_y

array([86.27785847, 19.04360568, 21.83718231, ..., 21.72945061,
       23.92988487, 35.20590858])

In [81]:
np.array(test_y)

array([89.697, 17.93 , 23.25 , ..., 23.195, 22.89 , 38.89 ])

In [82]:
np.mean(np.abs(pred_y - test_y))

2.8049753121555656

In [76]:
np.mean(np.abs(pred_y - test_y))

3.035190445811691

In [83]:
np.mean((pred_y - test_y) ** 2)

23.289880708895822

In [77]:
np.mean((pred_y - test_y) ** 2)

28.014193729943894