In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns # seaborn là thư viện được xây trên matplotlib, giúp việc visualization đỡ khổ hơn
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.ensemble import RandomForestRegressor

In [2]:
cars_df = pd.read_csv("cars_dataset.csv", sep = "\t", encoding='utf-8')

# Chuyển đổi dữ liệu từ dữ liệu thô

In [4]:
num_cols = ['price', 'length', 'height', 'width', 'weight', 'weightTotal', 'emissionsCO2', 'numberOfAxles',
            'numberOfDoors', 'numberOfForwardGears', 'seatingCapacity', 'cargoVolume', 'roofLoad', 
            'accelerationTime', 'fuelCapacity', 'fuelConsumption', 'speed', 'payload', 'trailerWeight', 
            'vEengineDisplacement', 'vEenginePower', 'torque']

In [5]:
# Every column of the raw frame that is not listed in num_cols is treated as categorical.
cat_cols = [col for col in cars_df.columns if col not in num_cols]

In [6]:
print(len(num_cols), len(cat_cols))
print(num_cols, '\n', cat_cols)

22 12
['price', 'length', 'height', 'width', 'weight', 'weightTotal', 'emissionsCO2', 'numberOfAxles', 'numberOfDoors', 'numberOfForwardGears', 'seatingCapacity', 'cargoVolume', 'roofLoad', 'accelerationTime', 'fuelCapacity', 'fuelConsumption', 'speed', 'payload', 'trailerWeight', 'vEengineDisplacement', 'vEenginePower', 'torque'] 
 ['url', 'name', 'model', 'brand', 'eLabel', 'bodyType', 'modelDate', 'fuelType', 'vehicleTransmission', 'driveWheelConfiguration', 'vEengineType', 'vEfuelType']


### Xử lý các cột dữ liệu số

In [7]:
# Work on a copy so the frame loaded into cars_df is left untouched.
df = cars_df.copy()

In [8]:
# Loại bỏ các cột không liên quan
#df.drop(['url', 'name', 'model'], axis=1, inplace=True)

In [10]:
# Generic converter used for the numeric columns.
def cvtFloat(x):
    """Parse the leading number of a raw cell into a float.

    For strings, decimal commas are replaced by dots and only the first
    whitespace-separated token is parsed (anything after it is ignored).
    Non-string values are handed to ``float`` directly, so an existing float
    (including NaN) comes back unchanged.

    Returns:
        The parsed float, or None when the value is None, a blank string, or
        otherwise not convertible.
    """
    if isinstance(x, str):
        tokens = x.replace(',', '.').split()
        if not tokens:
            # Blank string: indexing the first token would raise IndexError.
            return None
        candidate = tokens[0]
    else:
        candidate = x
    try:
        return float(candidate)
    except (TypeError, ValueError):
        # TypeError covers None and other objects float() cannot accept.
        return None

In [11]:
# Parse every numeric column except cargoVolume, which is converted by its own function.
for col_name in num_cols:
    if col_name == 'cargoVolume':
        continue
    print(col_name)
    df[col_name] = df[col_name].apply(cvtFloat)

price
length
height
width
weight
weightTotal
emissionsCO2
numberOfAxles
numberOfDoors
numberOfForwardGears
seatingCapacity
roofLoad
accelerationTime
fuelCapacity
fuelConsumption
speed
payload
trailerWeight
vEengineDisplacement
vEenginePower
torque


In [12]:
# Dedicated converter for the cargoVolume column.
def cvtFloat_cargoVolume(x):
    """Parse a cargo-volume cell into a float, taking the upper end of a range.

    Only the first whitespace-separated token of a string is considered. If
    that token contains '-' (e.g. ``"300-1200"``), the part after the last
    dash is parsed. Non-string values are handed to ``float`` directly.

    Returns:
        The parsed float, or None when the value is None, a blank string, or
        otherwise not convertible.
    """
    if not isinstance(x, str):
        # Non-strings (e.g. NaN or None) have no .split(); convert them directly.
        try:
            return float(x)
        except (TypeError, ValueError):
            return None

    tokens = x.split()
    if not tokens:
        # Blank string: there is no first token to inspect.
        return None

    bounds = tokens[0].replace('-', ' ').split()
    # A token made only of dashes leaves no bounds; fall back to the raw text,
    # which then fails the float conversion and yields None.
    candidate = bounds[-1] if bounds else x
    try:
        return float(candidate)
    except ValueError:
        return None

In [13]:
df['cargoVolume'] = df['cargoVolume'].apply(cvtFloat_cargoVolume)

In [72]:
df[num_cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84174 entries, 0 to 84173
Data columns (total 22 columns):
price                   83989 non-null float64
length                  84119 non-null float64
height                  84066 non-null float64
width                   84111 non-null float64
weight                  83795 non-null float64
weightTotal             83795 non-null float64
emissionsCO2            68428 non-null float64
numberOfAxles           84174 non-null float64
numberOfDoors           84174 non-null float64
numberOfForwardGears    82037 non-null float64
seatingCapacity         82228 non-null float64
cargoVolume             82921 non-null float64
roofLoad                70042 non-null float64
accelerationTime        81646 non-null float64
fuelCapacity            83966 non-null float64
fuelConsumption         82853 non-null float64
speed                   83242 non-null float64
payload                 82819 non-null float64
trailerWeight           79262 non-null float6

### Xử lý các cột dữ liệu categorize

In [19]:
for cat in cat_cols[:]:
    print(cat, len(cars_df[cat].unique()))

url 84173
name 44332
model 1678
brand 89
eLabel 9
bodyType 11
modelDate 52
fuelType 11
vehicleTransmission 1
driveWheelConfiguration 6
vEengineType 4
vEfuelType 11


* Có thể loại bỏ cột vehicleTransmission vì chỉ có 1 giá trị, không có ý nghĩa trong việc học.
* Cột fuelType và vEfuelType là giống nhau (do quá trình crawl nhóm không để ý), có thể drop cột fuelType.
* Các cột url, name, model không có nhiều ý nghĩa cho việc học (số giá trị phân biệt quá lớn), nên có thể loại bỏ.
* brand có thể xét vì có tới 89 giá trị (có khả năng sẽ có ý nghĩa với các brand có giá trị cao), modelDate cần xem xét.

**=> Số cột còn lại là: eLabel (9), bodyType (11), driveWheelConfiguration (6), vEengineType (4), vEfuelType (11).**

In [76]:
# chuẩn hóa cột modelDate
def norm_modelDate(x):
    """Return None for a value equal to 0; otherwise return the value rendered as a string."""
    return None if x == 0 else str(x)
df['modelDate'] = df['modelDate'].apply(norm_modelDate)
df['modelDate'].unique()

array(['1999', '2018', '2019', '2015', '2016', '2017', '1991', '1995',
       '1984', '2009', '2012', '2011', '2010', '1979', '1980', '2001',
       '2004', '2003', '2002', '1994', '1981', '2007', '2008', '1983',
       '1982', '2005', '2006', '1998', '1993', '1990', '1985', '1986',
       '2013', '2014', '1997', None, '1996', '1988', '1992', '1987',
       '1989', '2000', '1976', '1978', '1977', '1972', '1975', '1974',
       '1973', '1970', '1971', '1969'], dtype=object)

In [39]:
df['driveWheelConfiguration'].unique()

array(['front+rear', 'front', 'rear', 'voor', 'voor+achter', 'achter'],
      dtype=object)

In [48]:
#df[['price', 'driveWheelConfiguration']].groupby('driveWheelConfiguration').mean()

In [35]:
df['bodyType'].unique()

array(['hatchback', 'stationwagon', 'suv/crossover', 'sedan', 'cabriolet',
       'coupe', 'mpv', 'van', nan, 'bus', 'pick-up'], dtype=object)

In [37]:
df['eLabel'].unique()

array(['G', 'C', 'E', 'D', 'B', 'F', 'N.A.', '-', 'A'], dtype=object)

In [50]:
df['vEengineType'].unique()

array(['dohc', 'ohc', 'ohv', nan], dtype=object)

In [52]:
df['vEfuelType'].unique()

array(['petrol', 'diesel', 'LPG', 'benzine', 'bio-ethanol', 'natural gas',
       nan, 'LPG / petrol', 'aardgas', 'aardgas / petrol',
       'petrol / bio-ethanol'], dtype=object)

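* Cột driveWheelConfiguration không có giá trị lỗi ('N.A.', '-', ...), nhưng có các giá trị tiếng Hà Lan trùng nghĩa với giá trị tiếng Anh ('voor' = 'front', 'achter' = 'rear', 'voor+achter' = 'front+rear') chưa được gộp lại.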
* Các cột bodyType, vEengineType, vEfuelType có chứa nan (đã được xử lý).
* Cột eLabel có chứa các giá trị lỗi, cần được chuẩn hóa. Sau khi chuẩn hóa, dòng thiếu dữ liệu quá nhiều nên cần loại bỏ khi qua bước xử lý.

In [67]:
def norm_eLabel(x):
    """Return None for the markers 'N.A.' and '-'; return any other value unchanged."""
    if x != 'N.A.' and x != '-':
        return x
    return None

In [68]:
df['eLabel'] = df['eLabel'].apply(norm_eLabel)

In [77]:
df[cat_cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84174 entries, 0 to 84173
Data columns (total 12 columns):
url                        84174 non-null object
name                       84174 non-null object
model                      84174 non-null object
brand                      84174 non-null object
eLabel                     68428 non-null object
bodyType                   83418 non-null object
modelDate                  83418 non-null object
fuelType                   83999 non-null object
vehicleTransmission        84174 non-null object
driveWheelConfiguration    84174 non-null object
vEengineType               84001 non-null object
vEfuelType                 83999 non-null object
dtypes: object(12)
memory usage: 7.7+ MB


### Lưu ra file để tiện xử lý

In [78]:
df.to_csv("cars_preprocessed_undrop.csv", sep = "\t", index=False, encoding='utf-8')

# Tiền xử lý dữ liệu

In [4]:
df_ori = pd.read_csv("cars_preprocessed_undrop.csv", sep = "\t", encoding='utf-8')

In [3]:
# Tách các dong null ra để làm tập test với model sau này so với thực tế.
# df_ori[df['price'].isnull()].to_csv('test_without_price.csv', sep = "\t", index=False, encoding='utf-8')

In [5]:
df = df_ori.copy()

In [6]:
# Keep only the rows whose price is present.
df = df[df['price'].notnull()]

In [9]:
# Scratch check: list.extend appends the items of its argument to the list in place.
a = [1, 2, 3]
a.extend([2, 3])
a

[1, 2, 3, 2, 3]

In [300]:
# Bỏ các cột đã đề cập ở trên
# df.drop(['url', 'name', 'model', 'weightTotal', 'fuelType', 'vehicleTransmission', 'modelDate'], axis = 1,inplace = True)

In [10]:
class ColAdderDropper(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes a configurable set of columns.

    Parameters
    ----------
    dropped_cols : sequence of str, optional
        Column labels removed by ``transform``. The default is the set of
        columns this notebook discards. The value is stored unmodified on the
        instance so that ``get_params``/``set_params`` and
        ``sklearn.base.clone`` can see and reproduce it.
    """

    def __init__(self, dropped_cols=('url', 'name', 'model', 'weightTotal', 'fuelType',
                                     'vehicleTransmission', 'modelDate')):
        # The default is a tuple (not a list) to avoid a shared mutable default argument.
        self.dropped_cols = dropped_cols

    def fit(self, X_df, y=None):
        """Learn nothing; return self so the object can be used inside a Pipeline."""
        return self

    def transform(self, X_df, y=None):
        """Return a new DataFrame without ``dropped_cols``; ``X_df`` itself is not modified.

        Raises KeyError (from pandas) when a listed column is absent.
        """
        # list(...) matters: pandas would interpret a tuple as one single label.
        return X_df.drop(list(self.dropped_cols), axis=1)

In [68]:
df.columns

Index(['url', 'name', 'model', 'brand', 'price', 'eLabel', 'bodyType',
       'length', 'height', 'width', 'weight', 'weightTotal', 'emissionsCO2',
       'modelDate', 'fuelType', 'numberOfAxles', 'numberOfDoors',
       'numberOfForwardGears', 'seatingCapacity', 'vehicleTransmission',
       'cargoVolume', 'roofLoad', 'accelerationTime',
       'driveWheelConfiguration', 'fuelCapacity', 'fuelConsumption', 'speed',
       'payload', 'trailerWeight', 'vEengineType', 'vEfuelType',
       'vEengineDisplacement', 'vEenginePower', 'torque'],
      dtype='object')

### Trước tiên thử loại bỏ hết các dòng có chứa giá trị null

In [302]:
#for col in df.columns:
#    df = df[df[col].isnull() == False]

In [12]:
len(df.index)

83989

In [13]:
y = df['price']
X = df.drop(['price'], axis = 1)

In [14]:
# origin copy
X_ori = X.copy()
y_ori = y.copy()

In [15]:
len(X), len(y)

(83989, 83989)

In [16]:
# Tách tập train + val và tập test theo tỉ lệ 90%:10%
X_, test_X, y_, test_y = train_test_split(X_ori, y_ori, test_size=0.1, random_state=0)

In [308]:
#df_test = test_X.copy()
#df_test['price'] = test_y.copy().round(3)

In [19]:
train_X, val_X, train_y, val_y = train_test_split(X_, y_, test_size=0.2, random_state=0)

In [20]:
len(train_X), len(val_X)

(60472, 15118)

### Tạo pipeline

In [23]:
col_adderdropper = ColAdderDropper()

In [33]:
col_adderdropper.dropped_cols;

In [29]:
# Split the columns that survive ColAdderDropper by dtype:
# object-dtype columns are categorical, everything else is numeric.
cat_cols = [col for col in X_.columns
            if col not in col_adderdropper.dropped_cols and X_[col].dtype == 'O']
num_cols = [col for col in X_.columns
            if col not in col_adderdropper.dropped_cols and not (X_[col].dtype == 'O')]

In [30]:
len(num_cols), len(cat_cols)

(20, 6)

In [66]:
cat_cols, num_cols

(['brand',
  'eLabel',
  'bodyType',
  'driveWheelConfiguration',
  'vEengineType',
  'vEfuelType'],
 ['length',
  'height',
  'width',
  'weight',
  'emissionsCO2',
  'numberOfAxles',
  'numberOfDoors',
  'numberOfForwardGears',
  'seatingCapacity',
  'cargoVolume',
  'roofLoad',
  'accelerationTime',
  'fuelCapacity',
  'fuelConsumption',
  'speed',
  'payload',
  'trailerWeight',
  'vEengineDisplacement',
  'vEenginePower',
  'torque'])

### pipeline cho preprocessing

In [40]:
# Numeric columns: fill missing values with the column mean.
imp_mean = SimpleImputer(strategy='mean')
# Categorical columns: fill missing values with the most frequent category.
imp_mode = SimpleImputer(strategy='most_frequent')

# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4.
# Try the current keyword first and fall back for older installations, so the
# cell runs on either side of the rename.
try:
    encoding = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    encoding = OneHotEncoder(handle_unknown='ignore', sparse=False)

categorical_transformer = make_pipeline(imp_mode, encoding)

# Impute the numeric columns; impute then one-hot encode the categorical ones.
colTransform = ColumnTransformer(transformers=[
    ('numerical', imp_mean, num_cols),
    ('categorical', categorical_transformer, cat_cols),
])
colNormalize = StandardScaler()

# Order: drop unused columns -> per-type imputation/encoding -> standardize everything.
preprocessing = make_pipeline(col_adderdropper, colTransform, colNormalize)

### train và validate model

In [41]:
def train_and_val(full_pipeline, train_X, train_y, val_X, val_y):
    """Fit the pipeline on the training split and print its scores and errors.

    Prints the pipeline's ``score`` on the training and validation data on one
    line, followed by the validation MSE and MAE, each rounded to 3 decimals.
    Nothing is returned.
    """
    full_pipeline.fit(train_X, train_y)
    val_pred = full_pipeline.predict(val_X)

    # Training score is evaluated first, then the validation score.
    scores = (full_pipeline.score(train_X, train_y), full_pipeline.score(val_X, val_y))
    print(*scores)

    residuals = val_pred - val_y
    print("MSE =", np.round(np.mean(residuals ** 2), 3))
    print("MAE =", np.round(np.mean(np.abs(residuals)), 3))

### full pipeline với MLPRegressor

In [69]:
# MLP with four hidden layers (32-64-64-32) and a fixed seed; verbose=1 prints the loss per iteration.
mlpregressor = MLPRegressor(
    hidden_layer_sizes=(32, 64, 64, 32),
    solver='adam',
    learning_rate='adaptive',
    random_state=0,
    max_iter=500,
    verbose=1,
)

# Chain the shared preprocessing steps with the regressor, then fit and report.
full_pipeline = make_pipeline(preprocessing, mlpregressor)

train_and_val(full_pipeline, train_X, train_y, val_X, val_y)

Iteration 1, loss = 229.44849337
Iteration 2, loss = 59.49141069
Iteration 3, loss = 47.17302827
Iteration 4, loss = 41.31768226
Iteration 5, loss = 39.28661956
Iteration 6, loss = 37.10548543
Iteration 7, loss = 34.65520703
Iteration 8, loss = 33.56284655
Iteration 9, loss = 31.55660596
Iteration 10, loss = 30.40952277
Iteration 11, loss = 29.87287085
Iteration 12, loss = 30.29118029
Iteration 13, loss = 28.12559662
Iteration 14, loss = 28.19782518
Iteration 15, loss = 26.08834241
Iteration 16, loss = 24.83734456
Iteration 17, loss = 24.72649574
Iteration 18, loss = 24.33778683
Iteration 19, loss = 23.68393142
Iteration 20, loss = 21.55771609
Iteration 21, loss = 22.66899942
Iteration 22, loss = 20.66578082
Iteration 23, loss = 20.33237498
Iteration 24, loss = 20.21758749
Iteration 25, loss = 18.82353425
Iteration 26, loss = 18.53370885
Iteration 27, loss = 18.81333674
Iteration 28, loss = 17.93155389
Iteration 29, loss = 17.93152126
Iteration 30, loss = 16.95296120
Iteration 31, loss

### full pipeline với RandomForestRegressor

In [81]:
# Random forest with 512 trees and a fixed seed; verbose=1 reports progress for each fit/predict call.
rfregressor = RandomForestRegressor(
    n_estimators=512,
    random_state=0,
    verbose=1,
)
# Note: this rebinds full_pipeline, replacing the pipeline built in the MLP cell.
full_pipeline = make_pipeline(preprocessing, rfregressor)
train_and_val(full_pipeline, train_X, train_y, val_X, val_y)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 512 out of 512 | elapsed: 14.7min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 512 out of 512 | elapsed:   14.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 512 out of 512 | elapsed:   11.8s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 512 out of 512 | elapsed:    3.2s finished


0.992041403636497 0.9764326316382476
MSE = 21.352
MAE = 2.446


### Chọn hyper-parameter tùy vào regressor

In [89]:
# Search the MLP regularization strength `alpha` using the validation split.
#
# A dedicated pipeline is built here: `full_pipeline` is rebound to the
# random-forest pipeline in the preceding cell, and that pipeline has no
# `mlpregressor` step, so `set_params(mlpregressor__alpha=...)` on it raises
# ValueError when the notebook is run top to bottom.
# Note: set_params below mutates the shared `mlpregressor` object; after the
# loop it holds the last alpha tried, not necessarily `best_alpha`.
alpha_pipeline = make_pipeline(preprocessing, mlpregressor)

alphas = [0.1, 1, 10, 100, 1000]
train_scores = []
val_scores = []
best_val_score = -float('inf')
best_alpha = None
for alpha in alphas:
    alpha_pipeline.set_params(mlpregressor__alpha=alpha)
    alpha_pipeline.fit(train_X, train_y)
    train_score = alpha_pipeline.score(train_X, train_y)
    val_score = alpha_pipeline.score(val_X, val_y)
    train_scores.append(train_score)
    val_scores.append(val_score)
    # Keep the alpha with the highest validation score seen so far.
    if best_val_score < val_score:
        best_val_score = val_score
        best_alpha = alpha
'Finish!'

'Finish!'

### Test

In [82]:
# NOTE(review): the recorded output below reports 512 estimators, so `full_pipeline`
# here appears to be the random-forest pipeline; the MLP alpha line stays disabled
# because that pipeline has no `mlpregressor` step.
#full_pipeline.set_params(mlpregressor__alpha=best_alpha)
pred_y = full_pipeline.predict(test_X)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 512 out of 512 | elapsed:    2.1s finished


In [83]:
pred_y

array([84.87904136, 19.15797231, 21.40738565, ..., 21.98797525,
       24.22413128, 34.17408183])

In [84]:
np.array(test_y)

array([89.697, 17.93 , 23.25 , ..., 23.195, 22.89 , 38.89 ])

In [85]:
np.mean(np.abs(pred_y - test_y))

2.48961187763616