In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns # seaborn là thư viện được xây trên matplotlib, giúp việc visualization đỡ khổ hơn
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.ensemble import RandomForestRegressor

# Tiền xử lý dữ liệu

In [95]:
# NOTE(review): absolute, machine-specific path; consider deriving it from a
# configurable data directory so the notebook runs on other machines.
TRAIN_CSV_PATH = '/home/hawliet/Documents/Data-science-project/data/cars/train_data.csv'
# The file is tab-separated despite its .csv extension.
train_df = pd.read_csv(TRAIN_CSV_PATH, sep='\t', encoding='utf-8')
train_df.head(2)

Unnamed: 0,url,name,model,brand,price,eLabel,bodyType,length,height,width,...,fuelCapacity,fuelConsumption,speed,payload,trailerWeight,vEengineType,vEfuelType,vEengineDisplacement,vEenginePower,torque
0,http://www.cars-data.com//en/mitsubishi-pajero...,Mitsubishi Pajero Pinin Long Body 1.8 GL,Mitsubishi Pajero Pinin Long Body,Mitsubishi,23.745,,suv/crossover,4035.0,1700.0,1695.0,...,53.0,9.3,155.0,525.0,1500.0,dohc,petrol,1834.0,84.0,160.0
1,http://www.cars-data.com//en/mercedes-e-500-pr...,Mercedes E 500 Prestige,Mercedes-Benz E-class,Mercedes,107.05,G,sedan,4879.0,1474.0,1854.0,...,80.0,8.9,250.0,655.0,2100.0,dohc,petrol,4663.0,300.0,600.0


In [96]:
# Work on a copy so that `train_df` keeps the data exactly as it was loaded.
df = train_df.copy()

In [97]:
# Keep only the rows that have a price (the regression target).
df = df[df['price'].notnull()]

In [100]:
class ColAdderDropper(BaseEstimator, TransformerMixin):
    """Transformer that removes a configurable set of columns from a DataFrame.

    Parameters
    ----------
    dropped_cols : list of str, optional
        Column labels removed by `transform`. When omitted, a copy of
        `DEFAULT_DROPPED_COLS` is used, so `ColAdderDropper()` behaves as before.

    Attributes
    ----------
    dropped_cols : list of str
        The labels that `transform` removes.
    """

    DEFAULT_DROPPED_COLS = ('url', 'name', 'model', 'weightTotal', 'fuelType',
                            'vehicleTransmission', 'modelDate')

    def __init__(self, dropped_cols=None):
        # TODO: the original code left an unspecified TODO here (presumably the
        # "adder" half of the class name) -- confirm what was intended.
        if dropped_cols is None:
            # Fresh list per instance so instances never share mutable state.
            dropped_cols = list(self.DEFAULT_DROPPED_COLS)
        self.dropped_cols = dropped_cols

    def fit(self, X_df, y=None):
        """Nothing is learned from the data; returns `self`."""
        return self

    def transform(self, X_df, y=None):
        """Return a new DataFrame without `dropped_cols`; `X_df` is not modified.

        Raises
        ------
        KeyError
            If any label in `dropped_cols` is missing from `X_df`.
        """
        # list(...) so that a tuple is treated as several labels, not one label.
        return X_df.drop(columns=list(self.dropped_cols))

### Multilabel Encoding

In [176]:
class MultilabelEcoding(BaseEstimator, TransformerMixin):
    """Binary indicator encoding for a string column whose cells may name several labels.

    `fit` learns label names from the column `col_name`: every non-word
    character is replaced by a space, the cell is split on whitespace, and the
    first token of each non-empty cell is recorded. `transform` adds one 0/1
    column per learned label, named `<col_name>_<label>`, and drops the
    original column.

    Parameters
    ----------
    col_name : str
        Name of the column to encode.

    Attributes
    ----------
    type_list : list of str
        Labels learned by `fit`, in sorted order (empty before fitting).
    """

    def __init__(self, col_name):
        self.col_name = col_name
        self.type_list = []

    def fit(self, X_df, y=None):
        """Learn the label vocabulary from `X_df[col_name]` and return `self`."""
        # Raw string + explicit regex=True: the default of `regex` differs
        # between pandas versions, and without it the pattern is taken literally.
        tokens_sr = X_df[self.col_name].str.replace(r'[^\w]', ' ', regex=True).str.split()
        # NOTE(review): only the first token of each cell is recorded, so a label
        # that never appears first gets no indicator column -- confirm intended.
        found = set()
        for tokens in tokens_sr:
            # Missing values stay NaN (not a list); cells without any word
            # character split into an empty list. Both are skipped.
            if isinstance(tokens, list) and tokens:
                found.add(tokens[0])
        # Built from a local set so that calling fit again works, and sorted so
        # the order of the generated columns is the same on every run.
        self.type_list = sorted(found)
        return self

    def transform(self, X_df, y=None):
        """Return a copy of `X_df` with indicator columns replacing `col_name`.

        An indicator is 1 when the cell is a string containing the label as a
        substring, and 0 otherwise (including missing values).
        """
        transformed_df = X_df.copy()
        source_sr = transformed_df[self.col_name]
        for label in self.type_list:
            transformed_df[self.col_name + '_' + label] = source_sr.apply(
                lambda x, label=label: 1 if (isinstance(x, str) and label in x) else 0
            )
        return transformed_df.drop(self.col_name, axis=1)

In [102]:
# Show the column labels of the price-filtered frame.
df.columns

Index(['url', 'name', 'model', 'brand', 'price', 'eLabel', 'bodyType',
       'length', 'height', 'width', 'weight', 'weightTotal', 'emissionsCO2',
       'modelDate', 'fuelType', 'numberOfAxles', 'numberOfDoors',
       'numberOfForwardGears', 'seatingCapacity', 'vehicleTransmission',
       'cargoVolume', 'roofLoad', 'accelerationTime',
       'driveWheelConfiguration', 'fuelCapacity', 'fuelConsumption', 'speed',
       'payload', 'trailerWeight', 'vEengineType', 'vEfuelType',
       'vEengineDisplacement', 'vEenginePower', 'torque'],
      dtype='object')

### Trước tiên thử loại bỏ hết các dòng có chứa giá trị null

In [104]:
# Number of rows left after removing the rows without a price.
len(df.index)

75590

In [105]:
# Features are every column except the target; the target is `price`.
X = df.drop(columns=['price'])
y = df['price']

In [177]:
# Pristine copies of the features and target, used as the input of the split.
X_ori, y_ori = X.copy(), y.copy()

In [107]:
# Sanity check: features and target should have the same number of rows.
len(X), len(y)

(75590, 75590)

In [183]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split stable.
train_X, val_X, train_y, val_y = train_test_split(
    X_ori,
    y_ori,
    test_size=0.2,
    random_state=0,
)

In [179]:
# Row counts of the training and validation parts.
len(train_X), len(val_X)

(60472, 15118)

### Tạo pipeline

In [180]:
# Column labels available to the preprocessing pipeline.
train_X.columns

Index(['url', 'name', 'model', 'brand', 'eLabel', 'bodyType', 'length',
       'height', 'width', 'weight', 'weightTotal', 'emissionsCO2', 'modelDate',
       'fuelType', 'numberOfAxles', 'numberOfDoors', 'numberOfForwardGears',
       'seatingCapacity', 'vehicleTransmission', 'cargoVolume', 'roofLoad',
       'accelerationTime', 'driveWheelConfiguration', 'fuelCapacity',
       'fuelConsumption', 'speed', 'payload', 'trailerWeight', 'vEengineType',
       'vEfuelType', 'vEengineDisplacement', 'vEenginePower', 'torque'],
      dtype='object')

In [184]:
# First pipeline step: removes the columns listed in its `dropped_cols`.
col_adderdropper = ColAdderDropper()

In [185]:
# The trailing semicolon suppresses the cell output.
col_adderdropper.dropped_cols;

In [186]:
# Partition the feature columns by dtype: object columns are categorical,
# everything else is numerical. Columns removed by `col_adderdropper` and the
# multi-label columns (which get their own encoder) are left out of both lists.
# The training frame is inspected here; the original looped over `X_`, a name
# that no cell in this notebook defines.
cat_multi = ['vEfuelType', 'driveWheelConfiguration']
excluded_cols = set(col_adderdropper.dropped_cols) | set(cat_multi)
cat_cols, num_cols = [], []
for col in train_X.columns:
    if col in excluded_cols:
        continue
    if train_X[col].dtype == 'O':
        cat_cols.append(col)
    else:
        num_cols.append(col)

In [187]:
# Explicit list of the single-label categorical columns that are one-hot encoded.
cat_cols = ['brand', 'eLabel', 'bodyType', 'vEengineType']

In [188]:
# How many numerical and categorical columns were selected.
len(num_cols), len(cat_cols)

(20, 4)

In [64]:
# Display the selected column names.
cat_cols, num_cols

(['brand', 'eLabel', 'bodyType', 'vEengineType'],
 ['length',
  'height',
  'width',
  'weight',
  'emissionsCO2',
  'numberOfAxles',
  'numberOfDoors',
  'numberOfForwardGears',
  'seatingCapacity',
  'cargoVolume',
  'roofLoad',
  'accelerationTime',
  'fuelCapacity',
  'fuelConsumption',
  'speed',
  'payload',
  'trailerWeight',
  'vEengineDisplacement',
  'vEenginePower',
  'torque'])

### pipeline cho preprocessing

In [189]:
# Imputers: column mean for numerical features, most frequent value for
# categorical ones.
imp_mean = SimpleImputer(strategy='mean')
imp_mode = SimpleImputer(strategy='most_frequent')

# One indicator encoder per multi-label column.
vEfuelType_encoder = MultilabelEcoding('vEfuelType')
driveWheelConfig_encoder = MultilabelEcoding('driveWheelConfiguration')

# Dense one-hot output is needed because StandardScaler (with mean centering)
# runs afterwards. scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4
# removed the old name, so try the new keyword first and fall back to the old.
try:
    encoding = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    encoding = OneHotEncoder(handle_unknown='ignore', sparse=False)

categorical_transformer = make_pipeline(imp_mode, encoding)

# Each group of columns gets its own transformer; columns that are not listed
# in any group are discarded by ColumnTransformer's default `remainder='drop'`.
colTransform = ColumnTransformer(transformers=[
    ('numerical', imp_mean, num_cols),
    ('categorical', categorical_transformer, cat_cols),
    ('vEfuelType', vEfuelType_encoder, ['vEfuelType']),
    ('driveWheelConfig', driveWheelConfig_encoder, ['driveWheelConfiguration']),
])
colNormalize = StandardScaler()

# Order: drop unused columns -> impute/encode per column group -> standardize.
preprocessing = make_pipeline(col_adderdropper, colTransform, colNormalize)

### train và validate model

In [190]:
def train_and_val(full_pipeline, train_X, train_y, val_X, val_y):
    """Fit `full_pipeline` on the training data and print evaluation figures.

    Prints, in order: the pipeline's `score()` on the training and validation
    sets (one line), then the mean squared error and the mean absolute error of
    the validation predictions, each rounded to 3 decimals. Returns nothing.
    """
    full_pipeline.fit(train_X, train_y)
    val_pred = full_pipeline.predict(val_X)
    # Tuple elements are evaluated left to right: training score first.
    scores = (full_pipeline.score(train_X, train_y), full_pipeline.score(val_X, val_y))
    print(*scores)
    residuals = val_pred - val_y
    print("MSE =", np.round(np.mean(residuals ** 2), 3))
    print("MAE =", np.round(np.mean(np.abs(residuals)), 3))

### full pipeline với MLPRegressor

In [None]:
# Multi-layer perceptron with four hidden layers (32-64-64-32), Adam solver.
mlpregressor = MLPRegressor(
    hidden_layer_sizes=(32, 64, 64, 32),
    solver='adam',
    learning_rate='adaptive',
    random_state=0,
    max_iter=500,
    verbose=1,
)

full_pipeline = make_pipeline(preprocessing, mlpregressor)

# The targets are multiplied by 1000 for this model only, so the printed
# MSE/MAE are in the scaled unit and not directly comparable with runs that
# pass the unscaled prices.
# NOTE(review): presumably this converts prices given in thousands -- confirm.
train_and_val(full_pipeline, train_X, train_y*1000, val_X, val_y*1000)

Iteration 1, loss = 611177225.45849490
Iteration 2, loss = 82953885.32473096
Iteration 3, loss = 65685546.51044609
Iteration 4, loss = 58141175.41203404
Iteration 5, loss = 52506070.76057746
Iteration 6, loss = 48026809.82781999
Iteration 7, loss = 45067448.80102903
Iteration 8, loss = 42459547.57111611
Iteration 9, loss = 41246627.89373754
Iteration 10, loss = 40141282.85729135
Iteration 11, loss = 39103918.31839111
Iteration 12, loss = 38545996.56468870
Iteration 13, loss = 38102645.35426016
Iteration 14, loss = 37507211.22685609
Iteration 15, loss = 36694001.50624350
Iteration 16, loss = 36547041.32437725
Iteration 17, loss = 35912484.35526033
Iteration 18, loss = 35419207.25827975
Iteration 19, loss = 35380361.08927415
Iteration 20, loss = 34797482.06670790
Iteration 21, loss = 34112206.61028389
Iteration 22, loss = 33879442.29378086
Iteration 23, loss = 33539395.03991899
Iteration 24, loss = 33177113.86587960
Iteration 25, loss = 32776253.08614397
Iteration 26, loss = 32415420.013

### full pipeline với RandomForestRegressor

In [81]:
# Random forest with 512 trees and a fixed seed; `full_pipeline` now refers to
# the random-forest pipeline.
rfregressor = RandomForestRegressor(
    n_estimators=512,
    random_state=0,
    verbose=1,
)
full_pipeline = make_pipeline(preprocessing, rfregressor)
train_and_val(full_pipeline, train_X, train_y, val_X, val_y)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 512 out of 512 | elapsed: 14.7min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 512 out of 512 | elapsed:   14.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 512 out of 512 | elapsed:   11.8s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 512 out of 512 | elapsed:    3.2s finished


0.992041403636497 0.9764326316382476
MSE = 21.352
MAE = 2.446


### Chọn hyper-parameter tùy vào regressor

In [89]:
# Search the MLP's L2 penalty `alpha` on the validation set.
# A dedicated pipeline is built here: by this point `full_pipeline` holds the
# random-forest model, which has no `mlpregressor__alpha` parameter, and
# refitting it inside the loop would also replace the model that the test
# section predicts with.
mlp_pipeline = make_pipeline(preprocessing, mlpregressor)

alphas = [0.1, 1, 10, 100, 1000]
train_scores = []
val_scores = []
best_val_score = -float('inf')
best_alpha = None
for alpha in alphas:
    mlp_pipeline.set_params(mlpregressor__alpha=alpha)
    mlp_pipeline.fit(train_X, train_y)
    train_score = mlp_pipeline.score(train_X, train_y)
    val_score = mlp_pipeline.score(val_X, val_y)
    train_scores.append(train_score)
    val_scores.append(val_score)
    # Strict comparison: on ties the earlier (smaller) alpha is kept.
    if val_score > best_val_score:
        best_val_score = val_score
        best_alpha = alpha
'Finish!'

'Finish!'

### Test

In [82]:
# Disabled: would apply the alpha chosen by the search to an MLP pipeline.
#full_pipeline.set_params(mlpregressor__alpha=best_alpha)
# NOTE(review): `test_X` is not defined in any cell visible in this notebook --
# presumably it came from a cell that is missing here; confirm before re-running.
pred_y = full_pipeline.predict(test_X)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 512 out of 512 | elapsed:    2.1s finished


In [83]:
# Predicted values for the test rows.
pred_y

array([84.87904136, 19.15797231, 21.40738565, ..., 21.98797525,
       24.22413128, 34.17408183])

In [84]:
# Test targets as a plain array, for a side-by-side look with `pred_y`.
# NOTE(review): `test_y` is not defined in any cell visible here -- confirm its origin.
np.array(test_y)

array([89.697, 17.93 , 23.25 , ..., 23.195, 22.89 , 38.89 ])

In [85]:
# Mean absolute error of the predictions on the test set.
np.mean(np.abs(pred_y - test_y))

2.48961187763616