In [48]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.externals import joblib

In [49]:
# Load the train and test splits; both are tab-separated, UTF-8 encoded CSV files.
train, test = (
    pd.read_csv(csv_path, sep = "\t", encoding='utf-8')
    for csv_path in ("../data/cars/train_data.csv", "../data/cars/test_data.csv")
)

In [50]:
# Separate the `price` target from the remaining columns, for each split.
X, y = train.drop(columns=['price']), train['price']
test_X, test_y = test.drop(columns=['price']), test['price']

#### Multilabel Encoding

In [51]:
def f(x, l):
    """Add the items of `x` to the collection `l`, but only when `x` is exactly a list.

    Any other value is ignored. MultilabelEncoding.fit applies this to every split
    cell so that cells which did not turn into a list of tokens are skipped.
    """
    if type(x) is not list:
        return
    l.update(x)
class MultilabelEncoding(BaseEstimator, TransformerMixin):
    """Expand one multi-label text column into 0/1 indicator columns.

    `fit` splits every cell of `col_name` on '/', '+' and whitespace and stores the
    distinct tokens in `type_list`. `transform` adds one column named
    ``<col_name>_<token>`` per stored token and drops the original column.

    Parameters
    ----------
    col_name : str
        Name of the DataFrame column to encode.
    """

    def __init__(self, col_name):
        self.col_name = col_name
        # Filled by fit(); while empty, transform() only drops `col_name`.
        self.type_list = []

    def fit(self, X_df, y=None):
        """Collect the distinct tokens of `X_df[col_name]` into `type_list`; returns self."""
        # regex=True is explicit because '[/+]' is a character class; newer pandas
        # releases would otherwise treat the pattern as a literal string.
        tokens_sr = X_df[self.col_name].str.replace('[/+]', ' ', regex=True).str.split()
        # Start from a fresh set on every call so the same instance can be fitted
        # again (previously the list left by the first fit was reused and
        # `list.update` raised AttributeError).
        found = set()
        for tokens in tokens_sr:
            # Cells that did not split into a list are skipped.
            if type(tokens) is list:
                found.update(tokens)
        # Sorted so the indicator columns come out in the same order on every run.
        self.type_list = sorted(found)
        return self

    def transform(self, X_df, y=None):
        """Return a copy of `X_df` with `col_name` replaced by one 0/1 column per token."""
        transformed_df = X_df.copy()
        source = transformed_df[self.col_name]
        for label in self.type_list:
            # NOTE(review): this is a substring test on the raw cell, not a test on
            # the split tokens, so a label that is contained in a longer word also
            # counts as present. Left unchanged so already fitted/saved pipelines
            # keep producing the same features.
            transformed_df[self.col_name + '_' + label] = source.apply(
                lambda x, label=label: 1 if ((type(x) is str) and (label in x)) else 0
            )
        return transformed_df.drop(self.col_name, axis=1)

#### Column adder and dropper

In [52]:
class ColAdderDropper(BaseEstimator, TransformerMixin):
    """Normalise the `brand` column and drop the columns listed in `dropped_cols`.

    `fit` counts the brand names and remembers the `num_top_brands` most frequent
    ones in `top_brands_` (at least one, at most all of them). `transform` replaces
    every other brand, including missing ones, with 'Others' and removes
    `dropped_cols`.

    Parameters
    ----------
    num_top_brands : int, default 10
        How many of the most frequent brands keep their own name.
    """

    # First run of ASCII letters in the raw brand string. The previous pattern
    # '[a-zA-z]' was a typo: the range A-z also matches '[', '\\', ']', '^', '_'
    # and '`'.
    _BRAND_PATTERN = r'([a-zA-Z]+)'

    def __init__(self, num_top_brands = 10):
        self.num_top_brands = num_top_brands
        # Columns removed by transform(); read from outside the class when the
        # numeric column list is built.
        self.dropped_cols = ['url', 'name', 'model', 'weightTotal', 'fuelType', 'vehicleTransmission', 'modelDate']

    def _extract_brand(self, X_df):
        """Return the brand name extracted from `X_df.brand` (NaN where nothing matches)."""
        return X_df.brand.str.extract(self._BRAND_PATTERN, expand=False)

    def fit(self, X_df, y=None):
        """Learn `brand_counts_` and `top_brands_` from `X_df.brand`; returns self."""
        self.brand_counts_ = self._extract_brand(X_df).value_counts()
        brands = list(self.brand_counts_.index)
        # value_counts() is ordered by frequency, so a prefix is the top-N list.
        self.top_brands_ = brands[:max(1, min(self.num_top_brands, len(brands)))]
        return self

    def transform(self, X_df, y=None):
        """Return a copy of `X_df` with `brand` normalised and `dropped_cols` removed."""
        df = X_df.copy()
        brand_col = self._extract_brand(df)
        # Keep the brands learned in fit(); everything else becomes 'Others'.
        df["brand"] = brand_col.where(brand_col.isin(self.top_brands_), 'Others')
        return df.drop(columns=self.dropped_cols)

In [53]:
# Categorical columns encoded with MultilabelEncoding (one indicator column per token).
cat_multi = [
    'vEfuelType',
    'driveWheelConfiguration',
]
# Categorical columns that are imputed and then one-hot encoded.
cat_single = [
    'brand',
    'eLabel',
    'bodyType',
    'vEengineType',
]

In [54]:
# One shared ColAdderDropper instance: it is the first step of `preprocessing` and of
# every `full_pipeline` built further down, so a `set_params(coladderdropper__...)`
# call on any of those pipelines changes this same object.
col_adderdropper = ColAdderDropper()

In [55]:
# Columns that are handled elsewhere: the categorical ones and those that
# ColAdderDropper removes. Everything else in X starts out as a numeric column.
handled_cols = set(cat_single) | set(cat_multi) | set(col_adderdropper.dropped_cols)
num_cols = [col for col in X.columns if col not in handled_cols]

# Further columns that are taken out of `num_cols`.
num_col_to_remove = ['height', 'numberOfAxles', 'numberOfDoors', 'seatingCapacity', 'cargoVolume',
                    'accelerationTime']
for col in num_col_to_remove:
    # list.remove raises ValueError when the name is absent, which surfaces typos.
    num_cols.remove(col)
num_cols, len(num_cols)

(['length',
  'width',
  'weight',
  'emissionsCO2',
  'numberOfForwardGears',
  'roofLoad',
  'fuelCapacity',
  'fuelConsumption',
  'speed',
  'payload',
  'trailerWeight',
  'vEengineDisplacement',
  'vEenginePower',
  'torque'],
 14)

#### Hàm train và validate model

In [56]:
def train_and_val(full_pipeline, train_X, train_y, val_X, val_y):
    """Fit `full_pipeline` on the training split and report its quality on both splits.

    Prints the two `score` values, then the mean squared error and the mean
    absolute error of the validation predictions (each rounded to 3 decimals).

    Returns:
        The tuple (train_score, val_score) as returned by `full_pipeline.score`.
    """
    full_pipeline.fit(train_X, train_y)
    val_pred = full_pipeline.predict(val_X)
    scores = (full_pipeline.score(train_X, train_y), full_pipeline.score(val_X, val_y))
    print(*scores)
    residuals = val_pred - val_y
    print("MSE =", np.round(np.mean(residuals ** 2), 3))
    print("MAE =", np.round(np.mean(np.abs(residuals)), 3))
    return scores

### Các pipeline thành phần cần thiết cho bước preprocessing

In [57]:
# Imputers: column mean for numeric columns, most frequent value for categorical ones.
imp_mean = SimpleImputer(strategy='mean')
imp_mode = SimpleImputer(strategy='most_frequent')

# Single-label categorical columns: impute, then one-hot encode into a dense array.
encoding = OneHotEncoder(handle_unknown='ignore', sparse=False)
categorical_transformer = make_pipeline(imp_mode, encoding)

# Multi-label categorical columns get one MultilabelEncoding each.
vEfuelType_encoder = MultilabelEncoding('vEfuelType')
driveWheelConfig_encoder = MultilabelEncoding('driveWheelConfiguration')

# Route each group of columns to its transformer.
colTransform = ColumnTransformer(
    transformers=[
        ('numerical', imp_mean, num_cols),
        ('categorical', categorical_transformer, cat_single),
        ('vEfuelType', vEfuelType_encoder, ['vEfuelType']),
        ('driveWheelConfig', driveWheelConfig_encoder, ['driveWheelConfiguration']),
    ]
)

colNormalize = StandardScaler()
# Defined here but not part of `preprocessing` or of the pipelines built in this notebook view.
pca = PCA(30)

# Preprocessing only (no estimator): brand clean-up / column drop -> per-column transforms -> scaling.
preprocessing = make_pipeline(col_adderdropper, colTransform, colNormalize)

### Full pipeline với MLPRegressor

In [58]:
# Number of most frequent brands that keep their own category in ColAdderDropper.
best_n_brands = 80
mlpregressor = MLPRegressor(
    hidden_layer_sizes=(256, 512, 512, 256),
    solver='adam',
    # Per the scikit-learn docs `learning_rate` is only used with solver='sgd';
    # it is kept so the estimator's parameters stay exactly as before.
    learning_rate='adaptive',
    random_state=0,
    max_iter=500,
    early_stopping=True,
    verbose=1,
)
full_pipeline = make_pipeline(col_adderdropper, colTransform, colNormalize, mlpregressor)
full_pipeline.set_params(coladderdropper__num_top_brands=best_n_brands)
# Fit on X (the features without `price`), like the other model cells do. The
# previous call passed `train`, which still contains the `price` target column.
train_and_val(full_pipeline, X, y, test_X, test_y)

Iteration 1, loss = 110.28410122
Validation score: 0.922276
Iteration 2, loss = 44.44261977
Validation score: 0.927356
Iteration 3, loss = 37.54884753
Validation score: 0.941048
Iteration 4, loss = 36.46500166
Validation score: 0.927193
Iteration 5, loss = 34.99357146
Validation score: 0.937087
Iteration 6, loss = 30.39187163
Validation score: 0.947375
Iteration 7, loss = 27.50362920
Validation score: 0.949383
Iteration 8, loss = 28.01421253
Validation score: 0.957199
Iteration 9, loss = 25.04107164
Validation score: 0.956414
Iteration 10, loss = 24.22103417
Validation score: 0.955130
Iteration 11, loss = 20.81094842
Validation score: 0.957062
Iteration 12, loss = 20.37327122
Validation score: 0.956998
Iteration 13, loss = 20.35389482
Validation score: 0.960079
Iteration 14, loss = 20.46053649
Validation score: 0.946680
Iteration 15, loss = 17.70876329
Validation score: 0.934189
Iteration 16, loss = 16.85773045
Validation score: 0.945306
Iteration 17, loss = 15.46220332
Validation scor

(0.9846814518215354, 0.9758296985939665)

#### save model

In [60]:
# Persist the fitted MLP pipeline (preprocessing steps + regressor) to disk.
joblib.dump(value=full_pipeline, filename='../resource/nn_80_final.pkl')

['../resource/nn_80_final.pkl']

### Full pipeline với RandomForestRegressor

In [13]:
rfregressor = RandomForestRegressor(
    n_estimators=256,
    random_state=0,
    verbose=1,
)
# Same preprocessing steps as the MLP pipeline, with the random forest as estimator.
# NOTE(review): 90 top brands here, while the MLP cell uses best_n_brands = 80 - confirm this is intended.
full_pipeline = make_pipeline(
    col_adderdropper, colTransform, colNormalize, rfregressor
).set_params(coladderdropper__num_top_brands = 90)
train_and_val(full_pipeline, X, y, test_X, test_y)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 256 out of 256 | elapsed:  3.3min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 256 out of 256 | elapsed:    0.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 256 out of 256 | elapsed:    4.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.9922015113377136 0.9706421257451378
MSE = 27.172
MAE = 2.429


[Parallel(n_jobs=1)]: Done 256 out of 256 | elapsed:    0.6s finished


(0.9922015113377136, 0.9706421257451378)

In [19]:
# save model
# joblib.dump(full_pipeline, 'rf_1024.pkl')

### Gradient Boosting Regressor

In [47]:
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

In [44]:
# Fit the preprocessing-only pipeline on all of X and keep the transformed matrix
# as input for the grid search.
# NOTE(review): the imputers and the scaler are therefore fitted on the full training
# set before cross-validation splits it, so the validation folds have already
# influenced the preprocessing statistics.
X_preprocessed = preprocessing.set_params(
    coladderdropper__num_top_brands=best_n_brands
).fit_transform(X)

In [48]:
# Grid over ensemble size and tree depth (5 x 9 = 45 combinations); every other
# GradientBoostingRegressor setting stays fixed.
param_test1 = {
    'n_estimators': [100, 200, 400, 600, 800],
    'max_depth': list(range(1, 10)),
}
gsearch1 = GridSearchCV(
    estimator=GradientBoostingRegressor(
        learning_rate=0.1,
        min_samples_split=500,
        min_samples_leaf=50,
        max_features='sqrt',
        subsample=0.8,
        # subsample < 1 and max_features='sqrt' make fitting random; a fixed seed
        # keeps the search reproducible. Same seed as the final `gbm` model.
        random_state=10,
        verbose=0,
    ),
    param_grid=param_test1,
    scoring='r2',
    n_jobs=4,
    iid=False,
    cv=5,
)

In [49]:
# Run the 5-fold grid search over `param_test1` on the already preprocessed features.
gsearch1.fit(X_preprocessed, y)



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=GradientBoostingRegressor(alpha=0.9,
                                                 criterion='friedman_mse',
                                                 init=None, learning_rate=0.1,
                                                 loss='ls', max_depth=3,
                                                 max_features='sqrt',
                                                 max_leaf_nodes=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=50,
                                                 min_samples_split=500,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=100,
                                                 n_iter_no_change=None,
           

In [50]:
# Parameter combination with the best mean cross-validated r2 score.
# NOTE(review): the values recorded below (max_depth=9, n_estimators=800) are both
# the largest ones in the grid, so the optimum may lie outside the searched range.
gsearch1.best_params_

{'max_depth': 9, 'n_estimators': 800}

In [51]:
# Final gradient boosting model: the fixed settings of the grid search plus the
# max_depth / n_estimators reported by `gsearch1.best_params_`.
gbm = GradientBoostingRegressor(
    learning_rate=0.1,
    n_estimators=800,
    max_depth=9,
    min_samples_split=500,
    min_samples_leaf=50,
    max_features='sqrt',
    subsample=0.8,
    random_state=10,
)

In [52]:
# Same preprocessing steps as before, with the tuned gradient boosting model as estimator.
# NOTE(review): the grid search ran on features built with best_n_brands top brands,
# while this pipeline uses 90 - confirm the mismatch is intended.
full_pipeline = make_pipeline(
    col_adderdropper, colTransform, colNormalize, gbm
).set_params(coladderdropper__num_top_brands = 90)
train_and_val(full_pipeline, X, y, test_X, test_y)

0.9759913902927685 0.9726224630872703
MSE = 25.339
MAE = 2.771


(0.9759913902927685, 0.9726224630872703)