In [5]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


## 1. Data Cleaning

In [24]:
# Load the raw vehicles data; the path is relative to the notebook's working directory.
df_vehicles = pd.read_csv('./data/vehicles.csv')


In [25]:
# Drop the index column that was saved into the CSV. `errors='ignore'` keeps
# this cell idempotent: re-running it after the column is already gone is a
# no-op instead of a KeyError.
df_vehicles = df_vehicles.drop(columns=['Unnamed: 0'], errors='ignore')


In [26]:
df_vehicles.shape


(458213, 25)

In [27]:
def cutIQR(df, col, q1=0.25, q2=0.75, whisker=1.5):
    """
    Keep only the rows whose `col` value lies within the IQR fences.

    The fences are ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR`` where
    ``Q1``/``Q3`` are the `q1`/`q2` quantiles of `col` and ``IQR = Q3 - Q1``.
    Both fences are inclusive.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    col : str
        Name of the numeric column used to detect outliers.
    q1 : float, default 0.25
        Lower quantile.
    q2 : float, default 0.75
        Upper quantile (the name is kept for backward compatibility).
    whisker : float, default 1.5
        Multiplier applied to the IQR to position the fences.

    Returns
    -------
    pandas.DataFrame
        The rows of `df` inside the fences. Rows where `col` is NaN are
        dropped too, because NaN fails both comparisons.
    """
    lower_quantile = df[col].quantile(q1)
    upper_quantile = df[col].quantile(q2)
    iqr = upper_quantile - lower_quantile

    lower_fence = lower_quantile - whisker * iqr
    upper_fence = upper_quantile + whisker * iqr

    inside_fences = (df[col] >= lower_fence) & (df[col] <= upper_fence)
    return df[inside_fences]

def clean_data(df_vehicles):
    """
    Return a cleaned copy of the raw vehicles frame.

    Steps, in order:
      1. drop rows with no 'year';
      2. trim 'price' outliers with `cutIQR` (upper quantile 0.85);
      3. drop rows whose 'odometer' exceeds 10,000,000, then trim 'odometer'
         outliers with `cutIQR` (upper quantile 0.85).

    Rows with a missing 'price' or 'odometer' do not pass the comparisons in
    steps 2-3 and are therefore removed as well. Missing values in the other
    columns are left in place for later imputation.
    """
    # Step 1: missing values -- only 'year' is required at this stage.
    with_year = df_vehicles.dropna(subset=['year'])

    # Step 2: remove extreme outliers, price first.
    price_trimmed = cutIQR(with_year, 'price', q2=0.85)

    # Discard implausibly large readings before computing the odometer
    # quantiles, so they do not distort the fences.
    plausible_odometer = price_trimmed['odometer'] <= 10000000
    return cutIQR(price_trimmed[plausible_odometer], 'odometer', q2=0.85)


In [28]:
# Build the cleaned frame under a new name; `df_vehicles` is not reassigned here.
df = clean_data(df_vehicles)


## 2. Create a Test Set


In [29]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the cleaned rows; the fixed seed keeps the split reproducible.
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)


In [30]:
train_set.shape, test_set.shape


((318009, 25), (79503, 25))

### Split X and y


In [58]:
# The target is the 'price' column; every other column stays as model input.
y_train = train_set['price']
vehicles_train = train_set.drop(columns='price')

y_test = test_set['price']
vehicles_test = test_set.drop(columns='price')


In [59]:
vehicles_train.shape, y_train.shape, vehicles_test.shape, y_test.shape


((318009, 24), (318009,), (79503, 24), (79503,))

## 3. Pipeline

Useless columns
- 'id', 'url', 'region_url', 'image_url', 'posting_date' are entirely irrelevant to the cars' price


Missing Values
- 'odometer', 'lat', 'long': imputed with median values


Categorical Variables
- Onehot


Feature Engineering



In [66]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer

class selectColumnTransformer(BaseEstimator, TransformerMixin):
    """
    Transformer that keeps only a chosen subset of columns.

    Parameters
    ----------
    columns : label or list of labels
        Passed to ``X[...]`` in `transform` to select the columns to keep.
    """
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` restricted to `self.columns`."""
        selected = X[self.columns]
        return selected


class dropAllZeroColumnTransformer(BaseEstimator, TransformerMixin):
    """
    A transformer that drops columns containing only missing values.

    Despite the class name, columns are selected with ``isnull()``: a column
    is dropped when every entry is missing, not when every entry is zero.

    The columns to drop are learned in `fit` and stored in
    `empty_columns_`, so every frame later passed to `transform` loses the
    same columns and the train and test feature sets stay aligned.
    """
    def fit(self, X, y=None):
        """Record the columns of `X` that are entirely missing."""
        self.empty_columns_ = X.columns[X.isnull().all(axis=0)]
        return self

    def transform(self, X):
        """Return `X` without the columns recorded by `fit`."""
        empty_columns = getattr(self, 'empty_columns_', None)
        if empty_columns is None:
            # `fit` was never called: fall back to the previous behaviour and
            # detect the all-missing columns on the frame being transformed.
            empty_columns = X.columns[X.isnull().all(axis=0)]
        return X.drop(empty_columns, axis=1)

    
class dropUselessColumnsTransformer(BaseEstimator, TransformerMixin):
    """
    Transformer that removes a fixed list of columns.

    Parameters
    ----------
    columns : label or list of labels
        Columns removed from the frame in `transform`.
    """
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X):
        """Return `X` without the columns listed in `self.columns`."""
        return X.drop(columns=self.columns)


class categoryAddFeatureTransformer(BaseEstimator, TransformerMixin):
    """
    Replace missing values with the placeholder string 'unknown'.

    No additional features are generated at present; the transformer only
    fills the gaps so that missing entries become their own category.
    """
    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X):
        """Return `X` with every missing value replaced by 'unknown'."""
        return X.fillna(value='unknown')
    

In [67]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, RobustScaler, StandardScaler

# Column groups, one per preprocessing branch of the ColumnTransformer below.
outlier_attribs = ['odometer']
num_attribs = ['lat', 'long', 'year']
cat_attribs = [
    'manufacturer', 'condition', 'cylinders', 
    'fuel', 'title_status', 'transmission', 
    'drive', 'type', 'paint_color'  
]
useless_columns = [
    # irrelevant
    'id', 'url', 'region_url', 'image_url', 'posting_date', 'description',
    # too many missing values
    'VIN', 'size',
    # repetitive or not very useful
    'region', 'state', 'model'
] 


# 'odometer' is scaled with RobustScaler (median / IQR based), which is less
# sensitive to the remaining extreme values than StandardScaler.
outlier_pipeline = Pipeline([
    ('imputed', SimpleImputer(strategy='median')),
    ('outlier', RobustScaler())
])
num_pipeline = Pipeline([
    ('imputed', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler())
])
cat_pipeline = Pipeline([  
    ('add_feature', categoryAddFeatureTransformer()),
    # Categories that were not present when the encoder was fitted are encoded
    # as all zeros instead of raising an error at transform time.
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])
# full pipeline
full_pipeline = make_pipeline(
    dropUselessColumnsTransformer(useless_columns),
    ColumnTransformer([
        ('outlier', outlier_pipeline, outlier_attribs),
        ('num', num_pipeline, num_attribs),
        ('cat', cat_pipeline, cat_attribs)
    ])
)


In [68]:
# Fit the preprocessing on the training features only, then reuse the fitted
# pipeline on the test features so no test-set statistics leak into it.
X_train = full_pipeline.fit_transform(vehicles_train)

X_test = full_pipeline.transform(vehicles_test)

X_train.shape, X_test.shape


((318009, 111), (79503, 111))

In [69]:
from sklearn.linear_model import LinearRegression

# Baseline model: ordinary least squares on the preprocessed feature matrix.
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)


LinearRegression()

In [75]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

y_train_pred = lin_reg.predict(X_train)

# RMSE is derived from the MSE with np.sqrt: the `squared=False` argument of
# mean_squared_error is deprecated in recent scikit-learn releases and later
# removed, so relying on it breaks a fresh run on a newer environment.
train_mse = mean_squared_error(y_train, y_train_pred)

print('Training MSE:', train_mse)
print('Training RMSE:', np.sqrt(train_mse))
print('Training MAE:', mean_absolute_error(y_train, y_train_pred))
print('Training R2 score:', lin_reg.score(X_train, y_train))


Training MSE: 85336433.89000027
Training RMSE: 9237.77212806206
Training MAE: 6355.654058421237
Training R2 score: 0.45603847702085976


In [77]:
y_test_pred = lin_reg.predict(X_test)

# RMSE is derived from the MSE with np.sqrt: the `squared=False` argument of
# mean_squared_error is deprecated in recent scikit-learn releases and later
# removed, so relying on it breaks a fresh run on a newer environment.
test_mse = mean_squared_error(y_test, y_test_pred)

print('Test MSE:', test_mse)
print('Test RMSE:', np.sqrt(test_mse))
print('Test MAE:', mean_absolute_error(y_test, y_test_pred))
print('Test R2 score:', lin_reg.score(X_test, y_test))


Test MSE: 86106794.84656583
Test RMSE: 9279.374701269791
Test MAE: 6371.54748712891
Test R2 score: 0.45157500552120156
