# **100,000 UK Used Car Dataset**


In [1]:
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

In [2]:
data = pd.read_csv('toyota.csv')

In [3]:
data.head()


Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,GT86,2016,16000,Manual,24089,Petrol,265,36.2,2.0
1,GT86,2017,15995,Manual,18615,Petrol,145,36.2,2.0
2,GT86,2015,13998,Manual,27469,Petrol,265,36.2,2.0
3,GT86,2017,18998,Manual,14736,Petrol,150,36.2,2.0
4,GT86,2017,17498,Manual,36284,Petrol,145,36.2,2.0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6738 entries, 0 to 6737
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         6738 non-null   object 
 1   year          6738 non-null   int64  
 2   price         6738 non-null   int64  
 3   transmission  6738 non-null   object 
 4   mileage       6738 non-null   int64  
 5   fuelType      6738 non-null   object 
 6   tax           6738 non-null   int64  
 7   mpg           6738 non-null   float64
 8   engineSize    6738 non-null   float64
dtypes: float64(2), int64(4), object(3)
memory usage: 473.9+ KB


# Data preparation


In [5]:
# Remove outlier
data = data[data.year < 2060]

In [6]:
# Select features
num_features = ["year", "engineSize", "mileage"]
cat_features = ["fuelType"]

# Baseline model

In [7]:
X = data[num_features + cat_features].copy()
y = data.price

In [8]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8,
                                                      random_state=42)


In [9]:
cat_encoder = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_encoder, cat_features)
    ]
)

rf = RandomForestRegressor()

model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('randomforest', rf)
    ]
)

model.fit(X_train, y_train)

y_predict = model.predict(X_valid)

In [10]:
print("RMSE: ", mean_squared_error(y_valid, y_predict, squared=False))


RMSE:  5499.134529246673


# Save baseline model

In [11]:
joblib.dump(model, "model_rf.pkl")

['model_rf.pkl']