In [2]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
import pickle


Load the Ford and Volkswagen datasets

In [3]:
# Read each brand's listings from a CSV file in the current working directory.
df_ford, df_volkswagen = (
    pd.read_csv(csv_path) for csv_path in ('ford.csv', 'vw.csv')
)

Combining Ford and Volkswagen datasets

In [4]:
# pd.concat uses an outer join on columns by default: a column present in only
# one of the frames is kept and silently padded with NaN for the other frame's
# rows. Fail fast instead, so a schema mismatch is caught here rather than
# surfacing later as missing values in the model's features.
mismatched_columns = set(df_ford.columns).symmetric_difference(df_volkswagen.columns)
if mismatched_columns:
    raise ValueError(
        "Ford and Volkswagen datasets have different columns: "
        + ", ".join(sorted(map(str, mismatched_columns)))
    )

# Stack the rows of both frames; ignore_index=True discards the original row
# labels and renumbers the combined frame from 0.
df_merged = pd.concat([df_ford, df_volkswagen], ignore_index=True)

Separate the features from the target (`price`)

In [5]:
# Target: the column the model learns to predict.
Y = df_merged['price']

# Features: every remaining column of the merged frame.
X = df_merged.drop(columns='price')

Split the data into training and testing sets

In [6]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, repeatable across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=30,
)

Preprocessing pipeline

In [7]:
# Feature columns grouped by the kind of preprocessing they receive.
categorical_columns = ['model', 'transmission', 'fuelType']
numeric_columns = ['year', 'mileage', 'tax', 'mpg', 'engineSize']

# Numeric columns are standardized and categorical columns are one-hot encoded.
# handle_unknown='ignore' means a category not seen during fit is encoded as
# all zeros instead of raising at prediction time.
preprocessor = ColumnTransformer(
    [
        ('numeric', StandardScaler(), numeric_columns),
        ('categorical', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
    ]
)

Create a pipeline with a random forest regressor

In [8]:
# Chain preprocessing and the regressor so both are fitted, applied and saved
# as one object. The fixed random_state makes the forest reproducible.
model = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(random_state=30)),
    ]
)


Train the model

In [9]:
# Fit the preprocessing steps and the regressor on the training split only.
# Left as the cell's final expression so the fitted pipeline is displayed.
model.fit(X=X_train, y=Y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numeric', StandardScaler(),
                                                  ['year', 'mileage', 'tax',
                                                   'mpg', 'engineSize']),
                                                 ('categorical',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['model', 'transmission',
                                                   'fuelType'])])),
                ('regressor', RandomForestRegressor(random_state=30))])

Evaluating the model using MAE and R2

In [10]:
# Predict prices for the held-out rows.
Y_pred = model.predict(X_test)

# Mean absolute error between the true and predicted prices.
mae = mean_absolute_error(y_true=Y_test, y_pred=Y_pred)
print("Mean Absolute Error: ", mae, sep="")

# Coefficient of determination (R^2) on the same predictions.
r2 = r2_score(y_true=Y_test, y_pred=Y_pred)
print("R-squared (R^2) Score: ", r2, sep="")

Mean Absolute Error: 971.3420803588134


NameError: name 'r2_score' is not defined

Save the trained model

In [49]:
# Serialize the whole pipeline (preprocessor + regressor) to one pickle file
# in the current working directory.
# NOTE: only unpickle files from a trusted source; pickle.load can execute
# arbitrary code.
with open('ford_and_Volkswagen_price_predictor.pkl', mode='wb') as model_file:
    pickle.dump(model, model_file)