In [18]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.layers import Dense, Dropout
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [19]:
# Load the cleaned dataset. A stray 'Unnamed: 0' column is discarded when it is
# present (presumably a saved index from an earlier to_csv -- confirm against
# the export step); errors='ignore' makes the drop a no-op when it is absent.
df = pd.read_csv('cleaned_data.csv').drop(columns='Unnamed: 0', errors='ignore')

In [20]:
# Sanity check: display the (rows, columns) of the loaded frame.
df.shape

(78410, 9)

In [21]:
# 'price' is the regression target; every other column is a model input.
y = df['price']
X = df.drop(columns=['price'])

In [22]:
# Hold out 30% of the rows for the final evaluation; the fixed random_state
# keeps the split identical from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=12,
)

In [23]:
# Column groups, split by how each is encoded for the network.
numeric_features = ['model_year', 'mileage']
categorical_features = [
    'registered_in',
    'color',
    'brand',
    'vehicle',
    'transmission',
    'engine_type',
]

# Numeric columns are standardised; categorical columns are one-hot encoded.
# handle_unknown='ignore' lets transform() accept categories that were not
# seen during fit instead of raising.
column_encoders = [
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_encoders)

# Single-step pipeline wrapping the preprocessor, so feature preparation for
# the TensorFlow model lives in one fit/transform object.
data_preparation_pipeline = Pipeline([('preprocessor', preprocessor)])

In [24]:
# Fit the preprocessing on the training split only, then apply the already
# fitted pipeline to the test split so no test-set statistics leak into it.
x_train_prepared = data_preparation_pipeline.fit_transform(x_train)
x_test_prepared = data_preparation_pipeline.transform(x_test)

In [25]:
def _to_dense(matrix):
    """Return ``matrix.toarray()`` when that method exists, else ``matrix`` unchanged."""
    return matrix.toarray() if hasattr(matrix, "toarray") else matrix


# The pipeline output may expose .toarray() (e.g. a sparse matrix); convert
# both splits to plain dense arrays before handing them to the network.
x_train_prepared = _to_dense(x_train_prepared)
x_test_prepared = _to_dense(x_test_prepared)

In [26]:
# Seed Python's `random`, NumPy and TensorFlow in one call so that weight
# initialisation, dropout masks and batch shuffling repeat across
# Restart & Run All. The value mirrors the random_state used for the
# train/test split.
# NOTE(review): bit-exact repeatability on GPU may additionally need
# tf.config.experimental.enable_op_determinism() -- confirm if required.
tf.keras.utils.set_random_seed(12)

# Fully connected regressor: four ReLU hidden layers (256 -> 128 -> 64 -> 32),
# each followed by dropout, ending in a single linear unit.
# Layers are taken from tf.keras rather than the standalone `keras` package so
# they always come from the same Keras implementation as tf.keras.Sequential,
# and the input width is declared with an explicit Input instead of the
# `input_shape=` argument on the first layer.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(x_train_prepared.shape[1],)),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(1),
])

In [27]:
# Train with Adam on mean squared error; mean absolute error is tracked as an
# additional metric alongside the loss.
model.compile(
    optimizer='adam',
    loss='mse',
    metrics=['mae'],
)

In [28]:
# Stop once val_loss has gone 10 epochs without improving, and roll the model
# back to the weights of the best epoch.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

In [29]:
# Fit for at most 100 epochs, holding out 20% of the training arrays for
# validation; early stopping may end the run sooner. verbose=2 prints one
# summary line per epoch.
history = model.fit(
    x_train_prepared,
    y_train,
    validation_split=0.2,
    epochs=100,
    callbacks=[early_stopping],
    verbose=2,
)

Epoch 1/100
1373/1373 - 14s - loss: 330.8023 - mae: 6.9259 - val_loss: 150.0746 - val_mae: 7.5196 - 14s/epoch - 11ms/step
Epoch 2/100
1373/1373 - 9s - loss: 145.8054 - mae: 5.1588 - val_loss: 118.6289 - val_mae: 7.1648 - 9s/epoch - 7ms/step
Epoch 3/100
1373/1373 - 9s - loss: 133.2996 - mae: 4.8738 - val_loss: 80.6097 - val_mae: 4.9909 - 9s/epoch - 7ms/step
Epoch 4/100
1373/1373 - 11s - loss: 121.9984 - mae: 4.6124 - val_loss: 37.3310 - val_mae: 2.9066 - 11s/epoch - 8ms/step
Epoch 5/100
1373/1373 - 11s - loss: 111.7661 - mae: 4.4554 - val_loss: 40.0865 - val_mae: 2.8795 - 11s/epoch - 8ms/step
Epoch 6/100
1373/1373 - 11s - loss: 130.6494 - mae: 4.4808 - val_loss: 44.5122 - val_mae: 3.1517 - 11s/epoch - 8ms/step
Epoch 7/100
1373/1373 - 11s - loss: 100.7461 - mae: 4.2569 - val_loss: 75.8629 - val_mae: 3.3037 - 11s/epoch - 8ms/step
Epoch 8/100
1373/1373 - 10s - loss: 101.2939 - mae: 4.1975 - val_loss: 39.4263 - val_mae: 2.9309 - 10s/epoch - 8ms/step
Epoch 9/100
1373/1373 - 11s - loss: 100.7

In [30]:
# Predict on the held-out test split and collapse the model output to 1-D so
# it lines up with y_test.
raw_predictions = model.predict(x_test_prepared)
y_pred = raw_predictions.flatten()

# Regression metrics on the test split; RMSE is derived from the MSE.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = mse ** 0.5

print("Multi Layer Perceptron Model Evaluation Metrics")
print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R^2: {r2}")

Multi Layer Perceptron Model Evaluation Metrics
MAE: 3.030130556093026
MSE: 50.755011282261634
RMSE: 7.124255138767956
R^2: 0.9526307040544846


In [31]:
# Persist the trained network to an HDF5 file in the working directory.
# NOTE(review): the recorded output shows a truncated warning raised from
# saving_api.save_model -- presumably the legacy-HDF5 format notice; confirm,
# and consider the native '.keras' format if nothing loads this file by name.
model.save('mlp_model.h5')

  saving_api.save_model(


In [32]:
# Persist the fitted preprocessing pipeline next to the model so the same
# scaling and one-hot encoding can be re-applied at inference time.
# joblib is imported in the notebook's top import cell.
# NOTE: the file is pickle-based; only load it back from a trusted location.
joblib.dump(data_preparation_pipeline, 'preprocessing_pipeline.pkl')

['preprocessing_pipeline.pkl']