In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from category_encoders.binary import BinaryEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.utils import resample

# Load dataset
df = pd.read_csv("car_dataset.csv")

# Use a sample of the data for faster processing
#df = df[:1000]

# Define features and target
X = df.drop(['selling_price'], axis=1)
y = df['selling_price']

# Define categorical features
cat_features = [col for col in X.columns if X[col].dtype == 'object']

# Define preprocessing steps
num_features = X.select_dtypes(exclude="object").columns
onehot_columns = ['seller_type', 'fuel_type', 'transmission_type']
binary_columns = ['car_name']

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()
binary_transformer = BinaryEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, onehot_columns),
        ("StandardScaler", numeric_transformer, num_features),
        ("BinaryEncoder", binary_transformer, binary_columns)
    ]
)

# Separate dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape

# Transform the data
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Define evaluation function
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

# Custom Decision Tree Implementation
class DecisionTree:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth

    def fit(self, X, y):
        # Convert DataFrame to NumPy array if necessary
        if isinstance(X, pd.DataFrame):
            X = X.values
        if isinstance(y, pd.Series):
            y = y.values
        self.tree_ = self._build_tree(X, y, depth=0)
    
    def _build_tree(self, X, y, depth):
        num_samples, num_features = X.shape
        if num_samples <= 1 or (self.max_depth is not None and depth >= self.max_depth):
            return np.mean(y)

        best_split = self._find_best_split(X, y)
        if best_split is None:
            return np.mean(y)
        
        left_indices = X[:, best_split['feature']] <= best_split['value']
        right_indices = X[:, best_split['feature']] > best_split['value']
        
        left_tree = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right_tree = self._build_tree(X[right_indices], y[right_indices], depth + 1)
        
        return {'feature': best_split['feature'], 'value': best_split['value'], 'left': left_tree, 'right': right_tree}
    
    def _find_best_split(self, X, y):
        best_split = None
        best_mse = float('inf')
        num_features = X.shape[1]

        for feature in range(num_features):
            values = np.unique(X[:, feature])
            for value in values:
                left_indices = X[:, feature] <= value
                right_indices = X[:, feature] > value
                
                if len(y[left_indices]) == 0 or len(y[right_indices]) == 0:
                    continue
                
                left_y = y[left_indices]
                right_y = y[right_indices]
                
                mse = (np.var(left_y) * len(left_y) + np.var(right_y) * len(right_y)) / len(y)
                
                if mse < best_mse:
                    best_split = {'feature': feature, 'value': value}
                    best_mse = mse
        
        return best_split

    def predict(self, X):
        # Convert DataFrame to NumPy array if necessary
        if isinstance(X, pd.DataFrame):
            X = X.values
        return np.array([self._predict(sample, self.tree_) for sample in X])
    
    def _predict(self, sample, tree):
        if not isinstance(tree, dict):
            return tree
        
        if sample[tree['feature']] <= tree['value']:
            return self._predict(sample, tree['left'])
        else:
            return self._predict(sample, tree['right'])

# Custom Random Forest Implementation
class RandomForest:
    def __init__(self, n_estimators=100, max_depth=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.trees = []

    def fit(self, X, y):
        # Convert DataFrame to NumPy array if necessary
        if isinstance(X, pd.DataFrame):
            X = X.values
        if isinstance(y, pd.Series):
            y = y.values
        for _ in range(self.n_estimators):
            X_resampled, y_resampled = resample(X, y)
            tree = DecisionTree(max_depth=self.max_depth)
            tree.fit(X_resampled, y_resampled)
            self.trees.append(tree)
    
    def predict(self, X):
        # Convert DataFrame to NumPy array if necessary
        if isinstance(X, pd.DataFrame):
            X = X.values
        tree_predictions = np.array([tree.predict(X) for tree in self.trees])
        return np.mean(tree_predictions, axis=0)

# Define models
models = {
    "Decision Tree": DecisionTree(max_depth=5),
    "Random Forest Regressor": RandomForest(n_estimators=100, max_depth=5),
}

# Train and evaluate models
results = []

for model_name, model in models.items():
    # Train model
    model.fit(X_train_transformed, y_train)
    
    # Make predictions
    y_train_pred = model.predict(X_train_transformed)
    y_test_pred = model.predict(X_test_transformed)
    
    # Evaluate Train and Test dataset
    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)
    
    # Print results
    print(f"{model_name}:")
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))
    print('----------------------------------')
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    print('='*35)
    print('\n')

    # Store results
    results.append({
        'model': model_name,
        'train_mae': model_train_mae,
        'train_rmse': model_train_rmse,
        'train_r2': model_train_r2,
        'test_mae': model_test_mae,
        'test_rmse': model_test_rmse,
        'test_r2': model_test_r2
    })

# Optional: Convert results to a DataFrame for easier analysis
results_df = pd.DataFrame(results)
print(results_df)



Decision Tree:
Model performance for Training set
- Root Mean Squared Error: 330620.0281
- Mean Absolute Error: 164560.3228
- R2 Score: 0.8652
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 386653.5437
- Mean Absolute Error: 180812.7653
- R2 Score: 0.8014


Random Forest Regressor:
Model performance for Training set
- Root Mean Squared Error: 294608.2950
- Mean Absolute Error: 145014.1077
- R2 Score: 0.8930
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 311285.1492
- Mean Absolute Error: 159035.1833
- R2 Score: 0.8713


                     model      train_mae     train_rmse  train_r2  \
0            Decision Tree  164560.322754  330620.028078  0.865222   
1  Random Forest Regressor  145014.107702  294608.295049  0.892984   

        test_mae      test_rmse   test_r2  
0  180812.765273  386653.543670  0.801402  
1  159035.183296  311285.149199  0.871280  


In [2]:
# Save the preprocessor
joblib.dump(preprocessor, 'preprocessor.pkl')

['preprocessor.pkl']

In [3]:
# Save the model
joblib.dump(models['Random Forest Regressor'], 'car_price_model.pkl')

['car_price_model.pkl']

In [1]:
import pickle
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Load the preprocessor
with open('preprocessor.pkl', 'rb') as file:
    preprocessor = pickle.load(file)

# Check the type of the preprocessor
print(type(preprocessor))

# Assuming it should be a StandardScaler or similar object
if isinstance(preprocessor, StandardScaler):
    # Load the model
    with open('car_price_model.pkl', 'rb') as file:
        model = pickle.load(file)

    # Input data without the 'selling_price' column
    input_data = {
        'brand': ['Maruti', 'Hyundai'],
        'model': ['Alto', 'Grand'],
        'vehicle_age': [9, 5],
        'km_driven': [120000, 20000],
        'seller_type': ['Individual', 'Individual'],
        'fuel_type': ['Petrol', 'Petrol'],
        'transmission_type': ['Manual', 'Manual'],
        'mileage': [19.7, 18.9],
        'engine': [796, 1197],
        'max_power': [46.3, 82],
        'seats': [5, 5]
    }

    # Convert the input data to a DataFrame
    input_df = pd.DataFrame(input_data)

    # Preprocess the input data
    input_transformed = preprocessor.transform(input_df)

    # Make predictions
    predicted_prices = model.predict(input_transformed)

    # Print the predicted prices
    for i, price in enumerate(predicted_prices):
        print(f'The predicted selling price for car {i+1} is: {price}')


<class 'numpy.ndarray'>
