In [61]:
# pip install pandas numpy matplotlib scikit-learn --quiet

#dataset source: https://www.kaggle.com/datasets/nehalbirla/vehicle-dataset-from-cardekho/data

!pip install flask joblib



In [1]:
import datetime

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV

In [36]:
# Print NumPy floats in fixed-point notation instead of scientific notation
np.set_printoptions(suppress=True)

In [2]:
# Load the car listings from a CSV file (path is relative to the working directory)
# and preview the first five rows
df = pd.read_csv("car_details.csv")
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0


In [3]:
# (number of rows, number of columns) of the loaded frame
df.shape

(8128, 13)

### Data Cleaning & Feature Engineering

In [4]:
# Number of missing values in each column
df.isnull().sum()

name               0
year               0
selling_price      0
km_driven          0
fuel               0
seller_type        0
transmission       0
owner              0
mileage          221
engine           221
max_power        215
torque           222
seats            221
dtype: int64

In [5]:
# Discard every row that has at least one missing value, then re-count the
# nulls per column to confirm none remain
df = df.dropna()
df.isnull().sum()

name             0
year             0
selling_price    0
km_driven        0
fuel             0
seller_type      0
transmission     0
owner            0
mileage          0
engine           0
max_power        0
torque           0
seats            0
dtype: int64

In [6]:
# Preview the first rows after the rows with missing values were removed
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0


In [7]:
# Age of each car in whole years, measured against the calendar year in which
# this cell is executed.
# NOTE(review): because the reference year comes from today's date, this feature
# (and anything trained on it) changes when the notebook is re-run in a later year.
current_year = datetime.datetime.today().strftime('%Y')

# Vectorized subtraction over the whole column; yields the same integers as a
# per-row conversion would.
df['car_used_years'] = int(current_year) - df['year'].astype('int64')
df.tail()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats,car_used_years
8123,Hyundai i20 Magna,2013,320000,110000,Petrol,Individual,Manual,First Owner,18.5 kmpl,1197 CC,82.85 bhp,113.7Nm@ 4000rpm,5.0,11
8124,Hyundai Verna CRDi SX,2007,135000,119000,Diesel,Individual,Manual,Fourth & Above Owner,16.8 kmpl,1493 CC,110 bhp,"24@ 1,900-2,750(kgm@ rpm)",5.0,17
8125,Maruti Swift Dzire ZDi,2009,382000,120000,Diesel,Individual,Manual,First Owner,19.3 kmpl,1248 CC,73.9 bhp,190Nm@ 2000rpm,5.0,15
8126,Tata Indigo CR4,2013,290000,25000,Diesel,Individual,Manual,First Owner,23.57 kmpl,1396 CC,70 bhp,140Nm@ 1800-3000rpm,5.0,11
8127,Tata Indigo CR4,2013,290000,25000,Diesel,Individual,Manual,First Owner,23.57 kmpl,1396 CC,70 bhp,140Nm@ 1800-3000rpm,5.0,11


In [8]:
# Remove, in a single call, the columns that are not used further on:
# `year` was already turned into `car_used_years`; the rest are dropped as-is.
df.drop(columns=["year", "owner", "max_power", "name", "torque"], inplace=True)
df.head()

Unnamed: 0,selling_price,km_driven,fuel,seller_type,transmission,mileage,engine,seats,car_used_years
0,450000,145500,Diesel,Individual,Manual,23.4 kmpl,1248 CC,5.0,10
1,370000,120000,Diesel,Individual,Manual,21.14 kmpl,1498 CC,5.0,10
2,158000,140000,Petrol,Individual,Manual,17.7 kmpl,1497 CC,5.0,18
3,225000,127000,Diesel,Individual,Manual,23.0 kmpl,1396 CC,5.0,14
4,130000,120000,Petrol,Individual,Manual,16.1 kmpl,1298 CC,5.0,17


In [9]:
# Number of distinct values in each remaining column
df.nunique()

selling_price     670
km_driven         898
fuel                4
seller_type         3
transmission        2
mileage           393
engine            121
seats               9
car_used_years     27
dtype: int64

In [10]:

# Collapse the rarer fuel labels into the two main ones.
# NOTE(review): LPG is folded into Diesel here - confirm this grouping is intended.
fuel_mapping = dict(CNG='Petrol', LPG='Diesel')
df['fuel'] = df['fuel'].replace(to_replace=fuel_mapping)

# Treat "Trustmark Dealer" listings as ordinary dealer listings.
seller_mapping = {'Trustmark Dealer': 'Dealer'}
df['seller_type'] = df['seller_type'].replace(to_replace=seller_mapping)

In [11]:
# Re-check the distinct value counts after the category labels were merged
df.nunique()

selling_price     670
km_driven         898
fuel                2
seller_type         2
transmission        2
mileage           393
engine            121
seats               9
car_used_years     27
dtype: int64

In [12]:
# `mileage` and `engine` hold "<number> <unit>" strings; keep the text before the
# first space and store it as a float.
df['mileage'] = df['mileage'].str.split(" ").str[0].astype(float)
df['engine'] = df['engine'].str.split(" ").str[0].astype(float)
df.head()


Unnamed: 0,selling_price,km_driven,fuel,seller_type,transmission,mileage,engine,seats,car_used_years
0,450000,145500,Diesel,Individual,Manual,23.4,1248.0,5.0,10
1,370000,120000,Diesel,Individual,Manual,21.14,1498.0,5.0,10
2,158000,140000,Petrol,Individual,Manual,17.7,1497.0,5.0,18
3,225000,127000,Diesel,Individual,Manual,23.0,1396.0,5.0,14
4,130000,120000,Petrol,Individual,Manual,16.1,1298.0,5.0,17


In [13]:
# Encode the three two-valued categorical columns as 0/1 integers.
#
# Series.map returns NaN for every value that is missing from the mapping, so
# each encoded column is validated before anything is written back to `df`.
# This also turns an accidental re-run of this cell (when the columns already
# hold 0/1) into a clear error instead of columns silently filled with NaN.
binary_encodings = {
    'transmission': {'Manual': 0, 'Automatic': 1},
    'fuel': {'Petrol': 0, 'Diesel': 1},
    'seller_type': {'Dealer': 0, 'Individual': 1},
}

# First pass: compute and validate every encoding without touching `df`.
encoded_columns = {}
for column_name, codes in binary_encodings.items():
    encoded = df[column_name].map(codes)
    unmapped = df.loc[encoded.isna(), column_name].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Cannot encode column '{column_name}': unexpected values {list(unmapped)}"
        )
    encoded_columns[column_name] = encoded

# Second pass: all columns are valid, so replace them.
for column_name, encoded in encoded_columns.items():
    df[column_name] = encoded

df.head()

Unnamed: 0,selling_price,km_driven,fuel,seller_type,transmission,mileage,engine,seats,car_used_years
0,450000,145500,1,1,0,23.4,1248.0,5.0,10
1,370000,120000,1,1,0,21.14,1498.0,5.0,10
2,158000,140000,0,1,0,17.7,1497.0,5.0,18
3,225000,127000,1,1,0,23.0,1396.0,5.0,14
4,130000,120000,0,1,0,16.1,1298.0,5.0,17


In [14]:
### importing packages
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [15]:
# Separate the target (`selling_price`) from the predictors; keep the labelled
# pandas objects and also expose plain NumPy arrays for modelling.
y_var = df['selling_price']
X_vars = df.drop(columns='selling_price')
X, y = X_vars.to_numpy(), y_var.to_numpy()

In [16]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=374,
)

In [17]:
# Learn the scaling statistics from the training rows only, then apply that
# same fitted scaler to both splits
sc = StandardScaler().fit(train_X)
train2_X = sc.transform(train_X)
test2_X = sc.transform(test_X)

In [18]:
# Fit a linear regression on the scaled training data
lr_model = LinearRegression().fit(train2_X, train_y)
# Last expression of the cell: display the fitted estimator
lr_model

In [19]:
# Predict selling prices for the scaled test rows and display the resulting array
lr_predictions = lr_model.predict(test2_X)
lr_predictions

array([ 659113.86615488,  598217.79106616, 1257994.88923231, ...,
        468656.57670089,  661369.92927158,  294514.95615017])

In [20]:
# Intercept term learned by the linear model
y_intercept = lr_model.intercept_
y_intercept

np.float64(643715.6438962683)

In [46]:
#Linear Model Equation
def equation_creator(column_names, coefficients, intercept=None):
    """Print a banner and return a linear model written out as an equation.

    Args:
        column_names: Iterable of feature names, in the same order as
            ``coefficients``.
        coefficients: Iterable of numeric coefficients, one per feature.
        intercept: Intercept term to show in the equation. When omitted, the
            intercept of the notebook-level ``lr_model`` is used, so existing
            two-argument calls keep their behavior. Pass it explicitly when the
            coefficients come from any other model.

    Returns:
        str: ``"y(selling_price) = <intercept> <signed terms>"`` where each term
        is rendered as ``" + <coef> * <name>"`` or ``" - <|coef|> * <name>"``.

    Note:
        The terms are only as meaningful as the inputs the model was fitted on.
        In this notebook the linear model is fitted on standardized features, so
        each coefficient applies to the scaled feature, not the raw column value.
    """
    if intercept is None:
        # Backward-compatible default: fall back to the notebook's linear model.
        intercept = lr_model.intercept_

    banner = "###############################################"
    print(banner)
    print("Linear Model Equation")
    print(banner)
    print("\n")

    # The sign is rendered as the operator, so the magnitude is printed unsigned.
    terms = [
        f" + {coefficient} * {column_name}"
        if coefficient >= 0
        else f" - {abs(coefficient)} * {column_name}"
        for column_name, coefficient in zip(column_names, coefficients)
    ]

    return f'y(selling_price) = {intercept} {"".join(terms)}'


# Print the equation of the fitted linear model, pairing each feature column
# name with its coefficient
print(equation_creator(X_vars.columns, lr_model.coef_))

###############################################
Linear Model Equation
###############################################


y(selling_price) = 643715.6438962683  - 72400.21176546405 * km_driven + 57295.55746226688 * fuel - 90687.04710946728 * seller_type + 267230.7823521329 * transmission - 687.4819385373331 * mileage + 359209.76691570046 * engine - 162258.49339857243 * seats - 204259.58316277261 * car_used_years


In [None]:
# Model evaluation: R2 score of the linear model on the scaled training data
# and on the scaled test data, shown as a (train, test) tuple
lr_model.score(train2_X, train_y), lr_model.score(test2_X, test_y)

(0.5690908754099034, 0.5789541670180263)

In [29]:
def displays_model_evaluations(model_name, model_predictions, test_y, n_features=None):
    """Print a report banner and return a table of regression metrics.

    Args:
        model_name: The fitted estimator object (despite the parameter name, this
            is the model itself, not a string). Its ``intercept_`` is reported
            when it has one.
        model_predictions: Predicted target values for the evaluated rows.
        test_y: True target values for the same rows.
        n_features: Number of predictors, used for the adjusted R2. When omitted,
            the model's ``n_features_in_`` is used if present; otherwise the
            column count of the notebook-level ``X``.

    Returns:
        pandas.DataFrame: Two columns, ``'Evaluation name'`` and ``'Values'``;
        every value is a string rounded to two decimals. The intercept is
        ``'N/A'`` for models that do not expose one.
    """

    def _model_intercept():
        """Return the intercept rounded to 2 decimals, or None if unavailable."""
        try:
            intercept = model_name.intercept_
        except AttributeError as e:
            print(f"Some error encountered in model intercept value. Error details - {e}")
            return None
        return np.round(intercept, 2)

    def _adjusted_r2(r2, n_samples, n_predictors):
        """Adjust R2 for the number of predictors; NaN when it is undefined."""
        degrees_of_freedom = n_samples - n_predictors - 1
        if degrees_of_freedom <= 0:
            return float("nan")
        return 1 - (1 - r2) * (n_samples - 1) / degrees_of_freedom

    if n_features is None:
        n_features = getattr(model_name, "n_features_in_", None)
    if n_features is None:
        # Last resort, kept for backward compatibility with the notebook globals.
        n_features = X.shape[1]

    print("####################################################")
    print("############# Model Evaluation Report ##############")
    print("####################################################")

    intercept = _model_intercept()
    r2 = metrics.r2_score(test_y, model_predictions)
    mae = metrics.mean_absolute_error(test_y, model_predictions)
    mse = metrics.mean_squared_error(test_y, model_predictions)
    # The sample size must be that of the rows R2 was computed on (the evaluated
    # set), not the size of the full dataset.
    adjusted_r2 = _adjusted_r2(r2, len(test_y), n_features)

    df1 = pd.DataFrame({
        'Evaluation name': [
            'Intercept',
            'R2',
            'Adjusted R2',
            'Mean Absolute Error (MAE)',
            'Mean Square Error (MSE)',
            'Root Mean Squared Error (RMSE)',
        ],
        'Values': [
            # Models without an intercept (e.g. tree ensembles) no longer crash here.
            'N/A' if intercept is None else f'{intercept}',
            f'{round(r2, 2)}',
            f'{round(adjusted_r2, 2)}',
            f'{round(mae, 2)}',
            f'{round(mse, 2)}',
            f'{round(np.sqrt(mse), 2)}',
        ]
    })
    return df1

# Build the evaluation report for the linear model from its test-set predictions
displays_model_evaluations(lr_model, lr_predictions, test_y)

####################################################
############# Model Evaluation Report ##############
####################################################


Unnamed: 0,Evaluation name,Values
0,Intercept,643715.64
1,R2,0.58
2,Adjusted R2,0.58
3,Mean Absolute Error (MAE),311602.01
4,Mean Square Error (MSE),293874506357.69
5,Root Mean Squared Error (RMSE),542101.93


In [40]:
# Forecast table: actual vs. predicted price for the test rows, plus the error
print(
    "####################################################",
    "################ Forecast Table ####################",
    "####################################################",
    sep="\n",
)

predicted = lr_predictions.T
diff = predicted - test_y

# Every column is rounded to three decimals before the frame is assembled.
forecast_columns = {
    'Actual': np.round(test_y, 3),
    'Predicted': np.round(predicted, 3),
    'Difference': np.round(diff, 3),
}
forecasted_table = pd.DataFrame(forecast_columns)
forecasted_table.head()

####################################################
################ Forecast Table ####################
####################################################


Unnamed: 0,Actual,Predicted,Difference
0,300000,659113.866,359113.866
1,525000,598217.791,73217.791
2,950000,1257994.889,307994.889
3,610000,681441.028,71441.028
4,625000,1655947.898,1030947.898


### RandomForestRegressor

### Pipelining using joblib

In [41]:
# Random forest with 100 trees, fitted on the scaled training data; the seed
# fixes the result across runs
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(train2_X, train_y)

# Predictions for the rows the model was fitted on and for the held-out rows
train_pred, test_pred = (rf_model.predict(features) for features in (train2_X, test2_X))

In [42]:
# R2 of the random forest on the training rows and on the held-out rows
train_score, test_score = (
    r2_score(actual, estimated)
    for actual, estimated in ((train_y, train_pred), (test_y, test_pred))
)

print(f"Random Forest R2 Scores - Train: {train_score:.4f}, Test: {test_score:.4f}")

Random Forest R2 Scores - Train: 0.9924, Test: 0.9481


In [43]:
# Optimizing the random forest with GridSearchCV

# Candidate hyper-parameters: 3 x 4 x 2 = 24 combinations
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10, 15],
    'max_features': ['log2', 'sqrt']
}

# Base estimator; the seed keeps every candidate reproducible
rf = RandomForestRegressor(random_state=42)

# 5-fold cross-validated search over the grid, using all available cores.
# Scores are negated MSE because the search always maximizes its score.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)

# Fit the grid search on the scaled training data
grid_search.fit(train2_X, train_y)

# Best parameter combination and its mean cross-validated MSE
best_params = grid_search.best_params_
best_score = -grid_search.best_score_  # Negate to turn the score back into an MSE

print(f"Best Parameters: {best_params}")
print(f"Best MSE: {best_score:.4f}")

# Predict the held-out rows with the winning estimator
best_model = grid_search.best_estimator_
predictions = best_model.predict(test2_X)

# Evaluate the winning estimator on the held-out rows
r2 = r2_score(test_y, predictions)
mse = mean_squared_error(test_y, predictions)

print(f"R-squared Score: {r2:.4f}")
print(f"Mean Squared Error: {mse:.4f}")

Best Parameters: {'max_depth': None, 'max_features': 'log2', 'n_estimators': 100}
Best MSE: 34944749165.2705
R-squared Score: 0.9503
Mean Squared Error: 34690156456.7625


In [44]:
# Save the tuned random forest (the grid-search winner) to a file using joblib.
# The model was fitted on inputs transformed by `sc`, so whatever loads this
# file must apply that same scaler to its inputs before predicting.
joblib.dump(best_model, 'best_model.joblib')

['best_model.joblib']

In [45]:
# Save the fitted StandardScaler so the identical scaling can be re-applied to
# new inputs before they are passed to the saved model
joblib.dump(sc, 'scaler.joblib')

['scaler.joblib']