## *import libraries*

In [90]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

## *import csv data*

In [91]:
# Load the raw used-car dataset.
# The location can be overridden with the CARDATA_CSV environment variable so the
# notebook runs on machines other than the author's; when the variable is not set,
# the original local path is used unchanged.
import os
from pathlib import Path

DATA_PATH = Path(os.environ.get(
    "CARDATA_CSV",
    r"C:\Users\Hp\Desktop\ml projects\04. Used Car Prediction (Regression)\data\cardata.csv",
))
df = pd.read_csv(DATA_PATH)

In [92]:
# Preview the first two rows to check that the columns were parsed as expected.
df.head(2)

Unnamed: 0.1,Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,0,Maruti Alto,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,1,Hyundai Grand,Hyundai,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000


## Data cleaning:-

In [93]:
# Per-column count of missing values (isna is the canonical name of isnull).
df.isna().sum()

Unnamed: 0           0
car_name             0
brand                0
model                0
vehicle_age          0
km_driven            0
seller_type          0
fuel_type            0
transmission_type    0
mileage              0
engine               0
max_power            0
seats                0
selling_price        0
dtype: int64

In [94]:
# Count fully duplicated rows.
# "Unnamed: 0" looks like a saved row index (0, 1, ... in the preview above) — TODO confirm.
# If it is, it makes every row unique and the duplicate check can never fire, so it is
# left out of the comparison. errors="ignore" keeps the cell runnable after the column
# has been dropped from df.
df.drop(columns=["Unnamed: 0"], errors="ignore").duplicated().sum()

0

**Report**
* No missing values and no duplicate rows

#### car name column can be dropped as it is already split into brand and model

In [95]:
# Drop car_name (already split into brand and model) and the "Unnamed: 0" column.
# Re-assigning instead of inplace=True, together with errors="ignore", makes this cell
# safe to re-run: a second execution is a no-op instead of a KeyError.
df = df.drop(columns=["car_name", "Unnamed: 0"], errors="ignore")
        

In [96]:
# Confirm that car_name and "Unnamed: 0" are gone after the drop.
df.head(1)

Unnamed: 0,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000


### numeric features:-

In [97]:
# Numeric columns: every column whose dtype is not the generic object dtype.
# At this point the list still includes the target, selling_price.
num_features = [column for column, dtype in df.dtypes.items() if dtype != "O"]
print("num of numerical feature :", len(num_features))

num of numerical feature : 7


### categorical features:-

In [98]:
# Categorical columns: the ones pandas stores with the object dtype.
cat_features = [name for name in df if df[name].dtype == "O"]
print("num of categorical feature :", len(cat_features))

num of categorical feature : 5


### discrete features:-

In [99]:
# Numeric columns with at most 25 distinct values are treated as discrete.
# dropna=False counts NaN as a value, matching len(Series.unique()).
discrete_features = [
    name for name in num_features
    if df[name].nunique(dropna=False) <= 25
]
print('Num of Discrete Features :', len(discrete_features))

Num of Discrete Features : 2


### continuous features:-

In [100]:
# Continuous features are the numeric columns that were not classified as discrete.
# (The variable keeps its original name so later references still work.)
continues_features=[feature for feature in num_features if feature not in discrete_features]
# The label previously said "Discrete" although this count is for continuous features.
print('Num of Continuous Features :',len(continues_features))

Num of Discrete Features : 5


**outliers or skewness**

In [101]:
# Skewness of every numeric column before any transformation
# (values well above 0 indicate a long right tail).
df[num_features].skew()

vehicle_age       0.833712
km_driven        28.172711
mileage           0.104961
engine            1.666467
max_power         2.485129
seats             2.039983
selling_price    10.047048
dtype: float64

**Use a `log transformation` because the data is `positively skewed`**

In [102]:
# Log-transform the skewed columns, including the target selling_price.
# log1p(x) = log(1 + x), so a value of 0 stays defined.
# NOTE: df is overwritten in place, so running this cell twice applies the transform twice.
for skewed_column in ("km_driven", "max_power", "seats", "selling_price", "engine"):
    df[skewed_column] = np.log1p(df[skewed_column])

In [103]:
# Skewness after the log1p transform; num_features is still the dtype-based list
# built earlier, so the target selling_price is included.
df[num_features].skew()

vehicle_age      0.833712
km_driven       -1.036292
mileage          0.104961
engine           0.699124
max_power        0.771409
seats            1.245620
selling_price    0.568413
dtype: float64

In [104]:
##  Independent and dependent features:
# Select the target by name rather than by position, so a change in column order
# cannot silently turn a different column into the target.
# y holds log1p(selling_price) because the column was log-transformed above.
x = df.drop(columns="selling_price")
y = df["selling_price"]

In [105]:
# Check that the feature frame no longer contains selling_price.
x.head(1)

Unnamed: 0,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats
0,Maruti,Alto,9,11.695255,Individual,Petrol,Manual,19.7,6.680855,3.85651,1.791759


In [106]:
# Target series: selling_price after the log1p transform.
y

0        11.695255
1        13.217675
2        12.278398
3        12.328295
4        13.253393
           ...    
15406    12.429220
15407    13.737550
15408    12.959847
15409    14.018452
15410    13.997833
Name: selling_price, Length: 15411, dtype: float64

**feature encoding and scaling-->**

In [112]:
# Columns routed to each transformer.
# num_features is redefined here WITHOUT the target (selling_price), unlike the
# dtype-based list built earlier in the notebook.
num_features = ['vehicle_age', 'km_driven', 'mileage', 'engine', 'max_power', 'seats']
onehot_columns = ['seller_type','fuel_type','transmission_type','brand', 'model']


from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Initialize transformers
numeric_transformer = StandardScaler()
# handle_unknown="ignore": a category that was not present when the encoder was fitted
# (e.g. a new brand or model at prediction time) is encoded as all zeros instead of
# raising an error. The encoding of the fitted data itself is unchanged.
oh_transformer = OneHotEncoder(handle_unknown="ignore")

# Create the ColumnTransformer: one-hot encode the categorical columns and
# standardize the numeric ones.
preprocessor = ColumnTransformer(
    transformers=[
        ("OneHotEncoder", oh_transformer, onehot_columns),
        ("StandardScaler", numeric_transformer, num_features),
    ]
)


# Fit and transform the data.
# NOTE(review): the preprocessor is fitted on ALL rows, before the train/test split,
# so the scaler statistics and encoder categories also see the test rows. For a
# leak-free evaluation, split first and fit on the training rows only.
x_transformed = preprocessor.fit_transform(x)


In [115]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x_transformed,
    y,
    random_state=42,
    test_size=0.2,
)
(x_train.shape, x_test.shape)

((12328, 168), (3083, 168))

# **Model selection-->**

In [109]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error
from sklearn.ensemble import RandomForestRegressor,VotingRegressor

In [116]:

# Define individual regression models.
# The tree-based estimators are seeded so that the reported metrics are reproducible
# from one run to the next.
LinearRegression_model = LinearRegression()
Ridge_model = Ridge()
Lasso_model = Lasso()                      # defined here but not part of the ensemble below
KNeighbors_model = KNeighborsRegressor()   # defined here but not part of the ensemble below
DecisionTree_model = DecisionTreeRegressor(random_state=42)
SVR_model = SVR()                          # defined here but not part of the ensemble below
RF_model = RandomForestRegressor(random_state=42)

# Create a voting regressor (ensemble method) over four of the models above.
voting_regressor = VotingRegressor(estimators=[
    ("lr", LinearRegression_model),
    ("ridge", Ridge_model),
    ("dt", DecisionTree_model),
    ("rf", RF_model)
])

# Fit the voting regressor
voting_regressor.fit(x_train, y_train)

# Make predictions (in log1p(selling_price) units, like y_test)
y_pred = voting_regressor.predict(x_test)

# Evaluate performance on the held-out split
mse = mean_squared_error(y_test, y_pred)
r2s = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2s}')


Mean Squared Error: 0.029724144780070252
R^2 Score: 0.9405094025203035


In [117]:
# Test-set MSE of the voting regressor, measured on the log1p-transformed target.
mse

0.029724144780070252

In [118]:
# RMSE = sqrt(MSE); it is in the same units as the target, i.e. log1p(selling_price).
rmse=np.sqrt(mse)

In [119]:
# Show the RMSE computed in the previous cell.
rmse

0.1724069162767847

In [120]:
# Test-set R² of the voting regressor.
r2s

0.9405094025203035

In [121]:
# Mean of the target over all rows; y is log1p(selling_price), so this is a mean of logs.
np.mean(y)

13.279565000319645

**Relative RMSE = RMSE/Mean of target(y)**

In [122]:
 # Relative RMSE as a percentage of the mean target.
 # NOTE(review): rmse and y are both in log1p(selling_price) units, so this ratio is
 # not a percentage error on the actual price. Back-transform with np.expm1 before
 # quoting it as one.
 rmse/np.mean(y)*100

1.298287378183207

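**Relative RMSE of 1.30%: the RMSE is about 1.30% of the mean target value. Note that both quantities are measured on the log-transformed price (`log1p(selling_price)`), so this figure is not a percentage error on the actual selling price; to judge real-world performance, convert predictions back with `np.expm1` and recompute the metric on the original scale.**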

In [123]:
# Candidate models, each trained and scored on the same train/test split.
# The stochastic estimators are seeded so the comparison is reproducible.
models = {
    "LinearRegression_model": LinearRegression(),
    "Ridge_model": Ridge(),
    "Lasso_model": Lasso(),
    "KNeighbors_model": KNeighborsRegressor(),
    "DecisionTree_model": DecisionTreeRegressor(random_state=42),
    "SVR_model": SVR(),
    "RF_model": RandomForestRegressor(random_state=42),
}


# Maps model name -> {"MSE": ..., "R²": ...} measured on the test split.
results = {}

# Loop through each model
for name, model in models.items():
    # Fit the model to the training data
    model.fit(x_train, y_train)

    # Loop-local name: do not overwrite the y_pred / mse variables that hold the
    # voting regressor's evaluation.
    model_pred = model.predict(x_test)

    # Store the metrics (both computed on the log1p-transformed target)
    results[name] = {
        "MSE": mean_squared_error(y_test, model_pred),
        "R²": r2_score(y_test, model_pred),
    }

# Print the results for each model
for model_name, metrics in results.items():
    print(f"{model_name} -->> MSE: {metrics['MSE']:.2f}, R²: {metrics['R²']:.2f}")


LinearRegression_model -->> MSE: 0.03, R²: 0.93
Ridge_model -->> MSE: 0.04, R²: 0.93
Lasso_model -->> MSE: 0.50, R²: -0.00
KNeighbors_model -->> MSE: 0.03, R²: 0.93
DecisionTree_model -->> MSE: 0.05, R²: 0.90
SVR_model -->> MSE: 0.03, R²: 0.94
RF_model -->> MSE: 0.03, R²: 0.94


## **hyperparameter tuning--->**

In [81]:
# hyperparameter tunning
from sklearn.model_selection import GridSearchCV

In [82]:
from sklearn.neighbors import KNeighborsRegressor

In [83]:
# Baseline KNN regressor with default hyperparameters, fitted on the training split;
# it is scored in the next cell, before the grid search.
KNeighbors = KNeighborsRegressor()
KNeighbors.fit(x_train,y_train)

In [84]:
# Score of the baseline KNN model on the training and test splits, as percentages.
tuple(
    KNeighbors.score(features, target) * 100
    for features, target in ((x_train, y_train), (x_test, y_test))
)

(95.2174630804953, 93.1364912376256)

In [85]:
# Search space for the KNN grid search: neighbourhood size, vote weighting and
# distance metric (4 x 2 x 2 = 16 combinations).
p_g = dict(
    n_neighbors=[3, 5, 7, 10],
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan'],
)

In [86]:
# Exhaustive search over p_g using GridSearchCV's default cross-validation and scoring.
gd = GridSearchCV(estimator=KNeighborsRegressor(), param_grid=p_g)
gd.fit(x_train, y_train)

In [87]:
# Parameter combination selected by the grid search.
gd.best_params_

{'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'uniform'}

In [88]:
# Rebuild the KNN regressor from the grid-search winner instead of re-typing the
# parameters by hand, so this cell cannot drift out of sync with gd.best_params_
# when the grid or the data changes.
KNeighbors = KNeighborsRegressor(**gd.best_params_)

KNeighbors.fit(x_train,y_train)

In [89]:
# Score of the tuned KNN model on the training and test splits, as percentages.
(
    100 * KNeighbors.score(x_train, y_train),
    100 * KNeighbors.score(x_test, y_test),
)

(94.68171158658694, 93.20573833332456)

*My training score (R²) is `94%` and my testing score is `93%`, which indicates a little `overfitting`.*

**problems:-**
* At first my model showed `92%` accuracy (R² score) with `KNN` and `89%` with `Decision Tree`, and the `voting regressor` gave a best score of `89%`.
* This suggested `overfitting` in the model, so to reduce overfitting I used `Random Forest`, which showed an accuracy of `94%`.
* But there was one problem: `Relative RMSE = RMSE / mean of target (y) ≈ 28%`, which means that on average the model's predictions deviated from the actual values by about 28.5% of the mean target value.
* After finding such a large `Relative RMSE` I spent more time on the data and realised that I had forgotten to apply a `log transformation`, even though the data has `positive skewness`. After fixing this, the `model accuracy` increased by `2%` and the `Relative RMSE` decreased by about `27` percentage points (note that the new value is measured on the log-transformed price, so the two figures are not on the same scale).

# **Final conclusion----->**

## **The best models are `SVR` and `Random Forest`, with `R²` ≈ `0.94`, `MSE` ≈ `0.03` and `Relative RMSE` ≈ `1.3%` (all measured on the log-transformed price)**

In [124]:
# Select the model with the highest test-set R² from the comparison results instead
# of hard-coding a key. The hard-coded "DecisionTree_model" did not match the reported
# results, where SVR and Random Forest scored highest.
best_model_name = max(results, key=lambda model_name: results[model_name]["R²"])
best_model = models[best_model_name]

In [32]:
import pickle

## **save the `Scaler` and `Model` as Pickle file**

In [125]:
# Persist the selected model. The context manager guarantees the file is flushed
# and closed, even if pickling raises.
with open("Model.pkl", "wb") as model_file:
    pickle.dump(best_model, model_file)

In [126]:
# Persist the fitted ColumnTransformer next to the model; the context manager closes
# the file handle deterministically.
# NOTE: the log1p transforms were applied to df before this preprocessor was fitted and
# are not part of it, so they must be re-applied to raw inputs at prediction time.
with open('preprocessor.pkl', 'wb') as preprocessor_file:
    pickle.dump(preprocessor, preprocessor_file)

In [127]:
# Reload the model to verify the saved file, closing the handle when done.
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
with open("Model.pkl", "rb") as model_file:
    pickled_model = pickle.load(model_file)

In [128]:
# Display the reloaded estimator to confirm the pickle round-trip.
pickled_model

In [129]:
# Read the fitted ColumnTransformer back from disk; this rebinds `preprocessor`
# to the unpickled copy.
with open('preprocessor.pkl', mode='rb') as pickle_handle:
    preprocessor = pickle.load(pickle_handle)


In [130]:
# Load the saved preprocessor under a separate name, closing the file handle when done.
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
with open('preprocessor.pkl', 'rb') as preprocessor_file:
    pickled_preprocessor = pickle.load(preprocessor_file)

In [78]:
# Display the reloaded ColumnTransformer to confirm the pickle round-trip.
pickled_preprocessor