In [1]:
class LassoReg:
    def __init__(self, alpha=1.0, max_iter=100, tol=0.5):
        self.alpha = alpha
        self.max_iter = max_iter
        self.tol = tol
        self.coef_ = None
        self.intercept_ = None
    def fit(self, X, y):
      n_samples, n_features = X.shape
      self.coef_ = np.zeros(n_features)
      self.intercept_ = 0.0

      Transpose = np.dot(X.T, X)

      for _ in range(self.max_iter):
        prev_coef = np.copy(self.coef_)
        prev_intercept = self.intercept_

        residuals = y - np.dot(X, self.coef_)
        self.intercept_ = np.mean(residuals) if n_samples > 1 else residuals[0]

        for j in range(n_features):
            rho_j = np.dot(X[:, j], residuals + np.dot(X, self.coef_) - X[:, j] * self.coef_[j])
            if rho_j < -self.alpha / 2:
                self.coef_[j] = (rho_j + self.alpha / 2) / Transpose[j, j]
            elif rho_j > self.alpha / 2:
                self.coef_[j] = (rho_j - self.alpha / 2) / Transpose[j, j]
            else:
                self.coef_[j] = 0.0

        coef_change = np.max(np.abs(self.coef_ - prev_coef))
        intercept_change = np.abs(self.intercept_ - prev_intercept)
        if coef_change < self.tol and intercept_change < self.tol:
            break


    def predict(self, X):
        return np.dot(X, self.coef_) + self.intercept_

In [2]:
import pandas as pd
file_path = '/Users/ishikamanghwani/desktop/cardetails_csv.csv'
data = pd.read_csv(file_path)
data.head()
data.columns
data_encoded = pd.get_dummies(data, columns=['fuel', 'seller_type', 'transmission', 'owner'])
data_encoded.head()

Unnamed: 0,name,year,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Dealer,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Automatic,transmission_Manual,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti 800 AC,2007,70000,60000,False,False,False,False,True,False,True,False,False,True,True,False,False,False,False
1,Maruti Wagon R LXI Minor,2007,50000,135000,False,False,False,False,True,False,True,False,False,True,True,False,False,False,False
2,Hyundai Verna 1.6 SX,2012,100000,600000,False,True,False,False,False,False,True,False,False,True,True,False,False,False,False
3,Datsun RediGO T Option,2017,46000,250000,False,False,False,False,True,False,True,False,False,True,True,False,False,False,False
4,Honda Amaze VX i-DTEC,2014,141000,450000,False,True,False,False,False,False,True,False,False,True,False,False,True,False,False


In [3]:
X = data_encoded.drop(columns=['selling_price','name'])
y = data_encoded['selling_price']

In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest

X_df = pd.DataFrame(X)
y_df = pd.Series(y)

nan_mask_X = X_df.isna().any(axis=1)
nan_mask_y = y_df.isna()

nan_mask_combined = nan_mask_X | nan_mask_y

X_clean = X_df[~nan_mask_combined]
y_clean = y_df[~nan_mask_combined]

isolation_forest = IsolationForest(contamination=0.1, random_state=42)  
outlier_mask = isolation_forest.fit_predict(X_clean) != -1  

X_clean = X_clean[outlier_mask]
y_clean = y_clean[outlier_mask]

y_clean = y_clean[:X_clean.shape[0]]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_clean)
X_scaled_df = pd.DataFrame(X_scaled)

print("Original data shape:", X.shape)
print("Cleaned data shape:", X_clean.shape)
print("Scaled data shape:", X_scaled_df.shape)
print("Scaled feature shape", y_clean.shape)

Original data shape: (4340, 17)
Cleaned data shape: (3906, 17)
Scaled data shape: (3906, 17)
Scaled feature shape (3906,)


In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaled_df, y_clean, test_size=0.2, random_state=42)

print("Training data shape:", X_train.shape)
print("Testing data shape:", X_test.shape)
print("Training target shape:", y_train.shape)
print("Testing target shape:", y_test.shape)

Training data shape: (3124, 17)
Testing data shape: (782, 17)
Training target shape: (3124,)
Testing target shape: (782,)


In [6]:
data_encoded.head()

Unnamed: 0,name,year,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Dealer,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Automatic,transmission_Manual,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti 800 AC,2007,70000,60000,False,False,False,False,True,False,True,False,False,True,True,False,False,False,False
1,Maruti Wagon R LXI Minor,2007,50000,135000,False,False,False,False,True,False,True,False,False,True,True,False,False,False,False
2,Hyundai Verna 1.6 SX,2012,100000,600000,False,True,False,False,False,False,True,False,False,True,True,False,False,False,False
3,Datsun RediGO T Option,2017,46000,250000,False,False,False,False,True,False,True,False,False,True,True,False,False,False,False
4,Honda Amaze VX i-DTEC,2014,141000,450000,False,True,False,False,False,False,True,False,False,True,False,False,True,False,False


In [7]:
print("X train y train \n")
X_train
y_train

X train y train 



271      130000
3409     160000
3051     400000
2632     200000
3634     350000
         ...   
1278     450000
1455     600000
975     1770000
3908     280000
3541     180000
Name: selling_price, Length: 3124, dtype: int64

In [8]:
print("Training data shape:", X_train.shape)
print("Training target shape:", y_train.shape)

Training data shape: (3124, 17)
Training target shape: (3124,)


In [9]:
#numpy
import numpy as np
X_train_np = X_train.to_numpy()
y_train_np = y_train.to_numpy()

lasso_model = LassoReg(alpha=1.0)
lasso_model.fit(X_train_np, y_train_np)

predictions = lasso_model.predict(X_test)

mse = mean_squared_error(y_test, predictions)
print("Mean Squared Error:", mse)

Mean Squared Error: 1.4316395686226122e+33
