In [15]:

import statsmodels.api as sm
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Dataset Overview

## Header

In [16]:
df = pd.read_csv("car_price_dataset.csv")
df.head()

Unnamed: 0,Brand,Model,Year,Engine_Size,Fuel_Type,Transmission,Mileage,Doors,Owner_Count,Price
0,Kia,Rio,2020,4.2,Diesel,Manual,289944,3,5,8501
1,Chevrolet,Malibu,2012,2.0,Hybrid,Automatic,5356,2,3,12092
2,Mercedes,GLA,2020,4.2,Diesel,Automatic,231440,4,2,11171
3,Audi,Q5,2023,2.0,Electric,Manual,160971,2,1,11780
4,Volkswagen,Golf,2003,2.6,Hybrid,Semi-Automatic,286618,3,3,2867


In [17]:
df.shape

(10000, 10)

## Columns with Datatypes

In [18]:
df_summary = pd.DataFrame({
    "Column Name": df.columns,
    "Non-Null Count": df.count().values,  # Count of non-null values
    "Data Type": df.dtypes.values
})

df_summary

Unnamed: 0,Column Name,Non-Null Count,Data Type
0,Brand,10000,object
1,Model,10000,object
2,Year,10000,int64
3,Engine_Size,10000,float64
4,Fuel_Type,10000,object
5,Transmission,10000,object
6,Mileage,10000,int64
7,Doors,10000,int64
8,Owner_Count,10000,int64
9,Price,10000,int64


## Missing Values in each column

In [19]:
empty = pd.DataFrame(df.isna().sum())
empty.rename(columns={0:"Count of Empty Values"})

Unnamed: 0,Count of Empty Values
Brand,0
Model,0
Year,0
Engine_Size,0
Fuel_Type,0
Transmission,0
Mileage,0
Doors,0
Owner_Count,0
Price,0


# Pre-Processing

## Replacing Missing values
There are no missing values in the dataset.

## One-hot encoding transformation of categorical columns

In [20]:


# Select categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns

# Separate binary and multi-class categorical columns
binary_cols = [col for col in categorical_cols if df[col].nunique() == 2]
multi_class_cols = [col for col in categorical_cols if df[col].nunique() > 2]

# Apply Label Encoding to binary columns (convert to 0 and 1)
for col in binary_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

# Reduce number of categories for high-cardinality features
top_n = 20  # Keep only top 20 categories
for col in multi_class_cols:
    top_categories = df[col].value_counts().nlargest(top_n).index
    df[col] = df[col].apply(lambda x: x if x in top_categories else "Other")

# Apply One-Hot Encoding with sparse output
encoder = OneHotEncoder(drop='first', sparse=True, handle_unknown='ignore')  
encoded_array = encoder.fit_transform(df[multi_class_cols])

# Convert sparse matrix to DataFrame
encoded_df = pd.DataFrame(encoded_array.toarray(), columns=encoder.get_feature_names_out(multi_class_cols))

# Merge encoded data with original dataframe (drop original multi-class categorical columns)
df_final = df.drop(columns=multi_class_cols).reset_index(drop=True).join(encoded_df)

df_final.head()


TypeError: OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'

## Applying Scaling

In [None]:
# Choose the scaler
scaler = StandardScaler()  

# Fit and transform the dataset
df_scaled = pd.DataFrame(scaler.fit_transform(df_final), columns=df_final.columns)

df_scaled.head()


# Applying Multiple Regression

## Split Data into Train & Test Sets

In [None]:
X = df_scaled.drop(columns=['Price'])  # Replace 'target' with your actual target variable
y = df_scaled['Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Function to evaluate model

In [None]:
def evaluate_model(model, X, y, method_name):
    y_pred = model.predict(X)
    mse = mean_squared_error(y, y_pred)
    print(f"{method_name} - Mean Squared Error: {mse}")
    return mse

## 📌 No Elimination (Keep All Variables)

In [None]:
model_all = LinearRegression()
model_all.fit(X_train, y_train)
evaluate_model(model_all, X_test, y_test, "All Variables")
all_features = X.columns.tolist()


## 📌 Backward Elimination

In [None]:
X_train_const = sm.add_constant(X_train)  # Add intercept term
X_test_const = sm.add_constant(X_test)

# Run OLS Regression
X_train_const = sm.add_constant(X_train)
X_test_const = sm.add_constant(X_test)
model_ols = sm.OLS(y_train, X_train_const).fit()
p_values = model_ols.pvalues.copy()  # Copy p-values to avoid modifying original data

# Keep track of selected features
selected_features = X_train.columns.tolist()

while len(selected_features) > 0:  # Ensure the loop stops when no features remain
    max_p_value = p_values.max()  # Get the highest p-value
    if max_p_value > 0.05:  # If it is greater than 0.05, remove the feature
        worst_feature = p_values.idxmax()  # Find the feature with max p-value
        if worst_feature in selected_features:  # Check if it's still in the list
            selected_features.remove(worst_feature)
        else:
            break  # Stop if feature is already removed (prevents the error)

        # Re-run OLS without the removed feature
        X_train_const = sm.add_constant(X_train[selected_features])
        X_test_const = sm.add_constant(X_test[selected_features])
        model_ols = sm.OLS(y_train, X_train_const).fit()
        p_values = model_ols.pvalues.copy()  # Update p-values
    else:
        break  # Stop when all remaining p-values are below 0.05

# Train final model with selected features
model_backward = LinearRegression()
model_backward.fit(X_train[selected_features], y_train)
evaluate_model(model_backward, X_test[selected_features], y_test, "Backward Elimination")


## 📌 Forward Selection

In [None]:
selected_features = []
remaining_features = list(X_train.columns)

while len(remaining_features) > 0:
    best_feature = None
    best_score = float('inf')
    
    for feature in remaining_features:
        temp_features = selected_features + [feature]
        model_temp = LinearRegression()
        model_temp.fit(X_train[temp_features], y_train)
        mse = mean_squared_error(y_test, model_temp.predict(X_test[temp_features]))
        
        if mse < best_score:
            best_score = mse
            best_feature = feature

    if best_feature:
        selected_features.append(best_feature)
        remaining_features.remove(best_feature)
    else:
        break

# Train final forward selection model
model_forward = LinearRegression()
model_forward.fit(X_train[selected_features], y_train)
evaluate_model(model_forward, X_test[selected_features], y_test, "Forward Selection")


## 📌 Stepwise Selection (Combination of Forward & Backward)

In [None]:
X = df_scaled.drop(columns=["Price"])
y = df_scaled["Price"]

# Initialize
selected_features = []  # Stores selected features (Forward Selection)
candidate_features = list(X.columns)  # Features that can be added
best_mse = float('inf')  # Best model MSE

# Forward and Backward Selection
while candidate_features:
    improved = False
    best_feature = None
    
    # Forward Selection: Try adding each feature
    for feature in candidate_features:
        temp_features = selected_features + [feature]
        X_temp = X[temp_features]
        model = LinearRegression().fit(X_temp, y)
        y_pred = model.predict(X_temp)
        mse = mean_squared_error(y, y_pred)
        
        if mse < best_mse:
            best_mse = mse
            best_feature = feature
            improved = True
    
    if improved and best_feature:
        selected_features.append(best_feature)
        candidate_features.remove(best_feature)
    else:
        break  # Stop if no improvement

    # Backward Elimination: Remove least significant feature
    X_temp = X[selected_features]
    X_temp = sm.add_constant(X_temp)
    model = sm.OLS(y, X_temp).fit()
    p_values = model.pvalues.iloc[1:]  # Exclude intercept
    max_p = p_values.max()
    
    if max_p > 0.05:  # Threshold for significance
        worst_feature = p_values.idxmax()
        selected_features.remove(worst_feature)
        candidate_features.append(worst_feature)
    
# Final Model with Selected Features
X_final = sm.add_constant(X[selected_features])
final_model = sm.OLS(y, X_final).fit()
print(final_model.summary())


In [None]:
## 📌 Compare the Selected Features

In [None]:
print("\nFeature Comparison:")
print()
print(f"All Features: {all_features}")
print()
print(f"Backward Elimination Selected: {selected_features}")
print()
print(f"Forward Selection Selected: {selected_features}")
print()
print(f"Stepwise Selection Selected: {selected_features}")