### Importing & Remove completly duplicate columns

In [1]:
import pandas as pd

# Reading Excel files into DataFrames
df_mean_mode_ohe = pd.read_excel(r"H:\CampusX_DS\week43 - My Projects Aug 2024\used_car_price_prediction\Used-Car-Price-Prediction\src\notebook\data\model_ready_data\df_mean_mode_ohe.xlsx")
df_median_mode_ohe = pd.read_excel(r"H:\CampusX_DS\week43 - My Projects Aug 2024\used_car_price_prediction\Used-Car-Price-Prediction\src\notebook\data\model_ready_data\df_median_mode_ohe.xlsx")
df_knn_imputed_ohe = pd.read_excel(r"H:\CampusX_DS\week43 - My Projects Aug 2024\used_car_price_prediction\Used-Car-Price-Prediction\src\notebook\data\model_ready_data\df_knn_imputed_ohe.xlsx")

In [2]:
df_mean_mode_ohe.drop(columns=['content.appointmentId'], inplace=True)
df_median_mode_ohe.drop(columns=['content.appointmentId'], inplace=True)
df_knn_imputed_ohe.drop(columns=['content.appointmentId'], inplace=True)

In [3]:
print(df_mean_mode_ohe.shape, df_median_mode_ohe.shape, df_knn_imputed_ohe.shape)

(2719, 816) (2719, 816) (2719, 823)


In [4]:
# Step 1: Identify and remove completely duplicate columns
df_mean_mode_ohe = df_mean_mode_ohe.loc[:, ~df_mean_mode_ohe.T.duplicated()]
df_median_mode_ohe = df_median_mode_ohe.loc[:, ~df_median_mode_ohe.T.duplicated()]
df_knn_imputed_ohe = df_knn_imputed_ohe.loc[:, ~df_knn_imputed_ohe.T.duplicated()]

In [5]:
print(df_mean_mode_ohe.shape, df_median_mode_ohe.shape, df_knn_imputed_ohe.shape)

(2719, 803) (2719, 803) (2719, 811)


### Remove model & variant to simplify

In [6]:
df_mean_mode_ohe = df_mean_mode_ohe.loc[:, ~df_mean_mode_ohe.columns.str.startswith("content.model_")]
df_median_mode_ohe = df_median_mode_ohe.loc[:, ~df_median_mode_ohe.columns.str.startswith("content.model_")]
df_knn_imputed_ohe = df_knn_imputed_ohe.loc[:, ~df_knn_imputed_ohe.columns.str.startswith("content.model_")]

df_mean_mode_ohe = df_mean_mode_ohe.loc[:, ~df_mean_mode_ohe.columns.str.startswith("content.variant_")]
df_median_mode_ohe = df_median_mode_ohe.loc[:, ~df_median_mode_ohe.columns.str.startswith("content.variant_")]
df_knn_imputed_ohe = df_knn_imputed_ohe.loc[:, ~df_knn_imputed_ohe.columns.str.startswith("content.variant_")]

In [7]:
print(df_mean_mode_ohe.shape, df_median_mode_ohe.shape, df_knn_imputed_ohe.shape)

(2719, 127) (2719, 127) (2719, 135)


### Find Base Accuracy of each dataset to compare (Apply All Regression Algorithms)

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
import statsmodels.api as sm
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import warnings; warnings.filterwarnings('ignore')

# Define a list of algorithms grouped as mentioned
linear_regression_models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'ElasticNet Regression': ElasticNet()
}

non_linear_regression_models = {
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'Support Vector Machine (SVR)': SVR(),
    'K-Nearest Neighbors': KNeighborsRegressor(),
    'Gaussian Process': GaussianProcessRegressor()
}

advanced_regression_models = {
    'XGBoost': xgb.XGBRegressor(objective='reg:squarederror'),
    'LightGBM': lgb.LGBMRegressor(),
    'CatBoost': CatBoostRegressor(learning_rate=0.1, iterations=500, depth=6, silent=True)
}

# Deep Learning Model
def build_keras_nn(X_train, y_train):
    model = Sequential()
    model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mean_squared_error')
    model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=0)

# Bayesian Regression Model
bayesian_regression_models = {
    'Bayesian Linear Regression': sm.OLS
}

# Combine all models into one dictionary
all_models = {
    **linear_regression_models,
    **non_linear_regression_models,
    **advanced_regression_models,
    'Keras Neural Network': build_keras_nn,
    **bayesian_regression_models
}

# Function to apply all regression models to a dataset using cross-validation
def apply_regression_models(data, target_column):
    X = data.drop(target_column, axis=1)
    y = data[target_column]
    
    # Preprocessing: Convert categorical variables to dummy variables, and scale features
    X = pd.get_dummies(X)  # Convert categorical variables to dummy variables
    X = StandardScaler().fit_transform(X)  # Standard scaling
    
    # Dictionary to hold results
    results = []
    
    # Apply each model using cross-validation
    for name, model in all_models.items():
        try:
            if name == 'Keras Neural Network':
                # Keras model needs to be fit on the entire dataset
                build_keras_nn(X, y)
                y_pred = model.predict(X)
                mse = mean_squared_error(y, y_pred)
                r2 = r2_score(y, y_pred)
            elif name == 'Bayesian Linear Regression':
                X_with_const = sm.add_constant(X)
                model_ols = model(X_with_const, y).fit()
                y_pred = model_ols.predict(X_with_const)
                mse = mean_squared_error(y, y_pred)
                r2 = r2_score(y, y_pred)
            else:
                # Use cross-validation
                scores = cross_val_score(model, X, y, scoring='r2', cv=5)
                r2 = np.mean(scores)
                mse = -np.mean(cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=5))
            
            # Store result
            results.append({
                'Algorithm': name,
                'MSE': mse,
                'R²': r2
            })
        except Exception as e:
            print(f"Error occurred while processing model {name}: {e}")
    
    # Return the results as a DataFrame, if any results exist
    if results:
        return pd.DataFrame(results)
    else:
        return pd.DataFrame()  # Return an empty DataFrame if no results

# List of dataframes (df1, df2, df3)
datasets = [df_mean_mode_ohe, df_median_mode_ohe, df_knn_imputed_ohe]

# Loop over datasets
results_all_datasets = []

for idx, dataset in enumerate(datasets, start=1):
    try:
        print(f"Applying models to dataset {idx}")
        # Apply regression models on the current dataset
        target_column = 'content.onRoadPrice'
        results_df = apply_regression_models(dataset, target_column)

        # Check if results_df is not empty before appending
        if not results_df.empty:
            # Add dataset identifier to the results
            results_df['Dataset'] = f'dataset_{idx}'
            # Append results to the final list
            results_all_datasets.append(results_df)
        else:
            print(f"No results for dataset {idx}")
    except Exception as e:
        print(f"Error applying models to dataset {idx}: {e}")

# Check if there are any results to concatenate
if results_all_datasets:
    # Combine results of all datasets into one DataFrame
    final_results = pd.concat(results_all_datasets, ignore_index=True)

    # Sort the results by R² (or MSE, depending on your preference)
    final_results_sorted = final_results.sort_values(by='R²', ascending=False)

    # Display the sorted results, including the dataset name
    print(final_results_sorted[['Dataset', 'Algorithm', 'R²', 'MSE']])
else:
    print("No results to concatenate.")

Applying models to dataset 1
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001714 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1278
[LightGBM] [Info] Number of data points in the train set: 2175, number of used features: 109
[LightGBM] [Info] Start training from score 953609.615632
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001785 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1297
[LightGBM] [Info] Number of data points in the train set: 2175, number of used features: 109
[LightGBM] [Info] Start training from score 963839.552184
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001029 seconds.
You can set `force_row_wise=true` to remove t

Error occurred while processing model Keras Neural Network: 'function' object has no attribute 'predict'
Error occurred while processing model Bayesian Linear Regression: shapes (2719,127) and (1,127) not aligned: 127 (dim 1) != 1 (dim 0)
Applying models to dataset 3
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001455 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1589
[LightGBM] [Info] Number of data points in the train set: 2175, number of used features: 117
[LightGBM] [Info] Start training from score 953609.615632
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001393 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1613
[LightGBM] [Info] Number of data points in the train set: 217

### Scaling data

In [17]:
from sklearn.preprocessing import StandardScaler

def scaling_dataframe(df, target_col):
    # Separate features and target
    X = df.drop(target_col, axis=1)
    y = df[target_col]

    # Scale the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Create a new DataFrame with scaled features
    df_scaled = pd.DataFrame(X_scaled, columns=X.columns)
    df_scaled[target_col] = y.values

    return df_scaled

In [18]:
scaled_dfs = []

for i in datasets:
    scaled_dfs.append(scaling_dataframe(i, "content.onRoadPrice"))

### Filter Method 1: Corelation

In [49]:
def apply_correlation_filter(df, target_column, threshold=0.5):
    if target_column not in df.columns:
        raise ValueError(f"Target column '{target_column}' not found in the DataFrame.")
    
    # Calculate the correlation matrix
    correlation_matrix = df.corr()
    
    # Get the absolute correlation values with respect to the target column
    correlations = correlation_matrix[target_column].abs()
    
    # Select features that have a correlation higher than the specified threshold
    relevant_features = correlations[correlations > threshold].index.tolist()
    
    # Create a new DataFrame with the selected features
    filtered_df = df[relevant_features]
    
    return filtered_df

In [50]:
corr_dfs = []
target_column = "content.onRoadPrice"

for i in scaled_dfs:
    corr_dfs.append(apply_correlation_filter(i, target_column, threshold=0.5))

In [52]:
corr_dfs[0].columns.tolist()

['Airbags',
 'NumberOfSpeakers',
 'Displacementcc',
 'GearBoxNumberOfGears',
 'NumberOfDiscBrakes',
 'Widthmm',
 'Lengthmm',
 'WheelBasemm',
 'FueltankCapacitylitres',
 'MaxPowerbhp',
 'MaxTorqueNm',
 'content.bodyType_SUV',
 'SeatUpholstery_Synthetic Leather',
 'HeadlampLensType_Projector Beam',
 'RimTypeFrontWheels_Steel',
 'SmartCardSmartKey_1',
 'AmbientLighting_1',
 'SunroofMoonroof_1',
 'DriverSeatAdjustmentElectric_1',
 'content.onRoadPrice']

### Filter method 2: Apply f-regression & chi-square 

In [62]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_regression, chi2

def select_features(data: pd.DataFrame, target_col: str, numerical_cols: list, k_num: int = 'all', k_cat: int = 'all') -> pd.DataFrame:
    """
    Select features from the dataset using SelectKBest for numerical and categorical features,
    and return a filtered DataFrame with selected features and the target.

    Parameters:
        data (pd.DataFrame): DataFrame containing feature variables and target.
        target_col (str): The name of the target variable column.
        numerical_cols (list): List of numerical column names.
        k_num (int or str): Number of top numerical features to select; 'all' to select all.
        k_cat (int or str): Number of top categorical features to select; 'all' to select all.

    Returns:
        pd.DataFrame: Filtered DataFrame with selected features and the target column.
    """
    # Separate features and target
    X = data.drop(columns=[target_col])
    y = data[target_col]

    # Step 1: Validate numerical columns
    numerical_cols = [col for col in numerical_cols if col in X.columns]
    
    # Step 2: Separate Numerical and Categorical Features
    X_numerical = X[numerical_cols]
    X_categorical = X.drop(columns=numerical_cols).select_dtypes(include=['object'])

    # Step 3: One-Hot Encoding for Categorical Features
    if not X_categorical.empty:
        X_encoded = pd.get_dummies(X_categorical, drop_first=True)
    else:
        X_encoded = pd.DataFrame()  # Empty DataFrame if no categorical features

    # Step 4: Apply SelectKBest with f_regression for Numerical Features
    if k_num == 'all':
        k_num = X_numerical.shape[1]
    selector_num = SelectKBest(score_func=f_regression, k=k_num)
    selector_num.fit(X_numerical, y)
    numerical_features = X_numerical.columns[selector_num.get_support()].tolist()

    # Step 5: Apply SelectKBest with chi2 for Categorical Features
    if not X_encoded.empty:
        if k_cat == 'all':
            k_cat = X_encoded.shape[1]
        selector_cat = SelectKBest(score_func=chi2, k=k_cat)
        selector_cat.fit(X_encoded, y)
        categorical_features = X_encoded.columns[selector_cat.get_support()].tolist()
    else:
        categorical_features = []  # No categorical features to select

    # Step 6: Combine the Selected Features
    final_features = numerical_features + categorical_features

    # Create a filtered DataFrame with selected features and the target column
    filtered_df = data[final_features + [target_col]]

    return filtered_df

In [68]:
numerical_col = [
    "content.year", "content.ownerNumber", "content.odometerReading",
    "content.onRoadPrice", "Airbags", "NumberOfSpeakers", "Displacementcc",
    "GearBoxNumberOfGears", "NumberOfDiscBrakes", "GroundClearancemm",
    "SeatingCapacity", "Bootspacelitres", "Widthmm", "Lengthmm",
    "WheelBasemm", "FueltankCapacitylitres", "MaxPowerbhp",
    "MaxPowerrpm", "MaxTorqueNm", "defects", "repainted",
    "MultifunctionDisplayScreenSizein", "EntertainmentDisplayScreenSizein",
    "NCAPRating"
]

kbest_dfs = []
target_column = "content.onRoadPrice"

for i in scaled_dfs:
    filtered_df = select_features(i, target_column, numerical_col, k_num=20, k_cat=20)
    kbest_dfs.append(filtered_df)


In [69]:
kbest_dfs[0].columns.tolist()

['content.year',
 'Airbags',
 'NumberOfSpeakers',
 'Displacementcc',
 'GearBoxNumberOfGears',
 'NumberOfDiscBrakes',
 'GroundClearancemm',
 'SeatingCapacity',
 'Bootspacelitres',
 'Widthmm',
 'Lengthmm',
 'WheelBasemm',
 'FueltankCapacitylitres',
 'MaxPowerbhp',
 'MaxPowerrpm',
 'MaxTorqueNm',
 'defects',
 'repainted',
 'MultifunctionDisplayScreenSizein',
 'EntertainmentDisplayScreenSizein',
 'content.onRoadPrice']

### Step3: Tree-based Model Importance

In [70]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

def filter_features_by_importance(df, target_col, threshold=0.01):
    # Split the data into features and target
    X = df.drop(columns=[target_col])
    y = df[target_col]
    
    # Split into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit a Random Forest model
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)
    
    # Get feature importances
    importances = model.feature_importances_
    
    # Create a DataFrame for feature importances
    feature_importances = pd.DataFrame({
        'Feature': X.columns,
        'Importance': importances
    })
    
    # Filter features based on the importance threshold
    selected_features = feature_importances[feature_importances['Importance'] > threshold]['Feature']
    
    # Return the filtered DataFrame
    return df[selected_features.to_list() + [target_col]]

In [71]:
featureImportance_dfs = []
target_column = "content.onRoadPrice"

for i in scaled_dfs:
    filtered_df = filter_features_by_importance(i, target_column, threshold=0.01)
    featureImportance_dfs.append(filtered_df)

In [72]:
featureImportance_dfs[0].columns.tolist()

['content.year',
 'content.ownerNumber',
 'content.odometerReading',
 'Airbags',
 'NumberOfSpeakers',
 'Displacementcc',
 'GroundClearancemm',
 'Bootspacelitres',
 'Widthmm',
 'Lengthmm',
 'WheelBasemm',
 'FueltankCapacitylitres',
 'MaxPowerbhp',
 'MaxPowerrpm',
 'MaxTorqueNm',
 'defects',
 'repainted',
 'content.fitnessUpto_months_remaining',
 'content.insuranceExpiry_months_remaining',
 'content.lastServicedAt_months_remaining',
 'content.transmission_Manual',
 'content.insuranceType_Comprehensive',
 'content.insuranceType_Zero Depreciation',
 'content.duplicateKey_1',
 'Left Front Tyre_WARN',
 'Right Front Tyre_WARN',
 'Left Rear Tyre_WARN',
 'Right Rear Tyre_WARN',
 'Spare Tyre_WARN',
 'car_state_DL',
 'car_state_GJ',
 'car_state_HR',
 'car_state_KA',
 'car_state_MH',
 'car_state_TN',
 'content.onRoadPrice']

### Select Features

In [97]:
def union_of_features(dataframe_lists):
    # Get the union of feature sets from multiple lists of DataFrames
    union_features = set()
    for dataframe_list in dataframe_lists:
        for df in dataframe_list:
            union_features.update(df.columns[:-1])  # Exclude target column
    return union_features

def top_n_features(dataframe_lists, n):
    # Get the top N features from each DataFrame in the lists
    top_features = set()
    for dataframe_list in dataframe_lists:
        for df in dataframe_list:
            top_n = df.columns[:-1][:n]  # Exclude target column and take top N
            top_features.update(top_n)
    return top_features

In [98]:
all_dataframe_lists = [featureImportance_dfs, kbest_dfs, corr_dfs]

In [106]:
# Get union of features
union_features = union_of_features(all_dataframe_lists)

# Get top N features (e.g., top 5 features)
top_n = top_n_features(all_dataframe_lists, n=51)

In [107]:
print(union_features)
print()
print(top_n)

{'Lengthmm', 'content.year', 'DriverSeatAdjustmentElectric_1', 'Right Rear Tyre_WARN', 'car_state_DL', 'content.transmission_Manual', 'EntertainmentDisplayScreenSizein', 'car_state_KA', 'SmartCardSmartKey_1', 'MaxPowerbhp', 'content.duplicateKey_1', 'ParkingAssistRear_Sensor & Camera', 'content.ownerNumber', 'Right Front Tyre_WARN', 'content.insuranceExpiry_months_remaining', 'MultifunctionDisplayScreenSizein', 'RimTypeFrontWheels_Steel', 'Bootspacelitres', 'Widthmm', 'Left Front Tyre_WARN', 'MaxTorqueNm', 'AmbientLighting_1', 'GearBoxNumberOfGears', 'SunroofMoonroof_1', 'SeatingCapacity', 'Left Rear Tyre_WARN', 'content.odometerReading', 'MaxPowerrpm', 'Airbags', 'Displacementcc', 'repainted', 'content.fitnessUpto_months_remaining', 'content.insuranceType_Comprehensive', 'Spare Tyre_WARN', 'SunroofType_nan', 'NumberOfSpeakers', 'car_state_HR', 'WheelBasemm', 'car_state_TN', 'NumberOfDiscBrakes', 'content.bodyType_SUV', 'SeatUpholstery_Synthetic Leather', 'HeadlampLensType_Projector Be

In [108]:
union_features_set = set(union_features)
top_n_set = set(top_n)
common_features = union_features_set.intersection(top_n_set)

print(common_features)

{'Lengthmm', 'content.year', 'DriverSeatAdjustmentElectric_1', 'Right Rear Tyre_WARN', 'car_state_DL', 'content.transmission_Manual', 'EntertainmentDisplayScreenSizein', 'car_state_KA', 'SmartCardSmartKey_1', 'MaxPowerbhp', 'content.duplicateKey_1', 'ParkingAssistRear_Sensor & Camera', 'content.ownerNumber', 'Right Front Tyre_WARN', 'content.insuranceExpiry_months_remaining', 'MultifunctionDisplayScreenSizein', 'RimTypeFrontWheels_Steel', 'Bootspacelitres', 'Widthmm', 'FueltankCapacitylitres', 'Left Front Tyre_WARN', 'MaxTorqueNm', 'AmbientLighting_1', 'GearBoxNumberOfGears', 'SunroofMoonroof_1', 'SeatingCapacity', 'content.odometerReading', 'MaxPowerrpm', 'Airbags', 'Displacementcc', 'repainted', 'content.fitnessUpto_months_remaining', 'content.insuranceType_Comprehensive', 'Spare Tyre_WARN', 'NumberOfSpeakers', 'car_state_HR', 'WheelBasemm', 'car_state_TN', 'NumberOfDiscBrakes', 'content.bodyType_SUV', 'SeatUpholstery_Synthetic Leather', 'HeadlampLensType_Projector Beam', 'content.la

In [110]:
common_features_lis = ['Lengthmm', 'content.year', 'DriverSeatAdjustmentElectric_1', 'Right Rear Tyre_WARN', 'car_state_DL', 'content.transmission_Manual', 'EntertainmentDisplayScreenSizein','SmartCardSmartKey_1', 'MaxPowerbhp', 'content.duplicateKey_1', 'ParkingAssistRear_Sensor & Camera', 'content.ownerNumber', 'Right Front Tyre_WARN', 'content.insuranceExpiry_months_remaining', 'MultifunctionDisplayScreenSizein', 'RimTypeFrontWheels_Steel', 'Bootspacelitres', 'Widthmm', 'FueltankCapacitylitres', 'Left Front Tyre_WARN', 'MaxTorqueNm', 'AmbientLighting_1', 'GearBoxNumberOfGears', 'SunroofMoonroof_1', 'SeatingCapacity', 'content.odometerReading', 'MaxPowerrpm', 'Airbags', 'Displacementcc', 'repainted', 'content.fitnessUpto_months_remaining', 'content.insuranceType_Comprehensive', 'Spare Tyre_WARN', 'NumberOfSpeakers', 'WheelBasemm','NumberOfDiscBrakes', 'content.bodyType_SUV', 'SeatUpholstery_Synthetic Leather', 'HeadlampLensType_Projector Beam', 'content.lastServicedAt_months_remaining', 'GroundClearancemm', 'content.insuranceType_Zero Depreciation','NCAPRating', 'Left Rear Tyre_WARN', 'defects', 'SunroofType_nan']

In [111]:
len(common_features_lis)

46

### I have realized that keeping these many columns will not make a general model which can work on any data, Instead I should be more focused on general features which are mostly available on all website & train model on those columns --- NEXT: figure out those columns which are very common among most of websites