In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import  cross_val_score, KFold
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.pipeline import Pipeline

In [3]:
# Load the feature table from the working directory.
# Presumably this CSV is the output of an earlier preprocessing step — confirm its provenance.
data = pd.read_csv('processed_zomato.csv')


In [4]:
data.columns

Index(['Unnamed: 0', 'type_Café/Bakery', 'type_Casual Dining',
       'type_Fine Dining', 'type_Other', 'type_Quick Bites', 'city_Ahmedabad',
       'city_Ajmer', 'city_Alappuzha', 'city_Allahabad',
       ...
       'delivery', 'chain_restaurant', 'indian', 'asian', 'western',
       'middle_eastern', 'fast_food', 'others', 'highlights_count', 'rating'],
      dtype='object', length=112)

In [5]:
# Remove the unnamed leading column (presumably a saved row index — confirm).
# errors='ignore' keeps this cell re-runnable: a second execution no longer
# raises KeyError once the column is already gone.
data = data.drop(columns='Unnamed: 0', errors='ignore')

I will define a function that selects features using the feature importances derived from a RandomForestRegressor. This approach is an embedded method, since it uses the RandomForestRegressor's built-in feature-importance attribute to rank and select the most important features.

In [7]:
def select_important_features(df, target_col, n_features, random_state=42):
    """Keep the ``n_features`` predictors a random forest ranks as most important.

    A ``RandomForestRegressor`` is fitted on every column of ``df`` except
    ``target_col``; the predictors are then ordered by descending
    ``feature_importances_`` and the leading ``n_features`` are kept.

    Args:
        df: DataFrame holding the predictor columns and the target column.
        target_col: Name of the column to predict.
        n_features: Number of top-ranked predictors to keep.
        random_state: Seed passed to the forest.

    Returns:
        A DataFrame with the selected predictor columns, in ranking order,
        followed by ``target_col``.
    """
    predictors = df.drop(target_col, axis=1)
    target = df[target_col]

    forest = RandomForestRegressor(random_state=random_state)
    forest.fit(predictors, target)

    # argsort is ascending; reverse it so the largest importances come first.
    ranking = forest.feature_importances_.argsort()[::-1]
    top_columns = predictors.columns[ranking[:n_features]].to_list()

    return df[top_columns + [target_col]]


In [8]:
# Keep the 20 predictors with the highest random-forest importance, plus the 'rating' target.
data_with_important_features = select_important_features(data, 'rating', 20)

In [9]:
data_with_important_features

Unnamed: 0,votes,photo_count,longitude,latitude,city_Amravati,city_Kolhapur,highlights_count,price_range,asian,chain_restaurant,...,fast_food,delivery,type_Casual Dining,type_Quick Bites,others,city_Kharagpur,type_Café/Bakery,area_88752,western,rating
0,0.680371,0.514950,0.465295,0.721545,0.0,0.0,0.333609,0.333333,0.0,1.0,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,4.4
1,0.720025,0.519495,0.465822,0.719522,0.0,0.0,0.300279,0.333333,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,4.4
2,0.678736,0.477724,0.463995,0.720411,0.0,0.0,0.500235,0.000000,0.0,1.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,4.2
3,0.664033,0.516923,0.465434,0.721315,0.0,0.0,0.422685,0.000000,0.0,1.0,...,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,4.3
4,0.624614,0.580009,0.468161,0.719408,0.0,0.0,0.765618,0.666667,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,4.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44254,0.223058,0.112316,0.418881,0.876275,0.0,0.0,0.264470,0.000000,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3.3
44255,0.140734,0.000000,0.418815,0.876296,0.0,0.0,0.225529,0.000000,0.0,0.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3.2
44256,0.111529,0.198939,0.418859,0.876265,0.0,0.0,0.300279,0.000000,1.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
44257,0.111529,0.000000,0.418877,0.876280,0.0,0.0,0.182457,0.000000,1.0,0.0,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Now I will use Recursive Feature Elimination (RFE), which is a wrapper method, and see how it compares with the previous one.

In [10]:
from sklearn.feature_selection import RFE


def select_important_features_rfe(df, target_col, n_features, random_state=42, step=1):
    """Select predictors by recursive feature elimination around a random forest.

    ``RFE`` repeatedly fits a ``RandomForestRegressor`` and discards the
    weakest predictors until ``n_features`` remain.

    Args:
        df: DataFrame holding the predictor columns and the target column.
        target_col: Name of the column to predict.
        n_features: Number of predictors to keep (``n_features_to_select``).
        random_state: Seed passed to the forest.
        step: Forwarded to ``RFE``. An integer >= 1 is the number of features
            removed per iteration; a float in (0, 1) is the fraction removed
            per iteration. The default of 1 reproduces the previous
            behaviour; a larger value means fewer forest refits.

    Returns:
        A DataFrame with the surviving predictor columns, in their original
        column order, followed by ``target_col``.
    """
    X = df.drop(target_col, axis=1)
    y = df[target_col]

    model = RandomForestRegressor(random_state=random_state)
    rfe_selector = RFE(estimator=model, n_features_to_select=n_features, step=step)
    rfe_selector.fit(X, y)

    # support_ is a boolean mask over X's columns marking the kept features.
    selected_features = X.columns[rfe_selector.support_]

    return df[selected_features.to_list() + [target_col]]





In [11]:
# Keep the 10 predictors that survive recursive feature elimination, plus the 'rating' target.
data_with_selected_features = select_important_features_rfe(data, 'rating', 10)

In [12]:
data_with_selected_features

Unnamed: 0,type_Quick Bites,city_Amravati,city_Kolhapur,latitude,longitude,price_range,votes,photo_count,asian,highlights_count,rating
0,1.0,0.0,0.0,0.721545,0.465295,0.333333,0.680371,0.514950,0.0,0.333609,4.4
1,1.0,0.0,0.0,0.719522,0.465822,0.333333,0.720025,0.519495,1.0,0.300279,4.4
2,1.0,0.0,0.0,0.720411,0.463995,0.000000,0.678736,0.477724,0.0,0.500235,4.2
3,1.0,0.0,0.0,0.721315,0.465434,0.000000,0.664033,0.516923,0.0,0.422685,4.3
4,0.0,0.0,0.0,0.719408,0.468161,0.666667,0.624614,0.580009,0.0,0.765618,4.9
...,...,...,...,...,...,...,...,...,...,...,...
44254,1.0,0.0,0.0,0.876275,0.418881,0.000000,0.223058,0.112316,1.0,0.264470,3.3
44255,1.0,0.0,0.0,0.876296,0.418815,0.000000,0.140734,0.000000,0.0,0.225529,3.2
44256,0.0,0.0,0.0,0.876265,0.418859,0.000000,0.111529,0.198939,1.0,0.300279,0.0
44257,0.0,0.0,0.0,0.876280,0.418877,0.000000,0.111529,0.000000,1.0,0.182457,0.0


From the embedded methods, I pick LASSO, using LassoCV to find the best alpha value.

In [18]:
def lasso_cv_feature_selection(df, target_col, n_alphas=100, random_state=42):
    """Select the predictors that receive a non-zero Lasso coefficient.

    The regularisation strength is chosen by 5-fold cross-validation with
    ``LassoCV``; predictors whose coefficient is exactly zero at that
    strength are dropped.

    Args:
        df: DataFrame holding the predictor columns and the target column.
        target_col: Name of the column to predict.
        n_alphas: Number of alpha values on the regularisation path.
        random_state: Seed passed to ``LassoCV``.

    Returns:
        A tuple ``(selected_df, best_alpha)``: the DataFrame restricted to
        the kept predictors (original column order) followed by
        ``target_col``, and the alpha chosen by cross-validation.
    """
    X = df.drop(target_col, axis=1)
    y = df[target_col]

    lasso_cv = LassoCV(n_alphas=n_alphas, random_state=random_state, cv=5)
    lasso_cv.fit(X, y)

    # Once the best alpha is found, LassoCV refits on the entire training set
    # at that alpha, so coef_ already holds the final model's coefficients.
    # A separate Lasso(alpha=best_alpha) fit would only repeat that work.
    important_features = X.columns[lasso_cv.coef_ != 0]

    return df[important_features.to_list() + [target_col]], lasso_cv.alpha_



In [21]:
# Keep the predictors with non-zero Lasso coefficients and report the cross-validated alpha.
selected_data, optimal_alpha = lasso_cv_feature_selection(data, 'rating')
print(f"Optimal alpha value: {optimal_alpha}")

Optimal alpha value: 0.003921306539916073


In [22]:
selected_data

Unnamed: 0,type_Café/Bakery,type_Casual Dining,type_Fine Dining,city_Amravati,city_Jamnagar,city_Kolhapur,city_Mumbai,city_New Delhi,city_Pune,area_1484,area_112077,area_196024,votes,delivery,chain_restaurant,indian,asian,others,rating
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.680371,0.0,1.0,1.0,0.0,1.0,4.4
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.720025,0.0,0.0,1.0,1.0,0.0,4.4
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.678736,1.0,1.0,0.0,0.0,0.0,4.2
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.664033,1.0,1.0,1.0,0.0,1.0,4.3
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.624614,1.0,0.0,1.0,0.0,0.0,4.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44254,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.223058,1.0,0.0,1.0,1.0,0.0,3.3
44255,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.140734,1.0,0.0,0.0,0.0,0.0,3.2
44256,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111529,1.0,0.0,1.0,1.0,0.0,0.0
44257,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111529,1.0,0.0,0.0,1.0,0.0,0.0


In [23]:
# Candidate feature sets, in order: LassoCV selection, RFE selection, random-forest importance selection.
datasets = [selected_data, data_with_selected_features, data_with_important_features ]

In [26]:
def compare_feature_selection_methods(datasets, target_col, test_size=0.2, random_state=42):
    """Pick the candidate feature set with the lowest held-out MSE.

    Each DataFrame is split into train and test parts with the same seed, a
    ``RandomForestRegressor`` is fitted on the training part, and the mean
    squared error on the test part is printed. The candidate with the
    smallest MSE wins; on a tie the earliest one is kept.

    NOTE(review): if the candidate feature sets were chosen using all rows
    (including the rows that end up in the test split here), the reported
    MSE is optimistic — confirm how the candidates were produced.

    Args:
        datasets: Iterable of DataFrames, each containing ``target_col``.
        target_col: Name of the column to predict.
        test_size: Fraction of rows held out for evaluation.
        random_state: Seed used for both the split and the forest.

    Returns:
        A tuple ``(best_dataset, best_model)``: the winning DataFrame and the
        forest fitted on its training split.

    Raises:
        ValueError: If ``datasets`` is empty.
    """
    datasets = list(datasets)
    if not datasets:
        # Previously this surfaced as an opaque TypeError from `None + 1`
        # in the summary print below.
        raise ValueError("datasets must contain at least one DataFrame")

    best_score = float('inf')
    best_dataset = None
    best_model = None
    best_index = None

    for i, df in enumerate(datasets):
        X = df.drop(target_col, axis=1)
        y = df[target_col]

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )

        model = RandomForestRegressor(random_state=random_state)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        score = mean_squared_error(y_test, y_pred)

        print(f"Dataset {i+1}: MSE = {score:.4f}")

        # Strict '<' keeps the earliest candidate when scores tie.
        if score < best_score:
            best_score = score
            best_dataset = df
            best_model = model
            best_index = i

    print(f"Best dataset: {best_index + 1} with MSE = {best_score:.4f}")
    return best_dataset, best_model



In [27]:
# `datasets` holds, in order: the LassoCV, RFE and random-forest-importance
# selections, so "Dataset 1/2/3" in the printout maps to those methods.
best_dataset, best_model = compare_feature_selection_methods(datasets, target_col='rating')


Dataset 1: MSE = 0.1575
Dataset 2: MSE = 0.0992
Dataset 3: MSE = 0.0965
Best dataset: 3 with MSE = 0.0965


In [28]:
# Persist the random-forest-importance selection (reported as Dataset 3, the lowest MSE, in the comparison output).
# NOTE(review): the file name says "rfe", but this frame comes from select_important_features,
# not from the RFE selector — confirm the intended name before relying on it.
# NOTE(review): to_csv writes the row index as an unnamed first column by default.
data_with_important_features.to_csv('data_features_rfe.csv')