# **Task: Predictive Modeling**

# *Build a regression model to predict the aggregate rating of a restaurant based on available features.*
# *Split the dataset into training and testing sets and evaluate the model's performance using appropriate metrics.*

In [129]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [130]:
# Location of the raw restaurant dataset.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is pointed at a local copy of the CSV.
# NOTE(review): a later display of the training features shows a garbled city
# name, which may indicate an encoding mismatch when reading this file —
# confirm whether an explicit `encoding=` argument is needed.
DATASET_CSV = r'C:\Users\kuldeep singh\OneDrive\Desktop\Cognifyz-Technologies\dataset.csv'

df = pd.read_csv(DATASET_CSV)
df.head()

Unnamed: 0,Restaurant ID,Restaurant Name,Country Code,City,Address,Locality,Locality Verbose,Longitude,Latitude,Cuisines,...,Currency,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Aggregate rating,Rating color,Rating text,Votes
0,6317637,Le Petit Souffle,162,Makati City,"Third Floor, Century City Mall, Kalayaan Avenu...","Century City Mall, Poblacion, Makati City","Century City Mall, Poblacion, Makati City, Mak...",121.027535,14.565443,"French, Japanese, Desserts",...,Botswana Pula(P),Yes,No,No,No,3,4.8,Dark Green,Excellent,314
1,6304287,Izakaya Kikufuji,162,Makati City,"Little Tokyo, 2277 Chino Roces Avenue, Legaspi...","Little Tokyo, Legaspi Village, Makati City","Little Tokyo, Legaspi Village, Makati City, Ma...",121.014101,14.553708,Japanese,...,Botswana Pula(P),Yes,No,No,No,3,4.5,Dark Green,Excellent,591
2,6300002,Heat - Edsa Shangri-La,162,Mandaluyong City,"Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...","Edsa Shangri-La, Ortigas, Mandaluyong City","Edsa Shangri-La, Ortigas, Mandaluyong City, Ma...",121.056831,14.581404,"Seafood, Asian, Filipino, Indian",...,Botswana Pula(P),Yes,No,No,No,4,4.4,Green,Very Good,270
3,6318506,Ooma,162,Mandaluyong City,"Third Floor, Mega Fashion Hall, SM Megamall, O...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.056475,14.585318,"Japanese, Sushi",...,Botswana Pula(P),No,No,No,No,4,4.9,Dark Green,Excellent,365
4,6314302,Sambo Kojin,162,Mandaluyong City,"Third Floor, Mega Atrium, SM Megamall, Ortigas...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.057508,14.58445,"Japanese, Korean",...,Botswana Pula(P),Yes,No,No,No,4,4.8,Dark Green,Excellent,229


In [131]:
# Summarize the raw frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9551 entries, 0 to 9550
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Restaurant ID         9551 non-null   int64  
 1   Restaurant Name       9551 non-null   object 
 2   Country Code          9551 non-null   int64  
 3   City                  9551 non-null   object 
 4   Address               9551 non-null   object 
 5   Locality              9551 non-null   object 
 6   Locality Verbose      9551 non-null   object 
 7   Longitude             9551 non-null   float64
 8   Latitude              9551 non-null   float64
 9   Cuisines              9542 non-null   object 
 10  Average Cost for two  9551 non-null   int64  
 11  Currency              9551 non-null   object 
 12  Has Table booking     9551 non-null   object 
 13  Has Online delivery   9551 non-null   object 
 14  Is delivering now     9551 non-null   object 
 15  Switch to order menu 

In [132]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [133]:
# Columns excluded from modeling: the restaurant identifier/name, free-text
# location fields, and the two rating labels.
# NOTE(review): 'Rating color' and 'Rating text' are presumably derived from
# the target 'Aggregate rating' (and would leak it) — confirm.
# Re-running this cell on the already-trimmed frame raises KeyError, because
# the listed columns no longer exist.
cols_to_drop = [
    'Restaurant ID', 'Restaurant Name', 'Address', 'Locality',
    'Locality Verbose', 'Rating color', 'Rating text', 'Switch to order menu',
]
df = df.drop(columns=cols_to_drop)


In [134]:
# Discard rows containing any missing value. Rebinding the result instead of
# using `inplace=True` keeps this step consistent with the previous cell's
# `df = df.drop(...)` and avoids mutating the existing frame object in place.
df = df.dropna()

In [135]:
# Preview the frame after dropping unused columns and incomplete rows.
df.head()

Unnamed: 0,Country Code,City,Longitude,Latitude,Cuisines,Average Cost for two,Currency,Has Table booking,Has Online delivery,Is delivering now,Price range,Aggregate rating,Votes
0,162,Makati City,121.027535,14.565443,"French, Japanese, Desserts",1100,Botswana Pula(P),Yes,No,No,3,4.8,314
1,162,Makati City,121.014101,14.553708,Japanese,1200,Botswana Pula(P),Yes,No,No,3,4.5,591
2,162,Mandaluyong City,121.056831,14.581404,"Seafood, Asian, Filipino, Indian",4000,Botswana Pula(P),Yes,No,No,4,4.4,270
3,162,Mandaluyong City,121.056475,14.585318,"Japanese, Sushi",1500,Botswana Pula(P),No,No,No,4,4.9,365
4,162,Mandaluyong City,121.057508,14.58445,"Japanese, Korean",1500,Botswana Pula(P),Yes,No,No,4,4.8,229


In [136]:
# Separate the regression target from the feature matrix.
target_col = 'Aggregate rating'
y = df[target_col]
X = df.drop(columns=target_col)

# Route feature columns by dtype: object columns are treated as categorical,
# int64/float64 columns as numerical. A column of any other dtype would end up
# in neither list and so would not be handed to either transformer.
categorical_cols = list(X.select_dtypes(include='object').columns)
numerical_cols = list(X.select_dtypes(include=['int64', 'float64']).columns)

In [137]:
# Preprocessing: one-hot encode the categorical columns and standardize the
# numerical ones. `handle_unknown='ignore'` lets the encoder transform
# categories at prediction time that were not present when it was fitted.
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
numerical_step = ('num', StandardScaler(), numerical_cols)
preprocessor = ColumnTransformer(transformers=[categorical_step, numerical_step])

# Baseline model: preprocessing followed by a random forest with a fixed seed.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42)),
])


In [138]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [139]:
# Empty container for per-model evaluation results.
# NOTE(review): the model-comparison cell further down re-initializes
# `results` itself, so this cell looks redundant — confirm before removing.
results = []

In [140]:
# Inspect the training feature matrix (the target column is not included).
X_train

Unnamed: 0,Country Code,City,Longitude,Latitude,Cuisines,Average Cost for two,Currency,Has Table booking,Has Online delivery,Is delivering now,Price range,Votes
8177,1,Noida,77.353663,28.574219,South Indian,450,Indian Rupees(Rs.),No,No,No,1,6
6401,1,New Delhi,77.134360,28.671141,"Pizza, Fast Food",700,Indian Rupees(Rs.),No,No,No,2,112
81,30,S��o Paulo,-46.746958,-23.609207,"Lebanese, Arabian",120,Brazilian Real(R$),No,No,No,4,11
1332,1,Gurgaon,77.086080,28.482318,Mughlai,300,Indian Rupees(Rs.),No,No,No,1,0
9041,1,Noida,77.339801,28.586405,"American, Fast Food, Salad, Healthy Food",500,Indian Rupees(Rs.),No,Yes,No,2,93
...,...,...,...,...,...,...,...,...,...,...,...,...
5743,1,New Delhi,77.052860,28.664641,"North Indian, Chinese",250,Indian Rupees(Rs.),No,No,No,1,1
5200,1,New Delhi,77.211180,28.536406,Bihari,300,Indian Rupees(Rs.),No,Yes,No,1,308
5399,1,New Delhi,77.128051,28.478595,North Indian,500,Indian Rupees(Rs.),No,No,No,2,4
869,1,Faridabad,77.305745,28.490062,Bakery,100,Indian Rupees(Rs.),No,No,No,1,7


In [141]:
# Inspect the held-out test feature matrix.
X_test

Unnamed: 0,Country Code,City,Longitude,Latitude,Cuisines,Average Cost for two,Currency,Has Table booking,Has Online delivery,Is delivering now,Price range,Votes
7142,1,New Delhi,0.000000,0.000000,"Chinese, Street Food",150,Indian Rupees(Rs.),No,No,No,1,2
1860,1,Gurgaon,77.064227,28.467934,"Asian, Seafood",1500,Indian Rupees(Rs.),No,No,No,3,105
4425,1,New Delhi,77.250660,28.543755,North Indian,250,Indian Rupees(Rs.),No,Yes,No,1,55
3219,1,New Delhi,77.230277,28.572796,Italian,1500,Indian Rupees(Rs.),Yes,Yes,No,3,245
8308,1,Noida,77.331711,28.548867,"Chinese, Fast Food",200,Indian Rupees(Rs.),No,No,No,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
5612,1,New Delhi,77.219281,28.709557,North Indian,100,Indian Rupees(Rs.),No,No,No,1,0
4061,1,New Delhi,0.000000,0.000000,Desserts,500,Indian Rupees(Rs.),No,No,No,2,0
8746,1,Noida,77.385241,28.569480,"Raw Meats, Fast Food",400,Indian Rupees(Rs.),No,No,No,1,0
7678,1,New Delhi,77.056812,28.622119,"North Indian, Chinese",400,Indian Rupees(Rs.),No,No,No,1,3


In [142]:
# Inspect the training targets ('Aggregate rating').
y_train

8177    2.8
6401    3.2
81      4.1
1332    0.0
9041    3.4
       ... 
5743    0.0
5200    4.0
5399    3.0
869     2.8
7279    0.0
Name: Aggregate rating, Length: 7633, dtype: float64

In [143]:
# Inspect the held-out test targets ('Aggregate rating').
y_test

7142    0.0
1860    4.1
4425    3.3
3219    3.9
8308    0.0
       ... 
5612    0.0
4061    0.0
8746    0.0
7678    0.0
4430    3.3
Name: Aggregate rating, Length: 1909, dtype: float64

In [144]:
# Fit the baseline pipeline (preprocessing + random forest) on the training split.
model.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [145]:
# Score the fitted baseline pipeline on the held-out split: mean squared error
# between the true and predicted ratings.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mse

0.09049345049764275

In [146]:
# Coefficient of determination (R^2) of the baseline predictions on the test split.
# NOTE(review): the displayed targets include many 0.0 ratings on rows with
# very few votes — presumably unrated restaurants; if so they may be easy to
# predict from `Votes` and inflate this score — confirm.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
r2

0.9604837650524638

# *Experiment with different algorithms (e.g., linear regression, decision trees, random forest) and compare their performance.*

In [147]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

In [148]:
# Candidate regressors keyed by display name. The iteration order of this
# mapping is the order in which the models are later fitted and reported.
model_specs = [
    ("Linear Regression", LinearRegression()),
    ("Decision Tree", DecisionTreeRegressor(random_state=42)),
    ("Random Forest", RandomForestRegressor(random_state=42)),
]
models = dict(model_specs)

In [152]:
# Fit every candidate regressor behind the shared preprocessing step and
# collect one (name, MSE, R^2) tuple per model, scored on the held-out split.
results = []

# The loop variable is deliberately not called `model`: that name is bound to
# the fitted baseline pipeline, and rebinding it to a bare regressor here would
# break any later re-run of `model.predict(X_test)`, which needs the
# preprocessing step.
for name, regressor in models.items():
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', regressor)
    ])

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # `mse` and `r2` keep their names because the following cells display
    # them; after the loop they hold the values of the last model iterated.
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results.append((name, mse, r2))


In [154]:
# MSE left over from the last iteration of the comparison loop
# (the final entry of `models`, "Random Forest").
mse

0.09049345049764275

In [155]:
# R^2 left over from the last iteration of the comparison loop
# (the final entry of `models`, "Random Forest").
r2

0.9604837650524638

In [153]:
# Per-model comparison: list of (model name, MSE, R^2) tuples on the test split.
results

[('Linear Regression', 1.4603229209908632, 0.36231336822933335),
 ('Decision Tree', 0.1682294394971189, 0.9265383956551236),
 ('Random Forest', 0.09049345049764275, 0.9604837650524638)]