In [533]:
import pandas as pd
import numpy as np
import boto3
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer #to handle missing values
from sklearn.preprocessing import StandardScaler #to reach global minima in Linear Regression i.e Feature Scaling
from sklearn.preprocessing import LabelEncoder #LabelEncoder since categorical Variables with no definite order or ranking  
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline #To create pipeline
from sklearn.compose import ColumnTransformer #to group pipeling together
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error


In [534]:
# Configure the S3 client
# s3 = boto3.client('s3')

In [535]:
# s3 = boto3.resource(
#     service_name='s3',
#     region_name='ap-south-1',
#     aws_access_key_id='',
#     aws_secret_access_key=''
# )

In [536]:
# Specify the S3 bucket and file path
# bucket_name = 'zomato-dataset'
# file_path = 's3://zomato-dataset/zomato.csv'

In [537]:
# for obj in s3.Bucket(bucket_name).objects.all():
#     print(obj)

In [538]:
# Read the CSV file from S3 using the client
# obj = s3.Bucket(bucket_name).Object('zomato.csv').get()
# df = pd.read_csv(obj['Body'])

In [539]:
# Check the loaded DataFrame
# df.head()

In [540]:
df = pd.read_csv('data/clean_data.csv')

In [541]:
df.head()

Unnamed: 0,name,online_order,book_table,rate,votes,location,rest_type,cuisines,average cost for 2
0,Jalsa,Yes,Yes,4.1,775,Banashankari,Casual Dining,Chinese Indian Mughlai North,800.0
1,Spice Elephant,Yes,No,4.1,787,Banashankari,Casual Dining,Chinese Indian North Thai,800.0
2,San Churro Cafe,Yes,No,3.8,918,Banashankari,Cafe Casual Dining,Cafe Italian Mexican,800.0
3,Addhuri Udupi Bhojana,No,No,3.7,88,Banashankari,Bites Quick,Indian Indian North South,300.0
4,Grand Village,No,No,3.8,166,Basavanagudi,Casual Dining,Indian North Rajasthani,600.0


In [542]:
df.shape

(21528, 9)

In [543]:
df = df.drop(columns=['name'])

In [544]:
df.head()

Unnamed: 0,online_order,book_table,rate,votes,location,rest_type,cuisines,average cost for 2
0,Yes,Yes,4.1,775,Banashankari,Casual Dining,Chinese Indian Mughlai North,800.0
1,Yes,No,4.1,787,Banashankari,Casual Dining,Chinese Indian North Thai,800.0
2,Yes,No,3.8,918,Banashankari,Cafe Casual Dining,Cafe Italian Mexican,800.0
3,No,No,3.7,88,Banashankari,Bites Quick,Indian Indian North South,300.0
4,No,No,3.8,166,Basavanagudi,Casual Dining,Indian North Rajasthani,600.0


### Seperation of Dependent and Independent Variables

In [545]:
#Independent Variables
X = df.drop(columns=['rate'])

In [546]:
#Dependent Variables
Y = df['rate']

In [547]:
X.head()

Unnamed: 0,online_order,book_table,votes,location,rest_type,cuisines,average cost for 2
0,Yes,Yes,775,Banashankari,Casual Dining,Chinese Indian Mughlai North,800.0
1,Yes,No,787,Banashankari,Casual Dining,Chinese Indian North Thai,800.0
2,Yes,No,918,Banashankari,Cafe Casual Dining,Cafe Italian Mexican,800.0
3,No,No,88,Banashankari,Bites Quick,Indian Indian North South,300.0
4,No,No,166,Basavanagudi,Casual Dining,Indian North Rajasthani,600.0


In [548]:
Y.head()

0    4.1
1    4.1
2    3.8
3    3.7
4    3.8
Name: rate, dtype: float64

### Now we need to Scale the Data which is continuous and Label-encode data which is categorical but not ranked

In [549]:
numerical_columns = X.select_dtypes(exclude='object').columns
categorical_columns = X.select_dtypes(include='object').columns

In [550]:
numerical_columns

Index(['votes', 'average cost for 2'], dtype='object')

In [551]:
categorical_columns

Index(['online_order', 'book_table', 'location', 'rest_type', 'cuisines'], dtype='object')

### Creating Pipelines

In [552]:
import category_encoders as ce
#Numerical Pipeline
'''
1) Handle Missing values
2) Scaling
'''
numerical_pipeline = Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='median')), #Median since there are outliers
    ('scaler',StandardScaler())
    ]
)

#Categorical Piplepine
'''
1) Handle Missing values
2) Label Encoding
3) Scaling
'''
categorical_pipeline = Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('ordinalencoder',OrdinalEncoder(categories=[df[col].unique() for col in categorical_columns])),
    ('scaler',StandardScaler())
    ]
)

preprocessor = ColumnTransformer([
    ('numerical_pipeline',numerical_pipeline,numerical_columns),
    ('categorical_pipeline',categorical_pipeline,categorical_columns)
])

### Train Test Split

In [553]:
X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size=0.3,random_state=30)

In [554]:
X_train = pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())


In [555]:
X_test = pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [556]:
X_train

Unnamed: 0,numerical_pipeline__votes,numerical_pipeline__average cost for 2,categorical_pipeline__online_order,categorical_pipeline__book_table,categorical_pipeline__location,categorical_pipeline__rest_type,categorical_pipeline__cuisines
0,-0.498593,-0.692068,-0.638483,0.611069,-0.459750,-0.456672,0.301240
1,-0.438591,0.841122,-0.638483,0.611069,-0.310133,0.837672,0.658183
2,-0.490021,-0.117122,-0.638483,0.611069,0.437953,-0.672396,-0.986860
3,-0.391447,-0.500420,-0.638483,0.611069,-0.360005,-0.348810,-0.654747
4,-0.036580,1.032771,1.566214,-1.636477,-0.808856,0.729810,-0.757174
...,...,...,...,...,...,...,...
15064,-0.471163,-0.883717,-0.638483,0.611069,-0.908601,-0.456672,-0.350570
15065,-0.429162,0.841122,-0.638483,0.611069,-0.808856,-0.348810,1.316200
15066,-0.274015,-0.308771,-0.638483,0.611069,0.737187,-0.456672,-0.986860
15067,1.106023,1.224419,1.566214,-1.636477,-0.808856,1.376982,1.725909


In [557]:
X_train.columns

Index(['numerical_pipeline__votes', 'numerical_pipeline__average cost for 2',
       'categorical_pipeline__online_order',
       'categorical_pipeline__book_table', 'categorical_pipeline__location',
       'categorical_pipeline__rest_type', 'categorical_pipeline__cuisines'],
      dtype='object')

In [558]:
for i in X_train.columns:
    print(X_train[i].isna)

<bound method Series.isna of 0       -0.498593
1       -0.438591
2       -0.490021
3       -0.391447
4       -0.036580
           ...   
15064   -0.471163
15065   -0.429162
15066   -0.274015
15067    1.106023
15068   -0.382018
Name: numerical_pipeline__votes, Length: 15069, dtype: float64>
<bound method Series.isna of 0       -0.692068
1        0.841122
2       -0.117122
3       -0.500420
4        1.032771
           ...   
15064   -0.883717
15065    0.841122
15066   -0.308771
15067    1.224419
15068   -0.308771
Name: numerical_pipeline__average cost for 2, Length: 15069, dtype: float64>
<bound method Series.isna of 0       -0.638483
1       -0.638483
2       -0.638483
3       -0.638483
4        1.566214
           ...   
15064   -0.638483
15065   -0.638483
15066   -0.638483
15067    1.566214
15068    1.566214
Name: categorical_pipeline__online_order, Length: 15069, dtype: float64>
<bound method Series.isna of 0        0.611069
1        0.611069
2        0.611069
3        0.611069
4   

In [559]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15069 entries, 0 to 15068
Data columns (total 7 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   numerical_pipeline__votes               15069 non-null  float64
 1   numerical_pipeline__average cost for 2  15069 non-null  float64
 2   categorical_pipeline__online_order      15069 non-null  float64
 3   categorical_pipeline__book_table        15069 non-null  float64
 4   categorical_pipeline__location          15069 non-null  float64
 5   categorical_pipeline__rest_type         15069 non-null  float64
 6   categorical_pipeline__cuisines          15069 non-null  float64
dtypes: float64(7)
memory usage: 824.2 KB


In [560]:
y_train.unique()

array([3.8, 3.7, 4.1, 2.8, 3.9, 3.6, 4.2, 3.2, 4.5, 4.4, 4.3, 4. , 3.5,
       3.3, 3.1, 3. , 4.7, 2.7, 2.9, 4.6, 2.6, 3.4, 4.8, 2.4, 4.9, 2.5,
       2.3, 2. , 2.1, 2.2, 1.8])

In [561]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor

In [562]:
regression = LinearRegression()
regression.fit(X_train,y_train)

In [563]:
regression.intercept_

3.9152166699847366

In [564]:
def evaluate(true,predicted):
    mae = mean_absolute_error(true,predicted)
    mse = mean_squared_error(true,predicted)
    rmse = np.sqrt(mean_squared_error(true,predicted))
    r2_square = r2_score(true,predicted)
    return mae,mse,rmse,r2_square

In [565]:
from sklearn.model_selection import GridSearchCV

In [566]:
#Training of Different models



models = {
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'ElasticNet':ElasticNet(),
    'RandomForestRegressor': RandomForestRegressor(),
    'RandomForestRegressor_Tuned_1' : RandomForestRegressor(n_estimators=500,random_state=329,min_samples_leaf=.0001),
    'ExtraTreeRegressor': ExtraTreesRegressor(),
    'ExtraTreeRegressor_Tuned' : ExtraTreesRegressor(n_estimators = 100),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'DecisionTreeRegressor_Tuned' : DecisionTreeRegressor(min_samples_leaf=.0001)
}



mae_list = []
mse_list = []
rmse_list = []
trained_model_list = []
model_list = []
r2_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,y_train)


    #Predictions on test data
    y_pred = model.predict(X_test)

    mae,mse,rmse,r2_square = evaluate(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print('MAE:- ', mae)
    print('MSE:- ', mse)
    print('RMSE:- ',rmse)
    print('R2_Score',r2_square*100)

    r2_list.append(r2_square)
    mae_list.append(mae)
    mse_list.append(mse)
    rmse_list.append(rmse)

    print("*"*40)
    print('\n')

LinearRegression
Model Training Performance
MAE:-  0.2665224041698638
MSE:-  0.1412512114273127
RMSE:-  0.37583402111479036
R2_Score 22.93954656550762
****************************************


Lasso
Model Training Performance
MAE:-  0.3066516771470732
MSE:-  0.18332451596166663
RMSE:-  0.4281641226932339
R2_Score -0.013799406843317819
****************************************


Ridge
Model Training Performance
MAE:-  0.2665217613221606
MSE:-  0.14125127002498794
RMSE:-  0.3758340990716355
R2_Score 22.93951459719117
****************************************


ElasticNet
Model Training Performance
MAE:-  0.3066516771470732
MSE:-  0.18332451596166663
RMSE:-  0.4281641226932339
R2_Score -0.013799406843317819
****************************************


RandomForestRegressor
Model Training Performance
MAE:-  0.061694273429210196
MSE:-  0.019865309482102467
RMSE:-  0.1409443488831761
R2_Score 89.16236016074738
****************************************


RandomForestRegressor_Tuned_1
Model Traini

In [567]:
# Multiply R2 Score by 100 and round to 2 decimal places
r2_list_percentage = [round(r2 * 100, 2) for r2 in r2_list]

# Create a DataFrame with the results
df_model = pd.DataFrame({
    'Model': model_list,
    'MAE': [round(mae, 6) for mae in mae_list],
    'MSE': [round(mse, 6) for mse in mse_list],
    'RMSE': [round(rmse, 6) for rmse in rmse_list],
    'R2 Score': r2_list_percentage
})

# Print the DataFrame
print(df_model.to_string(index=False))

                        Model      MAE      MSE     RMSE  R2 Score
             LinearRegression 0.266522 0.141251 0.375834     22.94
                        Lasso 0.306652 0.183325 0.428164     -0.01
                        Ridge 0.266522 0.141251 0.375834     22.94
                   ElasticNet 0.306652 0.183325 0.428164     -0.01
        RandomForestRegressor 0.061694 0.019865 0.140944     89.16
RandomForestRegressor_Tuned_1 0.078375 0.025160 0.158618     86.27
           ExtraTreeRegressor 0.033504 0.014008 0.118354     92.36
     ExtraTreeRegressor_Tuned 0.033668 0.014140 0.118913     92.29
        DecisionTreeRegressor 0.042592 0.028039 0.167450     84.70
  DecisionTreeRegressor_Tuned 0.067724 0.034330 0.185283     81.27
