In [1]:
import pandas as pd
import numpy as np
from flaml import AutoML
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# Load the datasets
df1 = pd.read_csv('./kaggle/input/restaurant-data-with-consumer-ratings/rating_final.csv')
df2 = pd.read_csv('./kaggle/input/restaurant-data-with-consumer-ratings/userprofile.csv')
df3 = pd.read_csv('./kaggle/input/restaurant-data-with-consumer-ratings/geoplaces2.csv')

# Merge the datasets on the common column 'userID'
df = pd.merge(df1, df2, on='userID')
df = pd.merge(df, df3, on='placeID')

# Explore the merged dataset
print(df.head())
print(df.info())
print(df.describe())


  userID  placeID  rating  food_rating  service_rating  latitude_x  \
0  U1077   135085       2            2               2   22.156469   
1  U1077   135038       2            2               1   22.156469   
2  U1077   132825       2            2               2   22.156469   
3  U1077   135060       1            2               2   22.156469   
4  U1068   135104       1            1               2   23.752269   

   longitude_x smoker     drink_level dress_preference  ...  \
0  -100.985540  false  social drinker          elegant  ...   
1  -100.985540  false  social drinker          elegant  ...   
2  -100.985540  false  social drinker          elegant  ...   
3  -100.985540  false  social drinker          elegant  ...   
4   -99.168605  false  casual drinker         informal  ...   

             alcohol   smoking_area dress_code     accessibility   price url  \
0  No_Alcohol_Served  not permitted   informal  no_accessibility  medium   ?   
1  No_Alcohol_Served        section   in

In [3]:
# Drop insensitive features
df = df.drop('latitude_x', axis=1)  
df = df.drop('longitude_x', axis=1)  
df = df.drop('latitude_y', axis=1)  
df = df.drop('longitude_y', axis=1)  
df = df.drop('food_rating', axis=1)  
df = df.drop('service_rating', axis=1)  
df = df.drop('birth_year', axis=1)  
df = df.drop('weight', axis=1)  
df = df.drop('height', axis=1)
df = df.drop('url', axis=1)
df = df.drop('the_geom_meter', axis=1)
df = df.drop('placeID', axis=1)
df = df.drop('userID', axis=1)
df = df.drop('address', axis=1)
df = df.drop('city', axis=1)
df = df.drop('state', axis=1)
df = df.drop('fax', axis=1)
df = df.drop('zip', axis=1)
df = df.drop('other_services', axis=1)

# Handle missing values if any
df = df.dropna()

# Convert categorical variables to numerical (if any)
df = pd.get_dummies(df, drop_first=True)

# Split the data into features and target
X = df.drop('rating', axis=1)  # Replace 'target_column' with the actual target column name
y = df['rating']

# print(X.head())
print(X.info())
print(X.describe())
print(y.head())

print(X.isnull().sum())
print(y.isnull().sum())


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1161 entries, 0 to 1160
Columns: 190 entries, smoker_false to area_open
dtypes: bool(190)
memory usage: 215.5 KB
None
       smoker_false smoker_true drink_level_casual drinker  \
count          1161        1161                       1161   
unique            2           2                          2   
top            True       False                      False   
freq            907         938                        746   

       drink_level_social drinker dress_preference_elegant  \
count                        1161                     1161   
unique                          2                        2   
top                         False                    False   
freq                          814                     1133   

       dress_preference_formal dress_preference_informal  \
count                     1161                      1161   
unique                       2                         2   
top                      False

In [4]:
# Train the model using different algorithms
# model = LogisticRegression()
# model = RandomForestClassifier(n_estimators=100)

# model.fit(X_train, y_train)

# Train the model using AutoML
#from sklearn.datasets import load_iris
#data = load_iris()
#X, y = data.data, data.target

model = AutoML()
model.fit(X_train=X, y_train=y, task="classification", time_budget=60)

[flaml.automl.logger: 09-09 04:30:48] {1680} INFO - task = classification
[flaml.automl.logger: 09-09 04:30:48] {1691} INFO - Evaluation method: holdout
[flaml.automl.logger: 09-09 04:30:48] {1789} INFO - Minimizing error metric: log_loss
[flaml.automl.logger: 09-09 04:30:48] {1901} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'lrl1']
[flaml.automl.logger: 09-09 04:30:48] {2219} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 09-09 04:30:49] {2345} INFO - Estimated sufficient time budget=1020s. Estimated necessary time budget=24s.
[flaml.automl.logger: 09-09 04:30:49] {2392} INFO -  at 0.4s,	estimator lgbm's best error=1.0297,	best estimator lgbm's best error=1.0297
[flaml.automl.logger: 09-09 04:30:49] {2219} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 09-09 04:30:49] {2392} INFO -  at 0.5s,	estimator lgbm's best error=1.0297,	best estimator lgbm's best error=1.0297
[flaml.automl.logger: 09-09 04:



In [5]:
import joblib

# Save the model
joblib.dump(model, "./restaurant_recommendation_model.joblib")

['./restaurant_recommendation_model.joblib']

In [6]:
# Make predictions
# y_pred = model.predict(X_test)
y_pred = model.predict(X_test)

# Evaluate the model
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(class_report)

Accuracy: 0.8068669527896996
Confusion Matrix:
[[37  7  4]
 [ 3 75 16]
 [ 2 13 76]]
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.77      0.82        48
           1       0.79      0.80      0.79        94
           2       0.79      0.84      0.81        91

    accuracy                           0.81       233
   macro avg       0.82      0.80      0.81       233
weighted avg       0.81      0.81      0.81       233

