# Assignment – 1 #
## Predictive Modelling of Eating-Out Problem ##

## Part B – Predictive Modelling ##

#### Importing all Libraries ####

In [41]:
import pandas as pd
import numpy as np
import ast
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer, StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression, SGDRegressor
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score, confusion_matrix, classification_report


#### Loading Data Set ####

In [42]:
# Read the raw restaurant listing from disk into a DataFrame
csv_file_path = 'zomato_df_final_data.csv'
data = pd.read_csv(filepath_or_buffer=csv_file_path)

In [43]:
# Report the (rows, columns) dimensions of the freshly loaded data
dataset_shape = data.shape
print("Shape of the dataset:", dataset_shape)

Shape of the dataset: (10500, 17)


### I. Feature Engineering ###

#### 1. Checking for Missing Values ####

In [44]:
# Tally the NaN entries in every column of the dataset
missing_values = data.isnull().sum()

# Show the per-column missing-value totals
print(missing_values)


address             0
cost              346
cuisine             0
lat               192
link                0
lng               192
phone               0
rating_number    3316
rating_text      3316
subzone             0
title               0
type               48
votes            3316
groupon             0
color               0
cost_2            346
cuisine_color       0
dtype: int64


#### 1. Imputing Missing Values ####

In [45]:
# Numerical columns: replace NaN with the column median (not the mean).
# NOTE(review): rating_number, rating_text and votes are among the filled
# columns, and rating_number / rating_text are later used as model targets --
# confirm that imputing them (rather than dropping those rows) is intended.
numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
numeric_fill_values = {name: data[name].median() for name in numeric_columns}
data = data.fillna(value=numeric_fill_values)

# Categorical (object) columns: replace NaN with the most frequent value
object_columns = data.select_dtypes(include=['object']).columns
object_fill_values = {name: data[name].mode()[0] for name in object_columns}
data = data.fillna(value=object_fill_values)

In [46]:
# Re-count NaN entries per column after imputation
missing_values = data.isna().sum(axis=0)

# Every column is expected to report zero remaining missing values
print(missing_values)

address          0
cost             0
cuisine          0
lat              0
link             0
lng              0
phone            0
rating_number    0
rating_text      0
subzone          0
title            0
type             0
votes            0
groupon          0
color            0
cost_2           0
cuisine_color    0
dtype: int64


#### 2. Label/Feature Encoding ####

In [47]:
# The cuisine and type columns hold lists serialised as strings;
# turn each string back into a real Python list and leave other values as-is.
def _parse_list_literal(value):
    """Return the literal-evaluated object for a string, or the value unchanged."""
    if isinstance(value, str):
        return ast.literal_eval(value)
    return value

for list_column in ('cuisine', 'type'):
    data[list_column] = data[list_column].apply(_parse_list_literal)

In [48]:


# Multi-label binarise the cuisine lists: one 0/1 column per cuisine,
# each named with a cuisine_ prefix and aligned to the original row index.
mlb_cuisine = MultiLabelBinarizer()
cuisine_matrix = mlb_cuisine.fit_transform(data['cuisine'])
cuisine_labels = [f'cuisine_{label}' for label in mlb_cuisine.classes_]
cuisine_encoded = pd.DataFrame(cuisine_matrix, columns=cuisine_labels, index=data.index)

# Same treatment for the type lists, using a type_ prefix.
mlb_type = MultiLabelBinarizer()
type_matrix = mlb_type.fit_transform(data['type'])
type_labels = [f'type_{label}' for label in mlb_type.classes_]
type_encoded = pd.DataFrame(type_matrix, columns=type_labels, index=data.index)

# Append both indicator blocks, then discard the original list columns.
data = pd.concat([data, cuisine_encoded, type_encoded], axis=1)
data = data.drop(columns=["cuisine", "type"], errors='ignore')

# Store the groupon flag as an integer
data['groupon'] = data['groupon'].astype(int)


In [49]:
# Inspect the dimensions and column labels produced by the encoding step
encoded_shape, encoded_columns = data.shape, data.columns
print("Shape of the dataset:", encoded_shape)
print(encoded_columns)

Shape of the dataset: (10500, 163)
Index(['address', 'cost', 'lat', 'link', 'lng', 'phone', 'rating_number',
       'rating_text', 'subzone', 'title',
       ...
       'type_Casual Dining', 'type_Club', 'type_Dessert Parlour',
       'type_Fast Food', 'type_Fine Dining', 'type_Food Court',
       'type_Food Stall', 'type_Food Truck', 'type_Pub', 'type_Wine Bar'],
      dtype='object', length=163)


### II. Regression ###

#### 3. Linear Regression Model ####

In [50]:


# Columns excluded from modelling (identifiers, free text, colour codes, cost_2).
# errors='ignore' lets the list name columns that are already gone.
non_numeric_columns = ['address', 'url', 'phone', 'cuisine', 'title', 'subzone', 'type','link','color','cuisine_color',"cost_2"]  # Adjust based on your dataset
data = data.drop(columns=non_numeric_columns, errors='ignore')

# Persist the cleaned, encoded frame for the classification section.
# It must still contain rating_text, which that section reads back.
data.to_csv('cleaned_encoded_data.csv', index=False)

# Use a separate frame for regression instead of rebinding `data`.
# If `data` lost rating_text here, re-running this cell would overwrite the CSV
# without rating_text and break the classification section.
regression_data = data.drop(columns="rating_text", errors='ignore')

# Features: every remaining numeric column except the target
X = regression_data.drop(columns=['rating_number'])
X = X.select_dtypes(include=['number'])   # Ensure only numeric features are used

# Target variable - numeric rating
y = regression_data['rating_number']

# Split the dataset into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Build the Linear Regression Model
model_regression_1 = LinearRegression()

# Train the model on the training data
model_regression_1.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model_regression_1.predict(X_test)

# Calculate the Mean Squared Error (MSE) for model evaluation
mse = mean_squared_error(y_test, y_pred)



#### 4. Gradient Descent ####

In [51]:
# Standardise the features: fit the scaler on the training split only, then
# apply the same transform to the test split.
# The scaled matrices get their own names so X_train / X_test are not
# overwritten and this cell can be re-run safely.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Linear model fitted by stochastic gradient descent with a constant learning rate.
# random_state pins the sample shuffling so the reported metrics are reproducible.
model_regression_2 = SGDRegressor(max_iter=5000, tol=1e-4, eta0=0.0001, learning_rate='constant', alpha=0.0001, random_state=0)

# Train the model on the scaled training data
model_regression_2.fit(X_train_scaled, y_train)

# Make predictions on the scaled test data
y_pred_sgd = model_regression_2.predict(X_test_scaled)

# Calculate the Mean Squared Error (MSE) for model evaluation
mse_sgd = mean_squared_error(y_test, y_pred_sgd)




#### 5. Report of the Models ####

In [52]:


# Score both regressors on the held-out split:
# MSE, RMSE (square root of the MSE) and R-squared.
mse, r2_linear = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

mse_sgd, r2_sgd = mean_squared_error(y_test, y_pred_sgd), r2_score(y_test, y_pred_sgd)
rmse_sgd = np.sqrt(mse_sgd)

# One row per model: (variable name, model type, MSE, RMSE, R-squared)
report_rows = [
    ('model_regression_1', 'Linear Regression', mse, rmse, r2_linear),
    ('model_regression_2', 'Gradient Descent', mse_sgd, rmse_sgd, r2_sgd),
]

# Render the results as a fixed-width table
print(f"{'Model':<20} {'Model Type':<20} {'Mean Squared Error':<20} {'Root Mean Squared Error':<25} {'R-squared':<10}")
print(f"{'-'*90}")
for row_name, row_type, row_mse, row_rmse, row_r2 in report_rows:
    print(f"{row_name:<20} {row_type:<20} {row_mse:<20.5f} {row_rmse:<25.5f} {row_r2:<10.5f}")


Model                Model Type           Mean Squared Error   Root Mean Squared Error   R-squared 
------------------------------------------------------------------------------------------
model_regression_1   Linear Regression    0.09240              0.30397                   0.28817   
model_regression_2   Gradient Descent     0.09236              0.30391                   0.28849   


Both models performed almost identically: the Gradient Descent model's MSE (0.09236) is only marginally lower than the Linear Regression model's (0.09240), a difference too small to be meaningful. The R² values of about 0.29 indicate that both models explain approximately 29% of the variance in the target variable.

### III. Classification ###

#### 6. Classifying Ratings ####

In [53]:
# Reload the cleaned, encoded dataset that was written to disk in the regression section
csv_file_path = 'cleaned_encoded_data.csv'
data = pd.read_csv(filepath_or_buffer=csv_file_path)
# Function to classify ratings into binary classes
def classify_rating(rating):
    """Map a textual rating onto the binary target label.

    Returns 1 for 'Poor' or 'Average', 2 for 'Good', 'Very Good' or
    'Excellent', and None for any other value.
    """
    lower_tier = ('Poor', 'Average')
    upper_tier = ('Good', 'Very Good', 'Excellent')

    if rating in lower_tier:
        return 1
    if rating in upper_tier:
        return 2
    # Unrecognised labels fall through without a class
    return None

# Derive the binary target from the textual rating of every row
binary_labels = data['rating_text'].apply(classify_rating)
data = data.assign(binary_class=binary_labels)

# Spot-check the mapping on the first few rows
print(data[['rating_text', 'binary_class']].head())

# The textual rating is no longer needed once the binary target exists
data = data.drop(columns="rating_text", errors='ignore')


  rating_text  binary_class
0   Very Good             2
1   Excellent             2
2   Excellent             2
3   Excellent             2
4   Excellent             2


#### 7. Logistic regression model ####

In [54]:
# Gather the one-hot indicator column names by their prefix
cuisine_columns = list(filter(lambda name: name.startswith('cuisine_'), data.columns))
type_columns = list(filter(lambda name: name.startswith('type_'), data.columns))


In [55]:
# Predictors: cost, votes and every cuisine / type indicator column.
# Target: the binary rating class.
features = ['cost', 'votes', *cuisine_columns, *type_columns]
X = data.loc[:, features]
y = data['binary_class']

# Hold out 20% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Fit a logistic regression classifier; the iteration cap is raised to 5000
model_classification_3 = LogisticRegression(max_iter=5000)
model_classification_3.fit(X_train, y_train)

# Predict the held-out rows and measure the share of correct labels
y_pred = model_classification_3.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Report the test accuracy
print(f'Accuracy of the Logistic Regression model: {accuracy:.2f}')


Accuracy of the Logistic Regression model: 0.89


#### Logistic Regression Model Summary

- A logistic regression model was implemented to predict a binary classification based on features such as cost, votes, cuisine categories, and type categories.
- Data was split into 80% training and 20% test sets.
- The model was trained using 5000 iterations to ensure convergence.
- The model achieved an accuracy score of 89% on the test data, indicating strong performance in classification.


#### 8. confusion matrix ####

In [56]:

# Predict the held-out split again with the fitted logistic regression model
y_pred = model_classification_3.predict(X_test)

# Cross-tabulate actual labels against predicted labels
conf_matrix = confusion_matrix(y_test, y_pred)
class_labels = ['Class 1 (Poor/Average)', 'Class 2 (Good/Very Good/Excellent)']

# Show the matrix followed by the per-class precision / recall / F1 summary
print("Confusion Matrix:", conf_matrix, sep="\n")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=class_labels))


Confusion Matrix:
[[1584   47]
 [ 186  283]]

Classification Report:
                                    precision    recall  f1-score   support

            Class 1 (Poor/Average)       0.89      0.97      0.93      1631
Class 2 (Good/Very Good/Excellent)       0.86      0.60      0.71       469

                          accuracy                           0.89      2100
                         macro avg       0.88      0.79      0.82      2100
                      weighted avg       0.89      0.89      0.88      2100



#### Confusion Matrix

The confusion matrix for the logistic regression model is as follows:

- True negatives (Class 1 correctly predicted): 1584
- False positives (Class 1 predicted as Class 2): 47
- False negatives (Class 2 predicted as Class 1): 186
- True positives (Class 2 correctly predicted): 283

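The model predicts Class 1 far more reliably than Class 2: it recovers 97% of Class 1 restaurants (1584 of 1631) but only 60% of Class 2 restaurants (283 of 469), so the 89% overall accuracy is driven largely by the majority class.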





#### 10. Extra Models for classification ####

In [57]:

# Define a function to train and evaluate models
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit `model` on the training split and score it on the test split.

    Parameters:
        model: estimator exposing fit(X, y) and predict(X).
        X_train, y_train: training features and labels.
        X_test, y_test: held-out features and labels.

    Returns:
        A (confusion matrix, classification report) pair computed from the
        test labels and the model's test predictions.
    """
    label_names = ['Class 1 (Poor/Average)', 'Class 2 (Good/Very Good/Excellent)']

    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    return (
        confusion_matrix(y_test, predicted),
        classification_report(y_test, predicted, target_names=label_names),
    )

# Candidate classifiers, keyed by display name (each seeded for repeatability)
models = dict([
    ("Random Forest", RandomForestClassifier(random_state=42)),
    ("SVM", SVC(random_state=42)),
    ("Gradient Boosting", GradientBoostingClassifier(random_state=42)),
])

# Fit and score every candidate on the same train / test split
for model_name in models:
    model = models[model_name]
    print(f"\nEvaluating {model_name}...\n")
    conf_matrix, class_report = evaluate_model(model, X_train, y_train, X_test, y_test)

    # Show the confusion matrix, then the classification report
    print("Confusion Matrix:", conf_matrix, sep="\n")
    print("\nClassification Report:", class_report, sep="\n")



Evaluating Random Forest...

Confusion Matrix:
[[1525  106]
 [ 120  349]]

Classification Report:
                                    precision    recall  f1-score   support

            Class 1 (Poor/Average)       0.93      0.94      0.93      1631
Class 2 (Good/Very Good/Excellent)       0.77      0.74      0.76       469

                          accuracy                           0.89      2100
                         macro avg       0.85      0.84      0.84      2100
                      weighted avg       0.89      0.89      0.89      2100


Evaluating SVM...

Confusion Matrix:
[[1535   96]
 [ 112  357]]

Classification Report:
                                    precision    recall  f1-score   support

            Class 1 (Poor/Average)       0.93      0.94      0.94      1631
Class 2 (Good/Very Good/Excellent)       0.79      0.76      0.77       469

                          accuracy                           0.90      2100
                         macro avg       0.86  

### Extra Models for Classification

Three additional models were evaluated for binary classification: Random Forest, Support Vector Machine (SVM), and Gradient Boosting. Below are the results for each model:

#### 1. Random Forest
- **Confusion Matrix**: `[[1525, 106], [120, 349]]` (rows: actual class, columns: predicted class)
- **Classification Report**:
- Class 1 (Poor/Average): Precision = 0.93, Recall = 0.94, F1-Score = 0.93
- Class 2 (Good/Very Good/Excellent): Precision = 0.77, Recall = 0.74, F1-Score = 0.76
- **Overall Accuracy**: 89%

#### 2. Support Vector Machine (SVM)
- **Confusion Matrix**: `[[1535, 96], [112, 357]]` (rows: actual class, columns: predicted class)
- **Classification Report**:
- Class 1 (Poor/Average): Precision = 0.93, Recall = 0.94, F1-Score = 0.94
- Class 2 (Good/Very Good/Excellent): Precision = 0.79, Recall = 0.76, F1-Score = 0.77
- **Overall Accuracy**: 90%

#### 3. Gradient Boosting
- **Confusion Matrix**:
- **Classification Report**:
- Class 1 (Poor/Average): Precision = 0.94, Recall = 0.93, F1-Score = 0.94
- Class 2 (Good/Very Good/Excellent): Precision = 0.77, Recall = 0.78, F1-Score = 0.78
- **Overall Accuracy**: 90%

In conclusion, the **SVM** and **Gradient Boosting** models provided the highest accuracy at 90%, while **Random Forest** performed slightly lower at 89%.
