## <center><b>ONLINE FOOD DATASET.</b></center>

IMPORT LIBRARY PACKAGES

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('always') 
import shap
import pickle

# Model Classifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.utils import resample
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from flask import Flask, render_template, request
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

LOAD AND DISPLAY DATA

In [None]:
# Read the dataset from 'onlinefoods.csv' (path is relative to the working directory)
online_food = pd.read_csv('onlinefoods.csv')
# Preview the first rows of the loaded frame
online_food.head()

#### <b> DATA PREPROCESSING </b>

CHECK THE NUMBER OF ROWS AND COLUMNS

In [None]:
# Report the (rows, columns) tuple of the dataset
print(f'The total number of rows and columns is {online_food.shape} respectively.')

CHECK FOR THE COLUMNS

In [None]:
# Show the column labels of the dataset
online_food.columns

CHECK THE OVERALL INFO

In [None]:
# Print the per-column summary (dtypes and non-null counts)
online_food.info()

CHECK FOR MISSING AND DUPLICATED VALUES

In [None]:
# Boolean frame: True wherever a cell is missing
online_food.isna()

CHECK THE TOTAL NUMBER OF MISSING VALUES

In [None]:
# Count the missing entries in each column.
missing_per_column = online_food.isnull().sum()
missing_per_column

CHECK THE TOTAL NUMBER OF DUPLICATE ROWS

In [None]:
# Count the rows that repeat an earlier row
duplicate_count = online_food.duplicated().sum()
print(f"There are {duplicate_count} duplicate values in this dataset.")

In [None]:
# Collect every row that has an identical copy elsewhere (all copies are kept)
is_duplicated = online_food.duplicated(keep=False)
duplicates = online_food.loc[is_duplicated]
print("The duplicate rows are as shown below:")

duplicates

In [None]:
# Summary statistics of the dataset's columns
online_food.describe()

In [None]:
# Box plot of the dataset's columns via the pandas plotting helper
online_food.boxplot()

##### <b> DATA CLEANING </b>

In [None]:
# Drop unneeded column(s).
# errors='ignore' plus reassignment keeps this cell safe to re-run: the
# original in-place drop raised KeyError once the column was already gone.
online_food = online_food.drop(columns=['Unnamed: 12'], errors='ignore')
online_food.head()

#### <b>EXPLORATORY DATA ANALYSIS (E.D.A.)</b>

UNIVARIATE ANALYSIS

In [None]:
# Columns that do not get a value-count chart
exclude_columns = ['latitude', 'longitude', 'Pin code']

# One bar chart of value frequencies for every remaining column
for column in online_food.columns:
    if column not in exclude_columns:
        # Frequency of each distinct value in this column
        value_counts = online_food[column].value_counts()

        fig = px.bar(
            value_counts,
            x=value_counts.index,
            y=value_counts.values,
            labels={'x': column, 'y': 'Count'},
            title=f"Value Counts for {column}",
        )
        fig.show()

In [None]:
# Central tendency of the 'Age' column
age_for_center = online_food['Age']
mean_value = age_for_center.mean()
median_value = age_for_center.median()
# mode() can return several values; keep the first one
mode_value = age_for_center.mode().values[0]

In [None]:
# Variability or dispersion of the 'Age' column
age_for_spread = online_food['Age']
range_value = age_for_spread.max() - age_for_spread.min()
variance_value = age_for_spread.var()
std_deviation_value = age_for_spread.std()

In [None]:
# Skewness and kurtosis of the 'Age' column
age_for_shape = online_food['Age']
skewness_value = age_for_shape.skew()
kurtosis_value = age_for_shape.kurtosis()

In [None]:
# Visualization (box plot for numerical variable)
plt.figure(figsize=(8,8))
# Pass the series as x= explicitly: a bare positional argument is bound
# differently across seaborn versions, which can flip the orientation and
# leave the 'Age' label on the wrong axis.
sns.boxplot(x=online_food['Age'])
plt.title('Box Plot')
plt.xlabel('Age')
plt.show()

BIVARIATE ANALYSIS

In [None]:
# Bivariate analysis for 'Age' vs 'Output': box plot with explicit axis titles
fig = px.box(
    online_food,
    x='Age',
    y='Output',
    title='Age vs Output',
)
fig.update_xaxes(title='Age').update_yaxes(title='Output')
fig.show()


# Bivariate analysis for 'Age' vs 'Marital Status', 'Monthly Income' and
# 'Occupation': one Age histogram per colouring column.
# Only the first chart uses grouped bars; the others keep plotly's default.
age_breakdowns = [
    ('Marital Status', {'barmode': 'group'}),
    ('Monthly Income', {}),
    ('Occupation', {}),
]

for hue_column, extra_options in age_breakdowns:
    fig = px.histogram(
        online_food,
        x='Age',
        color=hue_column,
        title=f'Age vs {hue_column}',
        labels={'Age': 'Age', hue_column: hue_column, 'count': 'Count'},
        **extra_options,
    )
    fig.update_layout(xaxis_title='Age', yaxis_title='Count')
    fig.show()


# Bivariate analysis for 'Gender' vs 'Educational Qualifications'
# (histogram of Gender, coloured by Educational Qualifications)
fig = px.histogram(online_food, x='Gender', color='Educational Qualifications', 
             title='Gender vs Educational Qualifications', 
             labels={'Gender': 'Gender', 'Educational Qualifications': 'Educational Qualifications', 'count': 'Count'})

fig.update_layout(xaxis_title='Gender', yaxis_title='Count')
fig.show()


# Bivariate analysis for 'Marital Status' vs 'Gender'
# Count table: rows are Gender values, columns are Marital Status values
cross_tab = pd.crosstab(index=online_food['Gender'],
                        columns=online_food['Marital Status'])

# Render the count table as a heatmap
heatmap_axis_labels = {'x': "Marital Status", 'y': "Gender", 'color': "Count"}
fig = px.imshow(
    cross_tab,
    labels=heatmap_axis_labels,
    x=cross_tab.columns,
    y=cross_tab.index,
    color_continuous_scale='viridis',
)

heatmap_layout = {
    'title': 'Gender vs. Marital Status',
    'xaxis_title': 'Marital Status',
    'yaxis_title': 'Gender',
}
fig.update_layout(**heatmap_layout)

fig.show()


# Bivariate analysis for 'Gender' vs 'Output' and 'Marital Status' vs 'Output':
# grouped count histograms, coloured by Output
for demographic in ('Gender', 'Marital Status'):
    fig = px.histogram(
        online_food,
        x=demographic,
        color='Output',
        title=f'{demographic} vs Output',
        labels={demographic: demographic, 'Output': 'Output', 'count': 'Count'},
        barmode='group',
    )
    fig.update_layout(xaxis_title=demographic, yaxis_title='Count')
    fig.show()

# Bivariate analysis for 'Marital Status' vs 'Monthly Income'
# Grouped histogram: one bar per 'Monthly Income' value within each marital status
fig = px.histogram(online_food, x='Marital Status', color='Monthly Income',
                   title='Marital Status vs Monthly Income',
                   labels={'Marital Status': 'Marital Status', 'Monthly Income': 'Monthly Income', 'count': 'Count'},
                   barmode='group')

# Bar heights are row counts, so the y-axis is a count (Monthly Income is the colour)
fig.update_layout(xaxis_title='Marital Status', yaxis_title='Count')
fig.show()


# Bivariate analysis for 'Occupation' vs 'Output' and
# 'Educational Qualifications' vs 'Output': grouped count histograms
for background in ('Occupation', 'Educational Qualifications'):
    fig = px.histogram(
        online_food,
        x=background,
        color='Output',
        title=f'{background} vs Output',
        labels={background: background, 'Output': 'Output', 'count': 'Count'},
        barmode='group',
    )
    fig.update_layout(xaxis_title=background, yaxis_title='Count')
    fig.show()


# Bivariate analysis for 'Family size' vs 'Output': box plot of family size per Output value
family_box_labels = {'Output': 'Output', 'Family size': 'Family size'}
fig = px.box(
    online_food,
    x='Output',
    y='Family size',
    title='Family size vs Output',
    labels=family_box_labels,
)

fig.update_xaxes(title='Output').update_yaxes(title='Family size')
fig.show()


# Bivariate analysis for 'Feedback' vs 'Output': grouped count histogram
feedback_hist_options = {
    'x': 'Feedback',
    'color': 'Output',
    'title': 'Feedback vs Output',
    'labels': {'Feedback': 'Feedback', 'Output': 'Output', 'count': 'Count'},
    'barmode': 'group',
}
fig = px.histogram(online_food, **feedback_hist_options)

fig.update_layout(xaxis_title='Feedback', yaxis_title='Count')
fig.show()


# Bivariate analysis for 'Monthly Income' vs 'Output'
# Grouped count histogram, in the same form as the other "<feature> vs Output"
# charts. The previous version passed 'Monthly Income' as y, which makes
# plotly aggregate that column instead of counting rows; the column is
# label-encoded later in this notebook, so it is presumably categorical here.
fig = px.histogram(online_food, x='Monthly Income', color='Output',
                   title='Monthly Income vs Output',
                   labels={'Monthly Income': 'Monthly Income', 'Output': 'Output', 'count': 'Count'},
                   barmode='group')

fig.update_layout(xaxis_title='Monthly Income', yaxis_title='Count')
fig.show()

MULTIVARIATE ANALYSIS

In [None]:
# Pairwise relationship grid with KDE curves on the diagonal
pairplot_options = {'diag_kind': 'kde', 'height': 3, 'aspect': 1.5}
g = sns.pairplot(data=online_food, **pairplot_options)

# Title placed slightly above the grid
g.fig.suptitle('Pairplot of Numerical Variables', y=1.02)

# Leave headroom at the top of the figure when tightening the layout
plt.tight_layout(rect=[0, 0, 1, 0.96])

plt.show()


# Box plot of 'Age' by 'Marital Status', split by 'Family size'
plt.figure(figsize=(12, 6))
sns.boxplot(data=online_food, x='Age', y='Marital Status', hue='Family size')
plt.title('Age by Marital Status and Family size')
plt.xlabel('Age')
plt.ylabel('Marital Status')
plt.show()


# Two-dimensional histogram of 'Occupation' against 'Feedback',
# coloured by 'Educational Qualifications'
plt.figure(figsize=(10, 6))
sns.histplot(data=online_food, x='Occupation', y='Feedback',
             hue='Educational Qualifications')
plt.title('Feedback by Occupation and Educational Qualifications')
plt.xlabel('Occupation')
plt.ylabel('Feedback')
# Tilt the x tick labels
plt.xticks(rotation=45)
plt.show()


# Plotly bar chart of Feedback by Occupation, coloured by Educational Qualifications
fig = px.bar(
    online_food,
    x='Occupation',
    y='Feedback',
    color='Educational Qualifications',
    title='Feedback by Occupation and Educational Qualifications',
)

fig.update_layout(xaxis_title='Occupation', yaxis_title='Feedback')
# Tilt the x tick labels
fig.update_xaxes(tickangle=45)
fig.show()

##### <b> FEATURE ENGINEERING </b>

LABEL ENCODING

In [None]:
# Columns to label-encode
categorical_columns = ['Gender', 'Marital Status', 'Occupation', 'Monthly Income',
                       'Educational Qualifications', 'Output', 'Feedback']

# Keep one fitted encoder per column. Refitting a single shared encoder
# discards every mapping except the last, so codes could not be translated
# back to labels (label_encoders[col].inverse_transform / .classes_).
label_encoders = {}
for column in categorical_columns:
    label_encoder = LabelEncoder()
    online_food[column] = label_encoder.fit_transform(online_food[column].values)
    label_encoders[column] = label_encoder

# Display the encoded dataset
online_food

CORRELATION HEATMAP

In [None]:
# Annotated heatmap of the pairwise column correlations
correlation_matrix = online_food.corr()

plt.figure(figsize=(10, 10))
heatmap_options = {'annot': True, 'cmap': 'coolwarm', 'fmt': ".2f"}
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Heatmap')
plt.show()

##### <b> MODELLING </b>

RANDOM FOREST (BASELINE MODEL SAVED TO model.pkl)

In [None]:
# Select features and target variable
X = online_food[['Age', 'Monthly Income', 'Family size']]
y = online_food['Output']

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the random forest model; this is the model that is
# written to model.pkl below
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


# Save the model. The context manager closes (and flushes) the file even if
# pickling fails; the bare open() call left the handle unclosed.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

RANDOM FOREST

# Fit a 100-tree random forest with a fixed seed
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Hard predictions and the probability of the second class (column 1) on the test set
y_pred_rf = rf_model.predict(X_test)
rf_class1_proba = rf_model.predict_proba(X_test)[:, 1]

# Report the evaluation metrics
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", accuracy_rf)
print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))
print("ROC AUC Score:", roc_auc_score(y_test, rf_class1_proba))

GRADIENT BOOSTING MACHINES (GBM)

# Fit an XGBoost classifier with its default settings
xgb_model = XGBClassifier().fit(X_train, y_train)

# Predict the test set
y_pred_xgb = xgb_model.predict(X_test)

# Report the evaluation metrics
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print("XGBoost Accuracy:", accuracy_xgb)
xgb_report = classification_report(y_test, y_pred_xgb)
print("\nClassification Report:")
print(xgb_report)
xgb_confusion = confusion_matrix(y_test, y_pred_xgb)
print("\nConfusion Matrix:")
print(xgb_confusion)

# Step 3: Tune hyperparameters for Random Forest
# 5-fold cross-validated search over tree count, depth and leaf size, scored by accuracy
rf_param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 5, 10, 20],
    min_samples_leaf=[1, 2, 4],
)
rf_grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=rf_param_grid,
    cv=5,
    scoring='accuracy',
).fit(X_train, y_train)

print("Best Parameters for Random Forest:", rf_grid_search.best_params_)
# Estimator refit with the best parameter combination
best_rf_model = rf_grid_search.best_estimator_

# Step 4: Feature Importance Analysis for Random Forest
# Pair each training column with the importance reported by the tuned forest
feature_names = X_train.columns
feature_importances = best_rf_model.feature_importances_
feature_importance_df = pd.DataFrame(
    dict(Feature=feature_names, Importance=feature_importances)
)
print("Feature Importances for Random Forest:\n", feature_importance_df)

# Step 5: Address Class Imbalance (if applicable)
# Find the rarer label from the data instead of assuming it is 1, then
# upsample it (with replacement) to the size of the most frequent class.
# NOTE: the concat below treats every other row as "majority", i.e. this
# assumes a binary target.
class_counts = y_train.value_counts()
minority_label = class_counts.idxmin()
is_minority = y_train == minority_label

X_minority_upsampled, y_minority_upsampled = resample(
    X_train[is_minority], y_train[is_minority],
    replace=True, n_samples=int(class_counts.max()), random_state=42)
X_train_balanced = pd.concat([X_train[~is_minority], X_minority_upsampled])
y_train_balanced = pd.concat([y_train[~is_minority], y_minority_upsampled])

# Step 6: Train and evaluate XGBoost Classifier
# Fit on the balanced training data (previously the balanced set was built
# but never used). The test set is left untouched for evaluation.
xgb_clf = XGBClassifier()
xgb_clf.fit(X_train_balanced, y_train_balanced)
xgb_preds = xgb_clf.predict(X_test)

# Evaluate performance
print("XGBoost Classifier Performance:")
print("Accuracy:", accuracy_score(y_test, xgb_preds))
print("Classification Report:\n", classification_report(y_test, xgb_preds))
print("Confusion Matrix:\n", confusion_matrix(y_test, xgb_preds))
print("ROC AUC Score:", roc_auc_score(y_test, xgb_clf.predict_proba(X_test)[:,1]))

# Step 7: Model Interpretability with SHAP values
# Explain the tuned random forest on the held-out test rows and plot a summary.
# NOTE(review): the structure returned by shap_values() for a classifier
# differs between shap releases — confirm the summary plot shows the
# intended class with the installed version.
explainer = shap.TreeExplainer(best_rf_model)  # For Random Forest
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test, feature_names=feature_names)

# Flask needs the import name of the current module to locate templates and
# static files; `__app.py__` is not a defined name and raised NameError.
app = Flask(__name__)

# Load the pre-trained model.
# NOTE: unpickling can execute arbitrary code — only load a model.pkl that
# was produced by a trusted source. The context manager closes the file.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

@app.route('/')
def home():
    """Render the 'index.html' template for the root URL."""
    return render_template('index.html')

@app.route('/predict', methods=['POST'])
def predict():
    """Predict from the posted form and render the result on 'index.html'.

    Reads the integer form fields 'age', 'monthly_income' and 'family_size',
    passes them to the loaded model and renders "Satisfied" when the
    predicted label is 1, otherwise "Not Satisfied".

    A missing or non-integer field re-renders the page with an explanatory
    message and HTTP status 400 instead of raising.
    """
    # Get user input from the form
    try:
        age = int(request.form['age'])
        monthly_income = int(request.form['monthly_income'])
        family_size = int(request.form['family_size'])
    except (KeyError, ValueError):
        return render_template(
            'index.html',
            prediction_text='Please enter whole numbers for age, monthly income and family size.',
        ), 400

    # Use the same column names the model was trained on, so the estimator
    # does not receive an unnamed feature array.
    features = pd.DataFrame(
        [[age, monthly_income, family_size]],
        columns=['Age', 'Monthly Income', 'Family size'],
    )

    # Make prediction using the model
    prediction = model.predict(features)

    # Map prediction to human-readable output
    output = "Satisfied" if prediction[0] == 1 else "Not Satisfied"

    return render_template('index.html', prediction_text='Customer is {}'.format(output))

if __name__ == '__main__':
    import os

    # Debug mode exposes the interactive Werkzeug debugger, which lets anyone
    # who can reach the server run code on it. Keep it off unless it is
    # requested explicitly with FLASK_DEBUG=1.
    app.run(debug=os.environ.get('FLASK_DEBUG') == '1')