## <center><b>ONLINE FOOD DATASET.</b></center>

<b>IMPORT LIBRARY PACKAGES</b>

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('always') 
import shap
import pickle

# Model Classifier
from sklearn.svm import SVC
from collections import Counter
from xgboost import XGBClassifier
from sklearn.utils import resample
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from imblearn import under_sampling, over_sampling
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

from flask import Flask, render_template, request

<b>LOAD & DISPLAY DATA</b>

In [None]:
# Read the survey data; the relative path means the CSV must sit in the working directory.
online_food = pd.read_csv('onlinefoods.csv')
# Preview the first rows of the frame.
online_food.head()

##### <b> DATA EXPLORATION </b>

TOTAL ROWS AND COLUMNS

In [None]:
# shape is a (rows, columns) tuple and is printed as-is inside the sentence.
print('The total number of rows and columns is',online_food.shape,'respectively.')

<b>DATA SUMMARY</b>

This section summarises the structure of the dataset: column names, non-null counts and data types.

In [None]:
# Column names, non-null counts and dtypes.
online_food.info()

MISSING VALUES

In [None]:
# Boolean frame of the same shape: True wherever a cell is missing.
online_food.isna()

TOTAL SUM OF MISSING VALUES

In [None]:
# Number of missing cells in each column.
online_food.isna().sum()

CHECK FOR DUPLICATE ROWS

In [None]:
# duplicated() flags each repeat of an earlier row, so the first occurrence is not counted.
print ("There are",online_food.duplicated().sum(), "duplicate values in this dataset.")

DISPLAY DUPLICATE ROWS

In [None]:
# keep=False marks every member of a duplicated group, so the first
# occurrences are listed together with their copies.
duplicates = online_food.loc[online_food.duplicated(keep=False)]
duplicates

INSIGHTS

This dataset has no missing values, but 103 duplicate rows were identified.

SUMMARY STATISTICS

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per column.
online_food.describe()

In [None]:
# Box plots of the frame's columns on one shared axis, for a quick look at spread and outliers.
online_food.boxplot()

#### <b> DATA CLEANING </b>

In this section, I drop the last column, "Unnamed: 12", because its purpose in the dataset is unclear.

DROP UNWANTED COLUMN

In [None]:
# Remove the 'Unnamed: 12' column in place.
# errors='ignore' keeps this cell re-runnable: without it a second run raises
# KeyError because the column has already been removed.
online_food.drop(columns=['Unnamed: 12'], inplace=True, errors='ignore')
online_food.head()

#### <b>EXPLORATORY DATA ANALYSIS, (E.D.A)</b>

UNIVARIATE ANALYSIS

In [None]:
# Number of respondents at each distinct age, as a two-column frame (Age, Count)
age_counts = (
    online_food['Age']
    .value_counts()
    .rename_axis('Age')
    .reset_index(name='Count')
)

# Bar chart of those counts; bars are shaded by age on the Viridis scale
fig = px.bar(
    age_counts,
    x='Age',
    y='Count',
    title='Age Distribution',
    labels={'Age': 'Age', 'Count': 'Number of People'},
    color='Age',
    color_continuous_scale='Viridis',
)

# Render the figure
fig.show()

CALCULATE AND DISPLAY MEAN VALUE OF AGE

In [None]:
# Arithmetic mean of the Age column.
mean_value = online_food['Age'].mean()
print('The mean value for age is',mean_value)

CALCULATE AND DISPLAY THE MEDIAN VALUE OF AGE

In [None]:
# Middle value of the Age column.
median_value = online_food['Age'].median()
print('The median value for age is',median_value)

CALCULATE AND DISPLAY THE MODE OF AGE

In [None]:
# mode() returns every value tied for most frequent; the first one is reported.
mode_value = online_food['Age'].mode().values[0]
# The label previously said "median" (copied from the cell above).
print('The mode value for age is',mode_value)

VISUALIZATION FOR NUMERICAL VARIABLE

In [None]:
# Box plot of the Age column.
# Age is passed explicitly as x= so the box is drawn along the x-axis and the
# 'Age' x-label below describes the value axis. Passed positionally, the
# Series is interpreted differently depending on the seaborn version (as x
# with a deprecation warning, or as `data`, which draws the box vertically).
plt.figure(figsize=(8,8))
sns.boxplot(x=online_food['Age'])
plt.title('Box Plot')
plt.xlabel('Age')
plt.show()

In [None]:
# Count of respondents per gender, one colour per category, with the count
# written above each bar
fig = px.histogram(
    online_food,
    x='Gender',
    title='Gender Distribution',
    color='Gender',
    color_discrete_sequence=['#636EFA', '#EF553B'],
).update_traces(texttemplate='%{value}', textposition='outside')

# Render the figure
fig.show()

In [None]:
# Count how many respondents fall into each 'Marital Status' category
marital_status_counts = online_food['Marital Status'].value_counts().reset_index()
marital_status_counts.columns = ['Marital Status', 'Count']

# Slice colours; previously defined but never handed to the chart
colors = ['#636EFA', '#EF553B', '#00CC96', '#AB63FA']

# Create the pie chart using the palette above
fig = px.pie(marital_status_counts, names='Marital Status', values='Count',
             title='Marital Status Distribution',
             color_discrete_sequence=colors)

# Show both the percentage and the category name on each slice
fig.update_traces(textinfo='percent+label')

# Show the plot
fig.show()

In [None]:
# Share of respondents in each occupation, expressed as a percentage
occupation_income_counts = (
    online_food['Occupation']
    .value_counts(normalize=True)
    .mul(100)
    .rename_axis('Occupation')
    .reset_index(name='Percentage')
)

# Bar chart with the percentage written inside each bar
fig = px.bar(
    occupation_income_counts,
    x='Occupation',
    y='Percentage',
    title='Occupation Distribution',
    labels={'Occupation': 'Occupation', 'Percentage': 'Percentage (%)'},
    color='Occupation',
    color_continuous_scale='Viridis',
).update_traces(texttemplate='%{y:.1f}%', textposition='inside')

# Render the figure
fig.show()

In [None]:
# Share of respondents in each monthly-income band, expressed as a percentage
monthly_income_counts = (
    online_food['Monthly Income']
    .value_counts(normalize=True)
    .mul(100)
    .rename_axis('Monthly Income')
    .reset_index(name='Percentage')
)

# Bar chart with the percentage written inside each bar
fig = px.bar(
    monthly_income_counts,
    x='Monthly Income',
    y='Percentage',
    title='Monthly Income Distribution',
    labels={'Monthly Income': 'Monthly Income', 'Percentage': 'Percentage (%)'},
    color='Monthly Income',
    color_continuous_scale='Viridis',
).update_traces(texttemplate='%{y:.1f}%', textposition='inside')

# Render the figure
fig.show()

In [None]:
# Share of respondents at each education level, expressed as a percentage
edu_qual_counts = (
    online_food['Educational Qualifications']
    .value_counts(normalize=True)
    .mul(100)
    .rename_axis('Educational Qualifications')
    .reset_index(name='Percentage')
)

# Bar chart with the percentage written inside each bar
fig = px.bar(
    edu_qual_counts,
    x='Educational Qualifications',
    y='Percentage',
    title='Educational Qualifications Distribution',
    labels={'Educational Qualifications': 'Educational Qualifications', 'Percentage': 'Percentage (%)'},
    color='Educational Qualifications',
    color_continuous_scale='Viridis',
).update_traces(texttemplate='%{y:.1f}%', textposition='inside')

# Render the figure
fig.show()

In [None]:
# Share of respondents for each family size, expressed as a percentage
family_size_counts = (
    online_food['Family size']
    .value_counts(normalize=True)
    .mul(100)
    .rename_axis('Family size')
    .reset_index(name='Percentage')
)

# Bar chart with the percentage written inside each bar
fig = px.bar(
    family_size_counts,
    x='Family size',
    y='Percentage',
    title='Family size Distribution',
    labels={'Family size': 'Family size', 'Percentage': 'Percentage (%)'},
    color='Family size',
    color_continuous_scale='Viridis',
).update_traces(texttemplate='%{y:.1f}%', textposition='inside')

# Render the figure
fig.show()

In [None]:
# Count how many respondents share each value of the 'Pin code' column
pin_code_counts = online_food['Pin code'].value_counts().reset_index()
pin_code_counts.columns = ['Pin Code', 'Count']

# Bar chart of the counts. The y-axis holds the number of respondents, so it
# is labelled accordingly (it was previously labelled 'Pin Code' as well).
fig = px.bar(pin_code_counts, x='Pin Code', y='Count', title='Pin Code Distribution',
             labels={'Pin Code': 'Pin Code', 'Count': 'Number of People'},
             color='Pin Code', color_continuous_scale='Viridis')

# Show the plot
fig.show()

In [None]:
# Share of respondents for each Output value, expressed as a percentage
output_counts = (
    online_food['Output']
    .value_counts(normalize=True)
    .mul(100)
    .rename_axis('Output')
    .reset_index(name='Percentage')
)

# Bar chart with the percentage written inside each bar
fig = px.bar(
    output_counts,
    x='Output',
    y='Percentage',
    title='Output Distribution',
    labels={'Output': 'Output', 'Percentage': 'Percentage (%)'},
    color='Output',
    color_continuous_scale='Viridis',
).update_traces(texttemplate='%{y:.1f}%', textposition='inside')

# Render the figure
fig.show()

In [None]:
# Share of respondents for each Feedback value, expressed as a percentage
feedback_counts = (
    online_food['Feedback']
    .value_counts(normalize=True)
    .mul(100)
    .rename_axis('Feedback')
    .reset_index(name='Percentage')
)

# Bar chart with the percentage written inside each bar
fig = px.bar(
    feedback_counts,
    x='Feedback',
    y='Percentage',
    title='Feedback Distribution',
    labels={'Feedback': 'Feedback', 'Percentage': 'Percentage (%)'},
    color='Feedback',
    color_continuous_scale='Viridis',
).update_traces(texttemplate='%{y:.1f}%', textposition='inside')

# Render the figure
fig.show()

BIVARIATE ANALYSIS

In [None]:
# 'Age' vs 'Output': distribution of ages for each Output value
fig = (
    px.box(online_food, x='Age', y='Output', title='Age vs Output')
    .update_xaxes(title='Age')
    .update_yaxes(title='Output')
)
fig.show()


# 'Age' histograms coloured by a second variable, drawn in this order:
# Marital Status, Monthly Income, Occupation.
# Only the Marital Status chart sets barmode='group'; the other two keep
# plotly's default bar mode.
for _hue, _extra in (
    ('Marital Status', {'barmode': 'group'}),
    ('Monthly Income', {}),
    ('Occupation', {}),
):
    fig = px.histogram(
        online_food,
        x='Age',
        color=_hue,
        title=f'Age vs {_hue}',
        labels={'Age': 'Age', _hue: _hue, 'count': 'Count'},
        **_extra,
    )
    fig.update_layout(xaxis_title='Age', yaxis_title='Count')
    fig.show()


# 'Gender' vs 'Educational Qualifications': count per gender, coloured by
# education level
fig = px.histogram(
    online_food,
    x='Gender',
    color='Educational Qualifications',
    title='Gender vs Educational Qualifications',
    labels={'Gender': 'Gender', 'Educational Qualifications': 'Educational Qualifications', 'count': 'Count'},
).update_layout(xaxis_title='Gender', yaxis_title='Count')
fig.show()


# 'Gender' vs 'Marital Status': contingency table rendered as a heatmap.
# Rows of the table are genders, columns are marital statuses.
cross_tab = pd.crosstab(index=online_food['Gender'], columns=online_food['Marital Status'])

fig = px.imshow(
    cross_tab,
    x=cross_tab.columns,
    y=cross_tab.index,
    labels={'x': "Marital Status", 'y': "Gender", 'color': "Count"},
    color_continuous_scale='viridis',
).update_layout(
    title='Gender vs. Marital Status',
    xaxis_title='Marital Status',
    yaxis_title='Gender',
)

fig.show()


# 'Gender' vs 'Output', then 'Marital Status' vs 'Output':
# grouped bars, one bar per Output value within each category
for _feature in ('Gender', 'Marital Status'):
    fig = px.histogram(
        online_food,
        x=_feature,
        color='Output',
        title=f'{_feature} vs Output',
        labels={_feature: _feature, 'Output': 'Output', 'count': 'Count'},
        barmode='group',
    )
    fig.update_layout(xaxis_title=_feature, yaxis_title='Count')
    fig.show()

# Bivariate analysis for 'Marital Status' vs 'Monthly Income'
# Grouped bars: one bar per income band within each marital status.
fig = px.histogram(online_food, x='Marital Status', color='Monthly Income', 
                   title='Marital Status vs Monthly Income',
                   labels={'Marital Status': 'Marital Status', 'Monthly Income': 'Monthly Income', 'count': 'Count'},
                   barmode='group')

# The bar height is a row count (Monthly Income is the colour), so the
# y-axis is titled 'Count' like the other histograms in this section.
fig.update_layout(xaxis_title='Marital Status', yaxis_title='Count')
fig.show()


# 'Occupation' vs 'Output', then 'Educational Qualifications' vs 'Output':
# grouped bars, one bar per Output value within each category
for _feature in ('Occupation', 'Educational Qualifications'):
    fig = px.histogram(
        online_food,
        x=_feature,
        color='Output',
        title=f'{_feature} vs Output',
        labels={_feature: _feature, 'Output': 'Output', 'count': 'Count'},
        barmode='group',
    )
    fig.update_layout(xaxis_title=_feature, yaxis_title='Count')
    fig.show()


# 'Family size' vs 'Output': distribution of family size for each Output value
fig = (
    px.box(
        online_food,
        x='Output',
        y='Family size',
        title='Family size vs Output',
        labels={'Output': 'Output', 'Family size': 'Family size'},
    )
    .update_xaxes(title='Output')
    .update_yaxes(title='Family size')
)
fig.show()


# 'Feedback' vs 'Output': grouped bars, one bar per Output value
fig = px.histogram(
    online_food,
    x='Feedback',
    color='Output',
    title='Feedback vs Output',
    labels={'Feedback': 'Feedback', 'Output': 'Output', 'count': 'Count'},
    barmode='group',
).update_layout(xaxis_title='Feedback', yaxis_title='Count')
fig.show()


# Bivariate analysis for 'Monthly Income' vs 'Feedback'
# Monthly Income is a categorical column, so it is mapped to colour and the
# bars show row counts. Passed as y=, the histogram would aggregate the
# category labels instead of counting rows.
fig = px.histogram(online_food, x='Feedback', color='Monthly Income', 
                   title='Monthly Income vs Feedback',
                   labels={'Feedback': 'Feedback', 'Monthly Income': 'Monthly Income', 'count': 'Count'},
                   barmode='group')

# Axis titles match what is plotted: Feedback categories on x, row counts on y
# (they previously read 'Output' and 'Monthly Income').
fig.update_layout(xaxis_title='Feedback', yaxis_title='Count')
fig.show()

MULTIVARIATE ANALYSIS

In [None]:
# Pairwise plots of the frame's variables, with KDE curves on the diagonal
g = sns.pairplot(data=online_food, diag_kind='kde', height=3, aspect=1.5)

# Title placed slightly above the grid (y > 1) so it clears the top row
g.fig.suptitle('Pairplot of Numerical Variables', y=1.02)

# Leave the top 4% of the figure free when tightening the layout
plt.tight_layout(rect=[0, 0, 1, 0.96])

# Render the figure
plt.show()


# Box plot of 'Age' for each 'Marital Status', split by 'Family size'
plt.figure(figsize=(12, 6))
sns.boxplot(data=online_food, x='Age', y='Marital Status', hue='Family size')
plt.xlabel('Age')
plt.ylabel('Marital Status')
plt.title('Age by Marital Status and Family size')
plt.show()


# Two-dimensional histogram of 'Occupation' against 'Feedback', coloured by
# 'Educational Qualifications' (seaborn)
plt.figure(figsize=(10, 6))
sns.histplot(data=online_food, x='Occupation', y='Feedback', hue='Educational Qualifications')
plt.title('Feedback by Occupation and Educational Qualifications')
plt.xlabel('Occupation')
plt.ylabel('Feedback')
# Tilt the tick labels by 45 degrees
plt.xticks(rotation=45)
plt.show()


# The same three columns as a plotly bar chart, with the x tick labels tilted
fig = (
    px.bar(
        online_food,
        x='Occupation',
        y='Feedback',
        color='Educational Qualifications',
        title='Feedback by Occupation and Educational Qualifications',
    )
    .update_layout(xaxis_title='Occupation', yaxis_title='Feedback')
    .update_xaxes(tickangle=45)
)
fig.show()

LABEL ENCODING

In [None]:
# Text-valued columns that are replaced by integer codes
categorical_columns = ['Gender', 'Marital Status', 'Occupation', 'Monthly Income',
                       'Educational Qualifications', 'Output', 'Feedback']

# One fitted encoder per column. Refitting a single shared encoder would
# overwrite its mapping on every iteration, leaving no way to decode the
# earlier columns or to encode new data consistently.
label_encoders = {}

for column in categorical_columns:
    # `label_encoder` is still bound to the last column's encoder after the
    # loop, exactly as before.
    label_encoder = LabelEncoder()
    online_food[column] = label_encoder.fit_transform(online_food[column].values)
    label_encoders[column] = label_encoder

# Display the encoded dataset
online_food

CORRELATION HEATMAP

In [None]:
# Pairwise correlation between the columns of the frame
correlation_matrix = online_food.corr()

# Annotated heatmap of the matrix; each cell shows the value to two decimals
plt.figure(figsize=(10, 10))
sns.heatmap(data=correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

<b> MODELLING </b>

In [None]:
# Select the three input features and the target column
X = online_food[['Age', 'Monthly Income', 'Family size']]
y = online_food['Output']

# Hold out 30% of the rows for evaluation; the raw (unscaled) splits are fed
# to the pipelines, which do their own scaling
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define models
models = {
    "Logistic Regression": LogisticRegression(),
    # random_state pins the forest so the printed metrics are repeatable
    "Random Forest": RandomForestClassifier(random_state=42),
    "Support Vector Machine": SVC()
}

# Create a pipeline for each model. Every pipeline gets its OWN scaler:
# previously the data was scaled by hand and then scaled again by a single
# scaler instance shared (and refit) by all three pipelines.
pipelines = {}
for name, model in models.items():
    pipelines[name] = Pipeline([
        ("scaler", StandardScaler()),
        ("model", model)
    ])

# Train and evaluate models using pipelines
for name, pipeline in pipelines.items():
    pipeline.fit(X_train_raw, y_train)
    y_pred = pipeline.predict(X_test_raw)
    accuracy = accuracy_score(y_test, y_pred)
    classification = classification_report(y_test, y_pred)
    print(f"{name} Metrics:")
    print(f"Accuracy: {accuracy}")
    print(f"Classification Report:\n{classification}\n")

# Scaled copies under the original names, for later cells that use
# X_train / X_test / scaler. The scaler is fit on the training rows only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

CHECK FOR DATA IMBALANCE

In [None]:
# Number of rows for each value of the 'Output' target
target_counts = online_food['Output'].value_counts()

# One bar per class, titled 'Class Distribution'
sns.barplot(x=target_counts.index, y=target_counts.values).set_title('Class Distribution')
plt.show()

In [None]:
# Class counts in the training split before resampling
print('Original dataset shape %s' % Counter(y_train))

# Random over-sampling of the training split only; random_state fixes which rows are drawn.
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
# Class counts after resampling
print('Resampled dataset shape %s' % Counter(y_resampled))

In [None]:
# Random forest configured with class_weight='balanced'.
# It is only constructed in this cell; it is not fitted here.
classifier = RandomForestClassifier(class_weight='balanced')

HYPERPARAMETER TUNING

In [None]:
# Define the model; random_state makes the search repeatable between runs
model = RandomForestClassifier(random_state=42)

# Define the grid of hyperparameters (3 x 3 x 3 = 27 combinations)
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10]
}

# Initialize GridSearchCV: 5-fold cross-validation scored by accuracy
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')

# Search on the training split only. Fitting on the full X, y would let the
# held-out test rows influence the chosen hyperparameters.
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Accuracy of the refitted best model on the rows the search never saw
print("Held-out accuracy:", grid_search.score(X_test, y_test))

#### <b> ONLINE PREDICTION APP </b>

In [None]:
# Flask takes the import name of the application module; `__app.py__` is not
# a defined name and raised NameError as soon as this line ran.
app = Flask(__name__)

# Load the pre-trained model, closing the file once it has been read.
# NOTE: unpickling can execute code stored in the file, so model.pkl must
# come from a trusted source.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

@app.route('/')
def home():
    """Render index.html for requests to the site root."""
    return render_template('index.html')

@app.route('/predict', methods=['POST'])
def predict():
    """Predict from the submitted form and re-render index.html with the result.

    Reads 'age', 'monthly_income' and 'family_size' from the POSTed form,
    converts each to int and passes them to the loaded model as a single row.
    A missing or non-integer field re-renders the page with an explanatory
    message and HTTP status 400 instead of failing with an unhandled error.
    """
    try:
        age = int(request.form['age'])
        monthly_income = int(request.form['monthly_income'])
        family_size = int(request.form['family_size'])
    except (KeyError, ValueError):
        # KeyError: field absent from the form; ValueError: not a whole number.
        return render_template(
            'index.html',
            prediction_text='Please enter whole numbers for age, monthly income and family size.',
        ), 400

    # NOTE(review): monthly_income presumably has to be the same integer code
    # the model was trained on - confirm against the training notebook.
    prediction = model.predict([[age, monthly_income, family_size]])

    # Map prediction to human-readable output
    output = "Satisfied" if prediction[0] == 1 else "Not Satisfied"

    return render_template('index.html', prediction_text='Customer is {}'.format(output))

# Start Flask's built-in server only when this code runs as the main module.
if __name__ == '__main__':
    # NOTE(review): debug=True turns on Flask's debug mode, which is meant
    # for local development only - confirm it is switched off for deployment.
    app.run(debug=True)