In [278]:
# Installing requirements
# %pip install -r requirements.txt

In [279]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pprint import pprint

%matplotlib inline
sns.set_style("whitegrid")

In [280]:
df = pd.read_csv("recipe_site_traffic_2212.csv")

In [281]:
# Size of dataset
df.shape

(947, 8)

In [282]:
df.head()

Unnamed: 0,recipe,calories,carbohydrate,sugar,protein,category,servings,high_traffic
0,1,,,,,Pork,6,High
1,2,35.48,38.56,0.66,0.92,Potato,4,High
2,3,914.28,42.68,3.09,2.88,Breakfast,1,
3,4,97.03,30.56,38.63,0.02,Beverages,4,High
4,5,27.05,1.85,0.8,0.53,Beverages,4,


# Removing identifier column


In [283]:
df.drop(columns=["recipe"], inplace=True)

# Checking data types


In [284]:
# Checking data types
df.dtypes

calories        float64
carbohydrate    float64
sugar           float64
protein         float64
category         object
servings         object
high_traffic     object
dtype: object

In [285]:
# Replace null with "0"
df["high_traffic"] = df["high_traffic"].fillna("0")
# Replace "High" with "1"
df["high_traffic"] = df["high_traffic"].replace("High", "1")
# COnvert to int
df["high_traffic"] = pd.to_numeric(df["high_traffic"]).astype(int)

In [286]:
df.head()

Unnamed: 0,calories,carbohydrate,sugar,protein,category,servings,high_traffic
0,,,,,Pork,6,1
1,35.48,38.56,0.66,0.92,Potato,4,1
2,914.28,42.68,3.09,2.88,Breakfast,1,0
3,97.03,30.56,38.63,0.02,Beverages,4,1
4,27.05,1.85,0.8,0.53,Beverages,4,0


# Handling missing values


In [287]:
# Percentage of missing values
df.isna().sum() / len(df) * 100

calories        5.491024
carbohydrate    5.491024
sugar           5.491024
protein         5.491024
category        0.000000
servings        0.000000
high_traffic    0.000000
dtype: float64

In [288]:
df[df["calories"].isna()].head()

Unnamed: 0,calories,carbohydrate,sugar,protein,category,servings,high_traffic
0,,,,,Pork,6,1
23,,,,,Meat,2,0
48,,,,,Chicken Breast,4,0
82,,,,,Meat,4,1
89,,,,,Pork,6,1


It appears that `calories`, `carbohydrate`, `sugar`, `protein` always miss together

My strategy to fill in null values

1. Split rows based on `category`
2. Plot distributions of columns above to determine what measure of central tendency will be used to impute i.e mean, mode, median.

-   Plotting is to check for _skew_ and _modality_

3. Impute accordingly


In [289]:
columns_with_missing_values = ["calories", "carbohydrate", "sugar", "protein"]

# for column in columns_with_missing_values:
#     sns.kdeplot(x=df[column], hue=df["category"], fill=True)
#     plt.title(f"Distribution of {column} per food category")
#     plt.show()

Data is left skewed for all food categories, the median shall be used for imputation


In [290]:
# df.dropna(inplace=True)
for column in columns_with_missing_values:
    df[column] = df[column].fillna(df[column].median())

# Handling duplicate values


In [291]:
df.duplicated().sum()

np.int64(23)

There are duplicate values


In [292]:
df.drop_duplicates(inplace=True, keep="first")

In [293]:
df.duplicated().sum()

np.int64(0)

Duplicate values removed


# Checking class imbalance


In [294]:
df["high_traffic"].value_counts(normalize=True).round(3)

high_traffic
1    0.601
0    0.399
Name: proportion, dtype: float64

Data is fairly balanced


# Check distributions of categorical columns


In [295]:
categorical_columns = df.select_dtypes(include=["object"]).columns

# categorical_columns_df = df.loc[:, categorical_columns]

# categorical_columns_df.loc[:, "high_traffic"] = df.loc[:, "high_traffic"]

# for column in categorical_columns:
#     fig, axes = plt.subplots(1, 2, figsize=(16, 6))

#     sns.countplot(ax=axes[0], x=categorical_columns_df[column], stat="percent")
#     axes[0].set_title(f"Count of {column}")
#     axes[0].tick_params(axis="x", labelrotation=90)

#     sns.countplot(
#         ax=axes[1],
#         x=categorical_columns_df[column],
#         hue=df["high_traffic"],
#         stat="percent",
#     )

#     axes[1].set_title(f"Count of {column} per high traffic")
#     axes[1].tick_params(axis="x", labelrotation=90)

#     plt.tight_layout()
#     plt.show()

# Checking distributions of numerical columns


In [296]:
numerical_columns = df.select_dtypes(include=["float64"]).columns
# print(numerical_columns)
# numerical_columns_df = df.loc[:, numerical_columns]
# numerical_columns_df.loc[:, "high_traffic"] = df.loc[:, "high_traffic"]

# for column in numerical_columns:
#     fig, axes = plt.subplots(1, 2, figsize=(16, 6))

#     sns.histplot(ax=axes[0], x=numerical_columns_df[column], kde=True, stat="percent")
#     skew = numerical_columns_df[column].skew()
#     axes[0].set_title(f"Percent of {column}")
#     axes[0].tick_params(axis="x", labelrotation=90)
#     axes[0].text(
#         0.5, 0.5, f"Skew: {skew:.3f}", transform=axes[0].transAxes, fontsize=12
#     )
#     sns.histplot(
#         ax=axes[1],
#         x=numerical_columns_df[column],
#         hue=df["high_traffic"],
#         kde=True,
#         stat="percent"
#     )

#     axes[1].set_title(f"Percent of {column} per high traffic")
#     axes[1].tick_params(axis="x", labelrotation=90)
#     axes[1].text(
#         0.5, 0.5, f"Skew: {skew:.3f}", transform=axes[1].transAxes, fontsize=12
#     )

#     plt.tight_layout()
#     plt.show()

# Encoding categorical columns


In [297]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

for column in categorical_columns:
    df[f"{column}_encoded"] = label_encoder.fit_transform(df[column])

df = df.drop(columns=categorical_columns)

In [298]:
# scatter_columns = df.columns.to_list()
# scatter_columns.remove("high_traffic")
# scatter_columns.remove("category_encoded")

# for column in scatter_columns:
#     sns.scatterplot(x=df[column], y=df["category_encoded"], hue=df["high_traffic"], alpha=0.5)
#     plt.show()

# Transforming numerical columns


In [299]:
from scipy.stats import boxcox, yeojohnson
from sklearn.preprocessing import QuantileTransformer

numerical_transformation_df = pd.DataFrame()

for column in numerical_columns:
    numerical_transformation_df[f"log_{column}"] = np.log1p(df[column])
    numerical_transformation_df[f"sqrt_{column}"] = np.sqrt(df[column])
    if df[column].min() > 0:
        numerical_transformation_df[f"boxcox_{column}"], _ = boxcox(df[column])
    else:
        print(f"Boxcox transformation not possible for {column}")
    numerical_transformation_df[f"yeojohnson_{column}"], _ = yeojohnson(df[column])

    qt = QuantileTransformer(
        output_distribution="normal", random_state=42, n_quantiles=100
    )
    numerical_transformation_df[f"quantile_{column}"] = qt.fit_transform(
        df[column].values.reshape(-1, 1)
    )


# for column in numerical_transformation_df.columns:
#     sns.histplot(x=numerical_transformation_df[column], kde=True)

#     plt.show()

Boxcox transformation not possible for protein


In [300]:
from scipy.stats import kstest

ks_test_results = dict()
for transformation in numerical_transformation_df.columns:
    standardized_data = (
        numerical_transformation_df[transformation]
        - numerical_transformation_df[transformation].mean()
    ) / numerical_transformation_df[transformation].std()
    ks_stat, ks_p_value = kstest(standardized_data, "norm")
    ks_test_results[transformation] = (ks_stat, ks_p_value)


ks_test_results_df = pd.DataFrame.from_dict(
    ks_test_results, orient="index", columns=["KS Statistic", "P-Value"]
)
print(ks_test_results_df.round(5))

                         KS Statistic  P-Value
log_calories                  0.08768  0.00000
sqrt_calories                 0.07177  0.00014
boxcox_calories               0.02429  0.63763
yeojohnson_calories           0.02606  0.54799
quantile_calories             0.01804  0.91888
log_carbohydrate              0.05374  0.00926
sqrt_carbohydrate             0.08480  0.00000
boxcox_carbohydrate           0.02467  0.61845
yeojohnson_carbohydrate       0.03220  0.28737
quantile_carbohydrate         0.01789  0.92363
log_sugar                     0.03890  0.11887
sqrt_sugar                    0.12115  0.00000
boxcox_sugar                  0.03424  0.22359
yeojohnson_sugar              0.04264  0.06752
quantile_sugar                0.01915  0.88050
log_protein                   0.04863  0.02447
sqrt_protein                  0.10724  0.00000
yeojohnson_protein            0.04827  0.02611
quantile_protein              0.01904  0.88463


In [301]:
transformed_numerical_columns = numerical_transformation_df.filter(like="quantile")

df = pd.concat([df, transformed_numerical_columns], axis=1)
df = df.drop(columns=numerical_columns)
df.head()

Unnamed: 0,high_traffic,category_encoded,servings_encoded,quantile_calories,quantile_carbohydrate,quantile_sugar,quantile_protein
0,1,8,4,0.0,0.0,0.0,0.0
1,1,9,2,-1.381214,0.545544,-1.271127,-1.168949
2,0,1,0,1.161859,0.656545,-0.281338,-0.744574
3,1,0,2,-0.775762,0.321109,1.682867,-2.471148
4,0,0,2,-1.521998,-1.563081,-1.153283,-1.388335


# Scaling numerical columns


In [302]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

scaler = RobustScaler()
numerical_columns = transformed_numerical_columns.columns.to_list()
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])
print(df[df["high_traffic"].isna()])

Empty DataFrame
Columns: [high_traffic, category_encoded, servings_encoded, quantile_calories, quantile_carbohydrate, quantile_sugar, quantile_protein]
Index: []


# Handling redundant features

In [303]:
correlation_matrix_columns = df.columns.to_list()
correlation_matrix_columns.remove("high_traffic")

df[correlation_matrix_columns].corr()

Unnamed: 0,category_encoded,servings_encoded,quantile_calories,quantile_carbohydrate,quantile_sugar,quantile_protein
category_encoded,1.0,0.052497,0.08902,0.085571,-0.135607,0.135089
servings_encoded,0.052497,1.0,-0.025021,-0.038433,-0.017345,-0.028647
quantile_calories,0.08902,-0.025021,1.0,-0.03161,-0.074115,0.19601
quantile_carbohydrate,0.085571,-0.038433,-0.03161,1.0,0.014505,0.032585
quantile_sugar,-0.135607,-0.017345,-0.074115,0.014505,1.0,-0.084268
quantile_protein,0.135089,-0.028647,0.19601,0.032585,-0.084268,1.0


No features are redundant

# Feature Selection


In [304]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold

X = df.drop(columns=["high_traffic"])
y = df["high_traffic"]

rf_classifier = RandomForestClassifier(
    criterion="entropy",
    min_samples_split=40,
    random_state=42,
    n_jobs=-1,
    ccp_alpha=0.015,
)

rfe = RFECV(
    estimator=rf_classifier,
    cv=StratifiedKFold(5),
    scoring="accuracy",
    n_jobs=-1,
    # min_features_to_select=3,
)
rfe.fit(X, y)

print("RFE done...")
print()
# Get the optimal number of features
print("Optimal number of features: %d" % rfe.n_features_)
print()

selected_features = []
print("Selected features")
print("=================")
for i, col in zip(range(X.shape[1]), X.columns):
    if rfe.support_[i]:
        selected_features.append(col)
        print(f"Column: {col}, Rank: {rfe.ranking_[i]}, Selected: {rfe.support_[i]}")

print()
print("Not selected features")
print("=====================")
for i, col in zip(range(X.shape[1]), X.columns):
    if not rfe.support_[i]:
        print(f"Column: {col}, Rank: {rfe.ranking_[i]}, Selected: {rfe.support_[i]}")

X = X[selected_features]

RFE done...

Optimal number of features: 3

Selected features
Column: category_encoded, Rank: 1, Selected: True
Column: quantile_calories, Rank: 1, Selected: True
Column: quantile_protein, Rank: 1, Selected: True

Not selected features
Column: servings_encoded, Rank: 4, Selected: False
Column: quantile_carbohydrate, Rank: 2, Selected: False
Column: quantile_sugar, Rank: 3, Selected: False


Only `category_encoded` is chosen.


In [305]:
from sklearn.decomposition import PCA

# Apply PCA
pca = PCA(n_components=2, random_state=42)
pca_transformed = pca.fit_transform(df[numerical_columns])
pca_columns = [f"pca_{i}" for i in range(pca_transformed.shape[1])]
df_pca = pd.DataFrame(pca_transformed, columns=pca_columns, index=df.index)

# Combine the PCA-transformed columns with the original DataFrame
df = pd.concat([df.drop(columns=numerical_columns), df_pca], axis=1)
X = df.drop(columns=["high_traffic"])
y = df["high_traffic"]

# Training and Evaluation Split


In [306]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Model Training


In [307]:
import graphviz
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, Lasso
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    root_mean_squared_error,
)
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import export_graphviz

classifiers = {
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "RandomForest": rf_classifier,
    "LogisticRegression": LogisticRegression(solver="liblinear", random_state=42),
    "SVC": SVC(random_state=42),
    "KNeighbours": KNeighborsClassifier(),
    "NaiveBayes": BernoulliNB(),
}

param_grids = {
    "AdaBoost": {},
    "RandomForest": {
        "n_estimators": [100, 200, 300],  # Number of trees in the forest
        "max_depth": [None, 5, 6],  # Maximum depth of the trees
    },
    "LogisticRegression": {
        "penalty": ["l1", "l2"],
        "C": [0.01, 0.1, 1, 10, 100],
    },
    "SVC": {
        "C": [0.1, 1, 10, 100],
        "gamma": [1, 0.1, 0.01, 0.001],
        "kernel": ["rbf", "linear"],
    },
    "KNeighbours": {
        "n_neighbors": [3, 5, 11, 19],
        "weights": ["uniform", "distance"],
        "metric": ["euclidean", "manhattan"],
    },
    "NaiveBayes": {},
}

best_estimators = dict()

for classifier_name in classifiers.keys():
    estimator = classifiers[classifier_name]
    param_grid = param_grids[classifier_name]

    search = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=5, n_jobs=-1)
    search.fit(X=X_train, y=y_train)

    print(f"Best parameters found for {classifier_name}:")
    print("-" * 20)
    pprint(search.best_params_)
    print()

    best_estimators[classifier_name] = search.best_estimator_


# Best parameters found
for estimator_name, estimator in best_estimators.items():
    print(f"Evaluation: for {estimator_name}")
    print("=" * 50)

    y_pred = estimator.predict(X_test)

    print()
    # Classification report
    print("Classification Report:")
    print("-" * 20)
    print(classification_report(y_true=y_test, y_pred=y_pred))
    print()

    print("Confusion Matrix")
    print("-" * 20)
    print(confusion_matrix(y_true=y_test, y_pred=y_pred))
    print()

    print("Root Mean Squared Error")
    print("-" * 20)
    print(round(root_mean_squared_error(y_true=y_test, y_pred=y_pred), 2))
    print()

    if estimator_name == "RandomForest":
        best_tree = estimator.estimators_[0]
        dot_data = export_graphviz(
            best_tree,
            out_file=None,
            feature_names=X.columns,
            class_names=list(map(str, y.unique().tolist())),
            filled=True,
            rounded=True,
            special_characters=True,
        )
        graph = graphviz.Source(dot_data)
        graph.render("best_tree")
        graph.view()



Best parameters found for AdaBoost:
--------------------
{}

Best parameters found for RandomForest:
--------------------
{'max_depth': 5, 'n_estimators': 100}

Best parameters found for LogisticRegression:
--------------------
{'C': 1, 'penalty': 'l2'}

Best parameters found for SVC:
--------------------
{'C': 0.1, 'gamma': 0.1, 'kernel': 'rbf'}

Best parameters found for KNeighbours:
--------------------
{'metric': 'manhattan', 'n_neighbors': 19, 'weights': 'uniform'}

Best parameters found for NaiveBayes:
--------------------
{}

Evaluation: for AdaBoost

Classification Report:
--------------------
              precision    recall  f1-score   support

           0       0.79      0.74      0.76        74
           1       0.83      0.86      0.85       111

    accuracy                           0.82       185
   macro avg       0.81      0.80      0.81       185
weighted avg       0.82      0.82      0.82       185


Confusion Matrix
--------------------
[[55 19]
 [15 96]]

Root 

# Decision Tree drawing


In [308]:
# import graphviz
# from sklearn.tree import export_graphviz

# best_tree = best_rf.estimators_[0]
# dot_data = export_graphviz(
#     best_tree,
#     out_file=None,
#     feature_names=X.columns,
#     class_names=list(map(str, y.unique().tolist())),
#     filled=True,
#     rounded=True,
#     special_characters=True,
# )
# graph = graphviz.Source(dot_data)
# graph.render("best_tree")
# graph.view()