In [1]:
from tqdm.auto import tqdm
import seaborn as sns

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.tree import export_text
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.neural_network import BernoulliRBM, MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

from gensim.models import Word2Vec

  from .autonotebook import tqdm as notebook_tqdm


# Load

In [2]:
PATH_DATA = '/Users/nicolapitzalis/Documents/uni-ai/HLT/HLT-Project/dataset/distilbert_results/ner_recipes.csv'
PATH_LABELS = '/Users/nicolapitzalis/Documents/uni-ai/HLT/HLT-Project/dataset/distilbert_results/ner_labels.csv'
PATH_VOCABULARY = '/Users/nicolapitzalis/Documents/uni-ai/HLT/HLT-Project/dataset/distilbert_results/ner_ingredients.csv'
# PATH_RECIPES = '/Users/nicolapitzalis/Documents/uni-ai/HLT/HLT-Project/dataset/recipes_df_r.csv'

data = pd.read_csv(PATH_DATA)
vocabulary = pd.read_csv(PATH_VOCABULARY, header=None)
labels = pd.read_csv(PATH_LABELS)

# Preprocessing

In [3]:
vocabulary = vocabulary.fillna('Missing')

In [7]:
labels = labels['Vegetarian&Desserts'].to_numpy()

In [8]:
data_list = data.values.tolist()

In [92]:
# Train a Word2Vec model
model = Word2Vec(data_list, vector_size=768, window=20, min_count=1, workers=4)

In [93]:
model.wv.vectors.shape

(5984, 768)

In [94]:
embeddings = []

# Loop over each recipe in the data
for index, recipe in data.iterrows():
    embedding_recipe = np.zeros(model.wv.vectors.shape[1])
    
    for ingredient in recipe:
        if pd.isnull(ingredient):
            continue
        
        if ingredient in model.wv.key_to_index:
            embedding_recipe += model.wv[ingredient]
        
    embedding_recipe = embedding_recipe / recipe.dropna().shape[0]
    embeddings.append(embedding_recipe)

embeddings = np.array(embeddings)

In [95]:
embeddings.shape

(9770, 768)

In [96]:
X_train, X_test, y_train, y_test = train_test_split(embeddings, labels, test_size=0.2, random_state=42, stratify=labels)

In [97]:
unique, counts = np.unique(labels, return_counts=True)
print(dict(zip(unique, counts)))

{0: 4837, 1: 4933}


In [98]:
unique, counts = np.unique(y_train, return_counts=True)
print(dict(zip(unique, counts)))

{0: 3870, 1: 3946}


In [99]:
unique, counts = np.unique(y_test, return_counts=True)
print(dict(zip(unique, counts)))

{0: 967, 1: 987}


In [101]:
random_forest = RandomForestClassifier(criterion='gini', n_estimators=300, random_state=42, max_depth=None, min_samples_split=2, min_samples_leaf=1, n_jobs=-1)
random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.73      0.83      0.78       967
           1       0.81      0.71      0.75       987

    accuracy                           0.77      1954
   macro avg       0.77      0.77      0.77      1954
weighted avg       0.77      0.77      0.77      1954



In [105]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

mlp = MLPClassifier(hidden_layer_sizes=(4096), max_iter=300, alpha=0, activation='tanh',
                    solver='sgd', verbose=10, random_state=21, learning_rate_init=0.01)

mlp.fit(X_train, y_train)
y_pred = mlp.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

Iteration 1, loss = 0.65534707
Iteration 2, loss = 0.55600702
Iteration 3, loss = 0.55159904
Iteration 4, loss = 0.54054722
Iteration 5, loss = 0.54172864
Iteration 6, loss = 0.53382748
Iteration 7, loss = 0.53220847
Iteration 8, loss = 0.53488956
Iteration 9, loss = 0.52752857
Iteration 10, loss = 0.53359101
Iteration 11, loss = 0.52205374
Iteration 12, loss = 0.52581901
Iteration 13, loss = 0.52227006
Iteration 14, loss = 0.52749048
Iteration 15, loss = 0.52972348
Iteration 16, loss = 0.52328026
Iteration 17, loss = 0.52056665
Iteration 18, loss = 0.52993007
Iteration 19, loss = 0.52168191
Iteration 20, loss = 0.52316473
Iteration 21, loss = 0.51687956
Iteration 22, loss = 0.51711567
Iteration 23, loss = 0.51669433
Iteration 24, loss = 0.51513952
Iteration 25, loss = 0.51253431
Iteration 26, loss = 0.51664358
Iteration 27, loss = 0.51319170
Iteration 28, loss = 0.51358868
Iteration 29, loss = 0.51415516
Iteration 30, loss = 0.51193500
Iteration 31, loss = 0.51826869
Iteration 32, los



Accuracy: 0.7318321392016377
Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.77      0.74       967
           1       0.75      0.70      0.72       987

    accuracy                           0.73      1954
   macro avg       0.73      0.73      0.73      1954
weighted avg       0.73      0.73      0.73      1954



In [None]:
logreg = LogisticRegression(max_iter=200)
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))