In [13]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
from tqdm.auto import tqdm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import torch

from sklearn.ensemble import RandomForestClassifier  # or RandomForestRegressor for regression tasks
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score  # or an appropriate metric for your task
from sklearn.preprocessing import LabelEncoder

In [73]:
PATH_DATA = '/Users/nicolapitzalis/Documents/uni-ai/HLT/HLT-Project/dataset/ingredients_list_5k_cleaned.csv'
PATH_LABELS = '/Users/nicolapitzalis/Documents/uni-ai/HLT/HLT-Project/dataset/labels_5k.csv'
PATH_VOCABULARY = '/Users/nicolapitzalis/Documents/uni-ai/HLT/HLT-Project/dataset/ingredients_v_5k.csv'

data = pd.read_csv(PATH_DATA)
vocabulary = pd.read_csv(PATH_VOCABULARY)
labels = pd.read_csv(PATH_LABELS)

In [74]:
vocabulary = vocabulary.fillna('Missing')
ohe_vocabulary = pd.get_dummies(vocabulary, prefix='category').astype(int)  # Replace '0' with the actual column name if it exists

In [107]:
# Initialize an empty list to hold the one-hot encoded recipes
ohe_matrix = []

# Loop over each recipe in the data
for index, recipe in data.iterrows():
    # Initialize a one-hot encoded array for the recipe
    ohe_recipe = np.zeros(len(ohe_vocabulary.columns))
    
    # Loop over each ingredient in the recipe
    for ingredient in recipe:
        if pd.isnull(ingredient):
            continue
        
        # Check if the ingredient is in the one-hot vocabulary
        if f'category_{ingredient}' in ohe_vocabulary.columns:
            # Find the index for the ingredient
            ingredient_index = ohe_vocabulary.columns.get_loc(f'category_{ingredient}')
            
            # Set the corresponding position in ohe_recipe to 1
            ohe_recipe[ingredient_index] = 1

    # Append the one-hot encoded recipe to the matrix
    ohe_matrix.append(ohe_recipe)

# Convert the list of arrays into a 2D NumPy array
ohe_matrix = np.array(ohe_matrix)

In [76]:
labels = labels['Vegetarian&Desserts'].to_numpy()

In [78]:
# Example using a hypothetical dataset
X_train, X_test, y_train, y_test = train_test_split(ohe_matrix, labels, test_size=0.3, random_state=42)

param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees
    'max_depth': [None, 10, 20, 30],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4]  # Minimum number of samples required to be at a leaf node
}

rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 3.0min
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 3.1min
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 3.1min
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 3.1min
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 3.1min
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 6.0min
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 6.0min
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 6.0min
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 6.0min
[CV] END m

In [124]:
X_train, X_test, y_train, y_test = train_test_split(ohe_matrix, labels, test_size=0.3, random_state=42)

random_forest = RandomForestClassifier(n_estimators=100, random_state=42, max_depth=256, min_samples_split=2, min_samples_leaf=2)
random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.7193333333333334


In [118]:
from sklearn.tree import export_text

# Loop through each tree in the forest
for i, tree in enumerate(random_forest.estimators_):
    # Export the decision rules
    tree_rules = export_text(tree, feature_names=list(ohe_vocabulary.columns))
    
    # Print the rules for each tree
    print(f"Rules for tree {i}:\n")
    print(tree_rules)
    print("\n")

Rules for tree 0:

|--- category_birds garden peas <= 0.50
|   |--- category_ground black pepper <= 0.50
|   |   |--- category_soy sauce <= 0.50
|   |   |   |--- category_varti cheese <= 0.50
|   |   |   |   |--- category_chicken stock <= 0.50
|   |   |   |   |   |--- category_baby asparagus <= 0.50
|   |   |   |   |   |   |--- class: 1.0
|   |   |   |   |   |--- category_baby asparagus >  0.50
|   |   |   |   |   |   |--- class: 0.0
|   |   |   |   |--- category_chicken stock >  0.50
|   |   |   |   |   |--- category_red wine <= 0.50
|   |   |   |   |   |   |--- class: 0.0
|   |   |   |   |   |--- category_red wine >  0.50
|   |   |   |   |   |   |--- class: 0.0
|   |   |   |--- category_varti cheese >  0.50
|   |   |   |   |--- class: 0.0
|   |   |--- category_soy sauce >  0.50
|   |   |   |--- category_apple sliced <= 0.50
|   |   |   |   |--- category_minced chives <= 0.50
|   |   |   |   |   |--- category_sher salt <= 0.50
|   |   |   |   |   |   |--- class: 0.0
|   |   |   |   | 

In [111]:
from sklearn.tree import export_text

# Loop through each tree in the forest
for i, tree in enumerate(best_rf.estimators_):
    # Export the decision rules
    tree_rules = export_text(tree, feature_names=list(ohe_vocabulary.columns))
    
    # Print the rules for each tree
    print(f"Rules for tree {i}:\n")
    print(tree_rules)
    print("\n")

Rules for tree 0:

|--- class: 1.0



Rules for tree 1:

|--- class: 1.0



Rules for tree 2:

|--- class: 1.0



Rules for tree 3:

|--- class: 1.0



Rules for tree 4:

|--- class: 1.0



Rules for tree 5:

|--- class: 1.0



Rules for tree 6:

|--- class: 1.0



Rules for tree 7:

|--- class: 1.0



Rules for tree 8:

|--- class: 1.0



Rules for tree 9:

|--- class: 1.0



Rules for tree 10:

|--- class: 1.0



Rules for tree 11:

|--- class: 1.0



Rules for tree 12:

|--- class: 1.0



Rules for tree 13:

|--- class: 1.0



Rules for tree 14:

|--- class: 1.0



Rules for tree 15:

|--- class: 1.0



Rules for tree 16:

|--- class: 1.0



Rules for tree 17:

|--- class: 1.0



Rules for tree 18:

|--- class: 1.0



Rules for tree 19:

|--- class: 1.0



Rules for tree 20:

|--- class: 1.0



Rules for tree 21:

|--- class: 1.0



Rules for tree 22:

|--- class: 1.0



Rules for tree 23:

|--- class: 1.0



Rules for tree 24:

|--- class: 1.0



Rules for tree 25:

|--- class: 1.0

In [108]:
np.unique(ohe_matrix)

array([0., 1.])

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import BernoulliRBM
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Binarizer
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(ohe_matrix, labels, test_size=0.2, random_state=42, stratify=labels)

# Setup the RBM with a Logistic Regression classifier in a pipeline
rbm = BernoulliRBM(random_state=42, verbose=True)
logistic = LogisticRegression(max_iter=1000)

classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])

# Grid search parameters
param_grid = {
    'rbm__learning_rate': [0.01, 0.1, 0.5],
    'rbm__n_iter': [20, 40, 80],
    'rbm__n_components': [50, 100, 200]  # number of hidden units
}

# Using GridSearchCV to find the best parameters
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(classifier, param_grid, n_jobs=-1, cv=3)
grid_search.fit(X_train, y_train)

# Best parameters and scores
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

# Evaluate the model
y_pred = grid_search.predict(X_test)
print(classification_report(y_test, y_pred))


In [123]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import BernoulliRBM
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Binarizer
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Assuming ohe_matrix and labels are your data and labels loaded previously

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(ohe_matrix, labels, test_size=0.2, random_state=42, stratify=labels)

# Setup the RBM with specified hyperparameters and Logistic Regression classifier in a pipeline
rbm = BernoulliRBM(n_components=100, learning_rate=0.1, n_iter=40, random_state=42, verbose=True)
logistic = LogisticRegression(max_iter=1000)

classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])

# Train the classifier
classifier.fit(X_train, y_train)

# Evaluate the classifier
y_pred = classifier.predict(X_test)
print(classification_report(y_test, y_pred))


[BernoulliRBM] Iteration 1, pseudo-likelihood = -81.88, time = 7.75s
[BernoulliRBM] Iteration 2, pseudo-likelihood = -86.19, time = 7.58s
[BernoulliRBM] Iteration 3, pseudo-likelihood = -84.68, time = 7.66s
[BernoulliRBM] Iteration 4, pseudo-likelihood = -118.27, time = 7.90s
[BernoulliRBM] Iteration 5, pseudo-likelihood = -92.78, time = 8.39s
[BernoulliRBM] Iteration 6, pseudo-likelihood = -88.60, time = 8.72s
[BernoulliRBM] Iteration 7, pseudo-likelihood = -80.56, time = 8.54s
[BernoulliRBM] Iteration 8, pseudo-likelihood = -67.89, time = 8.47s
[BernoulliRBM] Iteration 9, pseudo-likelihood = -79.63, time = 9.11s
[BernoulliRBM] Iteration 10, pseudo-likelihood = -82.80, time = 8.89s
[BernoulliRBM] Iteration 11, pseudo-likelihood = -94.19, time = 8.80s
[BernoulliRBM] Iteration 12, pseudo-likelihood = -78.43, time = 8.24s
[BernoulliRBM] Iteration 13, pseudo-likelihood = -74.51, time = 8.17s
[BernoulliRBM] Iteration 14, pseudo-likelihood = -84.17, time = 7.87s
[BernoulliRBM] Iteration 15,

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
