##### **Disclaimer: We use some advanced packages here without detailed explanation. You can use these, but we do not provide any support.**

In [None]:
# To install them, you can uncomment the following lines:
# (%pip will call pip from the currently active python environment)

# Note: Some of these packages are still not compatible with Python 3.12 yet
# %pip install sweetviz
# %pip install ydata_profiling
# %pip install shap

## CRISP-DM

In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# Note: The following do not work with Python 3.12
#import shap
#from ydata_profiling import ProfileReport
#import sweetviz as sv

#### Reproducibility 

A best practice in data analytics projects is to work with *seeds* to ensure the reproducability of results. 
This is especially important in the Analytics Cup, since the rules require you to write a self-contained
script that produces reproducable results. 

To achieve this, we can set seeds for all used random number generators.

In [3]:
seed = 2024

# pandas, statsmodels, matplotlib and y_data_profiling rely on numpy's random generator, and thus, we need to set the seed in numpy
np.random.seed(seed)

### Phase 1: Business Understanding

Serves to assess use cases, feasibility, requirements, and
risks of the endeavored data driven project.

### Phase 2: Data Understanding

Assess the data quality and content.

In [31]:
# load the data
diet = pd.read_csv("diet.csv")
recipes = pd.read_csv("recipes.csv")
requests = pd.read_csv("requests.csv")
reviews = pd.read_csv("reviews.csv")

  reviews = pd.read_csv("reviews.csv")


have a look at the data and its attributes \
check if columns are properly named \
general overview over data, check for missing values, etc.

#### Diet

In [None]:
diet.head()

In [None]:
diet.info()
# To do: ver qual o valor nulo que tem na coluna Diet

#### Recipes

In [5]:
recipes.head()

Unnamed: 0,RecipeId,Name,CookTime,PrepTime,RecipeCategory,RecipeIngredientQuantities,RecipeIngredientParts,Calories,FatContent,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeServings,RecipeYield
0,73440,Bow Ties With Broccoli Pesto,0,1800,Other,"c(""\""6\"""", ""\""2\"""", ""\""1 1/2\"""", ""\""1/4\"""", ""\...","c(""\""hazelnuts\"""", ""\""broccoli florets\"""", ""\""...",241.3,10.1,1.2,0.0,13.1,31.8,2.3,1.4,6.7,9.0,
1,365718,Cashew-chutney Rice,3600,600,Other,"c(""\""1\"""", ""\""3/4\"""", ""\""6\"""", ""\""5\"""", ""\""2\""...","c(""\""celery\"""", ""\""onion\"""", ""\""butter\"""", ""\""...",370.8,17.5,7.2,22.9,553.3,44.3,1.6,2.2,9.4,8.0,
2,141757,Copycat Taco Bell Nacho Fries BellGrande,3600,2700,Other,"c(""\""3\"""", ""\""1/2\"""", ""\""1\"""", ""\""1\"""", ""\""3\""...","c(""\""Copycat Taco Bell Seasoned Beef\"""", ""\""ye...",377.6,20.9,10.5,45.7,1501.8,36.6,3.8,6.1,12.9,8.0,
3,280351,Slow Cooker Jalapeno Cheddar Cheese Soup,18000,1800,Other,"c(""\""2\"""", ""\""1\"""", ""\""2\"""", ""\""2\"""", ""\""1\"""",...","c(""\""unsalted butter\"""", ""\""yellow onion\"""", ""...",282.8,16.5,10.3,50.5,630.2,22.8,2.3,2.7,11.7,6.0,
4,180505,Cool & Crisp Citrus Chiffon Pie,3600,1800,Other,"c(""\""1\"""", ""\""1/4\"""", ""\""1/2\"""", ""\""1/2\"""", ""\...","c(""\""unflavored gelatin\"""", ""\""water\"""", ""\""su...",257.5,8.6,2.4,110.7,160.9,39.8,0.4,30.2,6.3,6.0,


#### Recipes table pre-processing

In [33]:
# Change types of column
def refactorIngredients(ingredients):
    if ingredients == "character(0)":
        return []
    ingredients = ingredients.replace("\\", '').replace("\"", '').replace('c(','').replace(')', '')
    ingredients = ingredients.split(",")
    return ingredients

recipes["RecipeIngredientQuantities"] = recipes["RecipeIngredientQuantities"].apply(lambda x: refactorIngredients(x))
recipes["RecipeIngredientParts"] = recipes["RecipeIngredientParts"].apply(lambda x: refactorIngredients(x))

recipes.head()

Unnamed: 0,RecipeId,Name,CookTime,PrepTime,RecipeCategory,RecipeIngredientQuantities,RecipeIngredientParts,Calories,FatContent,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeServings,RecipeYield,isVegetarian
0,73440,Bow Ties With Broccoli Pesto,0,1800,Other,"[6, 2, 1 1/2, 1/4, 1/2, 4, 1 1/2, 1 1/2...","[hazelnuts, broccoli florets, fresh parsley ...",241.3,10.1,1.2,0.0,13.1,31.8,2.3,1.4,6.7,9.0,,True
1,365718,Cashew-chutney Rice,3600,600,Other,"[1, 3/4, 6, 5, 2, 1, 2]","[celery, onion, butter, chicken broth, lon...",370.8,17.5,7.2,22.9,553.3,44.3,1.6,2.2,9.4,8.0,,True
2,141757,Copycat Taco Bell Nacho Fries BellGrande,3600,2700,Other,"[3, 1/2, 1, 1, 3, 2, 1, 2 1/2, 2, 1, ...","[Copycat Taco Bell Seasoned Beef, yellow onio...",377.6,20.9,10.5,45.7,1501.8,36.6,3.8,6.1,12.9,8.0,,True
3,280351,Slow Cooker Jalapeno Cheddar Cheese Soup,18000,1800,Other,"[2, 1, 2, 2, 1, 1, 1/8, 1/4, 1, 4, 3...","[unsalted butter, yellow onion, carrots, ga...",282.8,16.5,10.3,50.5,630.2,22.8,2.3,2.7,11.7,6.0,,True
4,180505,Cool & Crisp Citrus Chiffon Pie,3600,1800,Other,"[1, 1/4, 1/2, 1/2, 1, 1/2, 4, 4, 1/2, ...","[unflavored gelatin, water, sugar, lemon, ...",257.5,8.6,2.4,110.7,160.9,39.8,0.4,30.2,6.3,6.0,,True


In [None]:
recipes.info()

In [35]:
# Add column to know if recipe is veggie
def isVegetarian(ingredients):
    meat_derivates = ["pork", "beef", "meat", "fish", "tuna", "chicken", "squid", "schrimp", "trout", "mussels", 
                      "fillet", "lamb", "scallops", "sardine", "salmon", "lobster", "steak"]
    for ingredient in ingredients:
        if any(word in ingredient.lower() for word in meat_derivates):
            return False
    return True
recipes["isVegetarian"] = recipes["RecipeIngredientParts"].apply(lambda x: isVegetarian(x))

# Create another table "recipe extra info" columns category, ingredient quatities, parts
selected_columns = ['RecipeCategory', 'RecipeIngredientQuantities', 'RecipeIngredientParts', 'RecipeServings', 'RecipeYield']
recipe_extra_info = recipes[selected_columns]
recipes = recipes.drop(columns=selected_columns)

recipes


Unnamed: 0,RecipeId,Name,CookTime,PrepTime,Calories,FatContent,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,isVegetarian
0,73440,Bow Ties With Broccoli Pesto,0,1800,241.3,10.1,1.2,0.0,13.1,31.8,2.3,1.4,6.7,True
1,365718,Cashew-chutney Rice,3600,600,370.8,17.5,7.2,22.9,553.3,44.3,1.6,2.2,9.4,False
2,141757,Copycat Taco Bell Nacho Fries BellGrande,3600,2700,377.6,20.9,10.5,45.7,1501.8,36.6,3.8,6.1,12.9,False
3,280351,Slow Cooker Jalapeno Cheddar Cheese Soup,18000,1800,282.8,16.5,10.3,50.5,630.2,22.8,2.3,2.7,11.7,False
4,180505,Cool & Crisp Citrus Chiffon Pie,3600,1800,257.5,8.6,2.4,110.7,160.9,39.8,0.4,30.2,6.3,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75599,253577,Frijoles Negros- Crock Pot Mexican Black Beans,43200,28800,121.5,0.5,0.1,0.0,1175.1,22.2,7.8,0.6,7.9,True
75600,267827,Moose Moussaka,3600,2700,652.2,25.8,10.7,197.9,435.5,51.9,7.5,7.2,50.1,True
75601,266983,Cantonese Pepper Steak for Two (Or More),1800,900,223.9,9.2,3.6,78.3,725.9,7.3,1.1,1.7,26.7,False
75602,253739,Coconut Cream Cooler,300,120,2229.8,80.3,69.3,0.0,294.7,369.0,15.7,317.9,26.7,True


#### Reviews

In [None]:
reviews.head()

In [None]:
reviews.info()

# To do: entender os valores nulos das colunas Rating, Lika e TestSetId e o que fazer com eles

#### Requests

In [None]:
requests.head()

In [None]:
requests.info()
# no missing values: GOOD!

In [None]:
# renaming the columns
requests = requests.rename(columns={"HighCalories": "Calories", "HighProtein":"Protein", "LowFat": "Fat", "LowSugar": "Sugar", "HighFiber":"Fiber"})

# have a look at common statistics of the dataset
requests.describe()
requests.nunique()

# understanding the proportions in macronutrients 
nutrient_labels = ["Protein", "Calories", "Fiber", "Sugar", "Fat"]
proportions = {}

for nutrient in nutrient_labels:
    proportions[nutrient] = requests[nutrient].value_counts(normalize=True)
proportions

In [None]:
# relationship between macronutrients Fat-Sugar
grouped_size = requests.groupby(["Fat", "Sugar"]).size().reset_index(name='Count')
grouped_size

# calculate the percentage within each group
total_size = grouped_size['Count'].sum()
grouped_size['Percentage'] = (grouped_size['Count'] / total_size) * 100
grouped_size

In [None]:
# standardizing column Calorie to the same format
requests["Calories"] = requests["Calories"].astype("int")

# standardizing column Protein Yes->1
requests["Protein"] = requests["Protein"].replace("Yes","1")

# changing 0 -> 1 in column Sugar 
requests["Sugar"] = requests["Sugar"].replace("0","1")

# changing 0 -> 1 and 1 -> 0  column Fat
#requests["Fat"] = requests["Fat"].replace({1 : 0, 0 : 1})
requests["Fat"] = 1 - requests["Fat"]

# transforming macronutrients columns -> categories 
requests[["Calories", "Protein", "Fiber", "Sugar", "Fat"]] = requests[["Calories", "Protein", "Fiber", "Sugar", "Fat"]].astype("category")

requests


In [None]:
df_grouped_by_class = df.groupby(by="variety")

df_setosa = df_grouped_by_class.get_group("Setosa")
df_versicolor = df_grouped_by_class.get_group("Versicolor")
df_virginica = df_grouped_by_class.get_group("Virginica")

class_labels = {
    "Setosa" : {
        "color" : "blue",
        "data" : df_setosa
    },
    "Versicolor" : {
        "color" : "green",
        "data" : df_versicolor
    },
    "Virginica" : {
        "color" : "red",
        "data" : df_virginica
    }
}

for class_i in class_labels:
    class_color = class_labels[class_i]["color"]
    class_df = class_labels[class_i]["data"]
    p = sns.pairplot(class_df, diag_kind="hist", diag_kws={"color" : class_color}, plot_kws={"color" : class_color, "label" : class_i})
    p.fig.suptitle(class_i, y=1.0, size=15)

In [None]:
# We can also leverage the dataprep package to get a nice summary report
report = sv.analyze(df)
report.show_notebook()

# We can also leverage the yadata_profiling package to get a nice summary report
profile = ProfileReport(df, title="Iris Data - Summary Report")
profile

### Phase 3: Data Preparation

The goal is assure data quality: includes removing wrong/corrupt 
data entries and making sure the entries are standardized, e.g. enforcing certain encodings. 
Then transforms the data in order to make it suitable for the modelling step. This includes scaling, dimensionality
reduction, data augmentation, outlier removal, etc.\
 \
In practise, this will rarely be the case. On average, this step takes up to **80%** of 
the time of the whole project.

In [None]:
#To do: transform categorical feature into categorical variables (exemplo df["variety"] = df["variety"].astype("category"))
# fill/remove/change missing/corrupt values
# optionally save the cleaned datasets for versioning

In [None]:
# To do: ver se precisamos standardize alguma feature (exemplo na celula seguinte com o StandardScaler), se precisamos imputar valores em registros com valores nulos, 
# se precisamos lidar com outliers, se precisamos usar alguma estretégia de redução de dimensionalidade (tipo PCA na próxima celula)...

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# data scaling
transform_scaler = StandardScaler()

# dimensionality reduction
transform_pca = PCA()

# value imputing

# outlier detection/removal

#### Sampling

Split our data set into *train* and *test* data set.

In [None]:
# To do: ver se vamos usar um split para validação, ou usar cross validation

In [None]:
# split data into train and test sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = \
  train_test_split(df.iloc[:, :-1], df.iloc[:, -1:],
                   test_size=0.3, 
                   shuffle=True,
                   random_state=3)


### Phase 4: Modeling

In this phase, the model is trained and tuned.

In [None]:
# To do: escolher quais classifiers vamos testar

In [None]:
# Here, you want to find the best classifier. As candidates, consider
#   1. LogisticRegression
#   2. RandomForestClassifier
#   3. other algorithms from sklearn (easy to add)
#   4. custom algorithms (more difficult to implement)
    
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

model_logistic_regression = LogisticRegression(max_iter=30)
model_random_forest = RandomForestClassifier()
model_gradient_boosting = GradientBoostingClassifier()

# train the models
pipeline = Pipeline(steps=[("scaler", transform_scaler), 
                           ("pca", transform_pca),
                           ("model", None)])

parameter_grid_preprocessing = {
  "pca__n_components" : [1, 2, 3, 4],
}

parameter_grid_logistic_regression = {
  "model" : [model_logistic_regression],
  "model__C" : [0.1, 1, 10],  # inverse regularization strength
}

parameter_grid_gradient_boosting = {
  "model" : [model_gradient_boosting],
  "model__n_estimators" : [10, 20, 30]
}

parameter_grid_random_forest = {
  "model" : [model_random_forest],
  "model__n_estimators" : [10, 20, 50],  # number of max trees in the forest
  "model__max_depth" : [2, 3, 4],
}

meta_parameter_grid = [parameter_grid_logistic_regression,
                       parameter_grid_random_forest,
                       parameter_grid_gradient_boosting]

meta_parameter_grid = [{**parameter_grid_preprocessing, **model_grid}
                       for model_grid in meta_parameter_grid]

search = GridSearchCV(pipeline,
                      meta_parameter_grid, 
                      scoring="balanced_accuracy",
                      n_jobs=2, 
                      cv=5,  # number of folds for cross-validation 
                      error_score="raise"
)

# here, the actual training and grid search happens
search.fit(X_train, y_train.values.ravel())

print("best parameter:", search.best_params_ ,"(CV score=%0.3f)" % search.best_score_)

### Step 5: Evaluation

Once the appropriate models are chosen, they are evaluated on the test set. For
this, different evaluation metrics can be used. Furthermore, this step is where
the models and their predictions are analyzed resp. different properties, including
feature importance, robustness to outliers, etc.

In [None]:
# evaluate performance of model on test set
print("Score on test set:", search.score(X_test, y_test.values.ravel()))

# contingency table
ct = pd.crosstab(search.best_estimator_.predict(X_test), y_test.values.ravel(),
                 rownames=["pred"], colnames=["true"])
print(ct)

In [None]:
# (optional, if you're curious) 
# for a detailed look on the performance of the different models
def get_search_score_overview():
  for c,s in zip(search.cv_results_["params"],search.cv_results_["mean_test_score"]):
      print(c, s)

print(get_search_score_overview())

#### Interpretability

##### Disclaimer: This only works if shap is installed.

In addition to models and their predictions, it is often important to understand _why_ a model makes certain predictions. 
There is a lot of literature on how this can be achieved (explainability), but we will only show the use of Shapley values
using the python module "shap", which is a combination of Shapley values and LIME. 
You can find more information on this topic [here](https://christophm.github.io/interpretable-ml-book/shap.html).

In [None]:
# assume random forest model
model = RandomForestClassifier(n_estimators=10, random_state=seed)
model.fit(X_train, y_train.values.ravel())

# compute shapley values
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_train)
shap_interaction_values = explainer.shap_interaction_values(X_train)

expected_value = explainer.expected_value
print(expected_value)

In [None]:
# class dependent plots of shapley values for each feature
for i,c in enumerate(df.variety.unique()):
    shap.summary_plot(shap_values[i], X_train, show=False)
    plt.title("Shapley values for "+str(c))
    plt.show()

From the computed SHAP values, we can interpret that the *petal.width* has a positive impact on the output of the model 
if the feature value is moderate. For high aand low values, the impact is negative. The same observation
holds for *petal.length*. Besides, the impact of the *sepal.length* and *sepal.width* features are rather low. By impact on a 
the target, we model the probability that we classify that target. Thus, if *petal.width* is high, it is more likely
that we classify the data point as Versicolor.

### Step 6: Deployment

Now that you have chosen and trained your model, it is time to deploy it to your
clients system. 

In [None]:
def micro_service_classify_iris(datapoint):
    
  # make sure the provided datapoints adhere to the correct format for model input

  # fetch your trained model
  model = search.best_estimator_

  # make prediction with the model
  prediction = model.predict(datapoint)

  return prediction


In the Analytics Cup, you need to export your prediction in a very specific output format. This is a csv file without an index and two columns, *id* and *prediction*. Note that the values in both columns need to be integer values, and especially in the *prediction* column either 1 or 0.

In [None]:
# To do: arrumar a celula abaixo com os nossos dataframes

In [None]:
# Let's assume that our id column is the index of the dataframe
output = pd.DataFrame(df_flowers.variety)
output['id'] = df_flowers.index
output = output.rename(columns={'variety': 'prediction'})
output = output.reindex(columns=["id", "prediction"])
output.to_csv('analzticscuppredictionfile.csv', index=False)