##### **Disclaimer: We use some advanced packages here without detailed explanation. You can use these, but we do not provide any support.**

In [None]:
# To install them, you can uncomment the following lines:
# (%pip will call pip from the currently active python environment)
# %pip install scikit-learn

# Note: Some of these packages are not yet compatible with Python 3.12
# %pip install sweetviz
# %pip install ydata_profiling
# %pip install shap

## CRISP-DM

In [None]:
import pandas as pd
import numpy as np
import sklearn

import matplotlib.pyplot as plt
import seaborn as sns

# Note: The following do not work with Python 3.12
#import shap
#from ydata_profiling import ProfileReport
#import sweetviz as sv

#### Reproducibility 

A best practice in data analytics projects is to work with *seeds* to ensure the reproducibility of results.
This is especially important in the Analytics Cup, since the rules require you to write a self-contained
script that produces reproducible results.

To achieve this, we can set seeds for all used random number generators.

In [None]:
# Single seed shared by every stochastic step in this notebook.
seed = 55

# Assigning the variable alone seeds nothing. Seed NumPy's global generator too:
# scikit-learn estimators created without an explicit random_state draw from it.
np.random.seed(seed)

### Phase 1: Business Understanding

Serves to assess use cases, feasibility, requirements, and
risks of the endeavored data driven project.

We are a startup that suggests new recipes to users.\
However, we have been seeing many subscription cancellations.\
The problem was that users found the suggested recipes (even though they were of high quality) did not match their diet and needs.\
Now we have a like/dislike system for the recipes and a new user interface where users can enter information about what they want.

### Phase 2: Data Understanding

Assess the data quality and content.

In [None]:
# Read the four source tables; the CSV files are expected in the working directory.
diet, recipes, requests, reviews = (
    pd.read_csv(csv_path)
    for csv_path in ("diet.csv", "recipes.csv", "requests.csv", "reviews.csv")
)

Have a look at the data and its attributes \
check that the columns are properly named \
get a general overview of the data, check for missing values, etc.

#### Diet pre-processing

In [None]:
# Store the diet label as a pandas categorical instead of a plain column.
diet = diet.astype({"Diet": "category"})

#### Recipes pre-processing

In [None]:
# Change types of column
def refactorIngredients(ingredients):
    """Parse an R-style character-vector literal into a list of strings.

    The expected input looks like ``c("a", "b")``, or ``character(0)`` for an
    empty vector. Backslashes, double quotes, every ``c(`` and every ``)`` are
    removed, then the remainder is split on commas.

    Args:
        ingredients: the raw cell value. Anything that is not a string
            (e.g. NaN or None from a missing CSV cell) is treated as empty.

    Returns:
        list[str]: the vector elements. Surrounding whitespace is NOT stripped,
        so every element after the first usually keeps a leading space.
    """
    # Missing cells reach this function as float NaN / None; calling .replace on
    # them would raise AttributeError, so treat them like the empty vector.
    if not isinstance(ingredients, str) or ingredients == "character(0)":
        return []
    for token in ("\\", "\"", "c(", ")"):
        ingredients = ingredients.replace(token, "")
    return ingredients.split(",")

# Turn both R-style vector columns into Python lists.
for list_column in ("RecipeIngredientQuantities", "RecipeIngredientParts"):
    recipes[list_column] = recipes[list_column].apply(refactorIngredients)

recipes.head()

In [None]:
# Print column dtypes and non-null counts of the recipes table.
recipes.info()

In [None]:
# Determines if recipe is veggie, vegan or omnivore
def categorizeRecipe(ingredients):
    """Classify a recipe as "Omnivore", "Vegetarian" or "Vegan" by keyword.

    Matching is a case-insensitive substring search on each ingredient name:

    * any meat/fish keyword          -> "Omnivore" (returned immediately)
    * an exclusion keyword           -> ingredient is skipped; it names a plant
      product that would otherwise hit an animal keyword (e.g. "peanut butter")
    * any animal-product keyword     -> recipe is at most "Vegetarian"

    Substring matching is deliberately simple and produces false positives
    (for example "ham" also matches "graham").

    Args:
        ingredients: iterable of ingredient name strings.

    Returns:
        str: "Omnivore", "Vegetarian" or "Vegan" ("Vegan" for an empty iterable).
    """
    # All keywords must be lower case because they are compared against the
    # lower-cased ingredient name.
    meat_derivates = ("pork", "beef", "meat", "fish", "tuna", "chicken", "squid", "shrimp", "trout", "mussels",
                      "fillet", "lamb", "scallops", "sardine", "salmon", "lobster", "steak", "bacon", "ham", "oyster")
    # NOTE(review): " heavy" keeps its leading space, so it only matches when
    # something precedes the word -- confirm whether that is intended.
    animal_derivates = ("milk", "egg", "honey", "gelatin", "butter", "mayonnaise", "cheese", "margarine",
                        " heavy", "yogurt", "pudding", "shortening", "ice cream", "chocolate", "alfredo",
                        "miracle whip", "half-and-half")
    vegan_exclusions = ("substitute", "peanut", "apple", "vegan", "soymilk")

    vegan = True
    for ingredient in ingredients:
        name = ingredient.lower()
        if any(word in name for word in meat_derivates):
            return "Omnivore"
        # Substring test (not list membership): the exclusion has to fire for
        # "peanut butter" or " soymilk", not only for an exact "peanut".
        if any(word in name for word in vegan_exclusions):
            continue
        if any(word in name for word in animal_derivates):
            vegan = False
    return "Vegan" if vegan else "Vegetarian"

# Tag every recipe with its inferred diet class, stored as a categorical.
recipes["RecipeDiet"] = (
    recipes["RecipeIngredientParts"].apply(categorizeRecipe).astype('category')
)

# Move the descriptive columns (category, ingredient quantities/parts, servings,
# yield) into a side table "recipe extra info" and remove them from `recipes`.
selected_columns = [
    'RecipeCategory',
    'RecipeIngredientQuantities',
    'RecipeIngredientParts',
    'RecipeServings',
    'RecipeYield',
]
recipe_extra_info = recipes[selected_columns]
recipes = recipes.drop(columns=selected_columns)

recipes


#### Requests pre-processing

In [None]:
# Preview the first rows of the requests table.
requests.head()

In [None]:
# Print column dtypes and non-null counts of the requests table.
requests.info()
# no missing values: GOOD!

In [None]:
# Shorten the request flag names by dropping their High/Low prefixes.
macro_column_names = {
    "HighCalories": "Calories",
    "HighProtein": "Protein",
    "LowFat": "Fat",
    "LowSugar": "Sugar",
    "HighFiber": "Fiber",
}
requests = requests.rename(columns=macro_column_names)

In [None]:
# Calories: cast to int so the flag has a uniform format.
requests["Calories"] = requests["Calories"].astype("int")

# Protein: recode the label "Yes" as the string "1"; other values are untouched.
requests["Protein"] = requests["Protein"].replace({"Yes": "1"})

# Sugar: recode the string "0" as the string "1"; other values are untouched.
requests["Sugar"] = requests["Sugar"].replace({"0": "1"})

# Fat: flip the flag (0 -> 1 and 1 -> 0).
# NOTE: this is not idempotent -- running the cell a second time flips it back.
requests["Fat"] = 1 - requests["Fat"]

# Store all five macronutrient flags as categoricals.
macro_columns = ["Calories", "Protein", "Fiber", "Sugar", "Fat"]
requests[macro_columns] = requests[macro_columns].astype("category")

requests


#### Reviews pre-processing

In [None]:
# Remove the Rating column. errors="ignore" keeps the cell re-runnable: without
# it a second execution raises KeyError because the column is already gone.
reviews = reviews.drop(columns=["Rating"], errors="ignore")

In [None]:
# Disabled template code kept inside a string literal, so none of it executes.
# It plots the Iris example (`df`, column "variety"), which this notebook does
# not appear to load -- adapt or remove before re-enabling.
"""
df_grouped_by_class = df.groupby(by="variety")

df_setosa = df_grouped_by_class.get_group("Setosa")
df_versicolor = df_grouped_by_class.get_group("Versicolor")
df_virginica = df_grouped_by_class.get_group("Virginica")

class_labels = {
    "Setosa" : {
        "color" : "blue",
        "data" : df_setosa
    },
    "Versicolor" : {
        "color" : "green",
        "data" : df_versicolor
    },
    "Virginica" : {
        "color" : "red",
        "data" : df_virginica
    }
}

for class_i in class_labels:
    class_color = class_labels[class_i]["color"]
    class_df = class_labels[class_i]["data"]
    p = sns.pairplot(class_df, diag_kind="hist", diag_kws={"color" : class_color}, plot_kws={"color" : class_color, "label" : class_i})
    p.fig.suptitle(class_i, y=1.0, size=15)
"""

In [None]:
# Disabled template code kept inside a string literal, so none of it executes.
# It needs `sv` (sweetviz) and `ProfileReport` (ydata_profiling), whose imports
# are commented out at the top of the notebook, plus a dataframe named `df`.
"""
# We can also leverage the dataprep package to get a nice summary report
report = sv.analyze(df)
report.show_notebook()

# We can also leverage the yadata_profiling package to get a nice summary report
profile = ProfileReport(df, title="Iris Data - Summary Report")
profile
"""

### Phase 3: Data Preparation

The goal is to assure data quality: this includes removing wrong/corrupt
data entries and making sure the entries are standardized, e.g. enforcing certain encodings.
The data is then transformed to make it suitable for the modeling step. This includes scaling, dimensionality
reduction, data augmentation, outlier removal, etc.\
 \
In practice, data will rarely arrive in a clean state. On average, this step takes up to **80%** of
the time of the whole project.

In [None]:
# TODO: convert categorical features into categorical variables (e.g. df["variety"] = df["variety"].astype("category"))
# fill/remove/change missing/corrupt values

# TODO: check whether we need to standardize any feature (see the StandardScaler example in the next cell), whether we need to impute values in records with nulls,
# whether we need to handle outliers, and whether we need a dimensionality-reduction strategy (such as PCA in the next cell)...

In [None]:
# Disabled template code kept inside a string literal, so none of it executes.
# In particular `transform_scaler` and `transform_pca` are NOT created by this
# cell, although the grid-search pipeline further down refers to both names.
"""
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# data scaling
transform_scaler = StandardScaler()

# dimensionality reduction
transform_pca = PCA()

# value imputing

# outlier detection/removal
"""

Joining the 4 tables
- there are users in the "diet" table who are not in the "reviews" table -- OK!
- perfect match of RecipeId and AuthorId between requests and reviews -- great!
- every recipe in "recipes" is shown to at least one user -- OK!

In [None]:
# Tables: diet, requests, reviews, recipes.
# Join user diets with their requests, attach the matching review rows, then the
# recipe attributes. Each "Calories" column is renamed right after the merge
# that brings it in, so the request flag and the recipe value stay distinct.
dietrequestsmerged = pd.merge(diet, requests, on=["AuthorId"])
dietrequestsreviewsmerged = pd.merge(
    dietrequestsmerged, reviews, on=["AuthorId", "RecipeId"]
).rename(columns={"Calories": "Requested_Calories"})
mergedtables = pd.merge(
    dietrequestsreviewsmerged, recipes, on=["RecipeId"]
).rename(columns={"Calories": "Recipe_Calories"})

In [None]:
# Rows with a null "Like" form the submission set; rows with a value are used
# for training and testing.
like_is_missing = mergedtables["Like"].isna()
submissiondataset = mergedtables[like_is_missing]
trainandtestdataset = mergedtables[~like_is_missing]

trainandtestdataset.info()

#### Sampling

Split our data set into *train* and *test* data set.

In [None]:
# TODO: decide whether to use a separate validation split or cross-validation

In [None]:
from sklearn.model_selection import train_test_split

# Remove columns that must not be used as features.
# Name is dropped because it is a string and Random Forest doesn't accept strings.
trainandtestdataset = trainandtestdataset.drop(columns=['AuthorId', 'RecipeId', 'TestSetId', 'Name'])

# One-hot encode the categorical columns: each one is replaced by one indicator
# column per category (named "<column>_<value>"), appended after the remaining
# columns. This also removes the remaining string values.
# ATTENTION: not sure whether this step is needed for Linear Regression. Probably
# yes; if not, the code can be reorganized.
categorical_values = ['Diet', 'RecipeDiet', 'Requested_Calories', 'Protein', 'Fat', 'Sugar', 'Fiber']
trainandtestdataset = pd.get_dummies(trainandtestdataset, columns=categorical_values)


In [None]:
# Separate the target ("Like", stored as a categorical) from the feature columns.
Y_classes = trainandtestdataset["Like"].astype('category')
X_features = trainandtestdataset.drop(columns="Like")

trainandtestdataset.info()

# Shuffled 80/20 split into train and test data; the fixed random_state makes
# the split reproducible.
split_options = {"test_size": 0.2, "shuffle": True, "random_state": seed}
X_train, X_test, Y_train, Y_test = train_test_split(X_features, Y_classes, **split_options)

- X_train: 77.904 rows × 24 columns
- Y_train: 77.904 rows
- X_test: 19.477 rows × 24 columns
- Y_test: 19.477 rows

### Phase 4: Modeling

In this phase, the model is trained and tuned.

#### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Pass the notebook seed: without random_state every run grows a different
# forest, so the reported metrics would not be reproducible.
random_forest = RandomForestClassifier(random_state=seed)

random_forest.fit(X_train, Y_train)

# Predicted labels for the held-out test features.
Y_pred = random_forest.predict(X_test)

##### Random Forest Analysis

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

accuracy = accuracy_score(Y_test, Y_pred)
print("Accuracy:", accuracy)

# Keep the matrix under its own name: rebinding `confusion_matrix` would shadow
# the imported function for every cell executed after this one.
conf_matrix = confusion_matrix(Y_test, Y_pred)
print(conf_matrix)

# scikit-learn layout: rows are the true labels, columns the predicted labels.
true_negatives = conf_matrix[0][0]
false_negatives = conf_matrix[1][0]
false_positives = conf_matrix[0][1]
true_positives = conf_matrix[1][1]

# sensitivity = true positive rate, specificity = true negative rate
sensitivity = true_positives / (true_positives + false_negatives)
specificity = true_negatives / (true_negatives + false_positives)

print("sensitivity = ", sensitivity)
print("specificity = ", specificity)

# Too many False predictions

# Possible ways to improve
# Re add the recipe name in some way - parse the string and see if the title is vegetarian. 
# Group the cook time in discrete chunks?
# Group the other nutritional facts columns of recipe in discrete chunks?
# Group age in chunks ?
# Drop some columns from recipe like sodium 
# Reduce dimensionality. I guess fat, saturated fat and cholesterol are correlated.


In [None]:
# Here, you want to find the best classifier. As candidates, consider
#   1. LogisticRegression
#   2. RandomForestClassifier
#   3. other algorithms from sklearn (easy to add)
#   4. custom algorithms (more difficult to implement)

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Preprocessing steps of the pipeline. They are created here because the only
# other place that defines them is a disabled (string-literal) cell, which left
# both names undefined and made this cell fail with NameError.
transform_scaler = StandardScaler()
transform_pca = PCA(random_state=seed)

# Candidate models; the stochastic ones get the notebook seed for reproducibility.
model_logistic_regression = LogisticRegression(max_iter=30)
model_random_forest = RandomForestClassifier(random_state=seed)
model_gradient_boosting = GradientBoostingClassifier(random_state=seed)

# train the models
# The "model" step is a placeholder that each parameter grid fills in.
pipeline = Pipeline(steps=[("scaler", transform_scaler),
                           ("pca", transform_pca),
                           ("model", None)])

parameter_grid_preprocessing = {
  "pca__n_components" : [1, 2, 3, 4],
}

parameter_grid_logistic_regression = {
  "model" : [model_logistic_regression],
  "model__C" : [0.1, 1, 10],  # inverse regularization strength
}

parameter_grid_gradient_boosting = {
  "model" : [model_gradient_boosting],
  "model__n_estimators" : [10, 20, 30]
}

parameter_grid_random_forest = {
  "model" : [model_random_forest],
  "model__n_estimators" : [10, 20, 50],  # number of max trees in the forest
  "model__max_depth" : [2, 3, 4],
}

meta_parameter_grid = [parameter_grid_logistic_regression,
                       parameter_grid_random_forest,
                       parameter_grid_gradient_boosting]

# Combine the shared preprocessing grid with every model-specific grid.
meta_parameter_grid = [{**parameter_grid_preprocessing, **model_grid}
                       for model_grid in meta_parameter_grid]

search = GridSearchCV(pipeline,
                      meta_parameter_grid,
                      scoring="balanced_accuracy",
                      n_jobs=2,
                      cv=5,  # number of folds for cross-validation
                      error_score="raise"
)

# here, the actual training and grid search happens
search.fit(X_train, Y_train.values.ravel())

print("best parameter:", search.best_params_ ,"(CV score=%0.3f)" % search.best_score_)

### Phase 5: Evaluation

Once the appropriate models are chosen, they are evaluated on the test set. For
this, different evaluation metrics can be used. Furthermore, this step is where
the models and their predictions are analyzed with respect to different properties, including
feature importance, robustness to outliers, etc.

In [None]:
# Evaluate the refitted best pipeline on the held-out test set, using the
# scorer configured for the search.
test_labels = Y_test.values.ravel()
print("Score on test set:", search.score(X_test, test_labels))

# Contingency table: predicted labels (rows) against true labels (columns).
test_predictions = search.best_estimator_.predict(X_test)
ct = pd.crosstab(test_predictions, test_labels,
                 rownames=["pred"], colnames=["true"])
print(ct)

In [None]:
# (optional, if you're curious) 
# for a detailed look on the performance of the different models
def get_search_score_overview(search_result=None):
    """Print every evaluated parameter combination with its mean CV score.

    Args:
        search_result: fitted search object exposing ``cv_results_`` with the
            keys "params" and "mean_test_score". Defaults to the notebook-level
            ``search`` object when omitted.

    Returns:
        None. One line "<params> <mean_test_score>" is printed per combination.
    """
    if search_result is None:
        search_result = search
    cv_results = search_result.cv_results_
    for c, s in zip(cv_results["params"], cv_results["mean_test_score"]):
        print(c, s)

# The function prints the overview itself and returns None, so call it directly;
# wrapping the call in print() only appended a stray "None" line.
get_search_score_overview()

#### Interpretability

##### Disclaimer: This only works if shap is installed.

In addition to models and their predictions, it is often important to understand _why_ a model makes certain predictions. 
There is a lot of literature on how this can be achieved (explainability), but we will only show the use of Shapley values
using the python module "shap", which is a combination of Shapley values and LIME. 
You can find more information on this topic [here](https://christophm.github.io/interpretable-ml-book/shap.html).

In [None]:
# shap is an optional dependency whose import is commented out at the top of the
# notebook, so import it here where it is needed; otherwise this cell fails with
# NameError even when shap is installed.
import shap

# assume random forest model
model = RandomForestClassifier(n_estimators=10, random_state=seed)
model.fit(X_train, Y_train.values.ravel())

# compute shapley values
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_train)
# NOTE(review): interaction values are computed on the full training set, which
# is presumably slow -- consider a sample if this cell takes too long.
shap_interaction_values = explainer.shap_interaction_values(X_train)

expected_value = explainer.expected_value
print(expected_value)

In [None]:
# shap is an optional dependency; import it in the cell that uses it.
import shap

# class dependent plots of shapley values for each feature
# Iterate over the classes of the fitted model. The previous loop used
# `df.variety.unique()`, a leftover from the Iris template: `df` does not exist
# in this notebook.
# NOTE(review): assumes `shap_values` is indexable per class in the order of
# `model.classes_` (list-of-arrays layout) -- confirm for the installed shap version.
for i, c in enumerate(model.classes_):
    shap.summary_plot(shap_values[i], X_train, show=False)
    plt.title("Shapley values for "+str(c))
    plt.show()

*Note: the interpretation below was written for the Iris example data (petal/sepal features) and does not describe the recipe data used in this notebook.*

From the computed SHAP values, we can interpret that the *petal.width* has a positive impact on the output of the model
if the feature value is moderate. For high and low values, the impact is negative. The same observation
holds for *petal.length*. Besides, the impact of the *sepal.length* and *sepal.width* features is rather low. By impact on
the target, we mean the probability that we classify a data point as that target. Thus, if *petal.width* is high, it is more likely
that we classify the data point as Versicolor.

### Phase 6: Deployment

Now that you have chosen and trained your model, it is time to deploy it to your
client's system.

In [None]:
def micro_service_classify_iris(datapoint, model=None):
    """Return the model's prediction for the given data point(s).

    Args:
        datapoint: input passed unchanged to ``model.predict``; it must already
            have the format the model was trained on (no validation is done here).
        model: object with a ``predict`` method. Defaults to the best estimator
            found by the notebook-level grid search (``search.best_estimator_``).

    Returns:
        Whatever ``model.predict(datapoint)`` returns.
    """
    # make sure the provided datapoints adhere to the correct format for model input

    # fetch your trained model
    if model is None:
        model = search.best_estimator_

    # make prediction with the model
    return model.predict(datapoint)


In the Analytics Cup, you need to export your prediction in a very specific output format. This is a csv file without an index and two columns, *id* and *prediction*. Note that the values in both columns need to be integer values, and especially in the *prediction* column either 1 or 0.

In [None]:
# TODO: adapt the cell below to our own dataframes

In [None]:
# Write the prediction file: two columns, "id" and "prediction", without the index.
# NOTE(review): `df_flowers` and its `variety` column are leftovers from the Iris
# template and are not defined in the visible part of this notebook. This cell
# presumably has to use the model's predictions for `submissiondataset` instead
# -- TODO confirm and adapt.
# Let's assume that our id column is the index of the dataframe
output = pd.DataFrame(df_flowers.variety)
output['id'] = df_flowers.index
output = output.rename(columns={'variety': 'prediction'})
output = output.reindex(columns=["id", "prediction"])
# NOTE(review): "analzticscup..." looks like a typo for "analyticscup..." --
# confirm the required file name before changing it.
output.to_csv('analzticscuppredictionfile.csv', index=False)