## CRISP-DM

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

#### Reproducibility 

A best practice in data analytics projects is to work with *seeds* to ensure the reproducibility of results. 
This is especially important in the Analytics Cup, since the rules require you to write a self-contained
script that produces reproducible results. 

To achieve this, we can set seeds for all used random number generators.

In [None]:
# Single seed reused by every stochastic step of the notebook that accepts
# an explicit random_state (resampling, train/test split).
seed = 55

# Also seed NumPy's global generator, so that any component falling back to
# it (because no explicit random_state was passed) gives repeatable results.
np.random.seed(seed)

### Phase 1: Business Understanding

Serves to assess use cases, feasibility, requirements, and
risks of the endeavored data driven project.

We are a startup that suggests new recipes to users.\
However, many users have been cancelling their subscriptions.\
The problem was that users found that the suggested recipes (even though they were of high quality) did not match their diet and needs.\
We now have a like/dislike system for the recipes and a new user interface in which users can enter information about what they want.

### Phase 2: Data Understanding

Assess the data quality and content.

In [None]:
# Read the four raw tables from CSV files located in the working directory.
diet, recipes, requests, reviews = (
    pd.read_csv(csv_path)
    for csv_path in ("diet.csv", "recipes.csv", "requests.csv", "reviews.csv")
)

have a look at the data and its attributes \
check if columns are properly named \
general overview over data, check for missing values, etc.

#### Diet pre-processing

In [None]:
# Store the diet label as a pandas categorical instead of a plain object column.
diet = diet.astype({"Diet": "category"})

#### Recipes pre-processing

In [None]:
# Change types of column
def refactorIngredients(ingredients):
    """Turn a serialized character vector such as ``c("a", "b")`` into a list.

    Args:
        ingredients: Raw cell value. The literal ``character(0)`` denotes an
            empty vector. Non-string values (for example the NaN that
            ``read_csv`` produces for a missing cell) are treated as empty
            instead of raising ``AttributeError``.

    Returns:
        list[str]: The comma-separated pieces left after removing
        backslashes, double quotes, every ``c(`` and every ``)``. Pieces are
        NOT whitespace-trimmed, so entries that followed ``", "`` keep their
        leading space.
    """
    if not isinstance(ingredients, str) or ingredients == "character(0)":
        return []
    cleaned = (
        ingredients
        .replace("\\", '')
        .replace("\"", '')
        .replace('c(', '')
        .replace(')', '')
    )
    return cleaned.split(",")

# Parse both serialized ingredient columns into Python lists.
recipes["RecipeIngredientQuantities"] = recipes["RecipeIngredientQuantities"].apply(refactorIngredients)
recipes["RecipeIngredientParts"] = recipes["RecipeIngredientParts"].apply(refactorIngredients)

In [None]:
# Determines if recipe is veggie, vegan or omnivore
def categorizeRecipe(ingredients):
    """Classify a recipe as "Omnivore", "Vegetarian" or "Vegan".

    Every ingredient name is lower-cased and checked by substring against
    three keyword lists, in this order:

    1. a meat/fish keyword makes the whole recipe "Omnivore" immediately;
    2. a vegan-exclusion keyword (e.g. "peanut" in "peanut butter",
       "soymilk") skips the animal-product check for that ingredient;
    3. an animal-product keyword marks the recipe as not vegan.

    Args:
        ingredients: Iterable of ingredient name strings.

    Returns:
        str: "Omnivore", "Vegetarian" or "Vegan". An empty ingredient list
        yields "Vegan".
    """
    # All keywords must be lower case because they are matched against the
    # lower-cased ingredient name.
    meat_derivates = ["pork", "beef", "meat", "fish", "tuna", "chicken", "squid", "shrimp", "trout", "mussels",
                      "fillet", "lamb", "scallops", "sardine", "salmon", "lobster", "steak", "bacon", "ham", "oyster"]
    # " heavy" keeps its leading space on purpose: ingredient names are not
    # whitespace-trimmed upstream.
    animal_derivates = ["milk", "egg", "honey", "gelatin", "butter", "mayonnaise", "cheese", "margarine",
                        " heavy", "yogurt", "pudding", "shortening", "ice cream", "chocolate", "alfredo",
                        "miracle whip", "half-and-half"]
    vegan_exclusions = ["substitute", "peanut", "apple", "vegan", "soymilk"]

    vegan = True
    for ingredient in ingredients:
        name = ingredient.lower()
        if any(word in name for word in meat_derivates):
            return "Omnivore"
        # Substring match, so that "peanut butter" is not flagged through
        # "butter" and "soymilk" is not flagged through "milk".
        if any(word in name for word in vegan_exclusions):
            continue
        if any(word in name for word in animal_derivates):
            vegan = False
    return "Vegan" if vegan else "Vegetarian"

# Derive the diet category of every recipe from its ingredient names and
# store it as a categorical column.
recipes["RecipeDiet"] = recipes["RecipeIngredientParts"].apply(categorizeRecipe).astype('category')

# Move the descriptive columns (category, ingredient quantities/parts,
# servings, yield) into a side table and remove them from the main table.
selected_columns = ['RecipeCategory', 'RecipeIngredientQuantities', 'RecipeIngredientParts', 'RecipeServings', 'RecipeYield']
recipe_extra_info = recipes[selected_columns]
recipes = recipes.drop(columns=selected_columns)

# Only the last expression of a cell is displayed, so show just this preview.
recipe_extra_info.head()


#### Requests pre-processing

In [None]:
# Shorten the nutrient preference column names by dropping the High/Low prefix.
requests = requests.rename(
    columns={
        "HighCalories": "Calories",
        "HighProtein": "Protein",
        "LowFat": "Fat",
        "LowSugar": "Sugar",
        "HighFiber": "Fiber",
    }
)

In [None]:
# Bring the nutrient preference columns to a common encoding:
#   Calories -> cast to int
#   Protein  -> the string "Yes" becomes "1" (other values are left as is)
#   Sugar    -> the string "0" becomes "1" (other values are left as is)
#   Fat      -> numeric flag inverted (0 <-> 1)
# NOTE: the Fat inversion is not idempotent; running this cell a second time
# flips the flag back.
requests = requests.assign(
    Calories=requests["Calories"].astype("int"),
    Protein=requests["Protein"].replace("Yes", "1"),
    Sugar=requests["Sugar"].replace("0", "1"),
    Fat=1 - requests["Fat"],
)

requests


#### Reviews pre-processing

In [None]:
# The Rating column is not used in the rest of the notebook; remove it.
reviews = reviews.drop(columns="Rating")

### Phase 3: Data Preparation

The goal is to assure data quality: this includes removing wrong/corrupt 
data entries and making sure the entries are standardized, e.g. enforcing certain encodings. 
The data is then transformed to make it suitable for the modelling step. This includes scaling, dimensionality
reduction, data augmentation, outlier removal, etc.\
 \
In practice, raw data is rarely clean enough to skip this phase. On average, this step takes up to **80%** of 
the time of the whole project.

In [None]:
# Tables involved: diet, requests, reviews, recipes.
# Both requests and recipes carry a "Calories" column, so the requested one is
# renamed before the recipes are joined and the recipe one right after.
dietrequestsmerged = diet.merge(requests, on=["AuthorId"])
dietrequestsreviewsmerged = (
    dietrequestsmerged
    .merge(reviews, on=["AuthorId", "RecipeId"])
    .rename(columns={"Calories": "Requested_Calories"})
)
mergedtables = (
    dietrequestsreviewsmerged
    .merge(recipes, on=["RecipeId"])
    .rename(columns={"Calories": "Recipe_Calories"})
)

In [None]:
# Build the time-related features in one chain:
#   Total_time_recipe = CookTime + PrepTime (the two source columns are dropped)
#   Time              = negative requested times are replaced by 28_000_000
#                       (presumably meaning "no time limit" - TODO confirm)
#   Recipe_Time_Match = recipe takes at most 120% of the requested time
mergedtables = (
    mergedtables
    .assign(Total_time_recipe=lambda frame: frame["CookTime"] + frame["PrepTime"])
    .drop(columns=["PrepTime", "CookTime"])
    .assign(
        Time=lambda frame: np.where(frame["Time"] < 0, 28_000_000, frame["Time"]),
        Recipe_Time_Match=lambda frame: frame["Total_time_recipe"] <= (1.2 * frame["Time"]),
    )
)


In [None]:
# Drop the raw time columns (only the Recipe_Time_Match flag is kept) together
# with the nutrient columns that are not used as features.
mergedtables = mergedtables.drop(columns=[
    "Time", "Total_time_recipe",
    "SaturatedFatContent", "CholesterolContent", "SodiumContent", "CarbohydrateContent",
])

In [None]:
def diet_match(person_diet, recipe_diet):
    """Tell whether a recipe is compatible with a person's diet.

    Args:
        person_diet: "Omnivore", "Vegetarian" or "Vegan".
        recipe_diet: Diet category of the recipe, using the same labels.

    Returns:
        bool: True for every recipe when the person is an omnivore, for every
        non-"Omnivore" recipe when the person is vegetarian, and only for
        "Vegan" recipes when the person is vegan. Any other person_diet
        value gives False.
    """
    if person_diet == "Omnivore":
        return True
    if person_diet == "Vegetarian":
        return recipe_diet != "Omnivore"
    if person_diet == "Vegan":
        return recipe_diet == "Vegan"
    return False

# Flag, row by row, whether the recipe's diet category suits the author's diet.
mergedtables["RecipeMatch"] = mergedtables.apply(
    lambda record: diet_match(record["Diet"], record["RecipeDiet"]),
    axis=1,
)

# Show the new flag next to the two columns it was derived from.
mergedtables[["RecipeMatch", "Diet", "RecipeDiet"]]


In [None]:
# The two diet columns are replaced by the RecipeMatch flag computed earlier.
mergedtables = mergedtables.drop(columns=["Diet", "RecipeDiet"])

categorical_values = ["Requested_Calories", "Protein", "Fat", "Sugar", "Fiber"]

# One-hot encode every categorical column (one indicator column per observed
# value, named "<column>_<value>") and drop the original columns. This also
# removes the remaining string values from these columns.
dummy_frames = [
    pd.get_dummies(mergedtables[column], prefix=column)
    for column in categorical_values
]
mergedtables = pd.concat(
    [mergedtables.drop(columns=categorical_values), *dummy_frames],
    axis=1,
)

# Rows without a Like label form the submission set; labelled rows are used
# for training and testing.
has_label = mergedtables["Like"].notna()
submissiondataset = mergedtables[~has_label]
trainandtestdataset = mergedtables[has_label]

# Identifier columns, the recipe name and one indicator each of the Sugar and
# Fat encodings are kept aside and removed from the feature tables.
selected_columns_test = ['AuthorId', 'RecipeId', 'TestSetId', 'Name', "Sugar_1", "Fat_0"]
submission_extra_info = submissiondataset[selected_columns_test]
training_extra_ingo = trainandtestdataset[selected_columns_test]

submissiondataset = submissiondataset.drop(columns=selected_columns_test)
trainandtestdataset = trainandtestdataset.drop(columns=selected_columns_test)


#### Sampling

Split our data set into *train* and *test* data sets.

In [None]:
# Under- + oversampling: bring both classes to the same size, which lies 70% of
# the way from the Like == 1 count towards the Like == 0 count.
# NOTE(review): this resampling runs before the train/test split further down,
# so duplicated Like == 1 rows can end up in both the training and the test
# partition, and the test class balance no longer reflects the original data.
# Consider splitting first and resampling only the training part.

like_flag = trainandtestdataset['Like']
df_majority = trainandtestdataset.loc[like_flag == 0]  # assumed to be the larger class
df_minority = trainandtestdataset.loc[like_flag == 1]

length_majority, length_minority = len(df_majority), len(df_minority)
difference = length_majority - length_minority
final_length = int(length_minority + 0.7 * difference)

print(length_majority, length_minority)

# Downsample the Like == 0 class without replacement
df_majority = resample(df_majority, n_samples=final_length, replace=False, random_state=seed)

# Oversample the Like == 1 class with replacement
df_minority = resample(df_minority, n_samples=final_length, replace=True, random_state=seed)

# Combine the oversampled Like == 1 rows with the downsampled Like == 0 rows
trainandtestdataset = pd.concat([df_minority, df_majority])


In [None]:
# Separate features from the target, split into train and test partitions and
# trim extreme values from the training rows only.

X_features = trainandtestdataset.drop(columns="Like")
Y_classes = trainandtestdataset["Like"].astype('category')

trainandtestdataset.info()

X_train, X_test, Y_train, Y_test = train_test_split(X_features, Y_classes,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=seed)  # for reproducibility

# Attach the labels to a NEW frame so that filtering rows keeps features and
# labels aligned. `assign` returns a copy, so X_train itself is not mutated
# (the former `train_df = X_train` aliased it and wrote the label column into
# the split result).
train_df = X_train.assign(Y_train=Y_train)

columns_with_outliers = ["Recipe_Calories", "FatContent", "SugarContent", "ProteinContent", "FiberContent"]

# For each column in turn keep only the rows inside
# [max(0, mean - 5*std), mean + 5*std). Mean and std are recomputed on the
# rows that survived the previous columns.
for item in columns_with_outliers:
    mean = np.mean(train_df[item])
    std = train_df[item].std()
    upper_bound = mean + 5 * std
    lower_bound = max(0, mean - 5 * std)
    train_df = train_df[(train_df[item] >= lower_bound) & (train_df[item] < upper_bound)]

# Recipe_Calories is used for the outlier filter above but is not a model
# feature: remove it from every feature table.
train_df = train_df.drop(columns=["Recipe_Calories"])
X_test = X_test.drop(columns=["Recipe_Calories"])
submissiondataset = submissiondataset.drop(columns=["Recipe_Calories"])

# Split the filtered training frame back into features and labels.
X_train = train_df.drop(columns=["Y_train"])
Y_train = train_df["Y_train"]


- X_train: 77.904 rows × 24 columns
- Y_train: 77.904 rows
- X_test: 19.477 rows × 24 columns
- Y_test: 19.477 rows

### Phase 4: Modeling

In this phase, the model is trained and tuned.

#### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Pass the notebook seed explicitly: without random_state the forest draws
# different bootstrap samples and feature subsets on every run, so the
# results would not be reproducible.
random_forest = RandomForestClassifier(random_state=seed)

random_forest.fit(X_train, Y_train)

In [None]:
# Show the feature columns the model was fitted on, then predict the labels
# of the held-out test rows.
print(X_train.columns)
Y_pred = random_forest.predict(X_test)


##### Random Forest Analysis

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

accuracy = accuracy_score(Y_test, Y_pred)
print("Accuracy:", accuracy)

# Use a separate name for the result so that the imported confusion_matrix
# function is not overwritten by its own return value.
conf_matrix = confusion_matrix(Y_test, Y_pred)
print(conf_matrix)

# Rows are the true classes and columns the predicted classes, so the
# flattened 2x2 matrix reads: TN, FP, FN, TP.
true_negatives, false_positives, false_negatives, true_positives = conf_matrix.ravel()

sensitivity = true_positives / (true_positives + false_negatives)
specificity = true_negatives / (true_negatives + false_positives)

print("sensitivity = ", sensitivity)
print("specificity = ", specificity)
print(Y_test, Y_pred)


In [None]:
# Balanced accuracy: the mean of sensitivity and specificity.
(sensitivity + specificity) / 2

#### Submission

In [None]:
# Feature table for the unlabeled rows: everything except the (empty) Like column.
X_features_submission = submissiondataset.drop(columns=["Like"])

In [None]:
# Submission file

# The row identifiers come from the TestSetId column that was set aside before
# modelling. NOTE: the name `id` shadows the Python built-in; it is kept
# because the following cell reads it.
id = submission_extra_info['TestSetId']
Y_pred_submission = random_forest.predict(X_features_submission)

# The predictions are a plain array, so they are paired with the ids by position.
output = pd.DataFrame({'id': id, 'prediction': Y_pred_submission})

output.info()

# Write both columns as integers.
output = output.astype({'id': 'int', 'prediction': 'int'})
output.to_csv('analzticscuppredictionfile.csv', index=False)

In [None]:
# Sanity check: the number of predictions should equal the number of submission ids.
print(len(Y_pred_submission), len(id))