# Intro

* Loading Data and Libraries

In [None]:
# loading libraries

import numpy as np
import pandas as pd

In [None]:
# reading dataset

df = pd.read_csv("pre-processed.csv")
print("the dataset has " + str(df.shape[0]) + " observations (ice cream flavors) and " + str(df.shape[1]) + " features " + str([col for col in df.columns]))

# dataset: https://www.kaggle.com/datasets/tysonpo/ice-cream-dataset

# Inspiration Notebooks:
# Using Ice Cream Ingredients to Predict Rating: https://www.kaggle.com/code/gcdatkin/using-ice-cream-ingredients-to-predict-rating
# Finding the Best Ice Cream: https://www.kaggle.com/code/kelvintran1998/finding-the-best-ice-cream
# EDA ideas: McDonalds Ice Cream Machines Breaking: https://www.kaggle.com/code/aashidutt3/eda-mcdonalds-ice-cream-machines-breaking

In [None]:
df.head(10)

In [None]:
df.head().to_csv('icecream2_head.csv', index=False, header=True, encoding='cp1252')

# Correcting the ingredient list and getting an unique list with all Ingredients

I want to use generate dummy variables to check if that ingredient is in that ice cream and with these dummy variables I will do the modeling
for that i need a unique list of ingredients

* Data Cleaning

In [None]:
# increasing column character limit for better visualization

pd.set_option('display.max_colwidth', 50000)

In [None]:
# leave the column list in lower case

df['ingredients_cleaned'] = df['ingredients'].str.lower()
df['ingredients_cleaned'].head()

In [None]:
# checking for all unique ingredients to find errors

all_ingredients = set()

for row in df.iterrows():
    ingredients = df.loc[row[0], 'ingredients_cleaned']
    for ingredient in ingredients.split(','):
        if ingredient not in all_ingredients:
            all_ingredients.add(ingredient)

# iterrows(): returns a tuple containing the index and the rest of the row

In [None]:
all_ingredients

# we can se all the problems with the description ingredients (that subingredients of an ingredient)

# 1) parentheses capture all the content inside parentheses
# 2) AND and OR
# 3) special characters
# 4) excess of whitespace
# 5) use of synonyms or equivalent ingredients

# PorterStemmer() and stem function()
# takes all words and break them down to their root. This reduces significantly duplicated words (ex: egg x eggs)

In [None]:
# importing regex module
import re

In [None]:
# eliminating content inside parentheses '()'

df['ingredients_cleaned'] = df['ingredients_cleaned'].str.replace(r'\([^()]*\)',"", regex=True)
df['ingredients_cleaned'].head() # checking

In [None]:
# remove_excess_whitespace
df['ingredients_cleaned'] = df['ingredients_cleaned'].str.replace(r'\s*,\s*',", ", regex=True)
df['ingredients_cleaned'].head() # checking

In [None]:
# because this will be a repetitive task, let's create a function for spliting text: corrects and process text

def ingredients_processing(text):
    bool = df['ingredients_cleaned'].str.contains(text, regex=False)
    val_to_replace = df['ingredients_cleaned'][bool].str.split(text).str[0]
    df['ingredients_cleaned'] = df['ingredients_cleaned'].mask(bool, val_to_replace)
    return

In [None]:
# problematic text: spliting by and selecting only first column

problematic_text_ingredients_list = ['\ncontains', 'contains', '.']

for text in problematic_text_ingredients_list:
    ingredients_processing(text)

In [None]:
#checking results

df['ingredients_cleaned']

In [None]:
# observations with ':' problem - spliting by and selecting only second column

bool = df['ingredients_cleaned'].str.contains(":", regex=False)
val_to_replace = df['ingredients_cleaned'][bool].str.split(": ").str[1] # here we will use the second column, not the first
df['ingredients_cleaned'] = df['ingredients_cleaned'].mask(bool, val_to_replace)

df['ingredients_cleaned'] #checking results

* Data Treatment: Manual Correction

In [None]:
# checking tests

#df[df['ingredients_cleaned'].str.contains("mono and diglycerides", regex=False)] == True
#df[df['ingredients_cleaned'].str.contains(" and ", regex=False)] == True
#df[df['ingredients_cleaned'].str.contains("and/or", regex=False)] == True
#df[df['ingredients_cleaned'].str.contains("†", regex=False)] == True
#df[df['ingredients_cleaned'].str.contains("*", regex=False)] == True
#df[df['ingredients_cleaned'].str.contains(")", regex=False)] == True
#df[df['ingredients_cleaned'].str.contains("/", regex=False)] == True
#df[df['ingredients_cleaned'].str.contains("vanilla ice cream", regex=False) == True]

In [None]:
# treating the problem with 'mono and diglycerides'

df['ingredients_cleaned'] = df['ingredients_cleaned'].str.replace("mono and diglycerides", "monoglycerides, diglycerides", regex=False)

In [None]:
# treating the problem with 'and/or' + 'and':

unwanted_text = [' and ', 'and/or']

for word in unwanted_text:
    df['ingredients_cleaned'] = df['ingredients_cleaned'].str.replace(word, ',', regex=False)

In [None]:
# replacing strange text to ''

unwanted_text = ['†', ')' , 'organic', 'pasteurized','(sugar', 'unbleached', 'enriched', 'unenriched', 'vanilla ice cream', 'coffee ice cream', 'white chocolate ice cream', 
'unsweetened', 'whole', 'pieces', 'concentrates', 'concentrate', 'juice', 'pasteurized', 'puree',  'vegetable s', ' lake']

for word in unwanted_text:
    df['ingredients_cleaned'] = df['ingredients_cleaned'].str.replace(word, '', regex=False)

In [None]:
# replacing strange text to ' '

unwanted_text = ['*', '/']

for word in unwanted_text:
    df['ingredients_cleaned'] = df['ingredients_cleaned'].str.replace(word, ' ', regex=False)

In [None]:
# Word Correcting

# Ok. I'm feeling very dumb for doing this checking mannually. 
# I could use PorterStemmer library, which is based on suffixes in the English language. But after some tests, i'm not feeling very confident to use that method.

# Creating a dictionary
word_correcting = {"reese's mini ": "reese's peanut butter", "reese's peanut butter cup ": "reese's peanut butter", "reese's peanut butter cups": "reese's peanut butter",
                   "reese's peanut butter sauce": "reese's peanut butter", "reese's peanut butter swirl": "reese's peanut butter", 'almonds roasted in vegetable oil': 'almonds',
                   'roasted almonds': 'almonds', 'dried apples': 'apple', 'apples': 'apple', 'artificial color lake': 'artificial color', 'color added': 'artificial color',
                   'annatto': 'artificial color', 'blue 1': 'artificial color', 'blue 1': 'artificial color', 'blue 1 lake': 'artificial color', 'blue 2': 'artificial color',
                   'blue 2 lake': 'artificial color', 'yellow 5': 'artificial color', 'yellow 5 lake': 'artificial color', 'yellow 6': 'artificial color', 'yellow 6 lake': 'artificial color',
                   'red 40': 'artificial color', 'red 40 lake': 'artificial color', 'reb a': 'artificial color', 'artificial color lake': 'artificial color',
                   'artificial flavoring': 'artificial flavor', 'artificial flavors': 'artificial flavor', 'banana puree': 'banana', 'bananas': 'banana', 'black carrot s': 'black carrot',
                   'black raspberry puree': 'black raspberry', 'black raspberries': 'black raspberry', 'blueberries': 'blueberry', 'butteroil': 'butter', 'butterfat': 'butter',
                   'butter oil': 'butter', 'dried cane syrup': 'cane syrup', 'caramelized sugar': 'caramel', 'caramel color': 'caramel', 'caramel flavor': 'caramel',
                   'caramel swirl': 'caramel', 'caramel syrup': 'caramel', 'caramelized sugar syrup': 'caramelized sugar', 'carob bean': 'carob bean gum', 'carob gum': 'carob bean gum',
                   'cheese cultures': 'cheese culture', 'cherries': 'cherry', 'cherry': 'cherry', 'cherry puree': 'cherry', 'cherry concentrate': 'cherry', 'cherry juice concentrate': 'cherry',
                   'chocolate chip cookies': 'chocolate', 'chocolate chips': 'chocolate', 'chocolate cookie pieces': 'chocolate', 'chocolate flavored coating': 'chocolate', 'chocolate liquor':
                   'chocolate', 'chocolate processed with alkali': 'chocolate', 'chocolatey chips': 'chocolate', 'milk chocolate candies': 'chocolate',
                   'milk chocolate': 'chocolate','dark chocolate': 'chocolate', 'semi-sweet chocolate chunks': 'chocolate', 'chocolaty coated cone': 'chocolatey coated cone',
                   'black cocoa processed with alkali': 'cocoa powder', 'dutched cocoa': 'cocoa powder', 'coconut concentrate': 'coconut', 'coconut cream': 'coconut', 'coconut extract': 'coconut',
                   'desiccated coconut': 'coconut', 'coffee extract': 'coffee', 'coffee extract concentrate': 'coffee', 'condensed skim milk': 'condensed milk',
                   'evaporated milk': 'condensed milk', 'sweetened condensed milk': 'condensed milk', 'sweetened condensed skim milk': 'condensed milk', 'corn syrup solids': 'corn syrup',
                   'high fructose corn syrup': 'corn syrup', 'corn starch': 'cornstarch', 'modified corn starch': 'cornstarch', 'modified food starch': 'cornstarch',
                   'modified cornstarch': 'cornstarch', 'eggs': 'egg', 'whole egg': 'egg', 'whole eggs': 'egg', 'egg whites': 'egg white',
                   'egg yolks': 'egg yolk', 'guar': 'guar gum', 'ground heath toffee': 'heath toffee bar', 'hibiscus flower': 'hibiscus', 'hibiscus powder': 'hibiscus',
                   'invert cane sugar': 'inverted sugar syrup', 'invert sugar': 'inverted sugar syrup', 'lactase enzyme': 'lactase', 'lemon juice': 'lemon', 'lemon juice concentrate': 'lemon',
                   'locust bean': 'locust bean gum', 'dry malt extract': 'malt extract', 'maltitol': 'maltitol syrup', 'corn maltodextrin': 'maltodextrin', 'mango puree': 'mango',
                   'mangos': 'mango', 'organic milk': 'milk', 'anhydrous milkfat': 'milk fat', 'milkfat': 'milk fat', 'nonfat milk solids': 'milk powder', 'nonfat dry milk': 'milk powder',
                   'whole milk powder': 'milk powder', 'skim milk powder': 'milk powder', 'natural flavors': 'natural flavor', 'rolled oats': 'oats', 'vitamin a palm oil oilitate': 'palm oil',
                   '^palm$': 'palm oil', 'palm': 'palm oil', 'palm kernel': 'palm oil', 'palm kernel oil': 'palm oil', 'partially defatted peanut flour': 'peanut', 'roasted peanuts': 'peanut',
                   'peppermint bark': 'peppermint', 'peppermint extract': 'peppermint', 'peppermint oil': 'peppermint', 'peppermint twists candy': 'peppermint', 'pistachio paste': 'pistachio',
                   'pistachios': 'pistachio', 'raspberries': 'raspberry', 'raspberry swirl': 'raspberry', 'sea salt': 'salt', 'organic skim milk': 'skim milk', 'lactose reduced skim milk': 'skim milk',
                   'nonfat milk': 'skim milk', 'soy lecithin': 'soybean lecithin', 'soybean lecithin natural flavor': 'soybean lecithin','soybean oils': 'soybean oil',
                   'expeller pressed soybean oil': 'soybean oil', 'spice': 'spices', 'strawberries': 'strawberry', 'strawberry swirl': 'strawberry',
                   'powdered sugar': 'sugar', 'cane sugar': 'sugar', 'liquid sugar': 'sugar syrup', 'sunflower oils': 'sunflower oil', 'tapioca flour': 'tapioca', 'tapioca starch': 'tapioca',
                   'tapioca syrup': 'tapioca', 'tara': 'tara gum', 'thiamin mononitrate': 'thiamine mononitrate', 'vanilla bean seeds': 'vanilla', 'vanilla beans': 'vanilla',
                   'vanilla extract': 'vanilla', 'ground vanilla': 'vanilla', 'processed vanilla': 'vanilla', 'vegetable gums': 'vegetable gum', 'vegetable oil coating': 'vegetable oil',
                   'waffle cone': 'waffle cone pieces', 'un wheat flour': 'wheat flour', 'bleached wheat flour': 'wheat flour', 'whey protein concentrate': 'whey',
                   'milk protein concentrate': 'whey', 'whey protein': 'whey', 'heavy cream': 'cream'
}

In [None]:
# using the dictionary to correct wrong words

df['ingredients_cleaned'] = df['ingredients_cleaned'].replace(word_correcting, regex=True)

df['ingredients_cleaned'] #checking results

In [None]:
df['ingredients_cleaned'] = df['ingredients_cleaned'].replace(' , ', ', ') # trim excess of whitespace

In [None]:
df['ingredients_cleaned'] = df['ingredients_cleaned'].replace(' , ', ', ') # trim excess of whitespace

In [None]:
df['ingredients_cleaned'] = df['ingredients_cleaned'].replace(',,', ', ') # trim excess of comma

In [None]:
df['ingredients_cleaned'] = df['ingredients_cleaned'].replace(r'"', '') # trim quotation marks

In [None]:
# is working?
df['ingredients_cleaned'] = df['ingredients_cleaned'].replace(to_replace=r'gum gum', value='gum', regex=True) # duplicated gum words

In [None]:
df['ingredients_cleaned'] = df['ingredients_cleaned'].replace(to_replace=r'oil oil', value='oil', regex=True) # duplicated oil words

In [None]:
df['ingredients_cleaned'] = df['ingredients_cleaned'].replace(to_replace=r'syrup syrup', value='syrup', regex=True) # duplicated syrup words

In [None]:
df['ingredients_cleaned'] = df['ingredients_cleaned'].replace(to_replace=r'peanuts', value='peanut', regex=True)

In [None]:
df['ingredients_cleaned'] = df['ingredients_cleaned'].replace(to_replace=r'palm oil kernel oil', value='palm oil', regex=True)

In [None]:
df['ingredients_cleaned'] = df['ingredients_cleaned'].replace(to_replace=r'palm oil kernel', value='palm oil', regex=True)

In [None]:
df['ingredients_cleaned'] = df['ingredients_cleaned'].replace(to_replace=r'vitamin a palm oilitate', value='vitamin a palmitate', regex=True)

In [None]:
df['ingredients_cleaned'] = df['ingredients_cleaned'].replace(to_replace=r'cocoa processed with alkali', value='cocoa powder', regex=True)

In [None]:
df['ingredients_cleaned'] = df['ingredients_cleaned'].replace(to_replace=r', cocoa,', value=', cocoa powder,', regex=True)
df['ingredients_cleaned'] = df['ingredients_cleaned'].replace(to_replace=r', cocoa,', value=', cocoa powder,', regex=True) # yeah... i really need to do this 2x

In [None]:
df['ingredients_cleaned'] = df['ingredients_cleaned'].replace(to_replace=r'spicess', value='spices', regex=True)

In [None]:
df['ingredients_cleaned'] = df['ingredients_cleaned'].replace(to_replace=r'soybean lecithin natural flavor', value='soybean lecithin', regex=True)

In [None]:
df['ingredients_cleaned'] = df['ingredients_cleaned'].replace(to_replace=r'^\s*,\s*', value='', regex=True) # trim ingredient list wich starts with ','

In [None]:
df['ingredients_cleaned'] = df['ingredients_cleaned'].replace(to_replace=r'^\s+', value='', regex=True) # trim ingredient list wich starts with whitespace

In [None]:
df['ingredients_cleaned'] = df['ingredients_cleaned'].replace(to_replace=r'\s*,\s*', value=', ', regex=True) # trim leading and trailing whitespace

In [None]:
df['ingredients_cleaned'] = df['ingredients_cleaned'].replace(to_replace=r',\s,\s', value=', ', regex=True) # trim excess of comma

In [None]:
df['ingredients_cleaned'] = df['ingredients_cleaned'].replace(to_replace=r'\s{2,}', value='', regex=True) # trim excess of whitespace

In [None]:
df['ingredients_cleaned'] = df['ingredients_cleaned'].replace(to_replace=r'\s+$', value='', regex=True) # remove all whitespaces at the end of a string

In [None]:
# small corrections

df = df.drop('ingredients', axis=1)

In [None]:
#checking the ingredients list

df['ingredients_cleaned']

* Data Treatment: unique ingredients list

In [None]:
# convert the ingredients list in a python list to correct problems like repeated words and excess of whitespace

all_ingredients = set()

for row in df.iterrows():
    ingredient_list = df.loc[row[0], 'ingredients_cleaned'].split(', ')
    for ingredient in ingredient_list:
        if ingredient not in all_ingredients:
            all_ingredients.add(ingredient)

In [None]:
#checking the unique ingredients list for 'dummerize' it

all_ingredients

In [None]:
unique_ingredients_lenght = len(all_ingredients)

print("There are " + str(unique_ingredients_lenght) + " unique ingredients")

In [None]:
# Common Ingredients for top and bottom ice cream

# Common Ingredients for top and bottom ice cream

Here, we'll be looking at the bottom ice cream ingredients for each brand and all of them together.

We'll mainly be looking at the top ingredients with the highest count.
reescrever

In [None]:
# Bottom Ben & Jerry's Ingredients

In [None]:
# bottom combined


In [None]:
#  wordcloud
#wordcloud2(bcombined_word, size=1, color='random-dark')


Ingredients Difference

In [None]:
# Find Difference between Top and Bottom Ingredients

#Ben & Jerry's Difference
# ntop x nbot x ndiff

In [None]:
df

# under: part 3

# Getting Dummy Matrix

we will get the dummy matrix and concatenate it with the dataframe

In [None]:
y = df.loc[:, 'rating']
X = df.copy()
#X = df.drop(['rating', 'name'], axis=1)
df_bckp = df.copy()

In [None]:
X.head()

In [None]:
X['ingredients_cleaned'] = X['ingredients_cleaned'].str.split(',')
ingredients_df = X['ingredients_cleaned']
ingredients_df.head()

In [None]:
# generating ingredient dummy matrix

# convert unique ingredients set to pandas series
ingredient_dummy = pd.DataFrame(columns=list(all_ingredients))

# checking if the wordlist contains the column name and generating the dummy matrix 
for column_name in ingredient_dummy.columns:
    ingredient_dummy[column_name] = X['ingredients_cleaned'].apply(lambda x: 1 if column_name in x else 0)

# concatenating the dummy matrix with X
X = pd.concat([X, ingredient_dummy], axis=1)
#X = X.drop('ingredients_cleaned', axis=1)

# checking the result
X

# Classifying ingredients into categories: common Ingredients for top and bottom ice cream

In [None]:
# attempt to classify ingredients according to fat, milk/water, sweetening, flavor, dye and other
# generating filter lists to select only some dummies

dye = ['artificial color', 'black carrot', 'red cabbage']
fat = ['butter', 'canola oil', 'carnauba wax', 'cocoa butter', 'coconut oil', 'corn oil', 'cottonseed oil', 'cream', 'hydrogenated vegetable oil', 'milk fat', 'palm oil', 'safflower oil', 'soybean oil', 'sunflower oil', 'vegetable oil']
flavor = ['almond extract', 'almonds', 'apple', 'balsamic vinegar', 'banana', 'belgian chocolate', 'black cherry', 'black raspberry', 'blackberry', 'blueberry', 'bourbon', 'carrot', 'cheesecake base', 'cheesecakewith graham crust', 'cherry', 'chocolate', 'chocolate cookie', 'chocolatey coated cone', 'chocolatey coated peanut', 'cinnamon', 'cloves', 'cocoa powder', 'coconut', 'coffee', 'cone', "confectioner's glaze", 'cream cheese', 'fruit', 'fudge coating', 'fudge covered waffle cone pieces', 'fudge swirl', 'ginger', 'graham crumb', 'graham flour', 'grape', 'green tea', 'hazelnuts', 'heath toffee bar', 'hibiscus', 'leavening', 'lemon', 'lemon oil', 'lemon peel', 'lime', "m&m's minis chocolate with cocoa butter", 'malted barley flour', 'mango', 'matcha green tea', 'mint leaf extractives', 'natural flavor', 'nutmeg', 'oakwood extract', 'oats', 'orange', 'oreo cookie', 'pea protein', 'peaches', 'peanut', 'peanut butter', 'peanut extract', 'peanut flour', 'peanut oil', 'pecans', 'peppermint', 'pineapple', 'pistachio', 'plum', 'pumpkin', 'raisins', 'raspberry', "reese's peanut butter", 'rum', 'salt', 'snickers', 'soybean', 'spices', 'strawberry', 'sweetened lemon zest', 'toffee', 'vanilla', 'vermouth', 'waffle cone pieces', 'walnuts', 'wheat bran', 'wheat germ', 'wheat graham flour', 'whiskey', 'wine']
milk_water = ['almond milk', 'milk', 'milk powder', 'milk protein', 'skim milk', 'water', 'whey']
other = ['artificial flavor', 'ascorbic acid', 'baking powder', 'baking soda', 'calcium carbonate', 'calcium phosphate', 'carob bean gum', 'carrageenan', 'carrot powder', 'cellulose gum', 'cheese culture', 'citric acid', 'cornstarch', 'cream of tartar', 'diglycerides', 'egg', 'egg white', 'egg yolk', 'enzymes', 'flour', 'folic acid', 'glycerin', 'guar gum', 'lactase', 'lactic acid', 'lactose', 'lecithin', 'locust bean gum', 'malic acid', 'malt extract', 'modified cornstarch', 'monocalcium phosphate', 'monoglycerides', 'niacin', 'pectin', 'pgpr', 'potassium sorbate', 'potato', 'potato flour', 'propylene glycol monoesters', 'reduced iron', 'riboflavin', 'rice flour', 'rice starch', 'sodium acid pyrophosphate', 'sodium bicarbonate', 'sodium caseinate', 'sodium citrate', 'soluble corn fiber', 'soybean lecithin', 'sunflower lecithin', 'tapioca', 'tara gum', 'tbhq', 'thiamine mononitrate', 'vanillin', 'vegetable', 'vegetable gum', 'vegetablecolor', 'vitamin a palmitate', 'wheat flour', 'xanthan gum', 'yeast extract']
sweetening = ['acesulfame potassium', 'barley malt', 'beet', 'brown sugar', 'cane syrup', 'caramel', 'caramel syrup', 'condensed milk', 'corn syrup', 'dextrin', 'dextrose', 'erythritol', 'fructose', 'honey', 'inverted sugar syrup', 'maltitol syrup', 'maltodextrin', 'maple syrup', 'molasses', 'polydextrose', 'sorbitol', 'sucralose', 'sugar', 'sugar syrup', 'sweetened condensed milk']

# generating aux brand filter
brand = ['brand_BenJerrys', 'brand_Breyers', 'brand_HaagenDazs', 'brand_Talenti']

* Comparing top and bot ingredients

we will create three function:
ingredient_dummy_filter
parse_top_bot_ingredient

In [None]:
# 1
# possible parameters
# ingredient_list: dye, fat, flavor, milk_water, other or sweetening

def ingredient_dummy_filter(ingredient_list):
    filtered = pd.concat([X.iloc[:, :5], X[ingredient_list]], axis=1)
    return filtered

In [104]:
# 2
# possible parameters
# brand (string): all, BenJerrys, Breyers, HaagenDazs, Talenti
# filtered (df):  filter_dye, filter_fat, filter_flavor, filter_milk_water, filter_other, filter_sweetening


def parse_top_bot_ingredients(brand, filtered):
    # classifying and parsing the top and bot df
    if brand == 'all':
        top = filtered.sort_values('rating', ascending=False).head(10)
        bot = filtered.sort_values('rating', ascending=False).tail(10)
    else:
        top = filtered[filtered['brand'] == brand].sort_values('rating', ascending=False).head(10)
        bot = filtered[filtered['brand'] == brand].sort_values('rating', ascending=False).tail(10)


    #creating a top10 and bot10 unique ingredients list
    top10_unique_ingredients = set()
    bot10_unique_ingredients = set()
    df_list = [top, bot]
    for topbotdf in df_list:
        for row in topbotdf.iterrows():
            ingredient_list = topbotdf.loc[row[0], 'ingredients_cleaned']
            for ingredient in ingredient_list:
                if ingredient not in top10_unique_ingredients:
                    top10_unique_ingredients.add(ingredient)
                if ingredient not in bot10_unique_ingredients:
                    bot10_unique_ingredients.add(ingredient)

    return top10_unique_ingredients, bot10_unique_ingredients

In [105]:
# 3
# possible parameters
# brand (string): 
# filtered (df): 

# count ocurrencies

def tb_ingredients_count(tb10_filtered, tb10_unique_set):
    # tb10 prefix is for top10 or bot10. this function works for both
    tb10_dict_ingredients = dict.fromkeys(tb10_unique_set, 0) # before counting ocurrencies, we need to create dictionary
    for ingredient in tb10_dict_ingredients:
        count = tb10_filtered['ingredients_cleaned'].str.count(ingredient, re.I).sum() # count of occurrences of the ingredient
        tb10_dict_ingredients[ingredient] = count
    return tb10_dict_ingredients

In [108]:
filtered_df_dye = ingredient_dummy_filter(dye)
filtered_df_fat = ingredient_dummy_filter(fat)
filtered_df_flavor = ingredient_dummy_filter(flavor)
filtered_df_milk_water = ingredient_dummy_filter(milk_water)
filtered_df_other = ingredient_dummy_filter(other)
filtered_df_sweetening = ingredient_dummy_filter(sweetening)

In [109]:
top10_unique_ingredients_dye, bot10_unique_ingredients_dye = parse_top_bot_ingredients('all', filtered_df_dye)
top10_unique_ingredients_fat, bot10_unique_ingredients_fat = parse_top_bot_ingredients('all', filtered_df_fat)
top10_unique_ingredients_flavor, bot10_unique_ingredients_flavor = parse_top_bot_ingredients('all', filtered_df_flavor)
top10_unique_ingredients_milk_water, bot10_unique_ingredients_milk_water = parse_top_bot_ingredients('all', filtered_df_milk_water)
top10_unique_ingredients_other, bot10_unique_ingredients_other = parse_top_bot_ingredients('all', filtered_df_other)
top10_unique_ingredients_sweetening, bot10_unique_ingredients_sweetening = parse_top_bot_ingredients('all', filtered_df_sweetening)

In [116]:
filtered_df_dye

Unnamed: 0,brand,name,rating,rating_count,ingredients_cleaned,artificial color,black carrot,red cabbage
0,BenJerrys,Salted Caramel Core,3.7,208,"[cream, skim milk, sugar syrup, water, brown sugar, sugar, milk, wheat flour, egg yolk, corn syrup, egg, butter, butter, pectin, salt, soybean oil, vanilla, guar gum, soybean lecithin, baking powder, baking soda, salt, carrageenan, lactase]",0,0,0
1,BenJerrys,Netflix & Chilll'd™,4.0,127,"[cream, skim milk, sugar syrup, water, sugar, peanut, wheat flour, canola oil, egg yolk, cornstarch, peanut oil, cocoa powder, salt, soybean oil, inverted sugar syrup, milk fat, egg, egg white, guar gum, soybean lecithin, tapioca, baking soda, carrageenan, vanilla, barley malt, malted barley flour]",0,0,0
2,BenJerrys,Chip Happens,4.7,130,"[cream, sugar syrup, skim milk, water, sugar, cocoa powder, potato, coconut oil, corn syrup, soybean oil, egg yolk, rice starch, sunflower oil, barley malt, cocoa powder, wheat flour, milk, salt, soybean lecithin, yeast extract, natural flavor, guar gum, salt, milk fat, vanilla, enzymes, carrageenan, baking soda]",0,0,0
3,BenJerrys,Cannoli,3.6,70,"[cream, skim milk, sugar syrup, water, corn syrup, coconut oil, sugar, cane syrup, egg yolk, wheat flour, milk, cocoa powder, natural flavor, guar gum, soybean lecithin, butter, natural flavor, locust bean gum, salt, citric acid, vanilla, soybean oil, tapioca, lactic acid, carrageenan]",0,0,0
4,BenJerrys,Gimme S’more!™,4.5,281,"[cream, skim milk, water, sugar syrup, sugar, canola oil, soybean oil, egg yolk, corn syrup, wheat flour, coconut oil, cornstarch, cocoa powder, corn syrup, cocoa powder, graham flour, salt, egg white, butter, tapioca, guar gum, soybean lecithin, baking soda, natural flavor, molasses, honey, vanilla, pectin, caramel syrup, carrageenan]",0,0,0
...,...,...,...,...,...,...,...,...
236,Breyers,CINNABON®,4.0,28,"[milk, corn syrup, sugar, brown sugar, soybean oil, water, butter, cream, salt, corn syrup, spices, salt, soybean lecithin, vanilla, cane syrup, wheat flour, coconut oil, water, butter, cream, salt, molasses, sodium bicarbonate, spices, corn syrup, soybean lecithin, salt, fructose, coconut oil, whey, cream, vegetable gum, guar gum, carob bean gum, monoglycerides, diglycerides, natural flavor]",0,0,0
237,Breyers,CarbSmart™ Caramel Swirl Bar,4.7,18,"[milk, water, caramel, sugar, water, corn syrup, corn syrup, milk powder, butter, cream, salt, salt, molasses, pectin, soybean lecithin, natural flavor, potassium sorbate, sodium citrate, lactic acid, maltitol syrup, polydextrose, coconut oil, whey, glycerin, cream, soluble corn fiber, vegetable gum, carob bean gum, guar gum, caramel, monoglycerides, diglycerides, natural flavor, acesulfame potassium, sucralose]",0,0,0
238,Breyers,Layered Dessert S'mores,2.5,31,"[milk, corn syrup, sugar, wheat flour, butter, cream, salt, palm oil, corn syrup, skim milk, water, rice flour, natural flavor, salt, wheat flour, sugar, palm oil, molasses, spices, salt, baking soda, natural flavor, soybean lecithin, fructose, coconut oil, cocoa powder, whey, cream, vegetable gum, guar gum, carob bean gum, monoglycerides, diglycerides, natural flavor, vanilla]",0,0,0
239,Breyers,Layered Dessert Peach Cobbler,3.2,38,"[milk, corn syrup, wheat flour, wheat flour, niacin, reduced iron, thiamine mononitrate, riboflavin, folic acid, sugar, butter, cream, salt, canola oil, milk powder, salt, sodium bicarbonate, peaches, fructose, coconut oil, sugar, whey, cream, vegetable gum, guar gum, carob bean gum, tara gum, monoglycerides, diglycerides, natural flavor, vanilla, artificial color, vanilla]",0,0,0


In [114]:
tb10_dict_ingredients = dict.fromkeys(top10_unique_ingredients_milk_water, 0) # before counting ocurrencies, we need to create dictionary
for ingredient in tb10_dict_ingredients:
    print(ingredient)
    count = filtered_df_milk_water['ingredients_cleaned'].str.count(ingredient, re.I).sum() # count of occurrences of the ingredient
    print(count)
    """PROBLEMA DETECTADO: O DICTIONARY ESTÁ PEGANDO WHITESPACE COMO SUFIXO"""
#    tb10_dict_ingredients[ingredient] = count
#return tb10_dict_ingredients
tb10_dict_ingredients

 peppermint
0.0
 water
0.0
 pecans
0.0
 molasses
0.0
 riboflavin
0.0
 carrageenan
0.0
 soybean lecithin
0.0
 maltodextrin
0.0
 brown sugar
0.0
 calcium phosphate
0.0
 flour
0.0
 artificial flavor
0.0
 fructose
0.0
 sorbitol
0.0
 cocoa powder
0.0
 caramel
0.0
cream
0.0
 spices
0.0
 glycerin
0.0
 niacin
0.0
 lemon peel
0.0
 peanut oil
0.0
 cane syrup
0.0
 lemon
0.0
 diglycerides
0.0
water
0.0
 baking soda
0.0
 peanut
0.0
 oreo cookie
0.0
 cellulose gum
0.0
 milk fat
0.0
 folic acid
0.0
 egg yolk
0.0
 milk
0.0
 cream
0.0
 canola oil
0.0
 vanilla
0.0
 sugar syrup
0.0
 sunflower lecithin
0.0
 banana
0.0
 wheat flour
0.0
 palm oil
0.0
 pectin
0.0
 chocolate
0.0
 whey
0.0
 corn syrup
0.0
 almonds
0.0
 milk powder
0.0
 butter
0.0
 thiamine mononitrate
0.0
 carob bean gum
0.0
 sunflower oil
0.0
 lactase
0.0
 cornstarch
0.0
 acesulfame potassium
0.0
 soybean oil
0.0
 honey
0.0
 reduced iron
0.0
 safflower oil
0.0
 skim milk
0.0
 tapioca
0.0
 tara gum
0.0
 monoglycerides
0.0
 salt
0.0
 condensed 

{' peppermint': 0,
 ' water': 0,
 ' pecans': 0,
 ' molasses': 0,
 ' riboflavin': 0,
 ' carrageenan': 0,
 ' soybean lecithin': 0,
 ' maltodextrin': 0,
 ' brown sugar': 0,
 ' calcium phosphate': 0,
 ' flour': 0,
 ' artificial flavor': 0,
 ' fructose': 0,
 ' sorbitol': 0,
 ' cocoa powder': 0,
 ' caramel': 0,
 'cream': 0,
 ' spices': 0,
 ' glycerin': 0,
 ' niacin': 0,
 ' lemon peel': 0,
 ' peanut oil': 0,
 ' cane syrup': 0,
 ' lemon': 0,
 ' diglycerides': 0,
 'water': 0,
 ' baking soda': 0,
 ' peanut': 0,
 ' oreo cookie': 0,
 ' cellulose gum': 0,
 ' milk fat': 0,
 ' folic acid': 0,
 ' egg yolk': 0,
 ' milk': 0,
 ' cream': 0,
 ' canola oil': 0,
 ' vanilla': 0,
 ' sugar syrup': 0,
 ' sunflower lecithin': 0,
 ' banana': 0,
 ' wheat flour': 0,
 ' palm oil': 0,
 ' pectin': 0,
 ' chocolate': 0,
 ' whey': 0,
 ' corn syrup': 0,
 ' almonds': 0,
 ' milk powder': 0,
 ' butter': 0,
 ' thiamine mononitrate': 0,
 ' carob bean gum': 0,
 ' sunflower oil': 0,
 ' lactase': 0,
 ' cornstarch': 0,
 ' acesulfam

In [110]:
hitop = tb_ingredients_count(filtered_df_milk_water,top10_unique_ingredients_milk_water)
hitop

TypeError: 'set' object is not subscriptable

In [102]:
top_aeho = tb_ingredients_count(top10_ingredients_milk_water)
aeho

TypeError: 'set' object is not subscriptable

In [None]:
hi1 = pd.Series(top10_dict_ingredients, name='top_count')
hi1

In [None]:
hi2 = pd.Series(bot10_dict_ingredients, name='bottom_count')
hi2

In [None]:
hi = pd.concat([hi1, hi2], axis=1)
hi = hi.fillna(0)
hi['diff'] = hi['top_count'] - hi['bottom_count']
hi = hi.sort_values('diff', ascending=False).reset_index().rename(columns={'index':'ingredient'})
#hi.to_csv('icecream2_topbot_diff.csv', index=True, header=True, encoding='utf-8')
hi

In [None]:
import seaborn as sns

In [None]:
sns.barplot(data=hi, x="diff", y="ingredient", orient='h', palette='Spectral')

* flavor

In [None]:
# selecting 4 first columns and concatenating with the selected ingredients based on a list filter
filtered = pd.concat([X.iloc[:, :4], X[flavor]], axis=1)

In [None]:
#pd.options.display.max_rows = None
#pd.options.display.max_columns = None