# Intro

* Loading Data and Libraries

In [2]:
# loading libraries

import numpy as np
import pandas as pd

In [3]:
# Load the pre-processed ice cream dataset and print its dimensions and column names.

df = pd.read_csv("pre-processed.csv")
print(f"the dataset has {df.shape[0]} observations (ice cream flavors) and {df.shape[1]} features {list(df.columns)}")

# dataset: https://www.kaggle.com/datasets/tysonpo/ice-cream-dataset

# Inspiration Notebooks:
# Using Ice Cream Ingredients to Predict Rating: https://www.kaggle.com/code/gcdatkin/using-ice-cream-ingredients-to-predict-rating
# Finding the Best Ice Cream: https://www.kaggle.com/code/kelvintran1998/finding-the-best-ice-cream
# EDA ideas: McDonalds Ice Cream Machines Breaking: https://www.kaggle.com/code/aashidutt3/eda-mcdonalds-ice-cream-machines-breaking

the dataset has 241 observations (ice cream flavors) and 5 features ['brand', 'name', 'rating', 'rating_count', 'ingredients']


In [4]:
df.head(10)

Unnamed: 0,brand,name,rating,rating_count,ingredients
0,BenJerrys,Salted Caramel Core,3.7,208,"CREAM, SKIM MILK, LIQUID SUGAR (SUGAR, WATER),..."
1,BenJerrys,Netflix & Chilll'd™,4.0,127,"CREAM, SKIM MILK, LIQUID SUGAR (SUGAR, WATER),..."
2,BenJerrys,Chip Happens,4.7,130,"CREAM, LIQUID SUGAR (SUGAR, WATER), SKIM MILK,..."
3,BenJerrys,Cannoli,3.6,70,"CREAM, SKIM MILK, LIQUID SUGAR (SUGAR, WATER),..."
4,BenJerrys,Gimme S’more!™,4.5,281,"CREAM, SKIM MILK, WATER, LIQUID SUGAR (SUGAR, ..."
5,BenJerrys,Peanut Butter Half Baked®,4.9,14,"CREAM, LIQUID SUGAR (SUGAR, WATER), SKIM MILK,..."
6,BenJerrys,Berry Sweet Mascarpone,4.6,10,"CREAM, SKIM MILK, WATER, LIQUID SUGAR (SUGAR, ..."
7,BenJerrys,Chocolate Peanut Butter Split,5.0,7,"CREAM, SKIM MILK, LIQUID SUGAR (SUGAR, WATER),..."
8,BenJerrys,Justice ReMix'd ™,4.3,110,"CREAM, LIQUID SUGAR (SUGAR, WATER), SKIM MILK,..."
9,BenJerrys,Boots on the Moooo’n™,4.7,42,"CREAM, LIQUID SUGAR (SUGAR, WATER), SKIM MILK,..."


# Getting Unique List of All Ingredients

* Data Cleaning

In [5]:
# increasing column character limit for better visualization:
# long ingredient strings are displayed in full instead of being cut off at the default width
pd.set_option('display.max_colwidth', 50000)

In [6]:
# lower-casing the raw ingredient text into a new working column
# (nothing is removed at this step; parentheses are handled in a later cell)
df['ingredients_cleaned'] = df['ingredients'].str.lower()
df['ingredients_cleaned'].head()

0                                   cream, skim milk, liquid sugar (sugar, water), water, brown sugar, sugar, milk, wheat flour, egg yolks, corn syrup, eggs, butter (cream, salt), butteroil, pectin, sea salt, soybean oil, vanilla extract, guar gum, soy lecithin, baking powder (sodium acid pyrophosphate, sodium bicarbonate, corn starch, monocalcium phosphate), baking soda, salt, carrageenan, lactase
1                                                                          cream, skim milk, liquid sugar (sugar, water), water, sugar, peanuts, wheat flour, canola oil, egg yolks, corn starch, peanut oil, cocoa powder, salt, soybean oil, invert cane sugar, milk fat, eggs, egg whites, guar gum, soy lecithin, tapioca starch, baking soda, carrageenan, vanilla extract, barley malt, malted barley flour
2    cream, liquid sugar (sugar, water), skim milk, water, sugar, cocoa (processed with alkali), potato, coconut oil, corn syrup solids, soybean oil, egg yolks, rice starch, sunflower oil, barley 

In [7]:
# Collect every distinct comma-separated token found in 'ingredients_cleaned'.
# Tokens are stored exactly as split (no whitespace trimming at this stage).
all_ingredients = set()

for row_label in df.index:
    # a set ignores values it already holds, so no explicit membership test is needed
    all_ingredients.update(df.loc[row_label, 'ingredients_cleaned'].split(','))

In [8]:
all_ingredients

# we can see several problems in the ingredient descriptions (e.g. sub-ingredients of an ingredient listed inline)

# 1) parentheses: the content inside parentheses (sub-ingredients) gets split into separate entries
# 2) "and" / "and/or" joining several ingredients
# 3) special characters

# PorterStemmer()
# stem() function
# takes each word and breaks it down to its root. This significantly reduces duplicated words (e.g. egg vs. eggs)

{'  water',
 ' acesulfame potassium',
 ' almond extract',
 ' almonds',
 ' almonds roasted in vegetable oil',
 ' and/or baking soda',
 ' and/or calcium phosphate',
 ' and/or canola oil',
 ' and/or palm oil',
 ' and/or sunflower oil)',
 ' anhydrous milkfat',
 ' annatto (color)',
 ' annatto (for color)',
 ' apple juice',
 ' artificial color',
 ' artificial flavor',
 ' artificial flavoring',
 ' artificial flavors',
 ' ascorbic acid',
 ' baking powder',
 ' baking powder (sodium acid pyrophosphate',
 ' baking soda',
 ' baking soda and/or calcium phosphate',
 ' baking soda. contains milk',
 ' balsamic vinegar (red wine vinegar',
 ' banana puree',
 ' bananas',
 ' barley malt',
 ' beet juice (for color)',
 ' belgian chocolate',
 ' black carrot concentrate (for color)',
 ' black cherries',
 ' black raspberries',
 ' black raspberry puree',
 ' blackberry juice concentrate',
 ' bleached wheat flour',
 ' blue 1',
 ' blue 1 lake',
 ' blue 2',
 ' blue 2 lake',
 ' blueberries',
 ' blueberry puree conce

In [9]:
# importing regex module
import re

In [10]:
# eliminating content inside parentheses '()'
# The pattern matches a "(" ... ")" pair that has no other parenthesis in between, and it is
# applied in a single pass, so with nested parentheses only the innermost group is removed.
# NOTE(review): for nested or unbalanced input, fragments such as "(" or ")" can remain after
# this step — confirm that the later clean-up cells are meant to cover those leftovers.
df['ingredients_cleaned'] = df['ingredients_cleaned'].str.replace(r'\([^()]*\)',"", regex=True)
df['ingredients_cleaned'].head() # checking

0                                                                                                            cream, skim milk, liquid sugar , water, brown sugar, sugar, milk, wheat flour, egg yolks, corn syrup, eggs, butter , butteroil, pectin, sea salt, soybean oil, vanilla extract, guar gum, soy lecithin, baking powder , baking soda, salt, carrageenan, lactase
1                                                   cream, skim milk, liquid sugar , water, sugar, peanuts, wheat flour, canola oil, egg yolks, corn starch, peanut oil, cocoa powder, salt, soybean oil, invert cane sugar, milk fat, eggs, egg whites, guar gum, soy lecithin, tapioca starch, baking soda, carrageenan, vanilla extract, barley malt, malted barley flour
2    cream, liquid sugar , skim milk, water, sugar, cocoa , potato, coconut oil, corn syrup solids, soybean oil, egg yolks, rice starch, sunflower oil, barley malt, cocoa powder, wheat flour, milk, salt, soy lecithin, yeast extract, natural flavor, guar gum, sea salt, m

In [11]:
# Truncates every 'ingredients_cleaned' entry at the first occurrence of a marker text.
def ingredients_processing(text, frame=None):
    """Keep only the part of each ingredient string that precedes ``text``.

    Rows that do not contain ``text`` are left unchanged. The marker is always
    matched literally (never as a regular expression), and missing values stay
    missing instead of breaking the row selection.

    Parameters
    ----------
    text : str
        Marker to cut at, e.g. ``'contains'`` or ``'.'``. An empty marker
        cannot delimit anything, so the column is left untouched.
    frame : pandas.DataFrame, optional
        Frame whose ``'ingredients_cleaned'`` column is rewritten in place.
        Defaults to the notebook-level ``df``.

    Returns
    -------
    None
        The column is updated as a side effect.
    """
    if not text:
        return

    # `df` is only looked up when no explicit frame is supplied
    target = df if frame is None else frame

    # partition() splits at the FIRST literal occurrence of the marker; column 0 holds the text
    # before it, or the whole string when the marker is absent
    target['ingredients_cleaned'] = target['ingredients_cleaned'].str.partition(text)[0]
    return

In [12]:
# problematic text: splitting on each marker and keeping only the text before it

# Markers are applied in list order; each call truncates every row at the first occurrence of
# the marker.
# NOTE(review): cutting at the first '.' also discards any ingredient text that follows a period
# (e.g. the later components of a multi-part product) — confirm this is intended.
problematic_text_ingredients_list = ['\ncontains', 'contains', '.']

for text in problematic_text_ingredients_list:
    ingredients_processing(text)

In [13]:
#checking results

df['ingredients_cleaned']

0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       cream, skim milk, liquid sugar , water, brown sugar, sugar, milk, wheat flour, egg yolks, corn syrup, eggs, butter , butteroil, pectin, sea salt, soybean oil, vanilla extract, guar gum, soy lecithin, baking powder , baking soda, salt, carrageenan, lactase
1                                                                                               

In [None]:
# Observations with ':'
# These observations strongly indicate that the ice cream has additional items (coating, caramel/fudge, chocolate chips, chocolate drizzle, peanut butter, etc.)

# 1) First, I'm going to create a column that flags the presence of additional items. Then, if necessary, I will make manual corrections.
# 2) Second, because the text before the first ':' is the label of the first item (the ice cream itself), I'll split that row on ':' and keep the second part, i.e. its ingredients ("ice cream: milk, ...")

In [14]:
# flagging ice cream flavors whose ingredient text contains ':' (a sign of additional items)
# The mask gets a descriptive name: assigning it to `bool` would shadow the Python builtin for
# the rest of the notebook session.
has_colon = df['ingredients_cleaned'].str.contains(":", regex=False)
df['have_additional_items'] = has_colon

In [15]:
# Inspect only the rows flagged as having additional items.

df.loc[df['have_additional_items'].eq(True)]

Unnamed: 0,brand,name,rating,rating_count,ingredients,ingredients_cleaned,have_additional_items
62,HaagenDazs,Caramel Soft Dipped Ice Cream Bar,4.9,8,"CARAMEL ICE CREAM: CREAM, SKIM MILK, SUGAR, EGG YOLKS, SWEETENED CONDENSED MILK (CONDENSED MILK SUGAR), CORN SYRUP, SALT, PECTIN, BAKING SODA, VANILLA EXTRACT. CHOCOLATY COATING: SUGAR, COCOA PROCESSED WITH ALKALI, CANOLA OIL, PALM OIL, MILKFAT, SOY LECITHIN. SALTED CARAMEL SWIRL: CORN SYRUP, SWEETENED CONDENSED MILK (CONDENSED MILK SUGAR), CREAM, SUGAR, WATER, BUTTER (CREAM, SALT), SALT, PECTIN, MILKFAT, SOY LECITHIN.\nCONTAINS: MILK, EGG AND SOY INGREDIENTS","caramel ice cream: cream, skim milk, sugar, egg yolks, sweetened condensed milk , corn syrup, salt, pectin, baking soda, vanilla extract",True
67,HaagenDazs,Chocolate Soft Dipped Ice Cream Bar,2.6,25,"CHOCOLATE ICE CREAM: CREAM, SKIM MILK, SUGAR, COCOA PROCESSED WITH ALKALI, EGG YOLKS. CHOCOLATY COATING: SUGAR, COCOA PROCESSED WITH ALKALI, CANOLA OIL, PALM OIL, SOY LECITHIN.\nCONTAINS: MILK, EGG AND SOY INGREDIENTS","chocolate ice cream: cream, skim milk, sugar, cocoa processed with alkali, egg yolks",True
84,HaagenDazs,Double Belgian Chocolate Chip Ice Cream,4.8,158,"CHOCOLATE ICE CREAM: CREAM, SKIM MILK, DARK CHOCOLATE (BELGIAN CHOCOLATE, SUGAR, COCOA BUTTER, SOY LECITHIN, VANILLA), SUGAR, EGG YOLKS, COCOA PROCESSED WITH ALKALI, VANILLA EXTRACT. CHOCOLATE CHUNKS WITH VEGETABLE OIL: BELGIAN CHOCOLATE (SUGAR, CHOCOLATE, COCOA BUTTER, SOY LECITHIN, NATURAL FLAVOR), COTTONSEED OIL, COCONUT OIL. CONTAINS: MILK, EGG AND SOY INGREDIENTS","chocolate ice cream: cream, skim milk, dark chocolate , sugar, egg yolks, cocoa processed with alkali, vanilla extract",True
85,HaagenDazs,Dulce de Leche Cookie Squares,3.9,35,"ULCE DE LECHE ICE CREAM: CREAM, SKIM MILK, SUGAR, SWEETENED CONDENSED MILK (MILK,SUGAR), EGG YOLKS, CORN SYRUP, BAKING SODA, SALT, VANILLA EXTRACT. MILK CHOCOLATE AND VEGETABLE OIL COATING: MILK CHOCOLATE (SUGAR, WHOLE MILK POWDER, CHOCOLATE, COCOA BUTTER, SOY LECITHIN, VANILLA EXTRACT), COCONUT OIL. CHOCOLATE COOKIE: BLEACHED WHEAT FLOUR, SUGAR, PALM OIL, COCOA PROCESSED WITH ALKALI, CORN SYRUP, MOLASSES, BAKING SODA, SALT, SOY LECITHIN. CARAMEL SWIRL: SWEETENED CONDENSED MILK (MILK, SUGAR), SUGAR, WATER, CORN SYRUP, COCONUT OIL, PECTIN, SOY LECITHIN, SALT, BAKING SODA, VANILLA EXTRACT. DARK CHOCOLATE DRIZZLE WITH VEGETABLE OIL: DARK CHOCOLATE (SUGAR, CHOCOLATE, SOY LECITHIN, VANILLA EXTRACT), COCONUT OIL, SOYBEAN OIL","ulce de leche ice cream: cream, skim milk, sugar, sweetened condensed milk , egg yolks, corn syrup, baking soda, salt, vanilla extract",True
99,HaagenDazs,Peanut Butter Chocolate Fudge Non-Dairy Bar,4.8,32,"PEANUT BUTTER FROZEN DESSERT: WATER, CORN SYRUP, SUGAR, PEANUTS, PEANUT OIL, COCONUT OIL, PECTIN, SALT. DARK CHOCOLATE AND VEGETABLE OIL COATING: DARK CHOCOLATE (SUGAR, CHOCOLATE, SOY LECITHIN, VANILLA EXTRACT), COCONUT OIL, SOYBEAN OIL. SALTED FUDGE SWIRL: POWDERED CANE SUGAR (CANE SUGAR, CORN STARCH), SUNFLOWER OIL, COCOA, SALT, SUNFLOWER LECITHIN","peanut butter frozen dessert: water, corn syrup, sugar, peanuts, peanut oil, coconut oil, pectin, salt",True
101,HaagenDazs,Peppermint Bark Ice Cream Bar,5.0,8,"WHITE CHOCOLATE ICE CREAM: CREAM, SKIM MILK, SUGAR, CORN SYRUP, EGG YOLKS, NATURAL FLAVOR, COCOA BUTTER. DARK CHOCOLATE AND VEGETABLE OIL COATING WITH PEPPERMINT CANDY PIECES: DARK CHOCOLATE (SUGAR, CHOCOLATE, SOY LECITHIN, VANILLA EXTRACT), PEPPERMINT CANDY PIECES: (SUGAR, CORN SYRUP, COCONUT OIL, NATURAL FLAVOR, SOY LECITHIN), COCONUT OIL, SOYBEAN OIL","white chocolate ice cream: cream, skim milk, sugar, corn syrup, egg yolks, natural flavor, cocoa butter",True
103,HaagenDazs,Pistachio Ice Cream,3.8,64,"WEET CREAM ICE CREAM: CREAM, SKIM MILK, SUGAR, LACTOSE REDUCED SKIM MILK, CORN SYRUP, EGG YOLKS, SALT. ROASTED PISTACHIOS: PISTACHIOS, SAFFLOWER OIL","weet cream ice cream: cream, skim milk, sugar, lactose reduced skim milk, corn syrup, egg yolks, salt",True
117,HaagenDazs,Vanilla Soft Dipped Ice Cream Bar,4.4,13,"VANILLA ICE CREAM: CREAM, SKIM MILK, SUGAR, EGG YOLKS, VANILLA EXTRACT. CHOCOLATY COATING: SUGAR, COCOA PROCESSED WITH ALKALI, CANOLA OIL, PALM OIL, SOY LECITHIN.\nCONTAINS: MILK, EGG AND SOY INGREDIENTS","vanilla ice cream: cream, skim milk, sugar, egg yolks, vanilla extract",True
157,Talenti,ORGANIC OAK-AGED VANILLA GELATO,4.8,19,"INGREDIENTS: ORGANIC SKIM MILK, ORGANIC CREAM, ORGANIC CANE SUGAR, ORGANIC DEXTROSE, OAKWOOD EXTRACT, ORGANIC CAROB BEAN GUM, ORGANIC VANILLA EXTRACT","ingredients: organic skim milk, organic cream, organic cane sugar, organic dextrose, oakwood extract, organic carob bean gum, organic vanilla extract",True


In [None]:
# manual correction
# only row 157 (Talenti Organic Oak-Aged Vanilla Gelato) needs correction:
# its text starts with the generic label "ingredients:", so the ':' there does not
# point to additional items

df.loc[157, 'have_additional_items'] = False

In [None]:
# observations with ':' problem - splitting on ': ' and keeping only the second piece
#
# Rows are selected with the same delimiter that is split on (': '). Selecting on a bare ':'
# would turn an entry such as "label:value" (no space after the colon) into NaN, because
# splitting on ': ' yields a single piece there and `.str[1]` is then missing.
# The mask is not called `bool`, to avoid shadowing the Python builtin.
has_delimiter = df['ingredients_cleaned'].str.contains(": ", regex=False)
after_label = df.loc[has_delimiter, 'ingredients_cleaned'].str.split(": ").str[1]  # second piece, not the first
df['ingredients_cleaned'] = df['ingredients_cleaned'].mask(has_delimiter, after_label)

In [None]:
df['ingredients_cleaned']

* Data Treatment: Manual Correction

In [None]:
# tests
#df[df['ingredients_cleaned'].str.contains("mono and diglycerides", regex=False)] == True
#df[df['ingredients_cleaned'].str.contains(" and ", regex=False)] == True
#df[df['ingredients_cleaned'].str.contains("and/or", regex=False)] == True
#df[df['ingredients_cleaned'].str.contains("†", regex=False)] == True
#df[df['ingredients_cleaned'].str.contains("*", regex=False)] == True
#df[df['ingredients_cleaned'].str.contains(")", regex=False)] == True
#df[df['ingredients_cleaned'].str.contains("/", regex=False)] == True
df[df['ingredients_cleaned'].str.contains("vanilla ice cream", regex=False) == True]

#81 have_add

In [None]:
# treating the problem with 'mono and diglycerides'

df['ingredients_cleaned'] = df['ingredients_cleaned'].str.replace("mono and diglycerides", "monoglycerides, diglycerides", regex=False)

In [None]:
# treating the problem with 'and/or' + 'and':
# both connectors are replaced by a comma (literal match) so that the joined ingredients become
# separate comma-separated entries; ' and ' is matched only with a space on each side
unwanted_text = [' and ', 'and/or']

for word in unwanted_text:
    df['ingredients_cleaned'] = df['ingredients_cleaned'].str.replace(word, ',', regex=False)

In [None]:
# replacing strange text with '' (literal matches, applied in list order)
#
# Order matters because each removal works on the result of the previous one:
# - a word that contains another listed word must come first ('unenriched' before 'enriched',
#   'concentrates' before 'concentrate'); otherwise removing the shorter word leaves a stray
#   fragment behind (e.g. "unenriched" -> "un")
# - 'vegetable s' stays after 'juice', as in the original ordering

unwanted_text = [
    '†', ')', 'organic', 'pasteurized', '(sugar', 'unbleached',
    'unenriched', 'enriched',
    'vanilla ice cream', 'coffee ice cream', 'white chocolate ice cream',
    'unsweetened', 'whole', 'pieces',
    'concentrates', 'concentrate',
    'juice', 'puree', 'vegetable s', ' lake',
]

for word in unwanted_text:
    df['ingredients_cleaned'] = df['ingredients_cleaned'].str.replace(word, '', regex=False)

In [None]:
# replacing stray symbols with a single space (literal match, not regex)

unwanted_text = ['*', '/']

for word in unwanted_text:
    df['ingredients_cleaned'] = df['ingredients_cleaned'].str.replace(word, ' ', regex=False)

In [None]:
# Word Correcting

# Manual synonym table: maps ingredient spellings/variants to one canonical name.
# PorterStemmer (suffix-based stemming for English) was considered, but after some tests it did
# not look reliable enough here, so the corrections are listed by hand.
#
# The keys are regular-expression patterns (the table is applied with `replace(..., regex=True)`)
# and they match anywhere inside the text, in the order listed. Two consequences are handled:
# - a key that is contained in its own replacement (e.g. 'guar' -> 'guar gum') carries a negative
#   lookahead, so text that is already canonical is not expanded again ("guar gum gum")
# - when one key is a prefix of another, the longer key is listed first
#   ('palm kernel oil' before 'palm kernel'), otherwise the result would be "palm oil oil"
# Exact duplicate keys and the no-op entry 'cherry' -> 'cherry' were dropped.

# Creating a dictionary
word_correcting = {
    # reese's
    "reese's mini ": "reese's peanut butter",
    "reese's peanut butter cup ": "reese's peanut butter",
    "reese's peanut butter cups": "reese's peanut butter",
    "reese's peanut butter sauce": "reese's peanut butter",
    "reese's peanut butter swirl": "reese's peanut butter",
    # almonds / apples
    'almonds roasted in vegetable oil': 'almonds',
    'roasted almonds': 'almonds',
    'dried apples': 'apple',
    'apples': 'apple',
    # colors and flavors
    'artificial color lake': 'artificial color',
    'color added': 'artificial color',
    'annatto': 'artificial color',
    'blue 1': 'artificial color',
    'blue 1 lake': 'artificial color',
    'blue 2': 'artificial color',
    'blue 2 lake': 'artificial color',
    'yellow 5': 'artificial color',
    'yellow 5 lake': 'artificial color',
    'yellow 6': 'artificial color',
    'yellow 6 lake': 'artificial color',
    'red 40': 'artificial color',
    'red 40 lake': 'artificial color',
    'reb a': 'artificial color',
    'artificial flavoring': 'artificial flavor',
    'artificial flavors': 'artificial flavor',
    # fruits
    'banana puree': 'banana',
    'bananas': 'banana',
    'black carrot s': 'black carrot',
    'black raspberry puree': 'black raspberry',
    'black raspberries': 'black raspberry',
    'blueberries': 'blueberry',
    # butter / syrups / caramel
    'butteroil': 'butter',
    'butterfat': 'butter',
    'butter oil': 'butter',
    'dried cane syrup': 'cane syrup',
    'caramelized sugar': 'caramel',
    'caramel color': 'caramel',
    'caramel flavor': 'caramel',
    'caramel swirl': 'caramel',
    'caramel syrup': 'caramel',
    'caramelized sugar syrup': 'caramelized sugar',
    # gums / cultures
    'carob bean(?! gum)': 'carob bean gum',
    'carob gum': 'carob bean gum',
    'cheese culture(?!s)': 'cheese cultures',
    # cherry
    'cherries': 'cherry',
    'cherry puree': 'cherry',
    'cherry concentrate': 'cherry',
    'cherry juice concentrate': 'cherry',
    # chocolate / cocoa
    'chocolate chip cookies': 'chocolate',
    'chocolate chips': 'chocolate',
    'chocolate cookie pieces': 'chocolate',
    'chocolate flavored coating': 'chocolate',
    'chocolate liquor': 'chocolate',
    'chocolate processed with alkali': 'chocolate',
    'chocolatey chips': 'chocolate',
    'milk chocolate candies': 'chocolate',
    'milk chocolate': 'chocolate',
    'dark chocolate': 'chocolate',
    'semi-sweet chocolate chunks': 'chocolate',
    'chocolaty coated cone': 'chocolatey coated cone',
    'black cocoa processed with alkali': 'cocoa processed with alkali',
    # coconut / coffee
    'coconut concentrate': 'coconut',
    'coconut cream': 'coconut',
    'coconut extract': 'coconut',
    'desiccated coconut': 'coconut',
    'coffee extract': 'coffee',
    'coffee extract concentrate': 'coffee',
    # condensed milk
    'condensed skim milk': 'condensed milk',
    'evaporated milk': 'condensed milk',
    'sweetened condensed milk': 'condensed milk',
    'sweetened condensed skim milk': 'condensed milk',
    # corn
    'corn syrup solids': 'corn syrup',
    'high fructose corn syrup': 'corn syrup',
    'corn starch': 'cornstarch',
    'modified corn starch': 'cornstarch',
    'modified cornstarch': 'cornstarch',
    'modified food starch': 'cornstarch',
    # eggs
    'eggs': 'egg',
    'whole egg': 'egg',
    'whole eggs': 'egg',
    'egg whites': 'egg white',
    'egg yolks': 'egg yolk',
    # misc
    'guar(?! gum)': 'guar gum',
    'ground heath toffee': 'heath toffee bar',
    'hibiscus flower': 'hibiscus',
    'hibiscus powder': 'hibiscus',
    'invert cane sugar': 'inverted sugar syrup',
    'invert sugar': 'inverted sugar syrup',
    'lactase enzyme': 'lactase',
    'lemon juice': 'lemon',
    'lemon juice concentrate': 'lemon',
    'locust bean(?! gum)': 'locust bean gum',
    'dry malt extract': 'malt extract',
    'maltitol(?! syrup)': 'maltitol syrup',
    'corn maltodextrin': 'maltodextrin',
    'mango puree': 'mango',
    'mangos': 'mango',
    # milk
    'organic milk': 'milk',
    'anhydrous milkfat': 'milk fat',
    'milkfat': 'milk fat',
    'nonfat milk solids': 'milk powder',
    'nonfat dry milk': 'milk powder',
    'whole milk powder': 'milk powder',
    'skim milk powder': 'milk powder',
    'natural flavors': 'natural flavor',
    'rolled oats': 'oats',
    # palm
    'vitamin a palm oil oilitate': 'palm oil',
    '^palm$': 'palm oil',
    'palm kernel oil': 'palm oil',
    'palm kernel': 'palm oil',
    # peanut / peppermint / pistachio / raspberry
    'partially defatted peanut flour': 'peanut',
    'roasted peanuts': 'peanuts',
    'peppermint bark': 'peppermint',
    'peppermint extract': 'peppermint',
    'peppermint oil': 'peppermint',
    'peppermint twists candy': 'peppermint',
    'pistachio paste': 'pistachio',
    'pistachios': 'pistachio',
    'raspberries': 'raspberry',
    'raspberry swirl': 'raspberry',
    # salt / skim milk / soy
    'sea salt': 'salt',
    'organic skim milk': 'skim milk',
    'lactose reduced skim milk': 'skim milk',
    'nonfat milk': 'skim milk',
    'soy lecithin': 'soybean lecithin',
    'soybean lecithin natural flavor': 'soybean lecithin',
    'soy lecithin natural flavor': 'soybean lecithin',
    'soybean oils': 'soybean oil',
    'expeller pressed soybean oil': 'soybean oil',
    'spice(?!s)': 'spices',
    # strawberry / sugar / sunflower
    'strawberries': 'strawberry',
    'strawberry swirl': 'strawberry',
    'powdered sugar': 'sugar',
    'cane sugar': 'sugar',
    'liquid sugar': 'sugar syrup',
    'sunflower oils': 'sunflower oil',
    # tapioca / tara / thiamine
    'tapioca flour': 'tapioca',
    'tapioca starch': 'tapioca',
    'tapioca syrup': 'tapioca',
    'tara(?! gum)': 'tara gum',
    'thiamin mononitrate': 'thiamine mononitrate',
    # vanilla
    'vanilla bean seeds': 'vanilla',
    'vanilla beans': 'vanilla',
    'vanilla extract': 'vanilla',
    'ground vanilla': 'vanilla',
    'processed vanilla': 'vanilla',
    # remaining
    'vegetable gums': 'vegetable gum',
    'waffle cone(?! pieces)': 'waffle cone pieces',
    'un wheat flour': 'wheat flour',
    'bleached wheat flour': 'wheat flour',
    'whey protein concentrate': 'whey',
    'milk protein concentrate': 'whey',
    'whey protein': 'whey',
    'heavy cream': 'cream',
}

In [None]:
# applying the correction dictionary
# With regex=True every key is used as a regular-expression pattern and each match inside the
# text is substituted, so a key can also match in the middle of a longer ingredient name.

df['ingredients_cleaned'] = df['ingredients_cleaned'].replace(word_correcting, regex=True)

In [None]:
df['ingredients_cleaned']

In [None]:
# Rebuild the set of distinct ingredients from the cleaned text, trimming spaces around each token.
all_ingredients = set()

for row_label in df.index:
    text = df.loc[row_label, 'ingredients_cleaned']
    # collapse " , " and ",," left behind by the earlier removals
    text = text.replace(' , ', ', ').replace(',,', ', ')
    # strip leading/trailing spaces from every token; the set takes care of duplicates
    all_ingredients.update(re.sub(r"^ +| +$", "", token) for token in text.split(','))

In [None]:
# drop the empty-string entry, if there is one; unlike remove(), discard() does not raise
# KeyError when '' is absent (e.g. when this cell is executed a second time)
all_ingredients.discard('')

In [None]:
all_ingredients

In [None]:
# Turn each ingredient string into a list of clean tokens: surrounding whitespace is stripped and
# empty tokens (from doubled or trailing commas) are dropped, so that e.g. 'cream' and ' cream'
# do not end up as two different labels downstream.
# Non-string values (missing data, or lists from a previous run of this cell) are left as they are.
df['ingredients_cleaned'] = df['ingredients_cleaned'].map(
    lambda text: [token.strip() for token in text.split(',') if token.strip()]
    if isinstance(text, str) else text
)

In [None]:
# NOTE(review): this line is not valid Python; it looks like a deliberate stop marker so that
# "Run All" halts here, before the unfinished cells below — confirm, and consider replacing it
# with an explicit `raise`.
stop code

# Getting Dummy Matrix

In [None]:
y = df.loc[:, 'rating']
X = df.drop('rating', axis=1)
df_bckp = df.copy()

In [None]:
def dummy(df, column, prefix):
    """Return a copy of ``df`` with ``column`` replaced by its one-hot encoding.

    The indicator columns are named ``<prefix>_<value>`` and are appended after
    the existing columns; the input frame itself is not modified.
    """
    encoded = pd.get_dummies(df[column], prefix=prefix)
    return pd.concat([df, encoded], axis=1).drop(column, axis=1)

In [None]:
X = dummy(X, 'brand', 'b')

In [None]:
X

In [None]:
ingredients_df = X['ingredients_cleaned']
ingredients_df

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

In [None]:
mlb = MultiLabelBinarizer()

ingredients_df = pd.DataFrame(mlb.fit_transform(ingredients_df), columns=mlb.classes_)

In [None]:
for col in ingredients_df.columns:
    print(col)

In [None]:
# Build a purely numeric design matrix before splitting: X still carries free-text and list
# columns that an estimator cannot consume, and the binarized ingredient matrix
# (ingredients_df) has not been joined to it yet.
numeric_part = X.select_dtypes(include=['number', 'bool'])
ingredient_part = ingredients_df.set_axis(X.index, axis=0)  # give the indicator rows the same labels as X
features = pd.concat([numeric_part, ingredient_part], axis=1)

X_train, X_test, y_train, y_test = train_test_split(features, y, train_size=0.7, random_state=100)

In [None]:
X_train

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
model = LinearRegression()

model.fit(X_train, y_train)

* Ingredients without Order

In [None]:
# one column per ingredient position (keeps the order in which the ingredients are listed)
# split: splits the 'ingredients' column using the comma as separator

new_ingredients = df['ingredients'].str.split(",", expand=True) 
new_ingredients.head()

In [None]:
# all unique ingredients: distinct comma-separated tokens of the raw 'ingredients' column
all_ingredients = set()

for row_label in df.index:
    # a set ignores values it already holds, so no explicit membership test is needed
    all_ingredients.update(df.loc[row_label, 'ingredients'].split(','))

* Ingredients with Order

In [None]:
# append " <position>" to every cell, where position = column label + 1
# NOTE(review): assumes the column labels are the integers produced by the positional split — confirm
for col in new_ingredients.columns:
    new_ingredients[col] = new_ingredients[col] + " " + str((col + 1))
new_ingredients.head()

In [None]:
new_ingredients = new_ingredients.stack().value_counts().reset_index()

# stack: reshapes the frame so that all cells end up in one Series with an extra inner index level
# value_counts: counts how many times each ingredient appears
# reset_index: moves the index into a regular column (the idea is to then name the columns)

In [None]:
new_ingredients.columns = ['Word', 'Frequency']
new_ingredients

In [None]:
# checks for unique values for check any problem

In [None]:
# https://www.fda.gov/food/food-ingredients-packaging/overview-food-ingredients-additives-colors

## Chi Squared Feature Selection

In [None]:
# https://machinelearningmastery.com/feature-selection-with-categorical-data/

## Deprecated

In [None]:
# showing the problem with "contains:" and "\ncontains:"
bool = df['ingredients_cleaned'].str.contains('\ncontains', regex=False)
df['ingredients_cleaned'][bool == True] # checking

In [None]:
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [None]:
# eliminating content after '\ncontains'
# NOTE(review): the replacement values are computed from the rows that contain ':', while the
# mask `bool` is whatever a previous cell last assigned to that name — confirm that both are
# meant to select the same rows.
val_to_replace = df['ingredients_cleaned'][df['ingredients_cleaned'].str.contains(":", regex=False) == True].str.split('\ncontains').str[0]
df['ingredients_cleaned'] = df['ingredients_cleaned'].mask(bool, val_to_replace)

In [None]:
df['ingredients_cleaned'][117]

In [None]:
"""# Draw a scatter plot while assigning point colors and sizes to different variables in the dataset
f, ax = plt.subplots(figsize=(6.5, 6.5))
sns.despine(f, left=True, bottom=True)
clarity_ranking = ["bj", "breyers", "hd", "talenti"]
sns.scatterplot(x="rating", y="rating_count",
                hue="rating",
                palette="ch:r=-.2,d=.3_r",
                hue_order=clarity_ranking,
                sizes=(1, 8), linewidth=0,
                data=df, ax=ax)"""

In [None]:
"""f, ax = plt.subplots(figsize=(6.5, 6.5))
sns.despine(f, left=True, bottom=True)
clarity_ranking = ["bj", "breyers", "hd", "talenti"]
sns.scatterplot(data=df,
                x="rating", y="rating_count",
                hue="brand",
                style="brand",
                sizes=(1, 8), linewidth=0
                )"""