# Libraries

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Data exploration

In [2]:
# Read the raw cookie dataset into a DataFrame.
raw_path = '../Data/cookies.csv'
cookies = pd.read_csv(raw_path)

In [3]:
# Preview the first rows to check that the columns were parsed as expected.
cookies.head()

Unnamed: 0,sugar to flour ratio,sugar index,bake temp,chill time,calories,density,pH,grams baking soda,bake time,quality,butter type,weight,diameter,mixins,crunch factor,aesthetic appeal
0,0.25,9.5,300,15.0,136.0,0.99367,8.1,0.44,12.1,8,melted,15.2,7,raisins,1.3,3
1,0.23,3.3,520,34.0,113.0,0.99429,8.16,0.48,8.4,7,melted,12.4,7,raisins,1.71,3
2,0.18,1.9,360,33.0,106.0,0.98746,8.21,0.83,14.0,9,melted,9.4,7,"nuts, chocolate",1.78,3
3,0.18,10.5,490,41.0,124.0,0.9963,8.14,0.35,10.5,7,melted,12.2,7,chocolate,1.59,3
4,0.24,2.4,770,6.0,33.0,0.9974,8.09,0.57,9.4,5,cubed,19.8,7,"nuts, oats, chocolate",1.3,3


In [4]:
# Distribution of the target: observed quality scores range from 3 to 11.
cookies['quality'].value_counts()

8     1780
7     1323
9      707
6      617
5      577
10     137
4       44
3        8
11       5
Name: quality, dtype: int64

In [5]:
# Number of missing values in each column.
cookies.isna().sum()

sugar to flour ratio     0
sugar index              5
bake temp                0
chill time               0
calories                 0
density                  0
pH                       0
grams baking soda        0
bake time               10
quality                  0
butter type              0
weight                   0
diameter                 0
mixins                   2
crunch factor            0
aesthetic appeal         0
dtype: int64

In [6]:
# Remove every row that contains at least one missing value, modifying
# `cookies` in place.
cookies.dropna(axis=0, how='any', inplace=True)

In [7]:
# Check row count, non-null counts and dtypes after dropping incomplete rows.
cookies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5181 entries, 0 to 5197
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   sugar to flour ratio  5181 non-null   float64
 1   sugar index           5181 non-null   float64
 2   bake temp             5181 non-null   int64  
 3   chill time            5181 non-null   float64
 4   calories              5181 non-null   float64
 5   density               5181 non-null   float64
 6   pH                    5181 non-null   float64
 7   grams baking soda     5181 non-null   float64
 8   bake time             5181 non-null   float64
 9   quality               5181 non-null   int64  
 10  butter type           5181 non-null   object 
 11  weight                5181 non-null   float64
 12  diameter              5181 non-null   int64  
 13  mixins                5181 non-null   object 
 14  crunch factor         5181 non-null   float64
 15  aesthetic appeal     

In [None]:
# Summary statistics of the columns, to spot odd ranges or constant features.
cookies.describe()

## Correlations

In [10]:
# Separate the target (quality score) from the candidate features.
target_col = 'quality'
y = cookies[target_col]
X = cookies.drop(columns=[target_col])

In [11]:
# Correlation of each numeric feature with the quality score.
# Numeric columns are selected explicitly because X still contains the text
# columns 'butter type' and 'mixins': pandas >= 2.0 defaults corrwith to
# numeric_only=False and raises on them instead of silently skipping them.
X.select_dtypes(include='number').corrwith(y)

sugar to flour ratio    0.179929
sugar index             0.219959
bake temp              -0.470895
chill time              0.364925
calories                0.440080
density                -0.017523
pH                     -0.113725
grams baking soda      -0.306251
bake time               0.315445
weight                 -0.328534
diameter                     NaN
crunch factor           0.008346
aesthetic appeal        0.000143
dtype: float64

In [None]:
# Pairwise correlations between the int64/float64 columns, drawn as a
# lower-triangle heatmap (the upper triangle, diagonal included, is masked).
numeric_part = cookies.select_dtypes(include=['int64', 'float64'])
corr_matrix = numeric_part.corr()

upper_triangle = np.triu(np.ones_like(corr_matrix, dtype=bool))

fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(
    corr_matrix,
    mask=upper_triangle,
    annot=True,
    cmap="YlGnBu",
    linewidths=.5,
    ax=ax,
)
plt.show()

In [None]:
# Drop features with near-zero correlation to quality (density, crunch factor,
# aesthetic appeal) and diameter, whose correlation came out as NaN.
uninformative_cols = ['density', 'crunch factor', 'aesthetic appeal', 'diameter']
cookies.drop(columns=uninformative_cols, inplace=True)

In [None]:
# Show the frame after the column drop above.
cookies

# Dropping Outliers

In [None]:
# Remove outliers: keep only the rows whose int64/float64 values all lie
# within Z_THRESHOLD standard deviations of their column mean.
import scipy.stats as stats

Z_THRESHOLD = 3

z_scores = stats.zscore(cookies.select_dtypes(['int64', 'float64']))
abs_z_scores = np.abs(z_scores)
filtered_entries = (abs_z_scores < Z_THRESHOLD).all(axis=1)

# NOTE(review): 'quality' is an int64 column, so the target itself takes part
# in the filter — confirm that dropping rows with rare scores is intended.
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not raise SettingWithCopyWarning on a filtered slice.
cookies = cookies[filtered_entries].copy()

In [None]:
# Draw one boxplot per int64/float64 column, each in its own figure.
boxplots = cookies.select_dtypes(include=['int64', 'float64'])
column_list = list(boxplots.columns)

for column_name, column_values in boxplots.items():
    plt.figure(figsize=(12, 4))
    sns.boxplot(x=column_values)

In [None]:
# Disabled alternative to the z-score filter: an IQR-based outlier drop, kept
# inside a string literal so that none of it runs.
# NOTE(review): if re-enabled, the lower-bound drop recomputes the 25th
# percentile after the upper-bound drop has already mutated df in place,
# while the 1.5*IQR width was computed on the untrimmed data.
'''def drop_outliers(df, column_name):
    iqr = 1.5 * (np.percentile(df[column_name], 75) - np.percentile(df[column_name], 25))
    df.drop(df[df[column_name] > (iqr + np.percentile(df[column_name], 75))].index, inplace=True)
    df.drop(df[df[column_name] < (np.percentile(df[column_name], 25) - iqr)].index, inplace=True)'''

In [None]:
# Disabled (string literal, not executed): would call drop_outliers on
# `cookies` once for every name in column_list.
'''for item in column_list:
    drop_outliers(cookies, item)'''

In [None]:
# Disabled (string literal, not executed): would redraw the per-column
# boxplots, presumably to inspect the data after the IQR-based drop.
'''boxplots = cookies.select_dtypes(['int64', 'float64'])

for i in column_list:
    plt.figure(figsize=(12, 4))
    sns.boxplot(x=boxplots[i])'''

## Get dummies

In [None]:
# Category counts for butter type; plan: encode as a 0/1 flag (melted -> 1, cubed -> 0).
cookies['butter type'].value_counts()

In [None]:
# Encode butter type as a binary flag: melted -> 1, cubed -> 0.
# Any other value is left untouched by replace.
butter_codes = {'melted': 1, 'cubed': 0}
cookies['butter type'] = cookies['butter type'].replace(butter_codes)

In [None]:
# Counts of each mix-in combination.
# NOTE(review): the original note read "2 ways: 1) Others 2) all" — presumably
# two encoding options: lump rare combinations into an "Others" bucket, or
# create one indicator per individual mix-in.
cookies['mixins'].value_counts()

In [None]:
# Individual mix-ins that each get their own indicator column.
mixins_list = [
    'chocolate',
    'raisins',
    'oats',
    'nuts',
    'peanut butter',
]

In [None]:
# Add one 0/1 indicator column per mix-in: 1 when the mix-in's name occurs in
# the row's 'mixins' text, 0 otherwise.
for mixin in mixins_list:
    # regex=False: the names are literal substrings, not patterns.
    cookies[mixin] = cookies['mixins'].str.contains(mixin, regex=False).astype(int)

In [None]:
# The raw text column is no longer needed once the indicator columns exist.
cookies.drop(columns='mixins', inplace=True)

In [None]:
# Preview the frame after encoding butter type and the mix-ins.
cookies.head()

In [None]:
# Note: density, crunch factor, aesthetic appeal and diameter were already dropped in the Correlations section.

# Save datafile

In [None]:
# Write the cleaned frame to disk.
# NOTE(review): the row index is written as well (to_csv default); after the
# row filtering it has gaps and shows up as an unnamed first column on
# re-read — confirm downstream readers expect it.
clean_path = '../Data/cookies_clean.csv'
cookies.to_csv(clean_path)

In [None]:
# Final look at the cleaned frame that was just written to disk.
cookies