# **Setup**

In [1]:
# Standard library
import re
from string import punctuation

# Third-party
import numpy as np
import pandas as pd
import en_core_web_sm
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Shared NLP resources used by the text pre-processing cell.
stop_words = set(stopwords.words('english'))  # NLTK English stop words, held as a set
punctuation = list(punctuation)  # rebinds the imported string as a list of single characters
lemmatizer = WordNetLemmatizer()
nlp = en_core_web_sm.load()  # spaCy English pipeline

# **Import the Data**

In [2]:
# Read the Vertuo and Original worksheets separately, keeping only rows whose Status is "Current".
dfVertuo_Raw = pd.read_excel("../data/CoffeeData.xlsx", sheet_name="Vertuo", index_col=False)
dfVertuo_Raw = dfVertuo_Raw.loc[dfVertuo_Raw["Status"] == "Current"]

dfOriginal_Raw = pd.read_excel("../data/CoffeeData.xlsx", sheet_name="Original", index_col=False)
dfOriginal_Raw = dfOriginal_Raw.loc[dfOriginal_Raw["Status"] == "Current"]

# Stack both ranges (Vertuo rows first) under a fresh 0..n-1 row index.
df = pd.concat([dfVertuo_Raw, dfOriginal_Raw], ignore_index=True)
df.head(1)

Unnamed: 0,ID,Name,Type,Serving,Serving Size,Headline,Intensity,Sleeve Price,Per Capsule Price,Caption,...,Creamy Texture,Ingredients & Allergens,Number of Capsules per Sleeve,Net Weight per Total Number of Capsules,Capsule Image Link,Capsule & Sleeve Image Link,Decaf Coffee?,Category,Other Information,Status
0,VL01,Intenso,Vertuo,Coffee,230ml,Smooth & Strong,9.0,12.6,1.26,Why we love it: Try Intenso - a Vertuo coffee ...,...,,Roast and ground coffee,10,125 g,https://www.nespresso.com/ecom/medias/sys_mast...,https://www.nespresso.com/shared_res/agility/n...,No,Signature Coffee,,Current


In [3]:
# Summarise column dtypes and non-null counts to see which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70 entries, 0 to 69
Data columns (total 30 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   ID                                       70 non-null     object 
 1   Name                                     70 non-null     object 
 2   Type                                     70 non-null     object 
 3   Serving                                  70 non-null     object 
 4   Serving Size                             70 non-null     object 
 5   Headline                                 70 non-null     object 
 6   Intensity                                55 non-null     float64
 7   Sleeve Price                             70 non-null     float64
 8   Per Capsule Price                        70 non-null     float64
 9   Caption                                  70 non-null     object 
 10  Taste                                    70 non-null

In [4]:
# List the column names available after loading.
df.columns

Index(['ID', 'Name', 'Type', 'Serving', 'Serving Size', 'Headline',
       'Intensity', 'Sleeve Price', 'Per Capsule Price', 'Caption', 'Taste',
       'Best Served As', 'Notes', 'Acidity', 'Bitterness', 'Roastness', 'Body',
       'Milky Taste', 'Bitterness with Milk', 'Roastiness with Milk',
       'Creamy Texture', 'Ingredients & Allergens',
       'Number of Capsules per Sleeve',
       'Net Weight per Total Number of Capsules', 'Capsule Image Link',
       'Capsule & Sleeve Image Link', 'Decaf Coffee?', 'Category',
       'Other Information', 'Status'],
      dtype='object')

# **Data Cleaning**

In [5]:
# HANDLE NULL INTENSITY COLUMN VALUES
# Rows with no Intensity get a fixed value chosen by product name:
#   Carafe Pour-Over Style Mild -> 5
#   Carafe Pour-Over Style      -> 7
#   Bianco Forte                -> 7
#   any other name              -> 6 (per the original note, these are the remaining Barista Creations)
intensityByName = {
    "Carafe Pour-Over Style Mild": 5,
    "Carafe Pour-Over Style": 7,
    "Bianco Forte": 7,
}

# Index labels of the rows being filled; kept because the original cell left this name
# defined at notebook level.
nullIntensityRowIDs = df[df['Intensity'].isnull()].index.to_list()

# Look every row's name up in the table (unlisted names fall back to 6), then use the
# result only where Intensity is missing; existing values are left untouched.
df['Intensity'] = df['Intensity'].fillna(df['Name'].map(intensityByName).fillna(6))

In [6]:
# HANDLE NULL ACIDITY, BITTERNESS, ROASTNESS, AND BODY VALUES
# Per the original author's note, every row missing these scores belongs to the 'Barista Creations'
# category; such rows are set to 3, the midpoint of the 1-to-5 scale.

# Only fill automatically when the four columns are missing on exactly the same rows,
# i.e. every row has either all four scores or none of them.
tasteProfileNulls = df[['Acidity','Bitterness','Roastness','Body']].isnull()
if tasteProfileNulls.any(axis=1).equals(tasteProfileNulls.all(axis=1)):
    nullRowIDs = tasteProfileNulls.loc[tasteProfileNulls['Acidity']].index.tolist()
    df.loc[nullRowIDs, ['Acidity','Bitterness','Roastness','Body']] = 3
else:
    print("Invalid. Handle null values manually.")

In [7]:
# HANDLE NULL MILKY TASTE, BITTERNESS WITH MILK, ROASTINESS WITH MILK, AND CREAMY TEXTURE VALUES
# Per the original author's note, the rows missing these scores are all in categories other than
# 'Barista Creations'; such rows are set to 3, the midpoint of the 1-to-5 scale.

# Only fill automatically when the four columns are missing on exactly the same rows,
# i.e. every row has either all four scores or none of them.
milkProfileNulls = df[['Milky Taste','Bitterness with Milk','Roastiness with Milk','Creamy Texture']].isnull()
if milkProfileNulls.any(axis=1).equals(milkProfileNulls.all(axis=1)):
    nullRowIDs = milkProfileNulls.loc[milkProfileNulls['Milky Taste']].index.tolist()
    df.loc[nullRowIDs, ['Milky Taste','Bitterness with Milk','Roastiness with Milk','Creamy Texture']] = 3
else:
    print("Invalid. Handle null values manually.")

In [8]:
# Intensity bands (original note: light roast 1-4, medium roast 5-8, dark roast 9-13).

def determineIntensityClassification(intensity):
    """Map a numeric intensity score to "Low", "Medium" or "High".

    Parameters
    ----------
    intensity : number
        Scalar intensity score.

    Returns
    -------
    str
        "Low" when 0 < intensity < 5, "Medium" when 5 <= intensity <= 8,
        and "High" for everything else. "Everything else" includes values
        above 8 as well as zero, negative numbers and NaN, because those
        fail both range checks.
    """
    if 0 < intensity < 5:
        return "Low"
    if 5 <= intensity <= 8:
        return "Medium"
    return "High"
# Label each row from its Intensity value alone; mapping the single column avoids
# building a full row object per record just to read one field.
df["Intensity Classification"] = df["Intensity"].map(determineIntensityClassification)

In [9]:
# Taste-profile bands: Low = 1 to 2, Medium = 3 to 4, High = 5.

def determineTasteProfileClassification(tasteProfileValue):
    """Map a taste-profile score to "Low", "Medium" or "High".

    Parameters
    ----------
    tasteProfileValue : number
        Scalar taste-profile score.

    Returns
    -------
    str
        "Low" when 0 < value < 3, "Medium" when 3 <= value <= 4, and "High"
        for everything else. "Everything else" includes values above 4 as
        well as zero, negative numbers and NaN, because those fail both
        range checks.
    """
    if 0 < tasteProfileValue < 3:
        return "Low"
    if 3 <= tasteProfileValue <= 4:
        return "Medium"
    return "High"
        
# Add a "<column> Classification" label for each taste-profile score; mapping the
# single column avoids building a full row object per record just to read one field.
for col in ['Acidity','Bitterness','Roastness','Body']:
    df[col + " Classification"] = df[col].map(determineTasteProfileClassification)

In [10]:
# Taste-profile-with-milk bands: Low = 1 to 2, Medium = 3 to 4, High = 5.

def determineTasteProfileWithMilkClassification(tasteProfileWithMilkValue):
    """Map a with-milk taste-profile score to "Low", "Medium" or "High".

    Uses the same thresholds as determineTasteProfileClassification.

    Parameters
    ----------
    tasteProfileWithMilkValue : number
        Scalar with-milk taste-profile score.

    Returns
    -------
    str
        "Low" when 0 < value < 3, "Medium" when 3 <= value <= 4, and "High"
        for everything else. "Everything else" includes values above 4 as
        well as zero, negative numbers and NaN, because those fail both
        range checks.
    """
    if 0 < tasteProfileWithMilkValue < 3:
        return "Low"
    if 3 <= tasteProfileWithMilkValue <= 4:
        return "Medium"
    return "High"
# Add a "<column> Classification" label for each with-milk taste score; mapping the
# single column avoids building a full row object per record just to read one field.
for col in ['Milky Taste', 'Bitterness with Milk', 'Roastiness with Milk', 'Creamy Texture']:
    df[col + " Classification"] = df[col].map(determineTasteProfileWithMilkClassification)

In [11]:
# Re-check dtypes and non-null counts after the cleaning cells.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70 entries, 0 to 69
Data columns (total 39 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   ID                                       70 non-null     object 
 1   Name                                     70 non-null     object 
 2   Type                                     70 non-null     object 
 3   Serving                                  70 non-null     object 
 4   Serving Size                             70 non-null     object 
 5   Headline                                 70 non-null     object 
 6   Intensity                                70 non-null     float64
 7   Sleeve Price                             70 non-null     float64
 8   Per Capsule Price                        70 non-null     float64
 9   Caption                                  70 non-null     object 
 10  Taste                                    70 non-null

In [12]:
# List the columns again, now including the added "... Classification" columns.
df.columns

Index(['ID', 'Name', 'Type', 'Serving', 'Serving Size', 'Headline',
       'Intensity', 'Sleeve Price', 'Per Capsule Price', 'Caption', 'Taste',
       'Best Served As', 'Notes', 'Acidity', 'Bitterness', 'Roastness', 'Body',
       'Milky Taste', 'Bitterness with Milk', 'Roastiness with Milk',
       'Creamy Texture', 'Ingredients & Allergens',
       'Number of Capsules per Sleeve',
       'Net Weight per Total Number of Capsules', 'Capsule Image Link',
       'Capsule & Sleeve Image Link', 'Decaf Coffee?', 'Category',
       'Other Information', 'Status', 'Intensity Classification',
       'Acidity Classification', 'Bitterness Classification',
       'Roastness Classification', 'Body Classification',
       'Milky Taste Classification', 'Bitterness with Milk Classification',
       'Roastiness with Milk Classification', 'Creamy Texture Classification'],
      dtype='object')

# **NLP Pre-Processing**

In [13]:
# Columns whose values are concatenated, in this order, into the "Textual Info" text.
NLP_Columns = [
    # Descriptive product fields read from the workbook
    'Type', 'Serving', 'Serving Size',
    'Headline', 'Caption', 'Taste',
    'Best Served As', 'Notes', 'Category',
    # Low / Medium / High labels added during data cleaning
    'Intensity Classification',
    'Acidity Classification', 'Bitterness Classification',
    'Roastness Classification', 'Body Classification',
    'Milky Taste Classification', 'Bitterness with Milk Classification',
    'Roastiness with Milk Classification', 'Creamy Texture Classification',
]

In [14]:
def process_text_for_NLP(df, NLP_Columns):
    """Build a cleaned, space-separated "Textual Info" string for every row.

    For each row, the values of ``NLP_Columns`` are stringified, lower-cased
    and joined with single spaces, then tokenised with NLTK. Each token is
    lemmatised as an adjective, dropped if it is an English stop word or a
    punctuation character, stripped of any remaining non-word characters,
    and finally dropped if spaCy tags it as a function word (pronoun,
    conjunction, preposition, interjection, possessive marker, wh-adverb or
    wh-determiner).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``NLP_Columns``. It is
        modified in place: a "Textual Info" column is added or overwritten.
    NLP_Columns : list of str
        Column names to concatenate, in order.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in as ``df``.
    """
    # The original filter compared spaCy's ``Token.pos_`` against BNC C5 codes
    # ("PNP", "CJC", "PRP", ...). ``pos_`` holds Universal POS tags, so that list
    # never matched and nothing was filtered. These are the spaCy equivalents:
    #   PNP / PNQ / PNX -> PRON       CJC -> CCONJ       CJS / CJT -> SCONJ
    #   ITJ -> INTJ                   PRF / PRP -> ADP
    dropped_pos = {"PRON", "CCONJ", "SCONJ", "INTJ", "ADP"}
    # C5 codes with no coarse-grained equivalent are matched on the fine-grained
    # Penn Treebank tag instead: POS -> POS, AVQ -> WRB, DTQ -> WDT.
    dropped_tags = {"POS", "WRB", "WDT"}
    # token -> True if it should be dropped; each distinct token is tagged once.
    drop_cache = {}

    textual_info = []
    for i in df.index:
        # NOTE(review): missing cells are stringified too, so a NaN value contributes
        # the token "nan" — confirm whether that is wanted.
        raw_text = " ".join(str(df.loc[i, col]).lower() for col in NLP_Columns)

        kept_tokens = []
        for token in word_tokenize(raw_text):
            token = lemmatizer.lemmatize(token, pos="a")
            if token in stop_words or token in punctuation:
                continue
            # Strip punctuation left inside the token (e.g. apostrophes, hyphens).
            token = re.sub(r'[^\w\s]', '', token)
            if token == '':
                continue
            if token not in drop_cache:
                # The token is tagged on its own, without sentence context.
                tagged = nlp(token)[0]
                drop_cache[token] = (tagged.pos_ in dropped_pos) or (tagged.tag_ in dropped_tags)
            if not drop_cache[token]:
                kept_tokens.append(token)

        textual_info.append(" ".join(kept_tokens))

    df["Textual Info"] = textual_info
    return df

# process_text_for_NLP adds "Textual Info" to the frame it is given and returns that
# same frame, so df_Final and df refer to the same object.
df_Final = process_text_for_NLP(df, NLP_Columns);

In [15]:
# Preview the first row, including the new "Textual Info" column.
df_Final.head(1)

Unnamed: 0,ID,Name,Type,Serving,Serving Size,Headline,Intensity,Sleeve Price,Per Capsule Price,Caption,...,Intensity Classification,Acidity Classification,Bitterness Classification,Roastness Classification,Body Classification,Milky Taste Classification,Bitterness with Milk Classification,Roastiness with Milk Classification,Creamy Texture Classification,Textual Info
0,VL01,Intenso,Vertuo,Coffee,230ml,Smooth & Strong,9.0,12.6,1.26,Why we love it: Try Intenso - a Vertuo coffee ...,...,High,Low,Medium,High,Medium,Medium,Medium,Medium,Medium,vertuo coffee 230ml smooth strong love try int...


# **Export the Data**

In [16]:
# Write the prepared frame to CSV; index=False leaves the row index out of the file.
df_Final.to_csv("../data/PreparedCoffeeData.csv", index=False);