# **Setup**

In [1]:
import pandas as pd
import numpy as np

from wordcloud import WordCloud
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

# **Import the Data**

In [2]:
# Load the prepared coffee catalogue. index_col=False tells pandas not to use
# the first CSV column as the row index, so it stays available as a regular column.
df = pd.read_csv('../data/PreparedCoffeeData.csv', index_col=False)
# Preview a single row to confirm which columns were loaded.
df.head(1)

Unnamed: 0,ID,Name,Type,Serving,Serving Size,Headline,Intensity,Sleeve Price,Per Capsule Price,Caption,...,Intensity Classification,Acidity Classification,Bitterness Classification,Roastness Classification,Body Classification,Milky Taste Classification,Bitterness with Milk Classification,Roastiness with Milk Classification,Creamy Texture Classification,Textual Info
0,VL01,Intenso,Vertuo,Coffee,230ml,Smooth & Strong,9.0,12.6,1.26,Why we love it: Try Intenso - a Vertuo coffee ...,...,High,Low,Medium,High,Medium,Medium,Medium,Medium,Medium,vertuo coffee 230ml smooth strong love try int...


# **TF-IDF**

In [3]:
def get_dataframeNLP(df, coffee_select):
    """Return ``df`` reordered so the selected coffee's row(s) come first.

    Parameters
    ----------
    df : pandas.DataFrame
        Coffee catalogue; must contain a ``"Name"`` column.
    coffee_select : str
        Value of ``"Name"`` identifying the coffee to move to the top.

    Returns
    -------
    pandas.DataFrame
        The reordered frame with its first column removed (presumably the
        ``"Unnamed: 0"`` column produced by the CSV export -- confirm against
        the data) and a fresh 0..n-1 index. The previous index labels are
        kept in a new leading ``"index"`` column.
    """
    is_selected = df["Name"] == coffee_select

    # Stack the selected row(s) on top of the full frame; the second copy of
    # those rows further down is then removed by drop_duplicates().
    stacked = pd.concat([df[is_selected], df]).drop_duplicates()

    # Everything except the first column is kept.
    kept_columns = list(stacked.columns)[1:]
    return stacked[kept_columns].reset_index()

def get_recommendations(df_NLP, coffee_select, numRec, indices, cosine_sim):
    """Return the ``numRec`` rows most similar to the selected coffee.

    Parameters
    ----------
    df_NLP : pandas.DataFrame
        Frame whose row positions correspond to the rows/columns of
        ``cosine_sim``. Must contain the columns ``Name``, ``Type``,
        ``Serving``, ``Headline``, ``Intensity`` and ``Category``.
    coffee_select : str
        Key looked up in ``indices`` to find the selected coffee's row.
    numRec : int
        Maximum number of recommendations to return.
    indices : mapping (e.g. pandas.Series)
        Maps a coffee name to its row position in ``cosine_sim``.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix; ``cosine_sim[i][j]`` is the similarity
        between rows ``i`` and ``j``.

    Returns
    -------
    pandas.DataFrame
        One row per recommendation, ordered by descending similarity, with
        the columns ``id`` (the row's index label in ``df_NLP``), the six
        descriptive columns listed above, and ``Similarity Score`` (rounded
        to 4 decimals).

    Raises
    ------
    KeyError
        If ``coffee_select`` is not present in ``indices``.
    """
    idx = indices[coffee_select]

    # Exclude the selected coffee by position instead of assuming it sorts
    # first: another row with an identical score could otherwise take its
    # place and the selected coffee would be recommended to itself.
    ranked = sorted(
        ((pos, score) for pos, score in enumerate(cosine_sim[idx]) if pos != idx),
        key=lambda pair: pair[1],
        reverse=True,
    )[:max(numRec, 0)]

    positions = [pos for pos, _ in ranked]
    columns = ["Name", "Type", "Serving", "Headline", "Intensity", "Category"]
    # .copy() gives an independent frame, so adding a column below does not
    # raise SettingWithCopyWarning.
    df_Rec = df_NLP[columns].iloc[positions].copy()
    df_Rec["Similarity Score"] = [round(score, 4) for _, score in ranked]

    # Expose the original row label as an explicit "id" column.
    return df_Rec.reset_index().rename(columns={"index": "id"})

def get_recommendationResultsTFIDF(df, coffee_select, numRec, min_df, max_df, max_features, stop_words, sublinear_tf, n_lower, n_upper):
    """Recommend coffees similar to ``coffee_select`` using TF-IDF features.

    Parameters
    ----------
    df : pandas.DataFrame
        Coffee catalogue with at least ``"Name"`` and ``"Textual Info"``
        columns, plus the descriptive columns ``get_recommendations`` reads.
    coffee_select : str
        Name of the coffee to base recommendations on.
    numRec : int
        Number of recommendations to return.
    min_df, max_df, max_features, stop_words, sublinear_tf
        Forwarded unchanged to ``TfidfVectorizer``.
    n_lower, n_upper : int
        Lower and upper bounds of the vectoriser's ``ngram_range``.

    Returns
    -------
    pandas.DataFrame
        The frame produced by ``get_recommendations``.
    """
    df_NLP = get_dataframeNLP(df, coffee_select)

    vectorizer = TfidfVectorizer(
        min_df=min_df,
        max_df=max_df,
        max_features=max_features,
        stop_words=stop_words,
        sublinear_tf=sublinear_tf,
        ngram_range=(n_lower, n_upper),
    )
    matrix = vectorizer.fit_transform(df_NLP["Textual Info"])
    # TfidfVectorizer L2-normalises each row by default, so the plain dot
    # product computed by linear_kernel is the cosine similarity.
    cosine_sim = linear_kernel(matrix, matrix)
    indices = pd.Series(df_NLP.index, index=df_NLP["Name"]).drop_duplicates()

    # The similarity matrix is indexed by df_NLP's row order (selected coffee
    # moved to the top), so the recommendations must be looked up in df_NLP.
    # Looking them up in the original df returns the wrong rows whenever the
    # selected coffee is not already the first row of df.
    return get_recommendations(df_NLP, coffee_select, numRec, indices, cosine_sim)

def get_featureResultsTFIDF(df, coffee_select, min_df, max_df, max_features, stop_words, sublinear_tf, n_lower, n_upper):
    """List the non-zero TF-IDF weights of the selected coffee's text.

    Parameters
    ----------
    df : pandas.DataFrame
        Coffee catalogue with ``"Name"`` and ``"Textual Info"`` columns.
    coffee_select : str
        Name of the coffee whose features are reported.
    min_df, max_df, max_features, stop_words, sublinear_tf
        Forwarded unchanged to ``TfidfVectorizer``.
    n_lower, n_upper : int
        Lower and upper bounds of the vectoriser's ``ngram_range``.

    Returns
    -------
    pandas.DataFrame
        Indexed by feature name, with a single ``"TF-IDF"`` column sorted in
        ascending order and restricted to strictly positive weights.
    """
    corpus_frame = get_dataframeNLP(df, coffee_select)

    tfidf = TfidfVectorizer(
        min_df=min_df,
        max_df=max_df,
        max_features=max_features,
        stop_words=stop_words,
        sublinear_tf=sublinear_tf,
        ngram_range=(n_lower, n_upper),
    )
    weights = tfidf.fit_transform(corpus_frame["Textual Info"])

    # Row 0 of the matrix is the first row of corpus_frame, which is where
    # get_dataframeNLP places the selected coffee. Transposing turns that row
    # into a column with one entry per vocabulary term.
    selected_column = weights[0].T.todense()
    feature_frame = pd.DataFrame(
        selected_column,
        index=tfidf.get_feature_names_out(),
        columns=["TF-IDF"],
    )

    ordered = feature_frame.sort_values("TF-IDF", ascending=True)
    return ordered[ordered["TF-IDF"] > 0]

In [4]:
# General parameters (the same set is used for every vectoriser in this notebook)
df = df  # no-op self-assignment; kept so the cell lists every argument it passes on
coffee_select = "Intenso"
numRec = 10
min_df = 2
max_df = 0.95
max_features = 50
stop_words = "english"
n_lower = 1
n_upper = 1
# Parameter specific to TF-IDF
sublinear_tf = True

# Top three recommendations for the selected coffee
print(
    get_recommendationResultsTFIDF(
        df,
        coffee_select,
        numRec,
        min_df=min_df,
        max_df=max_df,
        max_features=max_features,
        stop_words=stop_words,
        sublinear_tf=sublinear_tf,
        n_lower=n_lower,
        n_upper=n_upper,
    ).head(3)
)
print("\n")
# Non-zero TF-IDF weights for the selected coffee's text
print(
    get_featureResultsTFIDF(
        df,
        coffee_select,
        min_df=min_df,
        max_df=max_df,
        max_features=max_features,
        stop_words=stop_words,
        sublinear_tf=sublinear_tf,
        n_lower=n_lower,
        n_upper=n_upper,
    )
)

   id        Name    Type   Serving                   Headline  Intensity  \
0  17  Diavolitto  Vertuo  Espresso  Highly Intense & Powerful       11.0   
1   1     Stormio  Vertuo    Coffee              Rich & Strong        8.0   
2  10      Odacio  Vertuo    Coffee              Bold & Lively        7.0   

           Category  Similarity Score  
0          Espresso            0.6830  
1  Signature Coffee            0.6132  
2  Signature Coffee            0.5991  


            TF-IDF
arabica   0.117834
roasted   0.127628
arabicas  0.136468
low       0.146465
smooth    0.151973
split     0.157920
love      0.164363
american  0.167797
acidity   0.171392
washed    0.197390
notes     0.209567
blend     0.212604
vertuo    0.219677
intense   0.252556
roast     0.255200
high      0.257313
coffee    0.266386
long      0.272722
dark      0.284106
robusta   0.296575
230ml     0.334211


# **Bag of Words**

In [5]:
def get_recommendationResultsBagOfWords(df_Prep, coffee_select, numRec, min_df, max_df, max_features, stop_words, analyzer, token_pattern, n_lower, n_upper):
    """Recommend coffees similar to ``coffee_select`` using raw term counts.

    Parameters
    ----------
    df_Prep : pandas.DataFrame
        Coffee catalogue with at least ``"Name"`` and ``"Textual Info"``
        columns, plus the descriptive columns ``get_recommendations`` reads.
    coffee_select : str
        Name of the coffee to base recommendations on.
    numRec : int
        Number of recommendations to return.
    min_df, max_df, max_features, stop_words, analyzer, token_pattern
        Forwarded unchanged to ``CountVectorizer``.
    n_lower, n_upper : int
        Lower and upper bounds of the vectoriser's ``ngram_range``.

    Returns
    -------
    pandas.DataFrame
        The frame produced by ``get_recommendations``.
    """
    corpus_frame = get_dataframeNLP(df_Prep, coffee_select)

    counter = CountVectorizer(
        min_df=min_df,
        max_df=max_df,
        max_features=max_features,
        stop_words=stop_words,
        analyzer=analyzer,
        token_pattern=token_pattern,
        ngram_range=(n_lower, n_upper),
    )
    counts = counter.fit_transform(corpus_frame["Textual Info"])

    # Pairwise cosine similarity between every pair of count vectors.
    pairwise_similarity = cosine_similarity(counts, counts)
    # Name -> row position in corpus_frame (and therefore in the matrix).
    name_to_row = pd.Series(corpus_frame.index, index=corpus_frame["Name"]).drop_duplicates()

    return get_recommendations(corpus_frame, coffee_select, numRec, name_to_row, pairwise_similarity)

def get_featureResultsBagOfWords(df, coffee_select, min_df, max_df, max_features, stop_words, analyzer, token_pattern, n_lower, n_upper):
    """List the non-zero term counts of the selected coffee's text.

    Parameters
    ----------
    df : pandas.DataFrame
        Coffee catalogue with ``"Name"`` and ``"Textual Info"`` columns.
    coffee_select : str
        Name of the coffee whose features are reported.
    min_df, max_df, max_features, stop_words, analyzer, token_pattern
        Forwarded unchanged to ``CountVectorizer``.
    n_lower, n_upper : int
        Lower and upper bounds of the vectoriser's ``ngram_range``.

    Returns
    -------
    pandas.DataFrame
        Indexed by feature name, with a single ``"Bag of Words"`` column
        sorted in ascending order and restricted to strictly positive counts.
    """
    corpus_frame = get_dataframeNLP(df, coffee_select)

    counter = CountVectorizer(
        min_df=min_df,
        max_df=max_df,
        max_features=max_features,
        stop_words=stop_words,
        analyzer=analyzer,
        token_pattern=token_pattern,
        ngram_range=(n_lower, n_upper),
    )
    counts = counter.fit_transform(corpus_frame["Textual Info"])

    # Row 0 of the matrix is the first row of corpus_frame, which is where
    # get_dataframeNLP places the selected coffee. Transposing turns that row
    # into a column with one entry per vocabulary term.
    selected_column = counts[0].T.todense()
    feature_frame = pd.DataFrame(
        selected_column,
        index=counter.get_feature_names_out(),
        columns=["Bag of Words"],
    )

    ordered = feature_frame.sort_values("Bag of Words", ascending=True)
    return ordered[ordered["Bag of Words"] > 0]

In [6]:
# General parameters (the same set is used for every vectoriser in this notebook)
df = df  # no-op self-assignment; kept so the cell lists every argument it passes on
coffee_select = "Stormio"
numRec = 10
min_df = 5
max_df = 0.95
max_features = 50
stop_words = "english"
n_lower = 1
n_upper = 1
# Parameters specific to Bag of Words
analyzer = "word"
token_pattern = r"\b[a-zA-Z]{3,}\b"

# Top three recommendations for the selected coffee
print(
    get_recommendationResultsBagOfWords(
        df,
        coffee_select,
        numRec,
        min_df=min_df,
        max_df=max_df,
        max_features=max_features,
        stop_words=stop_words,
        analyzer=analyzer,
        token_pattern=token_pattern,
        n_lower=n_lower,
        n_upper=n_upper,
    ).head(3)
)
print("\n")
# Non-zero term counts for the selected coffee's text
print(
    get_featureResultsBagOfWords(
        df,
        coffee_select,
        min_df=min_df,
        max_df=max_df,
        max_features=max_features,
        stop_words=stop_words,
        analyzer=analyzer,
        token_pattern=token_pattern,
        n_lower=n_lower,
        n_upper=n_upper,
    )
)

   id             Name    Type Serving         Headline  Intensity  \
0   1          Intenso  Vertuo  Coffee  Smooth & Strong        9.0   
1  10           Odacio  Vertuo  Coffee    Bold & Lively        7.0   
2   6  Half Caffeinato  Vertuo  Coffee  Sweet & Velvety        5.0   

           Category  Similarity Score  
0  Signature Coffee            0.7648  
1  Signature Coffee            0.7285  
2  Signature Coffee            0.7081  


         Bag of Words
low                 1
long                1
cereal              1
milk                1
blend               1
taste               1
love                1
intense             1
smooth              1
vertuo              2
arabica             2
roasted             2
rich                2
coffees             2
roast               2
notes               2
coffee              4
