# **Setup**

In [2]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

import gensim.downloader as api
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser
from matplotlib import pyplot
from gensim.models import KeyedVectors

# import tensorflow_text
import tensorflow as tf
import tensorflow_hub as hub

# **Import the Data**

In [3]:
# Load the prepared coffee data from the CSV file; index_col=False tells pandas
# not to use the first column as the index.
df = pd.read_csv('../data/PreparedCoffeeData.csv', index_col=False)
# Display only the first row as a quick look at the available columns.
df.head(1)

Unnamed: 0,ID,Name,Type,Serving,Serving Size,Headline,Intensity,Sleeve Price,Per Capsule Price,Caption,...,Intensity Classification,Acidity Classification,Bitterness Classification,Roastness Classification,Body Classification,Milky Taste Classification,Bitterness with Milk Classification,Roastiness with Milk Classification,Creamy Texture Classification,Textual Info
0,VL01,Intenso,Vertuo,Coffee,230ml,Smooth & Strong,9.0,12.6,1.26,Why we love it: Try Intenso - a Vertuo coffee ...,...,High,Low,Medium,High,Medium,Medium,Medium,Medium,Medium,vertuo coffee 230ml smooth strong love try int...


# **TF-IDF**

In [68]:
def get_dataframeNLP(df, coffee_select):
    """Build the working frame used by the recommenders.

    The rows whose "Name" equals ``coffee_select`` are stacked on top of the
    full frame, exact duplicate rows are removed (keeping the first
    occurrence, i.e. the copy placed on top), the first column is dropped by
    position, and the index is reset so the previous index is kept as an
    "index" column.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; must contain a "Name" column.
    coffee_select : str
        Name of the coffee to move to the top.

    Returns
    -------
    pandas.DataFrame
        Reordered frame with a fresh 0..n-1 index.
    """
    selected_rows = df.loc[df["Name"] == coffee_select]
    # Every column except the first one (dropped by position, not by name).
    kept_columns = df.columns.tolist()[1:]

    return (
        pd.concat([selected_rows, df])
        .drop_duplicates()
        .loc[:, kept_columns]
        .reset_index()
    )

def get_recommendations(df_NLP, coffee_select, numRec, indices, cosine_sim):
    """Return the ``numRec`` rows most similar to the selected coffee.

    Parameters
    ----------
    df_NLP : pandas.DataFrame
        Frame whose row positions line up with ``cosine_sim``; must contain
        the columns "Name", "Type", "Serving", "Headline", "Intensity" and
        "Category".
    coffee_select : str
        Name of the coffee to find neighbours for.
    numRec : int
        Maximum number of recommendations to return.
    indices : pandas.Series
        Maps coffee name -> row position in ``cosine_sim``.
    cosine_sim : array-like, shape (n, n)
        Pairwise similarity matrix.

    Returns
    -------
    pandas.DataFrame
        The recommended rows with an "id" column (the index label the row had
        in ``df_NLP``) and a "Similarity Score" column rounded to 4 decimals,
        ordered from most to least similar.
    """
    idx = indices[coffee_select]
    # If several rows share the name, the lookup yields a Series; use the first.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Exclude the selected coffee explicitly instead of assuming it sorts
    # first: with tied scores another row could otherwise be dropped and the
    # coffee would end up recommending itself.
    sim_scores = [
        (pos, score) for pos, score in enumerate(cosine_sim[idx]) if pos != idx
    ]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    sim_scores = sim_scores[:numRec]

    coffee_indices = [pos for pos, _ in sim_scores]
    # .copy() so adding a column below never writes into a view of df_NLP.
    df_Rec = df_NLP[["Name", "Type", "Serving", "Headline", "Intensity", "Category"]].iloc[coffee_indices].copy()
    df_Rec["Similarity Score"] = [round(score, 4) for _, score in sim_scores]

    df_Rec = df_Rec.reset_index().rename(columns={"index": "id"})

    return df_Rec

def get_recommendationResultsTFIDF(df, coffee_select, numRec, min_df, max_df, max_features, stop_words, sublinear_tf, n_lower, n_upper):
    """Recommend coffees by TF-IDF similarity of their "Textual Info".

    Parameters
    ----------
    df : pandas.DataFrame
        Coffee frame containing "Name" and "Textual Info" columns.
    coffee_select : str
        Name of the coffee to find neighbours for.
    numRec : int
        Number of recommendations to return.
    min_df, max_df, max_features, stop_words, sublinear_tf
        Forwarded unchanged to ``TfidfVectorizer``.
    n_lower, n_upper : int
        Bounds of the n-gram range passed as ``ngram_range``.

    Returns
    -------
    pandas.DataFrame
        Output of ``get_recommendations``.
    """
    df_NLP = get_dataframeNLP(df, coffee_select)

    vectorizer = TfidfVectorizer(
        min_df=min_df,
        max_df=max_df,
        max_features=max_features,
        stop_words=stop_words,
        sublinear_tf=sublinear_tf,
        ngram_range=(n_lower, n_upper),
    )
    matrix = vectorizer.fit_transform(df_NLP["Textual Info"])
    # TfidfVectorizer L2-normalises rows by default, so the plain dot product
    # computed by linear_kernel equals the cosine similarity.
    cosine_sim = linear_kernel(matrix, matrix)
    indices = pd.Series(df_NLP.index, index=df_NLP["Name"]).drop_duplicates()

    # Rows must be looked up in df_NLP, not df: the similarity matrix and
    # `indices` use df_NLP's row order (selected coffee moved to the top), so
    # indexing the original frame would return the wrong coffees.
    df_Rec = get_recommendations(df_NLP, coffee_select, numRec, indices, cosine_sim)

    return df_Rec

def get_featureResultsTFIDF(df, coffee_select, min_df, max_df, max_features, stop_words, sublinear_tf, n_lower, n_upper):
    """Return the positive TF-IDF weights of the first row of the working frame.

    The working frame comes from ``get_dataframeNLP``, which stacks the rows
    matching ``coffee_select`` on top, so row 0 is the selected coffee when
    the name matches a row.

    Returns
    -------
    pandas.DataFrame
        One "TF-IDF" column indexed by feature name, sorted ascending, with
        zero-weight features removed.
    """
    corpus_frame = get_dataframeNLP(df, coffee_select)

    tfidf = TfidfVectorizer(
        min_df=min_df,
        max_df=max_df,
        max_features=max_features,
        stop_words=stop_words,
        sublinear_tf=sublinear_tf,
        ngram_range=(n_lower, n_upper),
    )
    doc_term = tfidf.fit_transform(corpus_frame["Textual Info"])

    # Turn the first document's sparse row into a dense column of weights.
    first_row_weights = doc_term[0].T.todense()
    weights = pd.DataFrame(
        first_row_weights,
        index=tfidf.get_feature_names_out(),
        columns=["TF-IDF"],
    )

    weights = weights.sort_values("TF-IDF", ascending=True)
    return weights.loc[weights["TF-IDF"] > 0]

In [69]:
# Settings shared with the other recommenders
df = df
coffee_select = "Intenso"
numRec = 10
min_df = 2
max_df = 0.95
max_features = 50
stop_words = "english"
n_lower = 1
n_upper = 1
# Setting used only by the TF-IDF vectorizer
sublinear_tf = True

# Top three TF-IDF recommendations for the selected coffee
print(
    get_recommendationResultsTFIDF(
        df=df,
        coffee_select=coffee_select,
        numRec=numRec,
        min_df=min_df,
        max_df=max_df,
        max_features=max_features,
        stop_words=stop_words,
        sublinear_tf=sublinear_tf,
        n_lower=n_lower,
        n_upper=n_upper,
    )[0:3]
)
print("\n")
# Non-zero TF-IDF feature weights of the selected coffee
print(
    get_featureResultsTFIDF(
        df=df,
        coffee_select=coffee_select,
        min_df=min_df,
        max_df=max_df,
        max_features=max_features,
        stop_words=stop_words,
        sublinear_tf=sublinear_tf,
        n_lower=n_lower,
        n_upper=n_upper,
    )
)

   id        Name    Type     Serving                   Headline  Intensity  \
0  17  Diavolitto  Vertuo    Espresso  Highly Intense & Powerful       11.0   
1   2     Fortado  Vertuo  Gran Lungo      Intense & Full-Bodied        8.0   
2   1     Stormio  Vertuo      Coffee              Rich & Strong        8.0   

           Category  Similarity Score  
0          Espresso            0.7155  
1        Gran Lungo            0.6205  
2  Signature Coffee            0.6091  


            TF-IDF
arabica   0.117032
roasted   0.126760
arabicas  0.135540
low       0.145469
smooth    0.150940
split     0.156846
love      0.163245
american  0.166656
acidity   0.170226
washed    0.196048
notes     0.208141
blend     0.211158
vertuo    0.218183
intense   0.250838
roast     0.253463
high      0.255563
coffee    0.264574
long      0.270867
robusta   0.294557
dark      0.305255
230ml     0.331937


# **Bag of Words**

In [6]:
def get_recommendationResultsBagOfWords(df_Prep, coffee_select, numRec, min_df, max_df, max_features, stop_words, analyzer, token_pattern, n_lower, n_upper):
    """Recommend coffees by cosine similarity of bag-of-words counts.

    Parameters
    ----------
    df_Prep : pandas.DataFrame
        Coffee frame containing "Name" and "Textual Info" columns.
    coffee_select : str
        Name of the coffee to find neighbours for.
    numRec : int
        Number of recommendations to return.
    min_df, max_df, max_features, stop_words, analyzer, token_pattern
        Forwarded unchanged to ``CountVectorizer``.
    n_lower, n_upper : int
        Bounds of the n-gram range passed as ``ngram_range``.

    Returns
    -------
    pandas.DataFrame
        Output of ``get_recommendations``.
    """
    corpus_frame = get_dataframeNLP(df_Prep, coffee_select)

    count_vectorizer = CountVectorizer(
        min_df=min_df,
        max_df=max_df,
        max_features=max_features,
        stop_words=stop_words,
        analyzer=analyzer,
        token_pattern=token_pattern,
        ngram_range=(n_lower, n_upper),
    )
    term_counts = count_vectorizer.fit_transform(corpus_frame["Textual Info"])
    similarity = cosine_similarity(term_counts, term_counts)

    # Name -> row position in the working frame / similarity matrix.
    name_to_row = pd.Series(corpus_frame.index, index=corpus_frame["Name"]).drop_duplicates()

    return get_recommendations(corpus_frame, coffee_select, numRec, name_to_row, similarity)

def get_featureResultsBagOfWords(df, coffee_select, min_df, max_df, max_features, stop_words, analyzer, token_pattern, n_lower, n_upper):
    """Return the positive term counts of the first row of the working frame.

    The working frame comes from ``get_dataframeNLP``, which stacks the rows
    matching ``coffee_select`` on top, so row 0 is the selected coffee when
    the name matches a row.

    Returns
    -------
    pandas.DataFrame
        One "Bag of Words" column indexed by feature name, sorted ascending,
        with zero-count features removed.
    """
    corpus_frame = get_dataframeNLP(df, coffee_select)

    count_vectorizer = CountVectorizer(
        min_df=min_df,
        max_df=max_df,
        max_features=max_features,
        stop_words=stop_words,
        analyzer=analyzer,
        token_pattern=token_pattern,
        ngram_range=(n_lower, n_upper),
    )
    term_counts = count_vectorizer.fit_transform(corpus_frame["Textual Info"])

    # Turn the first document's sparse row into a dense column of counts.
    first_row_counts = term_counts[0].T.todense()
    counts = pd.DataFrame(
        first_row_counts,
        index=count_vectorizer.get_feature_names_out(),
        columns=["Bag of Words"],
    )

    counts = counts.sort_values("Bag of Words", ascending=True)
    return counts.loc[counts["Bag of Words"] > 0]

In [7]:
# Settings shared with the other recommenders
df = df
coffee_select = "Intenso"
numRec = 10
min_df = 5
max_df = 0.95
max_features = 50
stop_words = "english"
n_lower = 1
n_upper = 1
# Settings used only by the bag-of-words vectorizer
analyzer = "word"
token_pattern = r"\b[a-zA-Z]{3,}\b"

# Top three bag-of-words recommendations for the selected coffee
print(
    get_recommendationResultsBagOfWords(
        df,
        coffee_select=coffee_select,
        numRec=numRec,
        min_df=min_df,
        max_df=max_df,
        max_features=max_features,
        stop_words=stop_words,
        analyzer=analyzer,
        token_pattern=token_pattern,
        n_lower=n_lower,
        n_upper=n_upper,
    )[0:3]
)
print("\n")
# Non-zero term counts of the selected coffee
print(
    get_featureResultsBagOfWords(
        df=df,
        coffee_select=coffee_select,
        min_df=min_df,
        max_df=max_df,
        max_features=max_features,
        stop_words=stop_words,
        analyzer=analyzer,
        token_pattern=token_pattern,
        n_lower=n_lower,
        n_upper=n_upper,
    )
)

   id                    Name      Type   Serving                Headline  \
0  27  Carafe Pour-Over Style    Vertuo    Carafe        Roasted & Smokey   
1  37               Ristretto  Original  Espresso  Powerful & Contrasting   
2   1                 Stormio    Vertuo    Coffee           Rich & Strong   

   Intensity               Category  Similarity Score  
0        7.0             Craft Brew            0.7777  
1       10.0  Inspirazione Italiana            0.7631  
2        8.0       Signature Coffee            0.7498  


          Bag of Words
split                1
smooth               1
roasted              1
acidity              1
love                 1
american             1
arabica              1
arabicas             1
latin                1
washed               1
high                 2
robusta              2
low                  2
blend                2
long                 2
vertuo               2
intense              2
roast                3
notes                3
dark 

# **Word2Vec**
Official Pre-Trained Models: [gensim-data](https://github.com/RaRe-Technologies/gensim-data)

In [8]:
# Location of the pre-trained vectors. Forward slashes are used so the path
# resolves on every operating system and no longer contains the invalid "\g"
# escape sequence that the backslash version produced.
# NOTE(review): load_word2vec_format reads the word2vec text layout; presumably
# this GloVe file has already been converted to it — confirm.
word2vec_path = '../modules/glove-wiki-gigaword-50.txt'
word2vec_model = KeyedVectors.load_word2vec_format(word2vec_path)

In [38]:
# TEST
# Sanity check on the loaded vectors: list the tokens most similar to a known word.
word2vec_model.most_similar(positive='steve')

[('phil', 0.8498413562774658),
 ('greg', 0.8425466418266296),
 ('collins', 0.8077239394187927),
 ('rogers', 0.806643009185791),
 ('gary', 0.8002684116363525),
 ('evans', 0.7958705425262451),
 ('david', 0.7952155470848083),
 ('bob', 0.7951958179473877),
 ('bruce', 0.790093719959259),
 ("o'brien", 0.7886954545974731)]

In [39]:
# Test membership on the vocabulary mapping itself; building a list of every
# key first gives the same answer but scans the whole vocabulary.
'steve' in word2vec_model.key_to_index

True

In [11]:
# Look up the raw embedding vector stored for a single token.
word2vec_model['steve']

array([-0.82126 ,  0.047307,  0.65395 ,  0.58571 , -0.48872 , -0.1128  ,
       -2.45    , -0.1656  , -0.32732 , -0.22862 , -0.4568  ,  0.82956 ,
       -1.1079  , -0.04959 ,  0.63708 , -0.017772,  1.2628  , -0.6099  ,
       -1.0876  , -0.18171 , -0.97368 ,  1.1444  , -0.099794, -0.23374 ,
        0.46573 , -0.74488 ,  0.72217 , -0.36582 , -0.25045 ,  0.061675,
        1.6044  , -0.68753 ,  0.31537 , -0.090142,  0.085618,  0.36911 ,
        0.43175 ,  0.079752,  0.57473 , -0.93518 ,  1.2322  ,  0.78994 ,
       -0.76169 ,  0.22773 ,  0.10907 , -0.058962, -0.47251 , -0.32499 ,
       -0.26357 ,  1.4384  ], dtype=float32)

## **Average Word2Vec**

In [47]:
# Generate average word2vec for each coffee's textual info in the form of word embeddings

def get_word2vec_Avg_wordEmbeddings(df, model=None):
    """Average the word vectors of each row's "Textual Info".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "Textual Info" column of space-separated tokens.
    model : optional
        Keyed-vector model exposing ``key_to_index``, ``vector_size`` and
        ``model[word]`` lookup. Defaults to the notebook-level
        ``word2vec_model``.

    Returns
    -------
    list of numpy.ndarray
        One vector per row of ``df``, in row order. A row with no
        in-vocabulary token gets a zero vector, so the list always stays
        aligned with the frame (previously such rows were skipped, which
        shifted every later row in the similarity matrix).
    """
    if model is None:
        model = word2vec_model

    # Membership test directly on the vocabulary mapping; converting it to a
    # list for every word scanned the whole vocabulary each time.
    vocab = model.key_to_index

    wordEmbeddings = []
    for info in df['Textual Info']:
        vectors = [model[word] for word in info.split(" ") if word in vocab]
        if vectors:
            wordEmbeddings.append(np.sum(vectors, axis=0) / len(vectors))
        else:
            wordEmbeddings.append(np.zeros(model.vector_size, dtype=np.float32))

    return wordEmbeddings

# Retrieve recommendations based on average word2vec

def get_recommendationResultsAvgWord2Vec(df, coffee_select, numRec):
    """Recommend coffees by cosine similarity of averaged word vectors.

    Parameters
    ----------
    df : pandas.DataFrame
        Coffee frame containing "Name" and "Textual Info" columns.
    coffee_select : str
        Name of the coffee to find neighbours for.
    numRec : int
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Output of ``get_recommendations``.
    """
    corpus_frame = get_dataframeNLP(df, coffee_select)

    doc_vectors = get_word2vec_Avg_wordEmbeddings(corpus_frame)
    similarity = cosine_similarity(doc_vectors, doc_vectors)

    # Name -> row position in the working frame / similarity matrix.
    name_to_row = pd.Series(corpus_frame.index, index=corpus_frame["Name"]).drop_duplicates()

    return get_recommendations(corpus_frame, coffee_select, numRec, name_to_row, similarity)

In [48]:
# Inputs for the average-word2vec recommender
df = df
coffee_select = "Intenso"
numRec = 10

get_recommendationResultsAvgWord2Vec(df=df, coffee_select=coffee_select, numRec=numRec)

Unnamed: 0,id,Name,Type,Serving,Headline,Intensity,Category,Similarity Score
0,1,Stormio,Vertuo,Coffee,Rich & Strong,8.0,Signature Coffee,0.9826
1,9,Solelio,Vertuo,Coffee,Fruity & Lightly-Bodied,2.0,Signature Coffee,0.9824
2,25,Voltesso,Vertuo,Espresso,Light & Sweet,4.0,Espresso,0.9792
3,23,Toccanto,Vertuo,Espresso,Berry & Winey,5.0,Espresso,0.9782
4,20,Altissio,Vertuo,Espresso,Full-Bodied & Creamy,9.0,Espresso,0.9769
5,61,Cosi,Original,Espresso,Mild & Delicately roasted,4.0,Espresso,0.9764
6,16,Il Caffè,Vertuo,Espresso,Exceptionally Intense & Velvety,11.0,Espresso,0.9756
7,22,Orafio,Vertuo,Espresso,Caramel & Roasted,6.0,Espresso,0.9754
8,17,Diavolitto,Vertuo,Espresso,Highly Intense & Powerful,11.0,Espresso,0.9753
9,8,Inizio,Vertuo,Gran Lungo,Floral & Cereal,4.0,Gran Lungo,0.9728


## **Word2Vec x TF-IDF**

In [57]:
def get_word2vec_TFIDF_wordEmbeddings(df, vectorizer, model=None):
    """Build one TF-IDF-weighted average word vector per row of ``df``.

    For every token occurrence that is both in the word-vector vocabulary and
    among the vectorizer's features, the token's vector is added with weight
    ``idf * (count of the token in the text / number of tokens in the text)``;
    the sum is then divided by the total weight.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "Textual Info" column of whitespace-separated tokens.
    vectorizer
        Fitted vectorizer exposing ``get_feature_names_out()`` and ``idf_``.
    model : optional
        Keyed-vector model exposing ``key_to_index``, ``vector_size`` and
        ``model[word]`` lookup. Defaults to the notebook-level
        ``word2vec_model``.

    Returns
    -------
    list of numpy.ndarray
        One vector of length ``model.vector_size`` per row, in row order;
        rows without any usable token stay all-zero.
    """
    if model is None:
        model = word2vec_model

    # Dict lookups replace the per-word list(vocabulary) rebuild and the
    # linear scan of the feature-name array.
    vocab = model.key_to_index
    idf_by_term = dict(zip(vectorizer.get_feature_names_out(), vectorizer.idf_))

    tfidf_vectors = []
    for text in df['Textual Info']:
        tokens = text.split()
        # Sized from the model rather than a hard-coded 50.
        sent_vec = np.zeros(model.vector_size)
        weight_sum = 0
        for word in tokens:
            if word in vocab and word in idf_by_term:
                tf_idf = idf_by_term[word] * (tokens.count(word) / len(tokens))
                sent_vec += model[word] * tf_idf
                weight_sum += tf_idf
        # Leave the zero vector untouched when nothing contributed.
        if weight_sum != 0:
            sent_vec /= weight_sum
        tfidf_vectors.append(sent_vec)

    return tfidf_vectors

def get_recommendationResultsWord2VecTFIDF(df, coffee_select, numRec, min_df, max_df, max_features, stop_words, sublinear_tf, n_lower, n_upper):
    """Recommend coffees by cosine similarity of TF-IDF-weighted word vectors.

    Parameters
    ----------
    df : pandas.DataFrame
        Coffee frame containing "Name" and "Textual Info" columns.
    coffee_select : str
        Name of the coffee to find neighbours for.
    numRec : int
        Number of recommendations to return.
    min_df, max_df, max_features, stop_words, sublinear_tf
        Forwarded unchanged to ``TfidfVectorizer``.
    n_lower, n_upper : int
        Bounds of the n-gram range passed as ``ngram_range``.

    Returns
    -------
    pandas.DataFrame
        Output of ``get_recommendations``.
    """
    df_NLP = get_dataframeNLP(df, coffee_select)

    vectorizer = TfidfVectorizer(
        min_df=min_df,
        max_df=max_df,
        max_features=max_features,
        stop_words=stop_words,
        sublinear_tf=sublinear_tf,
        ngram_range=(n_lower, n_upper),
    )

    # Only fitting is needed: the embedding helper reads the learned feature
    # names and idf_ values from the vectorizer.
    vectorizer.fit(df_NLP['Textual Info'])

    tfidf_vectors = get_word2vec_TFIDF_wordEmbeddings(df_NLP, vectorizer)

    cosine_sim = cosine_similarity(tfidf_vectors, tfidf_vectors)

    indices = pd.Series(df_NLP.index, index=df_NLP["Name"]).drop_duplicates()

    # Rows must be looked up in df_NLP, not df: the similarity matrix and
    # `indices` use df_NLP's row order (selected coffee moved to the top), so
    # indexing the original frame would return the wrong coffees.
    df_Rec = get_recommendations(df_NLP, coffee_select, numRec, indices, cosine_sim)

    return df_Rec

In [58]:
# Settings shared with the other recommenders
df = df
coffee_select = "Intenso"
numRec = 10
min_df = 2
max_df = 0.95
max_features = 50
stop_words = "english"
n_lower = 1
n_upper = 1
# Setting used only by the TF-IDF vectorizer
sublinear_tf = True

get_recommendationResultsWord2VecTFIDF(
    df=df,
    coffee_select=coffee_select,
    numRec=numRec,
    min_df=min_df,
    max_df=max_df,
    max_features=max_features,
    stop_words=stop_words,
    sublinear_tf=sublinear_tf,
    n_lower=n_lower,
    n_upper=n_upper,
)

Unnamed: 0,id,Name,Type,Serving,Headline,Intensity,Category,Similarity Score
0,6,Half Caffeinato,Vertuo,Coffee,Sweet & Velvety,5.0,Signature Coffee,0.9799
1,25,Voltesso,Vertuo,Espresso,Light & Sweet,4.0,Espresso,0.9676
2,9,Solelio,Vertuo,Coffee,Fruity & Lightly-Bodied,2.0,Signature Coffee,0.9674
3,27,Carafe Pour-Over Style,Vertuo,Carafe,Roasted & Smokey,7.0,Craft Brew,0.9659
4,4,Melozio,Vertuo,Coffee,Smooth & Balanced,6.0,Signature Coffee,0.9572
5,1,Stormio,Vertuo,Coffee,Rich & Strong,8.0,Signature Coffee,0.9561
6,33,Chocolate Fudge,Vertuo,Coffee,Sweet & Velvety; Dark Chocolate Flavoured,6.0,Barista Creations,0.9478
7,37,Ristretto,Original,Espresso,Powerful & Contrasting,10.0,Inspirazione Italiana,0.9443
8,41,Inspirazione Venezia,Original,Espresso,Balanced & Thick Body,8.0,Inspirazione Italiana,0.9408
9,10,Odacio,Vertuo,Coffee,Bold & Lively,7.0,Signature Coffee,0.9389


# **BERT**
Not implemented yet — it is still undecided whether a BERT-based recommender will be added.

In [4]:
# preprocesor = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3");

# encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/1", trainable=True);