**Table of contents**<a id='toc0_'></a>    
- [1. About the Notebook](#toc1_)    
- [2. Libraries and Packages](#toc2_)    
- [3. Connecting In Snowflake](#toc3_)    
- [4. Defining Functions](#toc4_)    
  - [4.1. Text Cleaning](#toc4_1_)    
  - [4.2. Tokenizing](#toc4_2_)    
  - [4.3. Remove Stopwords](#toc4_3_)    
  - [4.4. Count Words in each product and discard irrelevant products](#toc4_4_)    
  - [4.5. Count Vectorizer](#toc4_5_)    
  - [4.6. One Hot Encoding](#toc4_6_)    
  - [4.7. Recommendations with WALS](#toc4_7_)    
- [5. Loading Data and Applying all Functions](#toc5_)    
  - [5.1. Extracting list of all Main Categories](#toc5_1_)    
  - [5.2. Producing a Recommendation dataframe to each Main Category](#toc5_2_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

# <a id='toc1_'></a>[1. About the Notebook](#toc0_)

Notebook that creates similarity recommendations

# <a id='toc2_'></a>[2. Libraries and Packages](#toc0_)

In [2]:
import snowflake.connector

import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sqlalchemy import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import sigmoid_kernel

from time import sleep

from bs4 import BeautifulSoup
import re

import nltk
nltk.download("punkt")
import spacy

from loguru import logger

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize, OneHotEncoder
from scipy.sparse import hstack
from faiss import IndexFlatIP
from implicit.nearest_neighbours import bm25_weight
from implicit.als import AlternatingLeastSquares

import spacy
from spacy.lang.en.examples import sentences 

import gc

  warn_incompatible_dep(
[nltk_data] Downloading package punkt to
[nltk_data]     /home/brunnokalyxton/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# <a id='toc3_'></a>[3. Connecting In Snowflake](#toc0_)

In [4]:
conn = snowflake.connector.connect(
    user='************',
    password='************',
    account='*************',
    warehouse='ANALYTICS_WH',
    database='AMAZON',
    schema='AMZ_DATA_GOLD',
    role = 'ANALYSTS'
)

# <a id='toc4_'></a>[4. Defining Functions](#toc0_)

## <a id='toc4_1_'></a>[4.1. Text Cleaning](#toc0_)

In [5]:
def clean_html(text):
    # Remove HTML tags using BeautifulSoup
    soup = BeautifulSoup(text, "html.parser")
    cleaned_text = soup.get_text(separator=" ")
    
    # Remove patterns like \n and its variations
    cleaned_text = re.sub(r'\\n+', ' ', cleaned_text)
    
    # Remove punctuations (excluding single quote to preserve words like "isn't" and "it's")
    cleaned_text = re.sub(r'[^\w\s\']', ' ', cleaned_text)
    
    # Remove extra whitespaces
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)

    # Removing t, tt and its similarities
    cleaned_text = re.sub(r'\bt+\b', ' ', cleaned_text)

    cleaned_text = re.sub(r'\btt+\b', ' ', cleaned_text)

    # Remove extra whitespaces
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
    
    # Strip leading and trailing whitespaces
    cleaned_text = cleaned_text.strip()

    #all text to lowercase
    cleaned_text = cleaned_text.lower()
    
    return cleaned_text

# Register the UDF
# clean_html_udf = udf(clean_html, StringType())

# Register the UDF
# clean_text_udf = udf(clean_html_udf, StringType())

## <a id='toc4_2_'></a>[4.2. Tokenizing](#toc0_)

In [6]:
def tokenize(column):
    """Tokenizes a Pandas dataframe column and returns a list of tokens.

    Args:
        column: Pandas dataframe column (i.e. df['text']).

    Returns:
        tokens (list): Tokenized list
    """

    tokens = nltk.word_tokenize(column)
    return [w for w in tokens if w.isalpha()]  

## <a id='toc4_3_'></a>[4.3. Remove Stopwords](#toc0_)

In [7]:
nlp = spacy.load("en_core_web_sm")
stopwords = list(nlp.Defaults.stop_words)

In [8]:
def StopWordsRemover(text):
    '''
    Removes Stop Words (also capitalized) from a string, if present
    
    Args:
        text (str): String to which the function is to be applied, string
    
    Returns:
        Clean string without Stop Words
    ''' 
    # check in lowercase 
    t = [token for token in text if token.lower() not in stopwords]
    text = ' '.join(t)    
    return text

## <a id='toc4_4_'></a>[4.4. Count Words in each product and discard irrelevant products](#toc0_)

In [9]:
def word_count_func(text):
    '''
    Counts words within a string
    
    Args:
        text (str): String to which the function is to be applied, string
    
    Returns:
        Number of words within a string, integer
    ''' 
    return len(text.split())

## <a id='toc4_5_'></a>[4.5. Count Vectorizer](#toc0_)

In [10]:
NGRAM_MIN = 1
NGRAM_MAX = 1
MIN_DOC_FREQ = 1
MAX_DOC_FREQ = 1.0
MAX_TERMS = 50000

In [11]:
vectorizer = CountVectorizer(ngram_range=(NGRAM_MIN,NGRAM_MAX), stop_words=None, 
                             lowercase=False, max_df=MAX_DOC_FREQ, min_df=MIN_DOC_FREQ, 
                             max_features=MAX_TERMS)

## <a id='toc4_6_'></a>[4.6. One Hot Encoding](#toc0_)

In [12]:
enc = OneHotEncoder(sparse=True)

## <a id='toc4_7_'></a>[4.7. Recommendations with WALS](#toc0_)

In [13]:
N_THREADS = 16
MIN_WORDS = 32
N_COMPONENTS = 256
N_CLUSTERS = 1024

In [14]:
wals = AlternatingLeastSquares(factors=N_COMPONENTS, regularization=0.1,
                               iterations=15, calculate_training_loss=True)



# <a id='toc5_'></a>[5. Loading Data and Applying all Functions](#toc0_)

## <a id='toc5_1_'></a>[5.1. Extracting list of all Main Categories](#toc0_)

In [15]:
query1 = """
SELECT DISTINCT
    MAIN_CATEGORY
FROM 
    PRODUCTS
WHERE 
    TITLE <> '[]' AND 
    PRICE IS NOT NULL AND
    MAIN_CATEGORY IS NOT NULL AND 
    MAIN_CATEGORY <> ''
"""

In [16]:
category = pd.read_sql_query(query1, conn)

  category = pd.read_sql_query(query1, conn)


In [17]:
category_list = category.MAIN_CATEGORY.values.tolist()

In [23]:
category_list = ['All Beauty']

In [18]:
category_list

['Movies & Tv',
 'Portable Audio & Accessories',
 'Software',
 'Buy A Kindle',
 'Amazon Fashion',
 'Collectible Coins',
 'Computers',
 'Cell Phones & Accessories',
 'Gift Cards',
 'Arts, Crafts & Sewing',
 'Pet Supplies',
 'Automotive',
 'Office Products',
 'Books',
 'All Electronics',
 'Tools & Home Improvement',
 'Home Audio & Theater',
 'Baby',
 'Gps & Navigation',
 'Toys & Games',
 'Camera & Photo',
 'Musical Instruments',
 'Video Games',
 'Digital Music',
 'Sports & Outdoors',
 'All Beauty',
 'Prime Pantry',
 'Fire Phone',
 'Memberships & Subscriptions',
 'Amazon Devices',
 'Luxury Beauty',
 'Apple Products',
 'Health & Personal Care',
 'Grocery',
 'Appliances',
 'Amazon Home',
 'Collectibles & Fine Art',
 'Sports Collectibles',
 'Amazon Fire Tv',
 'Handmade',
 'Home & Business Services',
 'Industrial & Scientific',
 'Car Electronics',
 'Entertainment']

## <a id='toc5_2_'></a>[5.2. Producing a Recommendation dataframe to each Main Category](#toc0_)

In [24]:
for i in category_list:
    query = """
    WITH CTE AS (
        SELECT 
            PROD.ASIN,
            PROD.TITLE,
            PROD.BRAND,
            DET.DETAILS,
            REL.RELATED_CATEGORIES,
            PROD.PRICE,
            REV.OVERALL,
            AVG(REV.OVERALL) OVER (PARTITION BY PROD.ASIN) AS "PRODUCT_AVG_RATING" 
        FROM 
            PRODUCTS AS PROD 
        INNER JOIN 
            PRODUCTS_REVIEWS AS REV ON PROD.ASIN = REV.ASIN 
        INNER JOIN 
            PRODUCTS_DETAILS AS DET ON PROD.ASIN = DET.ASIN
        INNER JOIN 
            PRODUCTS_RELATED AS REL ON PROD.ASIN = REL.ASIN
        WHERE 
            PROD.MAIN_CATEGORY = '{i}' AND 
            PROD.TITLE <> '' AND
            PROD.TITLE IS NOT NULL AND
            PROD.PRICE IS NOT NULL
    )
    SELECT
        *
    FROM 
        (
    SELECT
        *,
        ROW_NUMBER() OVER (PARTITION BY ASIN ORDER BY PRICE DESC) AS ROW_NUMBER
    FROM
        CTE
    )
    WHERE 
        ROW_NUMBER = 1 AND 
        PRODUCT_AVG_RATING >= 4
    """.format(i=i)
    print(i)
    metadata = pd.read_sql_query(query, conn)

    cols = ['TITLE', 'BRAND', 'RELATED_CATEGORIES', 'DETAILS']
    metadata['full_text'] = metadata[cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

    metadata['full_text'] = metadata['full_text'].apply(clean_html)
    metadata['text_tokens'] = metadata['full_text'].apply(tokenize)
    metadata['text_tokens_sw'] = metadata['text_tokens'].apply(StopWordsRemover)
    metadata['word_count'] = metadata['text_tokens_sw'].apply(word_count_func)
    metadata = metadata[metadata['word_count'] > 32]  
    if len(metadata) > 10:
        metadata.reset_index(level=None, drop=True, inplace=True, col_level=0, col_fill='') ## reseting index
        x_count = vectorizer.fit_transform(metadata['text_tokens_sw'])
        x_wt = bm25_weight(x_count)
        x_tags = enc.fit_transform(metadata[['BRAND']])
        x_tags.data = 5.0 * x_tags.data
        x = hstack([x_wt, x_tags])

        logger.info('Factorizing with WALS...')
        wals.fit(x, show_progress=True)
        doc_factors_wals = normalize(wals.item_factors, norm="l2", axis=1, copy=False)
        word_factors_wals = normalize(wals.user_factors, norm="l2", axis=1, copy=False)

        logger.info('Finding event nearest neighbors with WALS factors...')
        K = 16 # number of recommendations for each product
        published_idx = metadata.index.values
        original_idx = published_idx
        published_idx = np.array(list(range(0, len(original_idx))))
        pub_evts = IndexFlatIP(N_COMPONENTS)
        pub_evts.add(word_factors_wals[published_idx])
        nn_dist, nn_idx = pub_evts.search(word_factors_wals, K+1)
        product_neighbor = pd.DataFrame.from_dict({
        'product_id': np.repeat(metadata['ASIN'].values, K+1),
        'num_words': np.repeat(metadata['word_count'].values, K+1),
        'num_words_neighbor': metadata['word_count'].values[published_idx[nn_idx.flatten()]],
        'neighbor_id': metadata['ASIN'].values[published_idx[nn_idx.flatten()]], 
        'similarity': nn_dist.flatten()
        })
        same_id = product_neighbor.loc[product_neighbor['product_id'] == product_neighbor['neighbor_id']].index.values.tolist()
        product_neighbor.drop(labels=same_id, axis='index', inplace=True)
        product_neighbor.sort_values(['product_id','similarity'], ascending=[True, False], inplace=True)

        recommendations = product_neighbor.groupby('product_id')['neighbor_id'].apply(list).reset_index(name='recommendations')

        recommendations['MAIN_CATEGORY'] = '{i}'.format(i=i)

        recommendations.to_parquet(
            path="../output/similarity/recommendations_{i}.parquet".format(i=i),
            engine="auto"
        )   
        del metadata, x_count, x_wt, x_tags, x, doc_factors_wals, word_factors_wals, published_idx, original_idx, pub_evts, nn_dist, nn_idx, product_neighbor, same_id, recommendations 
        gc.collect()
    else:
        pass



All Beauty


  metadata = pd.read_sql_query(query, conn)
2023-08-05 13:05:44.108 | INFO     | __main__:<module>:60 - Factorizing with WALS...


  0%|          | 0/15 [00:00<?, ?it/s]

ModelFitError: NaN encountered in factors