# IRWA PROJECT PART 3
Laia Tomàs Jané u198723\
Quim Ribas Martinez u198742 

In [1]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import numpy as np
import collections 
from collections import defaultdict
from numpy import linalg as la
import math

In [2]:
# read preprocessed json from part 1
products_df = pd.read_json("fashion_products_dataset_processed.json")

## PART 1: Rankings

In [3]:
#copy function from part1 for cleaning queries
def build_terms(text):

    #check that the text is a string
    if not isinstance(text, str):
        return []

    #keep only any word character or spaces (remove special characters and numbers) (includes removing punctuation marks)
    text = re.sub(r'[^a-z\s]', '', text.lower()) 

    #tokenize text to a list of tokens
    tokens = text.split()

    #remove stop words
    stop_words = set(stopwords.words('english'))
    text = [word for word in tokens if word not in stop_words and len(word) > 2] #keep only words of length 3 minimum

    #apply stemming
    stemmer = PorterStemmer()
    text=[stemmer.stem(word) for word in text]

    return text

### tfidf + cosine similarity

#### Functions

In [4]:
def create_index_tfidf(products_df, num_products):
    """
    Implement the inverted index and compute tf, df and idf

    Argument:
    lines -- collection of Wikipedia articles
    num_documents -- total number of documents

    Returns:
    index - the inverted index (implemented through a Python dictionary) containing terms as keys and the corresponding
    list of document these keys appears in (and the positions) as values.
    tf - normalized term frequency for each term in each document
    df - number of documents each term appear in
    idf - inverse document frequency of each term
    """

    index = defaultdict(lambda: defaultdict(lambda: 0))
    tf = defaultdict(list)  #term frequencies of terms in documents (documents in the same order as in the main index)
    df = defaultdict(int)  #document frequencies of terms in the corpus
    title_index = defaultdict(str)
    desc_index = defaultdict(str)
    idf = defaultdict(float)

    for _, line in products_df.iterrows():

        pid = line['pid']
        terms = line['title_clean'] + line['description_clean']

        title = line['title']
        desc = line['description']
        title_index[pid] = title  ## we do not need to apply get terms to title because it used only to print titles and not in the index
        desc_index[pid] = desc

        current_product_terms = defaultdict(lambda: 0)
        for term in terms:
            current_product_terms[term] += 1
            index[term][pid] += 1


        # normalize term frequencies
        # Compute the denominator to normalize term frequencies (formula 2 above)
        # norm is the same for all terms of a document.
        norm = 0
        for freq in current_product_terms.values():
            # posting will contain the list of positions for current term in current document.
            # posting ==> [current_doc, [list of positions]]
            # you can use it to infer the frequency of current term.
            norm += freq ** 2
        norm = math.sqrt(norm)

        #calculate the tf(dividing the term frequency by the above computed norm) and df weights
        for term, freq in current_product_terms.items():
            # append the tf for current term (tf = term frequency in current doc/norm)
            tf[term].append(np.round(freq / norm, 4)) ## SEE formula (1) above
            #increment the document frequency of current term (number of documents containing the current term)
            df[term] += 1 # increment DF for current term


    # Compute IDF following the formula (3) above. HINT: use np.log
    # Note: It is computed later after we know the df.
    for term in df:
        idf[term] = np.round(np.log(float(num_products / df[term])), 4)

    return index, tf, df, idf, title_index, desc_index


In [5]:
num_products = len(products_df)
index, tf, df, idf, title_index, desc_index = create_index_tfidf(products_df, num_products)

In [6]:
#Represent the query as a weighted tf-idf vector
#Represent each product as a weighted tfidf vector
#Compute the cosine similarity score for the
#query vector and each product vector
#Rank product with respect to the query by score
def rank_products(terms, pids, index, idf, tf, title_index):
    """
    Perform the ranking of the results of a search based on the tf-idf weights

    Argument:
    terms -- list of query terms
    pids -- list of products, to rank, matching the query
    index -- inverted index data structure
    idf -- inverted document frequencies
    tf -- term frequencies
    title_index -- mapping between page id and page title

    Returns:
    Print the list of ranked product
    """

    # I'm interested only on the element of the productVector corresponding to the query terms
    # The remaining elements would become 0 when multiplied to the query_vector
    products_vectors = defaultdict(lambda: [0] * len(terms)) # I call doc_vectors[k] for a nonexistent key k, the key-value pair (k,[0]*len(terms)) will be automatically added to the dictionary
    query_vector = [0] * len(terms)

    # compute the norm for the query tf
    query_terms_count = collections.Counter(terms)  # get the frequency of each term in the query.
    query_norm = la.norm(list(query_terms_count.values()))

    for termIndex, term in enumerate(terms):  #termIndex is the index of the term in the query
        if term not in index:
            continue

        ## Compute tf*idf(normalize TF as done with products)
        query_vector[termIndex] = query_terms_count[term] / query_norm * idf[term]

        # Generate product_vectors for matching products
        for pid_index, pid in enumerate(index[term].keys()):
            # Example of pid_index, pid
            # 0 JEAFNHERP6UHRQKH
            # 1 JEAFNHERJGTGQ4GP

            #tf[term][0] will contain the tf of the term "term" in the product JEAFNHERP6UHRQKH 
            if pid in pids:
                products_vectors[pid][termIndex] = tf[term][pid_index] * idf[term]  

    # Calculate the score of each product
    # compute the cosine similarity between queyVector and each productVector:

    products_scores = [[np.dot(curProdVec, query_vector), product] for product, curProdVec in products_vectors.items()]
    products_scores.sort(reverse=True)
    result_products = [x[1] for x in products_scores]
    #print product titles instead if product id's
    #result_products=[ title_index[x] for x in result_products ]
    if len(result_products) == 0:
        print("No results found, try again")
        # query = input()
        # products = search_tf_idf(query, index)
    #print ('\n'.join(result_products), '\n')
    return result_products, products_scores

In [7]:
def search_tf_idf(query, index):
    """
    output is the list of products that contain any of the query terms.
    So, we will get the list of products for each query term, and take the union of them.
    """
    query = build_terms(query)  # so that stemmed terms are matched in the index
    products = None  # start with None to handle first term properly

    for term in query:
        try:
            term_products = set(index[term])  # products containing this term

            if products is None:
                products = term_products  # initialize with first term's product
            else:
                products &= term_products  # intersection with the next term’s product

        except:
            # if a term isn't in the index, then no product contains ALL terms
            return []
    products = list(products)
    ranked_products, product_scores = rank_products(query, products, index, idf, tf, title_index)
    #print( ranked_products)
    return ranked_products, product_scores

In [8]:
def print_top_k_query_results(i, query, index, k):
    print(f"Results for query {i}: \033[1;34m{query}\033[0m")

    ranked_products, scores = search_tf_idf(query, index)
    print("======================\nTop {} results out of {} for the searched query:\n".format(len(ranked_products[:k]), len(ranked_products)))
    for p_id in ranked_products[:k]:
        print("product_id= {} \n- product_title: {} \n- product_description: {}\n".format(p_id, title_index[p_id], desc_index[p_id]))
    print("==================================================================")

#### results

In [9]:
query1 = "women blue casual tshirt" 
query2 = "cotton fit casual"
query3 = "animal print tshirt"
query4 = "round neck black dress"
query5 = "solid shirt pack of 3"
queries = [query1, query2, query3, query4, query5]

In [10]:
for i, q in enumerate(queries):
    print_top_k_query_results(i+1, q, index, 5)

Results for query 1: [1;34mwomen blue casual tshirt[0m
Top 5 results out of 301 for the searched query:

product_id= TSHEUJ4VFYTMZTXZ 
- product_title: Solid Women Round or Crew Blue T-Shirt 
- product_description: Axmann Light Blue Round Neck Casual Summer Wear Women T-shirt BY Axmann. Made with premium fabric for the most ultimate comfort. Range of vibrant colored casual t-shirts

product_id= TSHFEY8VRGCMRSGG 
- product_title: Typography Women Round Neck Blue T-Shirt 
- product_description: Blue Chest print Kintted Cotton Casual T-shirts, has a round neck , Half Sleeve

product_id= TSHEU7DTUYMHMGW6 
- product_title: Striped Women Polo Neck Blue T-Shirt 
- product_description: Axmann Blue Striped Casual Wear Women Summer Polo T-shirt BY Axmann. Made with premium fabric for the most ultimate comfort. Range of vibrant colored casual t-shirts

product_id= TSHEU7DSSQGESYNH 
- product_title: Printed Women Round Neck Blue T-Shirt 
- product_description: Axmann Blue Round Neck Casual Summe

### BM25

#### Functions

In [11]:
def create_index_bm25(products_df, num_products):
    """
    Implement the inverted index and compute tf, df and idf

    Argument:
    lines -- collection of Wikipedia articles
    num_documents -- total number of documents

    Returns:
    index - the inverted index (implemented through a Python dictionary) containing terms as keys and the corresponding
    list of document these keys appears in (and the positions) as values.
    tf - normalized term frequency for each term in each document
    df - number of documents each term appear in
    idf - inverse document frequency of each term
    """

    index = defaultdict(lambda: defaultdict(lambda: 0))
    tf = defaultdict(dict)  # term raw frequencies: tf[term][pid] = freq
    df = defaultdict(int)  # document frequencies of terms in the corpus
    title_index = defaultdict(str)
    desc_index = defaultdict(str)
    idf = defaultdict(float)
    doc_len = defaultdict(float) # dict of pid and document length

    for _, line in products_df.iterrows():

        pid = line['pid']
        terms = line['title_clean'] + line['description_clean']

        title = line['title']
        desc = line['description']
        title_index[pid] = title  ## we do not need to apply get terms to title because it used only to print titles and not in the index
        desc_index[pid] = desc

        doc_len[pid] = len(terms) # (product length after removing stopwords)

        current_product_terms = defaultdict(lambda: 0)
        for term in terms:
            current_product_terms[term] += 1
            index[term][pid] += 1

        # term raw frequencies 
        for term, freq in current_product_terms.items():
            tf[term][pid] = freq 
            #increment the document frequency of current term (number of documents containing the current term)
            df[term] += 1 # increment DF for current term


    # Compute IDF 
    for term in df:
        idf[term] = np.round(np.log(float(num_products / df[term])), 4)

    L_ave = 0
    for pid in doc_len.keys():
        L_ave += doc_len[pid]
    L_ave /= len(doc_len)

    return index, tf, df, idf, title_index, desc_index, doc_len, L_ave


In [12]:
index_bm25, tf_bm25, df_bm25, idf_bm25, title_index, desc_index, doc_len, L_ave = create_index_bm25(products_df, num_products)

In [13]:
def rank_products_bm25(terms, pids, index, idf, tf, doc_len, L_ave, k1, b):
    """
    Perform the ranking of the results of a search based on the tf-idf weights

    Argument:
    terms -- list of query terms
    pids -- list of products, to rank, matching the query
    index -- inverted index data structure
    idf -- inverted document frequencies
    tf -- term raw frequencies for a given product
    doc_len -- dictionary of product id: length of title + description
    L_ave -- average length of all products

    Returns:
    Print the list of ranked product
    """

    rsv_product = defaultdict(float)
    
    for pid in pids:
        for termIndex, term in enumerate(terms):  #termIndex is the index of the term in the query
            if term not in index:
                continue

            # Calculate the score of each product
            # sum for term in query: idf * formula
            numerator = (k1 + 1) * tf[term][pid]
            denominator = k1 * ((1 - b) + b *(doc_len[pid]/L_ave)) + tf[term][pid]
            rsv_product[pid] += idf[term] * numerator / denominator
    
    sorted_rsv = sorted(rsv_product.items(), key=lambda x: x[1], reverse=True)

    result_products = [x[0] for x in sorted_rsv]
    #products_scores = [x[1] for x in sorted_rsv]
    if len(sorted_rsv) == 0:
        print("No results found, try again")

    return result_products, sorted_rsv

In [14]:
def search_bm25(query, index, idf, tf, doc_len, L_ave, k1, b):
    """
    output is the list of products that contain any of the query terms.
    So, we will get the list of products for each query term, and take the union of them.
    """
    query = build_terms(query)  # so that stemmed terms are matched in the index
    products = None  # start with None to handle first term properly

    for term in query:
        try:
            term_products = set(index[term])  # products containing this term

            if products is None:
                products = term_products  # initialize with first term's product
            else:
                products &= term_products  # intersection with the next term’s product

        except:
            # if a term isn't in the index, then no product contains ALL terms
            return []
    products = list(products)
    ranked_products, product_scores = rank_products_bm25(query, products, index, idf, tf, doc_len, L_ave, k1, b)

    return ranked_products, product_scores

In [15]:
def print_top_k_query_results_bm25(i, query, index_bm25, k, idf_bm25, tf_bm25, doc_len, L_ave, k1, b):
    print(f"Results for query {i}: \033[1;34m{query}\033[0m")

    ranked_products_bm25, scores = search_bm25(query, index_bm25, idf_bm25, tf_bm25, doc_len, L_ave, k1, b)
    print("======================\nTop {} results out of {} for the searched query:\n".format(len(ranked_products_bm25[:k]), len(ranked_products_bm25)))
    for p_id in ranked_products_bm25[:k]:
        print("product_id= {} \n- product_title: {} \n- product_description: {}\n".format(p_id, title_index[p_id], desc_index[p_id]))
    print("==================================================================")

#### Results

In [16]:
k1 = 1.2
b = 0.75
k = 10
for i, query in enumerate(queries):
    print_top_k_query_results_bm25(i+1, query, index_bm25, k, idf_bm25, tf_bm25, doc_len, L_ave, k1, b)

Results for query 1: [1;34mwomen blue casual tshirt[0m
Top 10 results out of 301 for the searched query:

product_id= TSHFTN8K9JUSGSB8 
- product_title: Solid Women Polo Neck Blue T-Shirt 
- product_description: Steenbok Womens Cotton Casual Tshirt

product_id= TSHEU7DTUYMHMGW6 
- product_title: Striped Women Polo Neck Blue T-Shirt 
- product_description: Axmann Blue Striped Casual Wear Women Summer Polo T-shirt BY Axmann. Made with premium fabric for the most ultimate comfort. Range of vibrant colored casual t-shirts

product_id= TSHEU7DSSQGESYNH 
- product_title: Printed Women Round Neck Blue T-Shirt 
- product_description: Axmann Blue Round Neck Casual Summer Wear Women T-shirt BY Axmann. Made with premium fabric for the most ultimate comfort. Range of vibrant colored casual t-shirt

product_id= TSHEU7DTRZDFUEZB 
- product_title: Printed Women Round Neck Blue T-Shirt 
- product_description: Axmann Blue Round Neck Printed Casual Wear Women Summer T-shirt BY Axmann. Made with premiu

### Our score

In [17]:
products_df.columns

Index(['_id', 'actual_price', 'average_rating', 'brand', 'category',
       'crawled_at', 'description', 'discount', 'images', 'out_of_stock',
       'pid', 'product_details', 'seller', 'selling_price', 'sub_category',
       'title', 'url', 'title_clean', 'description_clean', 'discount_clean',
       'out_of_stock_int'],
      dtype='object')

We will add weights to the following columns for creating our score:
- rating
- discount
- price
- out_of_stock

The idea behing the following formula is to boost the score for products with a high rating, high discount, low price and that are in stock.

$$
\text{OUR\_SCORE}(pid) =
    \alpha \cdot bm25(pid)
  + \beta \cdot rating(pid)
  + \gamma \cdot discount(pid)
  + \delta \cdot (1 - price(pid))
  + \varepsilon \cdot (1 - out\_of\_stock(pid))
$$



#### Functions

In [18]:
# normalize the columns to 0-1 to ensure comparability
def normalize_column(df_column):
    min = df_column.min()
    max = df_column.max()
    return df_column.apply(lambda x: (x - min) / (max - min))

In [19]:
products_df['rating_normalized'] = normalize_column(products_df['average_rating'])
products_df['discount_normalized'] = normalize_column(products_df['discount_clean'])
products_df['price_normalized'] = normalize_column(products_df['selling_price'])
# products_df['out_of_stock_int'] is already normalized (values are either 1 or 0)

In [20]:
def search_our_score(query, index, idf, tf, doc_len, L_ave, k1, b, products_df, alpha, beta, gamma, delta, epsilon):
    """
    output is the list of products that contain any of the query terms.
    So, we will get the list of products for each query term, and take the union of them.
    """
    query = build_terms(query)  # so that stemmed terms are matched in the index
    products = None  # start with None to handle first term properly

    for term in query:
        try:
            term_products = set(index[term])  # products containing this term

            if products is None:
                products = term_products  # initialize with first term's product
            else:
                products &= term_products  # intersection with the next term’s product

        except:
            # if a term isn't in the index, then no product contains ALL terms
            return []
    products = list(products)
    _, product_scores = rank_products_bm25(query, products, index, idf, tf, doc_len, L_ave, k1, b)

    new_product_scores = defaultdict(float)

    scores = [s for _, s in product_scores]
    min_score = min(scores)
    max_score = max(scores)

    for (pid, score) in product_scores:
        score = (score - min_score) / (max_score - min_score)

        product = products_df[products_df['pid']==pid].iloc[0]
        weighted_score = (
                score * alpha + 
                product['rating_normalized'] * beta + 
                product['discount_normalized'] * gamma + 
                (1 - product['price_normalized']) * delta + 
                (1 - product['out_of_stock_int']) * epsilon
            )
        new_product_scores[pid] = weighted_score
    
    #reorder after new score
    new_product_scores = sorted(new_product_scores.items(), key=lambda x: x[1], reverse=True)
    ranked_products = [x[0] for x in new_product_scores]
    return ranked_products, new_product_scores

In [21]:
def print_top_k_query_results_our_score(i, query, index_bm25, k, idf_bm25, tf_bm25, doc_len, L_ave, k1, b, products_df, alpha, beta, gamma, delta, epsilon):
    print(f"Results for query {i}: \033[1;34m{query}\033[0m")

    ranked_products_our_score, scores = search_our_score(query, index_bm25, idf_bm25, tf_bm25, doc_len, L_ave, k1, b, products_df, alpha, beta, gamma, delta, epsilon)
    print("======================\nTop {} results out of {} for the searched query:\n".format(len(ranked_products_our_score[:k]), len(ranked_products_our_score)))
    for p_id in ranked_products_our_score[:k]:
        print("product_id= {} \n- product_title: {} \n- product_description: {}\n".format(p_id, title_index[p_id], desc_index[p_id]))
    print("==================================================================")

#### Results

In [22]:
alpha, beta, gamma, delta, epsilon = 0.65, 0.05, 0.05, 0.10, 0.15
for i, query in enumerate(queries):
    print_top_k_query_results_our_score(i+1, query, index_bm25, k, idf_bm25, tf_bm25, doc_len, L_ave, k1, b, products_df, alpha, beta, gamma, delta, epsilon)

Results for query 1: [1;34mwomen blue casual tshirt[0m
Top 10 results out of 301 for the searched query:

product_id= TSHEU7DTUYMHMGW6 
- product_title: Striped Women Polo Neck Blue T-Shirt 
- product_description: Axmann Blue Striped Casual Wear Women Summer Polo T-shirt BY Axmann. Made with premium fabric for the most ultimate comfort. Range of vibrant colored casual t-shirts

product_id= TSHEU7DSSQGESYNH 
- product_title: Printed Women Round Neck Blue T-Shirt 
- product_description: Axmann Blue Round Neck Casual Summer Wear Women T-shirt BY Axmann. Made with premium fabric for the most ultimate comfort. Range of vibrant colored casual t-shirt

product_id= TSHEU7DTRZDFUEZB 
- product_title: Printed Women Round Neck Blue T-Shirt 
- product_description: Axmann Blue Round Neck Printed Casual Wear Women Summer T-shirt BY Axmann. Made with premium fabric for the most ultimate comfort. Range of vibrant colored casual t-shirt

product_id= TSHEUJ4VFYTMZTXZ 
- product_title: Solid Women Roun

In [23]:
# different weights: more weight to availability and low price, then discount and rating
alpha, beta, gamma, delta, epsilon = 5, 0.5, 2, 3, 4
for i, query in enumerate(queries):
    print_top_k_query_results_our_score(i+1, query, index_bm25, k, idf_bm25, tf_bm25, doc_len, L_ave, k1, b, products_df, alpha, beta, gamma, delta, epsilon)

Results for query 1: [1;34mwomen blue casual tshirt[0m
Top 10 results out of 301 for the searched query:

product_id= TSHEU7DTUYMHMGW6 
- product_title: Striped Women Polo Neck Blue T-Shirt 
- product_description: Axmann Blue Striped Casual Wear Women Summer Polo T-shirt BY Axmann. Made with premium fabric for the most ultimate comfort. Range of vibrant colored casual t-shirts

product_id= TSHFHTGUH8NH7XFJ 
- product_title: Self Design Women Polo Neck Blue T-Shirt 
- product_description: Athliv Womens Cotton Collar Tshirt Casual Wear Tshirt

product_id= TSHFHTGGYAZKDG6G 
- product_title: Color Block Women Polo Neck Blue T-Shirt 
- product_description: Athliv Womens Cotton Collar Tshirt Casual Wear Tshirt

product_id= TSHFHTGGPM7CZMZU 
- product_title: Color Block Women Polo Neck Blue T-Shirt 
- product_description: Athliv Womens Cotton Collar Tshirt Casual Wear Tshirt

product_id= TSHFHTGGUAHXZQEU 
- product_title: Color Block Women Polo Neck Blue T-Shirt 
- product_description: Athl

## PART 2: Word2Vec + cosine ranking score

#### Implementation

In [24]:
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [25]:
# Combine all title + description tokens
corpus = (products_df['title_clean'] + products_df['description_clean']).tolist()

# Train Word2Vec
w2v_model = Word2Vec(sentences=corpus, vector_size=100, window=10, min_count=1, workers=4)
w2v_model_sg = Word2Vec(sentences=corpus, vector_size=100, window=10, min_count=1, workers=4, sg=1)

In [26]:
def get_product_vector(tokens, model):
    product_vecs = [model.wv[t] for t in tokens if t in model.wv]
    if len(product_vecs) == 0:
        return np.zeros(model.vector_size)
    return np.mean(product_vecs, axis=0)

In [27]:
products_df['w2v_vector'] = products_df['title_clean'] + products_df['description_clean']
products_df['w2v_vector'] = products_df['w2v_vector'].apply(lambda x: get_product_vector(x, w2v_model))

In [28]:
def rank_products_w2v(query, products_df, model):
    query_tokens = build_terms(query)
    q_vec = get_product_vector(query_tokens, model) # w2v vector for query

    # compute cosine similarity and store as (score, pid)
    # only normalize with product norm as query norm is a constant factor
    scores = [[np.dot(row['w2v_vector'], q_vec) / np.linalg.norm(row['w2v_vector']) , row['pid']] for idx, row in products_df.iterrows()]
        
    # sort by similarity descending
    product_scores = sorted(scores, key=lambda x: x[0], reverse=True)
    ranked_products = [x[1] for x in product_scores]

    return ranked_products, product_scores

In [29]:
def search_w2v(query, index, products_df, model):
    """
    output is the list of products that contain any of the query terms.
    So, we will get the list of products for each query term, and take the union of them.
    """
    query = build_terms(query)  # so that stemmed terms are matched in the index
    products = None  # start with None to handle first term properly

    for term in query:
        try:
            term_products = set(index[term])  # products containing this term

            if products is None:
                products = term_products  # initialize with first term's product
            else:
                products &= term_products  # intersection with the next term’s product

        except:
            # if a term isn't in the index, then no product contains ALL terms
            return []
    products = list(products)

    #filter the product_df with the products that have ALL query terms
    filtered_df = products_df[products_df['pid'].isin(products)]

    ranked_products, product_scores = rank_products_w2v(query, filtered_df, model)
    return ranked_products, product_scores

In [30]:
def print_top_k_query_results_word2vec(i, query, k, model, products_df, index):
    print(f"Results for query {i}: \033[1;34m{query}\033[0m")

    ranked_products_word2vec, scores = search_w2v(query, index, products_df, model)
    print("======================\nTop {} results out of {} for the searched query:\n".format(len(ranked_products_word2vec[:k]), len(ranked_products_word2vec)))
    for p_id in ranked_products_word2vec[:k]:
        print("product_id= {} \n- product_title: {} \n- product_description: {}\n".format(p_id, title_index[p_id], desc_index[p_id]))
    print("==================================================================")

#### Results

In [31]:
k = 20
for i, query in enumerate(queries):
    print_top_k_query_results_word2vec(i+1, query, k, w2v_model, products_df, index)

Results for query 1: [1;34mwomen blue casual tshirt[0m
Top 20 results out of 301 for the searched query:

product_id= TSHEYQ73VXZGAJY3 
- product_title: Military Camouflage Women Round or Crew Black, Grey T-Shirt 
- product_description: This white and black coloured T-shirt from the house of Fairdeals is accented with a neat solid pattern. The polyester spandex fabric and slim silhouette of this T-shirt for women offer a comfortable and flattering fit. Pair this T-shirt with blue denims and white casual shoes.

product_id= TSHEXH5986S35HJP 
- product_title: Solid Women V-neck Yellow T-Shirt 
- product_description: A smart and trendy casual wear pick for urbane women is this blue and white T-shirt from FAIRDEALS. Decked with blue and white solid design, this half-sleeved T-shirt is a worthy pick. Tailored using breathable cotton fabric, this regular-fit T-shirt is comfortable to wear with jeans and sneakers.

product_id= TSHEXH594ZTCQ6AZ 
- product_title: Solid Women Round Neck Black 

In [32]:
k = 20
for i, query in enumerate(queries):
    print_top_k_query_results_word2vec(i+1, query, k, w2v_model_sg, products_df, index)

Results for query 1: [1;34mwomen blue casual tshirt[0m
Top 20 results out of 301 for the searched query:

product_id= TSHEYQ73VXZGAJY3 
- product_title: Military Camouflage Women Round or Crew Black, Grey T-Shirt 
- product_description: This white and black coloured T-shirt from the house of Fairdeals is accented with a neat solid pattern. The polyester spandex fabric and slim silhouette of this T-shirt for women offer a comfortable and flattering fit. Pair this T-shirt with blue denims and white casual shoes.

product_id= TSHEXH5986S35HJP 
- product_title: Solid Women V-neck Yellow T-Shirt 
- product_description: A smart and trendy casual wear pick for urbane women is this blue and white T-shirt from FAIRDEALS. Decked with blue and white solid design, this half-sleeved T-shirt is a worthy pick. Tailored using breathable cotton fabric, this regular-fit T-shirt is comfortable to wear with jeans and sneakers.

product_id= TSHEXH594ZTCQ6AZ 
- product_title: Solid Women Round Neck Black 

## Extra: Doc2Vec Implementation

#### Implementation

In [33]:
# build corpus as list of tokens for product and pid
corpus = [TaggedDocument(row['title_clean'] + row['description_clean'], [row['pid']]) for _, row in products_df.iterrows()]

d2v_model = Doc2Vec(vector_size=100, min_count=1, epochs=40)
d2v_model.build_vocab(corpus)

In [34]:
d2v_model.train(corpus, total_examples=d2v_model.corpus_count, epochs=d2v_model.epochs)

In [35]:
def rank_products_d2v(query, products_df, model):
    query_tokens = build_terms(query)
    q_vec = d2v_model.infer_vector(query_tokens) # d2v vector for query

    # compute cosine similarity and store as (score, pid)
    # only normalize with product norm as query norm is a constant factor
    scores = []
    for idx, row in products_df.iterrows():
        d2v_pid = model.dv[row['pid']] # get document vector (product vector) for such pid
        scores.append((np.dot(d2v_pid, q_vec) / np.linalg.norm(d2v_pid), row['pid']))

    # sort by similarity descending
    product_scores = sorted(scores, key=lambda x: x[0], reverse=True)
    ranked_products = [x[1] for x in product_scores]

    return ranked_products, product_scores

In [36]:
def search_d2v(query, index, products_df, model):
    """
    output is the list of products that contain any of the query terms.
    So, we will get the list of products for each query term, and take the union of them.
    """
    query = build_terms(query)  # so that stemmed terms are matched in the index
    products = None  # start with None to handle first term properly

    for term in query:
        try:
            term_products = set(index[term])  # products containing this term

            if products is None:
                products = term_products  # initialize with first term's product
            else:
                products &= term_products  # intersection with the next term’s product

        except:
            # if a term isn't in the index, then no product contains ALL terms
            return []
    products = list(products)

    #filter the product_df with the products that have ALL query terms
    filtered_df = products_df[products_df['pid'].isin(products)]

    ranked_products, product_scores = rank_products_d2v(query, filtered_df, model)
    return ranked_products, product_scores

In [37]:
def print_top_k_query_results_doc2vec(i, query, k, model, products_df, index):
    print(f"Results for query {i}: \033[1;34m{query}\033[0m")

    ranked_products_doc2vec, scores = search_d2v(query, index, products_df, model)
    print("======================\nTop {} results out of {} for the searched query:\n".format(len(ranked_products_doc2vec[:k]), len(ranked_products_doc2vec)))
    for p_id in ranked_products_doc2vec[:k]:
        print("product_id= {} \n- product_title: {} \n- product_description: {}\n".format(p_id, title_index[p_id], desc_index[p_id]))
    print("==================================================================")

#### Results

In [38]:
for i, q in enumerate(queries):
    print_top_k_query_results_doc2vec(i+1, q, k, d2v_model, products_df, index)

Results for query 1: [1;34mwomen blue casual tshirt[0m
Top 20 results out of 301 for the searched query:

product_id= TSHFUH9ERJPPKRXF 
- product_title: Printed Women Round Neck Yellow, Dark Blue T-Shirt 
- product_description: Slim fit round neck stylish women's t-shirt with half sleeves and allover abstract print. It gives you a casual and smart summer look when paired with denim or shorts

product_id= TSHFVSZ2BSRQQVF5 
- product_title: Striped Women Polo Neck Blue, Dark Blue T-Shirt 
- product_description: Slim fit women's t-shirt in woven stripe pattern featuring polo neck, half sleeves and chest pocket. Wear this t-shirt with your choice of bottom for stylish and casual summer look.

product_id= TSHFUH9ESCWFWZDJ 
- product_title: Printed Women Round Neck Dark Blue, Black T-Shirt 
- product_description: Slim fit round neck stylish women's t-shirt with half sleeves and allover abstract print. It gives you a casual and smart summer look when paired with denim or shorts

product_id=