# IRWA PROJECT PART 2
Laia Tomàs Jané u198723\
Quim Ribas Martinez u198742 

In [238]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import numpy as np
import collections 
from collections import defaultdict
from numpy import linalg as la
import math

In [239]:
# Load the preprocessed product collection produced in part 1.
# The indexing code below reads its 'pid', 'title', 'description',
# 'title_clean' and 'description_clean' columns.
products_df = pd.read_json("fashion_products_dataset_processed.json")

In [240]:
#copy function from part1 for cleaning queries
def build_terms(text):
    """
    Turn a raw string into a list of stemmed terms.

    The text is lowercased and every character that is not a-z or whitespace
    is deleted (digits and punctuation are removed, not replaced by a space,
    so "t-shirt" becomes "tshirt"). The result is split on whitespace,
    English stop words and tokens shorter than 3 characters are discarded,
    and the remaining tokens are Porter-stemmed.

    Argument:
    text -- raw text; any non-string value yields an empty list

    Returns:
    list of stemmed tokens, in their original order
    """
    # anything that is not a string cannot be tokenized
    if not isinstance(text, str):
        return []

    letters_only = re.sub(r'[^a-z\s]', '', text.lower())

    english_stop_words = set(stopwords.words('english'))
    porter = PorterStemmer()

    terms = []
    for token in letters_only.split():
        # skip stop words and tokens of length 1 or 2
        if len(token) <= 2 or token in english_stop_words:
            continue
        terms.append(porter.stem(token))

    return terms

## PART 1: INDEXING

#### Build inverted index.

In [241]:
#index with just list of pids that contain the term
def create_index2(df):
    """
    Build a plain inverted index: term -> list of products containing it.

    Argument:
    df -- DataFrame with one product per row. The columns read are 'pid',
          'title', 'title_clean' and 'description_clean'; the two *_clean
          columns are concatenated with +, so they are expected to hold
          lists of already-cleaned terms.

    Returns:
    index -- defaultdict(list) mapping each term to the list of pids whose
             title or description contains it (each pid at most once, in
             order of first appearance)
    title_index -- dict mapping pid to the original (uncleaned) title,
                   only used to display results
    """
    index = defaultdict(list)
    # Per-term set of pids already recorded: gives O(1) duplicate detection
    # instead of scanning the posting list for every term occurrence.
    seen = defaultdict(set)
    title_index = {}

    for _, line in df.iterrows():
        pid = line['pid']
        title_index[pid] = line['title']

        for term in line['title_clean'] + line['description_clean']:
            if pid not in seen[term]:
                seen[term].add(pid)
                index[term].append(pid)

    return index, title_index

In [242]:
# Build the plain term -> [pids] index (no frequencies).
index2, title_index2 = create_index2(products_df)

In [243]:
#index with term as key and dict of pids as key and frequency of the term in the product as values. later used this one
def create_index(df):
    """
    Build an inverted index that also stores raw term frequencies.

    Argument:
    df -- DataFrame with one product per row. The columns read are 'pid',
          'title', 'description', 'title_clean' and 'description_clean';
          the two *_clean columns are concatenated with +, so they are
          expected to hold lists of already-cleaned terms.

    Returns:
    index -- nested defaultdict: index[term][pid] is the number of times
             `term` occurs in the cleaned title + description of `pid`
    title_index -- dict mapping pid to the original title (display only)
    desc_index -- dict mapping pid to the original description (display only)
    """
    index = defaultdict(lambda: defaultdict(lambda: 0))
    title_index = {}
    desc_index = {}

    for _, line in df.iterrows():
        pid = line['pid']
        title_index[pid] = line['title']
        desc_index[pid] = line['description']

        for term in line['title_clean'] + line['description_clean']:
            # The inner defaultdict starts every counter at 0, so no
            # try/except is needed for the first occurrence.
            index[term][pid] += 1

    return index, title_index, desc_index


In [244]:
# Build the frequency index plus the pid -> title / pid -> description lookups.
index, title_index, desc_index = create_index(products_df)

In [245]:
#first function to try to search queries, without any ranking
def search(query, index):
    """
    Conjunctive (AND) search without ranking.

    The query is cleaned with build_terms so its terms match the stemmed
    terms stored in the index, then the posting sets of all query terms
    are intersected.

    Argument:
    query -- raw query string
    index -- inverted index mapping each term to a collection of pids
             (iterating index[term] must yield pids)

    Returns:
    list of pids containing every query term, in arbitrary order; an empty
    list if the query has no usable terms or some term is not indexed
    """
    terms = build_terms(query)
    products = None  # None until the first term has been processed

    for term in terms:
        # Test membership explicitly: with a defaultdict index, index[term]
        # never raises and would silently insert an empty posting instead.
        if term not in index:
            return []

        term_products = set(index[term])
        products = term_products if products is None else products & term_products

        # the intersection can only shrink, so stop as soon as it is empty
        if not products:
            return []

    return list(products) if products else []


#### test queries without ranking

In [246]:
# Run an unranked conjunctive search and show the first `top` matches.
query = "solid blue tshirt"
products = search(query, index)
top = 5

# Header reports how many results are shown versus how many matched in total.
print("======================\nSample of {} results out of {} for the searched query ({}):\n".format(len(products[:top]), len(products), query))
for p_id in products[:top]:
    print("product id= {} \n- product title: {} \n- product description: {}\n".format(p_id, title_index[p_id], desc_index[p_id]))

Sample of 5 results out of 1095 for the searched query (solid blue tshirt):

product id= TSHFYUQXFRMY4VAW 
- product title: Solid Men Henley Neck Blue T-Shirt 
- product description: 

product id= TSHFWBWESMCTBUKC 
- product title: Solid Men Round Neck Dark Blue T-Shirt  (Pack of 3) 
- product description: Refresh your wardrobe with the Premium Collection of Cotton Round Neck T-shirts from Scott International. These t-shirts are made of Pure Cotton and provide comfort all day. Melange Fabric, Elegant Stitching and finish, Soft Neck Tape, Comfort Fit makes this product an ideal daily wear and casual wear. Combine with Trousers or Denims for a cool Casual look. Combo pack is ideal value for money.

product id= TSHFV5YXNSEHZ9JZ 
- product title: Solid Women Polo Neck Dark Blue, White, Red T-Shirt 
- product description: Stylish women's slim fit polo t-shirt with half sleeves, contrast inner placket and contrast two color panel across chest

product id= TSHEM8ZJAFWFGFFH 
- product title: S

In [247]:
# Same check with a term ("nike") that does not occur in the index: because
# the search is conjunctive, one missing term makes the whole result empty.
query = "solid blue tshirt nike" # as nike is not in the index, the conjunctive query returns an empty list
products = search(query, index)
top = 5

print("======================\nSample of {} results out of {} for the searched query ({}):\n".format(len(products[:top]), len(products), query))
for p_id in products[:top]:
    print("product id= {} \n- product title: {} \n- product description: {}\n".format(p_id, title_index[p_id], desc_index[p_id]))

Sample of 0 results out of 0 for the searched query (solid blue tshirt nike):



#### ranking

In [248]:
def create_index_tfidf(products_df, num_products):
    """
    Build the inverted index together with the tf, df and idf tables.

    Argument:
    products_df -- DataFrame with one product per row; the columns read are
                   'pid', 'title', 'description', 'title_clean' and
                   'description_clean' (the *_clean columns are concatenated
                   with +, so they are expected to hold lists of terms)
    num_products -- total number of products, used as the idf numerator

    Returns:
    index - index[term][pid] is the raw count of `term` in product `pid`
    tf - tf[term] is a list with one L2-normalized frequency per product
         containing the term, appended in the order products are processed
         (i.e. the same order in which pids are inserted into index[term])
    df - number of products each term appears in
    idf - log(num_products / df[term]) for each term, rounded to 4 decimals
    title_index - pid -> original title (display only)
    desc_index - pid -> original description (display only)
    """
    index = defaultdict(lambda: defaultdict(lambda: 0))
    tf = defaultdict(list)
    df = defaultdict(int)
    idf = defaultdict(float)
    title_index = defaultdict(str)
    desc_index = defaultdict(str)

    for _, row in products_df.iterrows():
        pid = row['pid']
        title_index[pid] = row['title']
        desc_index[pid] = row['description']

        # raw frequency of every distinct term of this product
        term_counts = collections.Counter(row['title_clean'] + row['description_clean'])

        # Euclidean length of the product's count vector; it is shared by
        # all terms of the product
        norm = math.sqrt(sum(count ** 2 for count in term_counts.values()))

        for term, count in term_counts.items():
            index[term][pid] += count
            tf[term].append(np.round(count / norm, 4))
            df[term] += 1  # one more product contains this term

    # idf can only be computed once every document frequency is final
    for term, doc_freq in df.items():
        idf[term] = np.round(np.log(float(num_products / doc_freq)), 4)

    return index, tf, df, idf, title_index, desc_index


In [249]:
# Rebuild the index with tf/df/idf; this overwrites the `index`, `title_index`
# and `desc_index` created by create_index above.
num_products = len(products_df)
index, tf, df, idf, title_index, desc_index = create_index_tfidf(products_df, num_products)

In [250]:
#Represent the query as a weighted tf-idf vector
#Represent each product as a weighted tfidf vector
#Compute the cosine similarity score for the
#query vector and each product vector
#Rank product with respect to the query by score
#Return the top K (e.g., K = 10) to the user
def rank_products(terms, pids, index, idf, tf, title_index):
    """
    Rank the products matching a query by their tf-idf score.

    The query and every candidate product are represented as tf-idf vectors
    restricted to the query terms (all other components would be multiplied
    by 0 anyway). The score is the dot product of the two vectors. Product
    vectors are not re-normalized after idf weighting, so the score is a
    tf-idf dot product rather than a strict cosine similarity.

    Argument:
    terms -- list of (cleaned) query terms
    pids -- collection of candidate products to rank
    index -- inverted index; iterating index[term] yields the pids containing term
    idf -- inverse document frequency per term
    tf -- tf[term][i] is the normalized frequency of term in the i-th pid
          of index[term] (the two must be aligned)
    title_index -- mapping pid -> title (unused here, kept for the interface)

    Returns:
    result_products -- pids sorted by decreasing score
    products_scores -- list of [score, pid] pairs in the same order
    Prints a message when nothing can be ranked.
    """
    # Set membership is O(1); testing against the original list inside the
    # posting loop would cost O(len(pids)) per posting.
    candidates = set(pids)

    # a missing pid gets an all-zero vector on first access
    products_vectors = defaultdict(lambda: [0] * len(terms))
    query_vector = [0] * len(terms)

    # normalize the query term frequencies the same way as the product ones
    query_terms_count = collections.Counter(terms)
    query_norm = la.norm(list(query_terms_count.values()))

    for termIndex, term in enumerate(terms):
        if term not in index:
            continue

        query_vector[termIndex] = query_terms_count[term] / query_norm * idf[term]

        # pid_index is the position of pid inside index[term], which is also
        # the position of its tf value inside tf[term]
        for pid_index, pid in enumerate(index[term]):
            if pid in candidates:
                products_vectors[pid][termIndex] = tf[term][pid_index] * idf[term]

    products_scores = [
        [np.dot(curProdVec, query_vector), product]
        for product, curProdVec in products_vectors.items()
    ]
    # highest score first; ties are broken by pid in descending order
    products_scores.sort(reverse=True)
    result_products = [x[1] for x in products_scores]

    if not result_products:
        print("No results found, try again")

    return result_products, products_scores

In [251]:
def search_tf_idf(query, index):
    """
    Conjunctive (AND) search followed by tf-idf ranking.

    Only products containing ALL query terms are kept (the posting sets are
    intersected, not united); they are then ranked with rank_products using
    the module-level `idf`, `tf` and `title_index` tables.

    Argument:
    query -- raw query string (cleaned here with build_terms)
    index -- inverted index; iterating index[term] yields the pids containing term

    Returns:
    (ranked_products, product_scores) as returned by rank_products. The
    result is always a 2-tuple, (empty, empty) when nothing matches, so
    callers can unpack it unconditionally.
    """
    query = build_terms(query)  # so that stemmed terms are matched in the index
    products = None  # None until the first term has been processed

    for term in query:
        # Explicit membership test: with a defaultdict index, index[term]
        # would insert an empty posting for an unknown term instead of raising.
        term_products = set(index[term]) if term in index else set()
        products = term_products if products is None else products & term_products

        # a missing term (or an empty intersection) means no product has all terms
        if not products:
            break

    # products is still None when the query has no usable terms
    products = list(products) if products else []
    ranked_products, product_scores = rank_products(query, products, index, idf, tf, title_index)
    return ranked_products, product_scores

#### test queries with ranking

In [252]:
def print_top_k_query_results(query, index, k):
    """
    Run a ranked search and print the k best products.

    Argument:
    query -- raw query string
    index -- inverted index passed on to search_tf_idf
    k -- maximum number of results to print

    Titles and descriptions are looked up in the module-level `title_index`
    and `desc_index`.
    """
    print(f"Results for query: {query}\n")

    ranked_products, scores = search_tf_idf(query, index)
    top_k = ranked_products[:k]

    header = "======================\nTop {} results out of {} for the searched query:\n"
    print(header.format(len(top_k), len(ranked_products)))

    entry = "product_id= {} \n- product_title: {} \n- product_description: {}\n"
    for p_id in top_k:
        print(entry.format(p_id, title_index[p_id], desc_index[p_id]))

##### Define 5 queries based on keywords ranked by term-frequency

In [253]:
# Rank terms by total TF: for each term, sum its normalized tf over all the
# products that contain it, then sort in decreasing order. The most frequent
# terms are used as keywords for the test queries.
tf_ranked = sorted(
    ((term, sum(scores)) for term, scores in tf.items()),
    key=lambda x: x[1],
    reverse=True
)

print("Top terms by total TF:")
for term, score in tf_ranked[:15]:
    print(f"{term}: {score:.4f}")


Top terms by total TF:
tshirt: 4585.3306
women: 4043.7173
men: 3955.4447
neck: 3259.5684
print: 2928.9821
solid: 2494.3121
round: 2343.7958
shirt: 1827.0592
fit: 1804.0305
cotton: 1757.3377
casual: 1556.9151
pack: 1310.8927
blue: 1245.1850
sleev: 1123.0076
comfort: 1108.3091


In [301]:
# Five test queries built from frequent keywords.
query1 = "women blue casual tshirt" 
query2 = "cotton fit casual"
query3 = "animal print tshirt"
query4 = "round neck black dress"
# NOTE: build_terms deletes digits and drops tokens shorter than 3 characters,
# so "of" and "3" do not reach the index lookup for this query.
query5 = "solid shirt pack of 3"

In [302]:
# Top 5 ranked results for query1.
print_top_k_query_results(query1, index, 5)

Results for query: women blue casual tshirt

Top 5 results out of 301 for the searched query:

product_id= TSHEUJ4VFYTMZTXZ 
- product_title: Solid Women Round or Crew Blue T-Shirt 
- product_description: Axmann Light Blue Round Neck Casual Summer Wear Women T-shirt BY Axmann. Made with premium fabric for the most ultimate comfort. Range of vibrant colored casual t-shirts

product_id= TSHFEY8VRGCMRSGG 
- product_title: Typography Women Round Neck Blue T-Shirt 
- product_description: Blue Chest print Kintted Cotton Casual T-shirts, has a round neck , Half Sleeve

product_id= TSHEU7DTUYMHMGW6 
- product_title: Striped Women Polo Neck Blue T-Shirt 
- product_description: Axmann Blue Striped Casual Wear Women Summer Polo T-shirt BY Axmann. Made with premium fabric for the most ultimate comfort. Range of vibrant colored casual t-shirts

product_id= TSHEU7DSSQGESYNH 
- product_title: Printed Women Round Neck Blue T-Shirt 
- product_description: Axmann Blue Round Neck Casual Summer Wear Women

In [303]:
# Top 5 ranked results for query2.
print_top_k_query_results(query2, index, 5)

Results for query: cotton fit casual

Top 5 results out of 1072 for the searched query:

product_id= SHTFVGSFSMEGVAYX 
- product_title: Men Slim Fit Solid Casual Shirt 
- product_description: Copper casual slim fit cotton shirt.

product_id= SHTFVGSE7YBCVHYA 
- product_title: Women Slim Fit Solid Casual Shirt 
- product_description: Green casual slim fit shirt cut from cotton.

product_id= SHTFS2GNV9SUCT6M 
- product_title: Women Slim Fit Checkered Casual Shirt 
- product_description: CANTABIL Women's Maroon 100% Cotton Slim Fit Casual Shirt

product_id= SHTFS2GNPXXKXY2U 
- product_title: Women Slim Fit Checkered Casual Shirt 
- product_description: CANTABIL Women's Brown 100% Cotton Slim Fit Casual Shirt

product_id= SHTFS2GNP3Z3CCHH 
- product_title: Men Slim Fit Striped Casual Shirt 
- product_description: CANTABIL Men's Navy 100% Cotton Slim Fit Casual Shirt



In [304]:
# Top 5 ranked results for query3.
print_top_k_query_results(query3, index, 5)

Results for query: animal print tshirt

Top 2 results out of 2 for the searched query:

product_id= TSHFNQW3FRSGET38 
- product_title: Animal Print Women Round Neck Yellow T-Shirt 
- product_description: 

product_id= TSHFW2YBHHFP7TJW 
- product_title: Animal Print Men Round Neck Blue T-Shirt 
- product_description: When you want to look cool without compromising on comfort this T-shirt is what you need. you're going to make a great impression With an eye-catching design, this stylish t-shirt is ideal for everyday wear. 1/2 length sleeves and a round neck. Made from breathable cotton-rich fabric



In [305]:
# Top 5 ranked results for query4.
print_top_k_query_results(query4, index, 5)

Results for query: round neck black dress

Top 5 results out of 16 for the searched query:

product_id= TSHFK3W7BTPN7PRQ 
- product_title: Self Design Women Round Neck Black T-Shirt 
- product_description: An everyday essential dressed up. A textured jacquard fabric with an allover monogram print adds some class to this women's t-shirt. The print is subtle and tonal for an understated look. A droptail hem in the back provides extra coverage.

product_id= SWTFVMSDDZRPXAPT 
- product_title: Striped Round Neck Casual Women Reversible Red Sweater 
- product_description: Red and Black striped reversible sweater, has a round neck, long sleeves, and straight hem. This top-of-the-line Levis sweater will keep you looking your best all season long.  Great for breezy weather, this red addition can be dressed up with dark trousers and leather shoe.

product_id= TSHFUXPWUR2ZTEE2 
- product_title: Color Block Men V Neck Dark Blue T-Shirt 
- product_description: 1.PC cotton bio-washed full sleeves t-

In [306]:
# Top 5 ranked results for query5.
print_top_k_query_results(query5, index, 5)

Results for query: solid shirt pack of 3

Top 5 results out of 717 for the searched query:

product_id= SHTFMBTYM2FKAKMN 
- product_title: Men Regular Fit Solid Formal Shirt  (Pack of 2) 
- product_description: 

product_id= SHTFZNSJC8RZFF4W 
- product_title: Women Regular Fit Solid Slim Collar Casual Shirt  (Pack of 2) 
- product_description: 

product_id= SHTFZNSJ8FXUECM5 
- product_title: Men Regular Fit Solid Slim Collar Casual Shirt  (Pack of 2) 
- product_description: 

product_id= SHTFT26KX5DEYK4H 
- product_title: Men Regular Fit Solid Spread Collar Casual Shirt  (Pack of 2) 
- product_description: 

product_id= SHTFT26KMM39T2ZC 
- product_title: Women Regular Fit Solid Spread Collar Casual Shirt  (Pack of 2) 
- product_description: 



## PART 2: EVALUATION

In [307]:
# Load the provided relevance judgements; the cells below read its
# 'pid', 'labels' and 'query_id' columns.
validation_df = pd.read_csv('validation_labels.csv')

In [308]:
# Inspect the distinct label values to confirm that relevance is binary.
validation_df['labels'].unique()
# we already have a binary representation

array([1, 0])

### validate functions

In [None]:
def precision_at_k(product_gt, y_score, k=10):
    """
    Fraction of relevant products among the k highest-scored ones.

    Parameters
    ----------
    product_gt: Ground truth (true relevance labels, 1 = relevant).
    y_score: Predicted scores, aligned with product_gt.
    k : number of doc to consider; capped at len(product_gt), so when fewer
        than k products are available the denominator is the number of
        available products rather than k.

    Returns
    -------
    precision @k : float (0.0 when there is nothing to evaluate)

    """
    # accept lists / pandas Series as well as numpy arrays
    product_gt = np.asarray(product_gt)
    y_score = np.asarray(y_score)

    k = min(k, len(product_gt))
    if k <= 0:
        # empty input: avoid dividing by zero
        return 0.0

    # negating the scores makes argsort return indices by decreasing score
    order = np.argsort(-y_score)

    top_k_gt = product_gt[order[:k]]
    relevant = np.sum(top_k_gt == 1)
    return float(relevant) / k

In [310]:
def recall_at_k(product_score, y_score, k=10):
    """
    Share of all relevant products that appear among the k highest-scored ones.

    Parameters
    ----------
    product_score: Ground truth relevance labels (1 = relevant).
    y_score: Predicted scores, aligned with product_score.
    k : number of doc to consider (capped at len(product_score)).

    Returns
    -------
    recall @k : float, 0.0 when no product is relevant at all
    """
    cutoff = min(k, len(product_score))
    ranking = np.argsort(-y_score)  # indices by decreasing score

    n_relevant = np.sum(product_score == 1)
    if n_relevant == 0:
        return 0.0

    hits_in_top = np.sum(product_score[ranking[:cutoff]] == 1)
    return float(hits_in_top) / n_relevant


In [311]:
def average_precision_at_k(product_gt, predicted_scores, k=10):
    """
    Average precision over the k highest-scored products.

    Precision is evaluated at every rank that holds a relevant product and
    the values are averaged over the number of relevant products found in
    the top k (not over all relevant products in the collection).

    Parameters
    ----------
    product_gt: Ground truth relevance labels (1 = relevant).
    predicted_scores: Predicted scores, aligned with product_gt.
    k : number of doc to consider (capped at len(product_gt)).

    Returns
    -------
    average precision @k : float, 0.0 when the top k has no relevant product
    """
    # accept lists / pandas Series as well as numpy arrays
    product_gt = np.asarray(product_gt)
    predicted_scores = np.asarray(predicted_scores)

    k = min(k, len(product_gt))
    order = np.argsort(-predicted_scores)  # indices by decreasing score
    hits = product_gt[order[:k]] == 1

    relevant = int(np.sum(hits))
    if relevant == 0:
        return 0.0

    # cumsum(hits)[i] is the number of relevant products in ranks 1..i+1,
    # so every precision@i comes from a single pass instead of a recount
    # at each rank
    ranks = np.arange(1, len(hits) + 1)
    precision_at_rank = np.cumsum(hits) / ranks

    return float(np.sum(precision_at_rank[hits]) / relevant)


In [312]:
def f1_score_at_k(product_gt, predicted_scores, k=10):
    """
    Harmonic mean of precision@k and recall@k.

    Parameters
    ----------
    product_gt: Ground truth relevance labels (1 = relevant).
    predicted_scores: Predicted scores, aligned with product_gt.
    k : number of doc to consider (capped at len(product_gt)).

    Returns
    -------
    F1 @k : float, 0.0 when both precision and recall are 0
    """
    k = min(k, len(product_gt))

    precision = precision_at_k(product_gt, predicted_scores, k)
    recall = recall_at_k(product_gt, predicted_scores, k)

    denominator = precision + recall
    if denominator == 0:
        return 0.0
    return 2 * (precision * recall) / denominator

In [313]:
def mean_average_precision(queries_product_gt, queries_predicted_scores, k):
    """
    Mean of average_precision_at_k over a set of queries.

    Parameters
    ----------
    queries_product_gt: dict query id -> ground truth labels for that query.
    queries_predicted_scores: dict query id -> predicted scores for that
        query; its keys define which queries are evaluated.
    k : number of doc to consider per query.

    Returns
    -------
    MAP @k : float, 0.0 when there are no queries
    """
    if not queries_predicted_scores:
        # no queries: avoid dividing by zero
        return 0.0

    total = sum(
        average_precision_at_k(queries_product_gt[query], predicted_scores, k)
        for query, predicted_scores in queries_predicted_scores.items()
    )

    return total / len(queries_predicted_scores)


In [314]:
def reciprocal_rank(product_gt, predicted_scores, k):
    """
    Reciprocal of the rank of the first relevant product within the top k.

    Parameters
    ----------
    product_gt: Ground truth relevance labels (1 = relevant).
    predicted_scores: Predicted scores, aligned with product_gt.
    k : number of doc to consider (capped at len(product_gt)).

    Returns
    -------
    1 / rank of the first relevant product (ranks start at 1), or 0 when
    none of the top k products is relevant
    """
    cutoff = min(k, len(product_gt))
    ranking = np.argsort(-predicted_scores)  # indices by decreasing score
    ranked_gt = product_gt[ranking[:cutoff]]

    positions = range(min(len(ranking), cutoff))
    first_hit = next((pos for pos in positions if ranked_gt[pos] == 1), None)

    return 0 if first_hit is None else 1 / (first_hit + 1)

def mean_reciprocal_rank(queries_product_gt, queries_predicted_scores, k):
    """
    Mean of reciprocal_rank over a set of queries.

    Parameters
    ----------
    queries_product_gt: dict query id -> ground truth labels for that query.
    queries_predicted_scores: dict query id -> predicted scores for that
        query; its keys define which queries are evaluated.
    k : number of doc to consider per query.

    Returns
    -------
    MRR @k : float, 0.0 when there are no queries
    """
    if not queries_predicted_scores:
        # no queries: avoid dividing by zero
        return 0.0

    total = sum(
        reciprocal_rank(queries_product_gt[query], predicted_scores, k)
        for query, predicted_scores in queries_predicted_scores.items()
    )

    return total / len(queries_predicted_scores)


In [None]:
def dcg_at_p(product_gt, predicted_scores, p):
    """
    Discounted cumulative gain of the p highest-scored products.

    Each product at 0-based position i contributes
    (2 ** relevance - 1) / log2(i + 2).

    Parameters
    ----------
    product_gt: Ground truth relevance labels.
    predicted_scores: Predicted scores, aligned with product_gt.
    p : number of positions to accumulate (capped at the number of scores).

    Returns
    -------
    DCG @p (0 when no position is accumulated)
    """
    ranking = np.argsort(-predicted_scores)  # indices by decreasing score
    ranked_gt = product_gt[ranking]

    depth = min(p, len(ranking))
    return sum(
        (2 ** ranked_gt[position] - 1) / np.log2(position + 2)
        for position in range(depth)
    )

def normalized_discounted_cumulative_gain(product_gt, predicted_scores, p):
    """
    NDCG@p: DCG of the predicted ranking divided by the DCG of the ideal one.

    The ideal ranking is built from the labels passed in, i.e. from the
    evaluated products only.

    Parameters
    ----------
    product_gt: Ground truth relevance labels.
    predicted_scores: Predicted scores, aligned with product_gt.
    p : number of positions to consider.

    Returns
    -------
    NDCG @p, 0.0 when the ideal DCG is 0 (no relevant product)
    """
    # Convert up front so that lists are accepted too: negating or
    # fancy-indexing a plain list would raise a TypeError.
    product_gt = np.asarray(product_gt)
    predicted_scores = np.asarray(predicted_scores)

    DCG_p = dcg_at_p(product_gt, predicted_scores, p)

    # ideal ranking: labels in decreasing order, truncated to p positions
    ideal_gt = np.sort(product_gt)[::-1][:p]
    IDCG_p = dcg_at_p(ideal_gt, ideal_gt, p)

    if IDCG_p == 0:
        return 0.0

    return DCG_p / IDCG_p


In [316]:
def print_results_at_k(queries_gt, queries_predicted_scores, k):
    """
    Print every evaluation metric at cut-off k.

    Per query: Precision, Recall, Average precision, F1-Score and NDCG.
    Then, over all queries: MAP and MRR.

    Parameters
    ----------
    queries_gt: dict query id -> ground truth labels for that query.
    queries_predicted_scores: dict query id -> predicted scores for that query.
    k : cut-off used by all metrics.
    """
    separator = "--------------------------------------------------"

    # (output template, metric function) pairs, in printing order
    per_query_metrics = [
        ("\nPrecision@{}: {:.3g}\n", precision_at_k),
        ("Recall@{}: {:.3g}\n", recall_at_k),
        ("Average precision@{}: {:.3g}\n", average_precision_at_k),
        ("F1-Score@{}: {:.3g}\n", f1_score_at_k),
        ("NDCG@{}: {:.3g}", normalized_discounted_cumulative_gain),
    ]

    for query_id, products_gt in queries_gt.items():
        predicted_scores = queries_predicted_scores[query_id]
        print(separator)
        print("Results for query_id:", query_id)
        for template, metric in per_query_metrics:
            print(template.format(k, metric(products_gt, predicted_scores, k)))

    print(separator)
    print("MAP@{}: {:.3g}\n".format(k, mean_average_precision(queries_gt, queries_predicted_scores, k)))
    print("MRR@{}: {:.3g}\n".format(k, mean_reciprocal_rank(queries_gt, queries_predicted_scores, k)))

### RESULTS FOR VALIDATION QUERIES

In [317]:
# The two validation queries. NOTE: this rebinds query1 and query2, replacing
# the test queries of the same name defined earlier; query3-query5 keep
# their earlier values.
query1 = "women full sleeve sweatshirt cotton"
query2 = "men slim jeans blue"

# Ranked pids and [score, pid] pairs for each validation query.
result_products_q1, predicted_scores_q1 = search_tf_idf(query1, index)
result_products_q2, predicted_scores_q2 = search_tf_idf(query2, index)

In [318]:
# Turn the [score, pid] pairs into DataFrames so they can be joined with the labels on 'pid'.
predicted_scores_q1 = pd.DataFrame(predicted_scores_q1, columns=['predicted_score', 'pid'])
predicted_scores_q2 = pd.DataFrame(predicted_scores_q2, columns=['predicted_score', 'pid'])

In [319]:
# Join the labels of each validation query with our predicted scores.
# NOTE(review): merge() is called without `how`, so this is pandas' default
# inner join: labelled products that the conjunctive search did not return
# are dropped and do not count in the metrics. Confirm this is intended.
validation_q1 = validation_df[validation_df['query_id']==1][['pid', 'labels', 'query_id']]
q1_scores_merged = validation_q1.merge(predicted_scores_q1, on='pid')

validation_q2 = validation_df[validation_df['query_id']==2][['pid', 'labels', 'query_id']]
q2_scores_merged = validation_q2.merge(predicted_scores_q2, on='pid')

In [320]:
# Extract aligned numpy arrays (ground truth labels, predicted scores) per query.
q1_gt = q1_scores_merged['labels'].values
q1_predicted_scores = q1_scores_merged['predicted_score'].values

q2_gt = q2_scores_merged['labels'].values
q2_predicted_scores = q2_scores_merged['predicted_score'].values

In [321]:
# Create dictionaries keyed by query id holding the ground truths and the
# predicted scores, then print every metric at k = 10.
queries_gt = {1: q1_gt, 2: q2_gt}
queries_pred_scores = {1: q1_predicted_scores, 2: q2_predicted_scores}
k = 10

print_results_at_k(queries_gt, queries_pred_scores, k)

--------------------------------------------------
Results for query_id: 1

Precision@10: 0.9

Recall@10: 0.818

Average precision@10: 1

F1-Score@10: 0.857

NDCG@10: 0.936
--------------------------------------------------
Results for query_id: 2

Precision@10: 0.75

Recall@10: 1

Average precision@10: 0.948

F1-Score@10: 0.857

NDCG@10: 0.984
--------------------------------------------------
MAP@10: 0.974

MRR@10: 1



In [333]:
# Same evaluation at k = 5.
# NOTE(review): judging by its execution count and its output (query strings
# as ids), this cell was run after the manual-labelling cells further down,
# so `queries_gt` / `queries_pred_scores` held the five manually labelled
# queries at that point, not the two validation queries defined just above.
k = 5
print_results_at_k(queries_gt, queries_pred_scores, k)

--------------------------------------------------
Results for query_id: women full sleeve sweatshirt cotton

Precision@5: 1

Recall@5: 0.25

Average precision@5: 1

F1-Score@5: 0.4

NDCG@5: 1
--------------------------------------------------
Results for query_id: men slim jeans blue

Precision@5: 1

Recall@5: 0.25

Average precision@5: 1

F1-Score@5: 0.4

NDCG@5: 1
--------------------------------------------------
Results for query_id: animal print tshirt

Precision@5: 1

Recall@5: 1

Average precision@5: 1

F1-Score@5: 1

NDCG@5: 1
--------------------------------------------------
Results for query_id: round neck black dress

Precision@5: 0

Recall@5: 0

Average precision@5: 0

F1-Score@5: 0

NDCG@5: 0
--------------------------------------------------
Results for query_id: solid shirt pack of 3

Precision@5: 0

Recall@5: 0

Average precision@5: 0

F1-Score@5: 0

NDCG@5: 0
--------------------------------------------------
MAP@5: 0.6

MRR@5: 0.6



In [322]:
# Check our functions with the available functions of sklearn library
from sklearn.metrics import average_precision_score, ndcg_score

k=6
# sklearn's average_precision_score has no cut-off parameter, so the top-k rows
# are selected by hand before calling it.
temp = q2_scores_merged.sort_values("predicted_score", ascending=False).head(k)
print("Average Precision@{} from sklearn: {:.3g}".format(k, average_precision_score(np.array(temp["labels"]), np.array(temp["predicted_score"][:k]))))
print("Average precision@{}: {:.3g}\n".format(k, average_precision_at_k(q2_gt, q2_predicted_scores, k)))

# ndcg_score expects 2-D inputs (one row per query), hence the extra brackets.
print("NDCG@{} from sklearn: {:.3g}".format(k, ndcg_score([q2_gt], [q2_predicted_scores], k=k)))
print("NDCG@{}: {:.3g}".format(k, normalized_discounted_cumulative_gain(q2_gt, q2_predicted_scores, k)))

Average Precision@6 from sklearn: 0.967
Average precision@6: 0.967

NDCG@6 from sklearn: 0.883
NDCG@6: 0.883


### Assign a binary relevance label to each document

In [323]:
# Rank the products for the five queries to be labelled by hand.
# NOTE: query1 and query2 are the validation queries assigned most recently;
# query3-query5 are the earlier test queries.
q1_ranked_products, q1_scores = search_tf_idf(query1, index)
q2_ranked_products, q2_scores = search_tf_idf(query2, index)
q3_ranked_products, q3_scores = search_tf_idf(query3, index)
q4_ranked_products, q4_scores = search_tf_idf(query4, index)
q5_ranked_products, q5_scores = search_tf_idf(query5, index)

In [None]:
# Keep only the 20 best-scored [score, pid] pairs per query for manual
# labelling (queries with fewer results keep all of them).
k = 20
q1_scores = q1_scores[:k]
q2_scores = q2_scores[:k]
q3_scores = q3_scores[:k]
q4_scores = q4_scores[:k]
q5_scores = q5_scores[:k]


In [325]:
# Table of the top results for query 1, with title and description attached for inspection.
df_q1_scores = pd.DataFrame(q1_scores, columns=['predicted_relevance', 'pid'])
df_q1_scores['title'] = df_q1_scores['pid'].map(title_index)
df_q1_scores['description'] = df_q1_scores['pid'].map(desc_index)

# Manually assigned binary relevance, one label per row in ranking order
# (the array length must equal the number of rows).
gt_scores_q1 = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ])
df_q1_scores['ground_truth'] = gt_scores_q1


In [326]:
# Table of the top results for query 2, with title and description attached for inspection.
df_q2_scores = pd.DataFrame(q2_scores, columns=['predicted_relevance', 'pid'])
df_q2_scores['title'] = df_q2_scores['pid'].map(title_index)
df_q2_scores['description'] = df_q2_scores['pid'].map(desc_index)

# Manually assigned binary relevance, one label per row in ranking order
# (the array length must equal the number of rows).
gt_scores_q2 = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ])
df_q2_scores['ground_truth'] = gt_scores_q2


In [327]:
# Table of the results for query 3, with title and description attached for inspection.
df_q3_scores = pd.DataFrame(q3_scores, columns=['predicted_relevance', 'pid'])
df_q3_scores['title'] = df_q3_scores['pid'].map(title_index)
df_q3_scores['description'] = df_q3_scores['pid'].map(desc_index)

# Manually assigned binary relevance; two labels, so this expects exactly two result rows.
gt_scores_q3 = np.array([1, 1])
df_q3_scores['ground_truth'] = gt_scores_q3


In [328]:
# Table of the results for query 4, with title and description attached.
df_q4_scores = pd.DataFrame(q4_scores, columns=['predicted_relevance', 'pid'])
df_q4_scores['title'] = df_q4_scores['pid'].map(title_index)
df_q4_scores['description'] = df_q4_scores['pid'].map(desc_index)
descriptions = df_q4_scores['description'].values
titles = df_q4_scores['title'].values
# Print all titles, then all descriptions, to judge relevance by hand.
for title in titles:
    print(title)
for desc in descriptions:
    print(desc)


Self Design Women Round Neck Black T-Shirt
Striped Round Neck Casual Women Reversible Red Sweater
Color Block Men V Neck Dark Blue T-Shirt
Solid Women V Neck Maroon T-Shirt
Solid Women Round Neck Black T-Shirt  (Pack of 2)
Solid Women Round Neck Black T-Shirt
Solid Men Round Neck Black T-Shirt
Solid Women Round Neck Black T-Shirt
Solid Men Round Neck Black T-Shirt  (Pack of 2)
Solid Women Round Neck Black T-Shirt  (Pack of 2)
Solid Women Round Neck Black T-Shirt  (Pack of 2)
Solid Women Round Neck Black T-Shirt  (Pack of 2)
Solid Men Round Neck Black T-Shirt  (Pack of 2)
Solid Women Round Neck Black T-Shirt  (Pack of 2)
Solid Women Round Neck Black T-Shirt  (Pack of 2)
Solid Women Round Neck Black T-Shirt  (Pack of 2)
An everyday essential dressed up. A textured jacquard fabric with an allover monogram print adds some class to this women's t-shirt. The print is subtle and tonal for an understated look. A droptail hem in the back provides extra coverage.
Red and Black striped reversible

In [329]:
# Manual labels for query 4: all non-relevant. "dress" only matches descriptions
# through forms such as "dressed up" (same stem), so none of the returned
# products is actually a dress.
gt_scores_q4 = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) # because dress is in the description but in the form as dressed up, the recommended items are not dresses
df_q4_scores['ground_truth'] = gt_scores_q4

In [330]:
# Table of the top results for query 5, with title and description attached.
df_q5_scores = pd.DataFrame(q5_scores, columns=['predicted_relevance', 'pid'])
df_q5_scores['title'] = df_q5_scores['pid'].map(title_index)
df_q5_scores['description'] = df_q5_scores['pid'].map(desc_index)
titles = df_q5_scores['title'].values
# Print the titles to judge relevance by hand.
for title in titles:
    print(title)

Men Regular Fit Solid Formal Shirt  (Pack of 2)
Women Regular Fit Solid Slim Collar Casual Shirt  (Pack of 2)
Men Regular Fit Solid Slim Collar Casual Shirt  (Pack of 2)
Men Regular Fit Solid Spread Collar Casual Shirt  (Pack of 2)
Women Regular Fit Solid Spread Collar Casual Shirt  (Pack of 2)
Women Regular Fit Solid Spread Collar Casual Shirt  (Pack of 2)
Women Regular Fit Solid Spread Collar Casual Shirt  (Pack of 2)
Women Regular Fit Solid Spread Collar Casual Shirt  (Pack of 2)
Men Regular Fit Solid Spread Collar Casual Shirt  (Pack of 2)
Men Regular Fit Solid Spread Collar Casual Shirt  (Pack of 2)
Women Regular Fit Solid Spread Collar Casual Shirt  (Pack of 2)
Women Regular Fit Solid Spread Collar Casual Shirt  (Pack of 2)
Women Regular Fit Solid Spread Collar Casual Shirt  (Pack of 2)
Women Regular Fit Solid Spread Collar Casual Shirt  (Pack of 2)
Men Regular Fit Solid Spread Collar Casual Shirt  (Pack of 2)
Men Regular Fit Solid Spread Collar Casual Shirt  (Pack of 2)
Women Re

In [331]:
# Manual labels for query 5: all non-relevant, since every returned product is
# a pack of 2 while the query asks for a pack of 3.
gt_scores_q5 = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) #because the results are all pack of 2, not 3
df_q5_scores['ground_truth'] = gt_scores_q5

In [332]:
# Evaluate the five manually labelled queries, keyed by query string.
# Ground truths are numpy arrays; predicted scores are pandas Series taken
# from the per-query tables. This rebinds `queries_gt` and
# `queries_pred_scores` used by the validation evaluation.
queries_gt = {query1: gt_scores_q1, query2: gt_scores_q2, query3: gt_scores_q3, query4: gt_scores_q4, query5: gt_scores_q5}
queries_pred_scores = {query1: df_q1_scores['predicted_relevance'], query2: df_q2_scores['predicted_relevance'], query3: df_q3_scores['predicted_relevance'], query4: df_q4_scores['predicted_relevance'], query5: df_q5_scores['predicted_relevance']}
k = 10

print_results_at_k(queries_gt, queries_pred_scores, k)

--------------------------------------------------
Results for query_id: women full sleeve sweatshirt cotton

Precision@10: 1

Recall@10: 0.5

Average precision@10: 1

F1-Score@10: 0.667

NDCG@10: 1
--------------------------------------------------
Results for query_id: men slim jeans blue

Precision@10: 1

Recall@10: 0.5

Average precision@10: 1

F1-Score@10: 0.667

NDCG@10: 1
--------------------------------------------------
Results for query_id: animal print tshirt

Precision@10: 1

Recall@10: 1

Average precision@10: 1

F1-Score@10: 1

NDCG@10: 1
--------------------------------------------------
Results for query_id: round neck black dress

Precision@10: 0

Recall@10: 0

Average precision@10: 0

F1-Score@10: 0

NDCG@10: 0
--------------------------------------------------
Results for query_id: solid shirt pack of 3

Precision@10: 0

Recall@10: 0

Average precision@10: 0

F1-Score@10: 0

NDCG@10: 0
--------------------------------------------------
MAP@10: 0.6

MRR@10: 0.6

