# IRWA PROJECT PART 2
Laia Tomàs Jané u198723\
Quim Ribas Martinez u198742 

In [None]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import numpy as np
import collections 
from collections import defaultdict
from numpy import linalg as la
import math

In [204]:
# Load the preprocessed product dataset produced in part 1.
# The indexing functions in this notebook read its 'pid', 'title', 'description',
# 'title_clean' and 'description_clean' columns.
products_df = pd.read_json("fashion_products_dataset_processed.json")

In [205]:
#copy function from part1 for cleaning queries
def build_terms(text):
    """
    Turn a raw text (e.g. a user query) into a list of stemmed terms.

    Argument:
    text -- the text to preprocess; anything that is not a string yields []

    Returns:
    list of stemmed tokens, in their original order, after lower-casing,
    dropping every character that is not a-z or whitespace, and discarding
    English stop words and tokens shorter than 3 characters.
    """
    # non-string input (None, NaN, numbers...) has no terms
    if not isinstance(text, str):
        return []

    # lower-case first, then delete (not replace by a space) everything that is
    # not a letter or whitespace, so "T-Shirt" becomes the single token "tshirt"
    lowered = text.lower()
    letters_only = re.sub(r'[^a-z\s]', '', lowered)

    english_stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    stemmed = []
    for token in letters_only.split():
        # skip stop words and tokens with fewer than 3 characters
        if len(token) <= 2 or token in english_stop_words:
            continue
        stemmed.append(stemmer.stem(token))

    return stemmed

### PART 1: INDEXING

#### Build inverted index. Question: should it be the full index as in the lab practice, or the example one from the assignment statement?

In [None]:
#index with just list of pids that contain the term
def create_index2(df):
    """
    Build a simple inverted index that maps each term to the products containing it.

    Argument:
    df -- DataFrame with one product per row and the columns 'pid', 'title',
          'title_clean' and 'description_clean' (the last two being lists of terms)

    Returns:
    index -- defaultdict(list): term -> list of pids containing the term, each pid
             listed once, in order of first appearance
    title_index -- dict: pid -> original (unprocessed) title, used only for display
    """
    index = defaultdict(list)
    title_index = {}  # pid -> raw title
    # Mirror of `index` holding sets, so the duplicate check is O(1) instead of a
    # linear scan of the (potentially very long) posting list.
    seen = defaultdict(set)

    for _, line in df.iterrows():
        pid = line['pid']

        terms = line['title_clean'] + line['description_clean']

        # the raw title is stored as is: it is only printed, never indexed
        title_index[pid] = line['title']

        for term in terms:
            if pid not in seen[term]:  # avoid duplicates
                seen[term].add(pid)
                index[term].append(pid)

    return index, title_index

In [210]:
# Build the simple index (term -> list of pids) and the pid -> title map.
index2, title_index2 = create_index2(products_df)

In [266]:
#index with term as key and dict of pids as key and frequency of the term in the product as values. later used this one
def create_index(df):
    """
    Build an inverted index that stores, per term, how often it occurs in each product.

    Argument:
    df -- DataFrame with one product per row and the columns 'pid', 'title',
          'description', 'title_clean' and 'description_clean' (the last two
          being lists of terms)

    Returns:
    index -- nested defaultdict: term -> {pid: number of occurrences of the term
             in the product's cleaned title + description}
    title_index -- dict: pid -> original (unprocessed) title, used only for display
    desc_index -- dict: pid -> original (unprocessed) description, used only for display
    """
    index = defaultdict(lambda: defaultdict(int))
    title_index = {}  # pid -> raw title
    desc_index = {}  # pid -> raw description

    for _, line in df.iterrows():
        pid = line['pid']

        terms = line['title_clean'] + line['description_clean']

        # raw title/description are stored as is: they are only printed, never indexed
        title_index[pid] = line['title']
        desc_index[pid] = line['description']

        for term in terms:
            # the inner defaultdict starts every counter at 0, so no
            # "first entry" special case (or try/except) is needed
            index[term][pid] += 1

    return index, title_index, desc_index


In [267]:
# Build the frequency index (term -> {pid: count}) plus the pid -> title and
# pid -> description maps used when printing results.
index, title_index, desc_index = create_index(products_df)

In [268]:
#first function to try to search queries, without any ranking
def search(query, index):
    """
    Return the products that contain ALL of the query terms (no ranking).

    The posting set of every query term is retrieved and the sets are intersected.

    Arguments:
    query -- raw query string; it is preprocessed with build_terms so that
             stemmed terms match the ones stored in the index
    index -- inverted index mapping each term to an iterable of pids
             (a list of pids or a {pid: frequency} dict)

    Returns:
    list of matching pids (in arbitrary order); [] when the query has no usable
    terms or when some term does not occur in any product.
    """
    query = build_terms(query)  # so that stemmed terms are matched in the index
    products = None  # None until the first term has been processed

    for term in query:
        # Explicit membership test instead of index[term]: on a defaultdict the
        # latter would silently insert an empty posting for every unknown term.
        if term not in index:
            # a term that is in no product means no product contains *all* terms
            return []

        term_products = set(index[term])  # products containing this term

        if products is None:
            products = term_products
        else:
            products &= term_products  # keep only products that also have this term

        if not products:
            # the intersection is already empty, later terms cannot change that
            return []

    return list(products) if products else []


#### test queries without ranking

In [269]:
# Unranked conjunctive search: number of products containing every query term.
query = "solid blue"
len(search(query, index))

1749

In [280]:
# Run the unranked search and print the first `top` matches with their raw
# title and description.
query = "solid blue"
products = search(query, index)
top = 5  # number of results to display

print("======================\nSample of {} results out of {} for the searched query ({}):\n".format(len(products[:top]), len(products), query))
for p_id in products[:top]:
    print("product id= {} \n- product title: {} \n- product description: {}\n".format(p_id, title_index[p_id], desc_index[p_id]))

Sample of 5 results out of 1749 for the searched query (solid blue):

product id= JCKFKKFZMDEHEZQS 
- product title: Full Sleeve Washed Men Denim Jacket 
- product description: Dark Blue solid denim jacket, has a spread collar, four pockets, a full button placket, long sleeves. Pair it with denim pants and leather boots to complete your rugged look.

product id= TSHFWDYSGZTZNZY2 
- product title: Solid Women Round Neck Blue T-Shirt 
- product description: ECKO Unltd Slim Fit Cotton REGULAR NAVY BLUE T-Shirt

product id= TSHFHBNTG3FJEZVY 
- product title: Solid Men Polo Neck Dark Blue, Maroon, Pink T-Shirt  (Pack of 3) 
- product description: EXPERIENCE THE AMAZING COMFORT OF A KEOTI POLO T-SHIRTS We know how it is as a busy student, father, or office worker. Some days getting dressed simply takes up too much time and you need something to throw on that looks simple and presentable. You don't want anything too fancy, just a cute and non-descript top that you can wear while you're on-the

#### ranking

In [248]:
def create_index_tfidf(products_df, num_products):
    """
    Build the inverted index together with the tf, df and idf statistics.

    Arguments:
    products_df -- DataFrame with one product per row and the columns 'pid',
                   'title', 'description', 'title_clean' and 'description_clean'
                   (the last two being lists of terms)
    num_products -- total number of products, used as numerator of the idf

    Returns:
    index -- term -> {pid: raw number of occurrences of the term in the product}
    tf -- term -> list of normalized term frequencies (raw frequency divided by
          the Euclidean norm of the product's frequency vector, rounded to 4
          decimals); one entry is appended per processed row containing the term
    df -- term -> number of processed rows containing the term
    idf -- term -> log(num_products / df[term]), rounded to 4 decimals
    title_index -- pid -> original title (for display only)
    desc_index -- pid -> original description (for display only)
    """
    index = defaultdict(lambda: defaultdict(int))
    tf = defaultdict(list)
    df = defaultdict(int)
    idf = defaultdict(float)
    title_index = defaultdict(str)
    desc_index = defaultdict(str)

    for _, row in products_df.iterrows():
        pid = row['pid']
        # frequency of every distinct term of this product, in order of first occurrence
        term_counts = collections.Counter(row['title_clean'] + row['description_clean'])

        # raw title/description are stored as is: they are only printed, never indexed
        title_index[pid] = row['title']
        desc_index[pid] = row['description']

        # Euclidean norm of the product's frequency vector; it is the same for
        # all the terms of the product
        doc_norm = math.sqrt(sum(count ** 2 for count in term_counts.values()))

        for term, count in term_counts.items():
            index[term][pid] += count
            tf[term].append(np.round(count / doc_norm, 4))
            df[term] += 1  # one more product contains this term

    # idf can only be computed once every df is final
    for term, doc_freq in df.items():
        idf[term] = np.round(np.log(float(num_products / doc_freq)), 4)

    return index, tf, df, idf, title_index, desc_index


In [249]:
# Build the tf-idf structures over the whole dataset.
# NOTE: this rebinds the global `index`, `title_index` and `desc_index` assigned
# earlier from create_index; search_tf_idf reads the globals `idf`, `tf` and
# `title_index` set here.
num_products = len(products_df)
index, tf, df, idf, title_index, desc_index = create_index_tfidf(products_df, num_products)

In [None]:
#Represent the query as a weighted tf-idf vector
#Represent each product as a weighted tfidf vector
#Compute the cosine similarity score for the
#query vector and each product vector
#Rank product with respect to the query by score
#Return the top K (e.g., K = 10) to the user
def rank_products(terms, pids, index, idf, tf, title_index):
    """
    Rank the products matching a query by the dot product of tf-idf vectors.

    Arguments:
    terms -- list of (preprocessed) query terms
    pids -- collection of product ids to rank; products outside it are ignored
    index -- inverted index: term -> {pid: frequency}
    idf -- term -> inverse document frequency
    tf -- term -> list of normalized term frequencies; tf[term][i] must belong to
          the i-th pid of index[term]
    title_index -- pid -> title (not used for scoring; kept for interface
                   compatibility)

    Returns:
    result_products -- list of pids sorted by decreasing score
    products_scores -- list of [score, pid] pairs in the same order

    Prints "No results found, try again" when nothing could be ranked.
    """
    # Set membership is O(1); testing against the original list inside the
    # postings loop made the ranking quadratic for popular terms.
    candidate_pids = set(pids)

    # Only the vector components of the query terms are needed: every other
    # component would be multiplied by 0 in the dot product.
    # Accessing a missing pid automatically creates its all-zero vector.
    products_vectors = defaultdict(lambda: [0] * len(terms))
    query_vector = [0] * len(terms)

    # frequency of each term in the query and norm of that frequency vector
    query_terms_count = collections.Counter(terms)
    query_norm = la.norm(list(query_terms_count.values()))

    for termIndex, term in enumerate(terms):  # termIndex: position of the term in the query
        if term not in index:
            continue

        # tf-idf weight of the term in the query (tf normalized as for the products)
        query_vector[termIndex] = query_terms_count[term] / query_norm * idf[term]

        # tf[term] is positional: its i-th value belongs to the i-th pid of index[term]
        for pid_index, pid in enumerate(index[term]):
            if pid in candidate_pids:
                products_vectors[pid][termIndex] = tf[term][pid_index] * idf[term]

    # score of each product = dot product between its vector and the query vector
    products_scores = [[np.dot(curProdVec, query_vector), product] for product, curProdVec in products_vectors.items()]
    products_scores.sort(reverse=True)  # best score first; ties ordered by pid, descending
    result_products = [x[1] for x in products_scores]

    if not result_products:
        print("No results found, try again")

    return result_products, products_scores

In [279]:
def search_tf_idf(query, index):
    """
    Return the products that contain ALL of the query terms, ranked by tf-idf.

    The posting set of every query term is retrieved, the sets are intersected
    and the surviving products are ranked with rank_products.

    Arguments:
    query -- raw query string; it is preprocessed with build_terms so that
             stemmed terms match the ones stored in the index
    index -- inverted index: term -> {pid: frequency}

    Returns:
    ranked_products -- list of pids sorted by decreasing score
    product_scores -- list of [score, pid] pairs in the same order
    Both lists are empty when the query has no usable terms or when some term
    does not occur in any product.

    NOTE: reads the module-level globals `idf`, `tf` and `title_index`.
    """
    query = build_terms(query)  # so that stemmed terms are matched in the index
    products = None  # None until the first term has been processed

    for term in query:
        # Explicit membership test instead of index[term]: on a defaultdict the
        # latter would silently insert an empty posting for every unknown term.
        if term not in index:
            # a term that is in no product means no product contains ALL terms
            products = set()
            break

        term_products = set(index[term])  # products containing this term

        if products is None:
            products = term_products
        else:
            products &= term_products  # keep only products that also have this term

        if not products:
            break  # the intersection is already empty

    # `products` is still None when the query had no usable terms; always go
    # through rank_products so that callers get a (list, list) pair back.
    matching = list(products) if products else []
    ranked_products, product_scores = rank_products(query, matching, index, idf, tf, title_index)
    return ranked_products, product_scores

#### test queries with ranking

In [278]:
# Ranked search for a hard-coded query.
# NOTE: the first print only shows the prompt text; no user input is read.
query = "solid blue"
print(f"Insert your query (i.e.: {query}):\n")

print(f"Results for query: {query}\n")

# pids sorted by decreasing tf-idf score, plus the [score, pid] pairs
ranked_products, scores = search_tf_idf(query, index)
top = 10  # number of results to display

print("======================\nTop {} results out of {} for the searched query:\n".format(len(ranked_products[:top]), len(ranked_products)))
for p_id in ranked_products[:top]:
    print("product_id= {} \n- product_title: {} \n- product_description: {}\n".format(p_id, title_index[p_id], desc_index[p_id]))

Insert your query (i.e.: solid blue):

Results for query: solid blue

Top 10 results out of 1749 for the searched query:

product_id= TSHFZWRUTZK8HH3M 
- product_title: Solid Women Round Neck Maroon, Blue, Light Blue, Dark Blue, Black T-Shirt  (Pack of 5) 
- product_description: 

product_id= TSHFZWRUD9X6DG3M 
- product_title: Solid Women Round Neck Maroon, Blue, Light Blue, Dark Blue, White T-Shirt  (Pack of 5) 
- product_description: 

product_id= TSHFZWRU8FRFSVAJ 
- product_title: Solid Women Round Neck Maroon, Blue, Light Blue, Dark Blue, Grey T-Shirt  (Pack of 5) 
- product_description: 

product_id= TSHFZWRTZFZ4KE6K 
- product_title: Solid Men Round Neck Maroon, Blue, Light Blue, Dark Blue, Grey T-Shirt  (Pack of 5) 
- product_description: 

product_id= TSHFZWRT6BWSBQAQ 
- product_title: Solid Women Round Neck Maroon, Blue, Light Blue, Dark Blue, White T-Shirt  (Pack of 5) 
- product_description: 

product_id= TSHFZWRT2KTZJ7BK 
- product_title: Solid Women Round Neck Maroon, Blue

In [277]:
# Ranked search for a second hard-coded query.
query2 = "dark blue tshirt"
print(f"Results for query: {query2}\n")

# pids sorted by decreasing tf-idf score, plus the [score, pid] pairs
ranked_products, scores = search_tf_idf(query2, index)
top = 10  # number of results to display

print("======================\nTop {} results out of {} for the searched query:\n".format(len(ranked_products[:top]), len(ranked_products)))
for p_id in ranked_products[:top]:
    print("product_id= {} \n- product_title: {} \n- product_description: {}\n".format(p_id, title_index[p_id], desc_index[p_id]))

Results for query: dark blue tshirt

Top 10 results out of 885 for the searched query:

product_id= TSHFYUQKZVE6TWET 
- product_title: Color Block Women Round Neck Dark Blue, Dark Green T-Shirt 
- product_description: 

product_id= TSHFHB3MMVYKKPCY 
- product_title: Color Block Men Polo Neck Dark Blue, Dark Green T-Shirt 
- product_description: 

product_id= TSHFYURFAMGGKSPC 
- product_title: Color Block Women Round Neck Dark Green, White, Dark Blue T-Shirt 
- product_description: 

product_id= TSHFKNY6G5UMPYFE 
- product_title: Printed Women s Dark Blue T-Shirt 
- product_description: 

product_id= TSHFTN3VZ37H2HKP 
- product_title: Solid Women V Neck Dark Blue T-Shirt 
- product_description: 

product_id= TSHFTN3DC3VGHDCA 
- product_title: Solid Women V Neck Dark Blue T-Shirt 
- product_description: 

product_id= TSHFT59QDHQDRT6N 
- product_title: Solid Men V Neck Dark Blue T-Shirt 
- product_description: 

product_id= TSHFT59QCRYDTKUN 
- product_title: Solid Women V Neck Dark Blue T