# importing necessary libraries

In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string
from math import log

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Fetch the NLTK resources referenced in this notebook: the stop-word corpus
# and the "punkt" tokenizer data.
nltk.download("stopwords")
nltk.download('punkt')

# English stop words plus every single lowercase ASCII letter.
stop_words = set(stopwords.words("english")) | set(string.ascii_lowercase)

# One shared Porter stemmer instance, used by the text preprocessing step.
stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


loading dataset

In [None]:
# Directory that holds the tab-separated WANDS files. Kept in a single constant
# so the notebook can be pointed at another location by editing one line.
DATA_DIR = "/content/drive/MyDrive/WANDS-main/dataset"

# All three files are tab-separated, hence sep="\t".
product_df = pd.read_csv(DATA_DIR + "/product.csv", sep="\t")
query_df = pd.read_csv(DATA_DIR + "/query.csv", sep="\t")
label_df = pd.read_csv(DATA_DIR + "/label.csv", sep="\t")

In [None]:
# Work on copies so the frames read from disk stay untouched by later cells.
df, query = product_df.copy(), query_df.copy()

In [None]:
# Keep only the columns used for retrieval. The explicit .copy() makes this an
# independent frame: later cells drop rows in place and assign new columns on
# it, which pandas may flag with SettingWithCopyWarning on a plain selection.
df = df[["product_id", "product_description", "product_class"]].copy()

In [None]:
df.head()

Unnamed: 0,product_id,product_description,product_class
0,0,"good , deep sleep can be quite difficult to ha...",Beds
1,1,"create delicious slow-cooked meals , from tend...",Slow Cookers
2,2,prepare home-cooked meals on any schedule with...,Slow Cookers
3,3,this original stainless tool was designed to c...,"Slicers, Peelers And Graters"
4,4,the hardware has a rich heritage of delivering...,Door Knobs


In [None]:
query.head()

Unnamed: 0,query_id,query,query_class
0,0,salon chair,Massage Chairs
1,1,smart coffee table,Coffee & Cocktail Tables
2,2,dinosaur,Kids Wall Décor
3,3,turquoise pillows,Accent Pillows
4,4,chair and a half recliner,Recliners


# removing null values

In [None]:
# for products

# df[df["product_description"].isna()]


# for query
# query[query["query_class"].isna()]

In [None]:
# for products
df.dropna(subset=['product_description'], inplace=True)

# for query
query.dropna(subset=['query_class'], inplace=True)

In [None]:
def preprocess_text(data_frame, col_name):
    """Tokenize, filter and stem the text column ``col_name`` of ``data_frame``.

    Every cell of the column is split with ``nltk.word_tokenize``; tokens that
    are not purely alphabetic (``str.isalpha``) are discarded and the remaining
    ones are stemmed with the module-level ``stemmer``. Stop-word removal is
    optional and is not applied here.

    The column is overwritten on the frame that was passed in (no copy is
    made), and that same frame object is returned.
    """

    def _stems_of(text):
        # Tokenize -> keep alphabetic tokens -> stem, in a single pass per cell.
        stems = []
        for token in nltk.word_tokenize(text):
            if token.isalpha():
                stems.append(stemmer.stem(token))
        return stems

    data_frame[col_name] = data_frame[col_name].apply(_stems_of)
    return data_frame

In [None]:
# preprocess_text overwrites the column on the frame it receives and returns
# that same frame, so `tokenized_df` and `df` refer to one object here
# (likewise `tokenized_query` and `query`).
tokenized_df = preprocess_text(df, 'product_description')

tokenized_query = preprocess_text(query, 'query')

In [None]:
# Number of tokens per product description / per query, computed column-wise
# instead of looping over rows with iterrows() and writing cell by cell.
# The cast keeps the float dtype that the cell-by-cell assignment produced.
tokenized_df["product_description_length"] = df["product_description"].map(len).astype(float)

tokenized_query["query_length"] = query["query"].map(len).astype(float)

In [None]:
tokenized_df

Unnamed: 0,product_id,product_description,product_class,product_description_length
0,0,"[good, deep, sleep, can, be, quit, difficult, ...",Beds,173.0
1,1,"[creat, delici, meal, from, tender, meat, to, ...",Slow Cookers,123.0
2,2,"[prepar, meal, on, ani, schedul, with, thi, es...",Slow Cookers,15.0
3,3,"[thi, origin, stainless, tool, wa, design, to,...","Slicers, Peelers And Graters",28.0
4,4,"[the, hardwar, ha, a, rich, heritag, of, deliv...",Door Knobs,61.0
...,...,...,...,...
42988,42988,"[thi, complet, shower, system, offer, a, sooth...",Shower Panels,41.0
42989,42989,"[the, malibu, pressur, balanc, divert, fix, sh...",Shower Panels,55.0
42991,42991,"[thi, pub, tabl, set, includ, counter, height,...",Dining Table Sets,111.0
42992,42992,"[bring, icon, modern, style, to, your, space, ...",Teen Lounge Furniture|Accent Chairs,60.0


In [None]:
tokenized_query

Unnamed: 0,query_id,query,query_class,query_length
0,0,"[salon, chair]",Massage Chairs,2.0
1,1,"[smart, coffe, tabl]",Coffee & Cocktail Tables,3.0
2,2,[dinosaur],Kids Wall Décor,1.0
3,3,"[turquois, pillow]",Accent Pillows,2.0
4,4,"[chair, and, a, half, reclin]",Recliners,5.0
...,...,...,...,...
475,483,"[rustic, twig]",Faux Plants and Trees,2.0
476,484,"[nespresso, vertuo, next, premium, by, brevil,...",Espresso Machines,8.0
477,485,"[pedistol, sink]",Kitchen Sinks,2.0
478,486,"[in, bench, cushion]",Furniture Cushions,3.0


In [None]:
# Remove, in place, any product row whose token count was never filled in.
tokenized_df.dropna(subset=["product_description_length"], inplace=True)

# Same clean-up for the queries.
tokenized_query.dropna(subset=['query_length'], inplace=True)

# inverted index

## setting product_id as dataset's index

In [None]:
# Separate copy of the tokenized products; this copy is the one that gets
# re-indexed by product_id and handed to the index builder and the scorer.
copy_dataset = tokenized_df.copy()

# Preview of the original frame, which keeps product_id as a regular column.
tokenized_df.head()

Unnamed: 0,product_id,product_description,product_class,product_description_length
0,0,"[good, deep, sleep, can, be, quit, difficult, ...",Beds,173.0
1,1,"[creat, delici, meal, from, tender, meat, to, ...",Slow Cookers,123.0
2,2,"[prepar, meal, on, ani, schedul, with, thi, es...",Slow Cookers,15.0
3,3,"[thi, origin, stainless, tool, wa, design, to,...","Slicers, Peelers And Graters",28.0
4,4,"[the, hardwar, ha, a, rich, heritag, of, deliv...",Door Knobs,61.0


In [None]:
# Promote product_id from a column to the row index, in place.
# NOTE: this cell is not re-runnable on its own - after the first run the
# 'product_id' column no longer exists, so a second run raises KeyError.
copy_dataset.set_index('product_id', inplace=True)

copy_dataset.head()

Unnamed: 0_level_0,product_description,product_class,product_description_length
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,"[good, deep, sleep, can, be, quit, difficult, ...",Beds,173.0
1,"[creat, delici, meal, from, tender, meat, to, ...",Slow Cookers,123.0
2,"[prepar, meal, on, ani, schedul, with, thi, es...",Slow Cookers,15.0
3,"[thi, origin, stainless, tool, wa, design, to,...","Slicers, Peelers And Graters",28.0
4,"[the, hardwar, ha, a, rich, heritag, of, deliv...",Door Knobs,61.0


In [None]:
def create_inverted_index(dataframe, field_name):
    """Build an inverted index over the token lists in ``dataframe[field_name]``.

    Parameters
    ----------
    dataframe : mapping/DataFrame
        Object for which ``dataframe[field_name]`` yields one iterable of
        terms per record.
    field_name : str
        Name of the column holding the token lists.

    Returns
    -------
    dict
        ``{term: {'df': int, 'tf': {record_id: int}}}`` where

        * ``'tf'`` maps each record containing the term to the number of times
          the term occurs in that record;
        * ``'df'`` is the document frequency, i.e. the number of distinct
          records containing the term (always ``len(entry['tf'])``).

    Notes
    -----
    ``record_id`` is the 0-based position of the record in the column (it
    comes from ``enumerate``), not the frame's index label.
    """
    lookup_table = {}

    for record_id, content_list in enumerate(dataframe[field_name]):
        for term in content_list:
            entry = lookup_table.get(term)
            if entry is None:
                # First time this term is seen anywhere in the collection.
                entry = lookup_table[term] = {'df': 0, 'tf': {}}

            postings = entry['tf']
            if record_id in postings:
                # Repeated occurrence inside the same record: only tf grows.
                postings[record_id] += 1
            else:
                # First occurrence in this record: count the record once in df.
                postings[record_id] = 1
                entry['df'] += 1

    return lookup_table

In [None]:
# Index the stemmed product descriptions. Record ids inside the index are row
# positions within copy_dataset, not product_id labels.
inverted_index = create_inverted_index(copy_dataset, "product_description")

In [None]:
tokenized_query.head()

Unnamed: 0,query_id,query,query_class,query_length
0,0,"[salon, chair]",Massage Chairs,2.0
1,1,"[smart, coffe, tabl]",Coffee & Cocktail Tables,3.0
2,2,[dinosaur],Kids Wall Décor,1.0
3,3,"[turquois, pillow]",Accent Pillows,2.0
4,4,"[chair, and, a, half, reclin]",Recliners,5.0


# cosine similarity

In [None]:
def cosine(products, index_map, queries):
    """Rank products for every query with a length-normalised tf-idf score.

    For each query term ``t`` and each document ``d`` containing it, the
    contribution added to ``d``'s score is::

        (1 + log10(tf_query(t))) * log10(N / df(t)) * (1 + log10(tf_doc(t, d)))

    where ``N`` is the number of rows in ``products`` and ``df(t)`` is the
    number of documents that contain ``t`` (the size of the term's posting
    dict). The summed score is divided by the document's token count taken
    from ``products['product_description_length']`` (a length normalisation,
    not a true vector-norm cosine).

    Parameters
    ----------
    products : pandas.DataFrame
        Must provide a ``product_description_length`` column; rows are
        addressed by position.
    index_map : dict
        ``{term: {'tf': {row_position: term_count}, ...}}`` as produced by
        ``create_inverted_index``.
    queries : pandas.DataFrame
        Must provide a ``query`` column whose cells are lists of terms.

    Returns
    -------
    dict
        ``{query_position: [(row_position, score), ...]}`` with at most 25
        entries per query, best first, scores rounded to 2 decimals. Both
        keys and item ids are 0-based positions, not index labels.
    """
    query_results = {}
    total_documents = products.shape[0]
    # Positional array of token counts, fetched once instead of one
    # DataFrame row lookup per candidate document.
    description_lengths = products['product_description_length'].to_numpy()

    for query_index, query_content in enumerate(queries["query"]):
        relevance_scores = {}

        for term in set(query_content):
            if term not in index_map:
                continue
            postings = index_map[term]['tf']
            if not postings:
                # Defensive: a term without postings cannot score any document.
                continue

            # Document frequency = number of documents holding the term.
            # Derived from the postings so it does not depend on how the
            # index's own 'df' counter was maintained.
            document_frequency = len(postings)
            inverse_document_frequency = log(total_documents / document_frequency, 10)
            query_weight = (1 + log(query_content.count(term), 10)) * inverse_document_frequency

            for item_id, doc_term_frequency in postings.items():
                # More occurrences in the document raise (not lower) the score.
                added_score = query_weight * (1 + log(doc_term_frequency, 10))
                relevance_scores[item_id] = relevance_scores.get(item_id, 0.0) + added_score

        # Rank on full-precision scores; rounding happens only on the way out
        # so near-equal scores are not collapsed into arbitrary ties.
        ranked = sorted(
            (
                (item_id, score / description_lengths[item_id])
                for item_id, score in relevance_scores.items()
            ),
            key=lambda pair: pair[1],
            reverse=True,
        )
        query_results[query_index] = [
            (item_id, round(float(score), 2)) for item_id, score in ranked[:25]
        ]

    return query_results

In [None]:
# Top-25 ranked results for every query, keyed by the query's row position.
# The ids inside each result are row positions in copy_dataset.
Scores = cosine(copy_dataset, inverted_index, tokenized_query)

In [None]:
Scores[0]

[(26659, 1.14),
 (27591, 1.14),
 (21597, 0.91),
 (29484, 0.91),
 (29920, 0.91),
 (16492, 0.76),
 (2941, 0.65),
 (5309, 0.65),
 (21421, 0.65),
 (21422, 0.65),
 (21667, 0.65),
 (23658, 0.65),
 (17827, 0.57),
 (36391, 0.53),
 (29151, 0.51),
 (21855, 0.49),
 (10692, 0.46),
 (15863, 0.46),
 (15864, 0.46),
 (23132, 0.46),
 (29150, 0.46),
 (34757, 0.46),
 (4818, 0.42),
 (9721, 0.42),
 (26729, 0.42)]

In [None]:
print(Scores)

[(26659, 318.57), (28440, 273.06), (7754, 227.557), (16492, 221.482), (14509, 208.587), (14510, 208.587), (36960, 202.273), (27591, 201.007), (5309, 197.21), (33146, 193.417), (30550, 192.157), (21597, 191.142), (32536, 189.625), (33436, 182.04), (849, 178.247), (31219, 176.983), (31844, 174.455), (5254, 163.836), (15100, 156.034), (23132, 155.071), (10692, 151.7), (33041, 149.804), (29920, 148.666), (21667, 146.643), (7741, 146.643)]


In [None]:
print(Scores)

[(14616, 3861.430000000012), (2508, 3737.640000000012), (6309, 3728.8500000000113), (14615, 3724.900000000012), (24664, 3528.85000000001), (18881, 3467.4700000000116), (32332, 3446.5200000000095), (11336, 3421.37000000001), (5544, 3387.4000000000065), (19507, 3370.1200000000104), (6300, 3316.92000000001), (31012, 3315.1800000000094), (2362, 3309.1900000000096), (30101, 3283.2400000000066), (6273, 3265.91000000001), (4011, 3264.3700000000085), (3930, 3255.4500000000103), (35617, 3252.5700000000093), (19221, 3251.3300000000095), (460, 3244.1700000000064), (24835, 3226.1000000000095), (19679, 3198.4600000000096), (14236, 3180.9500000000085), (20276, 3179.760000000009), (30443, 3171.680000000007)]
