# 0. Data Preparation

In [3]:
import pandas as pd
import numpy as np
import json
from collections import defaultdict

# Load the preprocessed dataset produced in Part 1.
# NOTE(review): the path below is absolute and specific to one machine, so this
# cell fails anywhere else -- consider a path relative to a configurable data dir.
# on_bad_lines='skip' drops rows that fail to parse without raising, so the
# loaded row count can be lower than the number of lines in the file.

df = pd.read_csv(
    '/Users/julialopezpinot/Desktop/irwa_search_engine_G_011/data/processed_dataset.csv',
    engine='python',
    on_bad_lines='skip'
)


In [11]:
import ast

def safe_literal_eval(x):
    """
    Parse the string representation of a list/dict back into a Python object.

    Parameters
    ----------
    x : Any
        A cell value read from the CSV: normally a string such as
        "['solid', 'women']", but possibly an already-parsed list or a
        missing value (e.g. NaN).

    Returns
    -------
    list | dict | tuple | set
        The parsed container when `x` is a string holding a container literal,
        `x` itself when it is already a list, and an empty list in every other
        case (missing value, unparsable string, or a scalar literal).
    """
    if isinstance(x, list):
        return x
    if not isinstance(x, str):
        # NaN / None / numbers: treat as "no tokens".
        return []

    try:
        parsed = ast.literal_eval(x)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # literal_eval can raise any of these on malformed input (for example
        # "{[1]: 2}" raises TypeError); one bad cell must not abort the whole
        # column conversion.
        return []

    # A scalar literal such as "123" parses successfully but is not a token
    # container; return [] so the result matches the non-string fallback.
    if isinstance(parsed, (list, dict, tuple, set)):
        return parsed
    return []
    

# The CSV stores the list/dict columns as their string representation, so
# parse each of them back into real Python objects.
for list_col in ('processed_text', 'attributes'):
    df[list_col] = df[list_col].apply(safe_literal_eval)

In [12]:
# Preview the first five rows after converting the list columns.
df.head()

Unnamed: 0,pid,url,processed_text,title,description,brand_facet,category_facet,subcategory_facet,seller_facet,discount,selling_price,actual_price,average_rating,attributes
0,TKPFCZ9EA7H5FYZH,https://www.flipkart.com/yorker-solid-men-mult...,"[solid, women, multicolor, track, pant, yorker...",Solid Women Multicolor Track Pants,Yorker trackpants made from 100% rich combed c...,york,clothing_and_accessories,bottomwear,shyam_enterprises,69.0,921.0,2999.0,3.9,"[elast, side, pocket, cotton, blend, solid, mu..."
1,TKPFCZ9EJZV2UVRZ,https://www.flipkart.com/yorker-solid-men-blue...,"[solid, men, blue, track, pant, yorker, trackp...",Solid Men Blue Track Pants,Yorker trackpants made from 100% rich combed c...,york,clothing_and_accessories,bottomwear,shyam_enterprises,66.0,499.0,1499.0,3.9,"[drawstr, elast, side, pocket, cotton, blend, ..."
2,TKPFCZ9EHFCY5Z4Y,https://www.flipkart.com/yorker-solid-men-mult...,"[solid, men, multicolor, track, pant, yorker, ...",Solid Men Multicolor Track Pants,Yorker trackpants made from 100% rich combed c...,york,clothing_and_accessories,bottomwear,shyam_enterprises,68.0,931.0,2999.0,3.9,"[elast, side, pocket, cotton, blend, solid, mu..."
3,TKPFCZ9ESZZ7YWEF,https://www.flipkart.com/yorker-solid-men-mult...,"[solid, women, multicolor, track, pant, yorker...",Solid Women Multicolor Track Pants,Yorker trackpants made from 100% rich combed c...,york,clothing_and_accessories,bottomwear,shyam_enterprises,69.0,911.0,2999.0,3.9,"[elast, side, pocket, cotton, blend, solid, mu..."
4,TKPFCZ9EVXKBSUD7,https://www.flipkart.com/yorker-solid-men-brow...,"[solid, women, brown, gray, track, pant, yorke...","Solid Women Brown, Grey Track Pants",Yorker trackpants made from 100% rich combed c...,york,clothing_and_accessories,bottomwear,shyam_enterprises,68.0,943.0,2999.0,3.9,"[drawstr, elast, side, pocket, cotton, blend, ..."


In [7]:
# Summarise column dtypes and non-null counts to see which fields have gaps.
# NOTE(review): this cell's execution count (7) is lower than that of the cells
# above it (11, 12), so its saved output may predate them -- re-run on a fresh kernel.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28081 entries, 0 to 28080
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   pid                28081 non-null  object 
 1   url                28081 non-null  object 
 2   processed_text     28081 non-null  object 
 3   title              28081 non-null  object 
 4   description        16927 non-null  object 
 5   brand_facet        26071 non-null  object 
 6   category_facet     28080 non-null  object 
 7   subcategory_facet  28080 non-null  object 
 8   seller_facet       26437 non-null  object 
 9   discount           27225 non-null  object 
 10  selling_price      28077 non-null  float64
 11  actual_price       27302 non-null  float64
 12  average_rating     25818 non-null  float64
 13  attributes         28079 non-null  object 
dtypes: float64(3), object(11)
memory usage: 3.0+ MB


First, we analyze the missing values in each column and decide how to handle them:
- textual/categorical fields --> we fill them with `unknown`, which keeps them searchable and consistent
- numerical fields --> we fill them with 0 (to be interpreted as no discount or no average rating available)

In [13]:
# Fill missing textual/categorical values with a searchable placeholder
text_cols = ['title', 'description', 'brand_facet',
             'category_facet', 'subcategory_facet', 'seller_facet']
df[text_cols] = df[text_cols].fillna('unknown')

# Coerce the numerical fields to real numbers, then fill the gaps with 0.
# 'discount' is loaded with object dtype, so a plain fillna(0) would leave
# strings mixed with numbers; errors='coerce' turns unparsable entries into
# NaN, which are then filled with 0 like any other missing value.
num_cols = ['discount', 'selling_price', 'actual_price', 'average_rating']
df[num_cols] = df[num_cols].apply(pd.to_numeric, errors='coerce').fillna(0)

# Ensure 'attributes' column has lists, even when missing
df['attributes'] = df['attributes'].apply(lambda x: x if isinstance(x, list) else [])

# Verify that no missing values remain in any column
print("Missing values after cleaning:")
print(df.isna().sum())


Missing values after cleaning:
pid                  0
url                  0
processed_text       0
title                0
description          0
brand_facet          0
category_facet       0
subcategory_facet    0
seller_facet         0
discount             0
selling_price        0
actual_price         0
average_rating       0
attributes           0
dtype: int64


# 1. Inverted Index

In [14]:
# The inverted index needs a single token list per product, so concatenate the
# processed_text and attributes lists row by row (element-wise list `+`).
df['tokens'] = df['processed_text'] + df['attributes']

Now we build the inverted index:
- An inverted index maps each term (token) to the documents (products) that contain it.

In [15]:
def build_inverted_index(df):
    """
    Build an inverted index mapping each token to the document IDs (pids)
    that contain it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a 'pid' column and a 'tokens' column whose cells are
        iterables of tokens.

    Returns
    -------
    dict
        {token: sorted list of the distinct pids containing that token}.
        Keys appear in order of first occurrence in `df`.
    """
    postings = defaultdict(set)

    # Walk the two columns directly: iterrows() builds a Series for every row,
    # which is slow and may upcast values; zip yields the raw cell values.
    for pid, tokens in zip(df['pid'], df['tokens']):
        for token in tokens:
            postings[token].add(pid)

    # Sets de-duplicate while building; sorted lists make the result
    # deterministic and JSON-serialisable.
    return {term: sorted(pids) for term, pids in postings.items()}

# Build the index over the whole dataset and report how many distinct terms it holds.
inverted_index = build_inverted_index(df)
print(f"Inverted index built with {len(inverted_index)} unique terms.")


Inverted index built with 5692 unique terms.


In [16]:
# Peek at the index: the first ten terms, each with up to five of its postings.
preview_terms = list(inverted_index.items())[:10]
for term, postings in preview_terms:
    print(f"{term}: {postings[:5]}")  # show first 5 product IDs for each term

solid: ['4hWtdShnw%3D%3D', 'BDAFYGRSSHGHBBYE', 'BRFEWB5RPBDW9DKP', 'BRFEX35MBHYTBYGC', 'BRFEZEF49JAWFCMD']
women: ['ARME8P6GZGFHAUPX', 'BDAFSFMYKJDGSXUG', 'BDAFT7NGPTHYGVRJ', 'BDAFT8JHVBQ6ZT3F', 'BDAFT8JQFNVHGBVH']
multicolor: ['ARME8P6GFSXGE9AW', 'ARME8P6GJQ8PZBE6', 'ARME8P6GXFED7RJZ', 'ARME8P6GZGFHAUPX', 'BDAFSFMYKJDGSXUG']
track: ['BXRFGGX685HKER8Y', 'BXRFGYQTPZNHEWFD', 'BXRFGYQUTHXJKHHP', 'BXRFGYQUZH5DCABH', 'BXRFGYRYGUURZGDG']
pant: ['BXRFGGX685HKER8Y', 'BXRFGYQTPZNHEWFD', 'BXRFGYQUTHXJKHHP', 'BXRFGYQUZH5DCABH', 'BXRFGYRYGUURZGDG']
yorker: ['TKPFCZ9DYU33FFXS', 'TKPFCZ9E2UC3DR3F', 'TKPFCZ9EA7H5FYZH', 'TKPFCZ9ECDYYDNKA', 'TKPFCZ9EFK9DNWDA']
trackpant: ['CRGFGQRNCB7QVEME', 'CRGFGQS63JKJZM34', 'TKPEFVQAPRHKVJHJ', 'TKPEFVQB8Z65YCYQ', 'TKPEFVQBA9FJA8WX']
rich: ['CRGFB6UNUPXTTY7F', 'CRGFDCWNCYTB8SAM', 'CTPFVZD8CNSZ3AMR', 'CTPFVZEYHCRQ27Y2', 'CTPFVZFYR8KGYYBJ']
comb: ['BRFEWB5RPBDW9DKP', 'BRFEX35MBHYTBYGC', 'BRFEZEF49JAWFCMD', 'BRFEZWU9FCHYPJDU', 'BRFFYNFYDDNAUDZZ']
cotton: ['4hWtdShnw%3D

Our index contains 5692 unique terms, which means that the dataset contains 5692 distinct tokens after preprocessing.

In [17]:
# Persist the inverted index as JSON so it can be reused later without rebuilding it.
with open('inverted_index.json', 'w') as index_file:
    index_file.write(json.dumps(inverted_index))