In [55]:
import pandas as pd
import json



In [56]:
# Relative path of the product JSON file that the loading helpers below read.
FILE_PATH = '../data/flipkart_fashion_products_dataset.json'

In [57]:
def get_unique_products(file_path):
    '''
    Load the product dataset and index it by product id.

    The file is read as a JSON document and iterated item by item; every
    item must expose a 'pid' key. When the same 'pid' occurs more than once,
    only the first occurrence (in file order) is kept.

    Parameters
    ----------
    file_path : str or path-like
        Location of the JSON file to read.

    Returns
    -------
    dict
        Maps each distinct 'pid' to the first product record carrying it,
        in order of first appearance.

    Raises
    ------
    KeyError
        If a product record has no 'pid' key.
    '''
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # text encoding, which differs between operating systems.
    with open(file_path, "r", encoding="utf-8") as json_file:
        products = json.load(json_file)

    unique_products = {}
    for product in products:
        # setdefault stores the record only when the pid is new, so the
        # first record seen for a pid wins.
        unique_products.setdefault(product['pid'], product)
    return unique_products



In [65]:
def remove_duplicates(filepath):
    '''
    Load the dataset and drop every product whose 'pid' was already seen.

    The first record for each 'pid' is kept, in file order. Each kept record
    also receives a 'doc_id' key holding its 0-based position in the returned
    list (any 'doc_id' already present in the record is overwritten).

    Parameters
    ----------
    filepath : str or path-like
        Location of the JSON file to read.

    Returns
    -------
    list of dict
        The de-duplicated product records, each with a 'doc_id' key.

    Raises
    ------
    KeyError
        If a product record has no 'pid' key.
    '''
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # text encoding, which differs between operating systems.
    with open(filepath, "r", encoding="utf-8") as json_file:
        products = json.load(json_file)

    cleaned_data = []
    seen_pids = set()
    for product in products:
        product_id = product['pid']
        if product_id in seen_pids:
            continue
        seen_pids.add(product_id)
        # The next free slot in the output list is the record's doc_id.
        product['doc_id'] = len(cleaned_data)
        cleaned_data.append(product)
    return cleaned_data


    
    
    
    

In [66]:
# Build the pid -> product lookup from the raw file.
# NOTE(review): this result is not referenced again in the cells visible here — confirm it is still needed.
unique_product_ids = get_unique_products(FILE_PATH)

In [67]:
# Re-read the raw file and keep only the first record per pid (each gets a 'doc_id').
cleaned_data = remove_duplicates(FILE_PATH)


In [68]:
# Number of records in cleaned_data.
print(len(cleaned_data))

28080


In [70]:
# Show the first record to eyeball the available fields.
cleaned_data[0]

{'_id': 'fa8e22d6-c0b6-5229-bb9e-ad52eda39a0a',
 'actual_price': '2,999',
 'average_rating': '3.9',
 'brand': 'York',
 'category': 'Clothing and Accessories',
 'crawled_at': '02/10/2021, 20:11:51',
 'description': 'Yorker trackpants made from 100% rich combed cotton giving it a rich look.Designed for Comfort,Skin friendly fabric,itch-free waistband & great for all year round use Proudly made in India',
 'discount': '69% off',
 'images': ['https://rukminim1.flixcart.com/image/128/128/jr3t5e80/track-pant/z/y/n/m-1005combo2-yorker-original-imafczg3xfh5qqd4.jpeg?q=70',
  'https://rukminim1.flixcart.com/image/128/128/jr58l8w0/track-pant/w/d/a/l-1005combo8-yorker-original-imafczg3pgtxgraq.jpeg?q=70'],
 'out_of_stock': False,
 'pid': 'TKPFCZ9EA7H5FYZH',
 'product_details': [{'Style Code': '1005COMBO2'},
  {'Closure': 'Elastic'},
  {'Pockets': 'Side Pockets'},
  {'Fabric': 'Cotton Blend'},
  {'Pattern': 'Solid'},
  {'Color': 'Multicolor'}],
 'seller': 'Shyam Enterprises',
 'selling_price': '92

In [71]:
def add_missing_fields(data, keys=None, default=''):
    """
    Ensure every product record has all of the expected keys.

    Records are modified in place: any key from `keys` that a record lacks is
    added with `default` as its value. Keys that are already present are left
    untouched, whatever their value.

    Parameters
    ----------
    data : iterable of dict
        Product records to complete.
    keys : iterable of str, optional
        Keys every record must have. When omitted, the standard product
        fields are used: 'title', 'description', 'category', 'sub_category',
        'brand', 'average_rating', 'url', 'out_of_stock'.
    default : object, optional
        Placeholder stored for a missing key (empty string by default).

    Returns
    -------
    The same `data` object that was passed in, for call chaining.
    """
    if keys is None:
        keys = ('title', 'description', 'category', 'sub_category',
                'brand', 'average_rating', 'url', 'out_of_stock')
    else:
        # Materialise once so a one-shot iterator can be reused for every record.
        keys = tuple(keys)

    for product in data:
        for key in keys:
            # setdefault only writes when the key is absent.
            product.setdefault(key, default)
    return data

In [73]:
# Fill in any absent standard fields; the helper edits the records in place and
# returns the same list, so this rebinding keeps the name pointing at it.
cleaned_data = add_missing_fields(cleaned_data)

In [75]:
# Persist the cleaned records as JSON in the data folder.
# Destination path of the output file (relative path).
file_path = "../data/cleaned_data.json"

# Write the list of dictionaries to a JSON file
with open(file_path, 'w') as json_file:
    json.dump(cleaned_data, json_file, indent=4)  # 'indent=4' makes the JSON file more readable