In [1]:
import json 
import pandas as pd

## Extract zip files (run this only once)

In [16]:
import gzip
import shutil
import os


# Define the directory where compressed files are located.
data_directory = r'C:\Users\likki\OneDrive\Desktop\7_Likhitha\Courses & Competitions\ai-engineering\amazon-shoping-assistant\ai_engineering\data'

# List the names of the files to unzip.
files_to_unzip = [
    'meta_Clothing_Shoes_and_Jewelry.jsonl.gz',
    'Clothing_Shoes_and_Jewelry.jsonl.gz'
]



def unzip_file(input_gz_path, output_path):
    """
    Decompresses a .gz file to a specified output path.

    Args:
        input_gz_path (str): The full path to the input .gz file.
        output_path (str): The full path for the unzipped output file.
    """
    try:
        # Open the compressed file for reading and the new file for writing.
        # use 'rb' (read binary) and 'wb' (write binary) for efficient copying.
        with gzip.open(input_gz_path, 'rb') as f_in:
            with open(output_path, 'wb') as f_out:
                # Copy the decompressed content from the input file to the output file.
                shutil.copyfileobj(f_in, f_out)
        print(f"✅ Successfully unzipped: {os.path.basename(input_gz_path)}")
        print(f"   -> Output: {output_path}\n")
    except FileNotFoundError:
        print(f"❌ Error: File not found at {input_gz_path}\n")
    except Exception as e:
        print(f"❌ An unexpected error occurred while processing {input_gz_path}: {e}\n")


# --- Main Execution ---
if __name__ == "__main__":
    print("--- Starting File Decompression ---\n")
    
    # Loop through each file in the list and unzip it.
    for filename in files_to_unzip:
        # 1. Construct the full path to the input .gz file.
        input_path = os.path.join(data_directory, filename)
        
        # 2. Define the path for the new, unzipped file.
        #    This will place the new file in the same directory.
        output_path = input_path.replace('.gz', '')
        
        # 3. Call the function to perform the decompression.
        unzip_file(input_path, output_path)
        
    print("--- Decompression complete. ---")


--- Starting File Decompression ---

✅ Successfully unzipped: meta_Clothing_Shoes_and_Jewelry.jsonl.gz
   -> Output: C:\Users\likki\OneDrive\Desktop\7_Likhitha\Courses & Competitions\ai-engineering\amazon-shoping-assistant\ai_engineering\data\meta_Clothing_Shoes_and_Jewelry.jsonl

✅ Successfully unzipped: Clothing_Shoes_and_Jewelry.jsonl.gz
   -> Output: C:\Users\likki\OneDrive\Desktop\7_Likhitha\Courses & Competitions\ai-engineering\amazon-shoping-assistant\ai_engineering\data\Clothing_Shoes_and_Jewelry.jsonl

--- Decompression complete. ---


## Filter Items that have been observed for the first time in year 2022 or later.

In [17]:
def filter_data(data: dict) -> dict:
    filter = False
    if int(data['details']['Date First Available'][-4:]) < 2022:
        filter = True

    return filter

In [19]:


with open("../../data/meta_Clothing_Shoes_and_Jewelry.jsonl", 'r', encoding='utf-8') as fp:
    with open("../../extra-data/meta_Clothing_Shoes_and_Jewelry_2022_2023.jsonl", 'a', encoding='utf-8') as fp_out:
        with open("../../extra-data/meta_Clothing_Shoes_and_Jewelry_2022_2023_no_date.jsonl", 'a', encoding='utf-8') as fp_out_no_date:
            
            for i, line in enumerate(fp, 1): 
                try:
                    data = json.loads(line.strip())
                except json.JSONDecodeError:
                    print(f"Warning: Skipping corrupted JSON on line {i}. Line starts with: {line[:100]}...")
                    continue 
                try:
                    filter_result = filter_data(data)
                    if not filter_result:
                        json.dump(data, fp_out)
                        fp_out.write('\n')
                except KeyError: 
                    json.dump(data, fp_out_no_date)
                    fp_out_no_date.write('\n')
                
                if i % 10000 == 0:
                    print(f"Processed {i} lines")

Processed 10000 lines
Processed 20000 lines
Processed 30000 lines
Processed 40000 lines
Processed 50000 lines
Processed 60000 lines
Processed 70000 lines
Processed 80000 lines
Processed 90000 lines
Processed 100000 lines
Processed 110000 lines
Processed 120000 lines
Processed 130000 lines
Processed 140000 lines
Processed 150000 lines
Processed 160000 lines
Processed 170000 lines
Processed 180000 lines
Processed 190000 lines
Processed 200000 lines
Processed 210000 lines
Processed 220000 lines
Processed 230000 lines
Processed 240000 lines
Processed 250000 lines
Processed 260000 lines
Processed 270000 lines
Processed 280000 lines
Processed 290000 lines
Processed 300000 lines
Processed 310000 lines
Processed 320000 lines
Processed 330000 lines
Processed 340000 lines
Processed 350000 lines
Processed 360000 lines
Processed 370000 lines
Processed 380000 lines
Processed 390000 lines
Processed 400000 lines
Processed 410000 lines
Processed 420000 lines
Processed 430000 lines
Processed 440000 lin

## Split the items into two categories: "has main category", "does not have main category"

In [20]:
def filter_category(data: dict) -> dict:
    filter = False
    if data['main_category'] == None:
        filter = True

    return filter

In [21]:
with open("../../extra-data/meta_Clothing_Shoes_and_Jewelry_2022_2023.jsonl", 'r') as fp:
    with open("../../extra-data/meta_Clothing_Shoes_and Jewelry_2022_2023_with_category.jsonl", 'a', encoding = 'utf-8') as fp_out:
        with open("../../extra-data/meta_Clothing_Shoes_and_Jewelry_2022_2023_no_category", 'a', encoding = 'utf-8') as fp_out_no_category:
            for line in fp:
                data = json.loads(line.strip())
                if not filter_category(data):
                    json.dump(data, fp_out)
                    fp_out.write('\n')
                    fp_out.flush()
                else:
                    json.dump(data, fp_out_no_category)
                    fp_out_no_category.write('\n')
                    fp_out_no_category.flush()

## Filter products with atleast 100 rating and average rating>1

In [22]:
df = pd.read_json("../../extra-data/meta_Clothing_Shoes_and Jewelry_2022_2023_with_category.jsonl", lines = True)
df.shape


(407563, 16)

In [27]:
df_rating_100 = df[df['rating_number']>100]
df_rating_100.shape

(58302, 16)

In [28]:
df_avg_rating = df_rating_100[df_rating_100['average_rating']>1]
df_avg_rating.shape

(58302, 16)

## Apply random sampling (500 samples/products)

In [69]:
df_sample_500 = df_avg_rating.sample(n=500, random_state=45)
df_sample_500.shape

(500, 16)

In [30]:
df_avg_rating.to_json("../../extra-data/meta_Clothing_Shoes_and_Jewelry_2022_2023_with_category_ratings_100.jsonl")

In [71]:
df_sample_500.to_json("../../data/meta_Clothing_Stores_and_Jewelry_2022_2023_with_category_ratings_100_sample_500.jsonl", orient='records', lines=True)

## Extract reviews for the sampled products

In [65]:
df_avg_ratings = pd.read_json("../../extra-data/meta_Clothing_Shoes_and_Jewelry_2022_2023_with_category_ratings_100.jsonl", lines=True)
df_sample_500 = pd.read_json("../../data/meta_Clothing_Stores_and_Jewelry_2022_2023_with_category_ratings_100_sample_500.jsonl", lines=True)

In [35]:
with open("../../data/Clothing_Shoes_and_Jewelry.jsonl", 'r') as fp:
    with open("../../extra-data/Clothing_Shoes_and_Jewelry_2022_2023_with_category_rating_100.jsonl", 'a') as fp_out:
        id_list = set(df_avg_rating['parent_asin'].values)
        i = 0
        for line in fp:
            data = json.loads(line.strip())
            if data['parent_asin'] in id_list:
                json.dump(data, fp_out)
                fp_out.write('\n')
                fp_out.flush()
            i += 1
            if i % 100000 == 0:
                print(f"Processed {i} lines")

Processed 100000 lines
Processed 200000 lines
Processed 300000 lines
Processed 400000 lines
Processed 500000 lines
Processed 600000 lines
Processed 700000 lines
Processed 800000 lines
Processed 900000 lines
Processed 1000000 lines
Processed 1100000 lines
Processed 1200000 lines
Processed 1300000 lines
Processed 1400000 lines
Processed 1500000 lines
Processed 1600000 lines
Processed 1700000 lines
Processed 1800000 lines
Processed 1900000 lines
Processed 2000000 lines
Processed 2100000 lines
Processed 2200000 lines
Processed 2300000 lines
Processed 2400000 lines
Processed 2500000 lines
Processed 2600000 lines
Processed 2700000 lines
Processed 2800000 lines
Processed 2900000 lines
Processed 3000000 lines
Processed 3100000 lines
Processed 3200000 lines
Processed 3300000 lines
Processed 3400000 lines
Processed 3500000 lines
Processed 3600000 lines
Processed 3700000 lines
Processed 3800000 lines
Processed 3900000 lines
Processed 4000000 lines
Processed 4100000 lines
Processed 4200000 lines
P

In [66]:
df_sample_500.shape

(1, 16)

In [60]:
with open("../../extra-data/Clothing_Shoes_and_Jewelry_2022_2023_with_category_rating_100.jsonl", 'r') as fp:
    with open("../../data/Clothing_Shoes_and_Jewelry_2022_2023_with_category_rating_100_sample_500.jsonl", 'a') as fp_out:
        id_list = set(df_sample_500['parent_asin'].values)
        i = 0
        for line in fp:
            data = json.loads(line.strip())
            if data['parent_asin'] in id_list:
                json.dump(data, fp_out)
                fp_out.write('\n')
                fp_out.flush()
            i += 1
            if i % 100000 == 0:
                print(f"Processed {i} lines")

Processed 100000 lines
Processed 200000 lines
Processed 300000 lines
Processed 400000 lines
Processed 500000 lines
Processed 600000 lines
Processed 700000 lines
Processed 800000 lines
Processed 900000 lines
Processed 1000000 lines
Processed 1100000 lines
Processed 1200000 lines
Processed 1300000 lines
Processed 1400000 lines
Processed 1500000 lines
Processed 1600000 lines
Processed 1700000 lines
Processed 1800000 lines
Processed 1900000 lines
Processed 2000000 lines
Processed 2100000 lines
Processed 2200000 lines
Processed 2300000 lines
Processed 2400000 lines
Processed 2500000 lines
Processed 2600000 lines
Processed 2700000 lines
Processed 2800000 lines
Processed 2900000 lines
Processed 3000000 lines
Processed 3100000 lines
Processed 3200000 lines
Processed 3300000 lines
Processed 3400000 lines
Processed 3500000 lines
Processed 3600000 lines
Processed 3700000 lines
Processed 3800000 lines
Processed 3900000 lines
Processed 4000000 lines
Processed 4100000 lines
Processed 4200000 lines
P

In [64]:
df2 = pd.read_json("../../data/Clothing_Shoes_and_Jewelry_2022_2023_with_category_rating_100_sample_500.jsonl", lines=True)
df2.shape

(41981, 10)