In [7]:
import os
import pandas as pd
import zipfile
import gzip
import json
import shutil
from collections import defaultdict

# Extract Image Metadata from the LISTINGS Files (~150K image metadata records)

In [8]:
# Function to read and return JSON data from a file
def read_json(file_path):
    """Load a JSON document from disk and return the parsed object.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file (dict, list, ...).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Decode explicitly as UTF-8: without it, open() falls back to the
    # platform locale encoding, which is not UTF-8 on many Windows setups
    # and mangles (or rejects) non-ASCII text.
    with open(file_path, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)

In [None]:
# Function to read and decode the NDJSON gzip file
def read_gz_ndjson(file_path):
    """Read a gzip-compressed NDJSON file (one JSON document per line).

    Args:
        file_path: Path of the ``.json.gz`` file.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened or is not valid gzip data.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        for line in f:
            # Blank or whitespace-only lines (e.g. a trailing newline at the
            # end of the file) are not records; json.loads would raise on them.
            if line.strip():
                data.append(json.loads(line))
    return data

# Base directory containing the json.gz files (16 json.gz files available)
jsongz_base_dir = r'C:\Users\fcali\OneDrive\Masaüstü\DATA SCIENCE\WEB-SCRAPING\AMAZON\AMAZON_DATA\ABO_BERKELEY\abo-listings\listings\metadata'

# os.listdir() returns entries in arbitrary order and lists every entry in the
# folder, so keep only the .json.gz archives and sort them. That way the
# "first" file is the same on every run and is always a readable archive.
jsongz_files = sorted(
    name for name in os.listdir(jsongz_base_dir) if name.endswith('.json.gz')
)

# Let's inspect one of the json.gz files
sample_json_gz = read_gz_ndjson(os.path.join(jsongz_base_dir, jsongz_files[0]))

# Number of records in the sample file, then the fields of its first record
print(len(sample_json_gz))
print()
print(sample_json_gz[0].keys())

In [None]:
# Let's dig deeper: display the first record of the sample file in full
# to find the fields we need.
sample_json_gz[0]

In [None]:
# Combine the listing metadata of every json.gz file into one dictionary

jsongz_base_dir = r'C:\Users\fcali\OneDrive\Masaüstü\DATA SCIENCE\WEB-SCRAPING\AMAZON\AMAZON_DATA\ABO_BERKELEY\abo-listings\listings\metadata'
output_directory = r'C:\Users\fcali\OneDrive\Masaüstü\DATA SCIENCE\WEB-SCRAPING\AMAZON\AMAZON_DATA\ABO_BERKELEY\abo-listings'

# Key: the text after the last '_' in the file name (before the first '.').
# Value: the list of records parsed from that json.gz file.
all_metadata = {
    archive_name.split('.')[0].split('_')[-1]: read_gz_ndjson(
        os.path.join(jsongz_base_dir, archive_name)
    )
    for archive_name in os.listdir(jsongz_base_dir)
    if archive_name.endswith('.json.gz')
}

In [None]:
# NOTE: the save step below is kept commented out, so running this cell
# writes nothing to disk; it only evaluates the note string at the end.

# Define the path to save the combined JSON file
# output_json_path = os.path.join(output_directory, 'combined_listings_metadata.json')

# Save the combined metadata to a single JSON file
# with open(output_json_path, 'w') as json_file:
#     json.dump(all_metadata, json_file, indent=4)

# Print a message to indicate the task is complete
# print(f"Combined metadata has been saved to {output_json_path}")

# Note on the size of the file produced by the save step above.
"""
it created around 2Gb json file
"""

In [9]:
# Load the combined listings metadata file; read_json parses the whole file
# into memory in one go.
combined_listings_json = read_json(r'C:\Users\fcali\OneDrive\Masaüstü\DATA SCIENCE\WEB-SCRAPING\AMAZON\AMAZON_DATA\ABO_BERKELEY\abo-listings\combined_listings_metadata.json')

In [None]:
# Total number of listing records across all keys of the combined metadata
counter = sum(len(records) for records in combined_listings_json.values())
print(counter)

# Observe the Images Themselves in the Small Folder (~400K images)

In [18]:
import os

# Define the base directory containing the images.
# NOTE: the same path is assigned to this name again in a later cell.
images_base_dir = r'C:\Users\fcali\OneDrive\Masaüstü\DATA SCIENCE\WEB-SCRAPING\AMAZON\AMAZON_DATA\ABO_BERKELEY\abo-images-small\images\small'

# Function to count total images in the folder and subfolders
def count_images(base_dir):
    """Return how many image files live under ``base_dir``, recursively.

    A file counts as an image when its lower-cased name ends with one of
    .png, .jpg, .jpeg, .gif or .bmp.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg', '.gif', '.bmp')
    return sum(
        1
        for _dirpath, _dirnames, filenames in os.walk(base_dir)
        for filename in filenames
        if filename.lower().endswith(image_suffixes)
    )

# Count the images under images_base_dir, including every subfolder
total_images = count_images(images_base_dir)

# Display the total count of images
print(f"Total number of images: {total_images}")

Total number of images: 398212


In [19]:
# Load the image metadata table and preview its first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, so the CSV presumably
# stores an unnamed index column; passing index_col=0 would drop it - confirm
# before changing.
images_metadata = pd.read_csv(r"C:\Users\fcali\OneDrive\Masaüstü\DATA SCIENCE\WEB-SCRAPING\AMAZON\AMAZON_DATA\ABO_BERKELEY\abo-images-small\images\metadata\images.csv\images.csv")
images_metadata.head()

Unnamed: 0,image_id,height,width,path
0,010-mllS7JL,106,106,14/14fe8812.jpg
1,01dkn0Gyx0L,122,122,da/daab0cad.jpg
2,01sUPg0387L,111,111,d2/d2daaae9.jpg
3,1168jc-5r1L,186,186,3a/3a4e88e6.jpg
4,11RUV5Fs65L,30,500,d9/d91ab9cf.jpg


In [20]:
# The folder is the part of 'path' before the first '/' (e.g. '14/14fe8812.jpg' -> '14').
# The vectorized .str accessor replaces the row-wise apply(lambda ...) and
# propagates missing paths as NaN instead of raising on them.
images_metadata['folder'] = images_metadata['path'].str.split('/', n=1).str[0]

In [21]:
# Preview the frame again; the displayed rows now include the 'folder' column
images_metadata.head()

Unnamed: 0,image_id,height,width,path,folder
0,010-mllS7JL,106,106,14/14fe8812.jpg,14
1,01dkn0Gyx0L,122,122,da/daab0cad.jpg,da
2,01sUPg0387L,111,111,d2/d2daaae9.jpg,d2
3,1168jc-5r1L,186,186,3a/3a4e88e6.jpg,3a
4,11RUV5Fs65L,30,500,d9/d91ab9cf.jpg,d9


# Creating The Dataset

In [15]:
# Plan: take each listing's main image id, look it up in the images table,
# and copy the image into a folder dedicated to the listing's product type.

# Worked example: the record at index 784 under key '6'
sample_product = combined_listings_json['6'][784]

# Its main image id ...
print(sample_product['main_image_id'])

# ... and the value of its first product_type entry
print(sample_product['product_type'][0]['value'])

71TE6cR7XkL
CELLULAR_PHONE_CASE


In [16]:
# Root folder of the source images and root folder for the sorted copies
images_base_dir = r'C:\Users\fcali\OneDrive\Masaüstü\DATA SCIENCE\WEB-SCRAPING\AMAZON\AMAZON_DATA\ABO_BERKELEY\abo-images-small\images\small'
output_base_dir = r'C:\Users\fcali\OneDrive\Masaüstü\DATA SCIENCE\WEB-SCRAPING\AMAZON\AMAZON_DATA\ABO_BERKELEY\SORTED'
# Create output base directory if it doesn't exist.
# exist_ok=True makes this a no-op when the folder is already there and
# avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(output_base_dir, exist_ok=True)

In [33]:
# Product types whose main images should be copied into the sorted dataset.
# Every entry needs its own trailing comma: two adjacent string literals
# without a comma are silently concatenated into a single (bogus) entry.
desired_product_types = {
    'OFFICE_PRODUCTS',
    'PET_SUPPLIES',
    'SPORTING_GOODS',
    'OUTDOOR_LIVING',
    'WALL_ART',
    'BABY_PRODUCT',
    'HARDWARE',
    'WIRELESS_ACCESSORY',
    'BATTERY',
    'COMPUTER_ADD_ON',
    'HEADPHONES',
    'OFFICE_ELECTRONICS',
    'INSTRUMENT_PARTS_AND_ACCESSORIES',
    'FILE_FOLDER',
    'WRITING_INSTRUMENT',
    'COMPUTER_COMPONENT',
    'SPEAKERS',
    'MAJOR_HOME_APPLIANCES',
    'ELECTRIC_FAN',
}

In [34]:
# Build an image_id -> {column: value} mapping once, so each lookup is a plain
# dictionary access instead of a DataFrame search
image_id_to_path = (
    images_metadata
    .set_index('image_id')
    .to_dict(orient='index')
)

# Function to copy images to the appropriate folder
def copy_image(image_id, product_type):
    """Copy one image into the folder of its product type.

    Looks ``image_id`` up in the module-level ``image_id_to_path`` mapping,
    resolves the source file under ``images_base_dir`` and copies it to
    ``<output_base_dir>/<product_type>/<folder>_<product_type[:5]>_<image_id>.jpg``.

    Args:
        image_id: Key to look up in ``image_id_to_path``.
        product_type: Product type; used as the destination folder name and
            (first five characters) as part of the new file name.

    Returns:
        True if the image was copied, False if it was skipped because the id
        is unknown or the source file does not exist.
    """
    image_info = image_id_to_path.get(image_id)
    if image_info is None:
        print(f"Image ID {image_id} not found in the DataFrame")
        return False

    # Construct full path to the source image
    source_image_path = os.path.join(images_base_dir, image_info['path'])
    # A metadata row without a matching file on disk should skip this one
    # image, not abort the whole batch with FileNotFoundError.
    if not os.path.isfile(source_image_path):
        print(f"Source image {source_image_path} not found for image ID {image_id}")
        return False

    # Destination directory is named after the product type; exist_ok avoids
    # a separate existence check before creating it.
    dest_dir = os.path.join(output_base_dir, product_type)
    os.makedirs(dest_dir, exist_ok=True)

    # Construct the new image name
    new_image_name = f"{image_info['folder']}_{product_type[:5]}_{image_id}.jpg"
    dest_image_path = os.path.join(dest_dir, new_image_name)

    # Copy the image to the destination
    shutil.copyfile(source_image_path, dest_image_path)
    print(f"Copied {source_image_path} to {dest_image_path}")
    return True

# Dictionary mapping each selected product type to the number of listings
# that were handed to copy_image for it
product_type_list = {}

# Loop through the combined metadata and process each entry
for category, products in combined_listings_json.items():
    for product in products:
        main_image_id = product.get('main_image_id')
        # 'product_type' is read as a list whose first entry carries the
        # 'value'. Fall back to [{}] when the key is missing, None or an empty
        # list, so such records become 'UNKNOWN_TYPE' instead of raising.
        type_entries = product.get('product_type') or [{}]
        product_type = type_entries[0].get('value', 'UNKNOWN_TYPE')
        if main_image_id and product_type in desired_product_types:
            copy_image(main_image_id, product_type)
            # Count this listing under its product type
            product_type_list[product_type] = product_type_list.get(product_type, 0) + 1

# Print the product type list
print("Product types and their counts:")
for product_type, count in product_type_list.items():
    print(f"{product_type}: {count}")


Copied C:\Users\fcali\OneDrive\Masaüstü\DATA SCIENCE\WEB-SCRAPING\AMAZON\AMAZON_DATA\ABO_BERKELEY\abo-images-small\images\small\9f/9f76d27b.jpg to C:\Users\fcali\OneDrive\Masaüstü\DATA SCIENCE\WEB-SCRAPING\AMAZON\AMAZON_DATA\ABO_BERKELEY\SORTED\HARDWARE\9f_HARDW_619y9YG9cnL.jpg
Copied C:\Users\fcali\OneDrive\Masaüstü\DATA SCIENCE\WEB-SCRAPING\AMAZON\AMAZON_DATA\ABO_BERKELEY\abo-images-small\images\small\35/3595924e.jpg to C:\Users\fcali\OneDrive\Masaüstü\DATA SCIENCE\WEB-SCRAPING\AMAZON\AMAZON_DATA\ABO_BERKELEY\SORTED\HEADPHONES\35_HEADP_61OnkGVhMWL.jpg
Copied C:\Users\fcali\OneDrive\Masaüstü\DATA SCIENCE\WEB-SCRAPING\AMAZON\AMAZON_DATA\ABO_BERKELEY\abo-images-small\images\small\d1/d1c54789.jpg to C:\Users\fcali\OneDrive\Masaüstü\DATA SCIENCE\WEB-SCRAPING\AMAZON\AMAZON_DATA\ABO_BERKELEY\SORTED\OFFICE_PRODUCTS\d1_OFFIC_715LX+Gx2-L.jpg
Copied C:\Users\fcali\OneDrive\Masaüstü\DATA SCIENCE\WEB-SCRAPING\AMAZON\AMAZON_DATA\ABO_BERKELEY\abo-images-small\images\small\f0/f079b13b.jpg to C:\User

In [None]:
import os

def count_images_within_folders(folder_path):
    """Count .jpg/.jpeg files per directory under ``folder_path``.

    Walks the tree recursively and returns a dict mapping each directory path
    (as yielded by os.walk) to the number of JPEG files directly inside it.
    Directories without any JPEG file are left out.
    """
    jpeg_suffixes = ('.jpg', '.jpeg')
    per_folder = {}
    for current_dir, _subdirs, filenames in os.walk(folder_path):
        n_jpegs = sum(1 for name in filenames if name.lower().endswith(jpeg_suffixes))
        if n_jpegs > 0:
            per_folder[current_dir] = n_jpegs
    return per_folder

# Per-folder JPEG counts of the sorted dataset, printed one folder per line
sorted_root = r'C:\Users\fcali\OneDrive\Masaüstü\DATA SCIENCE\WEB-SCRAPING\AMAZON\AMAZON_DATA\ABO_BERKELEY\SORTED'
image_counts = count_images_within_folders(sorted_root)
for subdir in image_counts:
    print(f"{subdir}: {image_counts[subdir]} images")

In [None]:
import pandas as pd
# Lift the row-display limit so the whole Series is rendered, then show the
# per-folder counts from largest to smallest
pd.options.display.max_rows = None
pd.Series(image_counts).sort_values(ascending=False)

I need your help. I have stacked many JSON files into a single JSON file, loaded as combined_listings_json. It has 16 sub-elements (keys), and each key holds thousands of products. Each product has several features; I am interested in two of them, named main_image_id and product_type.

# for example: 6th json.gz file, 784th product's main image id and product_type
combined_listings_json['6'][784]['main_image_id'] --> 71TE6cR7XkL
print(combined_listings_json['6'][784]['product_type'][0]['value']) --> CELLULAR_PHONE_CASE

On the other hand, I have a DataFrame named images_metadata, for example:
image_id	height	width	path	folder
0	010-mllS7JL	106	106	14/14fe8812.jpg	14
1	01dkn0Gyx0L	122	122	da/daab0cad.jpg	da
2	01sUPg0387L	111	111	d2/d2daaae9.jpg	d2
3	1168jc-5r1L	186	186	3a/3a4e88e6.jpg	3a
4	11RUV5Fs65L	30	500	d9/d91ab9cf.jpg	d9

Now I want to:
- create a list of product types and, for each product type,
- check the main_image_ids and try to find each one in the DataFrame's image_id column;
- if I find it, go to the path column (stored in the DataFrame; it needs to be joined with 'C:\Users\fcali\OneDrive\Masaüstü\DATA SCIENCE\WEB-SCRAPING\AMAZON\AMAZON_DATA\ABO_BERKELEY\abo-images-small\images\small'), find this image, and copy it into another folder that we create or have already created;
- the folder that the images are copied into shall be named after the product_type;
- the image name should include the DataFrame's folder column, the image_id and the product_type, for example {folder}_{product_type[:5]}_{image_id}

Can you give me the code?

In [None]:
import json
from collections import defaultdict

# Define the path to the combined JSON file
combined_json_path = r'C:\Users\fcali\OneDrive\Masaüstü\DATA SCIENCE\WEB-SCRAPING\AMAZON\AMAZON_DATA\ABO_BERKELEY\abo-listings\combined_listings_metadata.json'

# Load the combined JSON data.
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open(combined_json_path, 'r', encoding='utf-8') as json_file:
    combined_metadata = json.load(json_file)

# Initialize a dictionary to count occurrences of each product type
product_type_counts = defaultdict(int)

# Loop through the combined metadata and count each product type
for category, products in combined_metadata.items():
    for product in products:
        # Fall back to [{}] when 'product_type' is missing, None or an empty
        # list, so such records are counted as 'UNKNOWN_TYPE' instead of
        # raising on the [0] lookup.
        type_entries = product.get('product_type') or [{}]
        product_type = type_entries[0].get('value', 'UNKNOWN_TYPE')
        product_type_counts[product_type] += 1


In [26]:
# Convert the counts to a pandas Series (index: product type, value: count)
product_type_series = pd.Series(product_type_counts)

# Filter the Series to keep only counts greater than 20 and sort in descending order
filtered_series = product_type_series[product_type_series > 20].sort_values(ascending=False)

# Display the filtered, sorted counts as the cell output
filtered_series

CELLULAR_PHONE_CASE                 64853
SHOES                               12965
GROCERY                              6546
HOME                                 5264
HOME_BED_AND_BATH                    3082
HOME_FURNITURE_AND_DECOR             2255
CHAIR                                2100
BOOT                                 2009
SANDAL                               1845
FINERING                             1540
HEALTH_PERSONAL_CARE                 1449
FINENECKLACEBRACELETANKLET           1377
ACCESSORY                            1362
SOFA                                 1199
OFFICE_PRODUCTS                      1152
FINEEARRING                          1137
PET_SUPPLIES                         1064
SPORTING_GOODS                        972
TABLE                                 936
HARDWARE_HANDLE                       860
RUG                                   817
HANDBAG                               792
LIGHT_BULB                            756
KITCHEN                           

In [32]:
import re

# Words to look for in the product-type names (the index of the Series)
filter_words = ['LEISURE']

# Create a regex pattern that matches any of the filter words. Each word is
# escaped so it is matched literally rather than interpreted as a regex.
pattern = '|'.join(re.escape(word) for word in filter_words)

# Keep the matches under their own name instead of re-binding `filtered_series`:
# overwriting the input would make every re-run of this cell filter the
# previous run's (possibly already empty) result, so trying a second set of
# filter words could never find anything.
keyword_matches = filtered_series[filtered_series.index.str.contains(pattern)]

# Print the matching product types and their counts
print(keyword_matches)

Series([], dtype: int64)
