# Library Import and files

- Files available here: https://amazon-reviews-2023.github.io/

## nvidia rapids verification to run using rapids ubuntu

In [56]:
import subprocess
import pandas as pd

def check_nvidia_gpu():
    try:
        # Check if NVIDIA GPU is available using nvidia-smi
        result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        if result.returncode == 0:
            print("NVIDIA GPU detected.")
            return True
        else:
            print("No NVIDIA GPU detected. Using CPU-based `pandas`.")
            return False
    except FileNotFoundError:
        print("`nvidia-smi` command not found. Using CPU-based `pandas`.")
        return False
    except Exception as e:
        print(f"An error occurred while checking for the GPU: {str(e)}. Using CPU-based `pandas`.")
        return False

if __name__ == "__main__":
    # Check if we have a GPU
    gpu_available = check_nvidia_gpu()

    if gpu_available:
        try:
            # Load the cuDF extension to accelerate pandas with the GPU
            print("Loading GPU-accelerated `cudf.pandas`...")
            # Ensure this works inside Jupyter using the `%load_ext` magic command
            get_ipython().run_line_magic('load_ext', 'cudf.pandas')
            print("GPU acceleration enabled!")
        except Exception as e:
            print(f"Failed to load `cudf.pandas`. Error: {e}")
            print("Falling back to CPU-based `pandas`.")



`nvidia-smi` command not found. Using CPU-based `pandas`.


# Local File

In [57]:

file_path='meta_Electronics.jsonl'



# Specify the chunk size (e.g., load 1000 lines at a time)
chunk_size = 500000

# Initialize an empty list to store the chunks
chunks = []

# Load the first chunk (or any number of chunks you want)
for chunk in pd.read_json(file_path, lines=True, chunksize=chunk_size):
    chunks.append(chunk)
    # Break after the first chunk to load only a sample
    break

# Concatenate the chunks into a single DataFrame (optional)
df = pd.concat(chunks)

# Print the sample data
display(df.head())


KeyboardInterrupt: 

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 16 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   main_category    98945 non-null   object 
 1   title            100000 non-null  object 
 2   average_rating   100000 non-null  float64
 3   rating_number    100000 non-null  int64  
 4   features         100000 non-null  object 
 5   description      100000 non-null  object 
 6   price            41829 non-null   object 
 7   images           100000 non-null  object 
 8   videos           100000 non-null  object 
 9   store            99445 non-null   object 
 10  categories       100000 non-null  object 
 11  details          100000 non-null  object 
 12  parent_asin      100000 non-null  object 
 13  bought_together  0 non-null       float64
 14  subtitle         33 non-null      object 
 15  author           22 non-null      object 
dtypes: float64(2), int64(1), object(13)
mem

In [8]:
print(df['parent_asin'].nunique())

100000


# How many categories?

In [9]:
df['path_length'] = df['categories'].apply(len)

max_path_length = df['path_length'].max()

print(f"Maximum path length: {max_path_length}")


Maximum path length: 7


## split df categories to then join with original df

In [10]:
# Step 1: Split the 'categories' column into 9 columns
df_split = df['categories'].apply(lambda x: ','.join(x)).str.split(',', expand=True)

# Step 2: Rename columns (e.g., category_1, category_2, ..., category_9)
df_split.columns = [f'category_{i+1}' for i in range(df_split.shape[1])]


# Encode missing values as a new category, with page path number
for col in df_split.columns:
    df_split[col] = df_split[col].fillna(df['path_length'])


# Step 3: Concatenate the new split columns back to the original DataFrame (df)
df = pd.concat([df, df_split], axis=1)


df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 24 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   main_category    98945 non-null   object 
 1   title            100000 non-null  object 
 2   average_rating   100000 non-null  float64
 3   rating_number    100000 non-null  int64  
 4   features         100000 non-null  object 
 5   description      100000 non-null  object 
 6   price            41829 non-null   object 
 7   images           100000 non-null  object 
 8   videos           100000 non-null  object 
 9   store            99445 non-null   object 
 10  categories       100000 non-null  object 
 11  details          100000 non-null  object 
 12  parent_asin      100000 non-null  object 
 13  bought_together  0 non-null       float64
 14  subtitle         33 non-null      object 
 15  author           22 non-null      object 
 16  path_length      100000 non-null  int64

In [11]:
print(df.isnull().sum())


main_category        1055
title                   0
average_rating          0
rating_number           0
features                0
description             0
price               58171
images                  0
videos                  0
store                 555
categories              0
details                 0
parent_asin             0
bought_together    100000
subtitle            99967
author              99978
path_length             0
category_1              0
category_2              0
category_3              0
category_4              0
category_5              0
category_6              0
category_7              0
dtype: int64


# Image and video count

In [12]:
df['num_images'] = df['images'].apply(len)

df['num_videos'] = df['videos'].apply(len)


# Feature string parsing

## feature df

In [42]:
# generate random row sample to get values from dict
print(df['details'].sample(n=1).iloc[0])


{'Brand': 'MTAKYA', 'Connector Type': 'USB Type C', 'Cable Type': 'Lightning', 'Compatible Devices': 'Charging Adapter, Smartphone', 'Special Feature': 'Braided, Tangle Free, Fast Charging', 'Package Dimensions': '7.6 x 5.75 x 0.51 inches; 2.89 Ounces', 'Date First Available': 'February 8, 2023', 'Manufacturer': 'Dongguan Lijie Electronic Technology Co., Ltd.'}


In [25]:
df_feature = df[['title','average_rating','main_category','details','parent_asin']]

In [55]:
display(df_feature.sample(n=10))

Unnamed: 0,title,average_rating,main_category,details,parent_asin,Brand,Package_Dimensions
22060,[3 Colors] Leather Camera Case/bag For Olympus...,4.0,Computers,"{'Manufacturer': 'shoppers3 (Olympus )', 'Item...",B005CLNC20,Shoppers3 (Olympus ),
37313,ReplacementScrews Stand Screws Compatible with...,2.4,All Electronics,{'Package Dimensions': '6.5 x 3.5 x 0.5 inches...,B07QP8WFWD,ReplacementScrews,6.5 x 3.5 x 0.5 inches
49,IU3D Low Noise DC 24V 0.12A 4010 Brushless Fan...,3.3,Industrial & Scientific,"{'Brand': 'IU3D', 'Power Connector Type': '2-P...",B08T8BSHQ9,IU3D,5.16 x 4.29 x 1.42 inches; 1.45 Ounces
74807,Precision Design PD-FD300 Bounce Flash Diffuse...,3.4,Camera & Photo,{'Package Dimensions': '6.7 x 5.4 x 2.2 inches...,B003733QRQ,,6.7 x 5.4 x 2.2 inches
29674,CORSAIR VENGEANCE RGB PRO 16GB (2x8GB) DDR4 40...,4.8,Computers,"{'RAM': '16 GB DDR4', 'Memory Speed': '4000 MH...",B07DKMJ61N,Corsair,
48155,Husky Mounts Full Motion TV Wall Mount Articul...,4.2,All Electronics,"{'Brand Name': 'Husky Mounts', 'Item Weight': ...",B00XZHBVZI,Husky Mounts,
81149,"HMDX Go XL Portable Speaker Case, HX-GO4GY Gray",3.8,Cell Phones & Accessories,{'Product Dimensions': '4 x 0.75 x 6.5 inches'...,B00CDJ6OJ2,HMDX,
99940,"100PCS Mixed Cartoon Stickers, Cute Cartoon Ch...",4.3,Amazon Home,"{'Brand': 'DDUAN', 'Color': 'Multicolor', 'Roo...",B0B2RBG35G,DDUAN,4.06 x 2.76 x 0.47 inches
11401,"Universal 10 10.1 Inch Android Tablet Case, Un...",4.5,Cell Phones & Accessories,{'Product Dimensions': '10.5 x 7.4 x 0.6 inche...,B0C1PC2B8Q,Dluggs,
454,Canon BP-2L5 Battery - Replacement for Canon N...,1.0,Camera & Photo,"{'Item model number': 'NB-2LH-DL2', 'Date Firs...",B00GWVPZSI,,


In [None]:
# unique_keys = set()
# for details_dict in df_feature['details']:
#     if isinstance(details_dict, dict):  # Ensure it's a dictionary
#         unique_keys.update(details_dict.keys())

# # Display the unique keys
# print(unique_keys)

In [29]:
keys_to_extract = [
    'Brand',
    'Package Dimensions',
]


# Loop over the list of keys and create new columns in the DataFrame
for key in keys_to_extract:
    column_name = key.replace(' ', '_').replace('By', '_By')  # Replace spaces and handle "By" for column names
    df_feature[column_name] = df_feature['details'].apply(lambda x: x.get(key, None))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_feature[column_name] = df_feature['details'].apply(lambda x: x.get(key, None))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_feature[column_name] = df_feature['details'].apply(lambda x: x.get(key, None))


In [36]:
display(df_feature.sample(n=10))

Unnamed: 0,title,average_rating,main_category,details,parent_asin,Brand,Package_Dimensions
19122,SDNP80 Lithium-Ion Battery - Rechargeable Ultr...,3.2,All Electronics,"{'Package Dimensions': '2 x 2 x 2 inches', 'It...",B00HXWFK1M,Synergy Digital,2 x 2 x 2 inches
63297,Finis SWIMP3 1G Waterproof 1GB MP3 Player with...,4.5,Home Audio & Theater,"{'Manufacturer': 'Finis', 'Date First Availabl...",B00366RGE8,FINIS,
28211,LiViTech(TM Modern Art Flower Design Series 36...,4.5,Computers,"{'Other display features': 'PC Accessory', 'Co...",B00GU58K6K,LiViTech,
42447,KNACRO PWM-to-Voltage Module 0% -100% PWM Conv...,3.1,All Electronics,{'Package Dimensions': '6.02 x 4.21 x 0.55 inc...,B0797NBC79,KNACRO,6.02 x 4.21 x 0.55 inches
34846,MC Vinyl 7x5ft Litter Baby Boss Boys Theme Pho...,4.8,Camera & Photo,{'Package Dimensions': '12.9 x 9.2 x 0.6 inche...,B07RSQY4LL,,12.9 x 9.2 x 0.6 inches
56208,ASUS ZenPad 10 10.1-inch IPS WXGA (1920x1200) ...,4.1,Computers,{'Standing screen display size': '10.1 Inches'...,B01MYGPQSK,ASUS,
83897,Hynix HMT325S6CFR8C-PB 2GB DDR3 1600MHz CL11 2...,4.7,Computers,{'Package Dimensions': '3.7 x 3.1 x 0.02 inche...,B00L3QXI6S,Hynix,3.7 x 3.1 x 0.02 inches
98869,WiFi Extenders 1200Mbps Signal Booster for Hom...,3.5,Computers,"{'Product Dimensions': '2 x 2 x 1.5 inches', '...",B08BJWTSPP,Getue,
22260,cyen Adapter Headphone Adapter Headphone Adapt...,3.8,All Electronics,"{'Product Dimensions': '8.3 x 2 x 5.5 inches',...",B07FVY8K5R,CYen,
34409,HP 697738-001 Unbranded Portia USB Mouse,1.0,Computers,"{'Package Dimensions': '6 x 5.5 x 1.9 inches',...",B014QIXE8Q,HP,6 x 5.5 x 1.9 inches


In [None]:
print(df_feature.isnull().sum())
