# Library imports and data files

- Files available here: https://amazon-reviews-2023.github.io/

## NVIDIA RAPIDS check: enable GPU-accelerated pandas when running in a RAPIDS (Ubuntu) environment

In [1]:
import subprocess
from pathlib import Path

import pandas as pd

def check_nvidia_gpu():
    """Report whether an NVIDIA GPU is usable by probing ``nvidia-smi``.

    Runs ``nvidia-smi`` with its output captured and prints a one-line
    status message describing the outcome.

    Returns:
        bool: True when ``nvidia-smi`` exits with status 0; False when it
        exits with a non-zero status, is not installed, or any other
        exception is raised while probing.
    """
    try:
        # capture_output=True pipes both stdout and stderr, so the tool's own
        # report never reaches the cell output; only the exit status matters.
        probe = subprocess.run(['nvidia-smi'], capture_output=True, text=True)
        gpu_found = probe.returncode == 0
        print(
            "NVIDIA GPU detected."
            if gpu_found
            else "No NVIDIA GPU detected. Using CPU-based `pandas`."
        )
        return gpu_found
    except FileNotFoundError:
        # The executable is missing, i.e. no NVIDIA driver tooling on PATH.
        print("`nvidia-smi` command not found. Using CPU-based `pandas`.")
        return False
    except Exception as e:
        # Best-effort probe: any other failure is reported and treated as "no GPU".
        print(f"An error occurred while checking for the GPU: {str(e)}. Using CPU-based `pandas`.")
        return False

if __name__ == "__main__":
    # Probe once; the flag stays defined at module level for later cells.
    gpu_available = check_nvidia_gpu()

    if gpu_available:
        try:
            print("Loading GPU-accelerated `cudf.pandas`...")
            # Programmatic equivalent of the `%load_ext cudf.pandas` line magic;
            # `get_ipython` only exists when running under IPython/Jupyter.
            # NOTE(review): pandas is already imported at the top of this cell,
            # before the extension is loaded — confirm against the cudf.pandas
            # docs that acceleration is really active with this ordering.
            get_ipython().run_line_magic('load_ext', 'cudf.pandas')
        except Exception as e:
            # Any failure (extension missing, not in IPython, ...) leaves the
            # notebook on plain pandas.
            print(f"Failed to load `cudf.pandas`. Error: {e}")
            print("Falling back to CPU-based `pandas`.")
        else:
            print("GPU acceleration enabled!")



NVIDIA GPU detected.
Loading GPU-accelerated `cudf.pandas`...
GPU acceleration enabled!


## Regular libraries

In [2]:
import pandas as pd

# HuggingFace Dataset

In [None]:
# from datasets import load_dataset

# meta_ds = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_meta_Automotive", split="full", trust_remote_code=True)

# meta_df = meta_ds.to_pandas() 
# print(meta_df.head())

# Local File

In [None]:

# Two candidate locations for the Automotive metadata file:
#   - file_path:    relative to the notebook's working directory
#   - wsl_filepath: the same file seen through the Windows mount under WSL
file_path = 'meta_Automotive.jsonl'

wsl_filepath = '/mnt/c/Users/nosta/CODING/ist_dash_2024_rec/notebooks/amazon_reviews/meta_Automotive.jsonl'

# Prefer the WSL path when it exists, otherwise fall back to the local file.
# An explicit existence check replaces the former bare `except:` (which read
# `file_path` in both branches and hid every error), so a malformed or missing
# file now surfaces as a real exception instead of a misleading message.
if Path(wsl_filepath).is_file():
    meta_df = pd.read_json(wsl_filepath, lines=True)
    print('wsl file path loaded')
else:
    meta_df = pd.read_json(file_path, lines=True)
    print('regular folder file path loaded')

# Preview the first rows of the loaded metadata.
display(meta_df.head())


In [None]:
# Print a concise summary of meta_df: columns, dtypes, non-null counts and memory usage.
meta_df.info()

In [None]:
# Number of distinct values in the parent_asin column (missing values are not counted by default).
print(meta_df['parent_asin'].nunique())

# How many categories?

In [None]:
# Length of each row's `categories` value, stored as the row's path length.
category_depths = meta_df['categories'].map(len)
meta_df['path_length'] = category_depths

# Longest category path found in the data set.
max_path_length = category_depths.max()
print(f"Maximum path length: {max_path_length}")


## Split the categories column into one column per level, then join the result back onto the original DataFrame

In [None]:
# Step 1: Expand each row's `categories` list into one column per path level.
# The frame is built directly from the lists instead of joining on ',' and
# splitting again: that round trip tore apart any category name containing a
# comma and turned an empty list into an empty string rather than a missing value.
# Shorter paths are padded with missing values up to the longest path.
df_split = pd.DataFrame(meta_df['categories'].tolist(), index=meta_df.index)

# Step 2: Rename columns to category_1 ... category_N (N = longest path in the data)
df_split.columns = [f'category_{i+1}' for i in range(df_split.shape[1])]


# Encode missing levels as a new category: the row's own path length
# (fillna with a Series aligns on the index, so each row gets its own value).
for col in df_split.columns:
    df_split[col] = df_split[col].fillna(meta_df['path_length'])


# Step 3: Concatenate the new split columns back to the original DataFrame (meta_df).
# Category columns left over from an earlier run of this cell are dropped first,
# so re-running the cell does not create duplicate column labels.
meta_df = pd.concat(
    [meta_df.drop(columns=df_split.columns, errors='ignore'), df_split],
    axis=1,
)


meta_df.info()


In [None]:
# Count of missing values in each column of meta_df.
print(meta_df.isnull().sum())


# Image and video count

In [10]:
# For each media column, store the length of every row's value as a count column.
for media_col, count_col in (('images', 'num_images'), ('videos', 'num_videos')):
    meta_df[count_col] = meta_df[media_col].apply(len)


# Feature string parsing

## feature df

In [None]:
# Print the `details` value of one randomly sampled row to inspect which keys it holds.
# The sample is unseeded, so a different row is shown on every run.
print(meta_df['details'].sample(n=1).iloc[0])


In [74]:
# Working frame for feature extraction. `.copy()` makes it independent of
# meta_df, so the columns added to df_feature later are written to a real frame
# rather than to a slice (which triggers pandas' SettingWithCopyWarning).
df_feature = meta_df[['title', 'details', 'parent_asin']].copy()

In [None]:
# Show 10 randomly chosen rows of df_feature (unseeded, so the rows differ between runs).
display(df_feature.sample(n=10))

In [None]:
# unique_keys = set()
# for details_dict in df_feature['details']:
#     if isinstance(details_dict, dict):  # Ensure it's a dictionary
#         unique_keys.update(details_dict.keys())

# # Display the unique keys
# print(unique_keys)

In [None]:
# Keys of the `details` dictionary that become their own columns.
# Every entry needs its trailing comma: without it Python silently concatenates
# adjacent string literals into one bogus key (this previously merged
# 'Date First Available' and 'Number of Pieces', losing both columns).
keys_to_extract = [
    'Brand',
    'Color',
    'Item Weight',
    'Package Dimensions',
    'Is Discontinued By Manufacturer',
    'Manufacturer Part Number',
    'OEM Part Number',
    'Date First Available',
    'Number of Pieces',
    'Exterior Finish',
    'Handle Type',
    'Special Feature',
    'Included Components',
]


# Loop over the list of keys and create new columns in the DataFrame
for key in keys_to_extract:
    # Spaces become underscores, then 'By' becomes '_By'.
    # NOTE(review): because spaces are replaced first, 'Is Discontinued By Manufacturer'
    # ends up as 'Is_Discontinued__By_Manufacturer' (double underscore). Kept as-is so
    # existing column names do not change — confirm whether that is intended.
    column_name = key.replace(' ', '_').replace('By', '_By')
    # Rows whose `details` value is not a dict (e.g. missing) yield None instead of
    # raising AttributeError; `key=key` binds the current key to the lambda.
    df_feature[column_name] = df_feature['details'].apply(
        lambda details, key=key: details.get(key) if isinstance(details, dict) else None
    )


# df_feature['Brand'] = df_feature['details'].apply(lambda x: x.get('Brand', None))
# df_feature['Color'] = df_feature['details'].apply(lambda x: x.get('Color', None))
# df_feature['Item_Weight'] = df_feature['details'].apply(lambda x: x.get('Item Weight', None))
# df_feature['Package_Dimensions'] = df_feature['details'].apply(lambda x: x.get('Package Dimensions', None))
# df_feature['Is_Discontinued'] = df_feature['details'].apply(lambda x: x.get('Is Discontinued By Manufacturer', None))
# df_feature['Manufacturer_Part_Number'] = df_feature['details'].apply(lambda x: x.get('Manufacturer Part Number', None))
# df_feature['OEM_Part_Number'] = df_feature['details'].apply(lambda x: x.get('OEM Part Number', None))
# df_feature['Date_First_Available'] = df_feature['details'].apply(lambda x: x.get('Date First Available', None))

# Number of Pieces
# Exterior Finish
# Handle Type
# Special Feature
# Included Components

In [None]:
# Show 10 randomly chosen rows of df_feature to spot-check the extracted detail columns
# (unseeded, so the rows differ between runs).
display(df_feature.sample(n=10))

In [None]:
# Count of missing values in each column of df_feature.
print(df_feature.isnull().sum())
