# Data Preprocessing

This notebook preprocesses the Amazon Handmade Products dataset:
- **Product Reviews**: `Handmade_Products.jsonl.gz` ‚Üí `review.csv`
- **Product Metadata**: `meta_Handmade_Products.jsonl.gz` ‚Üí `metadata.csv`


In [1]:
# Import required libraries
import json
import gzip
import os
import pandas as pd
import numpy as np
from datetime import datetime
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")


2025-12-24 07:27:34.177917: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766561254.363618      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766561254.417630      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1766561254.869710      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766561254.869746      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766561254.869749      55 computation_placer.cc:177] computation placer alr

Libraries imported successfully!


## 1. Process Product Reviews Data


In [2]:
def load_jsonl(filepath):
    data = []
    try:
        # Changed gzip.open to standard open()
        with open(filepath, 'r', encoding='utf-8') as f:
            for line in tqdm(f, desc=f"Loading {os.path.basename(filepath)}"):
                if line.strip():
                    data.append(json.loads(line))
    except Exception as e:
        raise Exception(f"Error loading {filepath}: {str(e)}")
    
    return data

# Load review data
print("Loading review data from Handmade_Products.jsonl.gz...")
reviews = load_jsonl('/kaggle/input/amazon-product-review-handmade-products/Handmade_Products.jsonl/Handmade_Products.jsonl')
print(f"Loaded {len(reviews)} reviews")


Loading review data from Handmade_Products.jsonl.gz...


Loading Handmade_Products.jsonl: 664162it [00:08, 81294.43it/s] 

Loaded 664162 reviews





In [3]:
# Define columns to retain
columns_to_retain = ['user_id', 'asin', 'parent_asin', 'rating', 'title', 'text', 
                     'helpful_vote', 'verified_purchase', 'timestamp']

# Process reviews: filter columns and convert timestamp
processed_reviews = []
for review in tqdm(reviews, desc="Processing reviews"):
    processed_review = {}
    for col in columns_to_retain:
        if col == 'timestamp':
            # Convert unix timestamp to datetime
            unix_timestamp = review.get(col, None)
            if unix_timestamp is not None:
                try:
                    # Handle both seconds and milliseconds timestamps
                    if unix_timestamp > 1e10:  # Likely milliseconds
                        unix_timestamp = unix_timestamp / 1000
                    # Convert unix timestamp to datetime
                    processed_review[col] = datetime.fromtimestamp(unix_timestamp).strftime('%Y-%m-%d %H:%M:%S')
                except (ValueError, OSError, TypeError):
                    # Handle invalid timestamps
                    processed_review[col] = None
            else:
                processed_review[col] = None
        else:
            processed_review[col] = review.get(col, None)
    processed_reviews.append(processed_review)

# Convert to DataFrame
df_reviews = pd.DataFrame(processed_reviews)
print(f"\nReviews DataFrame shape: {df_reviews.shape}")
print(f"\nColumns: {list(df_reviews.columns)}")
print(f"\nFirst few rows:")
print(df_reviews.head())


Processing reviews: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 664162/664162 [00:03<00:00, 194284.63it/s]



Reviews DataFrame shape: (664162, 9)

Columns: ['user_id', 'asin', 'parent_asin', 'rating', 'title', 'text', 'helpful_vote', 'verified_purchase', 'timestamp']

First few rows:
                        user_id        asin parent_asin  rating  \
0  AF7OANMNHQJC3PD4HRPX2FATECPA  B08GPJ1MSN  B08GPJ1MSN     5.0   
1  AGMJ3EMDVL6OWBJF7CA5RGJLXN5A  B084TWHS7W  B084TWHS7W     5.0   
2  AEYORY2AVPMCPDV57CE337YU5LXA  B07V3NRQC4  B07V3NRQC4     5.0   
3  AEINY4XOINMMJCK5GZ3M6MMHBN6A  B071ZMDK26  B071ZMDK26     5.0   
4  AGCPAPUHXYA3EEIL2KGSQTGO5HRA  B01MPVZ4YP  B01MPVZ4YP     5.0   

                                        title  \
0                            Beautiful colors   
1  You simply must order order more than one!   
2                                       Great   
3                  Well made and so beautiful   
4            Smells just like the real thing!   

                                                text  helpful_vote  \
0  I bought one for myself and one for my grandda...   

In [4]:
# Save reviews to CSV
print("Saving reviews to review.csv...")
df_reviews.to_csv('review.csv', index=False)
file_size_mb = os.path.getsize('review.csv') / (1024 * 1024)
print(f"Successfully saved {len(df_reviews)} reviews to review.csv")
print(f"File size: {file_size_mb:.2f} MB")


Saving reviews to review.csv...
Successfully saved 664162 reviews to review.csv
File size: 152.36 MB


## 2. Filter Reviews by User and Product Constraints


In [5]:
# Filter reviews based on minimum constraints
# Minimum number of reviews per user = 5
# Minimum number of reviews per product = 5

MIN_USER_REVIEWS = 3
MIN_PRODUCT_REVIEWS = 3

print("Filtering reviews based on constraints...")
print(f"  ‚Ä¢ Minimum reviews per user: {MIN_USER_REVIEWS}")
print(f"  ‚Ä¢ Minimum reviews per product: {MIN_PRODUCT_REVIEWS}")

# First, filter out rows with missing user_id or parent_asin
df_reviews_for_filtering = df_reviews[
    df_reviews['user_id'].notna() & 
    df_reviews['parent_asin'].notna()
].copy()

print(f"\nInitial reviews: {len(df_reviews):,}")
print(f"Reviews with valid user_id and parent_asin: {len(df_reviews_for_filtering):,}")

# Count reviews per user and per product
user_review_counts = df_reviews_for_filtering['user_id'].value_counts()
product_review_counts = df_reviews_for_filtering['parent_asin'].value_counts()

print(f"\nBefore filtering:")
print(f"  ‚Ä¢ Users: {len(user_review_counts):,}")
print(f"  ‚Ä¢ Products: {len(product_review_counts):,}")

# Get valid users and products
valid_users = user_review_counts[user_review_counts >= MIN_USER_REVIEWS].index
valid_products = product_review_counts[product_review_counts >= MIN_PRODUCT_REVIEWS].index

print(f"\nAfter applying minimum constraints:")
print(f"  ‚Ä¢ Valid users (‚â•{MIN_USER_REVIEWS} reviews): {len(valid_users):,}")
print(f"  ‚Ä¢ Valid products (‚â•{MIN_PRODUCT_REVIEWS} reviews): {len(valid_products):,}")

# Filter reviews to keep only valid users and products
df_reviews_filtered = df_reviews_for_filtering[
    (df_reviews_for_filtering['user_id'].isin(valid_users)) & 
    (df_reviews_for_filtering['parent_asin'].isin(valid_products))
].copy()

print(f"\nFiltered reviews: {len(df_reviews_filtered):,}")
print(f"Reduction: {len(df_reviews) - len(df_reviews_filtered):,} reviews removed ({(1 - len(df_reviews_filtered)/len(df_reviews))*100:.2f}%)")

# Verify the constraints are met after filtering
final_user_counts = df_reviews_filtered['user_id'].value_counts()
final_product_counts = df_reviews_filtered['parent_asin'].value_counts()

print(f"\nVerification:")
print(f"  ‚Ä¢ Users with ‚â•{MIN_USER_REVIEWS} reviews: {(final_user_counts >= MIN_USER_REVIEWS).sum()}/{len(final_user_counts)}")
print(f"  ‚Ä¢ Products with ‚â•{MIN_PRODUCT_REVIEWS} reviews: {(final_product_counts >= MIN_PRODUCT_REVIEWS).sum()}/{len(final_product_counts)}")
print(f"  ‚Ä¢ Final unique users: {df_reviews_filtered['user_id'].nunique():,}")
print(f"  ‚Ä¢ Final unique products: {df_reviews_filtered['parent_asin'].nunique():,}")


Filtering reviews based on constraints...
  ‚Ä¢ Minimum reviews per user: 3
  ‚Ä¢ Minimum reviews per product: 3

Initial reviews: 664,162
Reviews with valid user_id and parent_asin: 664,162

Before filtering:
  ‚Ä¢ Users: 586,613
  ‚Ä¢ Products: 164,728

After applying minimum constraints:
  ‚Ä¢ Valid users (‚â•3 reviews): 11,984
  ‚Ä¢ Valid products (‚â•3 reviews): 47,006

Filtered reviews: 32,510
Reduction: 631,652 reviews removed (95.11%)

Verification:
  ‚Ä¢ Users with ‚â•3 reviews: 6986/11180
  ‚Ä¢ Products with ‚â•3 reviews: 3278/15810
  ‚Ä¢ Final unique users: 11,180
  ‚Ä¢ Final unique products: 15,810


In [6]:
# Save filtered reviews to CSV
print("\nSaving filtered reviews to review_filtered.csv...")
df_reviews_filtered.to_csv('review_filtered.csv', index=False)
file_size_mb = os.path.getsize('review_filtered.csv') / (1024 * 1024)
print(f"Successfully saved {len(df_reviews_filtered)} filtered reviews to review_filtered.csv")
print(f"File size: {file_size_mb:.2f} MB")



Saving filtered reviews to review_filtered.csv...
Successfully saved 32510 filtered reviews to review_filtered.csv
File size: 8.22 MB


## 3. Process Product Metadata


In [7]:
# Load metadata
print("Loading metadata from meta_Handmade_Products.jsonl.gz...")
metadata = load_jsonl('/kaggle/input/amazon-product-review-handmade-products/meta_Handmade_Products.jsonl/meta_Handmade_Products.jsonl')
print(f"Loaded {len(metadata)} metadata records")

Loading metadata from meta_Handmade_Products.jsonl.gz...


Loading meta_Handmade_Products.jsonl: 164817it [00:06, 25006.43it/s]

Loaded 164817 metadata records





In [8]:
# Convert metadata to DataFrame (keep all columns)
df_metadata = pd.DataFrame(metadata)
print(f"\nMetadata DataFrame shape: {df_metadata.shape}")
print(f"\nColumns: {list(df_metadata.columns)}")
print(f"\nFirst few rows:")
print(df_metadata.head())



Metadata DataFrame shape: (164817, 14)

Columns: ['main_category', 'title', 'average_rating', 'rating_number', 'features', 'description', 'price', 'images', 'videos', 'store', 'categories', 'details', 'parent_asin', 'bought_together']

First few rows:
  main_category                                              title  \
0      Handmade  Daisy Keychain Wristlet Gray Fabric Key fob La...   
1      Handmade  Anemone Jewelry Beauteous November Birthstone ...   
2      Handmade      Silver Triangle Earrings with Chevron Pattern   
3      Handmade  Las Vegas Subway Sign Print - Caesar's, Freemo...   
4      Handmade  Round Cut Cubic Zirconia Stud Earrings Fashion...   

   average_rating  rating_number  \
0             4.5             12   
1             4.1             10   
2             5.0              1   
3             1.0              1   
4             4.2              2   

                                            features  \
0  [High Quality Fabrics, Antique Brass Metallic ... 

In [9]:
# Save metadata to CSV
print("Saving metadata to metadata.csv...")
df_metadata.to_csv('metadata.csv', index=False)
file_size_mb = os.path.getsize('metadata.csv') / (1024 * 1024)
print(f"Successfully saved {len(df_metadata)} metadata records to metadata.csv")
print(f"File size: {file_size_mb:.2f} MB")


Saving metadata to metadata.csv...
Successfully saved 164817 metadata records to metadata.csv
File size: 347.67 MB


## 4. Create Item-Item Interaction Matrix using Text Embeddings (Filtered Data)


In [10]:
# Prepare text data for embeddings
# Filter metadata to only include products that have at least MIN_PRODUCT_REVIEWS reviews
print("Preparing text data for embeddings (using filtered products only)...")

# Get valid products from filtered reviews
valid_product_asins = df_reviews_filtered['parent_asin'].unique()
print(f"Products with ‚â•{MIN_PRODUCT_REVIEWS} reviews: {len(valid_product_asins):,}")
print(f"Sample product ASINs from reviews: {list(valid_product_asins[:5])}")

# Check what columns are available in metadata
print(f"\nMetadata columns: {list(df_metadata.columns)}")
if 'asin' in df_metadata.columns:
    print(f"Sample 'asin' values from metadata: {list(df_metadata['asin'].head().values)}")
if 'parent_asin' in df_metadata.columns:
    print(f"Sample 'parent_asin' values from metadata: {list(df_metadata['parent_asin'].head().values)}")

# Filter metadata to only include valid products
# Try multiple column matching strategies
df_metadata_filtered = None
if 'parent_asin' in df_metadata.columns:
    df_metadata_filtered = df_metadata[df_metadata['parent_asin'].isin(valid_product_asins)].copy()
    print(f"Matched using 'parent_asin' column: {len(df_metadata_filtered):,} products")
elif 'asin' in df_metadata.columns:
    df_metadata_filtered = df_metadata[df_metadata['asin'].isin(valid_product_asins)].copy()
    print(f"Matched using 'asin' column: {len(df_metadata_filtered):,} products")
else:
    # If no ASIN column, use index matching (assuming index is ASIN)
    df_metadata_filtered = df_metadata[df_metadata.index.isin(valid_product_asins)].copy()
    print(f"Matched using index: {len(df_metadata_filtered):,} products")

if df_metadata_filtered is None or len(df_metadata_filtered) == 0:
    print(f"\nWARNING: No metadata found for filtered products!")
    print(f"This might indicate a mismatch between 'parent_asin' in reviews and metadata columns.")
    print(f"Trying to find intersection...")
    # Try to find any overlap
    if 'asin' in df_metadata.columns:
        metadata_asins = set(df_metadata['asin'].unique())
    elif 'parent_asin' in df_metadata.columns:
        metadata_asins = set(df_metadata['parent_asin'].unique())
    else:
        metadata_asins = set(df_metadata.index.unique())
    
    review_asins = set(valid_product_asins)
    overlap = review_asins & metadata_asins
    print(f"Overlap between review ASINs and metadata ASINs: {len(overlap):,}")
    if len(overlap) == 0:
        raise ValueError("No matching products found between reviews and metadata. Check column names.")
    # Use the overlap
    valid_product_asins = list(overlap)
    if 'parent_asin' in df_metadata.columns:
        df_metadata_filtered = df_metadata[df_metadata['parent_asin'].isin(valid_product_asins)].copy()
    elif 'asin' in df_metadata.columns:
        df_metadata_filtered = df_metadata[df_metadata['asin'].isin(valid_product_asins)].copy()
    else:
        df_metadata_filtered = df_metadata[df_metadata.index.isin(valid_product_asins)].copy()

print(f"\nMetadata filtered to {len(df_metadata_filtered):,} products (from {len(df_metadata):,} total)")

def combine_product_text(row):
    """Combine title, features, and description into a single text string."""
    # Handle title
    title_val = row.get('title', '')
    if title_val is None or (not isinstance(title_val, (list, np.ndarray)) and pd.isna(title_val)):
        title = ''
    else:
        title = str(title_val)
    
    # Handle features (can be list, array, or string)
    features_val = row.get('features', '')
    if features_val is None:
        features = ''
    elif isinstance(features_val, (list, np.ndarray)):
        # Handle list/array features - check if it's empty first
        if len(features_val) == 0:
            features = ''
        else:
            features = ' '.join([str(f) for f in features_val if f is not None and str(f).strip() != ''])
    elif pd.isna(features_val):
        features = ''
    else:
        features = str(features_val)
    
    # Handle description
    description_val = row.get('description', '')
    if description_val is None or (not isinstance(description_val, (list, np.ndarray)) and pd.isna(description_val)):
        description = ''
    else:
        description = str(description_val)
    
    # Combine all text
    combined_text = f"{title} {features} {description}".strip()
    return combined_text if combined_text else ""

# Create combined text for each product
df_metadata_filtered['combined_text'] = df_metadata_filtered.apply(combine_product_text, axis=1)

# Filter out products with empty text
df_metadata_filtered = df_metadata_filtered[df_metadata_filtered['combined_text'].str.len() > 0].copy()
print(f"Products with valid text: {len(df_metadata_filtered):,}")

# Get product ASINs for indexing (ensure they match filtered reviews and have valid text)
# Use the same column that was used for filtering
if 'parent_asin' in df_metadata_filtered.columns:
    product_asins = df_metadata_filtered['parent_asin'].values.tolist()
elif 'asin' in df_metadata_filtered.columns:
    product_asins = df_metadata_filtered['asin'].values.tolist()
else:
    product_asins = df_metadata_filtered.index.values.tolist()

# Ensure all product_asins are in valid_product_asins (from filtered reviews)
# This should already be the case, but double-check
product_asins = [asin for asin in product_asins if asin in valid_product_asins]

print(f"Number of products to process: {len(product_asins):,}")
if len(product_asins) == 0:
    print("WARNING: No products found after filtering. This will cause an error in embedding generation.")


Preparing text data for embeddings (using filtered products only)...
Products with ‚â•3 reviews: 15,810
Sample product ASINs from reviews: ['B07PWBRXJG', 'B0855GMD9K', 'B07HN8PP1V', 'B07PVC79VH', 'B07T947ZG1']

Metadata columns: ['main_category', 'title', 'average_rating', 'rating_number', 'features', 'description', 'price', 'images', 'videos', 'store', 'categories', 'details', 'parent_asin', 'bought_together']
Sample 'parent_asin' values from metadata: ['B07NTK7T5P', 'B0751M85FV', 'B01HYNE114', 'B07TKZF3Z1', 'B0BKBJT5MM']
Matched using 'parent_asin' column: 15,810 products

Metadata filtered to 15,810 products (from 164,817 total)
Products with valid text: 15,810
Number of products to process: 15,810


In [11]:
# Load pre-trained sentence transformer model
print("Loading sentence transformer model...")
# Using a lightweight model for efficiency (can be changed to 'all-MiniLM-L6-v2' or 'all-mpnet-base-v2' for better quality)
model = SentenceTransformer('all-MiniLM-L6-v2')
print("Model loaded successfully!")


Loading sentence transformer model...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Model loaded successfully!


In [12]:
# Generate embeddings for filtered products only
print("Generating text embeddings for filtered products...")
print(f"Number of product_asins to process: {len(product_asins):,}")
print(f"Metadata filtered shape: {df_metadata_filtered.shape}")

# Check if we have product_asins
if len(product_asins) == 0:
    raise ValueError("No products found to generate embeddings for. Check filtering constraints.")

# Filter metadata to match product_asins order
# Use the same column that was used for filtering
if 'parent_asin' in df_metadata_filtered.columns:
    df_metadata_for_embeddings = df_metadata_filtered[df_metadata_filtered['parent_asin'].isin(product_asins)].copy()
    if len(df_metadata_for_embeddings) == 0:
        raise ValueError(f"No metadata found for {len(product_asins)} product ASINs. Check if 'parent_asin' column matches 'parent_asin' from reviews.")
    # Sort to match product_asins order (only for products that exist in metadata)
    existing_asins = [asin for asin in product_asins if asin in df_metadata_for_embeddings['parent_asin'].values]
    if len(existing_asins) == 0:
        raise ValueError("No matching products found between metadata and filtered reviews.")
    df_metadata_for_embeddings = df_metadata_for_embeddings.set_index('parent_asin').loc[existing_asins].reset_index()
    product_asins_final = df_metadata_for_embeddings['parent_asin'].values.tolist()
elif 'asin' in df_metadata_filtered.columns:
    df_metadata_for_embeddings = df_metadata_filtered[df_metadata_filtered['asin'].isin(product_asins)].copy()
    if len(df_metadata_for_embeddings) == 0:
        raise ValueError(f"No metadata found for {len(product_asins)} product ASINs. Check if 'asin' column matches 'parent_asin' from reviews.")
    # Sort to match product_asins order (only for products that exist in metadata)
    existing_asins = [asin for asin in product_asins if asin in df_metadata_for_embeddings['asin'].values]
    if len(existing_asins) == 0:
        raise ValueError("No matching products found between metadata and filtered reviews.")
    df_metadata_for_embeddings = df_metadata_for_embeddings.set_index('asin').loc[existing_asins].reset_index()
    product_asins_final = df_metadata_for_embeddings['asin'].values.tolist()
else:
    df_metadata_for_embeddings = df_metadata_filtered[df_metadata_filtered.index.isin(product_asins)].copy()
    if len(df_metadata_for_embeddings) == 0:
        raise ValueError(f"No metadata found for {len(product_asins)} product ASINs. Check if index matches 'parent_asin' from reviews.")
    # Sort to match product_asins order (only for products that exist in metadata)
    existing_asins = [asin for asin in product_asins if asin in df_metadata_for_embeddings.index.values]
    if len(existing_asins) == 0:
        raise ValueError("No matching products found between metadata and filtered reviews.")
    df_metadata_for_embeddings = df_metadata_for_embeddings.loc[existing_asins]
    product_asins_final = df_metadata_for_embeddings.index.values.tolist()

texts = df_metadata_for_embeddings['combined_text'].tolist()
# Update product_asins to match the order of texts/embeddings
product_asins = product_asins_final

print(f"Products with valid metadata and text: {len(texts):,}")

# Check if we have any texts to process
if len(texts) == 0:
    raise ValueError("No valid text found for embedding generation. All products may have empty combined_text.")

# Generate embeddings in batches for efficiency
batch_size = 32
embeddings = []
for i in tqdm(range(0, len(texts), batch_size), desc="Generating embeddings"):
    batch_texts = texts[i:i+batch_size]
    if len(batch_texts) > 0:  # Ensure batch is not empty
        batch_embeddings = model.encode(batch_texts, show_progress_bar=False)
        embeddings.append(batch_embeddings)

# Check if any embeddings were generated
if len(embeddings) == 0:
    raise ValueError("No embeddings were generated. Check if texts are valid and model is working correctly.")

# Concatenate all embeddings
embeddings = np.vstack(embeddings)
print(f"Generated embeddings shape: {embeddings.shape}")
print(f"Embedding dimension: {embeddings.shape[1]}")
print(f"Number of products with embeddings: {len(product_asins):,}")


Generating text embeddings for filtered products...
Number of product_asins to process: 15,810
Metadata filtered shape: (15810, 15)
Products with valid metadata and text: 15,810


Generating embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 495/495 [00:27<00:00, 17.68it/s]

Generated embeddings shape: (15810, 384)
Embedding dimension: 384
Number of products with embeddings: 15,810





In [13]:
# Compute cosine similarity matrix
print("Computing cosine similarity matrix...")
similarity_matrix = cosine_similarity(embeddings)
print(f"Similarity matrix shape: {similarity_matrix.shape}")
print(f"Similarity values range: [{similarity_matrix.min():.4f}, {similarity_matrix.max():.4f}]")


Computing cosine similarity matrix...
Similarity matrix shape: (15810, 15810)
Similarity values range: [-0.2348, 1.0000]


In [14]:
# Convert similarity matrix to DataFrame with ASINs as index and columns
print("Creating item-item interaction matrix DataFrame...")
df_similarity = pd.DataFrame(
    similarity_matrix,
    index=product_asins,
    columns=product_asins
)

print(f"Item-item matrix shape: {df_similarity.shape}")
print(f"\nSample of similarity matrix:")
print(df_similarity.iloc[:5, :5])


Creating item-item interaction matrix DataFrame...
Item-item matrix shape: (15810, 15810)

Sample of similarity matrix:
            B01676ZK7I  B077LRWPY5  B0BN4DZJ3K  B097B8WH61  B01MT90C67
B01676ZK7I    1.000000    0.241234    0.388235    0.097201    0.214085
B077LRWPY5    0.241234    1.000000    0.357865    0.352327    0.253351
B0BN4DZJ3K    0.388235    0.357865    1.000000    0.183518    0.347541
B097B8WH61    0.097201    0.352327    0.183518    1.000000    0.362750
B01MT90C67    0.214085    0.253351    0.347541    0.362750    1.000000


In [15]:
# Save item-item interaction matrix to CSV
print("Saving item-item interaction matrix to item_item_similarity.csv...")
df_similarity.to_csv('item_item_similarity.csv')
file_size_mb = os.path.getsize('item_item_similarity.csv') / (1024 * 1024)
print(f"Successfully saved similarity matrix to item_item_similarity.csv")
print(f"File size: {file_size_mb:.2f} MB")
print(f"\nMatrix contains {len(df_similarity):,} x {len(df_similarity):,} products")
print(f"Each value represents cosine similarity between product text embeddings")


Saving item-item interaction matrix to item_item_similarity.csv...
Successfully saved similarity matrix to item_item_similarity.csv
File size: 2573.79 MB

Matrix contains 15,810 x 15,810 products
Each value represents cosine similarity between product text embeddings


## 4. Create User-Item Interaction Matrix


In [16]:
# Prepare review data for user-item matrix
print("Preparing user-item interaction matrix...")

# Filter out rows with missing user_id, product_id, or rating
df_reviews_clean = df_reviews[
    df_reviews['user_id'].notna() & 
    df_reviews['parent_asin'].notna() & 
    df_reviews['rating'].notna()
].copy()

# Convert rating to numeric if it's not already
df_reviews_clean['rating'] = pd.to_numeric(df_reviews_clean['rating'], errors='coerce')

# Remove rows with invalid ratings
df_reviews_clean = df_reviews_clean[df_reviews_clean['rating'].notna()]

print(f"Valid reviews for matrix: {len(df_reviews_clean):,}")
print(f"Unique users: {df_reviews_clean['user_id'].nunique():,}")
print(f"Unique products: {df_reviews_clean['parent_asin'].nunique():,}")

# Check for duplicate user-product pairs
duplicates = df_reviews_clean.groupby(['user_id', 'parent_asin']).size()
if (duplicates > 1).any():
    print(f"\nWarning: Found {len(duplicates[duplicates > 1])} user-product pairs with multiple ratings")
    print("Taking average rating for duplicate pairs...")
    # Take average rating for duplicate user-product pairs
    df_reviews_clean = df_reviews_clean.groupby(['user_id', 'parent_asin'])['rating'].mean().reset_index()
    print(f"After aggregation: {len(df_reviews_clean):,} unique user-product pairs")


Preparing user-item interaction matrix...
Valid reviews for matrix: 664,162
Unique users: 586,613
Unique products: 164,728

Taking average rating for duplicate pairs...
After aggregation: 656,096 unique user-product pairs


In [None]:
# Create user-item interaction matrix using pivot_table
print("\nCreating user-item interaction matrix...")
df_user_item = df_reviews_clean.pivot_table(
    index='user_id',
    columns='parent_asin',
    values='rating',
    fill_value=0  # Fill missing values with 0 (no interaction)
)

print(f"User-item matrix shape: {df_user_item.shape}")
print(f"  ‚Ä¢ Rows (users): {df_user_item.shape[0]:,}")
print(f"  ‚Ä¢ Columns (products): {df_user_item.shape[1]:,}")
print(f"  ‚Ä¢ Total cells: {df_user_item.size:,}")
print(f"  ‚Ä¢ Non-zero cells: {(df_user_item != 0).sum().sum():,}")
print(f"  ‚Ä¢ Sparsity: {(1 - (df_user_item != 0).sum().sum() / df_user_item.size) * 100:.2f}%")

print(f"\nRating statistics:")
print(f"  ‚Ä¢ Min rating: {df_user_item[df_user_item != 0].min().min():.2f}")
print(f"  ‚Ä¢ Max rating: {df_user_item.max().max():.2f}")
print(f"  ‚Ä¢ Mean rating: {df_user_item[df_user_item != 0].mean().mean():.2f}")

print(f"\nSample of user-item matrix:")
print(df_user_item.iloc[:5, :5])



Creating user-item interaction matrix...


In [None]:
# Save user-item interaction matrix to CSV
print("Saving user-item interaction matrix to user_item_matrix.csv...")
df_user_item.to_csv('user_item_matrix.csv')
file_size_mb = os.path.getsize('user_item_matrix.csv') / (1024 * 1024)
print(f"Successfully saved user-item matrix to user_item_matrix.csv")
print(f"File size: {file_size_mb:.2f} MB")
print(f"\nMatrix contains:")
print(f"  ‚Ä¢ {len(df_user_item):,} users (rows)")
print(f"  ‚Ä¢ {len(df_user_item.columns):,} products (columns)")
print(f"  ‚Ä¢ Each value represents the rating from a user for a product")
print(f"  ‚Ä¢ Missing interactions are represented as 0")


## 6. Summary


In [None]:
print("="*70)
print("DATA PREPROCESSING SUMMARY")
print("="*70)

print("\nüìä REVIEW DATA:")
print(f"  ‚Ä¢ Total Reviews: {len(df_reviews):,}")
print(f"  ‚Ä¢ Columns: {', '.join(df_reviews.columns)}")
print(f"  ‚Ä¢ Saved to: review.csv")

print("\nüîç FILTERED REVIEW DATA:")
print(f"  ‚Ä¢ Filtered Reviews: {len(df_reviews_filtered):,}")
print(f"  ‚Ä¢ Minimum reviews per user: {MIN_USER_REVIEWS}")
print(f"  ‚Ä¢ Minimum reviews per product: {MIN_PRODUCT_REVIEWS}")
print(f"  ‚Ä¢ Unique users (filtered): {df_reviews_filtered['user_id'].nunique():,}")
print(f"  ‚Ä¢ Unique products (filtered): {df_reviews_filtered['parent_asin'].nunique():,}")
print(f"  ‚Ä¢ Saved to: review_filtered.csv")

print("\nüì¶ METADATA:")
print(f"  ‚Ä¢ Total Records: {len(df_metadata):,}")
print(f"  ‚Ä¢ Columns: {len(df_metadata.columns)}")
print(f"  ‚Ä¢ Saved to: metadata.csv")

print("\nüîó ITEM-ITEM INTERACTION MATRIX (Filtered):")
print(f"  ‚Ä¢ Products processed: {len(df_similarity):,}")
print(f"  ‚Ä¢ Matrix dimensions: {df_similarity.shape[0]:,} x {df_similarity.shape[1]:,}")
print(f"  ‚Ä¢ Similarity range: [{similarity_matrix.min():.4f}, {similarity_matrix.max():.4f}]")
print(f"  ‚Ä¢ Embedding model: all-MiniLM-L6-v2")
print(f"  ‚Ä¢ Text features used: title + features + description")
print(f"  ‚Ä¢ Only includes products with ‚â•{MIN_PRODUCT_REVIEWS} reviews")
print(f"  ‚Ä¢ Saved to: item_item_similarity.csv")

print("\nüë• USER-ITEM INTERACTION MATRIX (Filtered):")
print(f"  ‚Ä¢ Users: {len(df_user_item):,} (‚â•{MIN_USER_REVIEWS} reviews each)")
print(f"  ‚Ä¢ Products: {len(df_user_item.columns):,} (‚â•{MIN_PRODUCT_REVIEWS} reviews each)")
print(f"  ‚Ä¢ Matrix dimensions: {df_user_item.shape[0]:,} x {df_user_item.shape[1]:,}")
print(f"  ‚Ä¢ Non-zero interactions: {(df_user_item != 0).sum().sum():,}")
print(f"  ‚Ä¢ Rating range: [{df_user_item[df_user_item != 0].min().min():.2f}, {df_user_item.max().max():.2f}]")
print(f"  ‚Ä¢ Saved to: user_item_matrix.csv")

print("\n‚úÖ PREPROCESSING COMPLETE!")
print("="*70)
