In [6]:
import torch
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
import time
import os
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# Read the electronics reviews dataset from parquet into a DataFrame
df = pd.read_parquet('data/reviews_electronics_500k.parquet')

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

# Instantiate the sentence-embedding model directly on the chosen device
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)
print("✅ Model loaded successfully.")

Using device: cuda


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


✅ Model loaded successfully.


In [None]:

# 1. Prepare the text data
# Embed the 'reviewText' column. Sentences are taken in DataFrame row order,
# so row i of `embeddings` corresponds to df.iloc[i]; the similarity-search
# cell relies on this positional alignment when it calls df.iloc[top_index].
sentences = df['reviewText'].tolist()

print(f"Starting embedding for {len(sentences):,} reviews...")
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() follows the wall clock and can jump if the
# system time is adjusted during this multi-minute run.
start_time = time.perf_counter()

# 2. Generate Embeddings (one vector per review, returned as a NumPy array)
embeddings = model.encode(
    sentences,
    batch_size=128,
    show_progress_bar=True,
    convert_to_numpy=True
)

end_time = time.perf_counter()
duration = end_time - start_time

# Fail loudly if the embedding matrix ever gets out of step with the input
# rows: every downstream positional lookup would silently return wrong reviews.
assert embeddings.shape[0] == len(sentences), (
    f"Expected {len(sentences)} embeddings, got {embeddings.shape[0]}"
)

print(f"\n✅ Success! Generated {embeddings.shape[0]} embeddings.")
print(f"Dimension of each vector: {embeddings.shape[1]}")
print(f"Total Time: {duration/60:.2f} minutes")
print(f"Speed: {len(sentences)/duration:.1f} reviews per second")

Starting embedding for 379,019 reviews...


Batches:   0%|          | 0/2962 [00:00<?, ?it/s]


✅ Success! Generated 379019 embeddings.
Dimension of each vector: 384
Total Time: 3.51 minutes
Speed: 1797.5 reviews per second


In [None]:
# Destination of the serialized embedding matrix (NumPy binary .npy format)
embeddings_path = 'models/review_embeddings_500k.npy'

# Ensure the parent directory exists; a no-op when it is already there
os.makedirs(os.path.dirname(embeddings_path), exist_ok=True)

# Persist the in-memory `embeddings` array to disk
np.save(embeddings_path, embeddings)

print(f"✅ Embeddings saved to '{embeddings_path}'")

✅ Embeddings saved to 'models/review_embeddings_500k.npy'


In [11]:
# Free-text search phrase to look up among the embedded reviews
query = "awesome batetry life"

# Encode the query as a one-element batch so the result is a 2-D array,
# which is the input shape cosine_similarity works with
query_vector = model.encode([query])

# Cosine similarity of the query against every row of `embeddings`;
# the result has a single row (one per query)
similarities = cosine_similarity(query_vector, embeddings)

# Position of the highest score; with a single query row the flattened
# argmax is also the column (review) index
top_index = np.argmax(similarities)

# Look up the matching review and its score once, then report
best_match = df.iloc[top_index]
best_score = similarities[0][top_index]

report_lines = [
    f"Query: {query}",
    "-" * 30,
    f"Most Similar Review Found (Index {top_index}):",
    f"Product: {best_match['title']}",
    f"Review: {best_match['reviewText']}",
    f"Similarity Score: {best_score:.4f}",
]
for line in report_lines:
    print(line)

Query: awesome batetry life
------------------------------
Most Similar Review Found (Index 265269):
Product: Nikon EN-MH2-B4/MH-73 2 hour Charger with 4 2300mAh Ni-MH AA Rechargeable Batteries - Retail Packaging
Review: This is ideal to keep my batt,s ready to go when needed!!
Similarity Score: 0.5875


In [10]:
# Display the model's repr to inspect its module pipeline and settings
# (e.g. max sequence length, pooling mode, embedding dimension).
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)