In [2]:

# %% [code]
import pandas as pd
import numpy as np
import torch
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.metrics.pairwise import cosine_similarity

# Initialize tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Function to get embeddings in batches
def get_embeddings(texts, batch_size=128):
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        with torch.no_grad():
            inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
            outputs = model(**inputs)
            embeddings.append(outputs.last_hidden_state.mean(dim=1).cpu().numpy())
    return np.vstack(embeddings)

# Load the dataset
df = pd.read_csv("cleaned_dataset_with_renamed_description.csv")

# Generate embeddings for all descriptions
embeddings = get_embeddings(df['description'].tolist())

# Calculate cosine similarities
cos_sim_matrix = cosine_similarity(embeddings)

# Data structures for triplet data
anchors, positives, negatives = [], [], []

# Generate triplets with a more sophisticated method
def generate_triplets(cos_sim_matrix, descriptions):
    for idx, anchor_desc in enumerate(descriptions):
        sorted_indices = np.argsort(cos_sim_matrix[idx])[::-1]
        positive_indices = sorted_indices[sorted_indices != idx]  # Exclude the anchor itself

        if len(positive_indices) > 0:
            # Choose the closest as the positive
            positive_idx = positive_indices[0]
            # Choose the farthest as the negative
            negative_idx = positive_indices[-1]  # This is simplistic; consider a more sophisticated selection
            
            anchors.append(anchor_desc)
            positives.append(descriptions[positive_idx])
            negatives.append(descriptions[negative_idx])

generate_triplets(cos_sim_matrix, df['description'].tolist())

# Save to DataFrame and then to CSV
triplet_df = pd.DataFrame({
    'anchor': anchors,
    'positive': positives,
    'negative': negatives
})
triplet_df.to_csv('triplet_data.csv', index=False)
print("Triplet data generated and saved to triplet_data.csv")


  from .autonotebook import tqdm as notebook_tqdm


OutOfMemoryError: CUDA out of memory. Tried to allocate 48.00 MiB. GPU 