In [8]:
import pandas as pd
import torch
from transformers import BertModel, BertTokenizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tqdm import tqdm

# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Load the CSV file
file_path = 'test_with_predictions_MPM_desc.csv'
df = pd.read_csv(file_path)

# Initialize BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name).to(device)  # Move model to GPU

# Function to encode text using BERT
def encode_text(text):
    inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding='max_length').to(device)  # Move inputs to GPU
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()  # Move output to CPU before converting to numpy

# Filter dataframe for ships_idx == 1032
df_filtered = df[df['ships_idx'] == 1024]

# Calculate similarity for each row in the filtered dataframe
similarities = []
for index, row in tqdm(df_filtered.iterrows(), total=df_filtered.shape[0], desc="Processing rows"):
    tag_description = row['tag_description']
    p_thing_property = row['p_thing_property']
    
    # Skip rows where either tag_description or p_thing_property is null
    if pd.isnull(tag_description) or pd.isnull(p_thing_property):
        similarities.append(np.nan)
        continue
    
    embedding1 = encode_text(tag_description)
    embedding2 = encode_text(p_thing_property)
    similarity_score = cosine_similarity([embedding1], [embedding2])[0][0]
    similarities.append(similarity_score)

# Add the similarity column to the filtered dataframe
df_filtered[model_name] = similarities

# Save the updated dataframe to a new CSV file
output_file_path = 'test_with_predictions_MPM_desc_similarity.csv'
df_filtered.to_csv(output_file_path, index=False)

print("Updated file saved successfully.")


Using device: cuda


Processing rows: 100%|██████████| 1081/1081 [00:21<00:00, 50.82it/s]

Updated file saved successfully.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered[model_name] = similarities
