In [1]:
import os
os.environ['HF_HOME'] = '/mnt/sagemaker-nvme/cache'
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
import torch
import pandas as pd
from sklearn.cluster import KMeans
from utils import last_token_pool, get_detailed_instruct
from transformers import AutoTokenizer, AutoModel

# Load your CSV file
file_path = 'low.csv'
data = pd.read_csv(file_path)

def get_formatted_persona_dim(row):
    task = 'Given a persona dimension description, retrieve semantically similar persona dimension descriptions.'
    persona = f"{row['name']}: {row['description']}. Candidate values: {row['candidate_values']}"
    return get_detailed_instruct(task, persona)

data['formatted'] = data.apply(get_formatted_persona_dim, axis=1)

In [2]:
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('Salesforce/SFR-Embedding-2_R')
model = AutoModel.from_pretrained('Salesforce/SFR-Embedding-2_R', device_map='auto')


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
# Get the embeddings
max_length = 4096
input_texts = data['formatted'].to_list()
batch_dict = tokenizer(input_texts, max_length=max_length, padding=True, truncation=True, return_tensors="pt")

with torch.no_grad():
    outputs = model(**batch_dict)
    embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
    print(embeddings)

tensor([[ 4.1695,  4.6872,  2.8439,  ...,  9.8889,  1.9589,  4.9600],
        [ 3.9163,  2.8792, -0.4433,  ..., 13.2968,  3.3441, -3.0337],
        [ 0.4857,  2.2519, -1.5539,  ..., 11.8453, -4.5226, -0.2279],
        ...,
        [-1.2200,  2.7208,  5.6652,  ..., 11.7049, -5.5550, -3.7038],
        [-3.4693,  2.8005,  4.0805,  ...,  7.9247, -4.8398,  0.1347],
        [ 1.7381, -2.5070,  0.9047,  ...,  8.5441,  0.7178, -6.1294]])


In [4]:
# Clustering
num_clusters = 20
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(embeddings)
data['cluster'] = clustering_model.labels_
data = data.sort_values(by='cluster')
data.to_csv('low_clustered.csv')

In [6]:


# Summarization
summarizer = pipeline('summarization')
def summarize_text(texts):
    combined_text = " ".join(texts)
    summary = summarizer(combined_text, max_length=50, min_length=25, do_sample=False)
    return summary[0]['summary_text']

data['summary'] = data.groupby('cluster')['candidate_values'].transform(lambda x: summarize_text(x.tolist()))

# Save the result
data.to_csv('/mnt/data/merged_user_persona_dimensions.csv', index=False)


NameError: name 'pipeline' is not defined