## Loading and preparing the dataset
El primer paso es cargar el dataset desde Hugging Face.

In [27]:
import pandas as pd

In [28]:
from datasets import load_dataset

# Dataset card:
# https://huggingface.co/datasets/talkmap/banking-conversation-corpus

# Load the "train" split of the corpus from the Hugging Face Hub.
banking_conversation_corpus = load_dataset("talkmap/banking-conversation-corpus", split="train")
# Bare last expression so the notebook displays the dataset summary.
banking_conversation_corpus

Dataset({
    features: ['conversation_id', 'speaker', 'date_time', 'text'],
    num_rows: 5532112
})

En el segundo paso eliminamos los mensajes cuyo texto sea None o tenga 10 caracteres o menos.

In [29]:
def _has_enough_text(row):
    """Return True when the row's text is present and longer than 10 characters."""
    message = row['text']
    if message is None:
        return False
    return len(message) > 10


# Keep only the rows that pass the predicate, then display the dataset summary.
banking_conversation_corpus = banking_conversation_corpus.filter(_has_enough_text)
banking_conversation_corpus

Dataset({
    features: ['conversation_id', 'speaker', 'date_time', 'text'],
    num_rows: 5421377
})

In [30]:
# Work on the first 100,000 rows only. The slice is consumed later as a
# column-oriented dictionary when it is handed to pandas.
# NOTE(review): a fixed row cut-off may split the last conversation in two —
# confirm this is acceptable for the analysis.
banking_conversation_corpus_sample = banking_conversation_corpus[:100000]

Debemos agrupar todos los mensajes por `conversation_id` y crear una celda con la conversación entera; para ello uso Pandas por ser más eficiente.

In [31]:
import pandas as pd

# 1. Build a DataFrame directly from the column-oriented sample dictionary.
df = pd.DataFrame(banking_conversation_corpus_sample)

# 2. Sort chronologically inside each conversation so the dialogue reads in order.
# NOTE(review): assumes date_time values sort chronologically as stored — confirm.
df = df.sort_values(by=['conversation_id', 'date_time'])

# 3. Vectorized formatting: one "SPEAKER: message" string per row.
# The lines live in a standalone Series, so `df` is never given a temporary
# column that has to be dropped again afterwards.
speaker_lines = df['speaker'].str.upper() + ": " + df['text'].fillna('')

# 4. Group the lines by conversation and join them with newlines.
# A plain str.join aggregation replaces the per-group lambda; sort=False keeps
# the group order produced by the sort above.
df_conversations = (
    speaker_lines.groupby(df['conversation_id'], sort=False)
    .agg("\n".join)
    .reset_index(name='full_conversation')
)

# Show the first merged conversations.
print(df_conversations.head())

# Optional: persist the result (Parquet suits large files).
# df_conversations.to_parquet('conversations_merged.parquet', index=False)

                    conversation_id  \
0  0007b43c697f40a38ba2395d6fee20dd   
1  001ce2f3448143d3b8df2e5185b330de   
2  001f3ecbff8e4b169cbb99069155a6d3   
3  0021f9e5b6e044f69e070c4a70de4693   
4  00232e8c81e1409ca89d54ed802acf3a   

                                   full_conversation  
0  AGENT: Good morning, thank you for calling Uni...  
1  AGENT: Hello, thank you for calling Union Fina...  
2  AGENT: Hello you for calling Union Financial. ...  
3  AGENT: Good morning, thank you for calling Uni...  
4  AGENT: Good morning, thank you for holding. My...  


Una vez agrupado, lo convierto de nuevo a formato Dataset.

In [32]:
from datasets import Dataset

# Convert the grouped DataFrame back into a Dataset.
# NOTE(review): this rebinds banking_conversation_corpus, which previously held
# the row-level corpus; re-running the earlier filter cell after this one would
# operate on the grouped data instead.
banking_conversation_corpus = Dataset.from_pandas(df_conversations)
# Bare last expression so the notebook displays the dataset summary.
banking_conversation_corpus

Dataset({
    features: ['conversation_id', 'full_conversation'],
    num_rows: 5659
})

Creating text embeddings

In [33]:
from transformers import AutoTokenizer, AutoModel

# Single checkpoint id shared by the tokenizer and the encoder so they match.
model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

MPNetModel LOAD REPORT from: sentence-transformers/multi-qa-mpnet-base-dot-v1
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Voy a usar GPUs

In [34]:
import torch

# Prefer the GPU, but fall back to the CPU so the notebook still runs on a
# machine without CUDA instead of failing when the model is moved.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the encoder to the selected device; the bare call also displays the model.
model.to(device)

MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0-11): 12 x MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): Linear(in_features=768, out_features=768, bias=True)
            (k): Linear(in_features=768, out_features=768, bias=True)
            (v): Linear(in_features=768, out_features=768, bias=True)
            (o): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (intermediate): MPNetIntermediate(
          (dense): Linear(in_

Usamos pooling

In [35]:
def cls_pooling(model_output):
    """Return the hidden state at sequence position 0 for every batch item.

    Args:
        model_output: object exposing a ``last_hidden_state`` tensor
            (presumably shaped (batch, sequence, hidden) — TODO confirm).

    Returns:
        The slice ``last_hidden_state[:, 0]``.
    """
    hidden_states = model_output.last_hidden_state
    first_token_states = hidden_states[:, 0]
    return first_token_states

Aux function

In [36]:
def get_embeddings(text_list):
    """Encode text with the module-level tokenizer/model and pool the result.

    Args:
        text_list: a string or a list of strings, passed to the tokenizer as is.

    Returns:
        The tensor produced by ``cls_pooling`` on the model output; it lives on
        ``device`` and carries no autograd history.
    """
    # Pad to the longest item and truncate over-long inputs so the batch can be
    # returned as PyTorch tensors.
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    # The inputs must be on the same device as the model.
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    # Inference only: without no_grad every call keeps an autograd graph alive,
    # which wastes memory when embedding thousands of texts.
    with torch.no_grad():
        model_output = model(**encoded_input)
    return cls_pooling(model_output)

Test

In [37]:
# Smoke test: embed the first merged conversation and check the output shape.
first_conversation = banking_conversation_corpus[0]["full_conversation"]
embedding = get_embeddings(first_conversation)
embedding.shape

torch.Size([1, 768])

Aplicamos a todo con MAP

In [None]:
def _embed_conversation(example):
    """Return the embedding of one row's conversation as a NumPy vector."""
    pooled = get_embeddings(example["full_conversation"])
    # Move to host memory and take the single row of the batch.
    vector = pooled.detach().cpu().numpy()[0]
    return {"embeddings": vector}


embeddings_dataset = banking_conversation_corpus.map(_embed_conversation)

Map:   0%|          | 0/5659 [00:00<?, ? examples/s]

Using FAISS for efficient similarity search

In [None]:
# Build a FAISS index over the "embeddings" column so it can be searched by
# nearest neighbours. No metric is passed, so the library default is used.
embeddings_dataset.add_faiss_index(column="embeddings")

Ahora ya puedo hacer búsquedas

In [None]:
# Embed the query with the same encoder used for the corpus.
question = "calling Union Financial"
question_vector = get_embeddings([question])
# Detach and move to host memory before converting to NumPy.
question_embedding = question_vector.detach().cpu().numpy()
question_embedding.shape

Recuperamos los 5 resultados más similares

In [None]:
# Query the "embeddings" index for the 5 nearest rows to the question vector.
# `scores` holds one value per hit and `samples` the matching rows.
scores, samples = embeddings_dataset.get_nearest_examples(
    "embeddings", question_embedding, k=5
)

In [None]:
import pandas as pd

# Collect the hits in a DataFrame, one row per retrieved example.
samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
# The FAISS index was built without a metric_type, so the scores are expected
# to be L2 distances (smaller = more similar). Sort ascending so the best match
# comes first; sorting descending listed the weakest of the five hits first.
# NOTE(review): relies on the library's default metric being L2 — confirm.
samples_df = samples_df.sort_values("scores", ascending=True)

In [None]:
# Print every hit followed by a separator line and a blank line.
for _, row in samples_df.iterrows():
    print(
        f"COMMENT: {row.full_conversation}",
        f"SCORE: {row.scores}",
        "=" * 50,
        "",
        sep="\n",
    )

In [None]:
# `issues_dataset` is not defined by any earlier cell of this notebook, so on a
# fresh kernel the filter below raised NameError. Load it explicitly first.
# NOTE(review): the dataset id is assumed from the columns used in this section
# (GitHub issues with comments) — confirm it is the intended source.
issues_dataset = load_dataset("lewtun/github-issues", split="train")

# Keep real issues only (no pull requests) that have at least one comment.
issues_dataset = issues_dataset.filter(
    lambda x: (x["is_pull_request"] == False and len(x["comments"]) > 0 )
)

issues_dataset

In [None]:
columns = issues_dataset.column_names
columns_to_keep = ["title", "body", "html_url", "comments"]
# Remove exactly the existing columns that are not in the keep list.
# A symmetric difference would also return keep-columns that are missing from
# the dataset, i.e. names that cannot be removed; a one-sided difference is the
# intended operation. A list also preserves the dataset's column order.
columns_to_remove = [name for name in columns if name not in columns_to_keep]
issues_dataset = issues_dataset.remove_columns(columns_to_remove)
issues_dataset

In [None]:
# Switch the dataset's output format to pandas so a full slice is a DataFrame.
issues_dataset.set_format('pandas')
df = issues_dataset[:]
# Peek at the first row.
df.head(1)

In [None]:
# Expand the "comments" column so each list element gets its own row; the other
# columns are repeated and the index is renumbered.
comments_df = df.explode("comments", ignore_index=True)
comments_df.head()

In [None]:
from datasets import Dataset

# Convert the exploded DataFrame back into a Dataset.
comments_dataset = Dataset.from_pandas(comments_df)
# Bare last expression so the notebook displays the dataset summary.
comments_dataset

In [None]:
def _count_comment_words(example):
    """Return the number of whitespace-separated tokens in the row's comment."""
    words = example["comments"].split()
    return {'comment_length': len(words)}


comments_dataset = comments_dataset.map(_count_comment_words)

In [None]:
# Inspect the first row to check the new comment_length column.
comments_dataset[:1]

In [None]:
# Keep only comments with more than 15 whitespace-separated words.
comments_dataset = comments_dataset.filter(lambda x: x['comment_length'] > 15)
# Bare last expression so the notebook displays the dataset summary.
comments_dataset

In [None]:
def concatenate_text(examples):
    """Build a single "text" field from a row's title, body and comment.

    Args:
        examples: mapping with "title", "body" and "comments" entries.

    Returns:
        A dict with one key, "text", holding the three fields joined by
        " \n " in that order.
    """
    parts = (examples["title"], examples["body"], examples["comments"])
    # A missing (None) field is treated as empty text; plain `+` concatenation
    # raised TypeError for such rows. Output for all-string rows is unchanged.
    return {"text": " \n ".join(part or "" for part in parts)}

# Add the combined "text" column to every row (one row per call, unbatched).
comments_dataset = comments_dataset.map(concatenate_text)


In [None]:
# Inspect the first row to check the new text column.
comments_dataset[:1]

In [None]:
from transformers import AutoTokenizer, AutoModel

# NOTE(review): this repeats the checkpoint load performed earlier in the
# notebook with the same model id, rebinding tokenizer and model.
model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

In [None]:
import torch

# Prefer the GPU, but fall back to the CPU so the notebook still runs on a
# machine without CUDA instead of failing when the model is moved.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the encoder to the selected device; the bare call also displays the model.
model.to(device)

In [None]:
def cls_pooling(model_output):
    """Return the hidden state at sequence position 0 for every batch item.

    Args:
        model_output: object exposing a ``last_hidden_state`` tensor
            (presumably shaped (batch, sequence, hidden) — TODO confirm).

    Returns:
        The slice ``last_hidden_state[:, 0]``.
    """
    hidden_states = model_output.last_hidden_state
    first_token_states = hidden_states[:, 0]
    return first_token_states

In [None]:
def get_embeddings(text_list):
    """Encode text with the module-level tokenizer/model and pool the result.

    Args:
        text_list: a string or a list of strings, passed to the tokenizer as is.

    Returns:
        The tensor produced by ``cls_pooling`` on the model output; it lives on
        ``device`` and carries no autograd history.
    """
    # Pad to the longest item and truncate over-long inputs so the batch can be
    # returned as PyTorch tensors.
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    # The inputs must be on the same device as the model.
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    # Inference only: without no_grad every call keeps an autograd graph alive,
    # which wastes memory when embedding thousands of texts.
    with torch.no_grad():
        model_output = model(**encoded_input)
    return cls_pooling(model_output)

In [None]:
# Smoke test: embed the first combined text and check the output shape.
first_text = comments_dataset[0]["text"]
embedding = get_embeddings(first_text)
embedding.shape

In [None]:
def _embed_comment_text(example):
    """Return the embedding of one row's combined text as a NumPy vector."""
    pooled = get_embeddings(example["text"])
    # Move to host memory and take the single row of the batch.
    vector = pooled.detach().cpu().numpy()[0]
    return {"embeddings": vector}


embeddings_dataset = comments_dataset.map(_embed_comment_text)

In [None]:
# Build a FAISS index over the "embeddings" column so it can be searched by
# nearest neighbours. No metric is passed, so the library default is used.
embeddings_dataset.add_faiss_index(column="embeddings")

In [None]:
# Embed the query with the same encoder used for the corpus.
question = "How can I load a dataset offline?"
question_vector = get_embeddings([question])
# Detach and move to host memory before converting to NumPy.
question_embedding = question_vector.detach().cpu().numpy()
question_embedding.shape

In [None]:
# Query the "embeddings" index for the 5 nearest rows to the question vector.
# `scores` holds one value per hit and `samples` the matching rows.
scores, samples = embeddings_dataset.get_nearest_examples(
    "embeddings", question_embedding, k=5
)

In [None]:
import pandas as pd

# Collect the hits in a DataFrame, one row per retrieved example.
samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
# The FAISS index was built without a metric_type, so the scores are expected
# to be L2 distances (smaller = more similar). Sort ascending so the best match
# comes first; sorting descending listed the weakest of the five hits first.
# NOTE(review): relies on the library's default metric being L2 — confirm.
samples_df = samples_df.sort_values("scores", ascending=True)

In [None]:
# Print every hit followed by a separator line and a blank line.
for _, row in samples_df.iterrows():
    print(
        f"COMMENT: {row.comments}",
        f"SCORE: {row.scores}",
        f"TITLE: {row.title}",
        f"URL: {row.html_url}",
        "=" * 50,
        "",
        sep="\n",
    )