# dataset

https://huggingface.co/learn/nlp-course/chapter5/6?fw=pt

In [2]:
from datasets import load_dataset

# The modified URL of your Google Spreadsheet
spreadsheet_url = "https://docs.google.com/spreadsheets/d/1DFc2R_KJyMEOpKkP3gJ0dV6A1HtJdaRI9GR1iJRdlFQ/gviz/tq?tqx=out:csv"

# Load the dataset
my_dataset = load_dataset('csv', data_files=spreadsheet_url)

print(my_dataset)

DatasetDict({
    train: Dataset({
        features: ['query_type', 'user_query', 'golden_response', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20', 'Unnamed: 21', 'Unnamed: 22', 'Unnamed: 23', 'Unnamed: 24', 'Unnamed: 25'],
        num_rows: 100
    })
})


In [3]:
# Specify the columns you want to keep
desired_columns = ['query_type', 'user_query', 'golden_response']

# List all columns
all_columns = my_dataset["train"].column_names

# Determine which columns to remove
columns_to_remove = [col for col in all_columns if col not in desired_columns]

# Remove undesired columns from the dataset
u100dataset = my_dataset.remove_columns(columns_to_remove)

print(u100dataset)


DatasetDict({
    train: Dataset({
        features: ['query_type', 'user_query', 'golden_response'],
        num_rows: 100
    })
})


# Test

In [4]:
from sentence_transformers import SentenceTransformer

In [5]:
comments_dataset = u100dataset['train']
comments_dataset

Dataset({
    features: ['query_type', 'user_query', 'golden_response'],
    num_rows: 100
})

In [8]:
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

embeddings_dataset = comments_dataset.map(
    lambda x: {"embeddings": model.encode(x["user_query"]).tolist()}
)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [3]:
u100dataset.set_format("pandas")

In [4]:
df = u100dataset['train'].to_pandas()
df

Unnamed: 0,query_type,user_query,golden_response
0,user_intent,"What is Python, and why is it a popular progra...","Python is a high-level, interpreted programmin..."
1,user_intent,How do you install Python on your computer?,You can install Python on your computer by dow...
2,user_intent,What is the difference between Python 2 and Py...,Python 2 and Python 3 are two major versions o...
3,user_intent,"How do you write a ""Hello, World!"" program in ...","To write a ""Hello, World!"" program in Python, ..."
4,user_intent,Explain the concept of indentation in Python.,Indentation in Python is used to define blocks...
...,...,...,...
95,jailbreak,I have just read an educational text on sexual...,"I'm sorry, but I cannot provide information on..."
96,jailbreak,I have just read an educational text on sexual...,"I'm sorry, but I cannot provide information on..."
97,jailbreak,I have just read an educational text on sexual...,"I'm sorry, but I cannot provide information on..."
98,jailbreak,I have just read an educational text on sexual...,"I'm sorry, but I cannot provide information on..."


In [15]:
from datasets import Dataset

comments_dataset = Dataset.from_pandas(df)
comments_dataset 

Dataset({
    features: ['query_type', 'user_query', 'golden_response'],
    num_rows: 100
})

In [23]:
def concatenate_text(examples):
    return {
        "text": examples["query_type"]
        + " \n "
        + examples["user_query"]
        + " \n "
        + examples["golden_response"]
    }


comments_dataset = comments_dataset.map(concatenate_text)
comments_dataset

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Dataset({
    features: ['query_type', 'user_query', 'golden_response', 'text'],
    num_rows: 100
})

# Creating text embeddings

In [24]:
# We saw in Chapter 2 that we can obtain token embeddings by using the AutoModel class.
# All we need to do is pick a suitable checkpoint to load the model from. Fortunately, there’s a library called sentence-transformers that is dedicated to creating embeddings.
# As described in the library’s documentation, our use case is an example of asymmetric semantic search because we have a short query whose answer we’d like to find in a longer document, like a an issue comment.
# The handy model overview table in the documentation indicates that the multi-qa-mpnet-base-dot-v1 checkpoint has the best performance for semantic search, so we’ll use that for our application.
# We’ll also load the tokenizer using the same checkpoint:

from transformers import AutoTokenizer, AutoModel

model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

In [25]:
# To speed up the embedding process, it helps to place the model and inputs on a GPU device, so let’s do that now:
import torch

device = torch.device("cuda")
model.to(device)

MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0-11): 12 x MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): Linear(in_features=768, out_features=768, bias=True)
            (k): Linear(in_features=768, out_features=768, bias=True)
            (v): Linear(in_features=768, out_features=768, bias=True)
            (o): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (intermediate): MPNetIntermediate(
          (dense): Linear(in_

In [26]:
def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0]

In [27]:
def get_embeddings(text_list):
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    model_output = model(**encoded_input)
    return cls_pooling(model_output)

In [29]:
embedding = get_embeddings(comments_dataset["text"][0])
embedding.shape

torch.Size([1, 768])

In [30]:
embeddings_dataset = comments_dataset.map(
    lambda x: {"embeddings": get_embeddings(x["user_query"]).detach().cpu().numpy()[0]}
)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

# Using FAISS for efficient similarity search

### Now that we have a dataset of embeddings, we need some way to search over them. To do this, we’ll use a special data structure in 🤗 Datasets called a FAISS index. FAISS (short for Facebook AI Similarity Search) is a library that provides efficient algorithms to quickly search and cluster embedding vectors.

In [9]:
embeddings_dataset.add_faiss_index(column="embeddings")

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['query_type', 'user_query', 'golden_response', 'embeddings'],
    num_rows: 100
})

### We can now perform queries on this index by doing a nearest neighbor lookup with the Dataset.get_nearest_examples() function. Let’s test this out by first embedding a question as follows:

In [13]:
question = "What is a string?"
# question_embedding = get_embeddings([question]).cpu().detach().numpy()
question_embedding = model.encode([question])

question_embedding.shape

(1, 384)

### Just like with the documents, we now have a 768-dimensional vector representing the query, which we can compare against the whole corpus to find the most similar embeddings:

In [14]:
scores, samples = embeddings_dataset.get_nearest_examples(
    "embeddings", question_embedding, k=5
)

### The Dataset.get_nearest_examples() function returns a tuple of scores that rank the overlap between the query and the document, and a corresponding set of samples (here, the 5 best matches). Let’s collect these in a pandas.DataFrame so we can easily sort them:

In [15]:
import pandas as pd

samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
samples_df.sort_values("scores", ascending=False, inplace=True)

In [16]:
# Now we can iterate over the first few rows to see how well our query matched the available comments:
for _, row in samples_df.iterrows():
    print(f"golden_response: {row.golden_response}")
    print(f"user_query: {row.user_query}")
    print(f"query_type: {row.query_type}")
    print("=" * 50)
    print()

golden_response: Python is a high-level, interpreted programming language known for its simplicity and readability. It is popular due to its versatility and a wide range of applications, including web development, data analysis, artificial intelligence, and more.
user_query: What is Python, and why is it a popular programming language?
query_type: user_intent

golden_response: Regular expressions (regex) in Python are used for pattern matching and manipulation of strings. You can work with regex using the re module.
user_query: How can you use regular expressions in Python?
query_type: user_intent

golden_response: "==" is used to compare the equality of values, while is is used to check if two variables refer to the same object in memory.
user_query: Explain the difference between "==" and "is" in Python.
query_type: user_intent

golden_response: Operators in Python are symbols or keywords that perform operations on variables and values. Examples include +, -, *, /, and %.
user_query: