In [44]:
from datasets import load_dataset, Dataset
import html
import psutil
import timeit
from transformers import AutoTokenizer
from itertools import islice
from datasets import interleave_datasets
import requests
import time
import math
from pathlib import Path
import pandas as pd
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModel
import torch

- Hugging Face API to share the model (the Hub is a simple Git repository): https://huggingface.co/learn/llm-course/chapter4/3
- Proper model documentation to include: https://huggingface.co/learn/llm-course/chapter4/4

### Managing data format

Loading:  
```data_type = ['csv', 'text', 'json', 'pandas']```  
```dataset = load_dataset(data_type[0], data_files='my_file.xxx')```  
For TSV, use ```csv``` and add the parameter ```delimiter='\t'```  
  
Saving:  
- Arrow format: ```dataset.save_to_disk()```  
- CSV format: ```dataset.to_csv()```  
- JSON format: ```dataset.to_json()```

Now three modifications:
- By default, loading local files creates a DatasetDict object with a train split, but to also include the test split.
- Datasets supports automatic decompression of the input files.
- Loading remote files is possible.

In [None]:
url = "https://github.com/crux82/squad-it/raw/master/"
data_files = {'train': url + 'SQuAD_it-train.json.gz', 'test': url + 'SQuAD_it-test.json.gz'}
dataset = load_dataset('json', data_files=data_files, field='data')

### Inspecting and correcting data

In [4]:
# Loading data and inspecting type of data (good practice)
!wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip"
!unzip drugsCom_raw.zip
data_files = {"train": "drugsComTrain_raw.tsv", "test": "drugsComTest_raw.tsv"}
drug_dataset = load_dataset("csv", data_files=data_files, delimiter="\t")
drug_sample = drug_dataset["train"].shuffle(seed=42).select(range(1000))
drug_sample[:3]

{'Unnamed: 0': [87571, 178045, 80482],
 'drugName': ['Naproxen', 'Duloxetine', 'Mobic'],
 'condition': ['Gout, Acute', 'ibromyalgia', 'Inflammatory Conditions'],
 'review': ['"like the previous person mention, I&#039;m a strong believer of aleve, it works faster for my gout than the prescription meds I take. No more going to the doctor for refills.....Aleve works!"',
  '"I have taken Cymbalta for about a year and a half for fibromyalgia pain. It is great\r\nas a pain reducer and an anti-depressant, however, the side effects outweighed \r\nany benefit I got from it. I had trouble with restlessness, being tired constantly,\r\ndizziness, dry mouth, numbness and tingling in my feet, and horrible sweating. I am\r\nbeing weaned off of it now. Went from 60 mg to 30mg and now to 15 mg. I will be\r\noff completely in about a week. The fibro pain is coming back, but I would rather deal with it than the side effects."',
  '"I have been taking Mobic for over a year with no side effects other than 

In [None]:
# Correcting 'Unnamed: 0' column which looks suspiciously like an anonymized ID for each patient
for split in drug_dataset.keys():
    assert len(drug_dataset[split]) == len(drug_dataset[split].unique("Unnamed: 0"))
drug_dataset = drug_dataset.rename_column(original_column_name="Unnamed: 0", new_column_name="patient_id")

In [11]:
# Normalizing condition labels
drug_dataset = drug_dataset.filter(lambda x: x["condition"] is not None)

def lowercase_condition(example):
    return {"condition": example["condition"].lower()}

drug_dataset.map(lowercase_condition)

Filter:   0%|          | 0/161297 [00:00<?, ? examples/s]

Filter:   0%|          | 0/53766 [00:00<?, ? examples/s]

Map:   0%|          | 0/160398 [00:00<?, ? examples/s]

Map:   0%|          | 0/53471 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 160398
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53471
    })
})

In [15]:
# Counting the number of words in each review.
def compute_review_length(example):
    return {"review_length": len(example["review"].split())} # Returns a dictionary whose key does not correspond to one of the column names in the dataset, so Dataset.map() creates a new review_length column

drug_dataset = drug_dataset.map(compute_review_length)
drug_dataset["train"].sort("review_length")[:3] # Inspecting extreme values (reviews with one word)

{'patient_id': [111469, 13653, 53602],
 'drugName': ['Ledipasvir / sofosbuvir',
  'Amphetamine / dextroamphetamine',
  'Alesse'],
 'condition': ['Hepatitis C', 'ADHD', 'Birth Control'],
 'review': ['"Headache"', '"Great"', '"Awesome"'],
 'rating': [10.0, 10.0, 10.0],
 'date': ['February 3, 2015', 'October 20, 2009', 'November 23, 2015'],
 'usefulCount': [41, 3, 0],
 'review_length': [1, 1, 1]}

In [16]:
# Eliminating reviews with less than 30 words (not optimal for sentimental analysis)
drug_dataset = drug_dataset.filter(lambda x: x["review_length"] > 30)
print(drug_dataset.num_rows) # This has removed around 15% of the reviews

Filter:   0%|          | 0/160398 [00:00<?, ? examples/s]

Filter:   0%|          | 0/53471 [00:00<?, ? examples/s]

{'train': 138514, 'test': 46108}


In [19]:
# Unescaping HTML characters
text = "I&#039;m a transformer called BERT"
html.unescape(text)
%time new_drug_dataset = drug_dataset.map(lambda x: {"review": [html.unescape(o) for o in x["review"]]}, batched=True, num_proc=8) # Batched allows several elements to be processed at the same time

Map (num_proc=8):   0%|          | 0/138514 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/46108 [00:00<?, ? examples/s]

CPU times: user 578 ms, sys: 331 ms, total: 909 ms
Wall time: 5.05 s


### Conversion between various third-party libraries

In [20]:
# To Pandas
drug_dataset.set_format("pandas")
drug_dataset["train"][:3]

Unnamed: 0,patient_id,drugName,condition,review,rating,date,usefulCount,review_length
0,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192,141
1,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17,134
2,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10,89


In [27]:
# From Pandas
train_df = drug_dataset["train"][:] # Ee need to slice the whole dataset to obtain a pandas.DataFrame
frequencies = (
    train_df["condition"]
    .value_counts()
    .to_frame()
    .reset_index()
    .rename(columns={"index": "condition", "count": "frequency"})
)
freq_dataset = Dataset.from_pandas(frequencies)
freq_dataset

Dataset({
    features: ['condition', 'frequency'],
    num_rows: 819
})

In [28]:
# Reseting the output format 
drug_dataset.reset_format()

### Creating a validation set

In [29]:
drug_dataset_clean = drug_dataset["train"].train_test_split(train_size=0.8, seed=42)
drug_dataset_clean["validation"] = drug_dataset_clean.pop("test")
drug_dataset_clean["test"] = drug_dataset["test"]
drug_dataset_clean

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 110811
    })
    validation: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 27703
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 46108
    })
})

In [None]:
# Each split is stored as a separate file
for split, dataset in drug_dataset_clean.items():
    dataset.to_json(f"drug-reviews-{split}.jsonl")

### Managing big data

In [None]:
# Few minutes to run
data_files = "https://huggingface.co/datasets/casinca/PUBMED_title_abstracts_2019_baseline/resolve/main/PUBMED_title_abstracts_2019_baseline.jsonl.zst"
pubmed_dataset = load_dataset("json", data_files=data_files, split="train")
size_gb = pubmed_dataset.dataset_size / (1024**3)
print(f"Dataset size (cache file) : {size_gb:.2f} GB")
print(f"RAM used: {psutil.Process().memory_info().rss / (1024 * 1024):.2f} MB") # Bytes to megabytes

Thanks to pyarrow library (Apache Arrow memory format), Datasets treats each dataset as a memory-mapped file, which provides a mapping between RAM and filesystem storage that allows the library to access and operate on elements of the dataset across multiple processes without needing to fully load it into memory:

In [32]:
code_snippet = """batch_size = 1000

for idx in range(0, len(pubmed_dataset), batch_size):
    _ = pubmed_dataset[idx:idx + batch_size]
"""

time = timeit.timeit(stmt=code_snippet, number=1, globals=globals())
print(
    f"Iterated over {len(pubmed_dataset)} examples (about {size_gb:.1f} GB) in "
    f"{time:.1f}s, i.e. {size_gb/time:.3f} GB/s"
)



```streaming=True``` argument in the load_dataset() function enables dataset streaming. The object returned is an IterableDataset, which allows to access the elements of an IterableDataset if iterating over it:

In [None]:
pubmed_dataset_streamed = load_dataset("json", data_files=data_files, split="train", streaming=True) # Not loading the whole dataset 
next(iter(law_dataset_streamed)) # Peeking at a single item without loading everything

In [None]:
# It is possible to iterate over the suffle
shuffled_dataset = pubmed_dataset_streamed.shuffle(buffer_size=10_000, seed=42) # Shuffling the elements in a predefined buffer_size
next(iter(shuffled_dataset))

In [None]:
# Similar way to Dataset.select() for IterableDataset
train_dataset = shuffled_dataset.skip(1000) # Skip the first 1,000 examples and include the rest in the training set
validation_dataset = shuffled_dataset.take(1000) # Take the first 1,000 examples for the validation set

In [None]:
# Combining two big dataset
combined_dataset = interleave_datasets([pubmed_dataset_streamed, law_dataset_streamed]) # law_dataset_streamed not available any more in https://the-eye.eu/public/AI/pile_preliminary_components/FreeLaw_Opinions.jsonl.zst)
list(islice(combined_dataset, 2))

### Creating a dataset

The comments associated with an issue in GitHub provide a rich source of information, especially if we’re interested in building a search engine to answer user queries about the library.

In [43]:
url = "https://api.github.com/repos/huggingface/datasets/issues?page=1&per_page=1"
response = requests.get(url)
print(response.status_code)
response.json()

200


[{'url': 'https://api.github.com/repos/huggingface/datasets/issues/7590',
  'repository_url': 'https://api.github.com/repos/huggingface/datasets',
  'labels_url': 'https://api.github.com/repos/huggingface/datasets/issues/7590/labels{/name}',
  'comments_url': 'https://api.github.com/repos/huggingface/datasets/issues/7590/comments',
  'events_url': 'https://api.github.com/repos/huggingface/datasets/issues/7590/events',
  'html_url': 'https://github.com/huggingface/datasets/issues/7590',
  'id': 3101654892,
  'node_id': 'I_kwDODunzps64339s',
  'number': 7590,
  'title': '`ArrowNotImplementedError: Unsupported cast from list<item:struct<…>> to struct` when loading nested `Sequence(Features)` JSONL',
  'user': {'login': 'AHS-uni',
   'id': 183279820,
   'node_id': 'U_kgDOCuygzA',
   'avatar_url': 'https://avatars.githubusercontent.com/u/183279820?v=4',
   'gravatar_id': '',
   'url': 'https://api.github.com/users/AHS-uni',
   'html_url': 'https://github.com/AHS-uni',
   'followers_url': 'h

In [None]:
# Using the GitHub REST API to fetch up issues, avoiding hitting API rate limits, and storing the issues as JSON
with open("api_token.txt", "r") as f:
    token = f.readline().strip()
headers = {"Authorization": f"token {token}"}

def fetch_issues(
    owner="huggingface",
    repo="datasets",
    num_issues=10_000,
    rate_limit=5_000,
    issues_path=Path("."),
):
    if not issues_path.is_dir():
        issues_path.mkdir(exist_ok=True)

    batch = []
    all_issues = []
    per_page = 100  # Number of issues to return per page (max API value)
    num_pages = math.ceil(num_issues / per_page)
    base_url = "https://api.github.com/repos"

    for page in tqdm(range(num_pages)):
        # Query with state=all to get both open and closed issues
        query = f"issues?page={page}&per_page={per_page}&state=all"
        issues = requests.get(f"{base_url}/{owner}/{repo}/{query}", headers=headers)
        batch.extend(issues.json()) # Add the list of issues from this page to batch

        if len(batch) > rate_limit and len(all_issues) < num_issues:
            all_issues.extend(batch)
            batch = []  # Flush batch for next time period
            print(f"Reached GitHub rate limit. Sleeping for one hour ...")
            time.sleep(60 * 60 + 1)

    all_issues.extend(batch)
    df = pd.DataFrame.from_records(all_issues)
    df.to_json(f"{issues_path}/{repo}-issues.jsonl", orient="records", lines=True)
    print(
        f"Downloaded all the issues for {repo}! Dataset stored at {issues_path}/{repo}-issues.jsonl"
    )

In [None]:
#1 Running defined API (long time)
fetch_issues()
issues_dataset = load_dataset("json", data_files="datasets-issues.jsonl", split="train")
issues_dataset

In [51]:
#2 Alternative (faster)
issues_dataset = load_dataset("lewtun/github-issues", split="train")
issues_dataset

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


datasets-issues-with-comments.jsonl:   0%|          | 0.00/12.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3019 [00:00<?, ? examples/s]

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'timeline_url', 'performed_via_github_app', 'is_pull_request'],
    num_rows: 3019
})

In [None]:
# Function that returns all the comments associated with an issue (already included in lewtun/github-issues)
def get_comments(issue_number):
    url = f"https://api.github.com/repos/huggingface/datasets/issues/{issue_number}/comments"
    response = requests.get(url, headers=headers)
    return [r["body"] for r in response.json()]

issues_with_comments_dataset = issues_dataset.map(
    lambda x: {"comments": get_comments(x["number"])}
)

There are several thousand issues because the API also considers every pull request an issue; the ```pull_request``` is associated with various URLs for pull requests, while ordinary issues have a None entry, so:

In [53]:
issues_dataset = issues_dataset.filter(lambda x: (x["is_pull_request"] == False and len(x["comments"]) > 0))
issues_dataset

Filter:   0%|          | 0/3019 [00:00<?, ? examples/s]

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'timeline_url', 'performed_via_github_app', 'is_pull_request'],
    num_rows: 808
})

Although we could proceed to further clean up the dataset by dropping or renaming some columns, it is generally a good practice to keep the dataset as “raw” as possible at this stage so that it can be easily used in multiple applications.

### Embeddings to develop a semantic search engine

In [None]:
# Most informative columns for embeddings are title, body, and comments, while html_url provides us with a link back to the source issue
columns = issues_dataset.column_names
columns_to_keep = ["title", "body", "html_url", "comments"]
columns_to_remove = set(columns_to_keep).symmetric_difference(columns)
issues_dataset = issues_dataset.remove_columns(columns_to_remove)

Because our comments column is currently a list of comments for each issue, we need to “explode” the column so that each row consists of an (html_url, title, body, comment) tuple; in Pandas we can do this with the DataFrame.explode() function:

In [55]:
issues_dataset.set_format("pandas")
df = issues_dataset[:]
comments_df = df.explode("comments", ignore_index=True)
comments_dataset = Dataset.from_pandas(comments_df)
comments_df.head(4)

Unnamed: 0,url,repository_url,labels_url,comments_url,events_url,html_url,id,node_id,number,title,...,created_at,updated_at,closed_at,author_association,active_lock_reason,pull_request,body,timeline_url,performed_via_github_app,is_pull_request
0,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://github.com/huggingface/datasets/issues...,1000624883,I_kwDODunzps47pFLz,2945,Protect master branch,...,1632120421000,1632139287000,1632139000000.0,MEMBER,,,After accidental merge commit (91c55355b634d0d...,https://api.github.com/repos/huggingface/datas...,,False
1,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://github.com/huggingface/datasets/issues...,1000624883,I_kwDODunzps47pFLz,2945,Protect master branch,...,1632120421000,1632139287000,1632139000000.0,MEMBER,,,After accidental merge commit (91c55355b634d0d...,https://api.github.com/repos/huggingface/datas...,,False
2,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://github.com/huggingface/datasets/issues...,1000355115,I_kwDODunzps47oDUr,2943,Backwards compatibility broken for cached data...,...,1632068197000,1632155143000,1632155000000.0,CONTRIBUTOR,,,## Describe the bug\r\nAfter upgrading to data...,https://api.github.com/repos/huggingface/datas...,,False
3,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://github.com/huggingface/datasets/issues...,1000355115,I_kwDODunzps47oDUr,2943,Backwards compatibility broken for cached data...,...,1632068197000,1632155143000,1632155000000.0,CONTRIBUTOR,,,## Describe the bug\r\nAfter upgrading to data...,https://api.github.com/repos/huggingface/datas...,,False


In [56]:
# Filtering short comments
comments_dataset = comments_dataset.map(lambda x: {"comment_length": len(x["comments"].split())})
comments_dataset = comments_dataset.filter(lambda x: x["comment_length"] > 15)

Map:   0%|          | 0/2964 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2964 [00:00<?, ? examples/s]

In [57]:
# Concatenating the issue title, description, and comments together in a new text column
def concatenate_text(examples):
    return {
        "text": examples["title"]
        + " \n "
        + examples["body"]
        + " \n "
        + examples["comments"]
    }

comments_dataset = comments_dataset.map(concatenate_text)

Map:   0%|          | 0/2175 [00:00<?, ? examples/s]

In [61]:
device = torch.device("cpu") # "cuda" is not available in my PC
model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1" # Dedicated to creating embeddings
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)
model.to(device)

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

2025-05-30 23:10:06.856783: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748639407.055915     129 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748639407.077333     129 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1748639407.276539     129 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1748639407.276592     129 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1748639407.276594     129 computation_placer.cc:177] computation placer alr

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0-11): 12 x MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): Linear(in_features=768, out_features=768, bias=True)
            (k): Linear(in_features=768, out_features=768, bias=True)
            (v): Linear(in_features=768, out_features=768, bias=True)
            (o): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (intermediate): MPNetIntermediate(
          (dense): Linear(in_

Transformers output an embedding for each token, but we want one vector per whole issue, so we need to combine (or pool) all those token embeddings into one vector.

In [58]:
def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0]

def get_embeddings(text_list):
    encoded_input = tokenizer(text_list, padding=True, truncation=True, return_tensors="pt"    )
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    model_output = model(**encoded_input)
    return cls_pooling(model_output)

embeddings_dataset = comments_dataset.map(lambda x: {"embeddings": get_embeddings(x["text"]).detach().cpu().numpy()[0]})

- ```.detach()``` detaches the tensor from the computation graph so it no longer tracks gradients.
- ```.cpu()``` moves the tensor to the CPU, in case it's still on the GPU.
- ```.numpy()``` converts the PyTorch tensor to a NumPy array, required by Datasets when trying to index them with FAISS (next section).
- ```[0]``` takes the first (and only) row of the array, since this was a batch of 1.

The basic idea behind FAISS (short for Facebook AI Similarity Search) is to create a special data structure called an index that allows one to find which embeddings are similar to an input embedding.

In [None]:
embeddings_dataset.add_faiss_index(column="embeddings")

In [None]:
# Creating a query
question = "How can I load a dataset offline?"
question_embedding = get_embeddings([question]).cpu().detach().numpy()
question_embedding.shape

Just like with the documents, we now have a 768-dimensional vector representing the query, which we can compare against the whole corpus to find the most similar embeddings:

In [None]:
# Performing a query
scores, samples = embeddings_dataset.get_nearest_examples("embeddings", question_embedding, k=5)
samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
samples_df.sort_values("scores", ascending=False, inplace=True)
for _, row in samples_df.iterrows():
    print(f"COMMENT: {row.comments}")
    print(f"SCORE: {row.scores}")
    print(f"TITLE: {row.title}")
    print(f"URL: {row.html_url}")
    print("=" * 50)
    print()