# Process the TeleQnA Dataset to create and save train datasets

Process the TeleQnA dataset to produce the datasets used to fine-tune a model and then test it.

In [4]:
# Mount Google Drive at /content/drive so the notebook can read the raw question
# files stored there and save the generated train dataset back to it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import re

def extract_option(text):
    """Return the text that follows the last 'option X:' label in ``text``.

    The label is matched case-insensitively and X can be any number, e.g.
    ``'option 3: MCG bearer'`` gives ``'MCG bearer'``.

    Args:
        text: String to search, such as the 'answer' field of a question.

    Returns:
        The text after the last 'option X:' label with surrounding whitespace
        removed, or None when ``text`` is empty/None or contains no label.
    """
    if not text:
        return None

    # Match only the labels themselves. Capturing the remainder with a greedy
    # '(.*)' under re.DOTALL would swallow every later label into the first
    # match, so the "last" occurrence could never be reached.
    label_matches = list(re.finditer(r'option\s*\d+:\s*', text, re.IGNORECASE))
    if not label_matches:
        return None

    # Everything after the end of the last label is the option text.
    return text[label_matches[-1].end():].strip()

# Dataset TeleQnA
We will load the JSON files that we generated and create a dataset for fine-tuning the model.

## Questions Release 17

## Dataset with 4000 questions

### Questions Release 17

In [8]:
# Root folder of the datasets on the mounted Google Drive; the cells below
# build the paths of the raw question files and of the saved dataset from it.
drive_dataset_path = r"/content/drive/MyDrive/doutorado/experimentos/3GPP/datasets"

In [9]:
import json

# All processed TeleQnA questions that belong to Release 17.
rel17_questions_path = f"{drive_dataset_path}/raw/rel17_questions.json"
with open(rel17_questions_path, encoding="utf-8") as file:
    rel17_questions = json.loads(file.read())
print(len(rel17_questions))

# The 200-question Release 17 subset, stored in its own JSON file.
rel17_200_questions_path = f"{drive_dataset_path}/raw/rel17_200_questions.json"
with open(rel17_200_questions_path, encoding="utf-8") as file:
    rel17_200_questions = json.loads(file.read())
print(len(rel17_200_questions))

# Release 17 questions that are not part of the 200-question subset.
rel17_other_questions = list(
    filter(lambda question: question not in rel17_200_questions, rel17_questions)
)
print(len(rel17_other_questions))

# Keep only the first 500 of the remaining Release 17 questions.
rel17_other_questions_length = 500
rel17_other_questions = rel17_other_questions[:rel17_other_questions_length]
print(len(rel17_other_questions))
rel17_other_questions[0]

# Sanity check: show the answer text of the first question without its label.
extract_option(rel17_other_questions[0]['answer'])

733
200
533
500


'MCG bearer'

### Questions without Rel 17 and 18

In [10]:
# Processed TeleQnA questions without Release 17 and Release 18.
no_rel_17_18_path_questions = f"{drive_dataset_path}/raw/no_rel_17_18_questions.json"

with open(no_rel_17_18_path_questions, encoding="utf-8") as file:
    no_rel_17_18_questions = json.loads(file.read())
print(len(no_rel_17_18_questions))

# Keep only the first 3500 questions and display the first one.
no_rel_17_18_questions_length = 3500
no_rel_17_18_questions = no_rel_17_18_questions[:no_rel_17_18_questions_length]
print(len(no_rel_17_18_questions))
no_rel_17_18_questions[0]

8487
3500


{'question': 'Which non-orthogonal multiple access scheme utilizes the low-complexity message passing algorithm at the receiver for user data detection?',
 'option 1': 'NOMA',
 'option 2': 'PDMA',
 'option 3': 'MUSA',
 'option 4': 'MUST',
 'option 5': 'SCMA',
 'answer': 'option 5: SCMA',
 'explanation': 'The SCMA scheme utilizes the low-complexity message passing algorithm at the receiver for user data detection.',
 'category': 'Research publications'}

### Train data

In [11]:
# Training pool: the Release 17 questions first, then the other questions.
# NOTE(review): the list is split in half by position further down, so all
# Release 17 questions end up in the first half -- confirm this is intended.
train_questions = [*rel17_other_questions, *no_rel_17_18_questions]
print(len(train_questions))
train_questions[0]

4000


{'question': 'In which bearer option is the S1-U connection terminated in the MeNB? [3GPP Release 17]',
 'option 1': 'Split bearer',
 'option 2': 'SCG bearer',
 'option 3': 'MCG bearer',
 'option 4': 'Primary bearer',
 'answer': 'option 3: MCG bearer',
 'explanation': 'In the MCG bearer option, the S1-U connection for the corresponding bearer(s) is terminated in the MeNB.',
 'category': 'Standards specifications'}

We create two datasets: one without options, built from the first half of the questions, and another with options, built from the other half.

In [13]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [14]:
from datasets import Dataset

# Conversation pairs for the "no options" half: the prompt is the bare
# question and the label is the answer text without its "option N:" prefix.
data = []

# The first half of the questions is used here; the remaining questions are
# consumed by the cell that builds the prompts with options.
half_questions = len(train_questions)//2

for item in train_questions[:half_questions]:

    # Prompt: the question text only, with no list of options.
    human_value = (
        f"{item['question']}"
    )

    # Label: the answer text with the "option N:" prefix removed.
    short_answer = extract_option(item['answer'])
    if short_answer is None:
        # No "option N:" label was found. Keep the raw answer (expected to be
        # a string) rather than formatting None, which would write the literal
        # label "None" into the training data.
        short_answer = item['answer'].strip()
    gpt_value = f"{short_answer}"

    # One conversation: the human turn followed by the expected model turn.
    pair = [
        {'from': 'human', 'value': human_value},  # Question without options
        {'from': 'gpt', 'value': gpt_value}  # Short answer label
    ]

    data.append(pair)  # Add the pair to the dataset

data_no_options = data
print(data_no_options[0])

[{'from': 'human', 'value': 'In which bearer option is the S1-U connection terminated in the MeNB? [3GPP Release 17]'}, {'from': 'gpt', 'value': 'MCG bearer'}]


In [15]:
from datasets import Dataset

# Conversation pairs for the "with options" half: the prompt lists the
# question and its options, the label is the full answer plus the explanation.
data = []

for item in train_questions[half_questions:]:

    # Every entry whose key contains 'option', rendered as "key: value".
    options = [f"{label}: {text}" for label, text in item.items() if 'option' in label]
    options_block = "\n".join(options)

    # Human turn: question, then the options, one per line.
    human_value = f"Question: {item['question']}\nOptions:\n{options_block}\n"

    # Model turn: the labelled answer followed by its explanation.
    gpt_value = f"Answer: {item['answer']}\nExplanation: {item['explanation']}"

    # One conversation: the human turn followed by the expected model turn.
    pair = [
        {'from': 'human', 'value': human_value},
        {'from': 'gpt', 'value': gpt_value},
    ]
    data.append(pair)

# Keep the finished list under its own name and show the first conversation.
data_options = data
print(data_options[0])

[{'from': 'human', 'value': 'Question: What are distributed representations?\nOptions:\noption 1: Representations where single or subsets of components encode individual data objects\noption 2: Representations where the encoded information is distributed across all components of a hypervector\noption 3: Representations sensitive to the structure of the encoded objects\noption 4: Representations that are robust to local noise\noption 5: Representations that are productive and systematic\n'}, {'from': 'gpt', 'value': 'Answer: option 2: Representations where the encoded information is distributed across all components of a hypervector\nExplanation: Distributed representations are representations where the encoded information is distributed across all components of a hypervector.'}]


Then we join these datasets and shuffle randomly.

In [16]:
import random

# Fixed seed so that the shuffled order, and therefore the saved dataset, is
# the same every time the notebook is run.
SHUFFLE_SEED = 42

data_total = data_no_options + data_options
# Shuffle the combined data with a dedicated generator, leaving the global
# state of the random module untouched.
random.Random(SHUFFLE_SEED).shuffle(data_total)
print(len(data_total))

4000


Convert the list of pairs into the appropriate format, transform the data into a `Dataset`, and save the dataset to disk.

## Dataset with 4000 questions: short answer label

In [17]:
# Put the shuffled conversations in a single 'conversations' column.
formatted_data = dict(conversations=data_total)
dataset = Dataset.from_dict(formatted_data)

# Show the dataset summary followed by its first record.
print(dataset)
print(dataset[0])

# Save the dataset under the 'train' folder of the Drive dataset directory.
dataset.save_to_disk(f"{drive_dataset_path}/train/train_questions_dataset_4000_questions_short_answer_labels")

Dataset({
    features: ['conversations'],
    num_rows: 4000
})
{'conversations': [{'from': 'human', 'value': 'Question: What advantage does tUAV (tethered UAV) have in terms of backhaul communication compared to uUAVs (untethered UAVs)?\nOptions:\noption 1: tUAVs have a wireless backhaul link with higher capacity compared to uUAVs.\noption 2: tUAVs have more resource blocks for serving mobile users compared to uUAVs.\noption 3: tUAVs have a virtual tether that ensures strong and reliable backhaul link.\noption 4: tUAVs have a wired connection to the GS with higher capacity compared to uUAVs.\noption 5: None of the above.\n'}, {'from': 'gpt', 'value': 'Answer: option 4: tUAVs have a wired connection to the GS with higher capacity compared to uUAVs.\nExplanation: tUAVs have a stable wired connection to the GS with significantly higher capacity compared to a wireless backhaul link in uUAVs.'}]}


Saving the dataset (0/1 shards):   0%|          | 0/4000 [00:00<?, ? examples/s]

## Dataset with 4000 questions: answer label