# Process the TeleQnA Dataset

Process the TeleQnA dataset to produce the datasets used to fine-tune a model and then test it.

In [1]:
import json

# Location of the raw TeleQnA dump (JSON content stored with a .txt extension).
teleQnA_questions_path = r"../datasets/TeleQnA/TeleQnA.txt"

# Read the whole file as UTF-8 text and decode it into a Python object.
with open(teleQnA_questions_path, encoding="utf-8") as dataset_file:
    teleQnA_dataset = json.loads(dataset_file.read())

# Report how many entries were loaded.
print(len(teleQnA_dataset))

10000


In [2]:
# Peek at the entry stored under the key "question 0" to see which fields a record carries.
teleQnA_dataset["question 0"]

{'question': 'What is the purpose of the Nmfaf_3daDataManagement_Deconfigure service operation? [3GPP Release 18]',
 'option 1': 'To configure the MFAF to map data or analytics received by the MFAF to out-bound notification endpoints',
 'option 2': 'To configure the MFAF to stop mapping data or analytics received by the MFAF to out-bound notification endpoints',
 'option 3': 'To supply data or analytics from the MFAF to notification endpoints',
 'option 4': 'To fetch data or analytics from the MFAF based on fetch instructions',
 'answer': 'option 2: To configure the MFAF to stop mapping data or analytics received by the MFAF to out-bound notification endpoints',
 'explanation': 'The Nmfaf_3daDataManagement_Deconfigure service operation is used to stop mapping data or analytics received by the MFAF to one or more out-bound notification endpoints.',
 'category': 'Standards specifications'}

# Choose only Release 17 Questions

Keep only the questions whose text contains "[3GPP Release 17]" and store them in a list.

In [3]:
# Walk the dataset records (the keys are not needed) and keep those whose
# question text carries the Release 17 tag, preserving dataset order.
rel17_questions = []
for entry in teleQnA_dataset.values():
    if "[3GPP Release 17]" in entry["question"]:
        rel17_questions.append(entry)

print(f"Total questions with '[3GPP Release 17]': {len(rel17_questions)}")

Total questions with '[3GPP Release 17]': 733


In [4]:
# Show the first Release 17 record as a sanity check of the filter.
rel17_questions[0]

{'question': 'How does a supporting UE attach to the same core network operator from which it detached in a shared network? [3GPP Release 17]',
 'option 1': 'It requests the core network node to remember its previous selection.',
 'option 2': 'It uses information stored in the UE when it was detached.',
 'option 3': 'It relies on the SIM/USIM card for information.',
 'option 4': 'It performs a fresh attach procedure.',
 'answer': 'option 2: It uses information stored in the UE when it was detached.',
 'explanation': 'A supporting UE in a shared network attaches to the same core network operator it detached from by using information stored in the UE when it was detached.',
 'category': 'Standards specifications'}

In [7]:
# Persist the Release 17 subset as pretty-printed UTF-8 JSON.
# A later cell reloads "../files/rel17_questions.json", so this export must run
# for the notebook to work after Restart Kernel -> Run All on a fresh checkout.
rel17_questions_path = r"../files/rel17_questions.json"
with open(rel17_questions_path, "w", encoding="utf-8") as file:
    json.dump(rel17_questions, file, indent=4, ensure_ascii=False)

# Choose 100 questions
* Create a dictionary to store the count of each category
* Extract categories from the questions and count occurrences
* Print unique categories and their counts

In [8]:
# Tally how many Release 17 questions fall under each category label.
# Records without a "category" field are counted under "Unknown".
category_counts = {}
for question in rel17_questions:
    label = question.get("category", "Unknown")
    category_counts[label] = category_counts.get(label, 0) + 1

# Report each label with its tally, in first-seen order.
print("Categories found and counts:")
for label, total in category_counts.items():
    print(f"- {label}: {total}")

Categories found and counts:
- Standards specifications: 641
- Standards overview: 92


In [9]:
# Target size of the subset built in the next cell (split evenly across categories there).
number_questions = 100

In [16]:
# Split the target evenly: every category gets the same quota
# (integer division, so any remainder is dropped).
questions_per_category = number_questions // len(category_counts)

rel17_100_questions = []

# For each known category, take its first `questions_per_category` questions in
# dataset order; a category with fewer questions contributes all it has.
for category in category_counts:
    matching = []
    for q in rel17_questions:
        if q.get("category", "Unknown") == category:
            matching.append(q)
    rel17_100_questions += matching[:questions_per_category]

len(rel17_100_questions)

100

In [17]:
# Show the subset size, then list every selected question (numbered from 1)
# together with its category.
print(f"\nTotal selected questions: {len(rel17_100_questions)}")
for position, item in enumerate(rel17_100_questions, start=1):
    print(f"{position}. {item['question']} (Category: {item['category']})")


Total selected questions: 100
1. How does a supporting UE attach to the same core network operator from which it detached in a shared network? [3GPP Release 17] (Category: Standards specifications)
2. When can the setting of the Privacy exception list be changed? [3GPP Release 17] (Category: Standards specifications)
3. What should the UE consider if it cannot detect any cell meeting the S criterion on a frequency for sidelink operation? [3GPP Release 17] (Category: Standards specifications)
4. When can target UEs be positioned for lawful interception services? [3GPP Release 17] (Category: Standards specifications)
5. What are DCI formats with CRC scrambled by MCCH-RNTI or G-RNTI referred to as? [3GPP Release 17] (Category: Standards specifications)
6. What kind of access can enterprise UEs have to non-public networks? [3GPP Release 17] (Category: Standards specifications)
7. What action is necessary to get access to services in Idle mode? [3GPP Release 17] (Category: Standards specif

In [18]:
# Disabled export of the 100-question subset.
# NOTE(review): no cell visible in this notebook reads rel17_100_questions.json back;
# enable this only if another script needs the file, otherwise consider deleting the cell.
## Save the selected questions to a new JSON file
#rel17_100_questions_path = r"../files/rel17_100_questions.json"
#with open(rel17_100_questions_path, "w", encoding="utf-8") as file:
#    json.dump(rel17_100_questions, file, indent=4, ensure_ascii=False)

# Choose 200 questions

In [21]:
def select_balanced_questions(questions, total, default_category="Unknown"):
    """Pick up to `total` questions, spread as evenly as possible over categories.

    Questions are grouped by their "category" field (`default_category` when the
    field is missing), keeping the input order inside each group. Every category
    starts with an equal share of `total`; when a category holds fewer questions
    than its share, the shortfall is handed to the categories that still have
    questions left. The result lists the categories in first-seen order, each
    contributing its leading questions.

    Args:
        questions: list of question dicts.
        total: number of questions wanted.
        default_category: label used for records without a "category" key.

    Returns:
        A new list with min(total, len(questions)) questions
        (empty when `total` <= 0 or `questions` is empty).
    """
    # Group the questions by category, preserving first-seen and per-group order.
    by_category = {}
    for q in questions:
        by_category.setdefault(q.get("category", default_category), []).append(q)

    quotas = {category: 0 for category in by_category}
    # Never ask for more than exists, so the loop below always has an open category.
    remaining = max(0, min(total, len(questions)))

    while remaining > 0:
        open_categories = [c for c in by_category if quotas[c] < len(by_category[c])]
        share = remaining // len(open_categories)
        if share == 0:
            # Fewer questions left to assign than open categories: one each, in order.
            for c in open_categories[:remaining]:
                quotas[c] += 1
            break
        for c in open_categories:
            take = min(share, len(by_category[c]) - quotas[c])
            quotas[c] += take
            remaining -= take

    selected = []
    for category, items in by_category.items():
        selected.extend(items[:quotas[category]])
    return selected


# Real target size. The previous version asked for 216 so that 216 // 2 = 108
# "Standards specifications" plus the only 92 "Standards overview" questions
# added up to 200; the helper performs that redistribution explicitly.
number_questions = 200

rel17_200_questions = select_balanced_questions(rel17_questions, number_questions)

len(rel17_200_questions)

200

In [22]:
# Write the 200-question subset to disk as pretty-printed UTF-8 JSON
# (non-ASCII characters are kept as-is rather than escaped).
rel17_200_questions_path = r"../files/rel17_200_questions.json"
with open(rel17_200_questions_path, mode="w", encoding="utf-8") as out_file:
    json.dump(rel17_200_questions, out_file, ensure_ascii=False, indent=4)

# Take only release 18 Questions

* Filter only the questions that contain "[3GPP Release 18]" in the question text and save in a list

In [23]:
# Walk the dataset records (the keys are not needed) and keep those whose
# question text carries the Release 18 tag, preserving dataset order.
rel18_questions = []
for entry in teleQnA_dataset.values():
    if "[3GPP Release 18]" in entry["question"]:
        rel18_questions.append(entry)

print(f"Total questions with '[3GPP Release 18]': {len(rel18_questions)}")

Total questions with '[3GPP Release 18]': 780


In [25]:
# Disabled export of the Release 18 subset.
# NOTE(review): no cell visible in this notebook reads rel18_questions.json back;
# enable this only if another script needs the file, otherwise consider deleting the cell.
## Save the filtered questions to a new JSON file
#rel18_questions_path = r"../files/rel18_questions.json"
#with open(rel18_questions_path, "w", encoding="utf-8") as file:
#    json.dump(rel18_questions, file, indent=4, ensure_ascii=False)

# Separate the TeleQnA questions that are not from Release 17 or 18

In [26]:
import json

# Keep every question whose text carries neither release tag.
# rel17_questions and rel18_questions were built from exactly these tags, so
# testing the tags directly selects the same records as checking membership in
# those lists, without comparing each record against every dict in both lists.
release_tags = ("[3GPP Release 17]", "[3GPP Release 18]")
questions_no_rel_17_18 = [
    entry
    for entry in teleQnA_dataset.values()
    if not any(tag in entry["question"] for tag in release_tags)
]
len(questions_no_rel_17_18)

8487

In [28]:
# Persist the questions outside Releases 17 and 18 as pretty-printed UTF-8 JSON.
# A later cell reloads "../files/no_rel_17_18_questions.json", so this export must
# run for the notebook to work after Restart Kernel -> Run All on a fresh checkout.
questions_no_rel_17_18_path = r"../files/no_rel_17_18_questions.json"
with open(questions_no_rel_17_18_path, "w", encoding="utf-8") as file:
    json.dump(questions_no_rel_17_18, file, indent=4, ensure_ascii=False)

# Choose data that were not used in fine-tuning

## Questions outside Releases 17 and 18 that were not used in training

In [29]:
# JSON file holding the TeleQnA questions that are outside Releases 17 and 18.
no_rel_17_18_questions_path = r"../files/no_rel_17_18_questions.json"

# Read the file as UTF-8 text and decode it, then report the number of questions.
with open(no_rel_17_18_questions_path, encoding="utf-8") as source_file:
    no_rel_17_18_questions = json.loads(source_file.read())
print(len(no_rel_17_18_questions))

8487


In [30]:
# The first 3500 questions (in file order) form the training portion.
training_dataset_length = 3500
training_dataset = no_rel_17_18_questions[:training_dataset_length]
print(len(training_dataset))

3500


In [31]:
# Everything after the training portion is set aside as not used for training.
no_training_dataset = no_rel_17_18_questions[training_dataset_length:]
print(len(no_training_dataset))

4987


In [32]:
# Write the held-out questions from other releases to disk as pretty-printed
# UTF-8 JSON (non-ASCII characters are kept as-is rather than escaped).
other_rel_questions_path = r"../files/other_rel_questions.json"
with open(other_rel_questions_path, mode="w", encoding="utf-8") as out_file:
    json.dump(no_training_dataset, out_file, ensure_ascii=False, indent=4)

# Questions in release 17 that were not used in training

In [33]:
# JSON file holding every Release 17 question.
rel17_question_path = r"../files/rel17_questions.json"

# Read the file as UTF-8 text and decode it into the list of questions.
with open(rel17_question_path, encoding="utf-8") as source_file:
    rel17_questions = json.loads(source_file.read())

print(len(rel17_questions))

733


In [34]:
# JSON file holding the 200-question Release 17 subset saved earlier in this notebook.
rel17_200_questions_path = r"../files/rel17_200_questions.json"

# Read the file as UTF-8 text and decode it into the list of questions.
with open(rel17_200_questions_path, encoding="utf-8") as source_file:
    rel17_200_questions = json.loads(source_file.read())

print(len(rel17_200_questions))

200


In [35]:
# Collect the Release 17 questions that are not part of the 200-question subset,
# keeping their original order (records are compared by dict equality).
rel17_other_questions = []
for candidate in rel17_questions:
    if candidate in rel17_200_questions:
        continue
    rel17_other_questions.append(candidate)
print(len(rel17_other_questions))

533


In [36]:
# Number of leading entries of rel17_other_questions to leave out of the held-out pool.
# NOTE(review): a later comment in this notebook says 500 Release 17 questions were
# used for training; presumably these are those questions. Confirm against the fine-tuning script.
rel17_other_questions_length = 500
# Held-out Release 17 pool: the 200-question subset plus whatever remains of
# rel17_other_questions after skipping its first 500 entries.
rel17_no_training_questions = rel17_200_questions + rel17_other_questions[rel17_other_questions_length:]
print(len(rel17_no_training_questions))

233


# Final possible test dataset (Questions with '[3GPP Release 18]' not included, possible to use in other tests)

In [37]:
# Merge the two held-out pools: Release 17 questions first, then the questions from other releases.
possible_test_dataset = rel17_no_training_questions + no_training_dataset
print(len(possible_test_dataset)) # 233 Release 17 questions + 4987 other questions (Release 18 not included)
# Training used 500 Release 17 questions and 3500 of the other questions: 4000 questions in total.

5220


In [38]:
# Write the combined held-out pool to disk as pretty-printed UTF-8 JSON
# (non-ASCII characters are kept as-is rather than escaped).
possible_test_dataset_path = r"../files/possible_test_dataset.json"
with open(possible_test_dataset_path, mode="w", encoding="utf-8") as out_file:
    json.dump(possible_test_dataset, out_file, ensure_ascii=False, indent=4)