<a href="https://colab.research.google.com/github/johnmelwin/ResearchProject1/blob/main/1_Traceability_dataset_preprocessing_for_GPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **PreProcess the data**

In [None]:
import json

# Define the paths to the JSON files
traces_file_path = '/content/traces.json'
methods_file_path = '/content/methods.json'

# Load the traces.json file (read explicitly as UTF-8 rather than relying on
# the platform default encoding)
with open(traces_file_path, 'r', encoding='utf-8') as file:
    traces_data = json.load(file)

# Load the methods.json file
with open(methods_file_path, 'r', encoding='utf-8') as file:
    methods_data = json.load(file)

# Index method implementations by id once, instead of rescanning the whole
# methods list for every trace. setdefault keeps the FIRST entry seen for an
# id, which is what a linear scan that stops at the first match would return.
method_by_id = {}
for method in methods_data:
    method_by_id.setdefault(method.get("id"), method.get("method"))

# Initialize a list to hold the content for the JSONL output
jsonl_content = []

# Initialize counters for each goldfinal value
count_T = 0
count_E = 0
count_N = 0

# Iterate through each trace in traces_data
for trace in traces_data:
    # Look up the method implementation; None when no method has this id
    method_implementation = method_by_id.get(trace["methodid"])

    # Determine the traceability status and update counts based on the 'goldfinal' value
    if trace["goldfinal"] == "E":
        gold_final_interpretation = "unsure"
        count_E += 1
    elif trace["goldfinal"] == "T":
        gold_final_interpretation = "traceable"
        count_T += 1
    else:
        # Any value other than "E"/"T" is counted and labelled as "N"
        gold_final_interpretation = "not traceable"
        count_N += 1

    # Create an entry for the JSONL content (system / user / assistant chat turns)
    jsonl_entry = {
        "messages": [
            {"role": "system", "content": "You are a tracelink identifying system between a requirement and method details."},
            {"role": "user", "content": f"Check if the following requirement: {trace['requirement']} is linked with method name: {trace['methodname']} and method implementation: {method_implementation}"},
            {"role": "assistant", "content": gold_final_interpretation}
        ]
    }
    jsonl_content.append(jsonl_entry)

# Print the counts for each goldfinal value
print(f'Count of T (traceable): {count_T}')
print(f'Count of E (unsure): {count_E}')
print(f'Count of N (not traceable): {count_N}')
print(f'Count of Total Traces: {count_T + count_N + count_E}')
# Convert the list to JSONL string (one JSON document per line)
jsonl_str = "\n".join(json.dumps(entry) for entry in jsonl_content)


KeyboardInterrupt: 

In [None]:
jsonl_file_path = '/content/total_traces_links.jsonl'

# Write the JSONL string to the file. The encoding is stated explicitly and the
# final record gets a line terminator, so every record is a complete line --
# the same layout save_jsonl_content produces for the binary datasets.
with open(jsonl_file_path, 'w', encoding='utf-8') as file:
    file.write(jsonl_str)
    if jsonl_str:
        # Skip the terminator for empty output so no blank line is written
        file.write("\n")

# Confirm the path to the saved JSONL file
jsonl_file_path

'/content/total_traces_links.jsonl'

## **Binary Pre-Processing**

In [None]:
import json
import random

# Function to sample a specific number of traces based on user input
def sample_specific_traces(traces, train_count, validation_count, test_count):
    """Randomly split ``traces`` into disjoint train/validation/test samples.

    The input is shuffled on a private copy, so the caller's list keeps its
    order. The three samples are consecutive, non-overlapping slices of that
    shuffled copy.

    Args:
        traces: Sequence of trace records to sample from.
        train_count: Number of records requested for the training sample.
        validation_count: Number of records requested for the validation sample.
        test_count: Number of records requested for the testing sample.

    Returns:
        A ``(train_traces, validation_traces, test_traces)`` tuple of lists.
        If ``traces`` holds fewer records than requested in total, the later
        samples are shorter than requested (possibly empty).

    Raises:
        ValueError: If any requested count is negative; a negative count would
            otherwise be interpreted as a "from the end" slice bound.
    """
    for label, count in (
        ("train_count", train_count),
        ("validation_count", validation_count),
        ("test_count", test_count),
    ):
        if count < 0:
            raise ValueError(f"{label} must be non-negative, got {count}")

    # Shuffle a copy: same use of the random module, no mutation of the argument
    shuffled = list(traces)
    random.shuffle(shuffled)

    validation_end = train_count + validation_count
    test_end = validation_end + test_count
    train_traces = shuffled[:train_count]
    validation_traces = shuffled[train_count:validation_end]
    test_traces = shuffled[validation_end:test_end]
    return train_traces, validation_traces, test_traces

# Seed for every shuffle below so a re-run with the same answers reproduces the
# same split. Set to None to draw a different sample on each run.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Load the traces.json and methods.json files. Both use the same /content
# location as the first cell (methods.json was previously opened relative to
# the working directory).
with open('/content/traces.json', 'r', encoding='utf-8') as file:
    traces_data = json.load(file)
with open('/content/methods.json', 'r', encoding='utf-8') as file:
    methods_data = json.load(file)

# Separate traces based on goldfinal value, excluding traces with goldfinal value "E"
traces_T = [trace for trace in traces_data if trace["goldfinal"] == "T"]
traces_N = [trace for trace in traces_data if trace["goldfinal"] == "N"]

# Interactive input for the sizes of the training, validation, and testing sets and their compositions
training_set_size = int(input("Enter the desired size for the training set: "))
validation_set_size = int(input("Enter the desired size for the validation set: "))
testing_set_size = int(input("Enter the desired size for the testing set: "))

# Input for the number of 'T' traces in each set; the 'N' count is the remainder
training_T_count = int(input("Enter the number of 'T' traces for the training set: "))
training_N_count = training_set_size - training_T_count
validation_T_count = int(input("Enter the number of 'T' traces for the validation set: "))
validation_N_count = validation_set_size - validation_T_count
testing_T_count = int(input("Enter the number of 'T' traces for the testing set: "))
testing_N_count = testing_set_size - testing_T_count

# Reject impossible compositions up front: a 'T' count larger than the set size
# gives a negative 'N' count, which slicing would treat as "all but the last k".
for split_name, t_count, n_count in (
    ("training", training_T_count, training_N_count),
    ("validation", validation_T_count, validation_N_count),
    ("testing", testing_T_count, testing_N_count),
):
    if t_count < 0 or n_count < 0:
        raise ValueError(
            f"Invalid composition for the {split_name} set: "
            f"'T' count = {t_count}, 'N' count = {n_count}; both must be >= 0."
        )

# Warn (without aborting) when more traces are requested than exist, because
# the later splits will then come out smaller than requested.
for pool_label, pool, requested in (
    ("T", traces_T, training_T_count + validation_T_count + testing_T_count),
    ("N", traces_N, training_N_count + validation_N_count + testing_N_count),
):
    if requested > len(pool):
        print(f"Warning: requested {requested} '{pool_label}' traces but only {len(pool)} are available.")

# Sample the specified numbers of 'T' and 'N' traces for training, validation, and testing sets
train_T, validation_T, test_T = sample_specific_traces(traces_T, training_T_count, validation_T_count, testing_T_count)
train_N, validation_N, test_N = sample_specific_traces(traces_N, training_N_count, validation_N_count, testing_N_count)

# Combine and shuffle the training, validation, and testing sets so 'T' and 'N'
# records are interleaved rather than grouped
training_data = train_T + train_N
validation_data = validation_T + validation_N
testing_data = test_T + test_N
random.shuffle(training_data)
random.shuffle(validation_data)
random.shuffle(testing_data)

# Function to create JSONL content (chat-format records) from traces
def create_jsonl_content(traces, methods=None):
    """Build one chat-format record per trace for the binary True/False task.

    Args:
        traces: Iterable of trace dicts; each is read for the keys
            ``"methodid"``, ``"goldfinal"``, ``"requirement"`` and
            ``"methodname"``.
        methods: Optional iterable of method dicts (read via ``.get("id")`` and
            ``.get("method")``). Defaults to the notebook-level
            ``methods_data`` when omitted.

    Returns:
        A list of ``{"messages": [system, user, assistant]}`` dicts in the same
        order as ``traces``. The assistant content is ``"True"`` for goldfinal
        ``"T"``, ``"False"`` for ``"N"`` and ``"unsure"`` for anything else.
        A trace whose method id is not found gets ``None`` interpolated as the
        method implementation.
    """
    if methods is None:
        methods = methods_data

    # Index implementations by id once instead of scanning the full methods
    # list for every trace. setdefault keeps the first entry seen for an id,
    # matching a scan that stops at the first hit.
    implementation_by_id = {}
    for method in methods:
        implementation_by_id.setdefault(method.get("id"), method.get("method"))

    label_by_goldfinal = {"T": "True", "N": "False"}

    jsonl_content = []
    for trace in traces:
        method_implementation = implementation_by_id.get(trace["methodid"])
        gold_final_interpretation = label_by_goldfinal.get(trace["goldfinal"], "unsure")
        jsonl_entry = {
            "messages": [
                {"role": "system", "content": "You are a traceability identifying system. Your task is to analyze if a given requirement is linked with the specified method name and its implementation."},
                {"role": "user", "content": f"Check if the following REQUIREMENT: '{trace['requirement']}' is linked with METHOD NAME: '{trace['methodname']}' and METHOD IMPLEMENTATION: '{method_implementation}'. After analysis, only state whether the requirement is linked ('True') or not linked ('False')"},
                {"role": "assistant", "content": gold_final_interpretation}
            ]
        }
        jsonl_content.append(jsonl_entry)
    return jsonl_content

# Turn each split into chat-format records (training, validation, testing order)
training_jsonl_content, validation_jsonl_content, testing_jsonl_content = [
    create_jsonl_content(split_rows)
    for split_rows in (training_data, validation_data, testing_data)
]

# Report the total size of every split ...
for split_label, split_rows in (
    ("Training", training_data),
    ("Validation", validation_data),
    ("Testing", testing_data),
):
    print(f"{split_label} set size: {len(split_rows)}")

# ... followed by its 'T' / 'N' composition
for split_label, t_rows, n_rows in (
    ("Training", train_T, train_N),
    ("Validation", validation_T, validation_N),
    ("Testing", test_T, test_N),
):
    print(f"{split_label} Dataset Counts: {{'T': {len(t_rows)}, 'N': {len(n_rows)}}}")

# From the traces.json (total)
# "goldfinal":"N" 26,842
# "goldfinal":"T" 1,429
# T is ~5.1% of (T+N), i.e. 1,429 / 28,271

# 5000 total for training
# 800 for T in training

# 1000 total for validation
# 150 for T in validation

# 1000 total for testing
# 150 for T in testing

Enter the desired size for the training set: 1000
Enter the desired size for the validation set: 10
Enter the desired size for the testing set: 1000
Enter the number of 'T' traces for the training set: 150
Enter the number of 'T' traces for the validation set: 5
Enter the number of 'T' traces for the testing set: 150
Training set size: 1000
Validation set size: 10
Testing set size: 1000
Training Dataset Counts: {'T': 150, 'N': 850}
Validation Dataset Counts: {'T': 5, 'N': 5}
Testing Dataset Counts: {'T': 150, 'N': 850}


Saving the JSONL


In [None]:
# Summarise the splits again before saving: total size first, then T/N make-up
split_summary = {
    "Training": (training_data, train_T, train_N),
    "Validation": (validation_data, validation_T, validation_N),
    "Testing": (testing_data, test_T, test_N),
}

for split_label, (all_rows, _, _) in split_summary.items():
    print(f"{split_label} set size: {len(all_rows)}")

for split_label, (_, t_rows, n_rows) in split_summary.items():
    print(f"{split_label} Dataset Counts: {{'T': {len(t_rows)}, 'N': {len(n_rows)}}}")

# Function to save JSONL content to a file with dataset size in the filename
def save_jsonl_content(filename, size, content):
    """Write ``content`` as JSON Lines to ``<filename stem>_<size>.jsonl``.

    Args:
        filename: Base path for the output. A trailing extension, if any, is
            dropped; directory components are kept untouched even when they
            contain dots.
        size: Value appended to the stem (the callers pass the dataset length).
        content: Iterable of JSON-serialisable entries, written one per line.

    Returns:
        The path of the file that was written.
    """
    import os  # local import so the function does not depend on another cell's imports

    # splitext only strips the final extension of the last path component;
    # splitting on the first '.' would cut paths such as './out/train' or
    # 'data.v2/train' in the wrong place.
    stem, _extension = os.path.splitext(filename)
    filename_with_size = f"{stem}_{size}.jsonl"
    with open(filename_with_size, 'w', encoding='utf-8') as f:
        # One JSON document per line, each terminated by a newline
        f.writelines(json.dumps(entry) + "\n" for entry in content)
    return filename_with_size

# Persist the three splits; each file name is suffixed with its split's size
for base_name, split_rows, split_records in (
    ('GPT4_training_dataset_binary_JHOT', training_data, training_jsonl_content),
    ('GPT4_validation_dataset_binary_JHOT', validation_data, validation_jsonl_content),
    ('GPT4_testing_dataset_binary_JHOT', testing_data, testing_jsonl_content),
):
    save_jsonl_content(base_name, len(split_rows), split_records)

print("Training, validation, and testing datasets have been saved with sizes in the filenames.")

Training set size: 1000
Validation set size: 10
Testing set size: 1000
Training Dataset Counts: {'T': 150, 'N': 850}
Validation Dataset Counts: {'T': 5, 'N': 5}
Testing Dataset Counts: {'T': 150, 'N': 850}
Training, validation, and testing datasets have been saved with sizes in the filenames.


In [None]:
import json
import pandas as pd

# Path to the uploaded file
# NOTE(review): the save cell writes 'GPT4_testing_dataset_binary_JHOT_<size>.jsonl';
# this path presumably points at a separately uploaded/renamed copy -- confirm.
file_path = '/content/testing_dataset_binary_1000.jsonl'

# Initialize lists to store data (one element per record, kept in file order)
ids = []
inputs = []
traceability = []

# Read the file and extract required information
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        # Tolerate blank lines (e.g. a stray empty line at the end of the file)
        if not line.strip():
            continue

        data = json.loads(line)
        user_content = ""
        assistant_content = ""
        # If a role occurs more than once, the last message of that role is used
        for message in data['messages']:
            if message['role'] == 'user':
                user_content = message['content']
            elif message['role'] == 'assistant':
                assistant_content = message['content']

        # Populate lists; IDs are 1-based and count records, not raw lines
        ids.append(len(ids) + 1)
        inputs.append(user_content)
        # Only the exact label "True" is treated as traceable
        traceability.append(assistant_content == "True")

# Create a DataFrame
df = pd.DataFrame({'ID': ids, 'input': inputs, 'Traceability': traceability})

# Define the output CSV file path
output_csv_path = 'Final_Metrics.csv'

# Save the DataFrame to CSV
df.to_csv(output_csv_path, index=False)

# Return the path of the created CSV file
output_csv_path

'Final_Metrics.csv'