## Combine and Shuffle CSV files
In order to prepare the dataset for training, we need a combined file that contains both wontfix and non-wontfix issues evenly.

This makes sure that the created model isn't overly biased in one way or another.

In [1]:
import pandas as pd

# Column names written as the header row of the combined CSV
# (passed to `to_csv` as `header=` aliases, so they apply by position).
ISSUE_LIST_HEADERS = [
    "id",
    "title",
    "number",
    "repoId",
    "repoName",
    "labels",
    "state",
    "comments",
    "createdAt",
    "updatedAt",
    "closedAt",
    "body",
    "user",
    "userId",
]

# Input CSVs and the destination of the merged result
# (judging by its path, file1 presumably holds the non-wontfix issues — TODO confirm)
file1 = "../data/no-wontfix/issues.csv"
file2 = "../data/issues.csv"
output_file = "../data/shuffled_issues.csv"

# Load both inputs, in order
df1, df2 = (pd.read_csv(csv_path) for csv_path in (file1, file2))

# Stack the two frames on top of each other with a fresh 0..n-1 index
combined_df = pd.concat((df1, df2), ignore_index=True)

# Randomise the row order (fixed seed keeps it repeatable), then renumber the rows
shuffled_df = combined_df.sample(frac=1, random_state=42)
shuffled_df = shuffled_df.reset_index(drop=True)

# Write the result without the index column, using the header names above
shuffled_df.to_csv(output_file, header=ISSUE_LIST_HEADERS, index=False)

print(f"Combined and shuffled CSV saved to {output_file}")


Combined and shuffled CSV saved to ../data/shuffled_issues.csv


## Create Dataset Format

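This converts the shuffled CSV into a JSONL dataset for model training. Each line is one issue with its `title`, its `body`, and a `target` that is `WONTFIX` when the issue's labels contain "wontfix" and `FIXED` otherwise.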

In [2]:
import json
import pandas as pd

# Define a function to process the CSV and convert it to JSONL format
def convert_csv_to_jsonl(csv_filepath, jsonl_filepath):
    """Convert an issues CSV into a JSONL training file.

    Every CSV row becomes one JSON object on its own line with three keys:
    ``title``, ``body`` and ``target``. ``target`` is ``"WONTFIX"`` when the
    row's ``labels`` text contains "wontfix" (case-insensitive) and
    ``"FIXED"`` otherwise. All other CSV columns are ignored.

    Args:
        csv_filepath: Path of the CSV file to read. It must contain the
            ``title``, ``body`` and ``labels`` columns.
        jsonl_filepath: Path of the JSONL file to write. An existing file is
            overwritten.

    Raises:
        KeyError: If one of the required columns is missing from the CSV.
    """
    text_columns = ("title", "body", "labels")

    # Read the text columns as plain strings and switch off NaN inference.
    # With the defaults, an empty body is later written as the literal "nan",
    # a title such as "NA" or "null" is treated as missing, and an empty
    # labels cell becomes a float NaN on which `.lower()` raises
    # AttributeError.
    df = pd.read_csv(
        csv_filepath,
        dtype={column: str for column in text_columns},
        keep_default_na=False,
    )

    # Pull the columns out before opening the output file, so a missing column
    # fails without truncating an existing JSONL file. The fillna is purely
    # defensive: any value that still arrives as missing is written as "".
    titles, bodies, labels = (
        df[column].fillna("").astype(str) for column in text_columns
    )

    # Stream the records straight to disk; zipping the columns avoids the
    # per-row Series construction of `iterrows()` and an in-memory record list.
    with open(jsonl_filepath, "w", encoding="utf-8") as jsonl_file:
        for title, body, label_text in zip(titles, bodies, labels):
            record = {
                "title": title,
                "body": body,
                "target": "WONTFIX" if "wontfix" in label_text.lower() else "FIXED",
            }
            jsonl_file.write(json.dumps(record) + "\n")


In [4]:
# CSV to read and JSONL file to produce
conversion_input_file = "../data/shuffled_issues.csv"
conversion_output_file = "../data/training-dataset/issues/all.jsonl"

# Run the conversion; the JSONL file is written as a side effect
convert_csv_to_jsonl(
    csv_filepath=conversion_input_file,
    jsonl_filepath=conversion_output_file,
)

# Show the path of the created JSONL file as the cell result
conversion_output_file

'../data/training-dataset/issues/all.jsonl'

## Split the Dataset
This will split the dataset into training and testing datasets. By default 80% of the data will be used for training and 20% for testing.

In [7]:
import json
import random

# File paths
datasetsplit_input = '../data/training-dataset/issues/all.jsonl'
datasetsplit_training_output = '../data/training-dataset/issues/training.jsonl'
datasetsplit_testing_output = '../data/training-dataset/issues/testing.jsonl'

# Split configuration
DATASETSPLIT_TRAIN_FRACTION = 0.8  # share of the samples that goes to training
DATASETSPLIT_SEED = 42             # fixed seed so the split is identical on every run

# Read the input JSONL file. Blank lines are skipped so they are not counted as
# samples, and every record is forced to end with a newline: after shuffling,
# an unterminated last line would otherwise be glued to the record that follows it.
with open(datasetsplit_input, 'r', encoding='utf-8') as f:
    lines = [
        line if line.endswith('\n') else line + '\n'
        for line in f
        if line.strip()
    ]

# Shuffle with a dedicated, seeded generator so the result is reproducible and
# the global `random` state is left untouched
random.Random(DATASETSPLIT_SEED).shuffle(lines)

# Calculate the split index
split_index = int(DATASETSPLIT_TRAIN_FRACTION * len(lines))

# Split the data
training_lines = lines[:split_index]
testing_lines = lines[split_index:]

# Write the training samples to training.jsonl
with open(datasetsplit_training_output, 'w', encoding='utf-8') as f:
    f.writelines(training_lines)

# Write the testing samples to testing.jsonl
with open(datasetsplit_testing_output, 'w', encoding='utf-8') as f:
    f.writelines(testing_lines)

print(f"Data split into {datasetsplit_training_output} and {datasetsplit_testing_output}")
print(f"Training data: {len(training_lines)} samples")
print(f"Testing data: {len(testing_lines)} samples")


Data split into ../data/training-dataset/issues/training.jsonl and ../data/training-dataset/issues/testing.jsonl
Training data: 253600 samples
Testing data: 63400 samples


In [8]:
import random

# File paths and size of the reduced sample
reduce_file_input = '../data/training-dataset/issues/training.jsonl'
reduce_file_output = '../data/training-dataset/issues/reduced.jsonl'
output_line_total = 100
REDUCE_SEED = 42  # fixed seed so the same subset is selected on every run

# Read the training samples. Blank lines are skipped and every record is forced
# to end with a newline so that records cannot be glued together once reordered.
with open(reduce_file_input, 'r', encoding='utf-8') as f:
    lines = [
        line if line.endswith('\n') else line + '\n'
        for line in f
        if line.strip()
    ]

# Shuffle with a dedicated, seeded generator so the selection is reproducible
# and the global `random` state is left untouched
random.Random(REDUCE_SEED).shuffle(lines)

# Keep at most `output_line_total` samples (fewer if the input is smaller)
reduced_lines = lines[:output_line_total]

# Write the reduced sample to reduced.jsonl
with open(reduce_file_output, 'w', encoding='utf-8') as f:
    f.writelines(reduced_lines)

print(f"Data reduced to {reduce_file_output}")

Data reduced to ../data/training-dataset/issues/reduced.jsonl
