## Combine and Shuffle CSV files
To prepare the dataset for training, we need a single combined file in which wontfix and non-wontfix issues are evenly mixed.

This makes sure that the created model isn't overly biased in one way or another.

In [4]:
import pandas as pd

# Header labels written as the first row of the combined CSV.
ISSUE_LIST_HEADERS = [
    "id", "title", "number", "repoId", "repoName", "labels", "state",
    "comments", "createdAt", "updatedAt", "closedAt", "body", "user", "userId",
]

# Input CSVs and the destination of the combined file.
# file1 lives under no-wontfix/; file2 is presumably the wontfix set — confirm.
file1, file2 = "../data/no-wontfix/issues.csv", "../data/issues.csv"
output_file = "../data/shuffled_issues.csv"

# Load both inputs, file1 first.
df1, df2 = (pd.read_csv(csv_path) for csv_path in (file1, file2))

# Stack the two frames under one fresh 0..n-1 index.
combined_df = pd.concat([df1, df2], ignore_index=True)

# Draw every row exactly once in random order; the fixed seed keeps the
# ordering reproducible. Then renumber the index.
shuffled_df = combined_df.sample(frac=1, random_state=42)
shuffled_df = shuffled_df.reset_index(drop=True)

# Write without the index column.
# NOTE(review): header= relabels the columns positionally, so this assumes the
# inputs already follow the ISSUE_LIST_HEADERS column order — confirm.
shuffled_df.to_csv(output_file, header=ISSUE_LIST_HEADERS, index=False)

print(f"Combined and shuffled CSV saved to {output_file}")


Combined and shuffled CSV saved to ../data/shuffled_issues.csv


## Create Dataset Format

This converts the shuffled CSV into a JSONL dataset for model training.

In [17]:
import json
import pandas as pd

# Define a function to process the CSV and convert it to JSONL format
def convert_csv_to_jsonl(csv_filepath, jsonl_filepath):
    """Convert an issues CSV into a JSONL file for model training.

    Each CSV row becomes one JSON object per line with three keys:
    ``title``, ``body`` and ``target``. ``target`` is ``"WONTFIX"`` when the
    row's ``labels`` text contains "wontfix" (case-insensitive) and
    ``"FIXED"`` otherwise.

    Args:
        csv_filepath: Path of the CSV to read. It must contain the columns
            ``title``, ``body`` and ``labels``.
        jsonl_filepath: Path of the JSONL file to write. Missing parent
            directories are created; an existing file is overwritten.

    Raises:
        KeyError: If one of the required columns is missing from the CSV.
    """
    from pathlib import Path

    text_columns = ("title", "body", "labels")

    # Read the text columns strictly as strings and switch off pandas' default
    # NA parsing. Otherwise an empty cell becomes float NaN, which breaks
    # `.lower()` on labels and turns into the literal text "nan" in the output,
    # and genuine text such as "NA" or "null" would be erased.
    df = pd.read_csv(
        csv_filepath,
        dtype={column: str for column in text_columns},
        keep_default_na=False,
    )

    # Resolve the columns before touching the output so that a missing column
    # fails without leaving an empty file behind.
    titles, bodies, labels = (df[column] for column in text_columns)

    Path(jsonl_filepath).parent.mkdir(parents=True, exist_ok=True)

    # Stream one JSON object per line instead of buffering every row first.
    with open(jsonl_filepath, "w", encoding="utf-8") as jsonl_file:
        for title, body, label_text in zip(titles, bodies, labels):
            issue_data = {
                "title": title,
                "body": body,
                "target": "WONTFIX" if "wontfix" in label_text.lower() else "FIXED",
            }
            jsonl_file.write(json.dumps(issue_data) + "\n")


In [18]:
# Source CSV (the shuffled issues) and destination JSONL for the conversion.
input_file_path, output_file_path = (
    "../data/shuffled_issues.csv",
    "../data/training-dataset/pulls.jsonl",
)

# Run the conversion; the JSONL file is written as a side effect.
convert_csv_to_jsonl(csv_filepath=input_file_path, jsonl_filepath=output_file_path)

# Show the destination path as the cell's output.
output_file_path

'../data/training-dataset/pulls.jsonl'