In [7]:
import os
import json
import glob

def is_valid_json(line):
    """Try to decode one piece of text as JSON.

    Args:
        line: Text to parse, e.g. a single line read from a JSONL file.

    Returns:
        A ``(ok, value)`` tuple: ``(True, decoded)`` when ``line`` parses as
        JSON, otherwise ``(False, None)``.
    """
    try:
        decoded = json.loads(line)
    except ValueError:
        # json.JSONDecodeError subclasses ValueError, so malformed text lands here.
        return False, None
    return True, decoded

def combine_jsonl_files(directory, output_file):
    """Merge every ``*.jsonl`` file in ``directory`` into one JSONL file.

    Each non-blank input line is parsed with ``is_valid_json``; lines that do
    not parse are dropped and counted. The surviving objects are written to
    ``output_file`` one per line, UTF-8 encoded, with non-ASCII characters
    written unescaped.

    Args:
        directory: Folder searched (non-recursively) for ``*.jsonl`` files.
        output_file: Path of the combined JSONL file; overwritten if present.
    """
    output_path = os.path.abspath(output_file)

    # Sort for a reproducible merge order (glob returns paths in arbitrary
    # order), and leave out the output file itself so that re-running with an
    # output located inside `directory` does not feed the previous result
    # back in and duplicate every record.
    jsonl_files = sorted(
        path
        for path in glob.glob(os.path.join(directory, "*.jsonl"))
        if os.path.abspath(path) != output_path
    )

    combined_data = []
    skipped = 0

    # Count from 1 so the final progress line reads N/N rather than N-1/N.
    for i, jsonl_file in enumerate(jsonl_files, start=1):
        with open(jsonl_file, 'r', encoding='utf-8') as file:
            for line in file:
                if not line.strip():
                    continue  # blank lines carry no data; ignore them quietly
                valid, json_obj = is_valid_json(line)
                if valid:
                    combined_data.append(json_obj)
                else:
                    skipped += 1
        print(f"Processed {i}/{len(jsonl_files)}")

    # Write only after every input has been read.
    with open(output_file, 'w', encoding='utf-8') as outfile:
        for json_obj in combined_data:
            outfile.write(json.dumps(json_obj, ensure_ascii=False) + '\n')

    print(f"Combined {len(combined_data)} valid JSON objects into {output_file}")
    if skipped:
        # Surface dropped lines instead of discarding them silently.
        print(f"Skipped {skipped} lines that were not valid JSON")
    


In [9]:
# Merge every JSONL file under ./test_jsonl into ./data/all_data.jsonl,
# which is the path the filtering step further down reads from.
os.makedirs('./data', exist_ok=True)  # open(..., 'w') does not create missing folders
combine_jsonl_files('./test_jsonl', './data/all_data.jsonl')

Processed 0/5452
Processed 1/5452
Processed 2/5452
Processed 3/5452
Processed 4/5452
Processed 5/5452
Processed 6/5452
Processed 7/5452
Processed 8/5452
Processed 9/5452
Processed 10/5452
Processed 11/5452
Processed 12/5452
Processed 13/5452
Processed 14/5452
Processed 15/5452
Processed 16/5452
Processed 17/5452
Processed 18/5452
Processed 19/5452
Processed 20/5452
Processed 21/5452
Processed 22/5452
Processed 23/5452
Processed 24/5452
Processed 25/5452
Processed 26/5452
Processed 27/5452
Processed 28/5452
Processed 29/5452
Processed 30/5452
Processed 31/5452
Processed 32/5452
Processed 33/5452
Processed 34/5452
Processed 35/5452
Processed 36/5452
Processed 37/5452
Processed 38/5452
Processed 39/5452
Processed 40/5452
Processed 41/5452
Processed 42/5452
Processed 43/5452
Processed 44/5452
Processed 45/5452
Processed 46/5452
Processed 47/5452
Processed 48/5452
Processed 49/5452
Processed 50/5452
Processed 51/5452
Processed 52/5452
Processed 53/5452
Processed 54/5452
Processed 55/5452
Pr

In [11]:
def is_valid_format(json_obj):
    """Check that a record holds a system/user/assistant message triple.

    A record passes when it has a ``"messages"`` entry that is a list of
    exactly three items whose ``"role"`` values are, in order, ``"system"``,
    ``"user"`` and ``"assistant"``. Message ``"content"`` is not inspected.

    Args:
        json_obj: A decoded JSON value, normally a dict.

    Returns:
        ``True`` if the record matches the layout above, ``False`` otherwise
        (including when ``json_obj`` or its messages have an unexpected type).
    """
    expected_roles = ("system", "user", "assistant")

    try:
        if "messages" not in json_obj:
            return False

        conversation = json_obj["messages"]
        if not (isinstance(conversation, list) and len(conversation) == 3):
            return False

        # Compare roles position by position; stops at the first mismatch.
        role_mismatch = any(
            entry["role"] != expected
            for entry, expected in zip(conversation, expected_roles)
        )
        return not role_mismatch
    except (KeyError, TypeError, IndexError):
        # Wrongly shaped input (non-dict record, message without "role", ...)
        # is treated as an invalid record rather than an error.
        return False

def filter_jsonl(input_file, output_file):
    """Copy the records of a JSONL file that pass ``is_valid_format``.

    Lines that are not valid JSON, and records rejected by
    ``is_valid_format``, are left out. The kept records are re-serialised to
    ``output_file`` one per line, and a short summary is printed.

    Args:
        input_file: Path of the JSONL file to read (UTF-8).
        output_file: Path of the JSONL file to write (UTF-8, overwritten).
    """
    kept_records = []

    with open(input_file, 'r', encoding='utf-8') as source:
        for raw_line in source:
            try:
                record = json.loads(raw_line)
            except json.JSONDecodeError:
                # Not parseable as JSON: drop the line and move on.
                continue
            if is_valid_format(record):
                kept_records.append(record)

    with open(output_file, 'w', encoding='utf-8') as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + '\n'
            for record in kept_records
        )

    print(f"Filtered data saved to {output_file}.")
    print(f"Total valid lines: {len(kept_records)}")

# Source and destination JSONL paths for the format filter
input_file = './data/all_data.jsonl'
output_file = './data/cleaned_all_data.jsonl'

# Keep only the records that pass is_valid_format
filter_jsonl(input_file=input_file, output_file=output_file)

Filtered data saved to ./data/cleaned_all_data.jsonl.
Total valid lines: 73966


In [12]:
import json
import random

def load_jsonl(file_path):
    """Read a JSONL file into a list, ignoring lines that fail to parse.

    Args:
        file_path: Path of the UTF-8 encoded JSONL file to read.

    Returns:
        A list with one decoded value per parseable line, in file order.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            try:
                parsed = json.loads(raw_line)
            except json.JSONDecodeError:
                # Blank or malformed line: skip it.
                continue
            records.append(parsed)
    return records

def save_jsonl(data, file_path):
    """Write an iterable of JSON-serialisable values as a JSONL file.

    Args:
        data: Values to serialise, one per output line.
        file_path: Destination path; the file is overwritten and written as
            UTF-8 with non-ASCII characters left unescaped.
    """
    with open(file_path, 'w', encoding='utf-8') as sink:
        sink.writelines(
            json.dumps(item, ensure_ascii=False) + '\n' for item in data
        )

def shuffle_and_split_jsonl(input_file, train_file, val_file, train_ratio=0.8, seed=None):
    """Shuffle a JSONL file and split it into training and validation files.

    The records are loaded with ``load_jsonl``, shuffled, and cut at
    ``int(train_ratio * len(data))``: everything before the cut goes to
    ``train_file``, the rest to ``val_file`` (both written with
    ``save_jsonl``). The size of each part is printed.

    Args:
        input_file: Path of the JSONL file to split.
        train_file: Output path for the training portion.
        val_file: Output path for the validation portion.
        train_ratio: Fraction of records assigned to the training set;
            must lie in ``[0, 1]``.
        seed: Optional seed for a repeatable shuffle. When ``None`` (the
            default) the module-level ``random`` generator is used, so the
            order depends on its current state.

    Raises:
        ValueError: If ``train_ratio`` is outside ``[0, 1]``.
    """
    # A negative ratio would turn into a negative slice index and a ratio
    # above 1 would quietly leave the validation set empty; reject both.
    if not 0 <= train_ratio <= 1:
        raise ValueError(f"train_ratio must be between 0 and 1, got {train_ratio!r}")

    data = load_jsonl(input_file)

    if seed is None:
        random.shuffle(data)  # Shuffle the data in place
    else:
        # A private generator keeps the split repeatable without touching
        # the global random state.
        random.Random(seed).shuffle(data)

    split_index = int(train_ratio * len(data))
    train_data = data[:split_index]
    val_data = data[split_index:]

    save_jsonl(train_data, train_file)
    save_jsonl(val_data, val_file)

    print(f"Saved {len(train_data)} entries to {train_file}")
    print(f"Saved {len(val_data)} entries to {val_file}")

# File paths
input_file = './data/cleaned_all_data.jsonl'
train_file = './data/training_set.jsonl'
val_file = './data/validation_set.jsonl'

# Seed the global RNG so the shuffle, and with it the train/validation split,
# is repeatable for the same input file.
random.seed(42)

# Shuffle and split the data
shuffle_and_split_jsonl(input_file, train_file, val_file, train_ratio=0.8)

Saved 59172 entries to ./data/training_set.jsonl
Saved 14794 entries to ./data/validation_set.jsonl
