In [None]:
import json
import os
from google.colab import files
from datetime import datetime

# Merge several uploaded JSONL files into one timestamped JSONL file and
# offer it for download. Malformed lines are reported and skipped.
import io  # local to this cell: wraps the uploaded bytes as a text stream

# Step 1: Upload JSONL files
print("📤 Please upload two or more JSONL files...")
uploaded = files.upload()

# Step 2: Read each dataset, recording how many records it contributes
dataset_lengths = {}  # filename -> number of successfully parsed records
merged_data = []      # parsed records from every file, in upload order

for filename, content in uploaded.items():
    print(f"Reading {filename}...")
    record_count = 0
    # Parse the bytes handed back by files.upload() rather than re-opening
    # `filename` from disk. Colab can save a re-uploaded file under a
    # de-duplicated name (e.g. "train (1).jsonl") while this dict stays keyed
    # by the original name, in which case open(filename) would silently read
    # the older copy.
    with io.TextIOWrapper(io.BytesIO(content), encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:  # avoid blank lines
                continue
            try:
                merged_data.append(json.loads(line))
            except json.JSONDecodeError as e:
                # Best effort: report the bad line and keep merging the rest.
                print(f"Error decoding JSON in file {filename}: {e}")
            else:
                record_count += 1
    dataset_lengths[filename] = record_count
    print(f" - {filename}: {record_count} records")

# Step 3: Save merged data to new JSONL file
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_filename = f"merged_dataset_{timestamp}.jsonl"

with open(output_filename, 'w', encoding='utf-8') as outfile:
    for item in merged_data:
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        outfile.write(json.dumps(item, ensure_ascii=False) + '\n')

print(f"\n✅ Merged {len(uploaded)} files into {output_filename} with {len(merged_data)} records.")

# Step 4: Provide the file for download
files.download(output_filename)


📤 Please upload two or more JSONL files...


Saving train.jsonl to train.jsonl
Saving train_1.jsonl to train_1.jsonl
Saving train_2.jsonl to train_2.jsonl
Saving train_3.jsonl to train_3.jsonl
Reading train.jsonl...
Reading train_1.jsonl...
Reading train_2.jsonl...
Reading train_3.jsonl...

✅ Merged 4 files into merged_dataset_20250716_142209.jsonl with 32418 records.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import json
from google.colab import files
from datetime import datetime

# Merge several uploaded JSONL files, report per-file and total record
# counts, then save the result to a timestamped JSONL file for download.
import io  # local to this cell: wraps the uploaded bytes as a text stream

# Step 1: Upload JSONL files
print("📤 Please upload two or more JSONL files...")
uploaded = files.upload()

# Step 2: Read each dataset and record its length
dataset_lengths = {}  # filename -> number of successfully parsed records
merged_data = []      # parsed records from every file, in upload order

for filename, content in uploaded.items():
    record_count = 0
    print(f"\n📄 Reading {filename}...")
    # Parse the bytes handed back by files.upload() rather than re-opening
    # `filename` from disk. Colab can save a re-uploaded file under a
    # de-duplicated name (e.g. "validation (1).jsonl") while this dict stays
    # keyed by the original name, in which case open(filename) would silently
    # read the older copy.
    with io.TextIOWrapper(io.BytesIO(content), encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:  # blank lines carry no record
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                # Best effort: report the bad line and keep merging the rest.
                print(f"⚠️ Error decoding JSON in file {filename}: {e}")
            else:
                merged_data.append(record)
                record_count += 1
    dataset_lengths[filename] = record_count

# Step 3: Show individual and total lengths
print("\n📊 Dataset Record Counts:")
for file, count in dataset_lengths.items():
    print(f" - {file}: {count} records")

print(f"\n🧮 Total merged records: {len(merged_data)}")

# Step 4: Save merged data to a new JSONL file
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_filename = f"merged_dataset_{timestamp}.jsonl"

with open(output_filename, 'w', encoding='utf-8') as outfile:
    for item in merged_data:
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        outfile.write(json.dumps(item, ensure_ascii=False) + '\n')

print(f"\n✅ Merged dataset saved as: {output_filename}")

# Step 5: Download merged file
files.download(output_filename)


📤 Please upload two or more JSONL files...


Saving validation.jsonl to validation.jsonl
Saving validation_1.jsonl to validation_1.jsonl
Saving validation_2.jsonl to validation_2.jsonl
Saving validation_3.jsonl to validation_3.jsonl

📄 Reading validation.jsonl...

📄 Reading validation_1.jsonl...

📄 Reading validation_2.jsonl...

📄 Reading validation_3.jsonl...

📊 Dataset Record Counts:
 - validation.jsonl: 3354 records
 - validation_1.jsonl: 838 records
 - validation_2.jsonl: 243 records
 - validation_3.jsonl: 1295 records

🧮 Total merged records: 5730

✅ Merged dataset saved as: merged_dataset_20250716_142957.jsonl


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Step 6: Print the first 5 records of JSONL dataset

from google.colab import files
import json

# Preview the first 5 records of each uploaded JSONL file as plain text.
import io  # local to this cell: wraps the uploaded bytes as a text stream

# Step 1: Upload the file
uploaded = files.upload()

# Step 2: Read and print the first 5 records
for filename, content in uploaded.items():
    print(f"\nFirst 5 records from: {filename}\n" + "-"*40)
    shown = 0  # records printed so far; blank lines do not count
    # Parse the bytes handed back by files.upload() rather than re-opening
    # `filename` from disk. Colab can save a re-uploaded file under a
    # de-duplicated name while this dict stays keyed by the original name,
    # in which case open(filename) would preview the older copy.
    with io.TextIOWrapper(io.BytesIO(content), encoding='utf-8') as f:
        for line in f:
            if shown >= 5:
                break
            line = line.strip()
            if not line:
                # A blank line would make json.loads raise, so skip it without
                # using up one of the 5 preview slots.
                continue
            # A malformed line still raises JSONDecodeError here, as before.
            record = json.loads(line)
            shown += 1
            print(f"Record {shown}: {record}\n")


Saving merged_train_dataset_20250716_142436.jsonl to merged_train_dataset_20250716_142436.jsonl

First 5 records from: merged_train_dataset_20250716_142436.jsonl
----------------------------------------
Record 1: {'input': 'What Does Medicare IME Stand For? ', 'output': 'According to the Centers for Medicare and Medicaid Services website , cms.gov , IME stands for Indirect Medical Education and is in regards to payment calculation adjustments for a Medicare discharge of higher cost patients receiving care from teaching hospitals relative to non-teaching hospitals . I would recommend contacting CMS to get more information about IME '}

Record 2: {'input': 'Is Long Term Care Insurance Tax Free? ', 'output': 'As a rule , if you buy a tax qualified long term care insurance policy -LRB- as nearly all are , these days -RRB- , and if you are paying the premium yourself , there are tax advantages you will receive . If you are self employed , the entire premium is tax deductible . If working so

In [5]:
# Step 6: Print the first 5 records of JSONL dataset in a tabular format

from google.colab import files
import json
import pandas as pd

# Preview the first 5 records of each uploaded JSONL file as a DataFrame.
import io  # local to this cell: wraps the uploaded bytes as a text stream

# Step 1: Upload the file
uploaded = files.upload()

# Step 2: Read and display the first 5 records in a DataFrame
for filename, content in uploaded.items():
    records = []
    # Parse the bytes handed back by files.upload() rather than re-opening
    # `filename` from disk. Colab can save a re-uploaded file under a
    # de-duplicated name while this dict stays keyed by the original name,
    # in which case open(filename) would preview the older copy.
    with io.TextIOWrapper(io.BytesIO(content), encoding='utf-8') as f:
        for line in f:
            if len(records) >= 5:
                break
            line = line.strip()
            if line:
                # Blank lines are skipped: json.loads would raise on them, and
                # they should not use up one of the 5 preview rows.
                records.append(json.loads(line))

    df = pd.DataFrame(records)
    print(f"\nFirst 5 records from: {filename}")
    display(df)


Saving merged_validation.jsonl to merged_validation.jsonl

First 5 records from: merged_validation.jsonl


Unnamed: 0,input,output
0,Where To Get Homeowners Insurance?,Homeowners insurance is available through the ...
1,What Is An Elimination Period In Disability In...,The elimination period on disability insurance...
2,Can You Deduct Homeowners Insurance From Your ...,Not unless your home is also your business . I...
3,What Is The Best Kind Of Life Insurance To Buy?,The best kind of life insurance is the one fin...
4,What Is The Best Kind Of Life Insurance To Buy?,What is the best kind of life insurance to buy...
