In [1]:
from typing import Optional, Sequence, List, Dict, Any, Type, Union
import os
import json
from copy import deepcopy
import random


def convert_to_list(x: Union[Any, Sequence[Any]]) -> List[Any]:
    """Normalize ``x`` to a list.

    A list or tuple is shallow-copied into a new list. Every other value,
    including strings and non-list/tuple iterables, is wrapped as the sole
    element of a new list.
    """
    return list(x) if isinstance(x, (list, tuple)) else [x]

def load_json(path: Union[str, List[str]]) -> Union[dict, List[dict]]:
    """Load one or several JSON files and merge their top-level values.

    Args:
        path: A single file path, or a list/tuple of file paths.

    Returns:
        The merged content. Files whose top level is a dict are merged with
        ``dict.update`` (later files win on duplicate keys); files whose top
        level is a list are concatenated in order. ``None`` is returned when
        no path is given.

    Raises:
        ValueError: If a path does not exist, or a file's top-level value is
            neither a dict nor a list.
        AssertionError: If dict files and list files are mixed.
    """
    merged = None
    for json_path in convert_to_list(path):
        if not os.path.exists(json_path):
            raise ValueError(f"{json_path} does not exist")

        with open(json_path, "r", encoding="utf-8") as f:
            loaded = json.load(f)

        if not isinstance(loaded, (dict, list)):
            raise ValueError(f"{json_path} is not a valid json file")

        # The first file decides the container type of the result.
        if merged is None:
            merged = loaded
            continue

        if isinstance(loaded, dict):
            assert isinstance(merged, dict), f"Each previous json file contains a list of json dicts, while {json_path} contains only a json dict"
            merged.update(loaded)
        else:
            assert isinstance(merged, list), f"Each previous json file contains a json dict, while {json_path} contains only a list of json dicts"
            merged.extend(loaded)

    return merged

def load_jsonl(path: Union[str, List[str]]) -> List[dict]:
    """Load one or several JSON Lines files into a single list.

    Args:
        path: A single file path, or a list/tuple of file paths.

    Returns:
        The parsed value of every non-blank line, in file order and then
        line order. An empty list is returned when no path is given.

    Raises:
        ValueError: If a path does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    for jsonl_path in convert_to_list(path):
        if not os.path.exists(jsonl_path):
            raise ValueError(f"{jsonl_path} does not exist")

        with open(jsonl_path, "r", encoding="utf-8") as f:
            for line in f:
                # Blank or whitespace-only lines (e.g. a trailing empty line)
                # carry no record; json.loads would raise on them.
                if not line.strip():
                    continue
                records.append(json.loads(line))

    return records

def save_json(data: Union[dict, List[dict]], path: str) -> None:
    """Write ``data`` to ``path`` as indented, UTF-8 encoded JSON.

    Missing parent directories are created. Non-ASCII characters are written
    as-is (``ensure_ascii=False``) with an indent of 4 spaces.

    Args:
        data: JSON-serializable dict or list of dicts.
        path: Destination file path; must end with ``.json``.

    Raises:
        ValueError: If ``path`` does not end with ``.json``.
    """
    if not path.endswith(".json"):
        raise ValueError(f"{path} is not a json file")

    # dirname is "" for a bare filename such as "out.json"; os.makedirs("")
    # would raise, so only create directories when there is one to create.
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

def save_jsonl(data: List[dict], path: str) -> None:
    """Write ``data`` to ``path`` in JSON Lines format (one record per line).

    Missing parent directories are created. Each record is serialized with
    ``ensure_ascii=False`` and terminated by a newline; the file is UTF-8.

    Args:
        data: Iterable of JSON-serializable records.
        path: Destination file path; must end with ``.jsonl``.

    Raises:
        ValueError: If ``path`` does not end with ``.jsonl``.
    """
    if not path.endswith(".jsonl"):
        raise ValueError(f"{path} is not a jsonl file")

    # dirname is "" for a bare filename such as "out.jsonl"; os.makedirs("")
    # would raise, so only create directories when there is one to create.
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(path, "w", encoding="utf-8") as f:
        for record in data:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")


In [2]:
# Input files for the train and test splits (paths relative to the notebook's
# working directory); both are read with load_json in the next cell.
train_data_path = "./original/claude_multiround_chat_30k.json"
test_data_path = "./original/claude_multiround_chat_1k.json"

In [3]:
# Read both raw splits into memory. Later cells index them as lists of records
# with a "conversations" entry.
train_data = load_json(train_data_path)
test_data = load_json(test_data_path)

In [4]:
# Sanity check: number of records per split and the top-level keys of the
# first record in each.
len(train_data), len(test_data), test_data[0].keys(), train_data[0].keys()

(32170,
 1609,
 dict_keys(['id', 'conversations']),
 dict_keys(['id', 'conversations']))

In [5]:
# Preview the first raw test record. Every two consecutive messages share one
# turn number (i // 2 + 1), and each message is followed by an empty line.
first_raw_conversation = test_data[0]['conversations']
print("Conversations:")
for i, c in enumerate(first_raw_conversation):
    turn_no = i // 2 + 1
    print(f"{turn_no}. [{c['from']}]: {c['value']}\n")

Conversations:
1. [human]: Imagine you are sitting by an open window on a warm summer day. Jot down whatever creative thoughts and descriptions spring to mind as you gaze at the scene outside. Focus on sensory details of sight, sound and smell to convey the mood and atmosphere.

1. [gpt]: The warm summer breeze flows lazily through the window, carrying with it the sweet scent of freshly mowed grass and roses in full bloom. Bumblebees  hover lazily over the flowerbeds, their humming joining with the chorus of birds nesting in the trees lining the driveway. Gold sunlight streams through the leaves of the towering oak in the front yard, casting dappled shadows that dance across the siding of the house.  Children's laughter drifts through the air from a nearby backyard pool party as they splash and swim beneath a cerulean blue sky devoid of clouds. The air holds a stillness, a waiting, as if even nature itself pauses to soak in the simple pleasures of a perfect summer day. Before me lies a

In [6]:
# Role labels written to the output: messages whose source "from" field is
# "human" get user_role_value, all others get bot_role_value.
user_role_value = "user"
bot_role_value = "chatbot"

# Dict keys of the output schema: each sample is
# {conversations_key: [{role_key: ..., content_key: ...}, ...]}.
role_key = "role"
content_key = "content"
conversations_key = "conversations"

In [7]:
# Build the test split: keep only short conversations, rewrite them into the
# role/content schema, then down-sample to a fixed-size reproducible subset.
test_max_seq_len = int(1000 * 0.75)  # max whitespace-separated words per conversation
test_max_num_samples = 500
test_data_ = []

for test_sample in test_data:
    conversations = test_sample["conversations"]
    total_len = sum(len(c["value"].split()) for c in conversations)
    if total_len <= test_max_seq_len:
        # Any speaker other than "human" is labelled as the bot.
        converted_turns = [
            {
                role_key: (user_role_value if c["from"] == "human" else bot_role_value),
                content_key: c["value"],
            }
            for c in conversations
        ]
        test_data_.append({conversations_key: converted_turns})
print(len(test_data_))

# Fixed seed so the same subset is drawn on every run.
if len(test_data_) > test_max_num_samples:
    random.seed(42)
    test_data_ = random.sample(test_data_, test_max_num_samples)

len(test_data_)

580


500

In [8]:
# Preview the first converted test sample. Every two consecutive messages
# share one turn number (i // 2 + 1), and each message is followed by an
# empty line.
first_test_conversation = test_data_[0][conversations_key]
print("Conversations:")
for i, c in enumerate(first_test_conversation):
    turn_no = i // 2 + 1
    print(f"{turn_no}. [{c[role_key]}]: {c[content_key]}\n")

Conversations:
1. [user]: Derive the quadratic formula through algebraic manipulation of the quadratic equation, beginning with:

If a quadratic equation in the form ax2 + bx + c = 0, where a ≠ 0, has roots r1 and r2, then the quadratic equation may be rewritten as a(x - r1)(x - r2) = 0. Expanding this yields a familiar form. Show each step of the algebraic manipulation, proving that the quadratic formula,

r1,2 = ( -b ± √(b2 - 4ac) ) / (2a),

emerges as the solution.

1. [chatbot]: If ax2 + bx + c = 0, where a ≠ 0, has roots r1 and r2, then    
         a(x - r1)(x - r2) = 0       
     Expanding the left side:
          a(x2 - (r1 + r2)x + r1r2) = 0 
 Using the summation of roots formula −b/a = r1 + r2 and the product of roots formula c/a = r1r2, we can substitute:
         a(x2  - (-b/a)x + c/a) = 0
         ax2 -bx + c = 0        
By equating the two quadratic equations, we arrive at the quadratic formula:
 
         x = ( -b ± √(b2 - 4ac) ) / (2a)

In summary, through algebraic ma

In [9]:
# Build the train and eval splits: keep only short conversations, rewrite
# them into the role/content schema, down-sample reproducibly, then hold out
# the last `eval_ratio` fraction of the sampled data for evaluation.
train_data_ = []
train_max_seq_len = int(1000 * 0.75)  # max whitespace-separated words per conversation
train_max_num_samples = 10000
eval_ratio = 0.02

for train_sample in train_data:
    conversations = train_sample["conversations"]
    total_len = sum(len(c["value"].split()) for c in conversations)
    if total_len > train_max_seq_len:
        continue
    
    train_data_.append({
        conversations_key: [
            {
                # Any speaker other than "human" is labelled as the bot.
                role_key: (user_role_value if c["from"] == "human" else bot_role_value),
                content_key: c["value"],
            }
            for c in conversations
        ]
    })
print(len(train_data_))

# Fixed seed so the same subset is drawn on every run.
if len(train_data_) > train_max_num_samples:
    random.seed(42)
    train_data_ = random.sample(train_data_, train_max_num_samples)


num_train_samples = len(train_data_)
num_eval_samples = int(num_train_samples * eval_ratio)

# Split at one index so the two parts are disjoint. Slicing the eval set from
# the already-truncated training list would make it a subset of the training
# data, and a negative-index slice `[:-0]` would empty the training set when
# num_eval_samples is 0.
split_index = num_train_samples - num_eval_samples
eval_data_ = train_data_[split_index:]
train_data_ = train_data_[:split_index]

assert len(train_data_) + len(eval_data_) == num_train_samples

len(train_data_), len(eval_data_)

11441


(9800, 200)

In [10]:
print(f"Conversations:")
for i, c in enumerate(train_data_[0][conversations_key]):
    print(f"{i//2+1}. [{c[role_key]}]: {c[content_key]}")
    print()

Conversations:
1. [user]: Explain how to set up and start playing an enjoyable game of Monopoly with at least three other people. Cover explaining the rules, placing the properties and other game pieces on the board, choosing game pieces, taking turns rolling the dice and moving around the board, buying and trading properties, building houses and hotels, paying rents and taxes, avoiding jail, using Chance and Community Chest cards, and declaring a winner.

1. [chatbot]: 1. Gather 3-6 players to join the game. Each player chooses a game piece like the shoe, hat, car, wheelbarrow, dog or iron.  

2. Shuffle the Chance and Community Chest cards and place them face down on their respective spaces on the board.

3. Place the property title deed cards face down and sort them by color groups. Place the houses and hotels nearby.

4. Place all game money - bills, houses, hotels - in the central bank. Each player receives an initial $1500 to 1499 as starting cash.

5. Explain the basic rules to 

In [11]:
# Destination files for the processed splits (JSON Lines, written next to the
# notebook's working directory).
test_save_path = "./chat_test.jsonl"
train_save_path = "./chat_train.jsonl"
eval_save_path = "./chat_eval.jsonl"

In [12]:
# Persist the three processed splits as JSON Lines files.
# The sizes below assume the sampling caps set in the filtering cells. The
# "token length < 1k" limit is enforced there only approximately, as a
# whitespace word count of at most int(1000 * 0.75) = 750 per conversation.
save_jsonl(test_data_, test_save_path) # 0.5k samples, token length < 1k
save_jsonl(train_data_, train_save_path) # 10k - 0.2k samples, token length < 1k
save_jsonl(eval_data_, eval_save_path) # 0.2k samples, token length < 1k