In [22]:
import os
import json

In [23]:
# Directory that holds the batch-output files to be merged.
output_dir = "clean_outputs"

# Keep only the .jsonl entries; sorting makes the processing order deterministic.
output_files = sorted(
    entry for entry in os.listdir(output_dir) if entry.endswith(".jsonl")
)

print(output_files)

['batch_1_output_no_mcq.jsonl', 'batch_2_output_no_mcq.jsonl', 'batch_3_output_no_mcq.jsonl', 'iteration_1_mcq_response.jsonl', 'iteration_2_mcq_response.jsonl', 'iteration_3_mcq_response.jsonl', 'iteration_4_mcq_response.jsonl']


In [24]:
def _conversation_from_line(line):
    """Extract the conversation (a list of messages) from one output line.

    The line is expected to be a JSON object whose model output sits at
    ``response.body.choices[0].message.content`` and is itself a JSON
    document: either a list of messages, or an object with exactly one
    key whose value is that list.

    Returns:
        The list of messages.

    Raises:
        ValueError: with a short, printable reason whenever the line
            cannot be turned into a conversation and must be skipped.
    """
    try:
        result_obj = json.loads(line)
    except json.JSONDecodeError as e:
        raise ValueError(f"JSON decode error: {e}") from e

    if not isinstance(result_obj, dict):
        raise ValueError(
            f"Unexpected result type: {type(result_obj).__name__}"
        )

    if result_obj.get("error"):
        raise ValueError("OpenAI error")

    # Walk the nested structure defensively: explicit nulls and empty
    # `choices` lists must lead to a skipped line, not to a crash.
    try:
        body = (result_obj.get("response") or {}).get("body") or {}
        first_choice = (body.get("choices") or [{}])[0]
        content = (first_choice.get("message") or {}).get("content")
    except (AttributeError, IndexError, KeyError, TypeError) as e:
        raise ValueError(f"Unexpected response structure: {e}") from e

    if not isinstance(content, str):
        raise ValueError("Response has no message content")

    try:
        model_response = json.loads(content)
    except json.JSONDecodeError as e:
        raise ValueError(f"Error parsing content JSON: {e}") from e

    # A dict with exactly one key is a wrapper: unwrap its only value.
    if isinstance(model_response, dict) and len(model_response) == 1:
        model_response = next(iter(model_response.values()))

    # Only a list can be a conversation. Anything else is reported to the
    # caller instead of being dropped silently or coerced with list().
    if not isinstance(model_response, list):
        raise ValueError(
            "Content is not a list of messages "
            f"(got {type(model_response).__name__})"
        )

    return model_response


# Collect one conversation per usable line across all output files.
model_responses = []
for file_name in output_files:
    file_path = os.path.join(output_dir, file_name)
    with open(file_path, "r", encoding="utf-8") as fp:
        for line_num, line in enumerate(fp, 1):
            if not line.strip():
                continue  # blank lines carry no record

            try:
                model_responses.append(_conversation_from_line(line))
            except ValueError as problem:
                # Name the file too: line numbers repeat across files.
                print(f"{file_name} line {line_num}: {problem}")

Line 690: Error parsing content JSON: Expecting ',' delimiter: line 84 column 1 (char 3538)
Line 690: Error parsing content JSON: Expecting ',' delimiter: line 84 column 1 (char 3538)


In [26]:
def _iter_messages(items):
    """Yield every dict in ``items``, descending into nested lists."""
    for item in items:
        if isinstance(item, dict):
            yield item
        elif isinstance(item, list):
            yield from _iter_messages(item)
        # Anything else (strings, numbers, None, ...) is not a message
        # and is dropped, at every nesting level.


def normalize_conversations(conversations):
    """Flatten each conversation into a plain list of message dicts.

    Args:
        conversations: iterable of conversations; each conversation is an
            iterable whose items are message dicts or (possibly nested)
            lists of message dicts.

    Returns:
        A new list with one list per input conversation, in the same
        order. Each inner list contains only the dict items, in their
        original order, with nested lists unpacked in place. Items that
        are neither dicts nor lists are discarded, so a conversation may
        come out empty.
    """
    return [list(_iter_messages(conv)) for conv in conversations]

# Rebind `model_responses` to its normalized form (the un-normalized list is not kept).
model_responses = normalize_conversations(model_responses)


In [27]:
# Sanity check: report any conversation that is not a non-empty list of dicts.
for i, conv in enumerate(model_responses):
    if not isinstance(conv, list):
        print(f"Conversation {i} is not a list but {type(conv)}")
        continue

    if not conv:
        print(f"Conversation {i} is an empty list")
        continue

    for msg in conv:
        if isinstance(msg, dict):
            continue
        print(f"In conversation {i}, found msg that is not dict: {msg} (type: {type(msg)})")

In [28]:
# Collect every message key other than 'role' and 'content' (duplicates included).
invalid_key = []
for model_response in model_responses:
    for item in model_response:
        invalid_key.extend(
            it_key for it_key in item.keys() if it_key not in ('role', 'content')
        )

print(invalid_key)

['assistant', 'closed']


In [29]:
def fix_model_responses(model_responses):
    """Return repaired copies of the conversations in ``model_responses``.

    Each message is repaired as follows:

    * An ``'assistant'`` key is treated as misplaced content: its value
      becomes ``'content'`` when the message has no non-empty content of
      its own; otherwise it is discarded.
    * Every other key besides ``'role'`` and ``'content'`` is removed.
    * A role that is missing, or is neither ``'user'`` nor
      ``'assistant'``, is replaced by the role expected at that position.
      The expected role is ``'user'`` for the first message and, after
      that, the opposite of the previous message's final role.

    The input is not modified: every message is shallow-copied before it
    is repaired.

    Args:
        model_responses: iterable of conversations, each an iterable of
            message dicts.

    Returns:
        A new list of conversations (lists of new message dicts).

    Raises:
        ValueError: if a message still has no non-empty ``'content'``
            after the repairs.
    """
    allowed_keys = {'role', 'content'}
    valid_roles = ('user', 'assistant')
    fixed_responses = []

    for conv_idx, conv in enumerate(model_responses):
        fixed_conv = []
        expected_role = "user"  # Conversations should start with 'user'

        for msg_idx, original_msg in enumerate(conv):
            # Work on a shallow copy so the caller's data stays intact.
            # Going through .items() keeps the AttributeError that a
            # non-dict message produced before.
            msg = dict(original_msg.items())

            for key in [k for k in msg if k not in allowed_keys]:
                value = msg.pop(key)
                # Content stored under 'assistant' is rescued only when the
                # message has no usable content of its own.
                if key == 'assistant' and not msg.get('content'):
                    msg['content'] = value

            # Fix missing or invalid 'role'
            if msg.get('role') not in valid_roles:
                msg['role'] = expected_role

            # Ensure 'content' key exists and is not empty
            if not msg.get('content'):
                raise ValueError(
                    f"Message missing content at conversation {conv_idx} message {msg_idx}: {msg}"
                )

            fixed_conv.append(msg)

            # Derive the next expected role from the role this message
            # actually ended up with. A blind toggle drifts out of sync as
            # soon as a message carries a valid role that differs from the
            # expected one.
            expected_role = 'user' if msg['role'] == 'assistant' else 'assistant'

        fixed_responses.append(fixed_conv)

    return fixed_responses


# Repair keys and roles in every conversation; raises ValueError if any message ends up without content.
fixed_model_responses = fix_model_responses(model_responses)

In [30]:
def remove_repeated_roles(conversation):
    """Keep only the first message of every run of same-role messages.

    A falsy ``conversation`` (for example an empty list) is returned
    unchanged. Otherwise a new list is built: the first message is always
    kept, and each later message is kept only when its ``'role'`` differs
    from the role of the most recently kept message. Skipped messages are
    discarded, not merged.
    """
    if not conversation:
        return conversation

    first, rest = conversation[0], conversation[1:]
    deduped = [first]
    for message in rest:
        last_kept = deduped[-1]
        if message['role'] == last_kept['role']:
            continue  # same speaker again: drop this message
        deduped.append(message)
    return deduped

# Collapse runs of same-role messages in every repaired conversation.
fixed_and_cleaned_responses = list(map(remove_repeated_roles, fixed_model_responses))


In [31]:
# System prompt that is placed at the start of every conversation.
system_message = dict(
    role="system",
    content=(
        "You are KalviMate, a friendly, patient, and encouraging AI tutor. "
        "Your purpose is to help students from rural government schools in India "
        "learn from their state board syllabus. "
        "Explain concepts simply, use relatable examples, and always be supportive. "
        "Your knowledge is based strictly on the provided textbook content."
    ),
)

def add_system_message(conversations, system_msg):
    """Return new conversations that each start with a copy of ``system_msg``.

    Every output conversation is a new list made of a fresh shallow copy
    of ``system_msg`` followed by the messages of the corresponding input
    conversation. The input lists are not modified; the message dicts
    after the system message are the same objects as in the input.
    """
    return [[system_msg.copy()] + conv for conv in conversations]

# Prepend a copy of `system_message` to every cleaned conversation
final_responses = add_system_message(fixed_and_cleaned_responses, system_message)


In [32]:
def validate_conversations(conversations):
    """Print a diagnostic for each structural problem in ``conversations``.

    For every conversation (``i`` is its index) the checks are:

    * it contains at least one message;
    * at least one message has a role other than "system";
    * the first such message has role 'user';
    * from that message onwards, no two adjacent messages share a role
      (pairs in which either message is a "system" message are ignored).

    Findings are printed; the function returns ``None``.
    """
    for i, conv in enumerate(conversations):
        roles = []
        for msg in conv:
            roles.append(msg.get('role'))

        if len(roles) == 0:
            print(f"Empty conversation at index {i}")
            continue

        # Position of the first message that is not a system message.
        first_non_system_idx = None
        for idx, r in enumerate(roles):
            if r != "system":
                first_non_system_idx = idx
                break

        if first_non_system_idx is None:
            print(f"Conversation {i} has only system messages")
            continue

        # The dialogue proper has to open with the user.
        if roles[first_non_system_idx] != 'user':
            print(f"Conversation {i} does not start with 'user' after system message(s)")

        # Compare each message with its successor, starting at the first
        # non-system message; only user/assistant pairs are compared.
        for j in range(first_non_system_idx, len(roles) - 1):
            both_non_system = roles[j] != "system" and roles[j + 1] != "system"
            if both_non_system and roles[j] == roles[j + 1]:
                print(f"Consecutive identical roles in conversation {i} at messages {j} and {j+1}: {roles[j]}")

# Validate the final conversations; problems are printed, so no output means none were found
validate_conversations(final_responses)



In [None]:
# from datasets import Dataset

# exchange_pairs = []
# for conv in final_responses:
#     # Make sure only user-assistant pairs are grouped
#     i = 0
#     while i < len(conv) - 1:
#         if conv[i]['role'] == 'user' and conv[i+1]['role'] == 'assistant':
#             pair = [conv[i], conv[i+1]]
#             exchange_pairs.append({"messages": pair})
#             i += 2  # move to next potential pair
#         else:
#             i += 1  # if no user-assistant pair, just advance

# hf_dataset = Dataset.from_list(exchange_pairs)

# exchange pair with system message
# from datasets import Dataset

# exchange_pairs = []
# for conv in final_responses:
#     # Check for system message at the beginning
#     system_message = None
#     start_idx = 0
#     if len(conv) > 0 and conv[0]['role'] == 'system':
#         system_message = conv[0]
#         start_idx = 1

#     # Iterate over the rest and group user/assistant pairs, always include system if present
#     i = start_idx
#     while i < len(conv) - 1:
#         if conv[i]['role'] == 'user' and conv[i+1]['role'] == 'assistant':
#             # Create pair, optionally prepend system
#             if system_message:
#                 pair = [system_message, conv[i], conv[i+1]]
#             else:
#                 pair = [conv[i], conv[i+1]]
#             exchange_pairs.append({"messages": pair})
#             i += 2
#         else:
#             i += 1

# hf_dataset = Dataset.from_list(exchange_pairs)



In [33]:
from datasets import Dataset
keys = set()  # NOTE(review): not read in any visible cell; confirm before removing

# One dataset row per conversation, stored under the "conversations" column.
hf_dataset = Dataset.from_list(
    [dict(conversations=conv) for conv in final_responses]
)

In [34]:
# Number of rows in the dataset (one row per conversation).
print(len(hf_dataset))

4822


In [35]:
# Spot-check a single row; the hard-coded index 2456 must be smaller than len(hf_dataset).
print(hf_dataset[2456])

{'conversations': [{'content': 'You are KalviMate, a friendly, patient, and encouraging AI tutor. Your purpose is to help students from rural government schools in India learn from their state board syllabus. Explain concepts simply, use relatable examples, and always be supportive. Your knowledge is based strictly on the provided textbook content.', 'role': 'system'}, {'content': 'So, palaeobotany is all about the study of living plants, right?', 'role': 'user'}, {'content': 'That’s a good thought, but let’s look at it another way, palaeobotany actually focuses on the study of plant remains from the geological past, not living plants.', 'role': 'assistant'}, {'content': 'I think plant fossils are always whole plants that are found preserved.', 'role': 'user'}, {'content': 'You’re almost there! Just a tiny correction needed, the majority of plant fossils are actually disarticulated parts, and it’s rare to find complete plants preserved.', 'role': 'assistant'}, {'content': "I heard that

In [None]:
# Publish the dataset to the Hugging Face Hub.
# The access token is read from the HF_TOKEN environment variable so that it
# is never written into the notebook. When the variable is unset or empty,
# None is passed. NOTE(review): the library is then expected to fall back to
# the locally stored login; confirm for the installed `datasets` version.
# NOTE: private=False makes the dataset repository publicly visible.
hf_dataset.push_to_hub(
    repo_id="heissanjay/km-fullset-all-final-sm-full-conv",
    token=os.environ.get("HF_TOKEN") or None,
    private=False,
)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/heissanjay/km-fullset-all-final-sm-full-conv/commit/db533409dfce1da0046ebe53ce42665e8c7ff25a', commit_message='Upload dataset', commit_description='', oid='db533409dfce1da0046ebe53ce42665e8c7ff25a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/heissanjay/km-fullset-all-final-sm-full-conv', endpoint='https://huggingface.co', repo_type='dataset', repo_id='heissanjay/km-fullset-all-final-sm-full-conv'), pr_revision=None, pr_num=None)