In [2]:
import pandas as pd

splits = {'train': 'train.parquet', 'test': 'test.parquet'}
df = pd.read_parquet("hf://datasets/HelloBoieeee/kubernetes_config/" + splits["train"])
df.to_json("kubernetes_config-10k.json")

In [3]:
import json

with open('kubernetes_config-10k.json', 'r') as file:
    data = json.load(file)

print(data.get("text").get("0"))



<s>[INST]Can you generate a K8S deployment config file?[/INST]apiVersion: v1
kind: Service
metadata:
  name: open-api-doc
spec:
  type: ClusterIP
  ports:
  - port: 80
    targetPort: 80
    protocol: TCP
    name: http
  selector:
    app: open-api-doc</s>


In [4]:
import json
import re
from html import unescape

class KubernetesDatasetCleaner:
    def __init__(self):
        # Compile regex patterns for better performance
        self.patterns = {
            # Special tokens for this dataset
            'inst_tags': re.compile(r'<s>\[INST\](.*?)\[/INST\](.*?)</s>', re.DOTALL),
            'start_token': re.compile(r'^<s>'),
            'end_token': re.compile(r'</s>$'),
            'inst_start': re.compile(r'\[INST\]'),
            'inst_end': re.compile(r'\[/INST\]'),
            
            # General cleaning patterns
            'html_tags': re.compile(r'<[^>]+>'),
            'multiple_newlines': re.compile(r'\n\s*\n\s*\n+'),
            'multiple_spaces': re.compile(r'[ \t]+'),
            'leading_trailing_space': re.compile(r'^\s+|\s+$', re.MULTILINE)
        }
    
    def extract_conversation(self, text):
        """
        Extract user question and assistant response from the special token format
        """
        if not text:
            return None, None
        
        # Try to match the full pattern: <s>[INST]question[/INST]answer</s>
        match = self.patterns['inst_tags'].search(text)
        if match:
            user_content = match.group(1).strip()
            assistant_content = match.group(2).strip()
            return user_content, assistant_content
        
        # Fallback: try to split on [/INST] if full pattern doesn't work
        if '[INST]' in text and '[/INST]' in text:
            # Remove start/end tokens
            text = self.patterns['start_token'].sub('', text)
            text = self.patterns['end_token'].sub('', text)
            
            # Split on [/INST]
            parts = text.split('[/INST]')
            if len(parts) == 2:
                user_content = parts[0].replace('[INST]', '').strip()
                assistant_content = parts[1].strip()
                return user_content, assistant_content
        
        return None, None
    
    def clean_text(self, text):
        """
        Clean text by removing extra whitespace and formatting
        """
        if not text:
            return ""
        
        # Convert to string if not already
        text = str(text)
        
        # Remove any remaining HTML tags (just in case)
        text = self.patterns['html_tags'].sub('', text)
        
        # Clean up whitespace
        text = self.patterns['multiple_newlines'].sub('\n\n', text)
        text = self.patterns['multiple_spaces'].sub(' ', text)
        text = self.patterns['leading_trailing_space'].sub('', text)
        
        return text.strip()
    
    def is_valid_content(self, text):
        """
        Check if the content is valid (not empty, has meaningful content)
        """
        if not text or len(text.strip()) < 5:
            return False
        
        # Check if it has some alphanumeric content
        alphanumeric = re.sub(r'[^a-zA-Z0-9]', '', text)
        if len(alphanumeric) < 3:
            return False
        
        return True
    
    def clean_and_convert_dataset(self, input_file, output_file):
        """
        Clean the Kubernetes dataset and convert to ChatML format
        """
        print(f"Loading Kubernetes dataset from {input_file}...")
        
        try:
            with open(input_file, 'r', encoding='utf-8') as file:
                data = json.load(file)
        except Exception as e:
            print(f"Error loading dataset: {e}")
            return 0
        
        text_data = data.get("text", {})
        print(f"Found {len(text_data)} text entries")
        
        cleaned_conversations = []
        skipped = 0
        
        # Get all keys and sort them numerically
        keys = list(text_data.keys())
        try:
            # Try to sort numerically if keys are string numbers
            keys = sorted(keys, key=lambda x: int(x))
        except ValueError:
            # If not all numeric, sort alphabetically
            keys = sorted(keys)
        
        print(f"Processing {len(keys)} entries...")
        
        for i, key in enumerate(keys):
            try:
                raw_text = text_data[key]
                
                # Extract user question and assistant response
                user_content, assistant_content = self.extract_conversation(raw_text)
                
                if user_content is None or assistant_content is None:
                    skipped += 1
                    if i % 1000 == 0 and i > 0:
                        print(f"Processed {i}/{len(keys)} entries, skipped {skipped} so far...")
                    continue
                
                # Clean the content
                clean_user = self.clean_text(user_content)
                clean_assistant = self.clean_text(assistant_content)
                
                # Validate content
                if not self.is_valid_content(clean_user) or not self.is_valid_content(clean_assistant):
                    skipped += 1
                    if i % 1000 == 0 and i > 0:
                        print(f"Processed {i}/{len(keys)} entries, skipped {skipped} so far...")
                    continue
                
                # Create ChatML conversation
                conversation = {
                    "messages": [
                        {
                            "role": "user",
                            "content": clean_user
                        },
                        {
                            "role": "assistant",
                            "content": clean_assistant
                        }
                    ]
                }
                
                cleaned_conversations.append(conversation)
                
                # Progress update
                if i % 1000 == 0 and i > 0:
                    print(f"Processed {i}/{len(keys)} entries, {len(cleaned_conversations)} valid conversations...")
                
            except Exception as e:
                print(f"Error processing entry {key}: {e}")
                skipped += 1
                continue
        
        # Save to JSONL format
        print(f"Saving {len(cleaned_conversations)} conversations to {output_file}...")
        
        try:
            with open(output_file, 'w', encoding='utf-8') as file:
                for conversation in cleaned_conversations:
                    file.write(json.dumps(conversation, ensure_ascii=False) + '\n')
        except Exception as e:
            print(f"Error saving dataset: {e}")
            return 0
        
        print(f"✅ Successfully cleaned and converted {len(cleaned_conversations)} conversations")
        print(f"❌ Skipped {skipped} invalid entries")
        print(f"📁 Output saved to: {output_file}")
        
        return len(cleaned_conversations)
    
    def preview_cleaned_data(self, output_file, num_examples=3):
        """
        Preview the cleaned data
        """
        print(f"\n{'='*60}")
        print(f"PREVIEW OF CLEANED KUBERNETES DATA")
        print(f"{'='*60}")
        
        try:
            with open(output_file, 'r', encoding='utf-8') as file:
                for i, line in enumerate(file):
                    if i >= num_examples:
                        break
                    
                    conversation = json.loads(line)
                    print(f"\n📝 CONVERSATION {i+1}:")
                    print(f"{'─'*40}")
                    
                    user_msg = conversation["messages"][0]["content"]
                    assistant_msg = conversation["messages"][1]["content"]
                    
                    print(f"👤 USER ({len(user_msg)} chars):")
                    print(user_msg)
                    
                    print(f"\n🤖 ASSISTANT ({len(assistant_msg)} chars):")
                    print(assistant_msg[:500] + "..." if len(assistant_msg) > 500 else assistant_msg)
                    
                    print(f"{'─'*40}")
                    
        except Exception as e:
            print(f"Error previewing data: {e}")



In [5]:
def main():
    """
    Main function to run the Kubernetes dataset cleaning and conversion
    """
    print("🧹 Kubernetes Dataset Cleaner and ChatML Converter")
    print("=" * 50)
    
    cleaner = KubernetesDatasetCleaner()
    
    # Clean and convert the dataset
    input_file = 'kubernetes_config-10k.json'
    output_file = 'kubernetes_config-10k.jsonl'
    
    num_conversations = cleaner.clean_and_convert_dataset(input_file, output_file)
    
    # Preview the results if successful
    if num_conversations > 0:
        cleaner.preview_cleaned_data(output_file, num_examples=2)
        
        print(f"\n✨ Processing complete!")
        print(f"📊 Statistics:")
        print(f"   - Input file: {input_file}")
        print(f"   - Output file: {output_file}")
        print(f"   - Valid conversations: {num_conversations}")
        print(f"   - Ready for training! 🚀")
    else:
        print("❌ No valid conversations were generated. Check your input file.")

if __name__ == "__main__":
    main()

🧹 Kubernetes Dataset Cleaner and ChatML Converter
Loading Kubernetes dataset from kubernetes_config-10k.json...
Found 10000 text entries
Processing 10000 entries...
Processed 1000/10000 entries, 1001 valid conversations...
Processed 2000/10000 entries, 2001 valid conversations...
Processed 3000/10000 entries, 3001 valid conversations...
Processed 4000/10000 entries, 4001 valid conversations...
Processed 5000/10000 entries, 5001 valid conversations...
Processed 6000/10000 entries, 6001 valid conversations...
Processed 7000/10000 entries, 7001 valid conversations...
Processed 8000/10000 entries, 8001 valid conversations...
Processed 9000/10000 entries, 9001 valid conversations...
Saving 10000 conversations to kubernetes_config-10k.jsonl...
✅ Successfully cleaned and converted 10000 conversations
❌ Skipped 0 invalid entries
📁 Output saved to: kubernetes_config-10k.jsonl

PREVIEW OF CLEANED KUBERNETES DATA

📝 CONVERSATION 1:
────────────────────────────────────────
👤 USER (46 chars):
Can y