In [1]:
import pandas as pd

df = pd.read_parquet("hf://datasets/mcipriano/stackoverflow-kubernetes-questions/data/kubernetes_dump.parquet")
df.to_json("kubernetes-sof-qa.json")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import json

with open('kubernetes-sof-qa-30k.json', 'r') as file:
    data = json.load(file)

print(data.get("Question")["0"])
print()
print(data.get("Answer")["0"])

<p>How to resolve the error no module named pandas when one node (in Airflow's DAG) is successful in using it(pandas) and the other is not?</p>

<p>I am unable to deduce as to why I am getting an error no module named pandas.</p>

<p>I have checked via <code>pip3 freeze</code> and yes, the desired pandas version does show up.</p>

<p>I have deployed this using docker on a kubernetes cluster.</p>


<p><a href="https://github.com/apache/incubator-airflow/blob/v1-10-stable/setup.py#L292" rel="nofollow noreferrer">Pandas is generally required</a>, and sometimes used in some hooks to return dataframes. Well, it's possible that Airflow was installed with <code>pip</code> and not <code>pip3</code> possibly being added as a Python 2 module and not a Python 3 module (though, using <code>pip</code> should have installed Pandas when one looks at the <a href="https://github.com/apache/incubator-airflow/blob/v1-10-stable/setup.py#L292" rel="nofollow noreferrer"><code>setup.py</code></a>).</p>

<p>W

In [3]:
import json
import re
from html import unescape

class KubernetesSOFDatasetCleaner:
    def __init__(self):
        # Compile regex patterns for better performance
        self.patterns = {
            # HTML cleaning
            'html_tags': re.compile(r'<[^>]+>'),
            'code_blocks': re.compile(r'<code>(.*?)</code>', re.DOTALL),
            'paragraph_tags': re.compile(r'</?p>'),
            'link_tags': re.compile(r'<a[^>]*>(.*?)</a>', re.DOTALL),
            
            # Whitespace cleaning
            'multiple_newlines': re.compile(r'\n\s*\n\s*\n+'),
            'multiple_spaces': re.compile(r'[ \t]+'),
            'leading_trailing_space': re.compile(r'^\s+|\s+$', re.MULTILINE),
            
            # Content validation
            'meaningful_words': re.compile(r'\b[a-zA-Z]{3,}\b')
        }
    
    def clean_html_text(self, text):
        """
        Clean HTML content while preserving code blocks and structure
        """
        if not text:
            return ""
        
        # Convert to string if not already
        text = str(text)
        
        # Unescape HTML entities first
        text = unescape(text)
        
        # Preserve code blocks by replacing them with placeholders
        code_blocks = []
        def preserve_code(match):
            code_blocks.append(match.group(1))
            return f"__CODE_BLOCK_{len(code_blocks)-1}__"
        
        text = self.patterns['code_blocks'].sub(preserve_code, text)
        
        # Extract link text (keep the text, remove the link)
        text = self.patterns['link_tags'].sub(r'\1', text)
        
        # Remove paragraph tags but keep the content
        text = self.patterns['paragraph_tags'].sub('', text)
        
        # Remove remaining HTML tags
        text = self.patterns['html_tags'].sub('', text)
        
        # Restore code blocks with backticks
        for i, code_content in enumerate(code_blocks):
            # Clean the code content
            code_content = code_content.strip()
            # Use backticks for inline code or code blocks
            if '\n' in code_content:
                code_replacement = f"```\n{code_content}\n```"
            else:
                code_replacement = f"`{code_content}`"
            text = text.replace(f"__CODE_BLOCK_{i}__", code_replacement)
        
        # Clean up whitespace
        text = self.patterns['multiple_newlines'].sub('\n\n', text)
        text = self.patterns['multiple_spaces'].sub(' ', text)
        text = self.patterns['leading_trailing_space'].sub('', text)
        
        return text.strip()
    
    def is_valid_content(self, text):
        """
        Check if the content is valid (not empty, has meaningful content)
        """
        if not text or len(text.strip()) < 10:
            return False
        
        # Check if it has some meaningful words
        words = self.patterns['meaningful_words'].findall(text)
        if len(words) < 3:
            return False
        
        # Check if it's not just whitespace and punctuation
        alphanumeric = re.sub(r'[^a-zA-Z0-9]', '', text)
        if len(alphanumeric) < 5:
            return False
        
        return True
    
    def clean_and_convert_dataset(self, input_file, output_file):
        """
        Clean the Kubernetes Stack Overflow dataset and convert to ChatML format
        """
        print(f"Loading Kubernetes Stack Overflow dataset from {input_file}...")
        
        try:
            with open(input_file, 'r', encoding='utf-8') as file:
                data = json.load(file)
        except Exception as e:
            print(f"Error loading dataset: {e}")
            return 0
        
        questions = data.get("Question", {})
        answers = data.get("Answer", {})
        
        print(f"Found {len(questions)} questions and {len(answers)} answers")
        
        cleaned_conversations = []
        skipped = 0
        
        # Get all keys and sort them numerically
        question_keys = set(questions.keys())
        answer_keys = set(answers.keys())
        all_keys = question_keys.intersection(answer_keys)  # Only process pairs that exist
        
        try:
            # Try to sort numerically if keys are string numbers
            sorted_keys = sorted(all_keys, key=lambda x: int(x))
        except ValueError:
            # If not all numeric, sort alphabetically
            sorted_keys = sorted(all_keys)
        
        print(f"Processing {len(sorted_keys)} question-answer pairs...")
        
        for i, key in enumerate(sorted_keys):
            try:
                # Get raw content
                raw_question = questions.get(key, "")
                raw_answer = answers.get(key, "")
                
                # Clean the HTML content
                clean_question = self.clean_html_text(raw_question)
                clean_answer = self.clean_html_text(raw_answer)
                
                # Validate content
                if not self.is_valid_content(clean_question) or not self.is_valid_content(clean_answer):
                    skipped += 1
                    if i % 1000 == 0 and i > 0:
                        print(f"Processed {i}/{len(sorted_keys)} entries, skipped {skipped} so far...")
                    continue
                
                # Create ChatML conversation
                conversation = {
                    "messages": [
                        {
                            "role": "user",
                            "content": clean_question
                        },
                        {
                            "role": "assistant",
                            "content": clean_answer
                        }
                    ]
                }
                
                cleaned_conversations.append(conversation)
                
                # Progress update
                if i % 1000 == 0 and i > 0:
                    print(f"Processed {i}/{len(sorted_keys)} entries, {len(cleaned_conversations)} valid conversations...")
                
            except Exception as e:
                print(f"Error processing entry {key}: {e}")
                skipped += 1
                continue
        
        # Save to JSONL format
        print(f"Saving {len(cleaned_conversations)} conversations to {output_file}...")
        
        try:
            with open(output_file, 'w', encoding='utf-8') as file:
                for conversation in cleaned_conversations:
                    file.write(json.dumps(conversation, ensure_ascii=False) + '\n')
        except Exception as e:
            print(f"Error saving dataset: {e}")
            return 0
        
        print(f"✅ Successfully cleaned and converted {len(cleaned_conversations)} conversations")
        print(f"❌ Skipped {skipped} invalid entries")
        print(f"📁 Output saved to: {output_file}")
        
        return len(cleaned_conversations)
    
    def preview_cleaned_data(self, output_file, num_examples=2):
        """
        Preview the cleaned data
        """
        print(f"\n{'='*70}")
        print(f"PREVIEW OF CLEANED KUBERNETES STACK OVERFLOW DATA")
        print(f"{'='*70}")
        
        try:
            with open(output_file, 'r', encoding='utf-8') as file:
                for i, line in enumerate(file):
                    if i >= num_examples:
                        break
                    
                    conversation = json.loads(line)
                    print(f"\n📝 CONVERSATION {i+1}:")
                    print(f"{'─'*50}")
                    
                    user_msg = conversation["messages"][0]["content"]
                    assistant_msg = conversation["messages"][1]["content"]
                    
                    print(f"👤 USER QUESTION ({len(user_msg)} chars):")
                    print(user_msg)
                    
                    print(f"\n🤖 ASSISTANT ANSWER ({len(assistant_msg)} chars):")
                    print(assistant_msg[:400] + "..." if len(assistant_msg) > 400 else assistant_msg)
                    
                    print(f"{'─'*50}")
                    
        except Exception as e:
            print(f"Error previewing data: {e}")



In [5]:
def main():
    """
    Main function to run the Kubernetes Stack Overflow dataset cleaning and conversion
    """
    print("🧹 Kubernetes Stack Overflow Dataset Cleaner and ChatML Converter")
    print("=" * 60)
    
    cleaner = KubernetesSOFDatasetCleaner()
    
    # Clean and convert the dataset
    input_file = 'kubernetes-sof-qa-30k.json'
    output_file = 'kubernetes-sof-qa-30k-clean.jsonl'
    
    num_conversations = cleaner.clean_and_convert_dataset(input_file, output_file)
    
    # Preview the results if successful
    if num_conversations > 0:
        cleaner.preview_cleaned_data(output_file, num_examples=2)
        
        print(f"\n✨ Processing complete!")
        print(f"📊 Statistics:")
        print(f"   - Input file: {input_file}")
        print(f"   - Output file: {output_file}")
        print(f"   - Valid conversations: {num_conversations}")
        print(f"   - Ready for training! 🚀")
    else:
        print("❌ No valid conversations were generated. Check your input file.")

if __name__ == "__main__":
    main()

🧹 Kubernetes Stack Overflow Dataset Cleaner and ChatML Converter
Loading Kubernetes Stack Overflow dataset from kubernetes-sof-qa-30k.json...
Found 30044 questions and 30044 answers
Processing 30044 question-answer pairs...
Processed 1000/30044 entries, 1001 valid conversations...
Processed 2000/30044 entries, 2001 valid conversations...
Processed 3000/30044 entries, 3001 valid conversations...
Processed 4000/30044 entries, 4001 valid conversations...
Processed 5000/30044 entries, 5001 valid conversations...
Processed 6000/30044 entries, 6001 valid conversations...
Processed 7000/30044 entries, 7001 valid conversations...
Processed 8000/30044 entries, 8001 valid conversations...
Processed 9000/30044 entries, 9001 valid conversations...
Processed 10000/30044 entries, 10001 valid conversations...
Processed 11000/30044 entries, 11001 valid conversations...
Processed 12000/30044 entries, 12001 valid conversations...
Processed 13000/30044 entries, 13001 valid conversations...
Processed 1400