In [None]:
%%capture
# Install the notebook's pip dependencies; %%capture (which must stay the first
# line of the cell) silences the installer output.
!pip install stackapi pandas pyarrow --quiet

In [None]:
import html
import json
import re
import time
from datetime import datetime, timedelta

import nltk
from nltk.tokenize import sent_tokenize
from stackapi import StackAPI

# sent_tokenize() needs the Punkt sentence models. Older NLTK releases load them
# from the 'punkt' resource, recent ones (3.9+) from 'punkt_tab'; request both so
# sentence counting works on either. Without the right resource sent_tokenize()
# raises LookupError, which the processing loop would swallow per question and
# silently produce an empty dataset.
nltk.download('punkt')
nltk.download('punkt_tab')

# Config
TOTAL_QUESTIONS = 100000   # target number of Q&A pairs to keep
PAGE_SIZE = 50             # items requested per API page
TAGS = ['python']          # Stack Overflow tags the questions must carry
OUTPUT_PATH = "/kaggle/working/simple_python_qa_rag_dataset.json"


# Utility functions
def clean_text(text):
    """Convert an HTML fragment into single-spaced plain text.

    Args:
        text: HTML string such as a question title or a post body.
            ``None`` or an empty string yields ``""``.

    Returns:
        The input with tags removed, HTML entities decoded (``&quot;`` -> ``"``,
        ``&amp;`` -> ``&`` ...) and every whitespace run collapsed to one space,
        stripped at both ends.
    """
    if not text:
        return ""
    # Strip tags BEFORE decoding entities so an escaped "&lt;b&gt;" survives as
    # the literal text "<b>" instead of being removed as if it were markup.
    text = re.sub(r'<[^>]+>', '', text)
    text = html.unescape(text)
    # Runs after unescape so decoded non-breaking spaces are normalised too.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def count_words(text):
    """Return how many whitespace-separated tokens ``text`` contains."""
    tokens = text.split()
    return len(tokens)

def count_sentences(text):
    """Return the number of sentences NLTK's ``sent_tokenize`` finds in ``text``."""
    sentences = sent_tokenize(text)
    return len(sentences)

def contains_code_block(text):
    """Tell whether ``text`` holds a code block.

    A code block is either an HTML ``<pre>`` element or a Markdown triple-backtick
    fence. The ``<pre`` match also accepts attributes (for example
    ``<pre class="lang-py prettyprint-override">``), which an exact ``'<pre>'``
    substring test would miss.

    Args:
        text: Raw HTML or Markdown string. Must still contain its tags; text that
            has already had its tags stripped can only match on the fence.

    Returns:
        ``True`` if a ``<pre ...>`` opening tag or a fence is present, else ``False``.
    """
    # "[\s>]" after "<pre" keeps unrelated tags such as "<prefix>" from matching.
    has_pre_tag = re.search(r'<pre[\s>]', text) is not None
    return has_pre_tag or '```' in text

def is_simple_qa(q, a):
    """Decide whether a question/answer pair is short enough to count as "simple".

    Args:
        q: Question text.
        a: Answer text.

    Returns:
        ``True`` only when the question has at most 20 words, the answer has at
        most 30 words and at most 2 sentences, and the answer holds no code block.
    """
    # Guard clauses, evaluated in the same order as the criteria listed above.
    if count_words(q) > 20:
        return False
    if count_words(a) > 30:
        return False
    if count_sentences(a) > 2:
        return False
    return not contains_code_block(a)

# StackAPI init
site = StackAPI('stackoverflow',  max_retries=5)
site.page_size = PAGE_SIZE
# fetch() retrieves up to `max_pages` pages per call, starting at `page`. The
# collection loop below advances `page` itself, so each call must return exactly
# one page; a larger value makes successive calls re-download overlapping pages,
# producing duplicate IDs and burning API quota.
site.max_pages = 1

# Restrict to questions created in the last 2920 days (~8 years).
start_date = datetime.now() - timedelta(days=2920)
end_date = datetime.now()

print(f"Fetching simple Python Q&A pairs (max {TOTAL_QUESTIONS})...")

# Phase 1: Get question IDs
question_ids = []
seen_question_ids = set()  # guards against the same ID being queued twice
page = 1
while len(question_ids) < TOTAL_QUESTIONS * 2:  # Oversample for filtering
    try:
        questions = site.fetch('questions',
            tagged=TAGS,
            sort='votes',
            order='desc',
            fromdate=int(start_date.timestamp()),
            todate=int(end_date.timestamp()),
            page=page,
            filter='withbody'
        )
        if not questions['items']:
            break

        # Vote-sorted pages can shift between requests, so a question may show
        # up on two pages; keep only the first occurrence, preserving order.
        for q in questions['items']:
            qid = q['question_id']
            if qid not in seen_question_ids:
                seen_question_ids.add(qid)
                question_ids.append(qid)
        print(f"Collected {len(question_ids)} question IDs...")

        # Stop as soon as the API reports there is nothing further to page through.
        if not questions.get('has_more', True):
            break

        page += 1
        time.sleep(1)

    except Exception as e:
        # Keep whatever was collected so far and move on to processing.
        print(f"Error fetching page {page}: {str(e)}")
        break

# Phase 2: Process Q&As
# For each collected question, fetch its top-voted answer, keep the pair only if
# it passes the "simple Q&A" filter, and append a RAG-style record to
# processed_data.
processed_data = []
batch_size = 10
for i in range(0, len(question_ids), batch_size):
    batch_ids = question_ids[i:i+batch_size]
    if len(processed_data) >= TOTAL_QUESTIONS:
        break

    try:
        questions = site.fetch('questions', ids=batch_ids, filter='withbody')
        for question in questions['items']:
            # Re-check the cap per question so a batch cannot overshoot it.
            if len(processed_data) >= TOTAL_QUESTIONS:
                break

            try:
                answers = site.fetch(f'questions/{question["question_id"]}/answers',
                    filter='withbody',
                    sort='votes',
                    order='desc'
                )['items']

                if len(answers) < 1:
                    continue

                # Answers are requested sorted by votes (descending), so the
                # first item is the highest-voted one.
                best_answer = answers[0]

                # Code blocks must be detected on the RAW answer HTML:
                # clean_text() strips every tag, so the <pre> check inside
                # is_simple_qa() can never match on the cleaned text below.
                if contains_code_block(best_answer['body']):
                    continue

                q_text = clean_text(question['title'])
                a_text = clean_text(best_answer['body'])

                if not is_simple_qa(q_text, a_text):
                    continue

                # Two retrieval chunks per record: the question body as a
                # distractor and the answer itself as the gold chunk.
                context_chunks = [
                    {
                        "text": clean_text(question['body']),
                        "contains_answer": False,
                        "score": 0.3,
                        "source": "stackoverflow_question"
                    },
                    {
                        "text": a_text,
                        "contains_answer": True,
                        "score": 1.0,
                        "source": "stackoverflow"
                    }
                ]

                processed_data.append({
                    "question": q_text,
                    "expected_answer": a_text,
                    "context_chunks": context_chunks,
                    "metadata": {
                        "tags": question.get('tags', []),
                        "question_score": question.get('score', 0),
                        "answer_score": best_answer.get('score', 0),
                        # NOTE(review): fromtimestamp() renders the machine's local
                        # time; creation_date is presumably a UTC epoch - confirm
                        # if these timestamps must be timezone-stable.
                        "created": datetime.fromtimestamp(question['creation_date']).isoformat(),
                        "question_id": question['question_id'],
                        "answer_id": best_answer['answer_id']
                    }
                })

                print(f"✓ {len(processed_data)} / {TOTAL_QUESTIONS}: {q_text[:50]}")

            except Exception as e:
                # Best effort: one bad question must not abort the whole batch.
                print(f"Skipping question {question.get('question_id')}: {e}")
                continue

        time.sleep(2)

    except Exception as e:
        # The batch request itself failed; wait longer before the next batch.
        print(f"Batch error {i}-{i+batch_size}: {e}")
        time.sleep(10)
        continue

# Save output
# Write the collected records as pretty-printed UTF-8 JSON; nothing is written
# when no pair passed the filters.
if not processed_data:
    print("\n❌ No suitable Q&A found.")
else:
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as out_file:
        json.dump(processed_data, out_file, indent=2, ensure_ascii=False)
    print(f"\n✅ Saved {len(processed_data)} simple Q&A pairs to {OUTPUT_PATH}")


In [None]:
from datasets import load_dataset

# Open the train split in streaming mode so rows are read lazily instead of
# downloading the full dataset first.
ds = load_dataset("nomic-ai/cornstack-python-v1", split="train", streaming=True)

# Pull a single row off the stream and show it.
first_row = next(iter(ds))
print("First row:", first_row, sep="\n")