In [7]:
import requests
import time
import json

# API configuration
TAGS = ["nlp", "nltk", "spacy", "transformers", "text-classification"]
SITE = "stackoverflow"
PAGE_SIZE = 100
MAX_PAGES = 200
OUTPUT_FILE = "stackoverflow_nlp_posts_with_answers.json"
API_KEY = "rl_ndUUpkDFr5NnfioGSSC4VfKu4"
TARGET_COUNT = 20000

all_questions = []
valid_question_count = 0

# Loop through tags and paginated questions
for tag in TAGS:
    if valid_question_count >= TARGET_COUNT:
        break

    for page in range(1, MAX_PAGES + 1):
        if valid_question_count >= TARGET_COUNT:
            break

        print(f"Fetching page {page} for tag [{tag}]...")

        url = "https://api.stackexchange.com/2.3/questions"
        params = {
            "page": page,
            "pagesize": PAGE_SIZE,
            "order": "desc",
            "sort": "creation",
            "tagged": tag,
            "site": SITE,
            "filter": "withbody",
            "key": API_KEY
        }

        response = requests.get(url, params=params)
        if response.status_code != 200:
            print(f"Error {response.status_code}: {response.text}")
            break

        try:
            data = response.json()
        except ValueError:
            print("Invalid JSON response. Skipping this page.")
            continue

        if "items" not in data or not data["items"]:
            print(f"No items found for tag [{tag}], page {page}")
            break

        # Filter questions that have accepted answers
        questions_with_answers = [
            item for item in data["items"]
            if item.get("accepted_answer_id")
        ]

        # Fetch bodies of accepted answers in batches
        accepted_ids = [str(item["accepted_answer_id"]) for item in questions_with_answers]
        id_batches = [accepted_ids[i:i+100] for i in range(0, len(accepted_ids), 100)]

        accepted_answers = {}
        for batch in id_batches:
            ids_str = ";".join(batch)
            ans_url = f"https://api.stackexchange.com/2.3/answers/{ids_str}"
            ans_params = {
                "site": SITE,
                "filter": "withbody",
                "key": API_KEY
            }
            ans_response = requests.get(ans_url, params=ans_params)
            if ans_response.status_code == 200:
                try:
                    ans_data = ans_response.json()
                    for ans in ans_data.get("items", []):
                        accepted_answers[ans["answer_id"]] = ans.get("body")
                except ValueError:
                    continue

            time.sleep(0.5)  

        # Merge question and its accepted answer into the dataset
        for item in questions_with_answers:
            if valid_question_count >= TARGET_COUNT:
                break

            aid = item["accepted_answer_id"]
            abody = accepted_answers.get(aid)

            if abody:
                question_data = {
                    "question_id": item.get("question_id"),
                    "title": item.get("title"),
                    "body": item.get("body"),
                    "tags": item.get("tags"),
                    "accepted_answer_id": aid,
                    "accepted_answer_body": abody,
                    "score": item.get("score")
                }
                all_questions.append(question_data)
                valid_question_count += 1

        print(f" Total collected so far: {valid_question_count}")
        time.sleep(0.5)

# Save all collected data to a JSON file
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(all_questions, f, indent=4)

print(f" Done! Saved {valid_question_count} valid Q&A posts to {OUTPUT_FILE}")


Fetching page 1 for tag [nlp]...
 Total collected so far: 18
Fetching page 2 for tag [nlp]...
 Total collected so far: 38
Fetching page 3 for tag [nlp]...
 Total collected so far: 57
Fetching page 4 for tag [nlp]...
 Total collected so far: 67
Fetching page 5 for tag [nlp]...
 Total collected so far: 79
Fetching page 6 for tag [nlp]...
 Total collected so far: 99
Fetching page 7 for tag [nlp]...
 Total collected so far: 125
Fetching page 8 for tag [nlp]...
 Total collected so far: 153
Fetching page 9 for tag [nlp]...
 Total collected so far: 175
Fetching page 10 for tag [nlp]...
 Total collected so far: 205
Fetching page 11 for tag [nlp]...
 Total collected so far: 223
Fetching page 12 for tag [nlp]...
 Total collected so far: 243
Fetching page 13 for tag [nlp]...
 Total collected so far: 273
Fetching page 14 for tag [nlp]...
 Total collected so far: 298
Fetching page 15 for tag [nlp]...
 Total collected so far: 328
Fetching page 16 for tag [nlp]...
 Total collected so far: 358
Fetchin

In [8]:
import requests
import time
import json

# API configuration
TAGS = ["nlp", "nltk", "spacy", "transformers", "text-classification"]
SITE = "stackoverflow"
PAGE_SIZE = 100
START_PAGE = 19
MAX_PAGES = 200
OUTPUT_FILE = "stackoverflow_nlp_posts_part2.json"
API_KEY = "rl_ikc5MimeHf8zgfjLcqbwYV5rr"
TARGET_COUNT = 20000

all_questions = []
valid_question_count = 0

# Loop through tags and paginated questions
for tag in TAGS:
    for page in range(START_PAGE, MAX_PAGES + 1):
        print(f"Fetching page {page} for tag [{tag}]...")

        url = "https://api.stackexchange.com/2.3/questions"
        params = {
            "page": page,
            "pagesize": PAGE_SIZE,
            "order": "desc",
            "sort": "creation",
            "tagged": tag,
            "site": SITE,
            "filter": "withbody",
            "key": API_KEY
        }

        response = requests.get(url, params=params)
        if response.status_code != 200:
            print(f"Error {response.status_code}: {response.text}")
            break

        try:
            data = response.json()
        except ValueError:
            print("Invalid JSON response. Skipping this page.")
            continue

        if "items" not in data or not data["items"]:
            print(f"No items found for tag [{tag}], page {page}")
            break

        # Filter questions that have accepted answers
        questions_with_answers = [
            item for item in data["items"] if item.get("accepted_answer_id")
        ]

        # Fetch bodies of accepted answers in batches
        accepted_ids = [str(q["accepted_answer_id"]) for q in questions_with_answers]
        id_batches = [accepted_ids[i:i+100] for i in range(0, len(accepted_ids), 100)]

        accepted_answers = {}
        for batch in id_batches:
            ids_str = ";".join(batch)
            ans_url = f"https://api.stackexchange.com/2.3/answers/{ids_str}"
            ans_params = {
                "site": SITE,
                "filter": "withbody",
                "key": API_KEY
            }
            ans_response = requests.get(ans_url, params=ans_params)
            if ans_response.status_code == 200:
                try:
                    ans_data = ans_response.json()
                    for ans in ans_data.get("items", []):
                        accepted_answers[ans["answer_id"]] = ans.get("body")
                except ValueError:
                    continue
            time.sleep(0.5)  

        # Merge question and its accepted answer into the dataset
        for item in questions_with_answers:
            if valid_question_count >= TARGET_COUNT:
                break

            aid = item["accepted_answer_id"]
            abody = accepted_answers.get(aid)

            if abody:
                question_data = {
                    "question_id": item.get("question_id"),
                    "title": item.get("title"),
                    "body": item.get("body"),
                    "tags": item.get("tags"),
                    "accepted_answer_id": aid,
                    "accepted_answer_body": abody,
                    "score": item.get("score")
                }
                all_questions.append(question_data)
                valid_question_count += 1

        print(f" Total collected so far: {valid_question_count}")
        time.sleep(0.5)

        if valid_question_count >= TARGET_COUNT:
            break
    if valid_question_count >= TARGET_COUNT:
        break

# Save all collected data to a JSON file
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(all_questions, f, indent=4)

print(f" Done! Saved {valid_question_count} new Q&A posts to {OUTPUT_FILE}")


Fetching page 19 for tag [nlp]...
 Total collected so far: 29
Fetching page 20 for tag [nlp]...
 Total collected so far: 59
Fetching page 21 for tag [nlp]...
 Total collected so far: 89
Fetching page 22 for tag [nlp]...
 Total collected so far: 119
Fetching page 23 for tag [nlp]...
 Total collected so far: 149
Fetching page 24 for tag [nlp]...
 Total collected so far: 179
Fetching page 25 for tag [nlp]...
 Total collected so far: 209
Fetching page 26 for tag [nlp]...
 Total collected so far: 235
Fetching page 27 for tag [nlp]...
 Total collected so far: 265
Fetching page 28 for tag [nlp]...
 Total collected so far: 295
Fetching page 29 for tag [nlp]...
 Total collected so far: 325
Fetching page 30 for tag [nlp]...
 Total collected so far: 355
Fetching page 31 for tag [nlp]...
 Total collected so far: 385
Fetching page 32 for tag [nlp]...
 Total collected so far: 413
Fetching page 33 for tag [nlp]...
 Total collected so far: 443
Fetching page 34 for tag [nlp]...
 Total collected so far:

In [9]:
import requests
import time
import json

# API configuration
TAGS = ["nlp", "nltk", "spacy", "transformers", "text-classification"]
SITE = "stackoverflow"
PAGE_SIZE = 100
START_PAGE = 32
MAX_PAGES = 200
OUTPUT_FILE = "stackoverflow_nlp_posts_part3.json"
API_KEY = "rl_hgq4LDMkXEWV6kz23rmXAam1R"
TARGET_COUNT = 5000

all_questions = []
valid_question_count = 0

# Loop through tags and paginated questions
for tag in TAGS:
    for page in range(START_PAGE, MAX_PAGES + 1):
        print(f"Fetching page {page} for tag [{tag}]...")

        url = "https://api.stackexchange.com/2.3/questions"
        params = {
            "page": page,
            "pagesize": PAGE_SIZE,
            "order": "desc",
            "sort": "creation",
            "tagged": tag,
            "site": SITE,
            "filter": "withbody",
            "key": API_KEY
        }

        response = requests.get(url, params=params)
        if response.status_code != 200:
            print(f"Error {response.status_code}: {response.text}")
            break

        try:
            data = response.json()
        except ValueError:
            print("Invalid JSON response. Skipping this page.")
            continue

        if "items" not in data or not data["items"]:
            print(f"No items found for tag [{tag}], page {page}")
            break

        # Filter questions that have accepted answers
        questions_with_answers = [
            item for item in data["items"] if item.get("accepted_answer_id")
        ]

        # Fetch bodies of accepted answers in batches
        accepted_ids = [str(q["accepted_answer_id"]) for q in questions_with_answers]
        id_batches = [accepted_ids[i:i+100] for i in range(0, len(accepted_ids), 100)]

        accepted_answers = {}
        for batch in id_batches:
            ids_str = ";".join(batch)
            ans_url = f"https://api.stackexchange.com/2.3/answers/{ids_str}"
            ans_params = {
                "site": SITE,
                "filter": "withbody",
                "key": API_KEY
            }
            ans_response = requests.get(ans_url, params=ans_params)
            if ans_response.status_code == 200:
                try:
                    ans_data = ans_response.json()
                    for ans in ans_data.get("items", []):
                        accepted_answers[ans["answer_id"]] = ans.get("body")
                except ValueError:
                    continue
            time.sleep(0.5)  

        # Merge question and its accepted answer into the dataset
        for item in questions_with_answers:
            if valid_question_count >= TARGET_COUNT:
                break

            aid = item["accepted_answer_id"]
            abody = accepted_answers.get(aid)

            if abody:
                question_data = {
                    "question_id": item.get("question_id"),
                    "title": item.get("title"),
                    "body": item.get("body"),
                    "tags": item.get("tags"),
                    "accepted_answer_id": aid,
                    "accepted_answer_body": abody,
                    "score": item.get("score")
                }
                all_questions.append(question_data)
                valid_question_count += 1

        print(f" Total collected so far: {valid_question_count}")
        time.sleep(0.5)

        if valid_question_count >= TARGET_COUNT:
            break
    if valid_question_count >= TARGET_COUNT:
        break

# Save all collected data to a JSON file
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(all_questions, f, indent=4)

print(f" Done! Saved {valid_question_count} new Q&A posts to {OUTPUT_FILE}")


Fetching page 32 for tag [nlp]...
 Total collected so far: 28
Fetching page 33 for tag [nlp]...
 Total collected so far: 58
Fetching page 34 for tag [nlp]...
 Total collected so far: 88
Fetching page 35 for tag [nlp]...
 Total collected so far: 118
Fetching page 36 for tag [nlp]...
 Total collected so far: 148
Fetching page 37 for tag [nlp]...
 Total collected so far: 178
Fetching page 38 for tag [nlp]...
 Total collected so far: 208
Fetching page 39 for tag [nlp]...
 Total collected so far: 238
Fetching page 40 for tag [nlp]...
 Total collected so far: 268
Fetching page 41 for tag [nlp]...
 Total collected so far: 298
Fetching page 42 for tag [nlp]...
 Total collected so far: 328
Fetching page 43 for tag [nlp]...
 Total collected so far: 358
Fetching page 44 for tag [nlp]...
 Total collected so far: 387
Fetching page 45 for tag [nlp]...
 Total collected so far: 416
Fetching page 46 for tag [nlp]...
 Total collected so far: 446
Fetching page 47 for tag [nlp]...
 Total collected so far:

In [10]:
import json

# Load and merge three batches of JSON files
with open("stackoverflow_nlp_posts_with_answers.json", "r", encoding="utf-8") as f1, \
     open("stackoverflow_nlp_posts_part2.json", "r", encoding="utf-8") as f2, \
     open("stackoverflow_nlp_posts_part3.json", "r", encoding="utf-8") as f3:
    
    data1 = json.load(f1)
    data2 = json.load(f2)
    data3 = json.load(f3)


combined = data1 + data2 + data3


with open("stackoverflow_nlp_combined_20000.json", "w", encoding="utf-8") as out:
    json.dump(combined, out, indent=2)

print(f" Combined file saved as 'stackoverflow_nlp_combined_20000.json' with {len(combined)} entries")


 Combined file saved as 'stackoverflow_nlp_combined_20000.json' with 22267 entries
