# Download StackExchange data

Query the Software Recommendations StackExchange API for questions.

Want questions that show the terminology developers use when searching for libraries.

Get all questions tagged with 'library', or that have 'library' in the title.


In [5]:
# configure the client

from stackapi import StackAPI

site = StackAPI("softwarerecs", max_pages=100)

In [6]:
# filter string queried from https://api.stackexchange.com/docs/create-filter
# includes: question_id, title, tags, creation_date, link

filter = "!.yIVcje83OXko3o5"

In [20]:
# function to filter the response

KEYS_TO_SAVE = ["title", "tags", "creation_date", "link"]


def filter_response(
    response: dict,
) -> dict:
    return {
        item["question_id"]: {key: item[key] for key in KEYS_TO_SAVE if key in item}
        for item in response["items"]
    }

In [None]:
# count all questions with 'library' tag

library_tag_total = site.fetch(
    endpoint="questions",
    tagged="library",
    filter="total",
)
print("Response:", library_tag_total)

Response: {'backoff': 0, 'has_more': False, 'page': 1, 'quota_max': -1, 'quota_remaining': -1, 'total': 1022, 'items': []}


In [13]:
# query all questions with 'library' tag

library_tag_resp = site.fetch(
    endpoint="questions",
    tagged="library",
)
print(f"Have {len(library_tag_resp['items'])} records.")
print(library_tag_resp)

Have 1022 records.
{'backoff': 0, 'has_more': False, 'page': 11, 'quota_max': 300, 'quota_remaining': 185, 'total': 0, 'items': [{'tags': ['library', 'software-development', 'chat', 'saas'], 'owner': {'account_id': 255569, 'reputation': 1, 'user_id': 85068, 'user_type': 'registered', 'profile_image': 'https://www.gravatar.com/avatar/273c010fb8a91fdaa8f4814e1dbe65cb?s=256&d=identicon&r=PG', 'display_name': 'user535704', 'link': 'https://softwarerecs.stackexchange.com/users/85068/user535704'}, 'is_answered': False, 'view_count': 39, 'answer_count': 0, 'score': 0, 'last_activity_date': 1749713228, 'creation_date': 1695564495, 'question_id': 87962, 'content_license': 'CC BY-SA 4.0', 'link': 'https://softwarerecs.stackexchange.com/questions/87962/whitelabel-chat-system-to-embed-in-custom-made-saas-crm', 'title': 'Whitelabel chat system to embed in custom made SaaS CRM'}, {'tags': ['library', 'javascript', 'angular'], 'owner': {'account_id': 20599366, 'reputation': 11, 'user_id': 71494, 'use

In [16]:
# count all questions with 'library' in the title

library_title_total = site.fetch(
    endpoint="search/advanced",
    title="library",
    filter="total",
)
print("Response:", library_title_total)

Response: {'backoff': 0, 'has_more': False, 'page': 1, 'quota_max': -1, 'quota_remaining': -1, 'total': 1195, 'items': []}


In [17]:
# query all questions with 'library' in the title

library_title_resp = site.fetch(
    endpoint="search/advanced",
    title="library",
)
print(f"Have {len(library_title_resp['items'])} records.")
print(library_title_resp)

Have 1195 records.
{'backoff': 0, 'has_more': False, 'page': 12, 'quota_max': 300, 'quota_remaining': 173, 'total': 0, 'items': [{'tags': ['java', 'caching'], 'owner': {'account_id': 7062708, 'reputation': 101, 'user_id': 85368, 'user_type': 'registered', 'profile_image': 'https://lh6.googleusercontent.com/-NsKU95wNnz8/AAAAAAAAAAI/AAAAAAAAAlI/JsEGfhwPOqU/s256-rj/photo.jpg', 'display_name': 'Andriy Makukha', 'link': 'https://softwarerecs.stackexchange.com/users/85368/andriy-makukha'}, 'is_answered': False, 'view_count': 3, 'answer_count': 0, 'score': 0, 'last_activity_date': 1749746371, 'creation_date': 1749746371, 'question_id': 93382, 'content_license': 'CC BY-SA 4.0', 'link': 'https://softwarerecs.stackexchange.com/questions/93382/2-ttl-caching-library-for-java', 'title': '2-TTL caching library for Java'}, {'tags': ['c++', 'images', 'image-processing', 'file-format'], 'owner': {'account_id': 179242, 'reputation': 179, 'user_id': 28613, 'user_type': 'registered', 'profile_image': 'htt

In [21]:
# combine the questions

questions = {
    **filter_response(library_tag_resp),
    **filter_response(library_title_resp),
}
print(f"Have {len(questions)} question records total.")

Have 1551 question records total.


In [22]:
# save the questions data

from llm_cgr import save_json
from datetime import datetime

save_json(
    data=questions,
    file_path=f"data/stackexchange/questions_{datetime.now().date()}.json",
)