# Download & Process StackExchange data *(experiment 1 setup)*

Query the [Software Recommendations StackExchange](https://softwarerecs.stackexchange.com/) [API](https://api.stackexchange.com/) for questions related to coding libraries.

We filter the questions to only those related to libraries, and then use these questions to determine the descriptions to use for our realistic user prompts.

In [1]:
# initial set up

from llm_cgr import load_json, save_json
from datetime import datetime

dir = "../data/stackexchange"

In [13]:
# configure the client

from stackapi import StackAPI

site = StackAPI("softwarerecs", max_pages=25)

In [14]:
# function to filter the response

KEYS_TO_SAVE = ["title", "tags", "creation_date", "link"]


def filter_response(
    response: dict,
) -> dict:
    return {
        item["question_id"]: {key: item[key] for key in KEYS_TO_SAVE if key in item}
        for item in response["items"]
    }

## **1.** Query all recent questions

In [30]:
# query maximum questions, sorted by most recent creation date

recent_response = site.fetch(
    endpoint="questions",
    sort="creation",
    order="desc",
)
print(f"Have {len(recent_response['items'])} records.")

Have 2500 records.


In [None]:
# save the recent questions data

save_json(
    data=filter_response(recent_response),
    file_path=f"{dir}/recent_questions_{datetime.now().date()}.json",
)

## **2.** Sample 200 of the questions for manual analysis

In [None]:
# sample 200 of the recent questions for manual analysis

import random

recent_questions = load_json(
    file_path=f"{dir}/recent_questions_2025-06-30.json",
)
recent_ids = sorted(recent_questions.keys())

random.seed(42)  # for reproducibility
recent_sample_ids = random.sample(
    population=recent_ids,
    k=200,
)
print(recent_sample_ids)

['84729', '83509', '87742', '87357', '87064', '85045', '84578', '91852', '84334', '92297', '90696', '83586', '83566', '84444', '86984', '87173', '91479', '93389', '83526', '92009', '86751', '91850', '90674', '87015', '90975', '92287', '87794', '83296', '85224', '90699', '88652', '87793', '85185', '86943', '88603', '84572', '84432', '90170', '84499', '88919', '88698', '93397', '87604', '83731', '91049', '91777', '84905', '90156', '84196', '91920', '87998', '89953', '92155', '86659', '84088', '83765', '87113', '87964', '84210', '87174', '84550', '91010', '89987', '85249', '90071', '88863', '86874', '87634', '84108', '93441', '85369', '91757', '87356', '85260', '91073', '90163', '87690', '91968', '86998', '88434', '83906', '87127', '83591', '88316', '90458', '87659', '84040', '86886', '92055', '88296', '86901', '91430', '90386', '91044', '85071', '87606', '87374', '92010', '91800', '87583', '92242', '90772', '92232', '90423', '89955', '86992', '85037', '91516', '91388', '84398', '83779', 

In [None]:
# save random sample with dictionary to hold whether the question is library-related

recent_sample = {}
for question_id in recent_sample_ids:
    recent_sample[question_id] = recent_questions[question_id]
    recent_sample[question_id]["is_library_related"] = {
        "manual": None,
        "auto": None,
    }

save_json(
    data=recent_sample,
    file_path=f"{dir}/manual_sample_{datetime.now().date()}.json",
)

## **3.** Define our filter method

Filter to questions that:
* Contain the words "library" or "framework"
* Have the tag "library" or "framework"
* Are tagged with one of the top 10 programming languages (TIOBE Index), and not tagged with "books" or "ide".


In [None]:
# define our filter for library questions

# fmt: off
base_tags = ["library", "framework"]

language_tags = [
    # top 10 programming languages (TIOBE Index)
    "python", "c++", "c", "java", "c#",
    "javascript", "go", "visual-basic", "pascal", "fortran",
]
language_blocks = ["books", "ide"]

words = ["library", "framework"]
# fmt: on

## **4.** Apply the filter to the manual sample

In [40]:
manual_sample_file = f"{dir}/manual_analysis_2025-06-30.json"
manual_sample = load_json(
    file_path=manual_sample_file,
)

for _id, _data in manual_sample.items():
    # check for the words in the title
    if any(word in _data["title"].lower() for word in words):
        manual_sample[_id]["is_library_related"]["auto"] = True

    # check for any base tags
    elif any(tag in _data["tags"] for tag in base_tags):
        manual_sample[_id]["is_library_related"]["auto"] = True

    # check for any language tags
    elif any(tag in _data["tags"] for tag in language_tags) and not any(
        tag in _data["tags"] for tag in language_blocks
    ):
        manual_sample[_id]["is_library_related"]["auto"] = True

    # otherwise not library related
    else:
        manual_sample[_id]["is_library_related"]["auto"] = False

save_json(
    data=manual_sample,
    file_path=manual_sample_file,
)

## **5.** Check the accuracy of the filter compared to the manual classification

In [41]:
from sklearn.metrics import classification_report

manual_sample = load_json(
    file_path=f"{dir}/manual_analysis_2025-06-30.json",
)

y_true, y_pred = [], []

for _, _data in manual_sample.items():
    y_true.append(int(_data["is_library_related"]["manual"]))
    y_pred.append(int(_data["is_library_related"]["auto"]))


print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       177
           1       0.91      0.91      0.91        23

    accuracy                           0.98       200
   macro avg       0.95      0.95      0.95       200
weighted avg       0.98      0.98      0.98       200



## **6.** Query questions for each tag and word

In [42]:
# fetch questions for each tag and filter them

tagged_questions = {}
for tag in base_tags:
    _response = site.fetch(
        endpoint="questions",
        sort="creation",
        order="desc",
        tagged=tag,
    )
    tagged_questions.update(
        filter_response(_response),
    )
    print(f"Have {len(_response['items'])} records for tag {tag}.")

Have 1022 records for tag library.
Have 200 records for tag framework.


In [43]:
# fetch questions for each tag and filter them

language_questions = {}
for tag in language_tags:
    _response = site.fetch(
        endpoint="questions",
        sort="creation",
        order="desc",
        tagged=tag,
    )
    language_questions.update(
        filter_response(_response),
    )
    print(f"Have {len(_response['items'])} records for tag {tag}.")

# filter out language questions that are tagged with the blocks
language_questions = {
    _id: _data
    for _id, _data in language_questions.items()
    if not any(tag in _data["tags"] for tag in language_blocks)
}
print(f"Have {len(language_questions)} language questions after filtering.")

Have 872 records for tag python.
Have 575 records for tag c++.
Have 262 records for tag c.
Have 705 records for tag java.
Have 407 records for tag c#.
Have 839 records for tag javascript.
Have 38 records for tag go.
Have 0 records for tag visual-basic.
Have 0 records for tag pascal.
Have 13 records for tag fortran.
Have 3229 language questions after filtering.


In [44]:
# fetch questions for each word and filter them

word_questions = {}
for word in words:
    _response = site.fetch(
        endpoint="search/advanced",
        sort="creation",
        order="desc",
        title=word,
    )
    word_questions.update(
        filter_response(_response),
    )
    print(f"Have {len(_response['items'])} records for word {word}.")

Have 1197 records for word library.
Have 330 records for word framework.


In [45]:
all_questions = {
    **tagged_questions,
    **language_questions,
    **word_questions,
}
print(f"Have {len(all_questions)} question records total:")

print(f"    ({len(tagged_questions)} questions from base tags,")
print(f"     {len(language_questions)} questions from language tags,")
print(f"     {len(word_questions)} questions from title words.)")

Have 3917 question records total:
    (1207 questions from base tags,
     3229 questions from language tags,
     1499 questions from title words.)


In [None]:
# save all the questions data, ready for analysis

save_json(
    data=all_questions,
    file_path=f"{dir}/library_questions_{datetime.now().date()}.json",
)

## **7.** Extract n-grams from the question titles

Normalise the titles and break them down into n-grams.

In [None]:
# function to normalise the question titles


def process_title(text: str) -> str:
    text = text.lower().strip()
    for _replace, _with in [
        ("opensource", "open source"),
        ("light-weight", "lightweight"),
    ]:
        text = text.replace(_replace, _with)
    return text

In [8]:
# load the questions data and process titles

questions = load_json(
    file_path=f"{dir}/library_questions_2025-07-04.json",
)

ids, titles = [], []
for _id, _data in questions.items():
    ids.append(_id)
    titles.append(process_title(text=_data["title"]))

In [9]:
# extract n-grams from the titles

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(
    ngram_range=(1, 3),
    stop_words="english",
)
X = vectorizer.fit_transform(titles)
ngrams = vectorizer.get_feature_names_out()  # all unique n-grams
vocab = vectorizer.vocabulary_  # dict: ngram -> column index

In [10]:
# filter out n-grams that are too short or are numeric

filtered_ngrams = [ng for ng in ngrams if len(ng) > 2 and not ng.isdigit()]

print(f"Have {len(filtered_ngrams)} unique n-grams after filtering.")

Have 35641 unique n-grams after filtering.


In [11]:
# map ngrams to title ids

ngram_titles = {}
for ng in filtered_ngrams:
    col = vocab[ng]
    rows = X[:, col].nonzero()[0]
    ngram_titles[ng] = [ids[i] for i in rows.tolist()]

print(f"Have {len(ngram_titles)} n-grams mapped to their titles.")

Have 35641 n-grams mapped to their titles.


In [12]:
# save the n-grams data

save_json(
    data=ngram_titles,
    file_path=f"{dir}/ngrams_{datetime.now().date()}.json",
)