# Download StackExchange data

Query the Software Recommendations StackExchange API for questions.

We want questions that show the terminology developers use when searching for libraries.

Get all questions tagged with 'library', or that have 'library' in the title.

Website: https://softwarerecs.stackexchange.com/

In [None]:
# configure the client

from stackapi import StackAPI

# client for the Software Recommendations site, allowing up to 100 pages per fetch
site = StackAPI(
    "softwarerecs",
    max_pages=100,
)

In [None]:
# filter string queried from https://api.stackexchange.com/docs/create-filter
# includes: question_id, title, tags, creation_date, link
# NOTE(review): this name shadows the built-in `filter`, and none of the
# `site.fetch` calls visible in this notebook pass it -- confirm whether the
# full-record queries were meant to use `filter=filter`.

filter = "!.yIVcje83OXko3o5"

In [None]:
# function to filter the response

KEYS_TO_SAVE = ["title", "tags", "creation_date", "link"]


def filter_response(
    response: dict,
) -> dict:
    """Index the items of an API response by question id.

    Each entry of ``response["items"]`` becomes one record stored under its
    ``"question_id"``. A record keeps only the fields listed in
    ``KEYS_TO_SAVE`` that the item actually contains, in that order.
    """
    records = {}
    for question in response["items"]:
        record = {}
        for field in KEYS_TO_SAVE:
            if field in question:
                record[field] = question[field]
        records[question["question_id"]] = record
    return records

In [None]:
# ask only for the total number of questions carrying the 'library' tag

library_tag_total = site.fetch(endpoint="questions", tagged="library", filter="total")
print("Response:", library_tag_total)

In [None]:
# fetch the question records carrying the 'library' tag

library_tag_resp = site.fetch(endpoint="questions", tagged="library")
print(f"Have {len(library_tag_resp['items'])} records.")
print(library_tag_resp)

In [None]:
# ask only for the total number of questions with 'library' in the title

library_title_total = site.fetch(endpoint="search/advanced", title="library", filter="total")
print("Response:", library_title_total)

In [None]:
# fetch the question records with 'library' in the title

library_title_resp = site.fetch(endpoint="search/advanced", title="library")
print(f"Have {len(library_title_resp['items'])} records.")
print(library_title_resp)

In [None]:
# merge both result sets into one mapping keyed by question id

questions = filter_response(library_tag_resp)
# for an id present in both, the title-search record replaces the tag-search one
questions.update(filter_response(library_title_resp))
print(f"Have {len(questions)} question records total.")

In [None]:
# save the questions data

from llm_cgr import save_json
from datetime import datetime

# the output file name carries the current local date
file_path = f"questions_{datetime.now().date()}.json"
save_json(data=questions, file_path=file_path)

# Process the question titles

Normalise the titles, break them down into n-grams, and compute embeddings for those n-grams.
The results are then ready for systematic analysis.

In [None]:
# function to normalise the question titles


def process_text(text: str) -> str:
    """Return *text* lower-cased, stripped, and with spelling variants unified.

    "opensource" becomes "open source" and "light-weight" becomes
    "lightweight", so both spellings end up as the same token sequence.
    """
    spelling_variants = {
        "opensource": "open source",
        "light-weight": "lightweight",
    }
    normalised = text.lower().strip()
    for variant, canonical in spelling_variants.items():
        normalised = normalised.replace(variant, canonical)
    return normalised


# normalised title of every collected question, in `questions` order
titles = list(map(process_text, (record["title"] for record in questions.values())))

In [None]:
# extract n-grams from the titles

from sklearn.feature_extraction.text import CountVectorizer

import numpy as np

# count 1- to 3-word n-grams across the titles, ignoring English stop words
vectorizer = CountVectorizer(
    ngram_range=(1, 3),
    stop_words="english",
)
X = vectorizer.fit_transform(titles)  # sparse count matrix, one row per title
ngrams = vectorizer.get_feature_names_out()  # all unique n-grams
# Sum the columns on the sparse matrix itself: `X.toarray()` would allocate a
# dense (titles x n-grams) array only to add it up. `np.asarray(...).ravel()`
# flattens the 1 x n result into the same 1-D array as before.
counts = np.asarray(X.sum(axis=0)).ravel()  # frequencies of each n-gram

In [None]:
# keep n-grams longer than three characters that are not pure digit strings
# NOTE(review): the n-gram counts cell further down filters with `len(k) > 2`
# instead -- confirm which threshold is intended so both outputs share keys.
filtered_ngrams = []
for candidate in ngrams:
    if len(candidate) <= 3 or candidate.isdigit():
        continue
    filtered_ngrams.append(candidate)

print(f"Have {len(filtered_ngrams)} unique n-grams after filtering.")

In [None]:
vocab = vectorizer.vocabulary_  # dict: ngram -> column index
# For every kept n-gram, list the rows of X whose count for it is non-zero.
# X was fitted on `titles`, so a row index is a position in that list.
ngram_titles = {
    ng: X[:, vocab[ng]].nonzero()[0].tolist()
    for ng in filtered_ngrams
}

print(ngram_titles)

In [None]:
# pair every n-gram with its count, most frequent first
_combined = sorted(zip(ngrams, counts), key=lambda pair: pair[1], reverse=True)

# drop n-grams of one or two characters and purely numeric ones;
# counts are cast to plain ints
ngram_counts = {
    ngram: int(count)
    for ngram, count in dict(_combined).items()
    if len(ngram) > 2 and not ngram.isdigit()
}

print(f"Have {len(ngram_counts)} n-grams after filtering.")

In [None]:
# save the n-gram -> title row indices mapping
# NOTE(review): `ngram_counts` is not written anywhere in this notebook --
# confirm that saving `ngram_titles` (rather than the counts) is intended here.

file_path = f"ngrams_{datetime.now().date()}.json"
save_json(data=ngram_titles, file_path=file_path)

In [None]:
# get embeddings for the n-grams

from sentence_transformers import SentenceTransformer
import numpy as np

# sentence-embedding model used to encode the n-grams
model = SentenceTransformer("all-MiniLM-L6-v2")

# encode the n-grams in `ngram_counts` key order, returning a numpy array
_ngrams = [*ngram_counts]
embeddings = model.encode(sentences=_ngrams, convert_to_numpy=True, show_progress_bar=True)

In [None]:
# divide every embedding row by its own norm (for cosine similarity)

_row_norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
embeddings = embeddings / _row_norms

In [None]:
# map each n-gram to its embedding, converted to a plain list

ngram_embeddings = {
    ngram: vector.tolist() for ngram, vector in zip(_ngrams, embeddings)
}

In [None]:
# write the n-gram embeddings to a dated JSON file

file_path = f"embeddings_{datetime.now().date()}.json"
save_json(data=ngram_embeddings, file_path=file_path)

# Get embeddings for the full question titles

In [None]:
# separate out question ids and raw titles, both in `questions` order

question_ids = list(questions)
question_titles = [record["title"] for record in questions.values()]

In [None]:
# function to normalise the question titles

import re

DENY_LIST = [
    # generic terms
    "library",
    "framework",
    "libraries",
    "code",
    "image",
    "images",
    "file",
    "files",
    "online",
    "audio",
    "graphics",
    "video",
    "interactive",
    "level",
    "similar",
    "equivalent",
    "markup",
    # programming languages
    "python",
    "javascript",
    "java",
    "c++",
    "c#",
    "ruby",
    "php",
    "c/c++",
    "js",
    "go",
    "rust",
    "sql",
    "typescript",
    "kotlin",
    "swift",
    "bash",
    "shell",
    # file types
    "pdf",
    "html",
    "css",
    "markdown",
    "json",
    # technologies and platforms
    "android",
    ".net",
    "windows",
    "database",
    "key",
    "flask",
    "spring",
    "laravel",
    "django",
    "react",
    "angular",
    "vue",
    "node.js",
]


def _term_pattern(term: str) -> str:
    """Build the regex for one deny-list term so it cannot match inside a word.

    A boundary is only demanded on a side where the term itself has an
    alphanumeric character, so punctuated terms such as ".net" or "c++" still
    match when glued to neighbouring text (e.g. "asp.net", "c++11").
    """
    prefix = r"(?<!\w)" if term[0].isalnum() else ""
    suffix = r"(?!\w)" if term[-1].isalnum() else ""
    return prefix + re.escape(term) + suffix


# One alternation over all terms, longest first, so "c/c++" wins over "c++",
# "javascript" over "java" and "files" over "file". Applying it in a single
# pass also means an inserted "<TECH>" is never re-scanned.
_DENY_PATTERN = re.compile(
    "|".join(
        _term_pattern(term)
        for term in sorted(DENY_LIST, key=len, reverse=True)
        if term
    )
)


def process_text(text: str) -> str:
    """Normalise a question title and mask generic / technology terms.

    The title is lower-cased and stripped, "opensource" and "light-weight" are
    rewritten to "open source" and "lightweight", and every whole-word
    occurrence of a ``DENY_LIST`` term is replaced by the placeholder
    ``"<TECH>"``.

    Terms are matched as whole words rather than raw substrings, so "go" no
    longer rewrites "google" or "algorithm", "js" no longer rewrites "json",
    and "key" no longer rewrites "keyboard".

    NOTE: this redefines the ``process_text`` declared earlier in the notebook
    for the n-gram section; after this cell runs, that name refers to this
    masking version.
    """
    text = text.lower().strip()
    for _replace, _with in [
        ("opensource", "open source"),
        ("light-weight", "lightweight"),
    ]:
        text = text.replace(_replace, _with)

    return _DENY_PATTERN.sub("<TECH>", text)


# replace each raw title with its normalised, masked form
question_titles = list(map(process_text, question_titles))

In [None]:
# get embeddings for the titles and normalise them

from sentence_transformers import SentenceTransformer
import numpy as np

# sentence-embedding model used to encode the processed titles
model = SentenceTransformer("all-MiniLM-L6-v2")

title_embeddings = model.encode(sentences=question_titles, convert_to_numpy=True, show_progress_bar=True)

# divide every embedding row by its own norm
_title_norms = np.linalg.norm(title_embeddings, axis=1, keepdims=True)
title_embeddings = title_embeddings / _title_norms

In [None]:
# map each question id to its title embedding, converted to a plain list

title_id_embeddings = {
    question_id: vector.tolist()
    for question_id, vector in zip(question_ids, title_embeddings)
}

In [None]:
# write the question title embeddings to a dated JSON file

file_path = f"question_embeddings_{datetime.now().date()}.json"
save_json(data=title_id_embeddings, file_path=file_path)