## Load Libraries

In [52]:
import requests
import pandas as pd
import time
from datetime import datetime
import os
from dotenv import load_dotenv
load_dotenv()

True

### In order to featch 20k+ posts, we decide to use the API and expand tags to all NLP related tags

In [53]:
API_KEY = os.getenv("API_KEY")

In [None]:
# We collected the data from 65 unique tags [1,2]
tags_to_fetch = [
    # Core NLP tags
    "nlp",
    "natural-language-processing",
    "computational-linguistics",

    # Classification & analysis
    "text-classification",
    "document-classification",
    "sentiment-analysis",
    "emotion-detection",
    "text-mining",
    "text-analytics",

    # Tokenization & text preprocessing
    "tokenization",
    "named-entity-recognition",
    "ner",
    "pos-tagging",
    "part-of-speech",
    "lemmatization",
    "stemming",
    "stopwords",
    "text-preprocessing",
    "text-normalization",

    # Libraries & frameworks
    "spacy",
    "nltk",
    "gensim",
    "huggingface",
    "transformers",
    "corenlp",
    "stanza",
    "flair",
    "allennlp",

    # Models & embeddings
    "bert",
    "roberta",
    "gpt",
    "language-model",
    "word2vec",
    "glove",
    "fasttext",
    "word-embeddings",
    "sentence-embeddings",
    "contextual-embeddings",

    # Information retrieval & similarity
    "tf-idf",
    "information-retrieval",
    "semantic-search",
    "document-similarity",
    "text-similarity",
    "cosine-similarity",

    # NLP tasks
    "text-summarization",
    "topic-modeling",
    "question-answering",
    "machine-translation",
    "language-detection",
    "dependency-parsing",
    "coreference-resolution",
    "entity-linking",
    "word-sense-disambiguation",

    # Modern NLP concepts
    "llm",
    "large-language-models",
    "fine-tuning",
    "prompt-engineering",
    "zero-shot-learning",
    "few-shot-learning",
    "transfer-learning",

    # Applications
    "chatbot",
    "dialogue-systems",
    "text-generation",
    "speech-recognition",
    "speech-to-text",
    "text-to-speech",
    "opencv-text",
    "elasticsearch",
    "lucene",
    "langchain",
    "rnn",
    "lstm",
    "gru",
    "bert",
    "gpt",
    "gpt-2",
    "gpt-3",
    "spacy-transformers",
    "pytorch-nlp",
    "tensorflow-text",
    "keras-nlp",
    "rasa",
    "chatgpt-api",
    "llama",
    "openai-whisper",
    "word-count",
    "string-matching",
    "document-understanding",
    "ngram",
    "keyword-extraction",
    "entity-extraction",
    "fuzzy-matching"
]

* In order to speed up data collection, we decided to devide the work within our group.
* Each group member querying for different timeline and then we compiled everything into 1 dataset

In [55]:
# Convert dates to UNIX timestamps
def to_unix_timestamp(date_str):
    dt = datetime.strptime(date_str, "%Y-%m-%d")
    return int(dt.timestamp())

# Convert Unix timestamp to readable format
def from_unix_timestamp(timestamp):
    return datetime.fromtimestamp(timestamp).strftime("%Y-%m-%d %H:%M:%S")

### We decided to process only the posts which have atlease 1 accepted answer

In [56]:
def fetch_posts(tag, page, start_date, end_date, request_count):
    try:
        base_url = "https://api.stackexchange.com/2.3/questions"
        params = {
            "site": "stackoverflow",
            "tagged": tag,
            "filter": "withbody",
            "pagesize": 100,
            "fromdate": start_date,
            "todate": end_date,
            "sort": "creation",
            "order": "asc",
            "key": API_KEY,
            "page": page
        }
        response = requests.get(base_url, params=params)
        request_count += 1

        if response.status_code == 200:
            data = response.json()
            return data.get("items", []), data.get("has_more", False), request_count

        if response.status_code == 429:  # Too Many Requests
            backoff_time = int(response.headers.get('Backoff', 30))
            print(f"Rate limited. Backing off for {backoff_time} seconds")
            time.sleep(backoff_time)
            # Retry the request
            return fetch_posts(tag, page, start_date, end_date, request_count) 

    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return None, False, request_count
    
def fetch_answer(answer_id, request_count):
    try:
        answer_url = f"https://api.stackexchange.com/2.3/answers/{answer_id}"
        answer_params = {"site": "stackoverflow", "filter": "withbody", "key": API_KEY}
        response = requests.get(answer_url, params=answer_params)
        request_count += 1

        if response.status_code == 200:
            data = response.json()
            return response.json()["items"][0]["body"], request_count
        
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return None, request_count

In [57]:
# Sample usage
question_1 =fetch_posts("nlp", 1, to_unix_timestamp("2023-01-01"), to_unix_timestamp("2023-10-01"), 0)
answer_1 = fetch_answer(75096054, 0)

In [58]:
print(f"""
Question Title: {question_1[0][0]['title']}
Question Body: {question_1[0][0]['body']}
Answer: {answer_1[0]}
""")


Question Title: snscrape error - twitter scrape crashes after a long time giving &#39;215&#39; error
Question Body: <p>I got the following error:</p>
<blockquote>
<p><a href="https://api.twitter.com/2/search/adaptive.json?include_profile_interstitial_type=1&amp;include_blocking=1&amp;include_blocked_by=1&amp;include_followed_by=1&amp;include_want_retweets=1&amp;include_mute_edge=1&amp;include_can_dm=1&amp;include_can_media_tag=1&amp;skip_status=1&amp;cards_platform=Web-12&amp;include_cards=1&amp;include_ext_alt_text=true&amp;include_quote_count=true&amp;include_reply_count=1&amp;tweet_mode=extended&amp;include_entities=true&amp;include_user_entities=true&amp;include_ext_media_color=true&amp;include_ext_media_availability=true&amp;send_error_codes=true&amp;simple_quoted_tweets=true&amp;q=from%3Ainfobae+since%3A2018-01-01+until%3A2018-12-31&amp;tweet_search_mode=live&amp;count=100&amp;query_source=spelling_expansion_revert_click&amp;cursor=scroll%3AthGAVUV0VFVBaCgLWNpcPlwx0WgICrnczcn_sd

In [59]:
# Create output directory if it doesn't exist
OUTPUT_DIR = "nlp_stackoverflow_data"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Function to save checkpoint
def save_checkpoint(posts, tag, checkpoint_number):
    filename = f"{OUTPUT_DIR}/nlp_stackoverflow_{tag}_{checkpoint_number}.csv"
    if posts:
        df = pd.DataFrame(posts)
        df.to_csv(filename, index=False)
        print(f"Checkpoint saved: {len(posts)} posts written to {filename}")
    else:
        print(f"No posts to save for tag {tag}")

In [60]:
def process_tag(tag, start_date, end_date):
    request_count = 0
    page = 1
    all_posts = []
    has_more = True
    max_pages = 100 # We Limit to 100 pages to avoid excessive API calls
    checkpoint_interval = 5 # Checkpoint every 5 pages

    while page <= max_pages and request_count < 10000:
        posts, has_more, request_count = fetch_posts(tag, page, start_date, end_date, request_count)
        if posts is not None:
            for post in posts:
                accepted_answer_id = post.get("accepted_answer_id")
                if not accepted_answer_id:
                    continue

                accepted_answer, request_count = fetch_answer(accepted_answer_id, request_count)
                if accepted_answer and request_count < 10000:
                    # Convert Unix timestamp to readable date format
                    creation_date_readable = from_unix_timestamp(post["creation_date"])

                    # Store original tag the post was found with
                    post_tags = post.get("tags", [])

                    all_posts.append({
                        "title": post["title"],
                        "description": post["body"],
                        "tags": ";".join(post_tags),
                        "source_tag": tag,  # Tag used to fetch this post
                        "question_id": post["question_id"],
                        "view_count": post.get("view_count", 0),
                        "creation_date": creation_date_readable,
                        "creation_timestamp": post["creation_date"],
                        "accepted_answer": accepted_answer
                    })

            # Checkpoint every few pages
            if page % checkpoint_interval == 0:
                save_checkpoint(all_posts, tag, page)

            # Sleep to respect rate limits
            time.sleep(2)  # Conservative delay between pages
            page += 1

            if not has_more:
                print(f"No more pages available for tag {tag}.")
                break

    # Final save for this tag
    save_checkpoint(all_posts, tag, "final")
    print(f"Completed processing tag {tag}! Downloaded {len(all_posts)} posts with accepted answers.")
    return all_posts

In [63]:
def main():
    start_date = to_unix_timestamp("2023-01-01")
    end_date = to_unix_timestamp("2023-02-01")
    for tag in tags_to_fetch:
        print(f"******************************************************")
        print(f"Processing tag: {tag}")
        all_posts = process_tag(tag, start_date, end_date)
        print(f"Total posts fetched for tag {tag}: {len(all_posts)}")
        print(f"******************************************************\n")

In [64]:
if __name__ == "__main__":
    main()

******************************************************
Processing tag: nlp
No more pages available for tag nlp.
Checkpoint saved: 44 posts written to nlp_stackoverflow_data/nlp_stackoverflow_nlp_final.csv
Completed processing tag nlp! Downloaded 44 posts with accepted answers.
Total posts fetched for tag nlp: 44
******************************************************

******************************************************
Processing tag: natural-language-processing
No more pages available for tag natural-language-processing.
Checkpoint saved: 44 posts written to nlp_stackoverflow_data/nlp_stackoverflow_natural-language-processing_final.csv
Completed processing tag natural-language-processing! Downloaded 44 posts with accepted answers.
Total posts fetched for tag natural-language-processing: 44
******************************************************

******************************************************
Processing tag: computational-linguistics
No more pages available for tag computation

In [67]:
# Lets save all the data into 1 csv file and remove duplicates and finally save as parquet

import pandas as pd
import os

# Directory containing the CSV files
directory = "nlp_stackoverflow_data"

# Lets read all the csv files and combine them into a single dataframe
all_data = pd.DataFrame()
for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        file_path = os.path.join(directory, filename)
        df = pd.read_csv(file_path)
        all_data = pd.concat([all_data, df], ignore_index=True)

# Remove duplicates based on 'question_id' and 'accepted_answer'
all_data.drop_duplicates(subset=['question_id', 'accepted_answer'], inplace=True)

# Reset index after dropping duplicates
all_data.reset_index(drop=True, inplace=True)

# Save the combined dataframe as a Parquet file
all_data.to_parquet("nlp_stackoverflow_all_combined.parquet", index=False)

In [None]:
# Like this we combined every team member's data into a single file and removed duplicates
# We ran it for multiple days to overcome the API limits and then combined all the data into a single file
# Now we can use this data for our analysis and model training

Unnamed: 0,title,description,tags,source_tag,question_id,view_count,creation_date,creation_timestamp,accepted_answer
0,Do I need to retrain Bert for NER to create ne...,<p>I am very new to natural language processin...,nlp;bert-language-model;fine-tuning,bert,74978191,650,2023-01-02 10:52:59,1672618979,"<p>Yes, you would have to use a model trained ..."
1,Do BERT word embeddings change depending on co...,"<p>Before answering &quot;yes, of course&quot;...",nlp;huggingface-transformers;bert-language-mod...,bert,74996994,1121,2023-01-04 04:28:21,1672768701,<p>This is a great question (I had the same qu...
2,Why is positional encoding needed while input ...,"<p>For example, in Huggingface's example:</p>\...",huggingface-transformers;bert-language-model,bert,75050748,536,2023-01-09 06:12:12,1673206932,<p>The reason is the design of the neural arch...
3,Having trouble understanding the predictions a...,<p>I'm working on a sarcasm detector with the ...,deep-learning;bert-language-model;text-classif...,bert,75061462,63,2023-01-10 04:51:10,1673288470,<p>There are two values because you have two c...
4,Trouble in installing BERTopic&#39;s dependenc...,<p>I'm trying to run the following code from t...,python;bert-language-model;topic-modeling,bert,75083854,3026,2023-01-12 00:01:47,1673443907,<p>I would advise starting from a completely f...
...,...,...,...,...,...,...,...,...,...
218,How to change the first conv layer in the resn...,"<p>I have a data with 20 class, and I'd like t...",pytorch;transfer-learning,transfer-learning,75049663,984,2023-01-09 03:23:30,1673196810,<p>You can access to the layer <code>(conv2) i...
219,Python ResNet50: model.save() NotImplementedError,<p>My goal is to save (and then load) a resent...,python;resnet;transfer-learning,transfer-learning,75132030,336,2023-01-16 19:28:59,1673859539,<p>The problem is with this line</p>\n<pre><co...
220,ValueError: Exception encountered when calling...,<p>I get this error when I try to train my mod...,tensorflow;keras;transfer-learning,transfer-learning,75205028,1159,2023-01-23 11:50:04,1674436804,<p>Because if you want to make transfer learni...
221,&#39;Word2Vec&#39; object has no attribute &#3...,<p>This is the version of gensim I am using:</...,python;vectorization;word2vec;word-embedding,word2vec,75023586,562,2023-01-06 06:12:32,1672947752,<p><code>.infer_vector()</code> is only availa...


In [None]:
# finally we were able to collect 27k+ posts with accepted answer
df = pd.read_parquet("nlp_stackoverflow_all_combined.parquet")
print(f"Total posts with accepted answers: {len(df)}")

Total posts with accepted answers: 27783


In [72]:
display(df[["title","description","tags","accepted_answer"]]) 

Unnamed: 0,title,description,tags,accepted_answer
0,Methods for Geotagging or Geolabelling Text Co...,<p>What are some good algorithms for automatic...,algorithm;statistics;nlp;named-entity-recognition,"<p>You're looking for a <a href=""https://secur..."
1,Named Entity Recognition Libraries for Java,"<p>I am looking for a simple but ""good enough""...",java;nlp;named-entity-recognition,"<p>BTW, I recently ran across <a href=""http://..."
2,Algorithms recognizing physical address on a w...,<p>What are the best algorithms for recognizin...,algorithm;screen-scraping;nlp;pattern-matching...,<p>A named-entity extraction framework such as...
3,Strategies for recognizing proper nouns in NLP,<p>I'm interested in learning more about <a hr...,nlp;named-entity-recognition;part-of-speech,<p>The task of determining the proper part of ...
4,Algorithms for named entity recognition,<p>I would like to use named entity recognitio...,php;python;extract;analysis;named-entity-recog...,"<p>To start with check out <a href=""http://www..."
...,...,...,...,...
27778,Can&#39;t get UUID from similarity search Weav...,<p>I tried to retrieve documents with similar ...,python;langchain;vector-database;weaviate,<p>Duda Nogueira from Weaviate here!</p>\n<p>C...
27779,When using &#39;interrupt&#39; followed by &#3...,<p>When I invoke a graph that includes <code>i...,javascript;langchain;langgraph,"<p>I'm an engineer on the LangChain team, and ..."
27780,Precision used in ChromaDB Index,<p>I am using BAAI/bge-large-en-v1.5 model to ...,langchain;embedding;dtype;chromadb;vector-data...,<p>like this:</p>\n<pre><code>import chromadb\...
27781,Presidio with Langchain Experimental does not ...,<p>I am using presidio/langchain_experimental ...,python;nlp;spacy;langchain;presidio,<p>After some test I was able to find the solu...


## References

1. Stack Apps, 2022. Fetch all questions of a particular tag from the Stack Exchange API in
Python. [online] Available at: https://stackapps.com/questions/9436/fetch-all-questions-of-aparticular-tag-from-the-stack-exchange-api-in-python [Accessed 20 Apr. 2025].
2. Tewani, Y., 2023. GET request with StackOverflow API, Postman, and Python. [online]
Medium. Available at: https://medium.com/@yash.tewani.nyc/get-request-withstackoverflow-api-using-a-python-script-and-postman-f6d34b3f6f57 [Accessed 20 Apr.
2025].