In [9]:
import pandas as pd

# Load the preprocessed posts; the categorisation step further down reads the
# `title` and `accepted_answer_body` columns from this frame.
df = pd.read_csv("preprocessed_cleaned_final.csv")

#function to categorise posts based on specific keywords
def categorize_post(title, answer):
    """Label a post with the first category whose keywords occur in its text.

    The title and answer are each converted with ``str()``, joined by a single
    space and lower-cased. Every keyword is tested as a plain substring of
    that text (so "bert" also matches inside "roberta"). Categories are tried
    in a fixed priority order and the first one with any hit is returned.

    Args:
        title: Post title. Non-string values are matched by their ``str()``
            form (a missing value therefore contributes "nan" or "none").
        answer: Accepted-answer text, handled the same way as ``title``.

    Returns:
        str: "Task-Specific Issues", "Implementation Issues",
        "Conceptual Understanding", "Library-Specific Issues",
        "Language Detection", or "Uncategorized" when nothing matches.
    """
    # Ordered (label, keywords) rules: an earlier rule shadows every later one.
    rules = (
        (
            "Task-Specific Issues",
            (
                "tokenize", "tokenization", "lemmatize", "lemmatization",
                "stemming", "similarity", "classification", "summarization",
                "clustering", "parsing", "translation", "entity recognition",
                "embedding",
            ),
        ),
        (
            "Implementation Issues",
            (
                "how to", "how do i", "how can", "how should", "how would",
                "how could", "how might", "how is it possible",
            ),
        ),
        (
            "Conceptual Understanding",
            (
                "what is", "what are", "explain", "meaning of",
                "difference between", "compare", "definition of",
            ),
        ),
        (
            "Library-Specific Issues",
            (
                "spacy", "nltk", "transformers", "gensim", "word2vec",
                "fasttext", "bert", "gpt", "t5", "huggingface", "tensorflow",
                "pytorch", "sklearn", "keras",
            ),
        ),
        (
            "Language Detection",
            (
                "language detection", "detect language", "identify language",
                "language identification",
            ),
        ),
    )

    haystack = " ".join((str(title), str(answer))).lower()

    for label, needles in rules:
        for needle in needles:
            if needle in haystack:
                return label

    # No rule fired.
    return "Uncategorized"

# Apply the categorisation to each post using its title and accepted answer.
# The two columns are iterated directly instead of using a row-wise
# `df.apply(..., axis=1)`: this avoids building a Series for every row and
# still works on a frame with no rows, where a row-wise apply can return an
# empty DataFrame that cannot be assigned to a single column.
df["category"] = [
    categorize_post(title, answer)
    for title, answer in zip(df["title"], df["accepted_answer_body"])
]

# Save the labelled posts; index=False keeps the row index out of the file.
df.to_csv("categorized_posts_updated.csv", index=False)

# Number of posts per category (value_counts sorts most frequent first).
category_counts = df["category"].value_counts()

print("Categorization complete. Saved to 'categorized_posts_updated.csv'")
print("\n Category Distribution:\n", category_counts)


Categorization complete. Saved to 'categorized_posts_updated.csv'

 Category Distribution:
 category
Task-Specific Issues        7901
Uncategorized               5311
Library-Specific Issues     4323
Implementation Issues       3791
Conceptual Understanding     914
Language Detection            27
Name: count, dtype: int64


In [10]:
# NOTE(review): this reloads "categorized_posts.csv", not the
# "categorized_posts_updated.csv" written by the categorisation cell, and it
# replaces the in-memory `df`. The preview of this file shows an "Unnamed: 0"
# column, which suggests it was saved with its index by different code —
# confirm this is the intended file.
df = pd.read_csv("categorized_posts.csv")

In [11]:
# Preview the first rows of the frame loaded from the CSV.
df.head()

Unnamed: 0,title,body,tags,accepted_answer_body,category
0,NameError: name 'init_empty_weights' is not de...,I am trying to set up hugging face locally and...,"nlp, huggingface-transformers, huggingface","Try using this version, it should resolve the ...",Library-Specific Issues
1,Why does Presidio with spacy nlp engine not re...,I'm using spaCy with the pl_core_news_lg model...,"python, nlp, spacy, presidio",The configuration file is missing the 'labels_...,Library-Specific Issues
2,GPT-2 and other models from huggingface -100 l...,I understand the -100 label id is used so that...,"nlp, huggingface-transformers, pre-trained-model",The author of the tutorial you mentioned sets ...,Task-Specific Issues
3,Trouble getting importing gensim to work in colab,I am trying to import gensim into colab.\n!pip...,"numpy, nlp, dependencies, google-colaboratory,...",You have to restart the session for the underl...,Library-Specific Issues
4,Store images instead of showing in a server,I am running the code found on this site in my...,"python, nlp, large-language-model",I can't test it but ...\nI checked source code...,Uncategorized
