**Key features implemented:**  

1. Phrase Extraction:  

    a. Uses POS pattern grammar to detect noun/verb phrases  

    b. Combines related words (e.g. "personal attack" → single phrase)  

    c. Handles both single words and multi-word expressions  

2. Semantic Analysis:  

    a. Compares both full phrases and individual words  

    b. Uses Wu-Palmer similarity for meaning comparison  

    c. Considers different parts of speech for accurate matching  

3. Context Handling:  

    a. Prioritizes phrase-level analysis before word-level  

    b. Supports optional POS filtering in synset selection (`get_best_synset` accepts a `pos` argument)  

    c. Implements similarity threshold (0.6) to reduce false positives  


In [None]:
import pandas as pd
import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk import pos_tag, RegexpParser
import json

In [None]:
# Initialize NLP resources
# WordNet backs the synset lookups; the Punkt models back word_tokenize and
# the perceptron tagger backs pos_tag.
# NOTE: NLTK releases that ship the '*_eng' tagger load the tokenizer from
# 'punkt_tab' rather than the legacy 'punkt' pickle, so both are requested.
# An id unknown to an older NLTK is expected to be reported by the downloader
# rather than raised -- confirm on the installed version.
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

In [None]:
# Configuration
# Maps each output category to the WordNet synset names that act as its
# semantic anchors. Key order matters: ties between categories are resolved
# in favour of the one listed first.
category_anchors = {
    "Insults & Personal Attacks": [
        'insult.n.01',
        'stupidity.n.01',
        'fool.n.01',
        'contempt.n.01',
        'mock.v.01',
    ],
    "Hate Speech & Discrimination": [
        'prejudice.n.01',
        'bigotry.n.01',
        'intolerance.n.01',
        'discrimination.n.01',
        'racism.n.01',
    ],
    "Threatening or Violent Language": [
        'threat.n.01',
        'violence.n.01',
        'kill.v.01',
        'harm.v.01',
        'attack.v.01',
    ],
}

In [None]:
# Phrase detection grammar
# Chunk rules over POS tags, consumed by RegexpParser:
#   NP - zero or more adjectives (JJ) followed by one or more noun tags (NN*)
#   VP - a verb tag (VB*) optionally followed by a single adverb tag (RB*)
# The resulting module-level `chunker` is used by extract_phrases().
grammar = r"""
    NP: {<JJ>*<NN.*>+}  # Noun phrases
    VP: {<VB.*><RB.*>?} # Verb phrases
"""
chunker = RegexpParser(grammar)

def extract_phrases(text):
    """Return the noun/verb phrase chunks found in *text*.

    The text is tokenized, POS-tagged and run through the module-level
    ``chunker``; every NP or VP chunk is returned as a space-joined string.
    When no chunk is found, the original text is returned as the only item.
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    parsed = chunker.parse(tagged_tokens)

    found = [
        ' '.join(token for token, _ in node.leaves())
        for node in parsed.subtrees()
        if node.label() in ('NP', 'VP')
    ]
    if found:
        return found
    # Nothing chunked: fall back to treating the whole input as one phrase.
    return [text]

def get_best_synset(word, pos=None):
    """Return the first WordNet synset for *word*, or None if there is none.

    When *pos* is given (truthy), only synsets whose ``pos()`` equals it are
    considered.
    """
    matching = (
        candidate
        for candidate in wordnet.synsets(word)
        if not pos or candidate.pos() == pos
    )
    # First match in WordNet's own ordering, None when nothing qualifies.
    return next(matching, None)

def semantic_similarity(phrase, category):
    """Return the highest Wu-Palmer similarity between a phrase and a category.

    The phrase is looked up in WordNet both as a whole (to catch multi-word
    expressions) and word by word. Each candidate synset is compared against
    every anchor synset of ``category`` and the best score is returned.

    Args:
        phrase: Text of the phrase to score.
        category: Key of ``category_anchors`` naming the category.

    Returns:
        The maximum similarity found, or 0 when nothing could be compared.

    Raises:
        KeyError: If ``category`` is not a key of ``category_anchors``.
    """
    # Resolve the anchors once rather than once per candidate word.
    anchor_synsets = [
        wordnet.synset(anchor) for anchor in category_anchors[category]
    ]

    candidates = []

    # Try the full phrase first. WordNet stores multi-word lemmas joined by
    # underscores (e.g. "personal_attack"), so a space-separated phrase would
    # never match as written.
    phrase_synsets = wordnet.synsets('_'.join(phrase.split()))
    if phrase_synsets:
        candidates.append(phrase_synsets[0])

    # Then each individual word, skipping those WordNet does not know.
    for word in word_tokenize(phrase):
        word_synset = get_best_synset(word)
        if word_synset:
            candidates.append(word_synset)

    max_score = 0
    for candidate in candidates:
        for anchor_synset in anchor_synsets:
            # wup_similarity can return None; count that as no similarity.
            score = candidate.wup_similarity(anchor_synset)
            max_score = max(max_score, score or 0)

    return max_score

def categorize_text(text, threshold=0.6):
    """Assign *text* to the best-matching category.

    The text is split into phrases with ``extract_phrases``; each phrase is
    lower-cased and scored against every category with
    ``semantic_similarity``. The category with the highest score wins.

    Args:
        text: The text to label. Non-string values are converted with
            ``str``; ``None``, NaN and blank strings are not analysed.
        threshold: Minimum best score required to assign a category.

    Returns:
        A key of ``category_anchors``, or ``"Uncategorized"`` when the input
        is missing/blank or no category reaches ``threshold``.
    """
    # Missing values (None / float NaN) cannot be tokenized.
    if text is None or (isinstance(text, float) and text != text):
        return "Uncategorized"
    if not isinstance(text, str):
        text = str(text)
    # Blank text has nothing to chunk or look up.
    if not text.strip():
        return "Uncategorized"

    scores = {cat: 0 for cat in category_anchors}
    for phrase in extract_phrases(text):
        lowered = phrase.lower()
        for cat in category_anchors:
            scores[cat] = max(scores[cat], semantic_similarity(lowered, cat))

    # max() keeps the first category on ties, i.e. category_anchors order.
    best = max(scores, key=scores.get)
    if scores[best] < threshold:
        return "Uncategorized"
    return best

In [None]:
# Load and process data
# The input CSV has no header row; its single column is named "Text".
df = pd.read_csv(
    "Profanities.csv",
    header=None,
    names=["Text"],
    encoding="latin1",
)

# Label every row, then write the result without the index column.
df["Category"] = df["Text"].apply(categorize_text)
df.to_csv("semantically_labeled_profanities.csv", index=False)

print("✅ Categorization complete! Check 'semantically_labeled_profanities.csv'")