What does this code do?
It uses a hybrid approach to check whether a sentence contains profane words, combining a CSV file with an in-memory dictionary.

**V-1**

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize

# Download NLTK tokenizer
nltk.download('punkt')
nltk.download('punkt_tab')

# Step 1: Load the CSV file once and store words in a dictionary
def load_profanity_words(csv_file):
    """Build a lowercase profanity -> category lookup from a CSV file.

    Args:
        csv_file: Path to a latin-1 encoded CSV with "Text" and "Category"
            columns.

    Returns:
        dict mapping each lowercased "Text" entry to its "Category" value.
        When a text occurs more than once, the last row wins.
    """
    profanity_df = pd.read_csv(csv_file, encoding="latin1")
    # A row with an empty Text cell is parsed as NaN (a float) and used to
    # crash on `.lower()`; such rows carry no word, so skip them.
    profanity_df = profanity_df.dropna(subset=["Text"])
    texts = profanity_df["Text"].astype(str).str.lower()
    # Zip the two columns instead of the much slower DataFrame.iterrows().
    return dict(zip(texts, profanity_df["Category"]))

# Load the profanity dictionary (Executed once)
profanity_dict = load_profanity_words("semantically_labeled_profanities.csv")

# Step 2: Function to check profanity in text
def contains_profanity(text):
    """Look up every token of *text* in the profanity dictionary.

    Returns a dict {token: category} for the tokens found in
    ``profanity_dict``, or the string "Clean" when none is found.
    """
    hits = {}
    for token in word_tokenize(text.lower()):
        if token in profanity_dict:
            hits[token] = profanity_dict[token]

    if not hits:
        return "Clean"
    return hits

# Example usage
text1 = "You are an idiot and a loser!"
text2 = "piece of shit"

print(contains_profanity(text1))  
print(contains_profanity(text2))  


**Why Hybrid (CSV + Dictionary)?**

1. Fast O(1) lookup (thanks to the dictionary)
2. Easy to update (just modify CSV, no code change)
3. Scalable 

To scale further, a database or a tree structure (e.g. a trie) would be a better fit.

Note: The next version refines the normalization step so that it can handle elongated text (e.g. "loooooser").

**V-1.1**

In [None]:
import nltk
import re
import difflib
import pandas as pd
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import wordnet

# Ensure necessary NLTK resources are downloaded
nltk.download('wordnet')
nltk.download('omw-1.4')

# NOTE: this cell was pasted twice; the imports and downloads below repeat the ones above.
import nltk
import re
import difflib
import pandas as pd
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import wordnet

# Ensure necessary NLTK resources are downloaded
nltk.download('wordnet')
nltk.download('omw-1.4')

# Initialize Lemmatizer & Stemmer
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Load English dictionary from CSV file
def load_dictionary(dictionary_path):
    """Load the English word list (first CSV column) as a lowercase set.

    Args:
        dictionary_path: Path to a header-less CSV whose first column holds
            one word per row.

    Returns:
        set of lowercased words.
    """
    # dtype=str + keep_default_na=False: by default pandas parses real words
    # such as "null", "nan" or "NA" as missing values, and astype(str) then
    # turned each of them into the bogus entry "nan".
    df = pd.read_csv(dictionary_path, header=None, dtype=str, keep_default_na=False)
    words = set(df[0].str.lower())
    words.discard("")  # an empty cell carries no word
    return words

# Provide the path to your dictionary.csv file
dictionary_path = "english_dictionary.csv"
english_words = load_dictionary(dictionary_path)

# Step 1: Reduce excessive repetition but keep meaningful words
def reduce_redundant_letters(word):
    """
    Shorten elongated character runs in *word*.

    Runs of three or more identical characters are first capped at two.
    If that result is not in ``english_words``, every remaining repeated
    run is squeezed down to a single character.
    """
    # e.g. "loooove" -> "loove"
    capped = re.sub(r"(.)\1{2,}", r"\1\1", word)
    if capped in english_words:
        return capped

    # e.g. "happyy" -> "hapy" (all doubles collapse, not only the last one)
    return re.sub(r"(.)\1+", r"\1", capped)

# Step 2: Find closest match from dictionary
def find_closest_match(word):
    """Return the most similar dictionary word (ratio >= 0.8), else *word* itself."""
    candidates = difflib.get_close_matches(word, english_words, n=1, cutoff=0.8)
    if not candidates:
        return word
    return candidates[0]

# Step 3: Normalize the word
def normalize_word(word):
    """Normalize *word* to a dictionary form and print the intermediate steps.

    The word is lowercased, elongated letter runs are reduced and
    non-letters are removed. The first of (cleaned word, lemma, stem) found
    in ``english_words`` is returned; if none is, the closest fuzzy match is.
    """
    original_word = word.lower()
    reduced_word = reduce_redundant_letters(original_word)
    cleaned_word = re.sub(r"[^a-zA-Z]", "", reduced_word)

    lemma = lemmatizer.lemmatize(cleaned_word)
    stem = stemmer.stem(lemma)

    # Prefer the least-modified form that is a known dictionary word.
    for candidate in (cleaned_word, lemma, stem):
        if candidate in english_words:
            final_word = candidate
            break
    else:
        final_word = find_closest_match(cleaned_word)  # fuzzy fallback

    print(f"Original: {original_word}, Reduced: {reduced_word}, Cleaned: {cleaned_word}, Lemma: {lemma}, Stem: {stem}, Final: {final_word}")

    return final_word

# Test Cases
print(normalize_word("hiiiiiii"))  
print(normalize_word("beautifully"))  
print(normalize_word("loooooser"))  
print(normalize_word("hhhaappy"))  
print(normalize_word("shiiitttt"))  
print(normalize_word("badddd"))  
print(normalize_word("ssshuuttt"))

lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Load English dictionary from CSV file
def load_dictionary(dictionary_path):
    """Load the English word list (first CSV column) as a lowercase set.

    Args:
        dictionary_path: Path to a header-less CSV whose first column holds
            one word per row.

    Returns:
        set of lowercased words.
    """
    # dtype=str + keep_default_na=False: by default pandas parses real words
    # such as "null", "nan" or "NA" as missing values, and astype(str) then
    # turned each of them into the bogus entry "nan".
    df = pd.read_csv(dictionary_path, header=None, dtype=str, keep_default_na=False)
    words = set(df[0].str.lower())
    words.discard("")  # an empty cell carries no word
    return words

# Provide the path to your dictionary.csv file
dictionary_path = "english_dictionary.csv"
english_words = load_dictionary(dictionary_path)

# Step 1: Reduce excessive repetition but keep meaningful words
def reduce_redundant_letters(word):
    """
    Shorten elongated character runs in *word*.

    Runs of three or more identical characters are first capped at two.
    If that result is not in ``english_words``, every remaining repeated
    run is squeezed down to a single character.
    """
    # e.g. "loooove" -> "loove"
    capped = re.sub(r"(.)\1{2,}", r"\1\1", word)
    if capped in english_words:
        return capped

    # e.g. "happyy" -> "hapy" (all doubles collapse, not only the last one)
    return re.sub(r"(.)\1+", r"\1", capped)

# Step 2: Find closest match from dictionary
def find_closest_match(word):
    """Return the most similar dictionary word (ratio >= 0.8), else *word* itself."""
    candidates = difflib.get_close_matches(word, english_words, n=1, cutoff=0.8)
    if not candidates:
        return word
    return candidates[0]

# Step 3: Normalize the word
def normalize_word(word):
    """Normalize *word* to a dictionary form and print the intermediate steps.

    The word is lowercased, elongated letter runs are reduced and
    non-letters are removed. The first of (cleaned word, lemma, stem) found
    in ``english_words`` is returned; if none is, the closest fuzzy match is.
    """
    original_word = word.lower()
    reduced_word = reduce_redundant_letters(original_word)
    cleaned_word = re.sub(r"[^a-zA-Z]", "", reduced_word)

    lemma = lemmatizer.lemmatize(cleaned_word)
    stem = stemmer.stem(lemma)

    # Prefer the least-modified form that is a known dictionary word.
    for candidate in (cleaned_word, lemma, stem):
        if candidate in english_words:
            final_word = candidate
            break
    else:
        final_word = find_closest_match(cleaned_word)  # fuzzy fallback

    print(f"Original: {original_word}, Reduced: {reduced_word}, Cleaned: {cleaned_word}, Lemma: {lemma}, Stem: {stem}, Final: {final_word}")

    return final_word

# Test Cases
print(normalize_word("hiiiiiii"))     
print(normalize_word("beautifully"))    
print(normalize_word("loooooser"))  
print(normalize_word("hhhaappy"))  
print(normalize_word("shiiitttt"))  
print(normalize_word("badddd"))  
print(normalize_word("ssshuuttt"))  


Final Profanity Check which can handle elongated words. 

Why This Works Well
1. Text Normalization ensures words like "loooooser" → "loser". It cleans, lemmatizes, stems, and corrects misspelled/elongated words. Hence handles excessive letter repetition and typos.  
2. Uses a preloaded dictionary to detect and categorize profane words. Looks up words in the dictionary, falling back on fuzzy matching if needed for better accuracy. 
3. Loads CSVs once to avoid redundant I/O operations.  

**V-2**

In [None]:
import nltk
import re
import difflib
import pandas as pd
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet

# Ensure necessary NLTK resources are downloaded
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('punkt_tab')

# Initialize Lemmatizer & Stemmer
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Load English Dictionary
def load_dictionary(dictionary_path):
    """Load the English word list (first CSV column) as a lowercase set.

    Args:
        dictionary_path: Path to a header-less CSV whose first column holds
            one word per row.

    Returns:
        set of lowercased words.
    """
    # dtype=str + keep_default_na=False: by default pandas parses real words
    # such as "null", "nan" or "NA" as missing values, and astype(str) then
    # turned each of them into the bogus entry "nan".
    df = pd.read_csv(dictionary_path, header=None, dtype=str, keep_default_na=False)
    words = set(df[0].str.lower())
    words.discard("")  # an empty cell carries no word
    return words

# Load Profanity Dictionary
def load_profanity_words(csv_file):
    """Build a lowercase profanity -> category lookup from a CSV file.

    Args:
        csv_file: Path to a latin-1 encoded CSV with "Text" and "Category"
            columns.

    Returns:
        dict mapping each lowercased "Text" entry to its "Category" value.
        When a text occurs more than once, the last row wins.
    """
    profanity_df = pd.read_csv(csv_file, encoding="latin1")
    # A row with an empty Text cell is parsed as NaN (a float) and used to
    # crash on `.lower()`; such rows carry no word, so skip them.
    profanity_df = profanity_df.dropna(subset=["Text"])
    texts = profanity_df["Text"].astype(str).str.lower()
    # Zip the two columns instead of the much slower DataFrame.iterrows().
    return dict(zip(texts, profanity_df["Category"]))

# Load dictionaries (Executed once)
dictionary_path = "english_dictionary.csv"
profanity_path = "semantically_labeled_profanities.csv"
english_words = load_dictionary(dictionary_path)
profanity_dict = load_profanity_words(profanity_path)

# Reduce excessive repetition in words
def reduce_redundant_letters(word):
    """Cap letter runs at two; squeeze them to one if the result is not a dictionary word."""
    capped = re.sub(r"(.)\1{2,}", r"\1\1", word)  # 3+ repeats -> 2
    if capped in english_words:
        return capped
    return re.sub(r"(.)\1+", r"\1", capped)  # any repeat -> 1

# Find closest dictionary match
def find_closest_match(word):
    """Return the most similar dictionary word (ratio >= 0.8), else *word* itself."""
    candidates = difflib.get_close_matches(word, english_words, n=1, cutoff=0.8)
    if not candidates:
        return word
    return candidates[0]

# Normalize the word
def normalize_word(word):
    """Normalize *word* to a dictionary form.

    Lowercases, reduces elongated letter runs and strips non-letters, then
    returns the first of (cleaned word, lemma, stem) that is in
    ``english_words``; falls back to the closest fuzzy match otherwise.
    """
    lowered = word.lower()
    letters_only = re.sub(r"[^a-zA-Z]", "", reduce_redundant_letters(lowered))

    lemma = lemmatizer.lemmatize(letters_only)
    stem = stemmer.stem(lemma)

    if letters_only in english_words:
        return letters_only
    if lemma in english_words:
        return lemma
    if stem in english_words:
        return stem
    return find_closest_match(letters_only)

# Check if text contains profanity and only return flagged words
def contains_profanity(text):
    """Flag profane words in *text* after normalizing each token.

    Returns a dict {normalized word: category}, or the string "Clean" when
    no normalized token is in ``profanity_dict``.
    """
    flagged = {}
    for token in word_tokenize(text.lower()):
        normalized = normalize_word(token)
        if normalized in profanity_dict:
            flagged[normalized] = profanity_dict[normalized]

    if not flagged:
        return "Clean"
    return flagged

# Example Usage
text1 = "You are an idiot and a loooooser!"
text2 = "piece of shiiitttt"

print(contains_profanity(text1))  
print(contains_profanity(text2))  

Next step: improve the model so that it also handles symbol-based obfuscations (e.g. "f@ck", "sh1t").

**V-3**

In [None]:
pip install rapidfuzz

In [None]:
import nltk
import re
import difflib
import pandas as pd
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from rapidfuzz import process

# Ensure necessary NLTK resources are downloaded
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('punkt_tab')

# Initialize Lemmatizer & Stemmer
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Load English Dictionary
def load_dictionary(dictionary_path):
    """Load the English word list (first CSV column) as a lowercase set.

    Args:
        dictionary_path: Path to a header-less CSV whose first column holds
            one word per row.

    Returns:
        set of lowercased words.
    """
    # dtype=str + keep_default_na=False: by default pandas parses real words
    # such as "null", "nan" or "NA" as missing values, and astype(str) then
    # turned each of them into the bogus entry "nan".
    df = pd.read_csv(dictionary_path, header=None, dtype=str, keep_default_na=False)
    words = set(df[0].str.lower())
    words.discard("")  # an empty cell carries no word
    return words

# Load Profanity Dictionary
def load_profanity_words(csv_file):
    """Build a lowercase profanity -> category lookup from a CSV file.

    Args:
        csv_file: Path to a UTF-8 encoded CSV with "Text" and "Category"
            columns.

    Returns:
        dict mapping each cleaned, lowercased "Text" entry to its "Category"
        value. When a text occurs more than once, the last row wins.
    """
    profanity_df = pd.read_csv(csv_file, encoding="utf-8")
    # A row with an empty Text cell is parsed as NaN (a float) and used to
    # crash on `.lower()`; such rows carry no word, so skip them.
    profanity_df = profanity_df.dropna(subset=["Text"])
    # NOTE(review): \x91/\x92/\x96/\x97 look like Windows-1252 quote/dash
    # bytes; confirm they can actually occur in a file decoded as UTF-8.
    texts = (
        profanity_df["Text"]
        .astype(str)
        .str.replace(r"[\x92\x91\x96\x97]", "'", regex=True)
        .str.lower()
    )
    # Zip the two columns instead of the much slower DataFrame.iterrows().
    return dict(zip(texts, profanity_df["Category"]))

# Load dictionaries (Executed once)
dictionary_path = "english_dictionary.csv"
profanity_path = "semantically_labeled_profanities.csv"
english_words = load_dictionary(dictionary_path)
profanity_dict = load_profanity_words(profanity_path)

# Symbol based obfuscations
def deobfuscate_word(word):
    """Lowercase *word* and map common leetspeak digits/symbols to letters."""
    # One-for-one character substitutions; none of the replacement letters is
    # itself a key, so a single translate pass equals the chained replaces.
    leet_table = str.maketrans({
        '@': 'a', '4': 'a',
        '$': 's', '5': 's',
        '1': 'i', '!': 'i',
        '0': 'o',
        '3': 'e',
        '7': 't',
        '8': 'b',
        '9': 'g',
    })
    return word.lower().translate(leet_table)


# Fuzzy matching using rapidfuzz 
def find_closest_profanity(word):
    """Fuzzy-match *word* against the profanity keys.

    Returns the best-scoring key when rapidfuzz reports a match at the
    score cutoff of 85, otherwise None.
    """
    best = process.extractOne(word, profanity_dict.keys(), score_cutoff=85)
    if not best:
        return None
    return best[0]  # first element of the result tuple is the matched key

# Reduce excessive repetition in words
def reduce_redundant_letters(word):
    """Cap letter runs at two; squeeze them to one if the result is not a dictionary word."""
    capped = re.sub(r"(.)\1{2,}", r"\1\1", word)  # 3+ repeats -> 2
    if capped in english_words:
        return capped
    return re.sub(r"(.)\1+", r"\1", capped)  # any repeat -> 1

# Find closest dictionary match
def find_closest_match(word):
    """Return the most similar dictionary word (ratio >= 0.8), else *word* itself."""
    candidates = difflib.get_close_matches(word, english_words, n=1, cutoff=0.8)
    if not candidates:
        return word
    return candidates[0]

# Normalize the word
def normalize_word(word):
    """Normalize *word* for the profanity lookup.

    Words of at most two characters are returned untouched. Otherwise the
    word is lowercased, de-obfuscated, de-elongated and stripped of
    non-letters. A result that is a known English word or a known profanity
    is returned as is; failing that, its lemma or stem is returned when one
    of them is a known profanity.
    """
    # Very short words cause false positives, so leave them alone.
    if len(word) <= 2:
        return word

    candidate = re.sub(
        r"[^a-zA-Z]",
        "",
        reduce_redundant_letters(deobfuscate_word(word.lower())),
    )

    if candidate in english_words or candidate in profanity_dict:
        return candidate

    lemma = lemmatizer.lemmatize(candidate)
    if lemma in profanity_dict:
        return lemma

    stem = stemmer.stem(candidate)
    return stem if stem in profanity_dict else candidate

# Check if text contains profanity and only return flagged words
def contains_profanity(text):
    """Flag profane tokens in *text*, using fuzzy matching as a fallback.

    Returns a dict {original token: category}, or the string "Clean" when
    nothing is flagged.
    """
    flagged = {}

    for token in word_tokenize(text.lower()):
        normalized = normalize_word(token)

        if normalized in profanity_dict:
            # Exact hit: report the token as it appeared in the text.
            flagged[token] = profanity_dict[normalized]
        elif len(normalized) > 4:
            # Fuzzy lookup is restricted to longer words.
            closest = find_closest_profanity(normalized)
            if closest:
                flagged[token] = profanity_dict[closest]

    if not flagged:
        return "Clean"
    return flagged

# Example Usage
text1 = "You are a f@cking id!0t"
text2 = "h3ll yeah! that's $tupid"
text3 = "Sh1t, you're an @ssh0le!"


print(contains_profanity(text1))  
print(contains_profanity(text2))
print(contains_profanity(text3))

output of V-3:  
{'cking': 'Vulgar'}  
{'that': 'Insults & Personal Attacks', 'tupid': 'Insults & Personal Attacks'}  
{'Sh1t': 'Insults & Personal Attacks', 'ssh0le': 'Explicit'}  

**V-4**

In [None]:
import nltk
import re
import difflib
import pandas as pd
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from rapidfuzz import process

# Ensure necessary NLTK resources are downloaded
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

#English stop words
STOP_WORDS = set(stopwords.words('english'))

# Initialize Lemmatizer & Stemmer
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Load English Dictionary
def load_dictionary(dictionary_path):
    """Load the English word list (first CSV column) as a lowercase set.

    Args:
        dictionary_path: Path to a header-less CSV whose first column holds
            one word per row.

    Returns:
        set of lowercased words.
    """
    # dtype=str + keep_default_na=False: by default pandas parses real words
    # such as "null", "nan" or "NA" as missing values, and astype(str) then
    # turned each of them into the bogus entry "nan".
    df = pd.read_csv(dictionary_path, header=None, dtype=str, keep_default_na=False)
    words = set(df[0].str.lower())
    words.discard("")  # an empty cell carries no word
    return words

# Load Profanity Dictionary
def load_profanity_words(csv_file):
    """Build a lowercase profanity -> category lookup from a CSV file.

    Args:
        csv_file: Path to a UTF-8 encoded CSV with "Text" and "Category"
            columns.

    Returns:
        dict mapping each cleaned, lowercased "Text" entry to its "Category"
        value. When a text occurs more than once, the last row wins.
    """
    profanity_df = pd.read_csv(csv_file, encoding="utf-8")
    # A row with an empty Text cell is parsed as NaN (a float) and used to
    # crash on `.lower()`; such rows carry no word, so skip them.
    profanity_df = profanity_df.dropna(subset=["Text"])
    # NOTE(review): \x91/\x92/\x96/\x97 look like Windows-1252 quote/dash
    # bytes; confirm they can actually occur in a file decoded as UTF-8.
    texts = (
        profanity_df["Text"]
        .astype(str)
        .str.replace(r"[\x92\x91\x96\x97]", "'", regex=True)
        .str.lower()
    )
    # Zip the two columns instead of the much slower DataFrame.iterrows().
    return dict(zip(texts, profanity_df["Category"]))

# Load dictionaries (Executed once)
dictionary_path = "english_dictionary.csv"
profanity_path = "semantically_labeled_profanities.csv"
english_words = load_dictionary(dictionary_path)
profanity_dict = load_profanity_words(profanity_path)

# Symbol based obfuscations
def deobfuscate_word(word):
    """Undo symbol/leetspeak obfuscation, giving known profanity patterns priority.

    If the whole lowercased word is one of a few profanities with filler
    symbols between its letters (e.g. "f.u.c.k"), that profanity is
    returned. Otherwise leetspeak characters are mapped to letters
    one-for-one and the result is returned.
    """
    lowered = word.lower()

    # Any run of these symbols may sit between two letters of a target word.
    filler = r'[!@#$%^&*.]*'
    for target in ('fuck', 'shit', 'ass', 'bitch', 'cunt', 'dick', 'pussy'):
        if re.fullmatch(filler.join(target), lowered):
            return target

    # General leetspeak: no replacement letter is itself a key, so one
    # translate pass is equivalent to applying the substitutions in turn.
    leet_table = str.maketrans({
        '@': 'a', '4': 'a',
        '$': 's', '5': 's',
        '1': 'i', '!': 'i', '|': 'i',
        '0': 'o', '°': 'o',
        '3': 'e',
        '7': 't',
        '8': 'b',
        '9': 'g',
    })
    return lowered.translate(leet_table)

def custom_tokenize(text):
    """Split *text* into lowercase tokens, keeping obfuscation symbols.

    A token is a maximal run of word characters plus ``@``, ``$`` and ``!``,
    so obfuscated words such as "$tupid" or "id!0t" stay in one piece.
    Apostrophes are separators: "that's" yields "that" and "s".

    Args:
        text: Raw input string.

    Returns:
        list of lowercase token strings (empty for empty input).
    """
    # The previous re.sub(r"(\w+)'(\w+)", r"\1'\2", text) replaced every match
    # with itself, i.e. it did nothing; it was removed without changing the
    # tokens produced.
    return re.findall(r"[@\w$!]+", text.lower())

# Fuzzy matching using rapidfuzz
def find_closest_profanity(word):
    """Fuzzy-match *word* against the profanity keys.

    Returns the best-scoring key when rapidfuzz reports a match at the
    score cutoff of 85, otherwise None.
    """
    best = process.extractOne(word, profanity_dict.keys(), score_cutoff=85)
    if not best:
        return None
    return best[0]  # first element of the result tuple is the matched key

# Reduce excessive repetition in words
def reduce_redundant_letters(word):
    """Reduce elongated letter runs, collapsing fully only for non-dictionary words."""
    # Pass 1: cap runs of 3+ identical characters at two ("shittt" -> "shitt").
    capped = re.sub(r'(.)\1{2,}', r'\1\1', word)
    if capped in english_words:
        return capped

    # Pass 2: the capped form is not a dictionary word, so squeeze every
    # remaining repeat down to a single character.
    return re.sub(r'(.)\1+', r'\1', capped)

# Find closest dictionary match
def find_closest_match(word):
    """Return the most similar dictionary word (ratio >= 0.8), else *word* itself."""
    candidates = difflib.get_close_matches(word, english_words, n=1, cutoff=0.8)
    if not candidates:
        return word
    return candidates[0]

# Normalize the word
def normalize_word(word):
    """Normalize *word* for the profanity lookup, preferring profane readings.

    Args:
        word: A single token.

    Returns:
        *word* unchanged if it is a stop word or at most two characters
        long. Otherwise, in order of preference: the word minus an
        "ing"/"in"/"ed" suffix if that base is a known profanity; the
        de-obfuscated, letters-only word if it is a known profanity; the
        closest profanity key at a fuzzy score cutoff of 90; or the
        letters-only word itself.
    """
    # Stop words and very short words are never treated as profanity.
    if word in STOP_WORDS or len(word) <= 2:
        return word

    # Step 1: convert digits/symbols back to letters.
    deobfuscated = deobfuscate_word(word)

    # Step 2: strip one inflection suffix. This used to be
    # .rstrip('ing').rstrip('in').rstrip('ed'), but str.rstrip removes a *set
    # of characters*, not a suffix ("kidding" became "ki"), which could map
    # unrelated words onto dictionary entries. "ing" is tested before "in".
    for suffix in ('ing', 'in', 'ed'):
        if deobfuscated.endswith(suffix):
            base_form = deobfuscated[:-len(suffix)]
            if base_form in profanity_dict:
                return base_form
            break

    # Step 3: drop everything that is not a lowercase letter.
    cleaned = re.sub(r"[^a-z]", "", deobfuscated)

    # Step 4: exact dictionary hit.
    if cleaned in profanity_dict:
        return cleaned

    # Step 5: close fuzzy match against the profanity keys.
    closest = process.extractOne(cleaned, profanity_dict.keys(), score_cutoff=90)
    if closest:
        return closest[0]

    return cleaned

# Check if text contains profanity and only return flagged words
def contains_profanity(text):
    """Flag profane tokens in *text*.

    Returns a dict {original token: category}, or the string "Clean" when
    nothing is flagged. Stop words are ignored both before and after
    normalization.
    """
    flagged = {}

    for token in custom_tokenize(text):
        if token in STOP_WORDS:
            continue

        normalized = normalize_word(token)

        # Normalization may itself produce a stop word; ignore those too.
        if normalized not in STOP_WORDS and normalized in profanity_dict:
            flagged[token] = profanity_dict[normalized]

    return flagged or "Clean"

# Example Usage
text1 = "You are a f@cking id!0t"
text2 = "h3ll yeah! that's $tupid"
text3 = "Sh1t, you're an @ssh0le!"


print(contains_profanity(text1))
print(contains_profanity(text2))
print(contains_profanity(text3))

output of V-4:  
{'id!0t': 'Insults & Personal Attacks'}  
{'$tupid': 'Insults & Personal Attacks'}  
{'sh1t': 'Insults & Personal Attacks', '@ssh0le!': 'Explicit'}  

**V-5**  

In [None]:
import nltk
import re
import difflib
import pandas as pd
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet, stopwords
from rapidfuzz import process
from collections import defaultdict

# Ensure necessary NLTK resources are downloaded
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('stopwords')

# Initialize NLP tools
STOP_WORDS = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Load dictionaries
def load_dictionary(dictionary_path):
    """Load the English word list (first CSV column) as a lowercase set.

    Args:
        dictionary_path: Path to a header-less CSV whose first column holds
            one word per row.

    Returns:
        set of lowercased words.
    """
    # dtype=str + keep_default_na=False: by default pandas parses real words
    # such as "null", "nan" or "NA" as missing values, and astype(str) then
    # turned each of them into the bogus entry "nan".
    df = pd.read_csv(dictionary_path, header=None, dtype=str, keep_default_na=False)
    words = set(df[0].str.lower())
    words.discard("")  # an empty cell carries no word
    return words

def load_profanity_words(csv_file):
    """Load the profanity CSV into a single-word dict and a phrase table.

    Args:
        csv_file: Path to a latin-1 encoded CSV with "Text" and "Category"
            columns.

    Returns:
        (single_words, phrases) where ``single_words`` maps each lowercased
        entry without a space to its category, and ``phrases`` is a
        defaultdict(list) mapping a word count to the list of
        (phrase, category) tuples with that many words.
    """
    profanity_df = pd.read_csv(csv_file, encoding="latin-1")
    # A row with an empty Text cell is parsed as NaN (a float) and used to
    # crash on `.lower()`; such rows carry no word, so skip them.
    profanity_df = profanity_df.dropna(subset=["Text"])
    texts = (
        profanity_df["Text"]
        .astype(str)
        .str.replace(r"[\x92\x91\x96\x97]", "'", regex=True)
        .str.lower()
    )

    single_words = {}
    phrases = defaultdict(list)

    # Zip the two columns instead of the much slower DataFrame.iterrows().
    for text, category in zip(texts, profanity_df["Category"]):
        if ' ' in text:  # multi-word entry: bucket by number of words
            phrases[len(text.split())].append((text, category))
        else:
            single_words[text] = category

    return single_words, phrases

# Load dictionaries
dictionary_path = "english_dictionary.csv"
profanity_path = "semantically_labeled_profanities.csv"
english_words = load_dictionary(dictionary_path)
profanity_dict, profanity_phrases = load_profanity_words(profanity_path)

def deobfuscate_word(word):
    """Translate leetspeak/symbols to letters, checking profanity patterns first.

    A lowercased word that consists entirely of one of a few profanities
    with optional filler symbols between its letters is returned as that
    profanity. Any other word has its leetspeak characters replaced by the
    letters they stand for.
    """
    lowered = word.lower()

    # Regex per target word: its letters, each pair separated by any number
    # of filler symbols.
    symbols = r'[!@#$%^&*.]*'
    spaced = {
        target: symbols.join(target)
        for target in ('fuck', 'shit', 'ass', 'bitch', 'cunt', 'dick', 'pussy')
    }
    hit = next(
        (target for target, regex in spaced.items() if re.fullmatch(regex, lowered)),
        None,
    )
    if hit is not None:
        return hit

    # General leetspeak: (characters to replace, letter they stand for).
    leet_groups = (
        ('@4', 'a'),
        ('$5', 's'),
        ('1!|', 'i'),
        ('0°', 'o'),
        ('3', 'e'),
        ('7', 't'),
        ('8', 'b'),
        ('9', 'g'),
    )
    for chars, letter in leet_groups:
        for ch in chars:
            lowered = lowered.replace(ch, letter)

    return lowered

def custom_tokenize(text):
    """Split *text* into lowercase tokens, keeping contractions and obfuscations.

    A token is a maximal run of word characters plus ``@``, ``$``, ``!`` and
    the apostrophe, so "that's", "$tupid" and "id!0t" each stay in one piece.

    Args:
        text: Raw input string.

    Returns:
        list of lowercase token strings (empty for empty input).
    """
    # The previous re.sub(r"(\w+)'(\w+)", r"\1'\2", text) replaced every match
    # with itself, i.e. it did nothing; it was removed without changing the
    # tokens produced.
    return re.findall(r"[@\w$!']+", text.lower())

def reduce_redundant_letters(word):
    """Reduce elongated letter runs, collapsing fully only for non-dictionary words."""
    # Pass 1: cap runs of 3+ identical characters at two.
    capped = re.sub(r'(.)\1{2,}', r'\1\1', word)
    if capped in english_words:
        return capped

    # Pass 2: not a dictionary word, so squeeze every remaining repeat.
    return re.sub(r'(.)\1+', r'\1', capped)

def normalize_word(word):
    """Normalize *word* for the profanity lookup, preferring profane readings.

    Args:
        word: A single token.

    Returns:
        *word* unchanged if it is a stop word or at most two characters
        long. Otherwise, in order of preference: the word minus an
        "ing"/"in"/"ed" suffix if that base is a known profanity; the
        de-obfuscated, letters-only word if it is a known profanity; for
        words longer than four letters, the closest profanity key at a fuzzy
        score cutoff of 90; or the letters-only word itself.
    """
    if len(word) <= 2 or word in STOP_WORDS:
        return word

    # Convert digits/symbols back to letters first.
    deobfuscated = deobfuscate_word(word)

    # Strip one inflection suffix. This used to be
    # .rstrip('ing').rstrip('in').rstrip('ed'), but str.rstrip removes a *set
    # of characters*, not a suffix ("kidding" became "ki"), which could map
    # unrelated words onto dictionary entries. "ing" is tested before "in".
    for suffix in ('ing', 'in', 'ed'):
        if deobfuscated.endswith(suffix):
            base_form = deobfuscated[:-len(suffix)]
            if base_form in profanity_dict:
                return base_form
            break

    # Keep lowercase letters only, then try an exact hit.
    cleaned = re.sub(r"[^a-z]", "", deobfuscated)
    if cleaned in profanity_dict:
        return cleaned

    # Fuzzy matching is limited to longer words to curb false positives.
    if len(cleaned) > 4:
        closest = process.extractOne(cleaned, profanity_dict.keys(), score_cutoff=90)
        if closest:
            return closest[0]

    return cleaned

def contains_profanity(text):
    """Detect profane single words and multi-word phrases in *text*.

    Args:
        text: Raw input string.

    Returns:
        dict mapping each flagged token or phrase (as tokenized from *text*)
        to its category, or the string "Clean" when nothing is flagged.
    """
    words = custom_tokenize(text)
    flagged_words = {}

    # Single words: skip stop words / very short tokens, then look up the
    # normalized form.
    for word in words:
        if word in STOP_WORDS or len(word) <= 2:
            continue

        normalized = normalize_word(word)
        if normalized in STOP_WORDS:
            continue

        if normalized in profanity_dict:
            flagged_words[word] = profanity_dict[normalized]

    # Multi-word phrases (2-4 tokens). The phrase table stores
    # (phrase, category) tuples, so the former `phrase in table[length]` test
    # compared a str against tuples and could never succeed. Build a
    # phrase -> category lookup per length instead; .get() also avoids
    # inserting empty entries into the defaultdict.
    for phrase_length in range(2, min(5, len(words) + 1)):
        known_phrases = {}
        for term, category in profanity_phrases.get(phrase_length, ()):
            known_phrases.setdefault(term, category)  # first entry wins
        if not known_phrases:
            continue

        for start in range(len(words) - phrase_length + 1):
            phrase = ' '.join(words[start:start + phrase_length])
            if phrase in known_phrases:
                flagged_words[phrase] = known_phrases[phrase]

    return flagged_words if flagged_words else "Clean"

# Example Usage
# Demo: only runs when the file is executed as a script.
if __name__ == "__main__":
    # Sample sentences: obfuscated profanity, a clean sentence and plain profanity.
    test_cases = [
        "You are a f@cking id!0t",
        "h3ll yeah! that's $tupid",
        "Sh1t, you're an @ssh0le!",
        "This is a clean sentence",
        "Go to hell you motherfucker",
        "That's some bullshit right there"
    ]

    # Print each sentence followed by what contains_profanity returns for it.
    for text in test_cases:
        print(f"Text: {text}")
        result = contains_profanity(text)
        print(f"Result: {result}\n")

Output of V5:  

Text: You are a f@cking id!0t  
Result: {'id!0t': 'Insults & Personal Attacks'}  

Text: h3ll yeah! that's $tupid  
Result: {'$tupid': 'Insults & Personal Attacks'}  

Text: Sh1t, you're an @ssh0le!  
Result: {'sh1t': 'Insults & Personal Attacks', '@ssh0le!': 'Explicit'}  

Text: This is a clean sentence  
Result: Clean  

Text: Go to hell you motherfucker  
Result: {'motherfucker': 'Explicit'}  

Text: That's some bullshit right there  
Result: {'bullshit': 'Vulgar'}  

**V-6**  

In [None]:
import nltk
import re
import difflib
import pandas as pd
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet, stopwords
from rapidfuzz import process
from collections import defaultdict

# Ensure necessary NLTK resources are downloaded
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('stopwords')

# Initialize NLP tools
STOP_WORDS = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Load dictionaries
def load_dictionary(dictionary_path):
    """Load the English word list (first CSV column) as a lowercase set.

    Args:
        dictionary_path: Path to a header-less CSV whose first column holds
            one word per row.

    Returns:
        set of lowercased words.
    """
    # dtype=str + keep_default_na=False: by default pandas parses real words
    # such as "null", "nan" or "NA" as missing values, and astype(str) then
    # turned each of them into the bogus entry "nan".
    df = pd.read_csv(dictionary_path, header=None, dtype=str, keep_default_na=False)
    words = set(df[0].str.lower())
    words.discard("")  # an empty cell carries no word
    return words

def load_profanity_words(csv_file):
    """Load the profanity CSV into a single-word dict and a phrase table.

    Args:
        csv_file: Path to a latin-1 encoded CSV with "Text" and "Category"
            columns.

    Returns:
        (single_words, phrases) where ``single_words`` maps each lowercased
        entry without a space to its category, and ``phrases`` is a
        defaultdict(list) mapping a word count to the list of
        (phrase, category) tuples with that many words.
    """
    profanity_df = pd.read_csv(csv_file, encoding="latin-1")
    # A row with an empty Text cell is parsed as NaN (a float) and used to
    # crash on `.lower()`; such rows carry no word, so skip them.
    profanity_df = profanity_df.dropna(subset=["Text"])
    texts = (
        profanity_df["Text"]
        .astype(str)
        .str.replace(r"[\x92\x91\x96\x97]", "'", regex=True)
        .str.lower()
    )

    single_words = {}
    phrases = defaultdict(list)

    # Zip the two columns instead of the much slower DataFrame.iterrows().
    for text, category in zip(texts, profanity_df["Category"]):
        if ' ' in text:  # multi-word entry: bucket by number of words
            phrases[len(text.split())].append((text, category))
        else:
            single_words[text] = category

    return single_words, phrases

# Load dictionaries
dictionary_path = "english_dictionary.csv"
profanity_path = "semantically_labeled_profanities.csv"
english_words = load_dictionary(dictionary_path)
profanity_dict, profanity_phrases = load_profanity_words(profanity_path)

def deobfuscate_word(word):
    """Undo symbol/leetspeak obfuscation, giving known profanity patterns priority.

    If the whole lowercased word is one of a few profanities with filler
    symbols between its letters (e.g. "f.u.c.k"), that profanity is
    returned. Otherwise leetspeak characters are mapped to letters
    one-for-one and the result is returned.
    """
    candidate = word.lower()

    # Letters of each target joined by "any run of filler symbols".
    gap = r'[!@#$%^&*.]*'
    for profanity in ('fuck', 'shit', 'ass', 'bitch', 'cunt', 'dick', 'pussy'):
        if re.fullmatch(gap.join(profanity), candidate) is not None:
            return profanity

    # General leetspeak: no replacement letter is itself a key, so one
    # translate pass is equivalent to applying the substitutions in turn.
    substitutions = {}
    for chars, letter in (
        ('@4', 'a'), ('$5', 's'), ('1!|', 'i'), ('0°', 'o'),
        ('3', 'e'), ('7', 't'), ('8', 'b'), ('9', 'g'),
    ):
        for ch in chars:
            substitutions[ch] = letter

    return candidate.translate(str.maketrans(substitutions))

def custom_tokenize(text):
    """Split *text* into lowercase tokens, keeping contractions and obfuscations.

    A token is a maximal run of word characters plus ``@``, ``$``, ``!`` and
    the apostrophe, so "that's", "$tupid" and "id!0t" each stay in one piece.

    Args:
        text: Raw input string.

    Returns:
        list of lowercase token strings (empty for empty input).
    """
    # The previous re.sub(r"(\w+)'(\w+)", r"\1'\2", text) replaced every match
    # with itself, i.e. it did nothing; it was removed without changing the
    # tokens produced.
    return re.findall(r"[@\w$!']+", text.lower())

def reduce_redundant_letters(word):
    """Reduce elongated letter runs, collapsing fully only for non-dictionary words."""
    # Pass 1: cap runs of 3+ identical characters at two.
    capped = re.sub(r'(.)\1{2,}', r'\1\1', word)
    if capped in english_words:
        return capped

    # Pass 2: not a dictionary word, so squeeze every remaining repeat.
    return re.sub(r'(.)\1+', r'\1', capped)

def normalize_word(word):
    """Normalize *word* for the profanity lookup.

    Stop words and words of at most two characters are returned unchanged.
    Otherwise the word is de-obfuscated and resolved, in order, to: its
    "-ing"-less base (or base + "in") if that is a known profanity; itself
    if it is a known profanity; the closest profanity key at a fuzzy score
    cutoff of 85; or the de-obfuscated word.
    """
    if word in STOP_WORDS or len(word) <= 2:
        return word

    candidate = deobfuscate_word(word)

    # "-ing" forms: try the bare root, then the "-in" spelling (e.g. "fuckin").
    if candidate.endswith('ing'):
        root = candidate[:-3]
        for form in (root, f"{root}in"):
            if form in profanity_dict:
                return form

    # Exact hit.
    if candidate in profanity_dict:
        return candidate

    # Fuzzy fallback for near-miss spellings.
    best = process.extractOne(candidate, profanity_dict.keys(), score_cutoff=85)
    return best[0] if best else candidate

def contains_profanity(text):
    """Detect profane single words and multi-word phrases in *text*.

    Args:
        text: Raw input string.

    Returns:
        dict mapping each flagged token or phrase (as tokenized from *text*)
        to its category, or the string "Clean" when nothing is flagged.
    """
    words = custom_tokenize(text)
    flagged_words = {}

    # Single words: skip stop words / very short tokens, then look up the
    # normalized form.
    for word in words:
        if word in STOP_WORDS or len(word) <= 2:
            continue

        normalized = normalize_word(word)
        if normalized in STOP_WORDS:
            continue

        if normalized in profanity_dict:
            flagged_words[word] = profanity_dict[normalized]

    # Multi-word phrases (2-4 tokens). The phrase table stores
    # (phrase, category) tuples, so the former `phrase in table[length]` test
    # compared a str against tuples and could never succeed. Build a
    # phrase -> category lookup per length instead; .get() also avoids
    # inserting empty entries into the defaultdict.
    for phrase_length in range(2, min(5, len(words) + 1)):
        known_phrases = {}
        for term, category in profanity_phrases.get(phrase_length, ()):
            known_phrases.setdefault(term, category)  # first entry wins
        if not known_phrases:
            continue

        for start in range(len(words) - phrase_length + 1):
            phrase = ' '.join(words[start:start + phrase_length])
            if phrase in known_phrases:
                flagged_words[phrase] = known_phrases[phrase]

    return flagged_words if flagged_words else "Clean"

# Example Usage
# Demo: only runs when the file is executed as a script.
if __name__ == "__main__":
    # Sample sentences: obfuscated profanity, a clean sentence, plain
    # profanity, a starred-out word and an embedded "@" substitution.
    test_cases = [
      "You are a f@cking id!0t",
      "h3ll yeah! that's $tupid",
      "Sh1t, you're an @ssh0le!",
      "This is a clean sentence",
      "Go to hell you motherfucker",
      "That's some bullshit right there",
      "What the f*** is this?",
      "You're a dumb@ss"
    ]

    # Print each sentence followed by what contains_profanity returns for it.
    for text in test_cases:
      print(f"Text: {text}")
      result = contains_profanity(text)
      print(f"Result: {result}\n")

This model handles most of the test sentences (the starred-out "f***" is still missed):  

Text: You are a f@cking id!0t  
Result: {'f@cking': 'Explicit', 'id!0t': 'Insults & Personal Attacks'}  

Text: h3ll yeah! that's $tupid  
Result: {'$tupid': 'Insults & Personal Attacks'}  

Text: Sh1t, you're an @ssh0le!  
Result: {'sh1t': 'Insults & Personal Attacks', '@ssh0le!': 'Explicit'}  

Text: This is a clean sentence  
Result: Clean  

Text: Go to hell you motherfucker  
Result: {'motherfucker': 'Explicit'}  

Text: That's some bullshit right there  
Result: {'bullshit': 'Vulgar'}  

Text: What the f*** is this?  
Result: Clean  

Text: You're a dumb@ss  
Result: {'dumb@ss': 'Insults & Personal Attacks'}  