In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:

import spacy
import numpy as np
import torch
from typing import List, Dict, Tuple
from transformers import RobertaForMaskedLM, RobertaTokenizer
from sentence_transformers import SentenceTransformer
from nltk.corpus import wordnet
from scipy.spatial.distance import cosine
import random
from termcolor import colored
from tqdm import tqdm

import nltk
# Fetch the WordNet corpus used by the nltk.corpus.wordnet synset lookups below.
nltk.download('wordnet')

class AdvancedTextHumanizer:
    """Swap formal or stilted words in English text for plainer alternatives.

    Replacement candidates come from a hand-written table of formal words
    and from RoBERTa masked-token predictions. Model candidates are filtered
    by spelling, negation-prefix and part-of-speech checks, then ranked by
    Sentence-BERT and WordNet similarity to the original word.
    """

    def __init__(self):
        """Load the language models and build the lookup tables."""
        print(colored("Initializing AdvancedTextHumanizer...", "cyan"))

        # Load models
        self.nlp = spacy.load('en_core_web_sm')
        self.roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        self.roberta_model = RobertaForMaskedLM.from_pretrained('roberta-base')
        self.sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
        # Coarse part-of-speech tags whose tokens may be replaced.
        # NOTE(review): 'CONJ' and 'PREP' look like they never match the tags
        # the loaded spaCy pipeline emits (CCONJ/SCONJ/ADP) — confirm intent.
        self.target_pos = ['ADJ', 'ADV', 'NOUN', 'VERB', 'CONJ', 'PREP', 'PRON', 'INTJ']
        # Memoised candidate lists keyed by (sentence, word, pos).
        self.cache = {}

        # Expanded formal/AI vocabulary
        self.formal_replacements = {
            'utilize': ['use', 'apply'],
            'implement': ['use', 'add', 'put in'],
            'demonstrate': ['show', 'prove'],
            'comprehensive': ['complete', 'full'],
            'subsequently': ['then', 'after'],
            'methodology': ['method', 'approach'],
            'leverage': ['use', 'apply'],
            'furthermore': ['also', 'plus'],
            'therefore': ['so', 'thus'],
            'approximately': ['about', 'around'],
            'facilitate': ['help', 'aid'],
            'optimal': ['best', 'ideal'],
            'initiated': ['started', 'began'],
            'sufficient': ['enough', 'adequate'],
            'preliminary': ['initial', 'first'],
            'endeavor': ['try', 'attempt'],
            'ascertain': ['find out', 'check'],
            'pursuant': ['following', 'under'],
            'expedite': ['speed up', 'hurry'],
            'commence': ['start', 'begin'],
        }

        print(colored("✓ Loading language models", "green"))

        # Common words are preferred when ranking candidates. They are read
        # from common_words.txt (one word per line); only a missing or
        # unreadable file triggers the fallback, so unrelated errors surface.
        try:
            with open('common_words.txt', 'r', encoding='utf-8') as f:
                self.common_words = {line.strip().lower() for line in f if line.strip()}
        except (OSError, UnicodeDecodeError):
            # Fallback to a small set of very common words
            self.common_words = {'good', 'great', 'nice', 'better', 'best', 'easy', 'simple',
                                 'clear', 'fast', 'quick', 'well', 'sure', 'right', 'fine'}

        # Add negative prefixes to check
        self.negative_prefixes = {'un', 'in', 'im', 'ir', 'dis', 'non'}

    def char_similarity_ratio(self, str1: str, str2: str) -> float:
        """Return a rough character-overlap ratio between two strings.

        Counts the characters of ``str1`` that occur anywhere in ``str2`` and
        divides by the longer length. The measure ignores character order and
        is not symmetric in its arguments.

        Returns:
            A value in [0, 1]; 0.0 when either string is empty.
        """
        if not str1 or not str2:
            return 0.0

        other_chars = set(str2)
        shared = sum(1 for ch in str1 if ch in other_chars)
        return shared / max(len(str1), len(str2))

    def has_opposite_meaning(self, word: str, candidate: str) -> bool:
        """Check if the candidate is just the negative form of the word.

        One word counts as the negation of the other when it begins with a
        negative prefix immediately followed by the other word ("unhappy" vs
        "happy"). Requiring the prefix to sit directly in front of the word
        avoids flagging unrelated pairs such as "able" / "unstable".
        """
        return any(
            candidate.startswith(prefix + word) or word.startswith(prefix + candidate)
            for prefix in self.negative_prefixes
        )

    def compute_similarity_score(self, word1: str, word2: str) -> float:
        """Average the available similarity metrics for two words.

        Sentence-BERT cosine similarity is always used; WordNet path
        similarity is added only when it is available and non-zero.
        """
        embeddings = self.sentence_model.encode([word1, word2])
        scores = [1 - cosine(embeddings[0], embeddings[1])]

        # WordNet similarity
        wn_sim = self.get_wordnet_similarity(word1, word2)
        if wn_sim:
            scores.append(wn_sim)

        return float(np.mean(scores))

    def get_wordnet_similarity(self, word1: str, word2: str) -> "float | None":
        """Return the best WordNet path similarity over all synset pairs.

        Returns:
            ``None`` when either word has no synsets; otherwise the largest
            path similarity found (0 if no pair is comparable).
        """
        synsets1 = wordnet.synsets(word1)
        synsets2 = wordnet.synsets(word2)

        if not synsets1 or not synsets2:
            return None

        # path_similarity yields None for incomparable synsets; skip those.
        pair_scores = (s1.path_similarity(s2) for s1 in synsets1 for s2 in synsets2)
        return max((score for score in pair_scores if score), default=0)

    @staticmethod
    def _mask_word(sentence: str, word: str, mask_token: str) -> str:
        """Replace the first whole-word occurrence of ``word`` with the mask.

        Only one occurrence is masked so the model sees exactly one slot, and
        word boundaries are enforced so "the" is not masked inside "other".
        Falls back to the first plain substring hit if no whole word matches.
        """
        import re

        pattern = r'(?<!\w)' + re.escape(word) + r'(?!\w)'
        # A callable replacement keeps the mask token literal.
        masked, hits = re.subn(pattern, lambda _match: mask_token, sentence, count=1)
        if hits:
            return masked
        return sentence.replace(word, mask_token, 1)

    @staticmethod
    def _match_case(original: str, replacement: str) -> str:
        """Capitalise ``replacement`` when ``original`` starts upper-case."""
        if original[:1].isupper() and replacement[:1].islower():
            return replacement[0].upper() + replacement[1:]
        return replacement

    def _roberta_candidates(self, sentence: str, word: str) -> List[str]:
        """Return RoBERTa's top-20 fill-ins for ``word`` masked in ``sentence``."""
        masked_text = self._mask_word(sentence, word, self.roberta_tokenizer.mask_token)
        inputs = self.roberta_tokenizer(masked_text, return_tensors="pt",
                                        padding=True, truncation=True)

        with torch.no_grad():
            logits = self.roberta_model(**inputs).logits

        mask_positions = torch.where(
            inputs["input_ids"][0] == self.roberta_tokenizer.mask_token_id)[0]
        if mask_positions.numel() == 0:
            # The mask was lost (e.g. cut off by truncation): nothing to predict.
            return []

        top_ids = torch.topk(logits[0, mask_positions[0]], 20).indices.tolist()
        return [self.roberta_tokenizer.decode([token_id]).strip() for token_id in top_ids]

    def get_replacement_candidates(self, sentence: str, word: str, pos: str) -> List[str]:
        """Return up to five replacement candidates for ``word``.

        Words in the formal-vocabulary table are answered directly from it,
        preferring entries that are also common words. Other words are masked
        in ``sentence`` and filled in by RoBERTa; predictions are validated,
        kept when their similarity to ``word`` exceeds 0.5, and ordered with
        common words first and higher similarity first.

        Args:
            sentence: Sentence containing ``word``.
            word: Token text to replace.
            pos: Coarse part-of-speech tag candidates must share.

        Returns:
            A possibly empty list of candidate strings.
        """
        lowered = word.lower()
        if lowered in self.formal_replacements:
            options = self.formal_replacements[lowered]
            # Prefer common words from predefined replacements
            common_options = [opt for opt in options if opt in self.common_words]
            return common_options if common_options else list(options)

        # pos is part of the key because validation depends on it.
        cache_key = (sentence, word, pos)
        if cache_key in self.cache:
            return self.cache[cache_key]

        candidates = [
            candidate
            for candidate in self._roberta_candidates(sentence, word)
            if self.validate_candidate(candidate, word, pos)
        ]

        # Score each distinct candidate once, in first-seen order.
        scored = []
        for candidate in dict.fromkeys(candidates):
            similarity = self.compute_similarity_score(word, candidate)
            if similarity > 0.5:  # Adjusted threshold
                scored.append((candidate, similarity))

        # Common words first, then by descending similarity; keep the top 5.
        scored.sort(key=lambda item: (item[0].lower() in self.common_words, item[1]),
                    reverse=True)
        final_candidates = [candidate for candidate, _ in scored[:5]]

        self.cache[cache_key] = final_candidates
        return final_candidates

    def validate_candidate(self, candidate: str, original: str, pos: str) -> bool:
        """Decide whether ``candidate`` may replace ``original``.

        A candidate is rejected when it is non-alphabetic or shorter than
        three characters, equal to the original ignoring case, too similar
        character-wise (ratio above 0.8), a negated form of the original, or
        tagged by spaCy with a part of speech other than ``pos``.
        """
        if not candidate.isalpha() or len(candidate) < 3:
            return False

        candidate_lower = candidate.lower()
        original_lower = original.lower()
        if candidate_lower == original_lower:
            return False

        # If words are too similar, likely a misspelling
        if self.char_similarity_ratio(candidate_lower, original_lower) > 0.8:
            return False

        # Check for negative forms
        if self.has_opposite_meaning(original_lower, candidate_lower):
            return False

        # The candidate is tagged in isolation, without sentence context.
        return self.nlp(candidate)[0].pos_ == pos

    def humanize_text(self, text: str, replacement_prob: float = 0.7) -> Tuple[str, Dict]:
        """Randomly replace eligible words in ``text`` with simpler ones.

        A token is eligible when its part of speech is in ``target_pos`` and
        it is longer than three characters. Each eligible token is replaced
        with probability ``replacement_prob`` (plus 0.2 for tokens longer
        than eight characters) if any candidate is found. Replacements take
        over the leading capitalisation of the word they replace.

        Args:
            text: Input text.
            replacement_prob: Base probability of attempting a replacement.

        Returns:
            The rewritten text and a dict mapping each replaced token text to
            its replacement (a word replaced more than once keeps only the
            last replacement).
        """
        doc = self.nlp(text)
        pieces = []
        replacements = {}

        print(colored("\nProcessing text...", "cyan"))
        for sent in tqdm(list(doc.sents), desc="Analyzing sentences"):
            sent_text = sent.text

            for token in sent:
                original = token.text
                chosen = original
                local_prob = replacement_prob + (0.2 if len(original) > 8 else 0)

                if (token.pos_ in self.target_pos and
                        len(original) > 3 and
                        random.random() < local_prob):
                    candidates = self.get_replacement_candidates(sent_text, original, token.pos_)
                    if candidates:
                        chosen = self._match_case(original, random.choice(candidates))
                        replacements[original] = chosen

                pieces.append(chosen)
                pieces.append(token.whitespace_)

        result_text = "".join(pieces)
        print(colored("\nHumanization complete!", "green"))
        return result_text, replacements

    def format_output(self, original_text: str, humanized_text: str, replacements: Dict) -> str:
        """Build a coloured, human-readable report of one humanization run.

        Args:
            original_text: Text before humanization.
            humanized_text: Text after humanization.
            replacements: Mapping of replaced word to its replacement.

        Returns:
            The report as a single string with ANSI colour codes.
        """
        rule = "=" * 80
        parts = [
            "\n" + rule + "\n",
            colored("TEXT HUMANIZATION RESULTS", "cyan", attrs=["bold"]) + "\n",
            rule + "\n\n",
            # Original text section
            colored("ORIGINAL TEXT:", "yellow", attrs=["bold"]) + "\n",
            f"{original_text}\n\n",
            # Humanized text section
            colored("HUMANIZED TEXT:", "green", attrs=["bold"]) + "\n",
            f"{humanized_text}\n\n",
            # Statistics section
            colored("STATISTICS:", "magenta", attrs=["bold"]) + "\n",
            f"Total words replaced: {len(replacements)}\n",
            f"Unique replacements: {len(set(replacements.values()))}\n\n",
            # Replacements section
            colored("WORD REPLACEMENTS:", "blue", attrs=["bold"]) + "\n",
        ]
        parts.extend(
            f"• {colored(original, 'red')} → {colored(replacement, 'green')}\n"
            for original, replacement in replacements.items()
        )
        parts.append("\n" + rule + "\n")
        return "".join(parts)

    def __del__(self):
        """Drop cached candidates and release cached CUDA memory."""
        # cache may be missing if __init__ failed before creating it.
        cache = getattr(self, "cache", None)
        if cache is not None:
            cache.clear()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()


# Example usage: humanize a sample product review and print the report.
if __name__ == "__main__":
    # Constructing the humanizer loads all of its models.
    humanizer = AdvancedTextHumanizer()

    # Sample input: a formally worded iPhone 15 review.
    text = """

As a longstanding Apple enthusiast, I approached the evaluation of the new iPhone 15 with a critical eye, mindful of the considerable anticipation surrounding its launch. The iPhone 15 presents a refined, contemporary design that harmoniously blends familiar elements with innovative touches, introducing new color variants that are aesthetically pleasing and maintaining the robust build quality for which Apple is renowned. The device exudes a premium feel, striking an optimal balance between weight and ergonomics. The display is a particular highlight; the Super Retina XDR technology is markedly brighter and more vibrant, elevating the visual experience across a range of activities, from viewing multimedia content to navigating social media platforms. The integration of 120Hz ProMotion technology ensures an exceptionally smooth and responsive user interface. Equipped with the new A17 Bionic chip, the iPhone 15 delivers unparalleled performance—applications open instantaneously, multitasking is seamless, and even the most resource-intensive games operate without lag. The battery life is noteworthy, comfortably supporting a full day of moderate usage. The camera system on the iPhone 15 is especially impressive, featuring enhanced low-light capabilities and advanced computational photography techniques that yield professional-grade images. In conclusion, the iPhone 15 constitutes a substantial upgrade, offering meaningful advancements in performance, display technology, and photographic capabilities, thereby representing a sound investment for the discerning technology aficionado.

"""

    # Rewrite the text with the default replacement probability, then
    # render and print the before/after report.
    humanized_text, replacements = humanizer.humanize_text(text)
    formatted_output = humanizer.format_output(text, humanized_text, replacements)
    print(formatted_output)



[nltk_data] Downloading package wordnet to /root/nltk_data...


Initializing AdvancedTextHumanizer...
✓ Loading language models

Processing text...


Analyzing sentences: 100%|██████████| 9/9 [01:16<00:00,  8.54s/it]


Humanization complete!

TEXT HUMANIZATION RESULTS

ORIGINAL TEXT:


As a longstanding Apple enthusiast, I approached the evaluation of the new iPhone 15 with a critical eye, mindful of the considerable anticipation surrounding its launch. The iPhone 15 presents a refined, contemporary design that harmoniously blends familiar elements with innovative touches, introducing new color variants that are aesthetically pleasing and maintaining the robust build quality for which Apple is renowned. The device exudes a premium feel, striking an optimal balance between weight and ergonomics. The display is a particular highlight; the Super Retina XDR technology is markedly brighter and more vibrant, elevating the visual experience across a range of activities, from viewing multimedia content to navigating social media platforms. The integration of 120Hz ProMotion technology ensures an exceptionally smooth and responsive user interface. Equipped with the new A17 Bionic chip, the iPhone 15 delivers




In [None]:
!pip install spacy
!pip install transformers
!pip install sentence-transformers
!pip install fasttext
!pip install nltk
import spacy
import numpy as np
import torch
from typing import List, Dict, Tuple
from transformers import RobertaForMaskedLM, RobertaTokenizer
from sentence_transformers import SentenceTransformer
import fasttext
import fasttext.util
from nltk.corpus import wordnet
from scipy.spatial.distance import cosine
import random
from termcolor import colored
from tqdm import tqdm
import nltk
# Fetch the WordNet corpus used by the nltk.corpus.wordnet synset lookups below.
nltk.download('wordnet')

class AdvancedTextHumanizer:
    """Swap formal or stilted words in English text for plainer alternatives.

    Replacement candidates come from a hand-written table of formal words,
    from RoBERTa masked-token predictions and from fastText nearest
    neighbours. Model candidates are filtered by spelling, negation-prefix
    and part-of-speech checks, then ranked by fastText, Sentence-BERT and
    WordNet similarity to the original word.
    """

    def __init__(self):
        """Load the language models and build the lookup tables."""
        print(colored("Initializing AdvancedTextHumanizer...", "cyan"))

        # Load models
        self.nlp = spacy.load('en_core_web_sm')
        self.roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        self.roberta_model = RobertaForMaskedLM.from_pretrained('roberta-base')
        self.sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
        # Download the English fastText vectors unless already present.
        fasttext.util.download_model('en', if_exists='ignore')
        self.ft_model = fasttext.load_model('cc.en.300.bin')

        # Coarse part-of-speech tags whose tokens may be replaced.
        # NOTE(review): 'CONJ' and 'PREP' look like they never match the tags
        # the loaded spaCy pipeline emits (CCONJ/SCONJ/ADP) — confirm intent.
        self.target_pos = ['ADJ', 'ADV', 'NOUN', 'VERB', 'CONJ', 'PREP', 'PRON', 'INTJ']
        # Memoised candidate lists keyed by (sentence, word, pos).
        self.cache = {}

        # Expanded formal/AI vocabulary
        self.formal_replacements = {
            'utilize': ['use', 'apply'],
            'implement': ['use', 'add', 'put in'],
            'demonstrate': ['show', 'prove'],
            'comprehensive': ['complete', 'full'],
            'subsequently': ['then', 'after'],
            'methodology': ['method', 'approach'],
            'leverage': ['use', 'apply'],
            'furthermore': ['also', 'plus'],
            'therefore': ['so', 'thus'],
            'approximately': ['about', 'around'],
            'facilitate': ['help', 'aid'],
            'optimal': ['best', 'ideal'],
            'initiated': ['started', 'began'],
            'sufficient': ['enough', 'adequate'],
            'preliminary': ['initial', 'first'],
            'endeavor': ['try', 'attempt'],
            'ascertain': ['find out', 'check'],
            'pursuant': ['following', 'under'],
            'expedite': ['speed up', 'hurry'],
            'commence': ['start', 'begin'],
        }

        print(colored("✓ Loading language models", "green"))

        # Common words are preferred when ranking candidates. They are read
        # from common_words.txt (one word per line); only a missing or
        # unreadable file triggers the fallback, so unrelated errors surface.
        try:
            with open('common_words.txt', 'r', encoding='utf-8') as f:
                self.common_words = {line.strip().lower() for line in f if line.strip()}
        except (OSError, UnicodeDecodeError):
            # Fallback to a small set of very common words
            self.common_words = {'good', 'great', 'nice', 'better', 'best', 'easy', 'simple',
                                 'clear', 'fast', 'quick', 'well', 'sure', 'right', 'fine'}

        # Add negative prefixes to check
        self.negative_prefixes = {'un', 'in', 'im', 'ir', 'dis', 'non'}

    def char_similarity_ratio(self, str1: str, str2: str) -> float:
        """Return a rough character-overlap ratio between two strings.

        Counts the characters of ``str1`` that occur anywhere in ``str2`` and
        divides by the longer length. The measure ignores character order and
        is not symmetric in its arguments.

        Returns:
            A value in [0, 1]; 0.0 when either string is empty.
        """
        if not str1 or not str2:
            return 0.0

        other_chars = set(str2)
        shared = sum(1 for ch in str1 if ch in other_chars)
        return shared / max(len(str1), len(str2))

    def has_opposite_meaning(self, word: str, candidate: str) -> bool:
        """Check if the candidate is just the negative form of the word.

        One word counts as the negation of the other when it begins with a
        negative prefix immediately followed by the other word ("unhappy" vs
        "happy"). Requiring the prefix to sit directly in front of the word
        avoids flagging unrelated pairs such as "able" / "unstable".
        """
        return any(
            candidate.startswith(prefix + word) or word.startswith(prefix + candidate)
            for prefix in self.negative_prefixes
        )

    def compute_similarity_score(self, word1: str, word2: str) -> float:
        """Average the available similarity metrics for two words.

        fastText and Sentence-BERT cosine similarities are always used;
        WordNet path similarity is added only when it is available and
        non-zero.
        """
        # FastText similarity
        ft_sim = 1 - cosine(self.ft_model.get_word_vector(word1),
                            self.ft_model.get_word_vector(word2))

        # Sentence-BERT similarity
        embeddings = self.sentence_model.encode([word1, word2])
        sbert_sim = 1 - cosine(embeddings[0], embeddings[1])

        scores = [ft_sim, sbert_sim]

        # WordNet similarity
        wn_sim = self.get_wordnet_similarity(word1, word2)
        if wn_sim:
            scores.append(wn_sim)

        return float(np.mean(scores))

    def get_wordnet_similarity(self, word1: str, word2: str) -> "float | None":
        """Return the best WordNet path similarity over all synset pairs.

        Returns:
            ``None`` when either word has no synsets; otherwise the largest
            path similarity found (0 if no pair is comparable).
        """
        synsets1 = wordnet.synsets(word1)
        synsets2 = wordnet.synsets(word2)

        if not synsets1 or not synsets2:
            return None

        # path_similarity yields None for incomparable synsets; skip those.
        pair_scores = (s1.path_similarity(s2) for s1 in synsets1 for s2 in synsets2)
        return max((score for score in pair_scores if score), default=0)

    @staticmethod
    def _mask_word(sentence: str, word: str, mask_token: str) -> str:
        """Replace the first whole-word occurrence of ``word`` with the mask.

        Only one occurrence is masked so the model sees exactly one slot, and
        word boundaries are enforced so "the" is not masked inside "other".
        Falls back to the first plain substring hit if no whole word matches.
        """
        import re

        pattern = r'(?<!\w)' + re.escape(word) + r'(?!\w)'
        # A callable replacement keeps the mask token literal.
        masked, hits = re.subn(pattern, lambda _match: mask_token, sentence, count=1)
        if hits:
            return masked
        return sentence.replace(word, mask_token, 1)

    @staticmethod
    def _match_case(original: str, replacement: str) -> str:
        """Capitalise ``replacement`` when ``original`` starts upper-case."""
        if original[:1].isupper() and replacement[:1].islower():
            return replacement[0].upper() + replacement[1:]
        return replacement

    def _roberta_candidates(self, sentence: str, word: str) -> List[str]:
        """Return RoBERTa's top-20 fill-ins for ``word`` masked in ``sentence``."""
        masked_text = self._mask_word(sentence, word, self.roberta_tokenizer.mask_token)
        inputs = self.roberta_tokenizer(masked_text, return_tensors="pt",
                                        padding=True, truncation=True)

        with torch.no_grad():
            logits = self.roberta_model(**inputs).logits

        mask_positions = torch.where(
            inputs["input_ids"][0] == self.roberta_tokenizer.mask_token_id)[0]
        if mask_positions.numel() == 0:
            # The mask was lost (e.g. cut off by truncation): nothing to predict.
            return []

        top_ids = torch.topk(logits[0, mask_positions[0]], 20).indices.tolist()
        return [self.roberta_tokenizer.decode([token_id]).strip() for token_id in top_ids]

    def get_replacement_candidates(self, sentence: str, word: str, pos: str) -> List[str]:
        """Return up to five replacement candidates for ``word``.

        Words in the formal-vocabulary table are answered directly from it,
        preferring entries that are also common words. Other words gather
        proposals from RoBERTa (with ``word`` masked in ``sentence``) and
        from the ten nearest fastText neighbours. Proposals are validated,
        kept when their similarity to ``word`` exceeds 0.5, and ordered with
        common words first and higher similarity first.

        Args:
            sentence: Sentence containing ``word``.
            word: Token text to replace.
            pos: Coarse part-of-speech tag candidates must share.

        Returns:
            A possibly empty list of candidate strings.
        """
        lowered = word.lower()
        if lowered in self.formal_replacements:
            options = self.formal_replacements[lowered]
            # Prefer common words from predefined replacements
            common_options = [opt for opt in options if opt in self.common_words]
            return common_options if common_options else list(options)

        # pos is part of the key because validation depends on it.
        cache_key = (sentence, word, pos)
        if cache_key in self.cache:
            return self.cache[cache_key]

        # RoBERTa predictions first, then fastText neighbours
        # (get_nearest_neighbors yields (score, word) pairs).
        proposals = self._roberta_candidates(sentence, word)
        proposals.extend(
            ft_word for _, ft_word in self.ft_model.get_nearest_neighbors(word, k=10)
        )
        candidates = [
            candidate for candidate in proposals
            if self.validate_candidate(candidate, word, pos)
        ]

        # Score each distinct candidate once, in first-seen order.
        scored = []
        for candidate in dict.fromkeys(candidates):
            similarity = self.compute_similarity_score(word, candidate)
            if similarity > 0.5:  # Adjusted threshold
                scored.append((candidate, similarity))

        # Common words first, then by descending similarity; keep the top 5.
        scored.sort(key=lambda item: (item[0].lower() in self.common_words, item[1]),
                    reverse=True)
        final_candidates = [candidate for candidate, _ in scored[:5]]

        self.cache[cache_key] = final_candidates
        return final_candidates

    def validate_candidate(self, candidate: str, original: str, pos: str) -> bool:
        """Decide whether ``candidate`` may replace ``original``.

        A candidate is rejected when it is non-alphabetic or shorter than
        three characters, equal to the original ignoring case, too similar
        character-wise (ratio above 0.8), a negated form of the original, or
        tagged by spaCy with a part of speech other than ``pos``.
        """
        if not candidate.isalpha() or len(candidate) < 3:
            return False

        candidate_lower = candidate.lower()
        original_lower = original.lower()
        if candidate_lower == original_lower:
            return False

        # If words are too similar, likely a misspelling
        if self.char_similarity_ratio(candidate_lower, original_lower) > 0.8:
            return False

        # Check for negative forms
        if self.has_opposite_meaning(original_lower, candidate_lower):
            return False

        # The candidate is tagged in isolation, without sentence context.
        return self.nlp(candidate)[0].pos_ == pos

    def humanize_text(self, text: str, replacement_prob: float = 0.7) -> Tuple[str, Dict]:
        """Randomly replace eligible words in ``text`` with simpler ones.

        A token is eligible when its part of speech is in ``target_pos`` and
        it is longer than three characters. Each eligible token is replaced
        with probability ``replacement_prob`` (plus 0.2 for tokens longer
        than eight characters) if any candidate is found. Replacements take
        over the leading capitalisation of the word they replace.

        Args:
            text: Input text.
            replacement_prob: Base probability of attempting a replacement.

        Returns:
            The rewritten text and a dict mapping each replaced token text to
            its replacement (a word replaced more than once keeps only the
            last replacement).
        """
        doc = self.nlp(text)
        pieces = []
        replacements = {}

        print(colored("\nProcessing text...", "cyan"))
        for sent in tqdm(list(doc.sents), desc="Analyzing sentences"):
            sent_text = sent.text

            for token in sent:
                original = token.text
                chosen = original
                local_prob = replacement_prob + (0.2 if len(original) > 8 else 0)

                if (token.pos_ in self.target_pos and
                        len(original) > 3 and
                        random.random() < local_prob):
                    candidates = self.get_replacement_candidates(sent_text, original, token.pos_)
                    if candidates:
                        chosen = self._match_case(original, random.choice(candidates))
                        replacements[original] = chosen

                pieces.append(chosen)
                pieces.append(token.whitespace_)

        result_text = "".join(pieces)
        print(colored("\nHumanization complete!", "green"))
        return result_text, replacements

    def format_output(self, original_text: str, humanized_text: str, replacements: Dict) -> str:
        """Build a coloured, human-readable report of one humanization run.

        Args:
            original_text: Text before humanization.
            humanized_text: Text after humanization.
            replacements: Mapping of replaced word to its replacement.

        Returns:
            The report as a single string with ANSI colour codes.
        """
        rule = "=" * 80
        parts = [
            "\n" + rule + "\n",
            colored("TEXT HUMANIZATION RESULTS", "cyan", attrs=["bold"]) + "\n",
            rule + "\n\n",
            # Original text section
            colored("ORIGINAL TEXT:", "yellow", attrs=["bold"]) + "\n",
            f"{original_text}\n\n",
            # Humanized text section
            colored("HUMANIZED TEXT:", "green", attrs=["bold"]) + "\n",
            f"{humanized_text}\n\n",
            # Statistics section
            colored("STATISTICS:", "magenta", attrs=["bold"]) + "\n",
            f"Total words replaced: {len(replacements)}\n",
            f"Unique replacements: {len(set(replacements.values()))}\n\n",
            # Replacements section
            colored("WORD REPLACEMENTS:", "blue", attrs=["bold"]) + "\n",
        ]
        parts.extend(
            f"• {colored(original, 'red')} → {colored(replacement, 'green')}\n"
            for original, replacement in replacements.items()
        )
        parts.append("\n" + rule + "\n")
        return "".join(parts)

    def __del__(self):
        """Drop cached candidates and release cached CUDA memory."""
        # cache may be missing if __init__ failed before creating it.
        cache = getattr(self, "cache", None)
        if cache is not None:
            cache.clear()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()


# Example usage: humanize a sample product review and print the report.
if __name__ == "__main__":
    # Constructing the humanizer loads all of its models.
    humanizer = AdvancedTextHumanizer()

    # Sample input: a formally worded iPhone 15 review.
    text = """

As a longstanding Apple enthusiast, I approached the evaluation of the new iPhone 15 with a critical eye, mindful of the considerable anticipation surrounding its launch. The iPhone 15 presents a refined, contemporary design that harmoniously blends familiar elements with innovative touches, introducing new color variants that are aesthetically pleasing and maintaining the robust build quality for which Apple is renowned. The device exudes a premium feel, striking an optimal balance between weight and ergonomics. The display is a particular highlight; the Super Retina XDR technology is markedly brighter and more vibrant, elevating the visual experience across a range of activities, from viewing multimedia content to navigating social media platforms. The integration of 120Hz ProMotion technology ensures an exceptionally smooth and responsive user interface. Equipped with the new A17 Bionic chip, the iPhone 15 delivers unparalleled performance—applications open instantaneously, multitasking is seamless, and even the most resource-intensive games operate without lag. The battery life is noteworthy, comfortably supporting a full day of moderate usage. The camera system on the iPhone 15 is especially impressive, featuring enhanced low-light capabilities and advanced computational photography techniques that yield professional-grade images. In conclusion, the iPhone 15 constitutes a substantial upgrade, offering meaningful advancements in performance, display technology, and photographic capabilities, thereby representing a sound investment for the discerning technology aficionado.

"""

    # Rewrite the text with the default replacement probability, then
    # render and print the before/after report.
    humanized_text, replacements = humanizer.humanize_text(text)
    formatted_output = humanizer.format_output(text, humanized_text, replacements)
    print(formatted_output)



[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Initializing AdvancedTextHumanizer...
✓ Loading language models

Processing text...


Analyzing sentences: 100%|██████████| 9/9 [05:57<00:00, 39.72s/it]


Humanization complete!

TEXT HUMANIZATION RESULTS

ORIGINAL TEXT:


As a longstanding Apple enthusiast, I approached the evaluation of the new iPhone 15 with a critical eye, mindful of the considerable anticipation surrounding its launch. The iPhone 15 presents a refined, contemporary design that harmoniously blends familiar elements with innovative touches, introducing new color variants that are aesthetically pleasing and maintaining the robust build quality for which Apple is renowned. The device exudes a premium feel, striking an optimal balance between weight and ergonomics. The display is a particular highlight; the Super Retina XDR technology is markedly brighter and more vibrant, elevating the visual experience across a range of activities, from viewing multimedia content to navigating social media platforms. The integration of 120Hz ProMotion technology ensures an exceptionally smooth and responsive user interface. Equipped with the new A17 Bionic chip, the iPhone 15 delivers






---

# Without fastText


---



In [None]:
import os
# Project folder under the /content/drive path.
project_dir = '/content/drive/MyDrive/humanClassifier'


# Create the project's outputs sub-folder; no error if it already exists.
outputs_dir = os.path.join(project_dir, 'outputs')
os.makedirs(outputs_dir, exist_ok=True)

In [None]:
## Without fastText

import spacy
import numpy as np
import torch
from typing import List, Dict, Tuple
from transformers import RobertaForMaskedLM, RobertaTokenizer
from sentence_transformers import SentenceTransformer
from nltk.corpus import wordnet
from scipy.spatial.distance import cosine
import random
from termcolor import colored
from tqdm import tqdm

import nltk

class AdvancedTextHumanizer:
    """Rewrite formal English into plainer wording.

    The pipeline works sentence by sentence:

    1. wordy multi-word phrases listed in ``ngram_replacements`` are swapped
       for their short form;
    2. remaining tokens whose spaCy part of speech is in ``target_pos`` are,
       with probability ``replacement_prob``, replaced by a candidate taken
       either from ``formal_replacements`` or from RoBERTa masked-LM
       predictions filtered by Sentence-BERT / WordNet similarity.

    Attributes set by ``__init__``: ``nlp``, ``roberta_tokenizer``,
    ``roberta_model``, ``sentence_model``, ``target_pos``, ``cache``,
    ``formal_replacements``, ``common_words``, ``negative_prefixes`` and
    ``ngram_replacements``.
    """

    def __init__(self):
        # Download required NLTK data
        nltk.download('wordnet', quiet=True)

        # Load models
        self.nlp = spacy.load('en_core_web_sm')
        self.roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        self.roberta_model = RobertaForMaskedLM.from_pretrained('roberta-base')
        self.sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
        # NOTE(review): 'CONJ' and 'PREP' do not look like tags emitted by
        # current spaCy pipelines (Universal POS uses CCONJ/SCONJ/ADP), so
        # they presumably never match -- confirm before relying on them.
        self.target_pos = ['ADJ', 'ADV', 'NOUN', 'VERB', 'CONJ', 'PREP', 'PRON', 'INTJ']
        # Maps (sentence, word) -> list of candidates already computed.
        self.cache = {}

        # Expanded formal/AI vocabulary
        self.formal_replacements = {
            'utilize': ['use', 'apply'],
            'implement': ['use', 'add', 'put in'],
            'demonstrate': ['show', 'prove'],
            'comprehensive': ['complete', 'full'],
            'subsequently': ['then', 'after'],
            'methodology': ['method', 'approach'],
            'leverage': ['use', 'apply'],
            'furthermore': ['also', 'plus'],
            'therefore': ['so', 'thus'],
            'approximately': ['about', 'around'],
            'facilitate': ['help', 'aid'],
            'optimal': ['best', 'ideal'],
            'initiated': ['started', 'began'],
            'sufficient': ['enough', 'adequate'],
            'preliminary': ['initial', 'first'],
            'endeavor': ['try', 'attempt'],
            'ascertain': ['find out', 'check'],
            'pursuant': ['following', 'under'],
            'expedite': ['speed up', 'hurry'],
            'commence': ['start', 'begin']
        }

        print(colored("Initializing AdvancedTextHumanizer...", "cyan"))
        print(colored("✓ Loading language models", "green"))

        # Add common words list (one word per line in common_words.txt)
        self.common_words = set()
        try:
            with open('common_words.txt', 'r', encoding='utf-8') as f:
                self.common_words = {line.strip() for line in f if line.strip()}
        except (OSError, UnicodeDecodeError):
            # Fallback to a small set of very common words
            self.common_words = {'good', 'great', 'nice', 'better', 'best', 'easy', 'simple',
                               'clear', 'fast', 'quick', 'well', 'sure', 'right', 'fine'}

        # Add negative prefixes to check
        self.negative_prefixes = {'un', 'in', 'im', 'ir', 'dis', 'non'}

        # Wordy phrase -> plain replacement.  Keys are tuples of single
        # words because they are compared against spaCy tokens, which are
        # single words.  The phrases deliberately stop before a following
        # article so that the replacement does not delete "the".
        self.ngram_replacements = {
            ('in', 'order', 'to'): 'to',
            ('due', 'to'): 'because of',
            ('with', 'respect', 'to'): 'about',
            ('in', 'addition', 'to'): 'besides',
            ('prior', 'to'): 'before',
            ('subsequent', 'to'): 'after',
            ('as', 'a', 'result', 'of'): 'because of',
            ('in', 'the', 'event', 'that'): 'if',
            ('on', 'the', 'basis', 'of'): 'based on',
            ('in', 'spite', 'of', 'the', 'fact', 'that'): 'although',
            ('with', 'regard', 'to'): 'about',
        }

    @staticmethod
    def _match_case(original: str, replacement: str) -> str:
        """Capitalise ``replacement`` when ``original`` starts with a capital."""
        if original[:1].isupper() and replacement[:1].islower():
            return replacement[:1].upper() + replacement[1:]
        return replacement

    def char_similarity_ratio(self, str1: str, str2: str) -> float:
        """Return the share of characters of ``str1`` that occur in ``str2``.

        The count is divided by the longer of the two lengths; either string
        being empty yields 0.0.
        """
        if not str1 or not str2:
            return 0.0

        shared = sum(1 for ch in str1 if ch in str2)
        return shared / max(len(str1), len(str2))

    def has_opposite_meaning(self, word: str, candidate: str) -> bool:
        """Return True if one word is the other plus a negative prefix.

        Only an exact ``prefix + word`` form counts (e.g. happy/unhappy); a
        plain substring test would also flag unrelated pairs such as
        able/unstable.
        """
        return any(
            candidate == prefix + word or word == prefix + candidate
            for prefix in self.negative_prefixes
        )

    def compute_similarity_score(self, word1: str, word2: str) -> float:
        """Average the available similarity metrics for two words.

        Sentence-BERT cosine similarity is always included; the WordNet
        path similarity is added only when it is non-zero.
        """
        embeddings = self.sentence_model.encode([word1, word2])
        scores = [1 - cosine(embeddings[0], embeddings[1])]

        wn_sim = self.get_wordnet_similarity(word1, word2)
        if wn_sim:
            scores.append(wn_sim)

        return float(np.mean(scores))

    def get_wordnet_similarity(self, word1: str, word2: str) -> float:
        """Return the best WordNet path similarity over all synset pairs.

        Returns 0.0 when either word has no synsets or WordNet cannot be
        queried.
        """
        try:
            synsets1 = wordnet.synsets(word1)
            synsets2 = wordnet.synsets(word2)
        except Exception:
            # Best effort: a missing or broken corpus must not abort the run.
            return 0.0

        if not synsets1 or not synsets2:
            return 0.0

        best = 0.0
        for s1 in synsets1:
            for s2 in synsets2:
                try:
                    sim = s1.path_similarity(s2)
                except Exception:
                    continue
                # path_similarity may return None; treat that as "no score".
                if sim and sim > best:
                    best = sim
        return best

    def get_replacement_candidates(self, sentence: str, word: str, pos: str) -> List[str]:
        """Return up to five replacement candidates for ``word``.

        Words listed in ``formal_replacements`` use that list (restricted to
        common words when any are common).  Otherwise the first whole-word
        occurrence of ``word`` in ``sentence`` is masked, the top 20 RoBERTa
        predictions are validated with ``validate_candidate``, kept when
        their similarity to ``word`` exceeds 0.5, and ordered common words
        first, then by similarity.  Results are cached per (sentence, word).
        """
        # Local import so this method does not depend on the cell-level imports.
        import re

        lowered = word.lower()
        if lowered in self.formal_replacements:
            candidates = self.formal_replacements[lowered]
            # Prefer common words from predefined replacements
            common_candidates = [c for c in candidates if c in self.common_words]
            return common_candidates if common_candidates else candidates

        cache_key = (sentence, word)
        if cache_key in self.cache:
            return self.cache[cache_key]

        # Mask one whole-word occurrence only; a plain str.replace would also
        # mask substrings of longer words and every repeat of the word.
        mask_token = self.roberta_tokenizer.mask_token
        pattern = r'(?<!\w)' + re.escape(word) + r'(?!\w)'
        masked_text, substitutions = re.subn(
            pattern, lambda _match: mask_token, sentence, count=1
        )
        if substitutions == 0:
            masked_text = sentence.replace(word, mask_token, 1)

        inputs = self.roberta_tokenizer(
            masked_text, return_tensors="pt", padding=True, truncation=True
        )

        with torch.no_grad():
            predictions = self.roberta_model(**inputs).logits

        mask_positions = torch.where(
            inputs["input_ids"][0] == self.roberta_tokenizer.mask_token_id
        )[0]
        if mask_positions.numel() == 0:
            # The mask was lost (e.g. cut off by truncation): nothing to rank.
            self.cache[cache_key] = []
            return []

        top_ids = torch.topk(predictions[0, mask_positions[0]], 20).indices

        # Decode, de-duplicate (keeping model order) and validate.
        seen = set()
        candidates = []
        for token_id in top_ids.tolist():
            candidate = self.roberta_tokenizer.decode([token_id]).strip()
            if candidate in seen:
                continue
            seen.add(candidate)
            if self.validate_candidate(candidate, word, pos):
                candidates.append(candidate)

        # Keep only candidates that are similar enough to the original word.
        scored = []
        for candidate in candidates:
            similarity = self.compute_similarity_score(word, candidate)
            if similarity > 0.5:
                scored.append((candidate, similarity))

        # Common words first, then the most similar; limit to the top 5.
        scored.sort(
            key=lambda item: (item[0].lower() in self.common_words, item[1]),
            reverse=True,
        )
        final_candidates = [candidate for candidate, _ in scored[:5]]

        self.cache[cache_key] = final_candidates
        return final_candidates

    def validate_candidate(self, candidate: str, original: str, pos: str) -> bool:
        """Return True if ``candidate`` is an acceptable stand-in for ``original``.

        Rejected: non-alphabetic or shorter than 3 characters, identical to
        the original (case-insensitive), character similarity above 0.8,
        negated forms of the original, or a spaCy part of speech other than
        ``pos``.
        """
        if not candidate.isalpha() or len(candidate) < 3:
            return False

        cand_lower = candidate.lower()
        orig_lower = original.lower()
        if cand_lower == orig_lower:
            return False

        # Near-identical spellings are likely misspellings or inflections.
        if self.char_similarity_ratio(cand_lower, orig_lower) > 0.8:
            return False

        if self.has_opposite_meaning(orig_lower, cand_lower):
            return False

        doc = self.nlp(candidate)
        return len(doc) > 0 and doc[0].pos_ == pos

    def extract_ngrams(self, tokens: List[str], n: int) -> List[Tuple[str, ...]]:
        """Extract n-grams from a list of tokens."""
        return list(zip(*[tokens[i:] for i in range(n)]))

    def find_ngram_matches(self, tokens: List[str]) -> List[Tuple[int, int, str]]:
        """Find phrases from ``ngram_replacements`` in ``tokens``.

        Returns ``(start, end, replacement)`` triples with ``end`` exclusive,
        sorted by ``start`` and never overlapping.  Matching is
        case-insensitive and prefers the longest phrase at each position.
        Keys are joined and re-split on whitespace, so both
        ``('in', 'order', 'to')`` and ``('in order', 'to')`` are recognised.
        """
        phrases = {}
        for key, replacement in self.ngram_replacements.items():
            parts = (key,) if isinstance(key, str) else key
            words = tuple(" ".join(parts).lower().split())
            if words:
                phrases[words] = replacement
        if not phrases:
            return []

        lowered = [token.lower() for token in tokens]
        lengths = sorted({len(words) for words in phrases}, reverse=True)

        matches = []
        i = 0
        while i < len(lowered):
            for n in lengths:
                window = tuple(lowered[i:i + n])
                if len(window) == n and window in phrases:
                    matches.append((i, i + n, phrases[window]))
                    i += n  # jump past the match so spans cannot overlap
                    break
            else:
                i += 1
        return matches

    def humanize_text(self, text: str, replacement_prob: float = 0.7) -> Tuple[str, Dict]:
        """Return the humanized text and a dict of the replacements made.

        Args:
            text: Input text; it is parsed with ``self.nlp``.
            replacement_prob: Probability with which an eligible token
                (part of speech in ``target_pos``, longer than 3 characters)
                is considered for replacement.

        Returns:
            ``(result_text, replacements)`` where ``replacements`` maps each
            original word or phrase to the text that replaced it.  The dict
            is keyed by the original text, so a repeated word keeps only its
            last replacement.
        """
        doc = self.nlp(text)
        replacements = {}

        print(colored("\nProcessing text...", "cyan"))
        pieces = []

        for sent in tqdm(list(doc.sents), desc="Analyzing sentences"):
            sent_tokens = list(sent)
            words = [token.text for token in sent_tokens]
            phrase_at = {
                start: (end, replacement)
                for start, end, replacement in self.find_ngram_matches(words)
            }

            pos = 0
            while pos < len(sent_tokens):
                if pos in phrase_at:
                    end, phrase_replacement = phrase_at[pos]
                    phrase_replacement = self._match_case(words[pos], phrase_replacement)
                    replacements[" ".join(words[pos:end])] = phrase_replacement
                    pieces.append(phrase_replacement)
                    # Keep the spacing that followed the phrase's last token.
                    pieces.append(sent_tokens[end - 1].whitespace_)
                    pos = end
                    continue

                token = sent_tokens[pos]
                new_text = token.text
                if (token.pos_ in self.target_pos and
                        len(token.text) > 3 and
                        random.random() < replacement_prob):
                    candidates = self.get_replacement_candidates(
                        sent.text, token.text, token.pos_
                    )
                    if candidates:
                        new_text = self._match_case(token.text, random.choice(candidates))
                        replacements[token.text] = new_text

                pieces.append(new_text)
                pieces.append(token.whitespace_)
                pos += 1

        result_text = "".join(pieces)
        print(colored("\nHumanization complete!", "green"))
        return result_text, replacements

    def format_output(self, original_text: str, humanized_text: str, replacements: Dict) -> str:
        """Build a colourised report: both texts, counts and each replacement."""
        rule = "=" * 80
        parts = [
            "\n" + rule + "\n",
            colored("TEXT HUMANIZATION RESULTS", "cyan", attrs=["bold"]) + "\n",
            rule + "\n\n",
            # Original text section
            colored("ORIGINAL TEXT:", "yellow", attrs=["bold"]) + "\n",
            f"{original_text}\n\n",
            # Humanized text section
            colored("HUMANIZED TEXT:", "green", attrs=["bold"]) + "\n",
            f"{humanized_text}\n\n",
            # Statistics section
            colored("STATISTICS:", "magenta", attrs=["bold"]) + "\n",
            f"Total words replaced: {len(replacements)}\n",
            f"Unique replacements: {len(set(replacements.values()))}\n\n",
            # Replacements section
            colored("WORD REPLACEMENTS:", "blue", attrs=["bold"]) + "\n",
        ]
        parts.extend(
            f"• {colored(original, 'red')} → {colored(replacement, 'green')}\n"
            for original, replacement in replacements.items()
        )
        parts.append("\n" + rule + "\n")
        return "".join(parts)

    def __del__(self):
        # Best-effort cleanup.  __init__ may have failed before ``cache`` was
        # set, and module globals may already be gone at interpreter
        # shutdown, so nothing here is allowed to raise.
        try:
            cache = getattr(self, "cache", None)
            if cache is not None:
                cache.clear()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        except Exception:
            pass


# Example usage
if __name__ == "__main__":
    # Constructing the humanizer loads the spaCy, RoBERTa and
    # Sentence-BERT models, so this line is slow.
    humanizer = AdvancedTextHumanizer()

    # Sample input; the surrounding blank lines and indentation are part of
    # the string and are passed through to the humanizer unchanged.
    text = """

    Natural Language Processing (NLP) is essential for enabling machines to understand and interpret human language. Preprocessing is a crucial step in NLP that prepares raw text data for analysis. In our application, which focuses on distinguishing between AI-generated and human-written texts, effective preprocessing ensures accurate and meaningful results.

    """

    # Rewrite the text, then print a report with both versions and the
    # list of replacements.
    humanized_text, replacements = humanizer.humanize_text(text)
    formatted_output = humanizer.format_output(text, humanized_text, replacements)
    print(formatted_output)

Initializing AdvancedTextHumanizer...
✓ Loading language models

Processing text...


Analyzing sentences: 100%|██████████| 3/3 [00:25<00:00,  8.46s/it]


Humanization complete!

TEXT HUMANIZATION RESULTS

ORIGINAL TEXT:


    Natural Language Processing (NLP) is essential for enabling machines to understand and interpret human language. Preprocessing is a crucial step in NLP that prepares raw text data for analysis. In our application, which focuses on distinguishing between AI-generated and human-written texts, effective preprocessing ensures accurate and meaningful results.
    
    

HUMANIZED TEXT:


      Natural   Language   Processing   (  NLP  )   is   crucial   for   enabling   computers   to   read   and   understand   human   words  .   Preprocessing   is   a   crucial   step   in   NLP   this   prepares   raw   text   information   for   analysis  .   In   our   program  ,   this   focus   on   discriminating   between   AI  -  produced   and   human  -  authored   text  ,   effective   preprocessing   guarantees   accurate   and   meaningful   responses  .  
    
     

STATISTICS:
Total words replaced: 16
Unique replaceme


