<a href="https://colab.research.google.com/github/hammadnajeeb123/TechWithWarrior_NLP-internship/blob/main/Tokenize_and_clean_text_data_using_NLTK_or_spaCy_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Setup & Import Libraries
First, ensure that you have the necessary libraries installed. You can uncomment the installation commands if needed.

In [2]:
# Install required libraries (uncomment if needed)
!pip install nltk spacy wordcloud langdetect

# Download necessary NLTK data (uncomment if needed)
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# For spaCy, download the English language model (uncomment if needed)
!python -m spacy download en_core_web_sm

# Import necessary libraries
import re
import nltk
import spacy
import string
import pickle
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
from collections import Counter
from spacy.lang.en import English
from langdetect import detect, DetectorFactory

# Ensures consistent results from langdetect
DetectorFactory.seed = 0

# Initialize NLTK tools
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize spaCy model
nlp = spacy.load('en_core_web_sm')


Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993221 sha256=efacb496a2bfc2d04d3abf4559ba76fe2317ad43d47553187a7aed8c1d58731d
  Stored in directory: /root/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b5462a7711ab78fba2f655d05106
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Step 2: Define the Enhanced TextPreprocessor Class
This class will encapsulate all the preprocessing functionalities.

In [3]:
class TextPreprocessor:
    def __init__(self, method='nltk', custom_stop_words=None, preserve_punctuation=False):
        """
        Initialize the TextPreprocessor.

        :param method: Method to use for preprocessing ('nltk' or 'spacy').
        :param custom_stop_words: List of custom stop words to be added.
        :param preserve_punctuation: Boolean flag to preserve punctuation.
        """
        self.method = method
        self.preserve_punctuation = preserve_punctuation
        self.custom_stop_words = custom_stop_words if custom_stop_words else []

        if method == 'nltk':
            self.lemmatizer = WordNetLemmatizer()
            self.stop_words = set(stopwords.words('english'))
        elif method == 'spacy':
            self.nlp = spacy.load('en_core_web_sm')

    def preprocess(self, text, apply_ner=False, detect_language=False):
        """
        Preprocess the input text.

        :param text: Input text to preprocess.
        :param apply_ner: Flag to apply Named Entity Recognition.
        :param detect_language: Flag to detect the language of the text.
        :return: Processed text and entities (if apply_ner is True).
        """
        if detect_language:
            # Only preprocess if text is in English
            if detect(text) != 'en':
                return text, []

        if self.method == 'nltk':
            return self._preprocess_nltk(text), []
        elif self.method == 'spacy':
            return self._preprocess_spacy(text, apply_ner)

    def _preprocess_nltk(self, text):
        """
        Preprocess text using NLTK methods.

        :param text: Input text to preprocess.
        :return: Processed text.
        """
        # Text normalization (convert to lowercase, expand contractions)
        text = self._expand_contractions(text.lower())

        # Remove special characters and numbers
        if not self.preserve_punctuation:
            text = re.sub(r'[^a-zA-Z\s]', '', text)

        # Tokenize text into words
        tokens = word_tokenize(text)

        # Remove stop words and custom stop words
        tokens = [word for word in tokens if word not in stopwords.words('english') and word not in self.custom_stop_words]

        # Lemmatize tokens
        tokens = [self.lemmatizer.lemmatize(word) for word in tokens]

        # Join tokens back to a single string
        return ' '.join(tokens)

    def _preprocess_spacy(self, text, apply_ner):
        """
        Preprocess text using spaCy methods.

        :param text: Input text to preprocess.
        :param apply_ner: Flag to apply Named Entity Recognition.
        :return: Processed text and entities (if apply_ner is True).
        """
        # Text normalization (convert to lowercase, expand contractions)
        text = self._expand_contractions(text.lower())

        # Process text with spaCy model
        doc = self.nlp(text)

        # Tokenize, remove stop words, and lemmatize
        tokens = [
            token.lemma_ for token in doc
            if not token.is_stop and token.text not in self.custom_stop_words and (self.preserve_punctuation or token.is_alpha)
        ]

        # Named Entity Recognition (NER) if enabled
        entities = [(ent.text, ent.label_) for ent in doc.ents] if apply_ner else []

        # Join tokens back to a single string
        return ' '.join(tokens), entities

    def _expand_contractions(self, text):
        """
        Expand common contractions in the text.

        :param text: Input text.
        :return: Text with expanded contractions.
        """
        contractions = {
            "can't": "cannot",
            "won't": "will not",
            "n't": " not",
            "'re": " are",
            "'s": " is",
            "'d": " would",
            "'ll": " will",
            "'t": " not",
            "'ve": " have",
            "'m": " am"
        }
        pattern = re.compile('|'.join(contractions.keys()))
        return pattern.sub(lambda x: contractions[x.group()], text)

    def tokenize_sentences(self, text):
        """
        Tokenize the text into sentences.

        :param text: Input text to tokenize.
        :return: List of sentences.
        """
        if self.method == 'nltk':
            return sent_tokenize(text)
        elif self.method == 'spacy':
            doc = self.nlp(text)
            return [sent.text for sent in doc.sents]

    def pos_tagging(self, text):
        """
        Perform Part-of-Speech tagging on the text.

        :param text: Input text for POS tagging.
        :return: List of tokens with their POS tags.
        """
        if self.method == 'nltk':
            tokens = word_tokenize(text)
            return nltk.pos_tag(tokens)
        elif self.method == 'spacy':
            doc = self.nlp(text)
            return [(token.text, token.pos_) for token in doc]

    def generate_word_cloud(self, text, max_words=100):
        """
        Generate a word cloud for visualization.

        :param text: Input text for word cloud generation.
        :param max_words: Maximum number of words to include in the word cloud.
        :return: WordCloud object.
        """
        wordcloud = WordCloud(width=800, height=400, max_words=max_words, background_color='white').generate(text)
        return wordcloud

    def handle_negations(self, text):
        """
        Handle negations in the text.

        :param text: Input text to process for negations.
        :return: Text with handled negations.
        """
        tokens = word_tokenize(text)
        negation_words = ["not", "n't", "no", "never"]
        negation_flag = False
        processed_tokens = []

        for word in tokens:
            if word in negation_words:
                negation_flag = True
            elif negation_flag:
                processed_tokens.append("NOT_" + word)
                negation_flag = False
            else:
                processed_tokens.append(word)

        return ' '.join(processed_tokens)


Step 3: Testing the Enhanced Text Preprocessor
Now that we have defined the class, we can test it with some sample text to demonstrate its capabilities.

In [4]:
# Sample text for testing
sample_text = "I don't think this is a good idea. I'm not happy with the results. NLTK or spaCy, which one is better?"

# Initialize the TextPreprocessor with custom settings
preprocessor = TextPreprocessor(method='spacy', custom_stop_words=['nltk', 'spacy'], preserve_punctuation=True)

# Preprocess the text with Named Entity Recognition enabled
processed_text, entities = preprocessor.preprocess(sample_text, apply_ner=True, detect_language=True)
print("Processed Text:", processed_text)
print("Named Entities:", entities)

# Sentence tokenization
print("Sentence Tokenization:", preprocessor.tokenize_sentences(sample_text))

# POS Tagging
print("POS Tagging:", preprocessor.pos_tagging(sample_text))

# Handling Negations
print("Text with Negations Handled:", preprocessor.handle_negations(sample_text))

# Word Cloud Visualization (displaying requires a separate environment with graphic support)
# Uncomment to visualize in an appropriate environment
# wordcloud = preprocessor.generate_word_cloud(processed_text)
# wordcloud.to_image().show()


Processed Text: think good idea . happy result . , well ?
Named Entities: []
Sentence Tokenization: ["I don't think this is a good idea.", "I'm not happy with the results.", 'NLTK or spaCy, which one is better?']
POS Tagging: [('I', 'PRON'), ('do', 'AUX'), ("n't", 'PART'), ('think', 'VERB'), ('this', 'PRON'), ('is', 'AUX'), ('a', 'DET'), ('good', 'ADJ'), ('idea', 'NOUN'), ('.', 'PUNCT'), ('I', 'PRON'), ("'m", 'AUX'), ('not', 'PART'), ('happy', 'ADJ'), ('with', 'ADP'), ('the', 'DET'), ('results', 'NOUN'), ('.', 'PUNCT'), ('NLTK', 'PROPN'), ('or', 'CCONJ'), ('spaCy', 'VERB'), (',', 'PUNCT'), ('which', 'DET'), ('one', 'PRON'), ('is', 'AUX'), ('better', 'ADJ'), ('?', 'PUNCT')]
Text with Negations Handled: I do NOT_think this is a good idea . I 'm NOT_happy with the results . NLTK or spaCy , which one is better ?


Step 4: Saving and Loading the Preprocessor
You can save the preprocessor object using pickle for later use

In [5]:
# Save the preprocessor object
with open('text_preprocessor.pkl', 'wb') as f:
    pickle.dump(preprocessor, f)

# Load the preprocessor object
with open('text_preprocessor.pkl', 'rb') as f:
    loaded_preprocessor = pickle.load(f)
