In [2]:
!pip install spacy



In [5]:
def preprocess_text(text, language='english'):
    """Lowercase ``text`` and drop stop words and non-alphabetic tokens.

    Relies on two names defined in earlier cells: ``nlp`` (called on the text
    to tokenize it; presumably a spaCy pipeline) and ``stopwords`` (queried via
    ``stopwords.words(language)``; presumably NLTK's stopwords corpus).

    Args:
        text: Input string.
        language: Name passed to ``stopwords.words`` to pick the stop-word
            list. It only selects the stop words; tokenization always goes
            through the same global ``nlp`` regardless of this value.

    Returns:
        The surviving tokens joined by single spaces. If ``stopwords.words``
        raises ``OSError`` for ``language``, a human-readable message string
        is returned instead of processed text (kept for backward
        compatibility; callers cannot tell it apart from real output by type).
    """
    text = text.lower()
    # Resolve the stop-word list first so an unavailable language returns
    # immediately instead of paying for a parse whose result is thrown away.
    try:
        stop_words = set(stopwords.words(language))
    except OSError:
        return f"Stopwords for the language '{language}' are not available."
    kept_tokens = [
        token.text
        for token in nlp(text)
        if token.is_alpha and token.text not in stop_words
    ]
    return " ".join(kept_tokens)
if __name__ == "__main__":
    # Demo: push a French sentence through preprocess_text using the French
    # stop-word list and print the cleaned result.
    sample_text = (
        "Ceci est un exemple simple pour démontrer le prétraitement du texte!"
    )
    processed_text = preprocess_text(text=sample_text, language='french')
    print("Processed Text:", processed_text)


Processed Text: ceci exemple simple démontrer prétraitement texte


In [6]:
def preprocess_text_with_lemmatization(text, language='english'):
    """Lowercase ``text``, drop stop words / non-alphabetic tokens, emit lemmas.

    Relies on two names defined in earlier cells: ``nlp`` (called on the text;
    its tokens must expose ``text``, ``lemma_`` and ``is_alpha`` — presumably a
    spaCy pipeline) and ``stopwords`` (queried via ``stopwords.words``;
    presumably NLTK's stopwords corpus).

    Args:
        text: Input string.
        language: Name passed to ``stopwords.words`` to pick the stop-word
            list. Tokenization and lemmatization always use the global ``nlp``.

    Returns:
        The lemmas of the surviving tokens joined by single spaces.

    Raises:
        Whatever ``stopwords.words(language)`` raises for an unknown language;
        unlike ``preprocess_text`` this function does not catch it.
    """
    text = text.lower()
    doc = nlp(text)
    stop_words = set(stopwords.words(language))
    lemmas = [
        token.lemma_
        for token in doc
        if token.is_alpha
        and token.text not in stop_words
        # The lemma is what gets emitted, so it must be checked as well: a
        # surface form that is not a stop word can lemmatize to one
        # (e.g. "others" -> "other") and would otherwise leak into the output.
        and token.lemma_.lower() not in stop_words
    ]
    return " ".join(lemmas)
if __name__ == "__main__":
    # Demo: several inflections of "run" in one sentence, passed through the
    # lemmatizing variant with its default language, then printed.
    sample_text = "Running runs and ran are forms of the same verb."
    processed_text = preprocess_text_with_lemmatization(text=sample_text)
    print("Processed Text:", processed_text)


Processed Text: run run run form verb


In [7]:
def preprocess_text_with_custom_stopwords(text, custom_stopwords=None):
    """Lowercase ``text`` and drop English plus caller-supplied stop words.

    Relies on two names defined in earlier cells: ``nlp`` (called on the text
    to tokenize it; presumably a spaCy pipeline) and ``stopwords`` (queried via
    ``stopwords.words('english')``; presumably NLTK's stopwords corpus).

    Args:
        text: Input string.
        custom_stopwords: Optional iterable of extra words to remove. Matching
            is case-insensitive: the text is lowercased before tokenizing, so
            string entries are lowercased too before being merged in.
            ``None`` or an empty iterable adds nothing.

    Returns:
        The surviving alphabetic tokens joined by single spaces.
    """
    text = text.lower()
    doc = nlp(text)
    stop_words = set(stopwords.words('english'))
    if custom_stopwords:
        # Tokens are compared in lowercase, so an entry such as "Custom" would
        # never match unless it is normalized the same way. Non-string entries
        # are passed through untouched, as before.
        stop_words.update(
            word.lower() if isinstance(word, str) else word
            for word in custom_stopwords
        )

    kept_tokens = [
        token.text
        for token in doc
        if token.is_alpha and token.text not in stop_words
    ]
    return " ".join(kept_tokens)
if __name__ == "__main__":
    # Demo: remove two extra words on top of the English stop-word list and
    # print what is left of the sentence.
    custom_stopwords = {'custom', 'specific'}
    sample_text = "This is a custom example where specific words are removed."
    processed_text = preprocess_text_with_custom_stopwords(
        text=sample_text,
        custom_stopwords=custom_stopwords,
    )
    print("Processed Text:", processed_text)


Processed Text: example words removed
