# **Day-7**

In [3]:
import nltk
import spacy
from nltk.corpus import stopwords

# Make sure the NLTK "stopwords" corpus is available; preprocess_text below
# reads it through stopwords.words("english").
nltk.download('stopwords')

# Load the spaCy "en_core_web_sm" English model once at module level; the
# resulting `nlp` object is what preprocess_text calls to tokenize text.
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    """Lowercase ``text``, tokenize it with spaCy, and keep only content words.

    A token survives when it is not in NLTK's English stopword list and its
    ``is_alpha`` flag is set (so punctuation tokens are dropped as well).

    Args:
        text: Raw input string.

    Returns:
        The surviving token texts joined by single spaces.
    """
    # Tokenize the lowercased input with the module-level spaCy pipeline.
    doc = nlp(text.lower())

    # A set gives constant-time membership checks for the stopword test.
    english_stopwords = set(stopwords.words("english"))

    kept = []
    for tok in doc:
        word = tok.text
        if word in english_stopwords:
            continue
        if not tok.is_alpha:
            continue
        kept.append(word)

    return " ".join(kept)

# Example usage: run preprocess_text on a short sentence that mixes upper-case
# letters, stopwords, and punctuation, then print the cleaned string.
text = "This is a Sample Text! It contains stopwords and punctuation."
processed_text = preprocess_text(text)
print("Processed Text:", processed_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Processed Text: sample text contains stopwords punctuation


# **Day-9**

In [4]:
import gensim
import nltk
from gensim.utils import simple_preprocess
from nltk.stem import PorterStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Download the NLTK resources for this cell. 'stopwords' feeds the stopword
# filter and 'wordnet' is used for lemmatization in preprocess_text below.
# NOTE(review): 'punkt' is downloaded too, but nothing visible in this cell
# uses it (word_tokenize is imported and never called) — confirm it is needed.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Create the stemmer and lemmatizer once at module level; preprocess_text
# reuses these same instances on every call.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Tokenize ``text`` and derive stopword-free, stemmed and lemmatized views.

    The input is lowercased and tokenized with Gensim's ``simple_preprocess``.
    NLTK English stopwords are then removed, and the remaining tokens are
    independently (a) stemmed with the module-level ``stemmer`` and
    (b) lemmatized as verbs with the module-level ``lemmatizer``.

    Args:
        text: Raw input string.

    Returns:
        A dict with four keys: ``"original_tokens"``, ``"filtered_tokens"``,
        ``"stemmed_words"`` and ``"lemmatized_words"``.
    """
    tokens = simple_preprocess(text.lower())

    # Drop every token that appears in the English stopword list.
    english_stopwords = set(stopwords.words("english"))
    filtered_tokens = list(filter(lambda tok: tok not in english_stopwords, tokens))

    # Stemming and lemmatization both start from the filtered tokens; the
    # lemmatizer does not see the stemmed forms.
    stemmed_words = list(map(stemmer.stem, filtered_tokens))

    lemmatized_words = []
    for tok in filtered_tokens:
        # Every token is lemmatized with the verb part-of-speech tag.
        lemmatized_words.append(lemmatizer.lemmatize(tok, wordnet.VERB))

    return dict(
        original_tokens=tokens,
        filtered_tokens=filtered_tokens,
        stemmed_words=stemmed_words,
        lemmatized_words=lemmatized_words,
    )

# Read sample text from a file and run it through preprocess_text.
file_path = "sample.txt"  # Change this to your actual file path
try:
    with open(file_path, "r", encoding="utf-8") as file:
        text = file.read()
except FileNotFoundError:
    # Only the open/read step is guarded, so a FileNotFoundError raised later
    # (during preprocessing or printing) is not misreported as a missing
    # input file.
    print("Error: The file was not found. Please check the file path.")
else:
    # Runs only when the file was read successfully.
    processed_data = preprocess_text(text)

    # Print results
    print("Original Tokens:", processed_data["original_tokens"])
    print("Filtered Tokens (No Stopwords):", processed_data["filtered_tokens"])
    print("Stemmed Words:", processed_data["stemmed_words"])
    print("Lemmatized Words:", processed_data["lemmatized_words"])


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Error: The file was not found. Please check the file path.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
