In [2]:
#day 7 assignment
import gensim
from gensim.utils import simple_preprocess
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
import nltk

# Fetch the NLTK data packages this cell needs: the stopword lists and the
# WordNet data used by the lemmatizer. nltk.download is called once per package.
for resource in ("stopwords", "wordnet", "omw-1.4"):
    nltk.download(resource)

# Module-level helpers shared by preprocess_text below.
# English stopwords are held in a set so membership checks are cheap.
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Define preprocessing function
def preprocess_text(text):
    """Turn raw text into a list of normalized tokens.

    Each token produced by Gensim's ``simple_preprocess`` is dropped if it is
    in the module-level ``stop_words`` set; otherwise it is stemmed with the
    module-level ``stemmer`` and the stem is then passed through the
    module-level ``lemmatizer``.

    Args:
        text: The raw input string to tokenize.

    Returns:
        A list of processed token strings, in their original order
        (duplicates are kept). An empty string yields an empty list.
    """
    # deacc=True strips accent marks from the text; punctuation is discarded
    # by simple_preprocess's own tokenization, not by this flag.
    raw_tokens = simple_preprocess(text, deacc=True)

    result = []
    for token in raw_tokens:
        # Stopwords are filtered before stemming, so the check is made
        # against the unstemmed token.
        if token in stop_words:
            continue
        # NOTE(review): the lemmatizer receives the *stemmed* form; stems are
        # often not dictionary words, so this step may rarely change anything
        # -- confirm this ordering is intended.
        result.append(lemmatizer.lemmatize(stemmer.stem(token)))

    return result

# Read the whole input file as UTF-8 text. A missing file is reported and
# treated as empty input so the remaining steps still run.
input_file = "/content/sample_text.txt"
try:
    with open(input_file, "r", encoding="utf-8") as file:
        text_data = file.read()
except FileNotFoundError:
    text_data = ""  # fall back to empty text
    print(f"The file '{input_file}' was not found.")

# Run the preprocessing pipeline on whatever text was loaded
processed_tokens = preprocess_text(text_data)

# Show the raw text followed by the resulting token list
print("Original Text:", text_data, sep="\n")
print("\nProcessed Tokens:", processed_tokens, sep="\n")

# Persist the tokens as a single space-separated line
output_file = "processed_tokens.txt"
with open(output_file, "w", encoding="utf-8") as file:
    file.write(" ".join(processed_tokens))

print(f"Processed tokens have been saved to '{output_file}'.")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Original Text:
Natural Language Processing (NLP) is a branch of artificial intelligence that focuses on the interaction between computers and humans through natural language. It involves a variety of techniques, such as tokenization, stemming, and lemmatization, to process and analyze text data efficiently. By understanding the context and semantics of language, NLP enables machines to perform tasks like language translation, sentiment analysis, and information retrieval.


Processed Tokens:
['natur', 'languag', 'process', 'nlp', 'branch', 'artifici', 'intellig', 'focus', 'interact', 'comput', 'human', 'natur', 'languag', 'involv', 'varieti', 'techniqu', 'token', 'stem', 'lemmat', 'process', 'analyz', 'text', 'data', 'effici', 'understand', 'context', 'semant', 'languag', 'nlp', 'enabl', 'machin', 'perform', 'task', 'like', 'languag', 'translat', 'sentiment', 'analysi', 'inform', 'retriev']
Processed tokens have been saved to 'processed_tokens.txt'.


In [4]:
#day8 assignment
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

# Fetch the "punkt_tab" tokenizer data package before tokenizing
nltk.download("punkt_tab")

# Paragraph used as the tokenization example
sample_paragraph = "Natural Language Processing is a fascinating field of AI. It deals with the interaction between computers and humans using natural language. Tokenization is one of its fundamental steps."

# Split the same paragraph two ways: into sentences and into word tokens
sentences = sent_tokenize(sample_paragraph)
words = word_tokenize(sample_paragraph)

# Show the input followed by both tokenizations
print("Original Paragraph:", sample_paragraph, sep="\n")
print("\nTokenized Sentences:", sentences, sep="\n")
print("\nTokenized Words:", words, sep="\n")

# Persist the results: one sentence per line, words space-separated
with open("tokenized_sentences.txt", "w", encoding="utf-8") as sent_file:
    sent_file.write("\n".join(sentences))

with open("tokenized_words.txt", "w", encoding="utf-8") as word_file:
    word_file.write(" ".join(words))

print(
    "Tokenized sentences have been saved to 'tokenized_sentences.txt'.",
    "Tokenized words have been saved to 'tokenized_words.txt'.",
    sep="\n",
)


Original Paragraph:
Natural Language Processing is a fascinating field of AI. It deals with the interaction between computers and humans using natural language. Tokenization is one of its fundamental steps.

Tokenized Sentences:
['Natural Language Processing is a fascinating field of AI.', 'It deals with the interaction between computers and humans using natural language.', 'Tokenization is one of its fundamental steps.']

Tokenized Words:
['Natural', 'Language', 'Processing', 'is', 'a', 'fascinating', 'field', 'of', 'AI', '.', 'It', 'deals', 'with', 'the', 'interaction', 'between', 'computers', 'and', 'humans', 'using', 'natural', 'language', '.', 'Tokenization', 'is', 'one', 'of', 'its', 'fundamental', 'steps', '.']
Tokenized sentences have been saved to 'tokenized_sentences.txt'.
Tokenized words have been saved to 'tokenized_words.txt'.


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
