In [1]:
import requests
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.probability import FreqDist

#scrape
url = "https://www.gutenberg.org/cache/epub/15776/pg15776.txt"
response = requests.get(url)
text_data = response.text
print("Text loaded successfully. Sample:")
print(text_data[:500])

#download nltk data
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

#sentence splitting
sentences = nltk.sent_tokenize(text_data)
print(f"\nTotal sentences found: {len(sentences)}")

#word tokenization
tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
print("\nSample tokenization:")
print(tokenized_sentences[:2])

#lowercase and remove punctuation
punct = set(string.punctuation)
clean_tokens = []
for sentence in tokenized_sentences:
    for word in sentence:
        word = word.lower()
        if word not in punct:
            clean_tokens.append(word)
print("\nSample cleaned tokens:", clean_tokens[:20])

#stopword removal
stop_words = set(stopwords.words("english"))
tokens_no_stop = [word for word in clean_tokens if word not in stop_words]
print("\nTokens after removing stopwords:", tokens_no_stop[:20])

#lemmatizer
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens_no_stop]

print("\nLemmatized sample:", lemmatized_tokens[:20])

#stemming
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(word) for word in lemmatized_tokens]
print("\nStemmed sample:", stemmed_tokens[:20])

#word frquenc
freq_dist = FreqDist(lemmatized_tokens)

print("\nMost Common Words:")
print(freq_dist.most_common(20))


Text loaded successfully. Sample:
﻿The Project Gutenberg eBook of The Economic Consequences of the Peace
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...



Total sentences found: 2612

Sample tokenization:
[['\ufeffThe', 'Project', 'Gutenberg', 'eBook', 'of', 'The', 'Economic', 'Consequences', 'of', 'the', 'Peace', 'This', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'in', 'the', 'United', 'States', 'and', 'most', 'other', 'parts', 'of', 'the', 'world', 'at', 'no', 'cost', 'and', 'with', 'almost', 'no', 'restrictions', 'whatsoever', '.'], ['You', 'may', 'copy', 'it', ',', 'give', 'it', 'away', 'or', 're-use', 'it', 'under', 'the', 'terms', 'of', 'the', 'Project', 'Gutenberg', 'License', 'included', 'with', 'this', 'ebook', 'or', 'online', 'at', 'www.gutenberg.org', '.']]

Sample cleaned tokens: ['\ufeffthe', 'project', 'gutenberg', 'ebook', 'of', 'the', 'economic', 'consequences', 'of', 'the', 'peace', 'this', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere']

Tokens after removing stopwords: ['\ufeffthe', 'project', 'gutenberg', 'ebook', 'economic', 'consequences', 'peace', 'ebook', 'use', 'anyone', 'any

In [3]:
import requests
url = 'https://www.gutenberg.org/cache/epub/15776/pg15776.txt'

response = requests.get(url)
text_data = response.text
print(f"Successfully loaded text data. First 500 characters:\n{text_data[:500]}")            
import nltk
nltk.download('punkt')                                                                             
print("NLTK 'punkt' data download initiated. This message confirms the attempt to download or verify the presence of the data.")                                                                         
print("\n Sample of Processed Output ")

Successfully loaded text data. First 500 characters:
﻿The Project Gutenberg eBook of The Economic Consequences of the Peace
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
NLTK 'punkt' data download initiated. This message confirms the attempt to download or verify the presence of the data.

 Sample of Processed Output 


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
num_samples = 5 

for i in range(min(num_samples, len(sentences))):
    print(f"\nOriginal Sentence {i+1}: {sentences[i]}")
    print(f"Tokenized Sentence {i+1}: {tokenized_sentences[i]}")                   
print("NLTK 'punkt' data download initiated. This message confirms the attempt to download or verify the presence of the data.")                                                       


Original Sentence 1: ﻿The Project Gutenberg eBook of The Economic Consequences of the Peace
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever.
Tokenized Sentence 1: ['\ufeffThe', 'Project', 'Gutenberg', 'eBook', 'of', 'The', 'Economic', 'Consequences', 'of', 'the', 'Peace', 'This', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'in', 'the', 'United', 'States', 'and', 'most', 'other', 'parts', 'of', 'the', 'world', 'at', 'no', 'cost', 'and', 'with', 'almost', 'no', 'restrictions', 'whatsoever', '.']

Original Sentence 2: You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org.
Tokenized Sentence 2: ['You', 'may', 'copy', 'it', ',', 'give', 'it', 'away', 'or', 're-use', 'it', 'under', 'the', 'terms', 'of', 'the', 'Project', 'Gutenberg', 'License', 'included', 'with', 'this