In [2]:
!pip install requests beautifulsoup4 nltk



In [5]:
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Fetch the NLTK resources used below, in order: the punkt_tab tokenizer
# data, the WordNet corpus, and omw-1.4 (needed by the lemmatizer).
for _resource in ('punkt_tab', 'wordnet', 'omw-1.4'):
  nltk.download(_resource)

def process_webpage(url, timeout=10):
  """Download a web page and compute basic token statistics for its text.

  The page is fetched with ``requests``, its visible text is extracted with
  BeautifulSoup's ``html.parser``, and the text is split into sentences and
  word tokens with NLTK. Each token is then lemmatized (WordNet) and
  stemmed (Porter).

  Note that ``num_token_types`` counts the tokens exactly as produced by
  ``word_tokenize`` (case-sensitive), whereas the lemma and stem counts are
  computed from lower-cased tokens. Part of the difference between these
  counts therefore comes from case folding alone.

  Args:
    url: Address of the page to download.
    timeout: Seconds to wait for the server before giving up. Passed
      straight to ``requests.get``.

  Returns:
    A dict with the keys ``num_sentences``, ``num_words``,
    ``num_token_types``, ``num_lemma_types``, ``num_stem_types`` (ints) and
    ``sample_sentences`` (first 5 sentences), ``sample_words``,
    ``sample_lemmatized_words``, ``sample_stemmed_words`` (first 20 items
    each).

  Raises:
    requests.exceptions.HTTPError: The server answered with a 4xx/5xx status.
    requests.exceptions.RequestException: Any other request failure,
      including a timeout.
  """
  # Without an explicit timeout requests can wait forever on a stalled server.
  response = requests.get(url, timeout=timeout)
  # Fail loudly instead of computing statistics for an HTTP error page.
  response.raise_for_status()

  # Extract text from html
  soup = BeautifulSoup(response.text, 'html.parser')
  text = soup.get_text(separator=" ", strip=True)

  # Tokenize the text
  sentences = sent_tokenize(text)
  words = word_tokenize(text)

  # Lower-case once; both the lemmatizer and the stemmer work on this list.
  lowered_words = [word.lower() for word in words]

  # Lemmatization
  lemmatizer = WordNetLemmatizer()
  lemmatized_words = [lemmatizer.lemmatize(word) for word in lowered_words]

  # Stemming
  stemmer = PorterStemmer()
  stemmed_words = [stemmer.stem(word) for word in lowered_words]

  return {
      "num_sentences": len(sentences),
      "num_words": len(words),
      "num_token_types": len(set(words)),
      "num_lemma_types": len(set(lemmatized_words)),
      "num_stem_types": len(set(stemmed_words)),
      "sample_sentences": sentences[:5],
      "sample_words": words[:20],
      "sample_lemmatized_words": lemmatized_words[:20],
      "sample_stemmed_words": stemmed_words[:20]
  }

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [9]:
# Run the pipeline on one page and print every field of the result.
url = "https://www.wikihow.com/Tie-a-Tie"
result = process_webpage(url)

# (label, result key) pairs in the order they are printed: counts first,
# then the sample slices.
_report_rows = [
    ("Number of sentences:", "num_sentences"),
    ("Number of words:", "num_words"),
    ("Number of token types:", "num_token_types"),
    ("Number of lemma types:", "num_lemma_types"),
    ("Number of stem types:", "num_stem_types"),
    ("Sample sentences:", "sample_sentences"),
    ("Sample words:", "sample_words"),
    ("Sample lemmatized words:", "sample_lemmatized_words"),
    ("Sample stemmed words:", "sample_stemmed_words"),
]

for _label, _key in _report_rows:
  print(_label, result[_key])

Number of sentences: 234
Number of words: 4616
Number of token types: 1085
Number of lemma types: 900
Number of stem types: 849
Sample sentences: ["4 Ways to Tie a Tie - wikiHow Skip to Content Quizzes PRO Courses Hot Guides Tech Help Pro Expert Videos About wikiHow Pro Upgrade QUIZZES All Quizzes Love Quizzes Personality Quizzes Trivia Quizzes Taylor Swift Quizzes EDIT Edit this Article EXPLORE Tech Help Pro About Us Random Article Quizzes Request a New Article Community Dashboard This Or That Game Forums Arts and Entertainment Artwork Books Movies Computers and Electronics Computers Phone Skills Technology Hacks Health Men's Health Mental Health Women's Health Relationships Dating Love Relationship Issues Hobbies and Crafts Crafts Drawing Games Education & Communication Communication Skills Personal Development Studying Personal Care and Style Fashion Hair Care Personal Hygiene Quizzes Love Quizzes Personality Quizzes Fun Games Forums Arts and Entertainment Finance and Business Home 

# Provided Answer below

In [None]:
import urllib.request
import nltk
from bs4 import BeautifulSoup

# Download one page, extract its text, and report sentence, token,
# token-type, lemma-type and stem-type counts.
url="https://en.wikipedia.org/wiki/Natural_language_preprocessing"

custom_user_agent = 'Mozilla/5.0 (Windows NT 10.0;Win64;x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'

# Create a dictionary for headers, including User-Agent
headers = {
    'User-Agent':custom_user_agent
    }

# Create a Request object with the URL and headers
req = urllib.request.Request(url,headers=headers)

print("Ready to collect pages......")

# Download the webpage of a given link.
# The timeout keeps the cell from hanging forever on an unresponsive server.
with urllib.request.urlopen(req, timeout=10) as response:
    # Use the charset declared in the Content-Type header, falling back to
    # UTF-8 when none is declared. errors="replace" keeps a few undecodable
    # bytes from aborting the whole run with UnicodeDecodeError.
    charset = response.headers.get_content_charset() or 'utf-8'
    html = response.read().decode(charset, errors='replace')

print('HTML page downloaded')

# Extract the text content of the page (requires the lxml package)
text = BeautifulSoup(html, "lxml").get_text()

# Split text into sentences and count sentences
sentences = nltk.tokenize.sent_tokenize(text)
print("Number of sentences:" + str(len(sentences)))

# Split text into tokens and count token types
tokens = nltk.tokenize.word_tokenize(text)
print('Number of tokens:' + str(len(tokens)))
token_types = list(set(tokens))
print("Number of token types:" + str(len(token_types)))

# Find lemmas or stems of tokens and count lemma types
# Do stemming on the tokens and count unique stemmed tokens
wnl = nltk.stem.WordNetLemmatizer()

stemmer = nltk.stem.porter.PorterStemmer()
lemma_types = set()
stemmed_types = set()

# Only the unique token types are processed: repeated tokens cannot add new
# lemma or stem types.
for token_type in token_types:
    lemma_types.add(wnl.lemmatize(token_type))
    stemmed_types.add(stemmer.stem(token_type))

print("Number of lemma types:" + str(len(lemma_types)))
print("Number of stemmed types:" + str(len(stemmed_types)))