In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:
url = "https://en.wikipedia.org/wiki/Natural_language_processing"

response = requests.get(url)

In [None]:
soup = BeautifulSoup(response.text, "html.parser")

title = soup.find("h1", {"id": "firstHeading"}).text
print(f"Page Title: {title}")

Page Title: Natural language processing


In [None]:
paragraphs = soup.find_all("p")

first_paragraph = ""
for para in paragraphs:
    text = para.get_text(strip=True)
    if text:
        first_paragraph = text
        break

print(f"\nFirst Paragraph: {first_paragraph}")


First Paragraph: Natural language processing(NLP) is a subfield ofcomputer scienceand especiallyartificial intelligence. It is primarily concerned with providing computers with the ability to process data encoded innatural languageand is thus closely related toinformation retrieval,knowledge representationandcomputational linguistics, a subfield oflinguistics. Typically data is collected intext corpora, using either rule-based, statistical or neural-based approaches inmachine learninganddeep learning.


In [None]:
headings = [h2.text.strip() for h2 in soup.find_all("h2")]
print("\nHeadings:")
for h in headings:
    print("-", h)


Headings:
- Contents
- History
- Approaches: Symbolic, statistical, neural networks
- Common NLP tasks
- General tendencies and (possible) future directions
- See also
- References
- Further reading
- External links


In [None]:
links = [a['href'] for a in soup.find_all("a", href=True) if a['href'].startswith("/wiki/")]
full_links = ["https://en.wikipedia.org" + link for link in links[:10]]
print("\nSome Wikipedia Links:")
for link in full_links:
    print(link)


Some Wikipedia Links:
https://en.wikipedia.org/wiki/Main_Page
https://en.wikipedia.org/wiki/Wikipedia:Contents
https://en.wikipedia.org/wiki/Portal:Current_events
https://en.wikipedia.org/wiki/Special:Random
https://en.wikipedia.org/wiki/Wikipedia:About
https://en.wikipedia.org/wiki/Help:Contents
https://en.wikipedia.org/wiki/Help:Introduction
https://en.wikipedia.org/wiki/Wikipedia:Community_portal
https://en.wikipedia.org/wiki/Special:RecentChanges
https://en.wikipedia.org/wiki/Wikipedia:File_upload_wizard


In [None]:
cleaned_text = []
for para in paragraphs:
    text = para.get_text(strip=True)
    if text:
        cleaned_text.append(text)
    if len(cleaned_text) >= 3:
        break

In [None]:
cleaned_text

['Natural language processing(NLP) is a subfield ofcomputer scienceand especiallyartificial intelligence. It is primarily concerned with providing computers with the ability to process data encoded innatural languageand is thus closely related toinformation retrieval,knowledge representationandcomputational linguistics, a subfield oflinguistics. Typically data is collected intext corpora, using either rule-based, statistical or neural-based approaches inmachine learninganddeep learning.',
 'Major tasks in natural language processing arespeech recognition,text classification,natural-language understanding, andnatural-language generation.',
 'Natural language processing has its roots in the 1950s.[1]Already in 1950,Alan Turingpublished an article titled "Computing Machinery and Intelligence" which proposed what is now called theTuring testas a criterion of intelligence, though at the time that was not articulated as a problem separate from artificial intelligence. The proposed test inclu

In [None]:
import re

article_text = " ".join(cleaned_text)


article_text = re.sub(r'http\S+|www.\S+', '', article_text)

In [None]:
article_text = re.sub(r'[^A-Za-z0-9,.!? ]+', '', article_text)

In [None]:
article_text

'Natural language processingNLP is a subfield ofcomputer scienceand especiallyartificial intelligence. It is primarily concerned with providing computers with the ability to process data encoded innatural languageand is thus closely related toinformation retrieval,knowledge representationandcomputational linguistics, a subfield oflinguistics. Typically data is collected intext corpora, using either rulebased, statistical or neuralbased approaches inmachine learninganddeep learning. Major tasks in natural language processing arespeech recognition,text classification,naturallanguage understanding, andnaturallanguage generation. Natural language processing has its roots in the 1950s.1Already in 1950,Alan Turingpublished an article titled Computing Machinery and Intelligence which proposed what is now called theTuring testas a criterion of intelligence, though at the time that was not articulated as a problem separate from artificial intelligence. The proposed test includes a task that inv

In [None]:
example_text = 'Despite the recent advancements in artificial intelligence, the ethical implications surrounding the development and deployment of advanced machine learning models, particularly those designed for high-stakes decision-making, remain a complex and multifaceted issue. While the potential benefits of AI in domains like healthcare, finance, and criminal justice are significant, concerns regarding algorithmic bias, lack of transparency in decision-making processes, and potential for unintended societal consequences raise serious questions about responsible AI development. As such, a collaborative effort between researchers, policymakers, and industry leaders is crucial to establish robust ethical frameworks that ensure AI systems are developed and utilized in a manner that aligns with human values, promotes fairness, and safeguards individual rights, particularly in contexts where human lives may be impacted by algorithmic outcomes.'

In [None]:
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize

In [None]:
# Download the necessary NLTK resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('all')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       t

True

In [None]:
tokens_ = word_tokenize(article_text)
print("Tokens:", tokens_)

Tokens: ['Natural', 'language', 'processingNLP', 'is', 'a', 'subfield', 'ofcomputer', 'scienceand', 'especiallyartificial', 'intelligence', '.', 'It', 'is', 'primarily', 'concerned', 'with', 'providing', 'computers', 'with', 'the', 'ability', 'to', 'process', 'data', 'encoded', 'innatural', 'languageand', 'is', 'thus', 'closely', 'related', 'toinformation', 'retrieval', ',', 'knowledge', 'representationandcomputational', 'linguistics', ',', 'a', 'subfield', 'oflinguistics', '.', 'Typically', 'data', 'is', 'collected', 'intext', 'corpora', ',', 'using', 'either', 'rulebased', ',', 'statistical', 'or', 'neuralbased', 'approaches', 'inmachine', 'learninganddeep', 'learning', '.', 'Major', 'tasks', 'in', 'natural', 'language', 'processing', 'arespeech', 'recognition', ',', 'text', 'classification', ',', 'naturallanguage', 'understanding', ',', 'andnaturallanguage', 'generation', '.', 'Natural', 'language', 'processing', 'has', 'its', 'roots', 'in', 'the', '1950s.1Already', 'in', '1950', ',

In [None]:
# Step 1: Split the text based on spaces (words)
tokens = article_text.split()

# Step 2: Remove punctuation and tokenize further
# We'll use regular expressions to remove punctuation marks from the tokens
tokens = [re.sub(r'[^\w\s]', '', token) for token in tokens]

print("Tokens:", tokens)

Tokens: ['Natural', 'language', 'processingNLP', 'is', 'a', 'subfield', 'ofcomputer', 'scienceand', 'especiallyartificial', 'intelligence', 'It', 'is', 'primarily', 'concerned', 'with', 'providing', 'computers', 'with', 'the', 'ability', 'to', 'process', 'data', 'encoded', 'innatural', 'languageand', 'is', 'thus', 'closely', 'related', 'toinformation', 'retrievalknowledge', 'representationandcomputational', 'linguistics', 'a', 'subfield', 'oflinguistics', 'Typically', 'data', 'is', 'collected', 'intext', 'corpora', 'using', 'either', 'rulebased', 'statistical', 'or', 'neuralbased', 'approaches', 'inmachine', 'learninganddeep', 'learning', 'Major', 'tasks', 'in', 'natural', 'language', 'processing', 'arespeech', 'recognitiontext', 'classificationnaturallanguage', 'understanding', 'andnaturallanguage', 'generation', 'Natural', 'language', 'processing', 'has', 'its', 'roots', 'in', 'the', '1950s1Already', 'in', '1950Alan', 'Turingpublished', 'an', 'article', 'titled', 'Computing', 'Machin

In [None]:
# Step 2: Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
print("Lemmatized Tokens:", lemmatized_tokens)

Lemmatized Tokens: ['Natural', 'language', 'processingNLP', 'is', 'a', 'subfield', 'ofcomputer', 'scienceand', 'especiallyartificial', 'intelligence', 'It', 'is', 'primarily', 'concerned', 'with', 'providing', 'computer', 'with', 'the', 'ability', 'to', 'process', 'data', 'encoded', 'innatural', 'languageand', 'is', 'thus', 'closely', 'related', 'toinformation', 'retrievalknowledge', 'representationandcomputational', 'linguistics', 'a', 'subfield', 'oflinguistics', 'Typically', 'data', 'is', 'collected', 'intext', 'corpus', 'using', 'either', 'rulebased', 'statistical', 'or', 'neuralbased', 'approach', 'inmachine', 'learninganddeep', 'learning', 'Major', 'task', 'in', 'natural', 'language', 'processing', 'arespeech', 'recognitiontext', 'classificationnaturallanguage', 'understanding', 'andnaturallanguage', 'generation', 'Natural', 'language', 'processing', 'ha', 'it', 'root', 'in', 'the', '1950s1Already', 'in', '1950Alan', 'Turingpublished', 'an', 'article', 'titled', 'Computing', 'Mac

In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")
spacy_lemmatized = [token.lemma_ for token in nlp(article_text).doc]

In [None]:
from textblob import Word
textblob_lemmatized = [Word(token).lemmatize() for token in tokens]

In [None]:
print("Spacy Lemmatized Tokens:", spacy_lemmatized)
print("TextBlob Lemmatized Tokens:", textblob_lemmatized)

Spacy Lemmatized Tokens: ['natural', 'language', 'processingnlp', 'be', 'a', 'subfield', 'ofcomputer', 'scienceand', 'especiallyartificial', 'intelligence', '.', 'it', 'be', 'primarily', 'concern', 'with', 'provide', 'computer', 'with', 'the', 'ability', 'to', 'process', 'datum', 'encode', 'innatural', 'languageand', 'be', 'thus', 'closely', 'relate', 'toinformation', 'retrieval', ',', 'knowledge', 'representationandcomputational', 'linguistic', ',', 'a', 'subfield', 'oflinguistic', '.', 'typically', 'datum', 'be', 'collect', 'intext', 'corpora', ',', 'use', 'either', 'rulebase', ',', 'statistical', 'or', 'neuralbased', 'approach', 'inmachine', 'learninganddeep', 'learning', '.', 'major', 'task', 'in', 'natural', 'language', 'process', 'arespeech', 'recognition', ',', 'text', 'classification', ',', 'naturallanguage', 'understanding', ',', 'andnaturallanguage', 'generation', '.', 'natural', 'language', 'processing', 'have', 'its', 'root', 'in', 'the', '1950s.1already', 'in', '1950,Alan'

In [None]:
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer, RegexpStemmer

# Initialize the stemmers
porter_stemmer = PorterStemmer()
lancaster_stemmer = LancasterStemmer()
snowball_stemmer = SnowballStemmer("english")
regexp_stemmer = RegexpStemmer(r'(?i)(ed|ing|es|s|ly|ation|ification)$')

# Apply each stemmer
porter_stemmed = [porter_stemmer.stem(token) for token in tokens]
lancaster_stemmed = [lancaster_stemmer.stem(token) for token in tokens]
snowball_stemmed = [snowball_stemmer.stem(token) for token in tokens]
regexp_stemmed = [regexp_stemmer.stem(token) for token in tokens]

# Print the results for each stemmer
print("Porter Stemmed Tokens:", porter_stemmed)
print("Lancaster Stemmed Tokens:", lancaster_stemmed)
print("Snowball Stemmed Tokens:", snowball_stemmed)
print("Regexp Stemmed Tokens:", regexp_stemmed)

Porter Stemmed Tokens: ['natur', 'languag', 'processingnlp', 'is', 'a', 'subfield', 'ofcomput', 'scienceand', 'especiallyartifici', 'intellig', 'it', 'is', 'primarili', 'concern', 'with', 'provid', 'comput', 'with', 'the', 'abil', 'to', 'process', 'data', 'encod', 'innatur', 'languageand', 'is', 'thu', 'close', 'relat', 'toinform', 'retrievalknowledg', 'representationandcomput', 'linguist', 'a', 'subfield', 'oflinguist', 'typic', 'data', 'is', 'collect', 'intext', 'corpora', 'use', 'either', 'rulebas', 'statist', 'or', 'neuralbas', 'approach', 'inmachin', 'learninganddeep', 'learn', 'major', 'task', 'in', 'natur', 'languag', 'process', 'arespeech', 'recognitiontext', 'classificationnaturallanguag', 'understand', 'andnaturallanguag', 'gener', 'natur', 'languag', 'process', 'ha', 'it', 'root', 'in', 'the', '1950s1alreadi', 'in', '1950alan', 'turingpublish', 'an', 'articl', 'titl', 'comput', 'machineri', 'and', 'intellig', 'which', 'propos', 'what', 'is', 'now', 'call', 'thetur', 'testa',

In [None]:
sentences = sent_tokenize(example_text)

vocabulary_size = len(set(tokens))
num_words = len(tokens)
num_documents = len(sentences)

In [None]:
print(f"Vocabulary Size: {vocabulary_size}")
print(f"Number of Words: {num_words}")
print(f"Number of Documents (Sentences): {num_documents}")

Vocabulary Size: 98
Number of Words: 127
Number of Documents (Sentences): 3
