<a href="https://colab.research.google.com/github/krishjagwani/DAV_72/blob/main/DAV_7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Tokenization



In [None]:
import nltk

# word_tokenize relies on the Punkt sentence-tokenizer data. NLTK >= 3.9 loads
# it from the 'punkt_tab' package rather than the older pickled 'punkt'
# package, so fetch both to keep the tokenization cells working on old and
# new NLTK releases alike.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
!pip install tokenizers



In [None]:
!pip install nltk

import nltk
from nltk.tokenize import word_tokenize

# Sample text
text = "Tokenization is the process of breaking down text into smaller units called tokens."

# Tokenize the text into words
tokens = word_tokenize(text)

# Print the tokens
print(tokens)

['Tokenization', 'is', 'the', 'process', 'of', 'breaking', 'down', 'text', 'into', 'smaller', 'units', 'called', 'tokens', '.']


# Frequency Distribution

In [None]:
from nltk import FreqDist

In [None]:
# Count occurrences of every token exactly as produced by the tokenizer
# (punctuation tokens are counted as well).
freq_dist = FreqDist(tokens)

# Show the (token, count) pairs returned by most_common()
token_counts = freq_dist.most_common()
print(token_counts)

[('Tokenization', 1), ('is', 1), ('the', 1), ('process', 1), ('of', 1), ('breaking', 1), ('down', 1), ('text', 1), ('into', 1), ('smaller', 1), ('units', 1), ('called', 1), ('tokens', 1), ('.', 1)]


# Remove stopwords & punctuation

In [None]:
# Imports for the stopword / punctuation filtering cell that follows.
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
# Fetch the stopword lists read via stopwords.words(...) in the next cell.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
text = "Tokenization is the process of breaking down text into smaller units called tokens,It is important in natural language processing."

# Tokenize the text
tokens = word_tokenize(text)

# Remove stopwords; tokens are lower-cased for the comparison so that
# capitalised stopwords such as "It" are dropped too.
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

# Remove punctuation: keep a token only if it contains at least one
# non-punctuation character. The previous `word not in string.punctuation`
# was a substring test, so multi-character punctuation tokens that are not a
# contiguous slice of string.punctuation (e.g. "``", "''", "...") slipped
# through.
punctuation_chars = set(string.punctuation)
filtered_tokens = [
    word for word in filtered_tokens
    if any(char not in punctuation_chars for char in word)
]

# Print filtered tokens
print(filtered_tokens)

['Tokenization', 'process', 'breaking', 'text', 'smaller', 'units', 'called', 'tokens', 'important', 'natural', 'language', 'processing']


# Lexicon Normalization (Stemming, Lemmatization)

In [None]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Three related word forms used to demonstrate stemming
text = "running runner runs"
tokens = word_tokenize(text)

# Apply the Porter stemmer to each token in turn
stemmer = PorterStemmer()
stems = list(map(stemmer.stem, tokens))
print(stems)

['run', 'runner', 'run']


In [None]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Make sure the WordNet corpus is available before lemmatizing
nltk.download('wordnet')

# Sentence to lemmatize, split into tokens
text = "The cats are running and jumping on the beds."
tokens = word_tokenize(text)

# Lemmatize token by token. No part-of-speech is supplied, so the
# lemmatizer's default is used for every token.
lemmatizer = WordNetLemmatizer()
lemmas = list(map(lemmatizer.lemmatize, tokens))

# Show the lemmatized tokens
print(lemmas)

[nltk_data] Downloading package wordnet to /root/nltk_data...


['The', 'cat', 'are', 'running', 'and', 'jumping', 'on', 'the', 'bed', '.']


# Part of Speech tagging

In [None]:
import nltk
from nltk.tokenize import word_tokenize

# Fetch the tagger model BEFORE calling nltk.pos_tag so this cell works on a
# fresh kernel (Restart & Run All). NLTK >= 3.9 looks for the '_eng' variant
# of the package, older releases for the plain name, so request both.
for resource in ('averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng'):
    nltk.download(resource)

# Define the input explicitly instead of relying on a `text` variable left
# over from an earlier cell.
text = "The cats are running and jumping on the beds."
tokens = word_tokenize(text)

# Perform POS tagging: yields one (token, tag) pair per token
pos_tags = nltk.pos_tag(tokens)

# Print POS tagged tokens
print(pos_tags)

[('The', 'DT'), ('cats', 'NNS'), ('are', 'VBP'), ('running', 'VBG'), ('and', 'CC'), ('jumping', 'VBG'), ('on', 'IN'), ('the', 'DT'), ('beds', 'NNS'), ('.', '.')]


In [None]:
# Fetch the 'averaged_perceptron_tagger' package.
# NOTE(review): presumably this is the model nltk.pos_tag loads, in which case
# it must be available before any cell that calls pos_tag — confirm ordering.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

# Named Entity Recognition

In [None]:
# Fetch the data packages for the named-entity step below.
# NOTE(review): presumably both are required by nltk.ne_chunk — confirm.
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [None]:
import nltk
from nltk.tokenize import word_tokenize

# Sentence containing a few proper nouns, split into tokens
text = "Apple is headquartered in Cupertino, California."
tokens = word_tokenize(text)

# ne_chunk works on POS-tagged tokens, so tag first and then chunk
ne_tags = nltk.pos_tag(tokens)
ne_chunks = nltk.ne_chunk(ne_tags)

# Only labelled subtrees are named entities; plain (word, tag) tuples have
# no `label` attribute and are skipped.
for chunk in ne_chunks:
    if not hasattr(chunk, 'label'):
        continue
    entity_words = [leaf[0] for leaf in chunk]
    print(chunk.label(), ' '.join(entity_words))

GPE Apple
GPE Cupertino
GPE California


# Scrape data from a website

In [None]:
import requests
from bs4 import BeautifulSoup

# URL of the website to scrape
url = "https://en.wikipedia.org/wiki/Main_Page"

# Send a GET request to the URL. requests applies no timeout by default, so
# an unresponsive server would block the notebook indefinitely; bound the
# wait explicitly (seconds).
response = requests.get(url, timeout=10)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the webpage
    soup = BeautifulSoup(response.text, 'html.parser')

    # Collect only anchors that actually carry an href attribute; a bare
    # find_all('a') also returns anchors without one, which printed "None".
    links = soup.find_all('a', href=True)

    # Print the extracted link targets
    for link in links:
        print(link['href'])
else:
    print('Failed to retrieve the webpage. Status code:', response.status_code)


#bodyContent
/wiki/Main_Page
/wiki/Wikipedia:Contents
/wiki/Portal:Current_events
/wiki/Special:Random
/wiki/Wikipedia:About
//en.wikipedia.org/wiki/Wikipedia:Contact_us
https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en
/wiki/Help:Contents
/wiki/Help:Introduction
/wiki/Wikipedia:Community_portal
/wiki/Special:RecentChanges
/wiki/Wikipedia:File_upload_wizard
/wiki/Main_Page
/wiki/Special:Search
/w/index.php?title=Special:CreateAccount&returnto=Main+Page
/w/index.php?title=Special:UserLogin&returnto=Main+Page
/w/index.php?title=Special:CreateAccount&returnto=Main+Page
/w/index.php?title=Special:UserLogin&returnto=Main+Page
/wiki/Help:Introduction
/wiki/Special:MyContributions
/wiki/Special:MyTalk
/wiki/Main_Page
/wiki/Talk:Main_Page
/wiki/Main_Page
/w/index.php?title=Main_Page&action=edit
/w/index.php?title=Main_Page&action=history
/wiki/Main_Page
/w/index.php?title=Main_Page&action=edit
/w/ind

## Using R

In [None]:
# Install the R packages used in this section, one call per package.
# NOTE(review): the recorded output shows openNLP (and its rJava dependency)
# failing to install — confirm whether it is still required.
required_packages <- c("tm", "rvest", "tokenizers", "openNLP")
for (pkg in required_packages) {
  install.packages(pkg)
}

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

also installing the dependencies ‘NLP’, ‘Rcpp’, ‘slam’, ‘BH’


Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

also installing the dependency ‘SnowballC’


Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

also installing the dependencies ‘openNLPdata’, ‘rJava’


“installation of package ‘rJava’ had non-zero exit status”
“installation of package ‘openNLPdata’ had non-zero exit status”
“installation of package ‘openNLP’ had non-zero exit status”


In [None]:
library(tm)
library(rvest)
library(NLP)
library(tokenizers)
library(SnowballC)
#library(openNLP)

In [None]:
text <- "He raced to the grocery store. He went inside but realized he forgot his wallet. He raced back home to grab it. Once he found it, he raced to the car again and drove the Ushers."

# The tokenizers return one list element per input string; with a single
# input string the first element holds all of its tokens.
sent_tokens <- tokenize_sentences(text)[[1]]
word_tokens <- tokenize_words(text)[[1]]

cat("Sentence Tokens:", sent_tokens, "\n")
cat("Word Tokens:", word_tokens, "\n")

# Frequency distribution: tabulate the word tokens and show the two most
# frequent ones.
fdist <- table(unlist(word_tokens))
top_two <- head(sort(fdist, decreasing = TRUE), 2)
print(top_two)

Sentence Tokens: He raced to the grocery store. He went inside but realized he forgot his wallet. He raced back home to grab it. Once he found it, he raced to the car again and drove the Ushers. 
Word Tokens: he raced to the grocery store he went inside but realized he forgot his wallet he raced back home to grab it once he found it he raced to the car again and drove the ushers 

   he raced 
    6     3 


In [None]:
# Drop stopwords and any token that contains no ASCII letter
stop_words <- stopwords("en")
is_stopword <- word_tokens %in% stop_words
has_letter <- grepl("[a-zA-Z]", word_tokens)
filtered_tokens <- word_tokens[!is_stopword & has_letter]
cat("Filtered Tokens (without stopwords and punctuations):", filtered_tokens, "\n")

Filtered Tokens (without stopwords and punctuations): raced grocery store went inside realized forgot wallet raced back home grab found raced car drove ushers 


In [None]:
# Stemming: reduce each filtered token to its Snowball (Porter) stem
stemmed_tokens <- wordStem(filtered_tokens, language = "en")

# Lemmatization (approximation)
# wordStem() operates on a vector of individual words. Passing the whole
# sentence as a single string, as before, left the text unchanged apart from
# lower-casing. Split the text into words first, stem each word, then join
# them back into one string.
# NOTE(review): this is still stemming, not dictionary-based lemmatization;
# a true lemmatizer would need an additional package.
lowercase_words <- unlist(tokenize_words(tolower(text)))
lemmatized_text <- paste(wordStem(lowercase_words, language = "en"), collapse = " ")

cat("Stemmed Tokens:", stemmed_tokens, "\n")
cat("Lemmatized Text:", lemmatized_text, "\n")

Stemmed Tokens: race groceri store went insid realiz forgot wallet race back home grab found race car drove usher 
Lemmatized Text: he raced to the grocery store. he went inside but realized he forgot his wallet. he raced back home to grab it. once he found it, he raced to the car again and drove the ushers. 


In [None]:
# Web scraping: download the page and pull out all of its text content
url <- 'http://quotes.toscrape.com/'
web_page <- read_html(url)
web_text <- web_page %>% html_text()
cat("Scraped Data from the Website:\n", web_text)

Scraped Data from the Website:
 Quotes to Scrape
    
        
            
                
                    Quotes to Scrape
                
            
            
                
                
                    Login
                
                
            
        
    


    

    
        “The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”
        by Albert Einstein
        (about)
        
        
            Tags:
            change
            
            deep-thoughts
            
            thinking
            
            world
            
        
    

    
        “It is our choices, Harry, that show what we truly are, far more than our abilities.”
        by J.K. Rowling
        (about)
        
        
            Tags:
            abilities
            
            choices
            
        
    

    
        “There are only two ways to live your life. One is as though nothing