<a href="https://colab.research.google.com/github/isosceles45/DAV55/blob/master/55_DAV_EXP07.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Text Analytics in Python

In [None]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.probability import FreqDist
from nltk.corpus import stopwords
import string

nltk.download("popular")

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/names.zip.
[nltk_data]    | Downloading package shakespeare to /root/nltk_data...
[nlt

True

## Tokenization

In [None]:
sentence = """There is no more to be said, gentlemen.
            My name is Blood—Captain Blood, if you please,
            of this ship the Cinco Llagas,
            taken as a prize of war from 'Don Diego' de Espinosa y Valdez,
            who is my prisoner aboard.."""

tokens = nltk.word_tokenize(sentence)

In [None]:
tokens[0: 6]

['There', 'is', 'no', 'more', 'to', 'be']

## Parts of Speech Tagging

In [None]:
tagged = nltk.pos_tag(tokens)

* EX: Existential there
* VBZ: Verb, 3rd person singular present
* DT: Determiner
* RBR: Adverb, comparative
* TO: to (preposition)
* VB: Verb, base form

In [None]:
tagged[0: 6]

[('There', 'EX'),
 ('is', 'VBZ'),
 ('no', 'DT'),
 ('more', 'RBR'),
 ('to', 'TO'),
 ('be', 'VB')]

## Lexicon Normalization (Stemming, Lemmatization)

In [None]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [None]:
stemmed_words = [stemmer.stem(word) for word in tokens]
lemmatized_words = [lemmatizer.lemmatize(word, wordnet.VERB) for word in tokens]

In [None]:
print("Original Sentence:", sentence)
print("Stemmed Sentence:", ' '.join(stemmed_words))
print("Lemmatized Sentence:", ' '.join(lemmatized_words))

Original Sentence: There is no more to be said, gentlemen. 
            My name is Blood—Captain Blood, if you please,
            of this ship the Cinco Llagas,
            taken as a prize of war from Don Diego de Espinosa y Valdez, 
            who is my prisoner aboard..
Stemmed Sentence: there is no more to be said , gentlemen . my name is blood—captain blood , if you pleas , of thi ship the cinco llaga , taken as a prize of war from don diego de espinosa y valdez , who is my prison aboard ..
Lemmatized Sentence: There be no more to be say , gentlemen . My name be Blood—Captain Blood , if you please , of this ship the Cinco Llagas , take as a prize of war from Don Diego de Espinosa y Valdez , who be my prisoner aboard ..


## Frequency Distribution

In [None]:
fdist = FreqDist(stemmed_words)

In [None]:
for word, frequency in fdist.items():
    print(f"{word}: {frequency}")

there: 1
is: 3
no: 1
more: 1
to: 1
be: 1
said: 1
,: 5
gentlemen: 1
.: 1
my: 2
name: 1
blood—captain: 1
blood: 1
if: 1
you: 1
pleas: 1
of: 2
thi: 1
ship: 1
the: 1
cinco: 1
llaga: 1
taken: 1
as: 1
a: 1
prize: 1
war: 1
from: 1
don: 1
diego: 1
de: 1
espinosa: 1
y: 1
valdez: 1
who: 1
prison: 1
aboard: 1
..: 1


## Remove stopwords & punctuations

In [None]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:
tokens_without_punct = [word for word in tokens if word not in string.punctuation]
tokens_without_punct

['There',
 'is',
 'no',
 'more',
 'to',
 'be',
 'said',
 'gentlemen',
 'My',
 'name',
 'is',
 'Blood—Captain',
 'Blood',
 'if',
 'you',
 'please',
 'of',
 'this',
 'ship',
 'the',
 'Cinco',
 'Llagas',
 'taken',
 'as',
 'a',
 'prize',
 'of',
 'war',
 'from',
 'Don',
 'Diego',
 'de',
 'Espinosa',
 'y',
 'Valdez',
 'who',
 'is',
 'my',
 'prisoner',
 'aboard',
 '..']

In [None]:
stop_words = set(stopwords.words('english'))

In [None]:
tokens_without_stopwords = [word for word in tokens_without_punct if word.lower() not in stop_words]
tokens_without_stopwords

['said',
 'gentlemen',
 'name',
 'Blood—Captain',
 'Blood',
 'please',
 'ship',
 'Cinco',
 'Llagas',
 'taken',
 'prize',
 'war',
 'Diego',
 'de',
 'Espinosa',
 'Valdez',
 'prisoner',
 'aboard',
 '..']

## Named Entity Recognization

In [None]:
ner_entities = nltk.ne_chunk(tagged)
print(ner_entities)

(S
  There/EX
  is/VBZ
  no/DT
  more/RBR
  to/TO
  be/VB
  said/VBD
  ,/,
  gentlemen/NNS
  ./.
  My/PRP$
  name/NN
  is/VBZ
  Blood—Captain/NNP
  Blood/NNP
  ,/,
  if/IN
  you/PRP
  please/VBP
  ,/,
  of/IN
  this/DT
  ship/NN
  the/DT
  (ORGANIZATION Cinco/NNP Llagas/NNP)
  ,/,
  taken/VBN
  as/IN
  a/DT
  prize/NN
  of/IN
  war/NN
  from/IN
  (PERSON Don/NNP Diego/NNP)
  de/IN
  (PERSON Espinosa/NNP)
  y/NNP
  (PERSON Valdez/NNP)
  ,/,
  who/WP
  is/VBZ
  my/PRP$
  prisoner/NN
  aboard/IN
  ../NN)


## Web Scrapping

In [None]:
from bs4 import BeautifulSoup
import requests

In [None]:
url = 'https://en.wikipedia.org/wiki/77th_British_Academy_Film_Awards'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
website_text = soup.get_text()
website_text = sent_tokenize(website_text)

In [None]:
print(website_text[0:10])

['\n\n\n\n77th British Academy Film Awards - Wikipedia\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nJump to content\n\n\n\n\n\n\n\nMain menu\n\n\n\n\n\nMain menu\nmove to sidebar\nhide\n\n\n\n\t\tNavigation\n\t\n\n\nMain pageContentsCurrent eventsRandom articleAbout WikipediaContact usDonate\n\n\n\n\n\n\t\tContribute\n\t\n\n\nHelpLearn to editCommunity portalRecent changesUpload file\n\n\n\n\n\nLanguages\n\nLanguage links are at the top of the page.', "Search\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nCreate account\n\nLog in\n\n\n\n\n\n\n\n\nPersonal tools\n\n\n\n\n\n Create account Log in\n\n\n\n\n\n\t\tPages for logged out editors learn more\n\n\n\nContributionsTalk\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nContents\nmove to sidebar\nhide\n\n\n\n\n(Top)\n\n\n\n\n\n1Winners and nominees\n\n\n\nToggle Winners and nominees subsection\n\n\n\n\n\n1.1BAFTA Fellowship\n\n\n\n\n\n\n\n1.2Outstanding British Contribution t

## Text Analytics in R

In [3]:
install.packages(c("tokenizers", "stringi", "openNLP"))

Installing packages into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

also installing the dependencies ‘Rcpp’, ‘openNLPdata’, ‘rJava’


“installation of package ‘rJava’ had non-zero exit status”
“installation of package ‘openNLPdata’ had non-zero exit status”
“installation of package ‘openNLP’ had non-zero exit status”


In [8]:
install.packages(c("tm", "stringr", "SnowballC", "udpipe", "rvest", "spacyr"))

Installing packages into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

also installing the dependencies ‘RcppTOML’, ‘here’, ‘png’, ‘slam’, ‘BH’, ‘reticulate’




In [9]:
library(tokenizers)
library(tm)
library(stringr)
library(SnowballC)
library(udpipe)
library(rvest)
library(spacyr)

Loading required package: NLP



In [4]:
sentence <- "There is no more to be said, gentlemen. My name is Blood—Captain Blood, if you please, of this ship the Cinco Llagas, taken as a prize of war from 'Don Diego' de Espinosa y Valdez, who is my prisoner aboard."

## Tokenization

In [10]:
tokens <- unlist(tokenize_words(sentence))
tokens

## Parts of speech tagging (POS tagging)


In [14]:
ud_model <- udpipe_download_model(language = "english")
ud_model <- udpipe_load_model(ud_model$file_model)

parsed <- udpipe_annotate(ud_model, x = sentence, trace = TRUE)
pos_tags <- as.data.frame(parsed)$upos

print(pos_tags)

Downloading udpipe model from https://raw.githubusercontent.com/jwijffels/udpipe.models.ud.2.5/master/inst/udpipe-ud-2.5-191206/english-ewt-ud-2.5-191206.udpipe to /content/english-ewt-ud-2.5-191206.udpipe

 - This model has been trained on version 2.5 of data from https://universaldependencies.org

 - The model is distributed under the CC-BY-SA-NC license: https://creativecommons.org/licenses/by-nc-sa/4.0

 - Visit https://github.com/jwijffels/udpipe.models.ud.2.5 for model license details.

 - For a list of all models and their licenses (most models you can download with this package have either a CC-BY-SA or a CC-BY-SA-NC license) read the documentation at ?udpipe_download_model. For building your own models: visit the documentation by typing vignette('udpipe-train', package = 'udpipe')

Downloading finished, model stored at '/content/english-ewt-ud-2.5-191206.udpipe'



2024-02-20 17:40:57.02836 Annotating text fragment 1/1
 [1] "PRON"  "VERB"  "ADV"   "ADJ"   "PART"  "AUX"   "VERB"  "PUNCT" "NOUN" 
[10] "PUNCT" "PRON"  "NOUN"  "AUX"   "PROPN" "PROPN" "PROPN" "PUNCT" "SCONJ"
[19] "PRON"  "INTJ"  "PUNCT" "ADP"   "DET"   "NOUN"  "DET"   "PROPN" "PROPN"
[28] "PUNCT" "VERB"  "ADP"   "DET"   "NOUN"  "ADP"   "NOUN"  "ADP"   "PUNCT"
[37] "PROPN" "PROPN" "PART"  "PROPN" "PROPN" "PROPN" "PROPN" "PUNCT" "PRON" 
[46] "AUX"   "PRON"  "NOUN"  "ADV"   "PUNCT"


## Stemming


In [16]:
stemmed_words <- wordStem(tokens)
stemmed_words

## Frequency distribution

In [17]:
fdist <- table(stemmed_words)
fdist

stemmed_words
        a    aboard        be     blood   captain     cinco        de     diego 
        2         1         1         2         1         1         1         1 
      don  espinosa      from gentlemen         i        if     llaga      more 
        1         1         1         1         3         1         1         1 
       my      name        no        of     pleas    prison     prize      said 
        2         1         1         2         1         1         1         1 
     ship     taken       the     there       thi        to    valdez       war 
        1         1         1         1         1         1         1         1 
      who         y       you 
        1         1         1 

## Remove punctuation

In [18]:
tokens_without_punct <- tokens[!str_detect(tokens, "[[:punct:]]")]
tokens_without_punct

## Web Scrapping

In [20]:
url <- "https://www.nytimes.com/international/"
webpage <- read_html(url)
text_from_website <- html_text(webpage)
print(substr(text_from_website,1,1000))

[1] "The New York Times International - Breaking News, US News, World News, Videos{\"@context\":\"https://schema.org\",\"@type\":\"WebPage\",\"image\":[{\"@context\":\"https://schema.org\",\"@type\":\"ImageObject\",\"url\":\"https://static01.nyt.com/vi-assets/images/share/1200x675_nameplate.png\",\"height\":675,\"width\":1200,\"contentUrl\":\"https://static01.nyt.com/vi-assets/images/share/1200x675_nameplate.png\",\"creditText\":\"The New York Times\"},{\"@context\":\"https://schema.org\",\"@type\":\"ImageObject\",\"url\":\"https://static01.nyt.com/vi-assets/images/share/1200x900_t.png\",\"height\":900,\"width\":1200,\"contentUrl\":\"https://static01.nyt.com/vi-assets/images/share/1200x900_t.png\",\"creditText\":\"The New York Times\"},{\"@context\":\"https://schema.org\",\"@type\":\"ImageObject\",\"url\":\"https://static01.nyt.com/vi-assets/images/share/1200x1200_t.png\",\"height\":1200,\"width\":1200,\"contentUrl\":\"https://static01.nyt.com/vi-assets/images/share/1200x1200_t.png\",\