In [2]:
import spacy
import en_core_web_sm
from spacy.lang.en.stop_words import STOP_WORDS
import nltk
from collections import Counter
from bs4 import BeautifulSoup

In [3]:
document = open("reutersDataset/reut2-001.sgm", 'r')
text = document.read()
document.close()
document_beautiful = BeautifulSoup(text, "html.parser")
for reuter in document_beautiful.find_all("reuters"):
    text1 = reuter.find('text')
    title = text1.title
    doc = text1.body
    if doc!=None:
        doc_len = len(doc.text.split())
        if (doc_len> 200):
            print(f"The length of the document is {doc_len}.")
            #print("=================================")
            break
article = doc.text

The length of the document is 383.


# With SpaCy

In [None]:
nlp1 = spacy.load('en_core_web_sm')

#-- Creating SpaCy object
doc = nlp1(article)
labels = [x.label_ for x in doc.ents]
Counter(labels)

In [218]:
print('There are %s entities in the article and they are represented as %s unique labels.' %(len(doc.ents), 9))

There are 46 entities in the article and they are represented as 9 unique labels.


In [217]:
#-- Printing the original document
print('Original Article: %s' % (doc))
print()

#-- Creating the tokens with lemmanization and labels
for ent in doc.ents:
    print('Original: %s, New: %s, Labels: %s' %(ent.text, ent.lemma_, ent.label_))

Original Article: Unilever Plc <UN.A> and NV group reported
improvements in margins and underlying sales volume growth of
five pct in 1986 after stripping out the effects of falling
prices, disposals and currency movements, Unilever Plc chairman
Michael Angus said.
    He told reporters that volumes in North America increased
some 10.5 pct while European consumer goods rose about 2.5 pct
after being flat for some years.
    Much of the disposal strategy, aimed at concentrating
activities on core businesses, had now been completed, he
noted.
    But the process of acquisitions would go on, with strategic
acquisitions taking place "from time to time," he said.
    The company earlier reported a 20 pct rise in pre-tax
profits for 1986 to 1.14 billion stg from 953 mln previously.
In guilder terms, however, profits at the pre-tax level dropped
three pct to 3.69 billion from 3.81 billion.
    Angus said the recent purchase of Chesebrough-Pond's Inc
<CBM.N> for 72.50 dlrs a share was unlikely

In [219]:
#-- Creating tokens with parts of speech
for ent in doc:
    print('Token: %s, POS: %s' %(ent.lemma_, ent.pos_))

Token: Unilever, POS: PROPN
Token: Plc, POS: PROPN
Token: <, POS: X
Token: un.a, POS: X
Token: >, POS: X
Token: and, POS: CCONJ
Token: NV, POS: PROPN
Token: group, POS: NOUN
Token: report, POS: VERB
Token: 
, POS: SPACE
Token: improvement, POS: NOUN
Token: in, POS: ADP
Token: margin, POS: NOUN
Token: and, POS: CCONJ
Token: underlie, POS: VERB
Token: sale, POS: NOUN
Token: volume, POS: NOUN
Token: growth, POS: NOUN
Token: of, POS: ADP
Token: 
, POS: SPACE
Token: five, POS: NUM
Token: pct, POS: NOUN
Token: in, POS: ADP
Token: 1986, POS: NUM
Token: after, POS: ADP
Token: strip, POS: VERB
Token: out, POS: ADP
Token: the, POS: DET
Token: effect, POS: NOUN
Token: of, POS: ADP
Token: fall, POS: VERB
Token: 
, POS: SPACE
Token: price, POS: NOUN
Token: ,, POS: PUNCT
Token: disposal, POS: NOUN
Token: and, POS: CCONJ
Token: currency, POS: NOUN
Token: movement, POS: NOUN
Token: ,, POS: PUNCT
Token: Unilever, POS: PROPN
Token: Plc, POS: PROPN
Token: chairman, POS: NOUN
Token: 
, POS: SPACE
Token: M

# With NLTK

In [220]:
nltk.download('wordnet')
wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()
tokens2 = nltk.word_tokenize(article)
print(tokens2)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\heydi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['Unilever', 'Plc', '<', 'UN.A', '>', 'and', 'NV', 'group', 'reported', 'improvements', 'in', 'margins', 'and', 'underlying', 'sales', 'volume', 'growth', 'of', 'five', 'pct', 'in', '1986', 'after', 'stripping', 'out', 'the', 'effects', 'of', 'falling', 'prices', ',', 'disposals', 'and', 'currency', 'movements', ',', 'Unilever', 'Plc', 'chairman', 'Michael', 'Angus', 'said', '.', 'He', 'told', 'reporters', 'that', 'volumes', 'in', 'North', 'America', 'increased', 'some', '10.5', 'pct', 'while', 'European', 'consumer', 'goods', 'rose', 'about', '2.5', 'pct', 'after', 'being', 'flat', 'for', 'some', 'years', '.', 'Much', 'of', 'the', 'disposal', 'strategy', ',', 'aimed', 'at', 'concentrating', 'activities', 'on', 'core', 'businesses', ',', 'had', 'now', 'been', 'completed', ',', 'he', 'noted', '.', 'But', 'the', 'process', 'of', 'acquisitions', 'would', 'go', 'on', ',', 'with', 'strategic', 'acquisitions', 'taking', 'place', '``', 'from', 'time', 'to', 'time', ',', "''", 'he', 'said', '.

In [153]:
for token in tokens2:
    lemmatized_token = wordnet_lemmatizer.lemmatize(token)    
    if token != lemmatized_token:
        print('Original: %s, New: %s' % (token, lemmatized_token))

Original: improvements, New: improvement
Original: margins, New: margin
Original: sales, New: sale
Original: effects, New: effect
Original: prices, New: price
Original: disposals, New: disposal
Original: movements, New: movement
Original: reporters, New: reporter
Original: volumes, New: volume
Original: goods, New: good
Original: years, New: year
Original: activities, New: activity
Original: businesses, New: business
Original: acquisitions, New: acquisition
Original: acquisitions, New: acquisition
Original: profits, New: profit
Original: terms, New: term
Original: profits, New: profit
Original: was, New: wa
Original: profits, New: profit
Original: profits, New: profit
Original: costs, New: cost
Original: was, New: wa
Original: was, New: wa
Original: guilders, New: guilder
Original: parts, New: part
Original: was, New: wa
Original: was, New: wa
Original: activities, New: activity
Original: products, New: product
Original: sales, New: sale
Original: products, New: product
Original: costs

In [221]:
tagged_sent = nltk.pos_tag(tokens2)
tagged_sent[:40]

[('Unilever', 'NNP'),
 ('Plc', 'NNP'),
 ('<', 'NNP'),
 ('UN.A', 'NNP'),
 ('>', 'NNP'),
 ('and', 'CC'),
 ('NV', 'NNP'),
 ('group', 'NN'),
 ('reported', 'VBD'),
 ('improvements', 'NNS'),
 ('in', 'IN'),
 ('margins', 'NNS'),
 ('and', 'CC'),
 ('underlying', 'JJ'),
 ('sales', 'NNS'),
 ('volume', 'NN'),
 ('growth', 'NN'),
 ('of', 'IN'),
 ('five', 'CD'),
 ('pct', 'NNS'),
 ('in', 'IN'),
 ('1986', 'CD'),
 ('after', 'IN'),
 ('stripping', 'VBG'),
 ('out', 'RP'),
 ('the', 'DT'),
 ('effects', 'NNS'),
 ('of', 'IN'),
 ('falling', 'VBG'),
 ('prices', 'NNS'),
 (',', ','),
 ('disposals', 'NNS'),
 ('and', 'CC'),
 ('currency', 'NN'),
 ('movements', 'NNS'),
 (',', ','),
 ('Unilever', 'NNP'),
 ('Plc', 'NNP'),
 ('chairman', 'NN'),
 ('Michael', 'NNP')]