In [2]:
import spacy 

In [3]:
nlp = spacy.load("en_core_web_sm")

In [6]:
doc = nlp("Elon ate pizza yesterday. He's fat man.")

for token in doc:
    print(token, " | ", token.pos_, " | ", spacy.explain(token.pos_))

Elon  |  PROPN  |  proper noun
ate  |  VERB  |  verb
pizza  |  NOUN  |  noun
yesterday  |  NOUN  |  noun
.  |  PUNCT  |  punctuation
He  |  PRON  |  pronoun
's  |  AUX  |  auxiliary
fat  |  ADJ  |  adjective
man  |  NOUN  |  noun
.  |  PUNCT  |  punctuation


In [11]:
doc = nlp("Ah! Elon ate pizza and bought 5 $ for it yesterday. He's fat man.")
for token in doc:
    print(token, " | ", token.pos_, " | ", token.tag_, " | ", spacy.explain(token.tag_))


Ah  |  INTJ  |  UH  |  interjection
!  |  PUNCT  |  .  |  punctuation mark, sentence closer
Elon  |  PROPN  |  NNP  |  noun, proper singular
ate  |  VERB  |  VBD  |  verb, past tense
pizza  |  NOUN  |  NN  |  noun, singular or mass
and  |  CCONJ  |  CC  |  conjunction, coordinating
bought  |  VERB  |  VBD  |  verb, past tense
5  |  NUM  |  CD  |  cardinal number
$  |  SYM  |  $  |  symbol, currency
for  |  ADP  |  IN  |  conjunction, subordinating or preposition
it  |  PRON  |  PRP  |  pronoun, personal
yesterday  |  NOUN  |  NN  |  noun, singular or mass
.  |  PUNCT  |  .  |  punctuation mark, sentence closer
He  |  PRON  |  PRP  |  pronoun, personal
's  |  AUX  |  VBZ  |  verb, 3rd person singular present
fat  |  ADJ  |  JJ  |  adjective (English), other noun-modifier (Chinese)
man  |  NOUN  |  NN  |  noun, singular or mass
.  |  PUNCT  |  .  |  punctuation mark, sentence closer


In [12]:
earning_text = """Microsoft Corp. today announced the following results for the quarter ended December 31, 2023, as compared to the corresponding period of last fiscal year:

·        Revenue was $62.0 billion and increased 18% (up 16% in constant currency)

·        Operating income was $27.0 billion and increased 33%, and increased 25% non-GAAP (up 23% in constant currency)

·        Net income was $21.9 billion and increased 33%, and increased 26% non-GAAP (up 23% in constant currency)

·        Diluted earnings per share was $2.93 and increased 33%, and increased 26% non-GAAP (up 23% in constant currency)"""

In [13]:
doc = nlp(earning_text)

In [23]:
filter_token = []
for token in doc:
    if token.pos_ not in ["PUNCT", "X", "SPACE"]:
        filter_token.append(token)

In [25]:
filter_token[:20]

[Microsoft,
 Corp.,
 today,
 announced,
 the,
 following,
 results,
 for,
 the,
 quarter,
 ended,
 December,
 31,
 2023,
 as,
 compared,
 to,
 the,
 corresponding,
 period]

In [26]:
count = doc.count_by(spacy.attrs.POS)
count

{96: 3,
 92: 34,
 100: 13,
 90: 3,
 85: 8,
 93: 20,
 97: 18,
 98: 1,
 84: 8,
 103: 8,
 87: 4,
 99: 4,
 89: 7,
 86: 4}

### **Excercise**

In [46]:
with open ("C:/Users/FPTSHOP/OneDrive/Documents/JVB_Training/NLP/data/news_story.txt") as f:
    new_text = f.read()
new_text

'Inflation rose again in April, continuing a climb that has pushed consumers to the brink and is threatening the economic expansion, the Bureau of Labor Statistics reported Wednesday.\n\nThe consumer price index, a broad-based measure of prices for goods and services, increased 8.3% from a year ago, higher than the Dow Jones estimate for an 8.1% gain. That represented a slight ease from Marchâ€™s peak but was still close to the highest level since the summer of 1982.\n\nRemoving volatile food and energy prices, so-called core CPI still rose 6.2%, against expectations for a 6% gain, clouding hopes that inflation had peaked in March.\n\nThe month-over-month gains also were higher than expectations â€” 0.3% on headline CPI versus the 0.2% estimate and a 0.6% increase for core, against the outlook for a 0.4% gain.\n\nThe price gains also meant that workers continued to lose ground. Real wages adjusted for inflation decreased 0.1% on the month despite a nominal increase of 0.3% in average h

**Extract noun and num in the text**

In [47]:
doc = nlp(new_text)

In [52]:
noun = []
num = []
for token in doc:
    if token.pos_ == "NOUN":
        noun.append(token)
    elif token.pos_ == "NUM":
        num.append(token)
print(noun)
print(num)

[Inflation, climb, consumers, brink, expansion, consumer, price, index, measure, prices, goods, services, %, year, estimate, %, gain, ease, Marchâ€, ™, peak, level, summer, food, energy, prices, core, %, expectations, %, gain, hopes, inflation, month, month, gains, expectations, %, headline, %, estimate, %, increase, core, outlook, %, gain, price, gains, workers, ground, wages, inflation, %, month, increase, %, earnings, year, earnings, %, earnings, %, Inflation, threat, recovery, pandemic, economy, stage, year, growth, level, prices, pump, grocery, stores, problem, inflation, areas, housing, auto, sales, host, areas, officials, problem, interest, rate, hikes, year, pledges, inflation, %, goal, ™, data, job, Credits]
[8.3, 8.1, 1982, 6.2, 6, â€, 0.3, 0.2, 0.6, 0.4, 0.1, 0.3, 2.6, 5.5, 2021, 1984, one, two, two, 2]


In [54]:
count = doc.count_by(spacy.attrs.POS)
count

{92: 98,
 100: 27,
 86: 15,
 85: 39,
 96: 17,
 97: 32,
 90: 34,
 95: 4,
 87: 13,
 89: 10,
 84: 23,
 103: 7,
 93: 20,
 94: 4,
 98: 8,
 101: 1}

In [57]:
for k,v in count.items():
    print(doc.vocab[k].text,"|", v)

NOUN | 98
VERB | 27
ADV | 15
ADP | 39
PROPN | 17
PUNCT | 32
DET | 34
PRON | 4
AUX | 13
CCONJ | 10
ADJ | 23
SPACE | 7
NUM | 20
PART | 4
SCONJ | 8
X | 1
