In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm
!pip install nltk
import nltk
nltk.download('punkt')
nltk.download('stopwords')

nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
SENT_DETECTOR = nltk.data.load('tokenizers/punkt/english.pickle')


## SPACY POS Tags

TEXT	LEMMA	POS	    TAG	    DEP	        SHAPE	ALPHA	STOP
Apple	apple	PROPN	NNP	    nsubj	    Xxxxx	True	False
is	    be	    AUX	    VBZ	    aux	        xx	    True	True
looking	look	VERB	VBG	    ROOT	    xxxx	True	False
at	    at	    ADP	    IN	    prep        xx	    True	True
buying	buy	    VERB	VBG	    pcomp	    xxxx	True	False
U.K.	u.k.	PROPN	NNP	    compound	X.X.	False	False
startup	startup	NOUN	NN	    dobj	    xxxx	True	False
for	    for	    ADP	    IN	    prep	    xxx	    True	True
$	    $	    SYM	    $	    quantmod	$	    False	False
1	    1	    NUM	    CD	    compound	d	    False	False
billion	billion	NUM	    CD	    pobj	    xxxx	True	False

In [5]:
import spacy
nlp = spacy.load("en_core_web_sm")
introduction_text = ("Professor McCarthy is my favorite teacher....I really enjoy his lectures. He is an expert in artificial intelligence.")
introduction_doc = nlp(introduction_text)
# Extract tokens for the given doc
print ([token.text for token in introduction_doc])

['Professor', 'McCarthy', 'is', 'my', 'favorite', 'teacher', '....', 'I', 'really', 'enjoy', 'his', 'lectures', '.', 'He', 'is', 'an', 'expert', 'in', 'artificial', 'intelligence', '.']


In [22]:
#More often we work with text in files 
file_name = "introduction.txt"
introduction_file = open(file_name).read()
introduction_file_doc = nlp(introduction_file)
print ([token.text for token in introduction_file_doc])

['We', 'will', 'learn', 'about', 'processing', 'language', 'using', 'spacy', '.', '\n', 'We', 'will', 'learn', 'about', 'sentences', '.', '\n', 'We', 'will', 'learn', 'about', 'tokens', '.', '\n', 'We', 'will', 'learn', 'about', 'people', ',', 'like', 'Professor', 'Daniel', 'McCarthy', '.', '\n', 'We', 'will', 'learn', 'about', 'places', ',', 'like', 'Washington', 'D.C.', '\n', 'We', 'will', 'learning', 'about', 'proper', 'nouns', 'like', 'Strayer', 'University', '.', '\n', 'We', 'will', 'learn', 'about', 'pre', '-', 'processing', 'and', 'the', 'removal', 'of', 'stop', '-', 'words', '.', '\n', 'We', 'will', 'learn', 'about', 'lemmatization', '.', '\n', 'We', 'will', 'learn', 'about', 'parts', 'of', 'speech', '.', '\n', 'Mr.', 'McCarthy', 'likes', 'diet', 'Dr.', 'Pepper', '.', '\n']


In [13]:
#Extracting stop words and punctuations from the text
from collections import Counter
words = [ token.text for token in introduction_file_doc if not token.is_stop and not token.is_punct]
print(Counter(words).most_common(5))

[('learning', 1), ('pre', 1), ('processing', 1), ('lemmatization', 1), ('\n', 1)]


In [17]:
#
for token in introduction_file_doc:
    print (token, token.lemma_)

We we
will will
be be
learning learn
about about
pre pre
- -
processing processing
and and
lemmatization lemmatization
. .

 

The the
meaning meaning
of of
supposition supposition
is be
a a
thing thing
tacitly tacitly
assumed assume
beforehand beforehand
. .


In [19]:
#part of speech tagging
for token in introduction_file_doc:
    if not token.is_stop and not token.is_punct:
        print (token, token.pos_)

learning VERB
pre ADJ
processing NOUN
lemmatization NOUN

 SPACE
meaning NOUN
supposition NOUN
thing NOUN
tacitly ADV
assumed VERB


In [23]:
#Named entity recognition
for ent in introduction_file_doc.ents:
    print(ent.text, ent.label_)

Daniel McCarthy PERSON
Washington D.C. GPE
Strayer University ORG
McCarthy PERSON
Pepper PERSON


In [24]:
#displaying the named entities in the text
from spacy import displacy
displacy.render(introduction_file_doc, style='ent', jupyter=True)



In [25]:
introduction_text = "This tutorial is about Natural Language Processing in Spacy."
introduction_doc = nlp(introduction_text)
displacy.serve(introduction_doc, style="dep")




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [26]:
about_text = """According to the outlet, the OpenAI CEO recently reached an agreement with the iPhone maker to incorporate some OpenAI services into Apple products. Nadella was reportedly concerned about the potential impact of a deal on Microsoft's product ambitions, per the report.

Representatives for OpenAI, Apple, and Microsoft did not immediately respond to requests for comment from Business Insider, made outside normal working hours. Apple and OpenAI declined to comment to The Information.
If OpenAI has indeed reached an agreement with Apple, it would be a much-needed win for Altman.
The tech boss has faced heightened scrutiny after former employees and board members publicly criticized him.
Helen Toner, a former OpenAI director, recently accused Altman of lying to the board "multiple" times and "withholding information.
Toner, who participated in November's ousting of the CEO, said Altman had also kept the board in the dark about the company's ownership structure.

Altman has also faced criticism from Jan Lieke, a former executive at OpenAI.

Leike quit OpenAI earlier this month, accusing the company of prioritizing "shiny products" over "safety culture and processes."

Tensions over the priorities within OpenAI had reportedly been brewing since Altman's brief ouster.

"""

about_doc = nlp(about_text)
sentences = list(about_doc.sents)
num_sentences = len(sentences)
print("sentences: " + str(num_sentences))
for sentence in sentences:
    print(sentence)

sentences: 10
According to the outlet, the OpenAI CEO recently reached an agreement with the iPhone maker to incorporate some OpenAI services into Apple products.
Nadella was reportedly concerned about the potential impact of a deal on Microsoft's product ambitions, per the report.


Representatives for OpenAI, Apple, and Microsoft did not immediately respond to requests for comment from Business Insider, made outside normal working hours.
Apple and OpenAI declined to comment to The Information.

If OpenAI has indeed reached an agreement with Apple, it would be a much-needed win for Altman.

The tech boss has faced heightened scrutiny after former employees and board members publicly criticized him.

Helen Toner, a former OpenAI director, recently accused Altman of lying to the board "multiple" times and "withholding information.
Toner, who participated in November's ousting of the CEO, said Altman had also kept the board in the dark about the company's ownership structure.


Altman ha

In [27]:
#tokens
for token in about_doc:
    print (token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

According accord VERB VBG prep Xxxxx True False
to to ADP IN prep xx True True
the the DET DT det xxx True True
outlet outlet NOUN NN pobj xxxx True False
, , PUNCT , punct , False False
the the DET DT det xxx True True
OpenAI OpenAI PROPN NNP compound XxxxXX True False
CEO CEO PROPN NNP nsubj XXX True False
recently recently ADV RB advmod xxxx True False
reached reach VERB VBD ROOT xxxx True False
an an DET DT det xx True True
agreement agreement NOUN NN dobj xxxx True False
with with ADP IN prep xxxx True True
the the DET DT det xxx True True
iPhone iPhone PROPN NNP compound xXxxxx True False
maker maker NOUN NN pobj xxxx True False
to to PART TO aux xx True True
incorporate incorporate VERB VB acl xxxx True False
some some DET DT det xxxx True True
OpenAI OpenAI PROPN NNP compound XxxxXX True False
services service NOUN NNS dobj xxxx True False
into into ADP IN prep xxxx True True
Apple Apple PROPN NNP compound Xxxxx True False
products product NOUN NNS pobj xxxx True False
. . PUNC

In [28]:
for word in about_doc.ents:
    print(word.text, word.label_)


OpenAI GPE
iPhone ORG
OpenAI GPE
Apple ORG
Nadella PERSON
Microsoft ORG
OpenAI GPE
Apple ORG
Microsoft ORG
Business Insider ORG
normal working hours TIME
Apple ORG
OpenAI ORG
Information ORG
OpenAI GPE
Apple ORG
Altman PRODUCT
Helen Toner PERSON
OpenAI GPE
Altman ORG
Toner PERSON
November DATE
Altman ORG
Altman ORG
Jan Lieke PERSON
OpenAI GPE
Leike PERSON
OpenAI GPE
earlier this month DATE
OpenAI GPE
Altman ORG


## NLTK POS Meaning
CC: It is the conjunction of coordinating
CD: It is a digit of cardinal
DT: It is the determiner
EX: Existential
FW: It is a foreign word
IN: Preposition and conjunction
JJ: Adjective
JJR and JJS: Adjective and superlative
LS: List marker
MD: Modal
NN: Singular noun
NNS, NNP, NNPS: Proper and plural noun
PDT: Predeterminer
WRB: Adverb of wh
WP$: Possessive wh
WP: Pronoun of wh
WDT: Determiner of wp
VBZ: Verb
VBP, VBN, VBG, VBD, VB: Forms of verbs
UH: Interjection
TO: To go
RP: Particle
RBS, RB, RBR: Adverb
PRP, PRP$: Pronoun personal and professional

In [32]:
import nltk

def preprocess(sent):
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)
    return sent

about_text = """According to the outlet, the OpenAI CEO recently reached an agreement with the iPhone maker to incorporate some OpenAI services into Apple products. Nadella was reportedly concerned about the potential impact of a deal on Microsoft's product ambitions, per the report.

Representatives for OpenAI, Apple, and Microsoft did not immediately respond to requests for comment from Business Insider, made outside normal working hours. Apple and OpenAI declined to comment to The Information.
If OpenAI has indeed reached an agreement with Apple, it would be a much-needed win for Altman.
The tech boss has faced heightened scrutiny after former employees and board members publicly criticized him.
Helen Toner, a former OpenAI director, recently accused Altman of lying to the board "multiple" times and "withholding information.
Toner, who participated in November's ousting of the CEO, said Altman had also kept the board in the dark about the company's ownership structure.

Altman has also faced criticism from Jan Lieke, a former executive at OpenAI.

Leike quit OpenAI earlier this month, accusing the company of prioritizing "shiny products" over "safety culture and processes."

Tensions over the priorities within OpenAI had reportedly been brewing since Altman's brief ouster.

"""

pattern = r"""
    NP: {<DT|PP\$>?<JJ>*<NN>}   # chunk determiner/possessive, adjectives and noun
        {<NNP>+}                # chunk sequences of proper nouns
"""

sent = preprocess(about_text)
print(sent)
cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)
print(cs)


[('According', 'VBG'), ('to', 'TO'), ('the', 'DT'), ('outlet', 'NN'), (',', ','), ('the', 'DT'), ('OpenAI', 'NNP'), ('CEO', 'NNP'), ('recently', 'RB'), ('reached', 'VBD'), ('an', 'DT'), ('agreement', 'NN'), ('with', 'IN'), ('the', 'DT'), ('iPhone', 'NN'), ('maker', 'NN'), ('to', 'TO'), ('incorporate', 'VB'), ('some', 'DT'), ('OpenAI', 'NNP'), ('services', 'NNS'), ('into', 'IN'), ('Apple', 'NNP'), ('products', 'NNS'), ('.', '.'), ('Nadella', 'NN'), ('was', 'VBD'), ('reportedly', 'RB'), ('concerned', 'VBN'), ('about', 'IN'), ('the', 'DT'), ('potential', 'JJ'), ('impact', 'NN'), ('of', 'IN'), ('a', 'DT'), ('deal', 'NN'), ('on', 'IN'), ('Microsoft', 'NNP'), ("'s", 'POS'), ('product', 'NN'), ('ambitions', 'NNS'), (',', ','), ('per', 'IN'), ('the', 'DT'), ('report', 'NN'), ('.', '.'), ('Representatives', 'NNS'), ('for', 'IN'), ('OpenAI', 'NNP'), (',', ','), ('Apple', 'NNP'), (',', ','), ('and', 'CC'), ('Microsoft', 'NNP'), ('did', 'VBD'), ('not', 'RB'), ('immediately', 'RB'), ('respond', 'VB

In [33]:
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint
iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)

[('According', 'VBG', 'O'),
 ('to', 'TO', 'O'),
 ('the', 'DT', 'B-NP'),
 ('outlet', 'NN', 'I-NP'),
 (',', ',', 'O'),
 ('the', 'DT', 'O'),
 ('OpenAI', 'NNP', 'B-NP'),
 ('CEO', 'NNP', 'I-NP'),
 ('recently', 'RB', 'O'),
 ('reached', 'VBD', 'O'),
 ('an', 'DT', 'B-NP'),
 ('agreement', 'NN', 'I-NP'),
 ('with', 'IN', 'O'),
 ('the', 'DT', 'B-NP'),
 ('iPhone', 'NN', 'I-NP'),
 ('maker', 'NN', 'B-NP'),
 ('to', 'TO', 'O'),
 ('incorporate', 'VB', 'O'),
 ('some', 'DT', 'O'),
 ('OpenAI', 'NNP', 'B-NP'),
 ('services', 'NNS', 'O'),
 ('into', 'IN', 'O'),
 ('Apple', 'NNP', 'B-NP'),
 ('products', 'NNS', 'O'),
 ('.', '.', 'O'),
 ('Nadella', 'NN', 'B-NP'),
 ('was', 'VBD', 'O'),
 ('reportedly', 'RB', 'O'),
 ('concerned', 'VBN', 'O'),
 ('about', 'IN', 'O'),
 ('the', 'DT', 'B-NP'),
 ('potential', 'JJ', 'I-NP'),
 ('impact', 'NN', 'I-NP'),
 ('of', 'IN', 'O'),
 ('a', 'DT', 'B-NP'),
 ('deal', 'NN', 'I-NP'),
 ('on', 'IN', 'O'),
 ('Microsoft', 'NNP', 'B-NP'),
 ("'s", 'POS', 'O'),
 ('product', 'NN', 'B-NP'),
 ('ambit

In [34]:

from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
ne_tree = nltk.ne_chunk(pos_tag(word_tokenize(about_text)))
print(ne_tree)

(S
  According/VBG
  to/TO
  the/DT
  outlet/NN
  ,/,
  the/DT
  (ORGANIZATION OpenAI/NNP)
  CEO/NNP
  recently/RB
  reached/VBD
  an/DT
  agreement/NN
  with/IN
  the/DT
  (ORGANIZATION iPhone/NN)
  maker/NN
  to/TO
  incorporate/VB
  some/DT
  OpenAI/NNP
  services/NNS
  into/IN
  (GPE Apple/NNP)
  products/NNS
  ./.
  (PERSON Nadella/NN)
  was/VBD
  reportedly/RB
  concerned/VBN
  about/IN
  the/DT
  potential/JJ
  impact/NN
  of/IN
  a/DT
  deal/NN
  on/IN
  (ORGANIZATION Microsoft/NNP)
  's/POS
  product/NN
  ambitions/NNS
  ,/,
  per/IN
  the/DT
  report/NN
  ./.
  Representatives/NNS
  for/IN
  (ORGANIZATION OpenAI/NNP)
  ,/,
  (PERSON Apple/NNP)
  ,/,
  and/CC
  (PERSON Microsoft/NNP)
  did/VBD
  not/RB
  immediately/RB
  respond/VB
  to/TO
  requests/NNS
  for/IN
  comment/NN
  from/IN
  (ORGANIZATION Business/NNP Insider/NNP)
  ,/,
  made/VBD
  outside/IN
  normal/JJ
  working/JJ
  hours/NNS
  ./.
  (PERSON Apple/NNP)
  and/CC
  (ORGANIZATION OpenAI/NNP)
  declined/VBD
  to/T

In [52]:
#SENTIMENT ANALYSIS/Text Classification using Spacy

import spacy
from spacy.tokens import DocBin
from spacy.training import Example 
examples = []

# Prepare training data
train_data = [
    ("I love programming.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
    ("I hate bugs.", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
]

# Create a blank model
nlp = spacy.blank("en")

# Add the text categorizer to the pipeline
textcat = nlp.add_pipe("textcat")

# Add labels to the text categorizer
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")

# Convert training data to DocBin
doc_bin = DocBin()
for text, annotations in train_data:
    doc = nlp.make_doc(text)
    example = Example.from_dict(doc, annotations)
    examples.append(example)

# Train the model (simplified)
nlp.initialize()
for epoch in range(10):
    nlp.update(examples)

# Test the model
doc = nlp("Learning new things is ok.")
print(doc.cats)


{'POSITIVE': 0.5251275300979614, 'NEGATIVE': 0.4748724699020386}


In [53]:
#Textbook Classification

import spacy
from spacy.tokens import DocBin
from spacy.training import Example 
examples = []

# Prepare training data
# train_data = [
#     ("I love programming.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
#     ("Excitmement is the best feeling.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
#     ("I hate bugs.", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
# ]
train_data = [
    
    ("On Language", {"cats": {"FICTION": 0.0, "NON-FICTION": 1.0}}),
    ("The Great Gatsby", {"cats": {"FICTION": 1.0, "NON-FICTION": 0.0}}),
    ("The Elements of Style", {"cats": {"FICTION": 0.0, "NON-FICTION": 1.0}}),
    ("The Catcher in the Rye", {"cats": {"FICTION": 1.0, "NON-FICTION": 0.0}}),
    ("The Design of Everyday Things", {"cats": {"FICTION": 0.0, "NON-FICTION": 1.0}}),
    ("The Old Man and the Sea", {"cats": {"FICTION": 1.0, "NON-FICTION": 0.0}}),
    ("The Art of Computer Programming", {"cats": {"FICTION": 0.0, "NON-FICTION": 1.0}})
]

# Create a blank model
nlp = spacy.blank("en")

# Add the text categorizer to the pipeline
textcat = nlp.add_pipe("textcat")

# Add labels to the text categorizer
textcat.add_label("FICTION")
textcat.add_label("NON-FICTION")

# Convert training data to DocBin
doc_bin = DocBin()
for text, annotations in train_data:
    doc = nlp.make_doc(text)
    example = Example.from_dict(doc, annotations)
    examples.append(example)

# Train the model (simplified)
nlp.initialize()
for epoch in range(10):
    nlp.update(examples)

# Test the model
doc = nlp("On the road")
print(doc.cats)


{'FICTION': 0.5461692214012146, 'NON-FICTION': 0.4538307785987854}


In [46]:
#SENTIMENT ANALYSIS/Text Classification using NLTK

import nltk
from nltk.tokenize import word_tokenize
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy as nltk_accuracy

# Prepare training data
train_data = [
    ("I love programming.", "POSITIVE"),
    ("I hate bugs.", "NEGATIVE"),
    ("It is not enjoyable to program.", "NEGATIVE"),
]

# Extract features
def extract_features(words):
    return {word: True for word in words}
train_features = [(extract_features(word_tokenize(text)), label) for text, label in train_data]


# Convert training data
train_features = [(extract_features(word_tokenize(text)), label) for text, label in train_data]

# Train the classifier
classifier = NaiveBayesClassifier.train(train_features)

# Test the classifier
test_sentence = "I enjoy working with a VCR."
test_features = extract_features(word_tokenize(test_sentence))
print(classifier.classify(test_features))


NEGATIVE
