In [49]:
import nltk

# Fetch the NLTK data packages in two batches; each call prints its own progress log and
# the result of the second call is what the cell displays.
corpusPackages = ["brown", "webtext", "words", "stopwords"]
modelAndLexiconPackages = [
    "punkt",
    "averaged_perceptron_tagger",
    "maxent_ne_chunker",
    "vader_lexicon",
    "wordnet",
    "tagsets",
    "omw-1.4",
]

nltk.download(corpusPackages)
nltk.download(modelAndLexiconPackages)

[nltk_data] Downloading package brown to /Users/babakjan/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package webtext to
[nltk_data]     /Users/babakjan/nltk_data...
[nltk_data]   Package webtext is already up-to-date!
[nltk_data] Downloading package words to /Users/babakjan/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/babakjan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/babakjan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/babakjan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/babakjan/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-t

True

In [50]:
# read input text

# The context manager closes the file handle as soon as the content has been read.
# The explicit encoding keeps the article's non-ASCII punctuation (e.g. curly quotes)
# intact regardless of the platform's default encoding.
with open("text.txt", "r", encoding="utf-8") as file:
    text = file.read()
print(text)

Microsoft is betting heavily on integrating OpenAI's GPT language models into its products to
compete with Google, and, the company now claims, its AI is an early form of artificial general
intelligence (AGI). 

On Wednesday, Microsoft researchers released a paper on the arXiv preprint server titled “Sparks of
Artificial General Intelligence: Early experiments with GPT-4.” They declared that GPT-4 showed
early signs of AGI, meaning that it has capabilities that are at or above human level. 

This eyebrow-raising conclusion largely contrasts what OpenAI CEO Sam Altman has been saying
regarding GPT-4. For example, he said the model was "still flawed, still limited." In fact, if you
read the paper itself, the researchers appear to dial back their own splashy claim: the bulk of the
paper is dedicated to listing the number of limitations and biases the large language model
contains. This begs the question of how close to AGI GPT-4 really is, and how AGI is instead being
used as clickbait.



In [51]:
# POS (Part Of Speech) tagging
# Step 1: split the raw text into word / punctuation tokens.
# Step 2: attach a part-of-speech tag to every token, giving (token, tag) pairs.

tokens = nltk.tokenize.word_tokenize(text)
taggedTokens = nltk.tag.pos_tag(tokens)
print(taggedTokens)

[('Microsoft', 'NNP'), ('is', 'VBZ'), ('betting', 'VBG'), ('heavily', 'RB'), ('on', 'IN'), ('integrating', 'VBG'), ('OpenAI', 'NNP'), ("'s", 'POS'), ('GPT', 'NNP'), ('language', 'NN'), ('models', 'NNS'), ('into', 'IN'), ('its', 'PRP$'), ('products', 'NNS'), ('to', 'TO'), ('compete', 'VB'), ('with', 'IN'), ('Google', 'NNP'), (',', ','), ('and', 'CC'), (',', ','), ('the', 'DT'), ('company', 'NN'), ('now', 'RB'), ('claims', 'VBZ'), (',', ','), ('its', 'PRP$'), ('AI', 'NNP'), ('is', 'VBZ'), ('an', 'DT'), ('early', 'JJ'), ('form', 'NN'), ('of', 'IN'), ('artificial', 'JJ'), ('general', 'JJ'), ('intelligence', 'NN'), ('(', '('), ('AGI', 'NNP'), (')', ')'), ('.', '.'), ('On', 'IN'), ('Wednesday', 'NNP'), (',', ','), ('Microsoft', 'NNP'), ('researchers', 'NNS'), ('released', 'VBD'), ('a', 'DT'), ('paper', 'NN'), ('on', 'IN'), ('the', 'DT'), ('arXiv', 'JJ'), ('preprint', 'NN'), ('server', 'NN'), ('titled', 'VBN'), ('“', 'JJ'), ('Sparks', 'NNP'), ('of', 'IN'), ('Artificial', 'NNP'), ('General', '

In [52]:
# NER (Named Entity Recognition) with entity classification
#
# ne_chunk returns a tree over the tagged tokens; each child that is itself a Tree is one
# recognised entity, and its leaves are the (word, tag) pairs that make it up.
# binary=True asks the chunker only for entity / non-entity, so every label is 'NE'.

neChunked = nltk.ne_chunk(taggedTokens, binary=True)
data = {}  # entity text -> chunk label
for entity in neChunked:
    if isinstance(entity, nltk.tree.Tree):
        # Deliberately NOT named `text`: that variable holds the article read from disk and
        # is still needed by later cells, so it must not be overwritten with an entity name.
        entityText = " ".join(word for word, tag in entity.leaves())
        data[entityText] = entity.label()

print(data)


{'Microsoft': 'NE', 'OpenAI': 'NE', 'GPT': 'NE', 'Google': 'NE', 'AGI': 'NE', 'arXiv': 'NE', 'Artificial': 'NE', 'ChatGPT': 'NE', 'TiKZ': 'NE', 'Intelligencer': 'NE', 'Kara Swisher': 'NE', 'Altman': 'NE', 'AI': 'NE', 'Motherboard': 'NE', 'Bing': 'NE', 'Gap': 'NE'}


In [53]:
# Custom NER

def collectCapitalizedNounRuns(taggedWords):
    """Collect capitalised noun phrases from POS-tagged tokens.

    A phrase is a maximal run of tokens whose tag starts with "NN" (NN, NNS, NNP, NNPS),
    optionally joined by prepositions (tag "IN") once the run has started, e.g.
    "Sparks of Artificial General Intelligence". Trailing prepositions are stripped and the
    phrase is kept only when its first character is upper-case.

    Args:
        taggedWords: iterable of (word, tag) pairs as produced by nltk.pos_tag.

    Returns:
        List of phrase strings in order of appearance (duplicates are kept).
    """
    phrases = []
    run = []
    # The trailing sentinel carries a non-noun tag, so a run that reaches the very last
    # token is flushed like any other instead of being silently dropped.
    for word, tag in list(taggedWords) + [("", "")]:
        # "NN" as a prefix already covers NNS, so no separate plural check is needed.
        if tag.startswith("NN") or (run and tag.startswith("IN")):
            run.append((word, tag))
            continue
        # Strip every dangling preposition, not just the last one ("X out of" -> "X").
        while run and run[-1][1].startswith("IN"):
            run.pop()
        if run:
            phrase = " ".join(w for w, _ in run)
            if phrase[:1].isupper():
                phrases.append(phrase)
        run = []
    return phrases


customData = collectCapitalizedNounRuns(taggedTokens)

print(len(customData))
print(customData)

49
['Microsoft', 'OpenAI', 'GPT language models', 'Google', 'AI', 'AGI', 'Wednesday', 'Microsoft researchers', 'Sparks of Artificial General Intelligence', 'GPT-4', 'OpenAI CEO Sam Altman', 'GPT-4', 'AGI GPT-4', 'AGI', 'GPT-4', 'GPT-4 ’', 'ChatGPT', 'AGI', 'TiKZ', 'GPT-4 ’', 'GPT-4 model', 'Sparks of Artificial General Intelligence', 'Microsoft spokesperson', 'Thursday interview with Intelligencer ’', 'Kara Swisher', 'Altman', 'Altman', 'Altman', 'OpenAI', 'AGI', 'Altman', 'GPT-4', 'AGI', 'GPT-4 rumor mill', 'GPT-4', 'People', 'AGI', 'Microsoft', 'AGI', 'Microsoft spokesperson', 'Motherboard', 'Microsoft researchers', 'GPT-4', 'Gap ’', 'GPT-4', 'GPT-4', 'OpenAI', 'AI models', 'AI researchers']


In [None]:
# NER + classification using existing language model (e.g. NER models from Hugging Face)

from transformers import pipeline

# aggregation_strategy="simple" is the replacement for the deprecated grouped_entities=True:
# sub-word tokens belonging to the same entity are merged into a single result entry.
# NOTE(review): no model is named, so the library's default NER checkpoint is used;
# consider pinning model=... for reproducible results.
ner = pipeline("ner", aggregation_strategy="simple")
ner(text)

In [73]:
# custom entity classification
#
# For every extracted entity, fetch the first sentence of its Wikipedia summary and use the
# first noun phrase found in that sentence as the entity's "class".

import wikipedia
import pprint

# part of speech tags shortcuts https://www.guru99.com/pos-tagging-chunking-nltk.html
# DT = determiner (a, an, the)
# JJ = This NLTK POS Tag is an adjective (large)
# VBN = verb past participle (reunified)
# NN = noun, singular (cat, tree)
# NNS = noun plural (desks)
# IN = preposition/subordinating conjunction
grammar = "NP: {<DT>?<IN>?<JJ>?<VBN>*<NN|NNS>?<NN|NNS>}"
cp = nltk.RegexpParser(grammar)


def fetchSummarySentence(title):
    """Return the first sentence of the Wikipedia summary for `title`, or None if the lookup fails.

    When the title is ambiguous, the first option offered by the disambiguation page is
    tried instead. Any failure of either lookup yields None rather than raising.
    """
    try:
        return wikipedia.summary(title, auto_suggest=False, sentences=1)
    except wikipedia.DisambiguationError as e:
        # An exception raised inside this handler would NOT be caught by the sibling
        # `except` below, so the fallback lookup needs its own guard (it can hit a missing
        # page, another disambiguation page, or an empty options list).
        try:
            return wikipedia.summary(e.options[0], sentences=1)
        except Exception:
            return None
    except Exception:
        # Best-effort lookup: missing pages and network errors both mean "no summary".
        # `Exception` (not a bare except) lets KeyboardInterrupt still stop the cell.
        return None


def firstNounPhrase(sentence):
    """Return the text of the first NP chunk that `cp` finds in `sentence`, or None."""
    parsedSentence = cp.parse(nltk.pos_tag(nltk.word_tokenize(sentence)))
    for chunk in parsedSentence:
        if isinstance(chunk, nltk.tree.Tree):
            return " ".join(word for word, tag in chunk.leaves())
    return None


resultDict = {}

# customData contains many repeats; dict.fromkeys keeps first-seen order while making sure
# each distinct entity triggers only one (slow) Wikipedia request.
for entity in dict.fromkeys(customData):
    summarySentence = fetchSummarySentence(entity)
    if summarySentence is None:
        resultDict[entity] = "Thing (page not found)"
        continue
    description = firstNounPhrase(summarySentence)
    resultDict[entity] = description if description is not None else "Thing"

pprint.pprint(resultDict)




  lis = BeautifulSoup(html).find_all('li')


{'AGI': 'a feminist research',
 'AGI GPT-4': 'Thing (page not found)',
 'AI': 'Artificial intelligence',
 'AI models': 'Thing (page not found)',
 'AI researchers': 'Thing (page not found)',
 'Altman': 'a surname',
 'ChatGPT': 'an artificial intelligence chatbot',
 'GPT language models': 'Thing (page not found)',
 'GPT-4': 'large language model',
 'GPT-4 model': 'Thing (page not found)',
 'GPT-4 rumor mill': 'Thing (page not found)',
 'GPT-4 ’': 'Thing (page not found)',
 'Gap ’': 'Thing (page not found)',
 'Google': 'an American multinational technology',
 'Kara Swisher': 'an American journalist',
 'Microsoft': 'an American multinational technology',
 'Microsoft researchers': 'Thing (page not found)',
 'Microsoft spokesperson': 'Thing (page not found)',
 'Motherboard': 'A motherboard',
 'OpenAI': 'artificial intelligence',
 'OpenAI CEO Sam Altman': 'Thing (page not found)',
 'People': 'A people',
 'Sparks of Artificial General Intelligence': 'Thing (page not found)',
 'Thursday intervi