In [2]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation

In [1]:
text = """The NLP and LLM technologies are central to the analysis and generation of human language on a large scale. With their growing prevalence, distinguishing between LLM vs NLP becomes increasingly important.

NLP encompasses a suite of algorithms to understand, manipulate, and generate human language. Since its inception in the 1950s, NLP has evolved to analyze textual relationships. It uses part-of-speech tagging, named entity recognition, and sentiment analysis methods.

As exemplified by OpenAI’s ChatGPT, LLMs leverage deep learning to train on extensive text sets. Although they can mimic human-like text, their comprehension of language’s nuances is limited. Unlike NLP, which focuses on language analysis, LLMs primarily generate text."""

In [3]:
len(text)

744

In [4]:
nlp = spacy.load('en_core_web_sm')

In [5]:
doc = nlp(text)

In [8]:
tokens = [token.text for token in doc]
print(tokens)

['The', 'NLP', 'and', 'LLM', 'technologies', 'are', 'central', 'to', 'the', 'analysis', 'and', 'generation', 'of', 'human', 'language', 'on', 'a', 'large', 'scale', '.', 'With', 'their', 'growing', 'prevalence', ',', 'distinguishing', 'between', 'LLM', 'vs', 'NLP', 'becomes', 'increasingly', 'important', '.', '\n\n', 'NLP', 'encompasses', 'a', 'suite', 'of', 'algorithms', 'to', 'understand', ',', 'manipulate', ',', 'and', 'generate', 'human', 'language', '.', 'Since', 'its', 'inception', 'in', 'the', '1950s', ',', 'NLP', 'has', 'evolved', 'to', 'analyze', 'textual', 'relationships', '.', 'It', 'uses', 'part', '-', 'of', '-', 'speech', 'tagging', ',', 'named', 'entity', 'recognition', ',', 'and', 'sentiment', 'analysis', 'methods', '.', '\n\n', 'As', 'exemplified', 'by', 'OpenAI', '’s', 'ChatGPT', ',', 'LLMs', 'leverage', 'deep', 'learning', 'to', 'train', 'on', 'extensive', 'text', 'sets', '.', 'Although', 'they', 'can', 'mimic', 'human', '-', 'like', 'text', ',', 'their', 'comprehensi

In [22]:
punctuation = punctuation + '\n' + '\n\n'

Text Cleaning

In [23]:
word_freq = {}
stop_words = list(STOP_WORDS)
for word in doc:
    if word.text.lower() not in stop_words:
        if word.text.lower() not in punctuation:
            if word.text not in word_freq.keys():
                word_freq[word.text] = 1
            else:
                word_freq[word.text] += 1

In [24]:
word_freq

{'NLP': 5,
 'LLM': 2,
 'technologies': 1,
 'central': 1,
 'analysis': 3,
 'generation': 1,
 'human': 3,
 'language': 4,
 'large': 1,
 'scale': 1,
 'growing': 1,
 'prevalence': 1,
 'distinguishing': 1,
 'vs': 1,
 'increasingly': 1,
 'important': 1,
 'encompasses': 1,
 'suite': 1,
 'algorithms': 1,
 'understand': 1,
 'manipulate': 1,
 'generate': 2,
 'inception': 1,
 '1950s': 1,
 'evolved': 1,
 'analyze': 1,
 'textual': 1,
 'relationships': 1,
 'uses': 1,
 'speech': 1,
 'tagging': 1,
 'named': 1,
 'entity': 1,
 'recognition': 1,
 'sentiment': 1,
 'methods': 1,
 'exemplified': 1,
 'OpenAI': 1,
 'ChatGPT': 1,
 'LLMs': 2,
 'leverage': 1,
 'deep': 1,
 'learning': 1,
 'train': 1,
 'extensive': 1,
 'text': 3,
 'sets': 1,
 'mimic': 1,
 'like': 1,
 'comprehension': 1,
 'nuances': 1,
 'limited': 1,
 'Unlike': 1,
 'focuses': 1,
 'primarily': 1}

In [26]:
max_freq = max(word_freq.values())

In [27]:
for word in word_freq.keys():
    word_freq[word] = word_freq[word]/max_freq

In [28]:
word_freq

{'NLP': 1.0,
 'LLM': 0.4,
 'technologies': 0.2,
 'central': 0.2,
 'analysis': 0.6,
 'generation': 0.2,
 'human': 0.6,
 'language': 0.8,
 'large': 0.2,
 'scale': 0.2,
 'growing': 0.2,
 'prevalence': 0.2,
 'distinguishing': 0.2,
 'vs': 0.2,
 'increasingly': 0.2,
 'important': 0.2,
 'encompasses': 0.2,
 'suite': 0.2,
 'algorithms': 0.2,
 'understand': 0.2,
 'manipulate': 0.2,
 'generate': 0.4,
 'inception': 0.2,
 '1950s': 0.2,
 'evolved': 0.2,
 'analyze': 0.2,
 'textual': 0.2,
 'relationships': 0.2,
 'uses': 0.2,
 'speech': 0.2,
 'tagging': 0.2,
 'named': 0.2,
 'entity': 0.2,
 'recognition': 0.2,
 'sentiment': 0.2,
 'methods': 0.2,
 'exemplified': 0.2,
 'OpenAI': 0.2,
 'ChatGPT': 0.2,
 'LLMs': 0.4,
 'leverage': 0.2,
 'deep': 0.2,
 'learning': 0.2,
 'train': 0.2,
 'extensive': 0.2,
 'text': 0.6,
 'sets': 0.2,
 'mimic': 0.2,
 'like': 0.2,
 'comprehension': 0.2,
 'nuances': 0.2,
 'limited': 0.2,
 'Unlike': 0.2,
 'focuses': 0.2,
 'primarily': 0.2}

Sentence tokenization

In [37]:
sent_tokens = [sent for sent in doc.sents]
print(sent_tokens)

[The NLP and LLM technologies are central to the analysis and generation of human language on a large scale., With their growing prevalence, distinguishing between LLM vs NLP becomes increasingly important.

, NLP encompasses a suite of algorithms to understand, manipulate, and generate human language., Since its inception in the 1950s, NLP has evolved to analyze textual relationships., It uses part-of-speech tagging, named entity recognition, and sentiment analysis methods.

, As exemplified by OpenAI’s ChatGPT, LLMs leverage deep learning to train on extensive text sets., Although they can mimic human-like text, their comprehension of language’s nuances is limited., Unlike NLP, which focuses on language analysis, LLMs primarily generate text.]


In [38]:
sent_score = {}

In [44]:
for sent in sent_tokens:
  for word in sent:
    if word.text.lower() in word_freq.keys():
      if sent not in sent_score.keys():
        sent_score[sent] = word_freq[word.text.lower()]
      else:
        sent_score[sent] += word_freq[word.text.lower()]

In [45]:
print(sent_score)

{The NLP and LLM technologies are central to the analysis and generation of human language on a large scale.: 3.0, With their growing prevalence, distinguishing between LLM vs NLP becomes increasingly important.

: 1.2, NLP encompasses a suite of algorithms to understand, manipulate, and generate human language.: 2.8, Since its inception in the 1950s, NLP has evolved to analyze textual relationships.: 1.2, It uses part-of-speech tagging, named entity recognition, and sentiment analysis methods.

: 2.2, As exemplified by OpenAI’s ChatGPT, LLMs leverage deep learning to train on extensive text sets.: 1.9999999999999998, Although they can mimic human-like text, their comprehension of language’s nuances is limited.: 3.0000000000000004, Unlike NLP, which focuses on language analysis, LLMs primarily generate text.: 2.8000000000000003}


Select 30% sentences with max score

In [48]:
from heapq import nlargest


In [49]:
len(sent_score) * 0.3

2.4

In [50]:
summary = nlargest(n = 3, iterable = sent_score, key = sent_score.get)

In [51]:
print(summary)

[Although they can mimic human-like text, their comprehension of language’s nuances is limited., The NLP and LLM technologies are central to the analysis and generation of human language on a large scale., Unlike NLP, which focuses on language analysis, LLMs primarily generate text.]


In [52]:
final_summary = [word.text for word in summary]

In [53]:
final_summary = " ".join(final_summary)
print(final_summary)

Although they can mimic human-like text, their comprehension of language’s nuances is limited. The NLP and LLM technologies are central to the analysis and generation of human language on a large scale. Unlike NLP, which focuses on language analysis, LLMs primarily generate text.


In [54]:
len(final_summary)

280