In [63]:
text="""
Natural language processing (NLP) is a field that focuses on making natural human language usable by computer programs. NLTK, or Natural Language Toolkit, is a Python package that you can use for NLP.
A lot of the data that you could be analyzing is unstructured data and contains human-readable text. Before you can analyze that data programmatically, you first need to preprocess it. In this tutorial, you’ll take your first look at the kinds of text preprocessing tasks you can do with NLTK so that you’ll be ready to apply them in future projects. You’ll also see how to do some basic text analysis and create visualizations.
If you’re familiar with the basics of using Python and would like to get your feet wet with some NLP, then you’ve come to the right place.
"""
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation



In [64]:
stopwords=list(STOP_WORDS)
print(stopwords)

['by', 'within', 'yours', 'yet', 'twenty', '’re', 'we', 'done', 'n’t', 'least', 'almost', 'either', 'whereafter', 'do', 'without', 'hereby', "'d", 'five', 'else', 'thereupon', 'something', 'whoever', 'nowhere', 'them', 'few', 'his', 'full', 'unless', 'meanwhile', 'did', 'thus', 'everyone', 'toward', 'often', '‘re', 'name', 'here', 'anywhere', 'through', 'thereby', 'three', 'until', 'otherwise', 'any', 'are', 'what', 'whereby', 'off', 'anything', 'empty', 'of', 'each', 'quite', 'should', 'take', 'will', 'bottom', 'becoming', 'less', 'those', 'thru', 'and', 'front', 'could', 'himself', 'am', 'sixty', 'as', 'became', 'can', 'anyway', 'seems', 'hundred', 'all', 'mine', 'not', 'me', "'ve", 'enough', 'show', 'since', 'towards', 'elsewhere', 'or', 'hence', 'already', 'two', 'may', 'might', 'around', 'whereupon', 'the', 'during', 'n‘t', 'latter', "'s", 'noone', 'several', '‘s', 'cannot', 'beforehand', '‘d', 'used', 'former', 'into', 'how', 'out', 'fifty', 'beyond', 'with', 'many', 'although', 

In [65]:
nlp=spacy.load('en_core_web_sm')
doc=nlp(text) #we get tokenised word in doc
# now make list of tokens
tokens=[token.text for token in doc]
print(tokens)#punctuations and stop words are also a part of token

['\n', 'Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'field', 'that', 'focuses', 'on', 'making', 'natural', 'human', 'language', 'usable', 'by', 'computer', 'programs', '.', 'NLTK', ',', 'or', 'Natural', 'Language', 'Toolkit', ',', 'is', 'a', 'Python', 'package', 'that', 'you', 'can', 'use', 'for', 'NLP', '.', '\n', 'A', 'lot', 'of', 'the', 'data', 'that', 'you', 'could', 'be', 'analyzing', 'is', 'unstructured', 'data', 'and', 'contains', 'human', '-', 'readable', 'text', '.', 'Before', 'you', 'can', 'analyze', 'that', 'data', 'programmatically', ',', 'you', 'first', 'need', 'to', 'preprocess', 'it', '.', 'In', 'this', 'tutorial', ',', 'you', '’ll', 'take', 'your', 'first', 'look', 'at', 'the', 'kinds', 'of', 'text', 'preprocessing', 'tasks', 'you', 'can', 'do', 'with', 'NLTK', 'so', 'that', 'you', '’ll', 'be', 'ready', 'to', 'apply', 'them', 'in', 'future', 'projects', '.', 'You', '’ll', 'also', 'see', 'how', 'to', 'do', 'some', 'basic', 'text', 'analysis', 'and', '

In [66]:
punctuations=punctuation +'\n'
punctuations

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\n'

In [67]:
word_frequencies = {}
for word in doc:
    if word.text.lower() not in stopwords:
        if word.text.lower() not in punctuation:
            if word.text not in word_frequencies.keys():
                word_frequencies[word.text] = 1 
            else:
                word_frequencies[word.text] += 1    
           



In [68]:
print(word_frequencies)
# find the frequency of filtered words(no stopwords no punctuation)


{'\n': 4, 'Natural': 2, 'language': 2, 'processing': 1, 'NLP': 3, 'field': 1, 'focuses': 1, 'making': 1, 'natural': 1, 'human': 2, 'usable': 1, 'computer': 1, 'programs': 1, 'NLTK': 2, 'Language': 1, 'Toolkit': 1, 'Python': 2, 'package': 1, 'use': 1, 'lot': 1, 'data': 3, 'analyzing': 1, 'unstructured': 1, 'contains': 1, 'readable': 1, 'text': 3, 'analyze': 1, 'programmatically': 1, 'need': 1, 'preprocess': 1, 'tutorial': 1, 'look': 1, 'kinds': 1, 'preprocessing': 1, 'tasks': 1, 'ready': 1, 'apply': 1, 'future': 1, 'projects': 1, 'basic': 1, 'analysis': 1, 'create': 1, 'visualizations': 1, 'familiar': 1, 'basics': 1, 'like': 1, 'feet': 1, 'wet': 1, 'come': 1, 'right': 1, 'place': 1}


In [69]:
max_frequency=max(word_frequencies.values())
max_frequency

4

In [70]:
# now divide the word frequencies by 4 so that the normalised frequencies can be achieved(4/4 =1 which is a maximum normalised frequency
# )
for word in word_frequencies.keys():
    word_frequencies[word]=word_frequencies[word]/max_frequency
print(word_frequencies)



{'\n': 1.0, 'Natural': 0.5, 'language': 0.5, 'processing': 0.25, 'NLP': 0.75, 'field': 0.25, 'focuses': 0.25, 'making': 0.25, 'natural': 0.25, 'human': 0.5, 'usable': 0.25, 'computer': 0.25, 'programs': 0.25, 'NLTK': 0.5, 'Language': 0.25, 'Toolkit': 0.25, 'Python': 0.5, 'package': 0.25, 'use': 0.25, 'lot': 0.25, 'data': 0.75, 'analyzing': 0.25, 'unstructured': 0.25, 'contains': 0.25, 'readable': 0.25, 'text': 0.75, 'analyze': 0.25, 'programmatically': 0.25, 'need': 0.25, 'preprocess': 0.25, 'tutorial': 0.25, 'look': 0.25, 'kinds': 0.25, 'preprocessing': 0.25, 'tasks': 0.25, 'ready': 0.25, 'apply': 0.25, 'future': 0.25, 'projects': 0.25, 'basic': 0.25, 'analysis': 0.25, 'create': 0.25, 'visualizations': 0.25, 'familiar': 0.25, 'basics': 0.25, 'like': 0.25, 'feet': 0.25, 'wet': 0.25, 'come': 0.25, 'right': 0.25, 'place': 0.25}


In [71]:
sentence_tokens=[sent for sent in doc.sents]
print(sentence_tokens)
# phle hmne normalization se ye nikala ki konsa word imp h 
# phir ab hum nikal rhe h konsa sentence sbse imp h

[
, Natural language processing (NLP) is a field that focuses on making natural human language usable by computer programs., NLTK, or Natural Language Toolkit, is a Python package that you can use for NLP.
, A lot of the data that you could be analyzing is unstructured data and contains human-readable text., Before you can analyze that data programmatically, you first need to preprocess it., In this tutorial, you’ll take your first look at the kinds of text preprocessing tasks you can do with NLTK so that you’ll be ready to apply them in future projects., You’ll also see how to do some basic text analysis and create visualizations.
, If you’re familiar with the basics of using Python and would like to get your feet wet with some NLP, then you’ve come to the right place.
]


In [72]:
sentence_scores={}
for sent in sentence_tokens:
    for word in sent:
        if word.text.lower() in  word_frequencies.keys():
            #now we are adding the normalised word frequency(normalized by dividing max_frequency)of each word in a sentence and the word which have highest addition is the imp sentence
            if sent not in sentence_scores.keys():
                sentence_scores[sent]=word_frequencies[word.text.lower()]
            else:
                sentence_scores[sent]+=word_frequencies[word.text.lower()]

In [73]:
sentence_scores

{: 1.0,
 Natural language processing (NLP) is a field that focuses on making natural human language usable by computer programs.: 3.75,
 NLTK, or Natural Language Toolkit, is a Python package that you can use for NLP.: 2.25,
 A lot of the data that you could be analyzing is unstructured data and contains human-readable text.: 4.0,
 Before you can analyze that data programmatically, you first need to preprocess it.: 1.75,
 In this tutorial, you’ll take your first look at the kinds of text preprocessing tasks you can do with NLTK so that you’ll be ready to apply them in future projects.: 3.0,
 You’ll also see how to do some basic text analysis and create visualizations.: 2.75,
 If you’re familiar with the basics of using Python and would like to get your feet wet with some NLP, then you’ve come to the right place.: 3.0}

In [74]:
from heapq import nlargest
select_length=int(len(sentence_tokens)*0.3)
select_length

2

In [75]:
summary=nlargest(select_length,sentence_scores,key=sentence_scores.get)

In [76]:
summary

[A lot of the data that you could be analyzing is unstructured data and contains human-readable text.,
 Natural language processing (NLP) is a field that focuses on making natural human language usable by computer programs.]

In [77]:
final_summary=[word.text for word in summary]
summary=' '.join(final_summary)
summary

'A lot of the data that you could be analyzing is unstructured data and contains human-readable text. Natural language processing (NLP) is a field that focuses on making natural human language usable by computer programs.'