In [4]:
import re
import pandas as pd
import numpy as np
import warnings 
warnings.filterwarnings('ignore')

import nltk
import heapq

In [15]:
#Downloading the NLTK data this notebook relies on: the 'stopwords' corpus and the
#'punkt' tokenizer models. Packages that are already present are only checked, not re-downloaded.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
#Reading the file using pandas.
#The workbook location lives in one named constant so it can be changed in a single place.
#NOTE(review): the path points into a mounted Google Drive folder; the mount step is not
#visible in this notebook - confirm it runs before this cell.
DATA_PATH = '/content/drive/My Drive/TASK.xlsx'
df = pd.read_excel(DATA_PATH)

In [6]:
#Previewing the first rows to see which columns actually hold data.
df.head()

Unnamed: 0.1,Unnamed: 0,Intoduction,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9
0,,Acnesol Gel is an antibiotic that fights bacte...,,,,,,,,
1,,Ambrodil Syrup is used for treating various re...,,,,,,,,
2,,Augmentin 625 Duo Tablet is a penicillin-type ...,,,,,,,,
3,,Azithral 500 Tablet is an antibiotic used to t...,,,,,,,,
4,,Alkasol Oral Solution is a medicine used in th...,,,,,,,,


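Here we can see that only the 'Intoduction' column (spelled this way in the spreadsheet) contains data; every other column is empty in the preview. We will therefore keep only that column and remove all the others.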

In [7]:
#Selecting only the needed column.
#Selecting by name (instead of dropping a hard-coded list of every other column) keeps this
#cell re-runnable: dropping columns that are already gone raises a KeyError on a second run.
#.copy() gives an independent frame, so adding columns to it later is unambiguous.
df = df[['Intoduction']].copy()

In [8]:
#Checking that only the 'Intoduction' column is left.
df.head()

Unnamed: 0,Intoduction
0,Acnesol Gel is an antibiotic that fights bacte...
1,Ambrodil Syrup is used for treating various re...
2,Augmentin 625 Duo Tablet is a penicillin-type ...
3,Azithral 500 Tablet is an antibiotic used to t...
4,Alkasol Oral Solution is a medicine used in th...


In [21]:
#English stopword list from the NLTK corpus; get_summary leaves these words out
#when it counts word frequencies.
stopwords = nltk.corpus.stopwords.words('english')

def get_summary(text, max_len_of_sent=30, total_number_of_sentences=4):
    '''
    This function will return the extractive summary of a given text. Extractive
    summarization means getting summary using the existing text only and not generating
    any new sentence.

    Every non-stopword gets a weight equal to its count divided by the count of the most
    frequent word. A sentence's score is the sum of the weights of its words, and the
    highest scoring sentences form the summary.
    --------------------------------------------------------------
    text: The document/text for which we need a summary. Anything that is not a
        non-blank string (None, NaN coming from an empty cell, '', whitespace) gives ''.

    max_len_of_sent: Sentences with this many or more space-separated words are never
        selected. Default = 30.

    total_number_of_sentences: Maximum number of sentences in the summary. Default = 4

    Returns: The selected sentences joined by one space, highest score first (this is
        score order, not the order in which they appear in the text). '' is returned
        when the text contains nothing that can be scored.

    NOTE(review): sentence boundaries are whatever nltk.sent_tokenize returns for the raw
    text, so sentences that run together without a space after the full stop may be
    treated (and counted) as a single sentence.
    '''

    #Missing cells arrive from pandas as NaN (a float) and blank text has no sentences;
    #both used to raise, now they simply have an empty summary.
    if not isinstance(text, str) or not text.strip():
        return ''

    #Removing any special characters and digits.
    preprocessed_text = re.sub(r'[^a-zA-Z]', ' ', text)
    #Replacing runs of whitespace with one space.
    preprocessed_text = re.sub(r'\s+', ' ', preprocessed_text)
    #Lower casing the text.
    preprocessed_text = preprocessed_text.lower()

    #`stopwords` is the module-level list; a set makes every lookup below constant time.
    stopword_set = set(stopwords)

    #Dictionary to store the frequency of every non-stopword.
    word_freqs = dict()
    for word in nltk.word_tokenize(preprocessed_text):
        if word not in stopword_set:
            word_freqs[word] = word_freqs.get(word, 0) + 1

    #Text made only of stopwords/digits/punctuation: nothing to score (max() would raise).
    if not word_freqs:
        return ''

    #Getting score for each word by dividing its count by the maximum count of a word.
    max_freq = max(word_freqs.values())
    word_freqs = {word: count / max_freq for word, count in word_freqs.items()}

    #Dictionary to store score of every sentence; insertion order is document order,
    #which is what breaks ties between equally scored sentences.
    sent_scores = dict()
    for sent in nltk.sent_tokenize(text):
        #The length limit does not depend on the word, so it is tested once per sentence.
        if len(sent.split(' ')) >= max_len_of_sent:
            continue
        for word in nltk.word_tokenize(sent.lower()):
            #Only scored words contribute; a sentence without any never becomes a candidate.
            if word in word_freqs:
                sent_scores[sent] = sent_scores.get(sent, 0) + word_freqs[word]

    #Selecting the requested number of sentences with highest scores.
    summary_sentences = heapq.nlargest(total_number_of_sentences, sent_scores, key=sent_scores.get)

    #Joining all the sentences to get a summary of a text.
    return ' '.join(summary_sentences)

In [22]:
#Building the 'Summary' column: get_summary is called once for every introduction text.
df['Summary'] = df['Intoduction'].apply(get_summary)
df.head()

Unnamed: 0,Intoduction,Summary
0,Acnesol Gel is an antibiotic that fights bacte...,Ask your doctor when you should stop treatment...
1,Ambrodil Syrup is used for treating various re...,It is advised not to use it for more than 14 d...
2,Augmentin 625 Duo Tablet is a penicillin-type ...,You should also let your healthcare team know ...
3,Azithral 500 Tablet is an antibiotic used to t...,Stopping the medicine too early may lead to th...
4,Alkasol Oral Solution is a medicine used in th...,"If you are pregnant or breastfeeding, consult ..."


In [23]:
#Showing the full summary text of the row labelled 1.
df.loc[1, 'Summary']

"It is advised not to use it for more than 14 days without doctor consultation.The most common side effects of this medicine include vomiting, nausea, and stomach upset. Your doctor should also know about all other medicines you are taking as many of these may make this medicine less effective or change the way it works. You must take doctor's advice before using this medicine if you are pregnant or breastfeeding. It works by thinning and loosens mucus in the nose, windpipe and lungs and make it easier to cough out.Ambrodil Syrup should be taken with food."