In [1]:
### DS Task 2
## DS1 Building a summarizer

In [2]:
import pandas as pd
import numpy as np

In [3]:
## General pre-processing of the data

In [4]:
paragraphs = pd.read_excel('TASK.xlsx', headers=True) #load the dataset
paragraphs['introduction'] = paragraphs['Unnamed: 1'] #cleanup columns
paragraphs = paragraphs.drop(columns=['TEST DATASET','Unnamed: 1'],axis=1)
paragraphs = paragraphs.drop(paragraphs.index[0])#cleanup rows
new_index = list(range(0,len(paragraphs)))#reorder index
paragraphs['index'] = new_index
paragraphs = paragraphs.set_index('index')
print(len(paragraphs))
paragraphs.head()

1000


Unnamed: 0_level_0,introduction
index,Unnamed: 1_level_1
0,Acnesol Gel is an antibiotic that fights bacte...
1,Ambrodil Syrup is used for treating various re...
2,Augmentin 625 Duo Tablet is a penicillin-type ...
3,Azithral 500 Tablet is an antibiotic used to t...
4,Alkasol Oral Solution is a medicine used in th...


In [5]:
##Solution 1: Simple summary using Gensim Summarization
#I've decided to use Gensim because of it's efficiency and 
#because it's summarizer allows for user adjusted summary length as required.

In [6]:
from gensim.summarization.summarizer import summarize

#get input for the desired word count of the summary
sum_len=int(input('What is the desired word count of the summary? Ideal length is around 50 '))

#initialize variables
summary = [] # for storing summaries
error_val = 0 # for error values
error_index = []

#generating summaries using gensim summarizer
for i in range(0,len(paragraphs)):
    try:
        summary.append(summarize(paragraphs.introduction[i],ratio=0.2, word_count=sum_len, split=False))
    except:
        error_val = 1
        error_index.append(i)
        summary.append(paragraphs.introduction[i])

#displaying issues in data and the remedy
if error_val ==1:
    print('Summary for introduction at the following indexes is too short!')
    print(error_index)
    print('Adding introduction as is...')

#adding the summary to the main dataframe
paragraphs['summary'] = summary

#execute the following code to output the code as an excel file marked as 'output.xlsx'
#paragraphs.to_excel("output.xlsx")
paragraphs.head()

What is the desired word count of the summary? Ideal length is around 50 50
Summary for introduction at the following indexes is too short!
[79, 520, 750, 924]
Adding introduction as is...


Unnamed: 0_level_0,introduction,summary
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Acnesol Gel is an antibiotic that fights bacte...,This medicine works by attacking the bacteria ...
1,Ambrodil Syrup is used for treating various re...,It is advised not to use it for more than 14 d...
2,Augmentin 625 Duo Tablet is a penicillin-type ...,Augmentin 625 Duo Tablet is a penicillin-type ...
3,Azithral 500 Tablet is an antibiotic used to t...,Consult your doctor if you find these side eff...
4,Alkasol Oral Solution is a medicine used in th...,Alkasol Oral Solution is a medicine used in th...


In [7]:
##Solution 2: Implemented summarizer using RegEx and NLTK
#I've decided to use Regexp because it allows me to clean up the text and make the content uniform
# and I've used NLTK for the mutlitude of methods and corpuses which make the processing easy and fast.
#I also use heapq to simply take the sentences of importance according to user input

In [8]:
import re
import nltk
import heapq
#nltk.download('punkt')
#nltk.download('stopwords')

In [9]:
def summarize_this(text,sen_len):
    #cleaning the text using regexp
    clean = re.sub(r'\[[0-9]*\]', ' ', text)
    clean = re.sub(r'\s+', ' ', clean)
    # Removing special characters and digits
    cleaner = re.sub('[^a-zA-Z]', ' ', clean )
    cleanest = re.sub(r'\s+', ' ', cleaner)
    # tokenizing the introduction
    sentence_list = nltk.sent_tokenize(clean)
    stopwords = nltk.corpus.stopwords.words('english')
    # calculating word frequencies
    word_freq = {}
    for word in nltk.word_tokenize(cleanest):
        if word not in stopwords:
            if word not in word_freq.keys():
                word_freq[word] = 1
            else:
                word_freq[word] += 1
    max_freq = max(word_freq.values())
    for word in word_freq.keys():
        word_freq[word] = (word_freq[word]/max_freq)
    scores = {}
    for sent in sentence_list:
        for word in nltk.word_tokenize(sent.lower()):
            if word in word_freq.keys():
                if len(sent.split(' ')) < 30:
                    if sent not in scores.keys():
                        scores[sent] = word_freq[word]
                    else:
                        scores[sent] += word_freq[word]
    summary_sen = heapq.nlargest(sen_len, scores, key=scores.get)
    summary = ' '.join(summary_sen)
    return summary

In [10]:
sen_len = int(input('What is the desired number of sentences of each summary? Ideal length is 2: '))
#initalize variables
summary = []

#generating summaries using nltk
for i in paragraphs.introduction:
    summary.append(summarize_this(i,sen_len))

#adding the summary to the main dataframe
paragraphs['summary_nltk'] = summary

#execute the following code to output the code as an excel file marked as 'output.xlsx'
#paragraphs.to_excel("output.xlsx")
paragraphs.head()

What is the desired number of sentences of each summary? Ideal length is 2: 2


Unnamed: 0_level_0,introduction,summary,summary_nltk
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Acnesol Gel is an antibiotic that fights bacte...,This medicine works by attacking the bacteria ...,Ask your doctor when you should stop treatment...
1,Ambrodil Syrup is used for treating various re...,It is advised not to use it for more than 14 d...,It is advised not to use it for more than 14 d...
2,Augmentin 625 Duo Tablet is a penicillin-type ...,Augmentin 625 Duo Tablet is a penicillin-type ...,You should also let your healthcare team know ...
3,Azithral 500 Tablet is an antibiotic used to t...,Consult your doctor if you find these side eff...,Stopping the medicine too early may lead to th...
4,Alkasol Oral Solution is a medicine used in th...,Alkasol Oral Solution is a medicine used in th...,"If you are pregnant or breastfeeding, consult ..."
