<a href="https://colab.research.google.com/github/konductor000/Summarizer-bot/blob/main/summarizer_bot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install PyPDF2
!pip install validators
!pip install pytelegrambotapi
!pip install transformers
!pip install sentencepiece

In [2]:
import os
import tempfile
from urllib.request import urlopen

import PyPDF2
import telebot
import transformers
import validators
from bs4 import BeautifulSoup
from telebot import types
from transformers import BartTokenizer, TFBartForConditionalGeneration, pipeline, AlbertTokenizer, TFAlbertForQuestionAnswering

class Bot:
    """Telegram bot that summarizes text, files and web pages, then answers questions.

    The conversation is a small state machine stored in ``self.mode``:

    * ``'summarization'`` - the next text, URL or document is summarized and
      remembered in ``self.text``.
    * ``'read'`` - the bot waits for the keyboard choice ('Answer Question'
      or anything else, e.g. 'Back').
    * ``'question_answer'`` - the next message is treated as a question about
      ``self.text``.

    NOTE: ``self.mode`` and ``self.text`` are shared by every chat talking to
    this bot instance, so concurrent users will interfere with each other.

    Constructing an instance registers the handlers and immediately starts
    polling.
    """

    # Environment variable that may carry the bot token.
    TOKEN_ENV_VAR = 'TELEGRAM_BOT_TOKEN'
    # Upper bound, in seconds, for fetching a web page to summarize.
    URL_TIMEOUT_SECONDS = 30

    def __init__(self, token=None):
        """Create the bot, load the models, register handlers and start polling.

        Args:
            token: Telegram bot token. When omitted, the ``TELEGRAM_BOT_TOKEN``
                environment variable is used, then the legacy built-in token.
        """
        if token is None:
            token = os.environ.get(self.TOKEN_ENV_VAR)
        if not token:
            # NOTE(security): this token is committed to source control and must
            # be considered leaked. It is kept only so that `Bot()` keeps working
            # unchanged; revoke it via @BotFather and supply a fresh one through
            # the `token` argument or the TELEGRAM_BOT_TOKEN environment variable.
            token = '5891098732:AAEoWI3_Wby-Epx2HUNBTO5xN4xRyAK-Pco'

        self.bot = telebot.TeleBot(token)
        self.model = Model()
        self.mode = 'summarization'
        self.text = ''

        @self.bot.message_handler(commands=['start'])
        def _start(message):
            self.start(message)

        @self.bot.message_handler(content_types=['text'])
        def _get_text_messages(message):
            self.get_text_messages(message)

        @self.bot.message_handler(content_types=['document'])
        def _get_file(message):
            self.get_file(message)

        self.bot.polling(none_stop=True, interval=0)

    def start(self, message):
        """Greet the user in response to the /start command."""
        self.bot.send_message(message.chat.id, "👋 Hello! Send some text, txt, pdf file or url to summarize.")

    def get_text_messages(self, message):
        """Handle a plain text message according to the current mode.

        Args:
            message: Incoming telebot message whose ``text`` is either content
                to summarize, a URL, a keyboard choice or a question.
        """
        if self.mode == 'read':
            # Only the keyboard choice matters here, so do not fetch a URL the
            # user may have sent: the content would be discarded anyway.
            if message.text == 'Answer Question':
                self.mode = 'question_answer'
            else:
                self.mode = 'summarization'
            return

        text = message.text
        if validators.url(text):
            text = self.get_text_from_url(text)

        if self.mode == 'summarization':
            self._summarize_and_offer_question(message, text)
        else:
            # Leave question mode before calling the model so that a failure
            # there does not trap the conversation in 'question_answer'.
            self.mode = 'read'
            self.bot.send_message(message.chat.id, self.model.answer_question(self.text, text))

    def get_file(self, message):
        """Handle an uploaded document: extract its text and process it.

        The upload is stored in a private temporary directory instead of the
        working directory, because the file name is chosen by the sender and
        must not be able to overwrite (and then delete) local files.

        Args:
            message: Incoming telebot message carrying a ``document``.
        """
        file_info = self.bot.get_file(message.document.file_id)
        downloaded_file = self.bot.download_file(file_info.file_path)

        # Keep only the final path component; the extension is still needed to
        # pick the right extractor.
        safe_name = os.path.basename(message.document.file_name or '')
        if safe_name in ('', '.', '..'):
            safe_name = 'upload'

        with tempfile.TemporaryDirectory() as tmp_dir:
            local_path = os.path.join(tmp_dir, safe_name)
            with open(local_path, 'wb') as new_file:
                new_file.write(downloaded_file)
            text = self.get_text_from_file(local_path)

        if not text or not text.strip():
            self.bot.send_message(
                message.chat.id,
                "Sorry, I could not read any text from that file. Please send a txt or pdf file.",
            )
            return

        if self.mode == 'summarization':
            self._summarize_and_offer_question(message, text)
        else:
            self.mode = 'read'
            self.bot.send_message(message.chat.id, self.model.answer_question(self.text, text))
            self._offer_question(message)

    def _summarize_and_offer_question(self, message, text):
        """Summarize ``text``, remember it as context and show the keyboard."""
        if not text or not text.strip():
            self.bot.send_message(message.chat.id, "Sorry, I could not find any text to summarize.")
            return

        self.text = text
        self.bot.send_message(message.chat.id, self.model.summarize(text))
        self._offer_question(message)

    def _offer_question(self, message):
        """Show the 'Answer Question' / 'Back' keyboard and switch to 'read' mode."""
        markup = types.ReplyKeyboardMarkup(resize_keyboard=True)
        markup.add(types.KeyboardButton('Answer Question'), types.KeyboardButton('Back'))
        self.bot.send_message(message.chat.id, 'Would you like to ask a question?', reply_markup=markup)
        self.mode = 'read'

    def get_text_from_file(self, file_name):
        """Extract text from a local pdf or txt file.

        Args:
            file_name: Path of the file; the (case-insensitive) ending selects
                the extractor.

        Returns:
            For a pdf, the text of the first page only. For a txt file, the
            non-empty chunks (split on line breaks and double spaces) joined
            with '. '. An empty string for any other file type.
        """
        lowered = file_name.lower()

        if lowered.endswith('pdf'):
            with open(file_name, 'rb') as pdf_file:
                reader = PyPDF2.PdfReader(pdf_file)
                if len(reader.pages) == 0:
                    return ''
                return reader.pages[0].extract_text() or ''

        if lowered.endswith('txt'):
            # errors='replace' keeps a stray non-UTF-8 byte from aborting the request.
            with open(file_name, 'r', encoding='utf-8', errors='replace') as txt_file:
                chunks = (phrase.strip() for line in txt_file for phrase in line.split("  "))
                # The generator reads lazily, so join while the file is still open.
                return '. '.join(chunk for chunk in chunks if chunk)

        return ''

    def get_text_from_url(self, url):
        """Download a web page and return its visible text.

        Args:
            url: Address of the page. NOTE(security): it comes straight from a
                chat message, so the bot will fetch whatever host the sender names.

        Returns:
            The page text without script/style content, with non-empty chunks
            joined by '. '.
        """
        with urlopen(url, timeout=self.URL_TIMEOUT_SECONDS) as response:
            html = response.read()
        soup = BeautifulSoup(html, features="html.parser")

        # Script and style bodies are not readable content.
        for tag in soup(["script", "style"]):
            tag.extract()

        lines = (line.strip() for line in soup.get_text().splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        return '. '.join(chunk for chunk in chunks if chunk)

    
class Model:
    """Wraps the summarization and question-answering pipelines used by the bot."""

    def __init__(self, device=0):
        """Load both pipelines.

        Args:
            device: Device index handed to both ``transformers.pipeline`` calls.
                Defaults to 0, the value that was previously hard-coded; pass
                -1 to run on CPU (see the transformers pipeline documentation).
        """
        # NOTE(review): the weights come from 'facebook/bart-large' while the
        # tokenizer is named 'facebook/bart-large-cnn'. The mismatch looks
        # unintentional - confirm which checkpoint is meant for summarization.
        mname = 'facebook/bart-large'
        summarization_model = TFBartForConditionalGeneration.from_pretrained(mname)
        self.summarizer = transformers.pipeline(
            "summarization",
            model=summarization_model,
            tokenizer="facebook/bart-large-cnn",
            device=device,
        )

        qa_name = "vumichien/albert-base-v2-squad2"
        tokenizer = AlbertTokenizer.from_pretrained(qa_name)
        qa_model = TFAlbertForQuestionAnswering.from_pretrained(qa_name)
        self.question_answerer = transformers.pipeline(
            "question-answering",
            model=qa_model,
            tokenizer=tokenizer,
            device=device,
        )

    def summarize(self, text):
        """Return a summary of ``text``.

        The requested summary length scales with the input: at least 75 and at
        most 110 tokens for short inputs, otherwise one tenth to one fifth of
        the input's word count.

        Args:
            text: The text to summarize.

        Returns:
            The generated summary string.
        """
        word_count = len(text.split())
        min_length = max(75, word_count // 10)
        max_length = max(110, word_count // 5)

        # truncation=True asks the tokenizer to cut over-long inputs down to the
        # model's maximum input length instead of passing them through whole.
        result = self.summarizer(
            text,
            min_length=min_length,
            max_length=max_length,
            do_sample=False,
            truncation=True,
        )
        return result[0]['summary_text']

    def answer_question(self, text, question):
        """Answer ``question`` using ``text`` as the context.

        Args:
            text: Context passage to search for the answer.
            question: The question to answer.

        Returns:
            The answer string produced by the question-answering pipeline.
        """
        return self.question_answerer(question=question, context=text)['answer']

In [None]:
# Constructing Bot registers the message handlers and starts polling from
# inside __init__, so this cell runs the bot (presumably until interrupted).
Bot()