In [1]:
import os
import sys
import tempfile
import logging
import traceback
import json
import html
import re
from io import StringIO
from urllib.parse import urlparse, parse_qs
from pathlib import Path
import pickle

# import openai
from pytube import YouTube
from telegram import Update, LabeledPrice, Bot
from telegram.ext import MessageHandler, filters, ContextTypes, CallbackContext, CommandHandler, Updater
from telegram.constants import ParseMode
from telegram.error import BadRequest

from crabnlp.youtube import download_captions_and_meta, is_youtube, get_yt_id_from_url
from crabnlp.summarize import summarize_by_chunk
from crabnlp.commons import is_notebook
# from crabnlp.whisperapi import transcribe

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [2]:
# import sys
# !{sys.executable} -m pip install pytube

In [3]:
# Telegram chat id that error_handler sends crash reports to.
DEVELOPER_CHAT_ID = 87799679
# False when is_notebook() reports a notebook session; used below to pick the
# environment variable the bot token is read from.
IS_PROD = not is_notebook()

# Directory where downloaded audio, pickled transcriptions, caption metadata
# and cached summaries are stored.
SAVE_PATH = Path("/mnt/storage/newsboy/")
# Threshold used by send_message: a text rejected by Telegram is split and
# retried only when it is at least this many characters long.
MESSAGE_MAX_LEN = 1000

In [6]:
# The bot token comes from the environment: TRANSCRABER when IS_PROD is true,
# TRANSCRABER_DEV otherwise. A missing variable raises KeyError here.
tg_bot_token = os.environ['TRANSCRABER' + ('' if IS_PROD else '_DEV')]
bot = Bot(tg_bot_token)
# NOTE(review): the recorded output of this cell is
#   TypeError: Updater.__init__() missing 1 required positional argument: 'update_queue'
# so this call does not match the installed python-telegram-bot API. The rest
# of the notebook relies on `updater.dispatcher`, `updater.idle()`, `Filters`
# and synchronous callbacks, while the imports cell pulls in `filters` and
# `telegram.constants.ParseMode` -- presumably a half-finished migration
# between major versions; confirm the target version before changing this.
updater = Updater(bot)
# Every handler in the cells below is registered on this dispatcher.
dispatcher = updater.dispatcher

TypeError: Updater.__init__() missing 1 required positional argument: 'update_queue'

In [5]:
def _get_file_logger(name, level, filename):
    """Return the logger ``name`` set to ``level`` and writing to ``filename``.

    Records are written as the bare message (``%(message)s``), so a caller
    that logs ``json.dumps(...)`` output gets one JSON document per line.

    The function is idempotent: if the logger already has a ``FileHandler``
    for the same file (for example because this notebook cell was re-run in
    the same kernel), no second handler is added. Without that check every
    re-run would attach another handler and each record would be written
    several times.
    """
    logger = logging.getLogger(name)
    logger.setLevel(level)

    # FileHandler stores the absolute path of its target in `baseFilename`.
    target = os.path.abspath(filename)
    already_attached = any(
        isinstance(handler, logging.FileHandler) and handler.baseFilename == target
        for handler in logger.handlers
    )
    if not already_attached:
        file_h = logging.FileHandler(filename)
        file_h.setFormatter(logging.Formatter(fmt='%(message)s'))
        logger.addHandler(file_h)
    return logger


# Incoming updates handled by the bot (INFO and above) go to updates.json.
updates_logger = _get_file_logger('Updates', logging.INFO, "updates.json")

# Failures reported by error_handler (WARNING and above) go to errors.json.
errors_logger = _get_file_logger('Error', logging.WARNING, "errors.json")

In [6]:
def chop(text, max_size):
    """Yield consecutive slices of ``text`` that are at most ``max_size`` long.

    The slices cover ``text`` in order without overlap; only the last one may
    be shorter than ``max_size``. An empty ``text`` yields nothing.

    Raises:
        ValueError: if ``max_size`` is not positive. A non-positive size would
            never advance through the text. Because this is a generator, the
            error surfaces when iteration starts, not when ``chop`` is called.
    """
    if max_size <= 0:
        raise ValueError(f"max_size must be positive, got {max_size}")
    for start in range(0, len(text), max_size):
        yield text[start:start + max_size]


assert list(chop("1234567890", 3)) == ['123', '456', '789', '0']

In [7]:
def _clip(text, limit, keep_tail=False):
    """Shorten ``text`` to at most ``limit`` characters, marking the cut with '...'.

    With ``keep_tail`` the end of the text is kept instead of the beginning.
    """
    if len(text) <= limit:
        return text
    if keep_tail:
        return '...' + text[-(limit - 3):]
    return text[:limit - 3] + '...'


def error_handler(update: object, context) -> None:
    """Log the error and send a telegram message to notify the developer.

    The failure is recorded three times: on the root logger, as one JSON line
    on ``errors_logger``, and as an HTML message sent to ``DEVELOPER_CHAT_ID``.

    Args:
        update: the update being processed when the error occurred. Anything
            that is not a telegram ``Update`` is reported through ``str()``.
        context: callback context; ``context.error`` holds the exception and
            ``context.bot`` is used to send the notification.
    """
    # Log the error before we do anything else, so we can see it even if something breaks.
    logging.error(msg="Exception while handling an update:", exc_info=context.error, stack_info=True)

    # traceback.format_exception returns the usual python message about an exception, but as a
    # list of strings rather than a single string, so we have to join them together.
    tb_list = traceback.format_exception(None, context.error, context.error.__traceback__)
    tb_string = "".join(tb_list)

    update_dict = update.to_dict() if isinstance(update, Update) else {'update': str(update)}
    # An exception object is not JSON serializable, so it is stored as its repr;
    # `default=str` covers any other non-serializable value kept in chat/user data.
    errors_logger.error(json.dumps({
        'update': update_dict,
        'chat_data': context.chat_data,
        'user_data': context.user_data,
        'traceback': tb_string,
        'error': repr(context.error)}, default=str))

    # A Telegram message is limited to 4096 characters, so every section gets a
    # budget (about 3900 characters of payload in total, plus the labels). The
    # end of the traceback is kept because that is where the raising frame is.
    update_text = _clip(json.dumps(update_dict, indent=2, ensure_ascii=False, default=str), 1500)
    chat_text = _clip(str(context.chat_data), 300)
    user_text = _clip(str(context.user_data), 300)
    tb_text = _clip(tb_string, 1800, keep_tail=True)

    # Clipping happens before escaping so that no HTML entity is cut in half.
    message = (
        f"An exception was raised while handling an update\n"
        f"<pre>update = {html.escape(update_text)}"
        "</pre>\n\n"
        f"<pre>context.chat_data = {html.escape(chat_text)}</pre>\n\n"
        f"<pre>context.user_data = {html.escape(user_text)}</pre>\n\n"
        f"<pre>{html.escape(tb_text)}</pre>"
    )
    # `ParseMode` is the name imported at the top of the notebook; the
    # previously used `PARSEMODE_HTML` was never defined.
    context.bot.send_message(
        chat_id=DEVELOPER_CHAT_ID, text=message, parse_mode=ParseMode.HTML
    )
dispatcher.add_error_handler(error_handler)

In [8]:
# def gpt_complete(prompt):
#     response = openai.Completion.create(
#         model="text-davinci-003",
#         prompt=str(prompt),
#         temperature=0.0,
#         max_tokens=256,
#         top_p=1,
#         frequency_penalty=0,
#         presence_penalty=0
#     )
#     return response["choices"][0]["text"]

# def is_gpt_called(text):
#     """Returns return code and prompt"""
#     COMPLETE_REGEX = r"^\W*complete\W+"
#     if re.match(COMPLETE_REGEX, text, flags=re.I):
#         return True, re.sub(COMPLETE_REGEX, '', text, flags=re.I)
#     else:
#         return False, text


# assert is_gpt_called(" Complete! Who is the president of Brazil?") == (True, "Who is the president of Brazil?")
# assert is_gpt_called("Complete... 123") == (True, "123")
# assert is_gpt_called("Some text") == (False, "Some text")

In [9]:
def send_message(context: CallbackContext,
                 text,
                 temporary_fn=None,
                 **send_message_args):
    """Send ``text`` through ``context.bot``, splitting it if Telegram rejects it.

    The stripped text is first sent as a single message. If that raises
    ``BadRequest`` and the text is at least ``MESSAGE_MAX_LEN`` characters
    long, it is cut roughly in half and each part is sent recursively. A
    shorter text cannot be blamed on its length, so the error is re-raised.

    Args:
        context: callback context whose ``bot`` sends the message.
        text: message text; surrounding whitespace is removed and an empty
            result is silently skipped.
        temporary_fn: accepted for compatibility with existing callers; not
            used by this implementation (an earlier draft used it as the file
            name when sending very long texts as a document).
        **send_message_args: forwarded unchanged to ``bot.send_message``
            (``chat_id``, ``reply_to_message_id``, ...).

    Raises:
        BadRequest: when a text shorter than ``MESSAGE_MAX_LEN`` is rejected.
    """
    text = text.strip()
    if not text:
        return
    try:
        context.bot.send_message(text=text, **send_message_args)
    except BadRequest:
        n = len(text)
        if n < MESSAGE_MAX_LEN:
            raise

        half = n // 2
        # Prefer to cut at the last space/newline before the midpoint so words
        # are not broken in two. A break point in the first quarter would make
        # the parts too uneven, so fall back to the exact middle in that case.
        cut = max(text.rfind(' ', 0, half + 1), text.rfind('\n', 0, half + 1))
        if cut < half // 2:
            cut = half
        send_message(context, text=text[:cut], **send_message_args)
        send_message(context, text=text[cut:], **send_message_args)

In [10]:
def transcribe_media(update: Update, context: CallbackContext):
    """Transcribe a voice message or video note and reply with the text.

    The attachment is downloaded into a temporary directory (voice as
    ``<file_id>.ogg``, video note as ``<file_id>.mp4``), transcribed, and the
    ``'text'`` field of the result is sent as a silent reply to the original
    message. The temporary directory and the downloaded file are removed when
    the handler finishes, including when it fails.

    Raises:
        RuntimeError: if the message carries neither a voice message nor a
            video note.
    """
    log = dict(update.to_dict())
    log['function'] = sys._getframe(0).f_code.co_name
    updates_logger.info(json.dumps(log))

    if update.message.voice:
        attachment, suffix = update.message.voice, '.ogg'
    elif update.message.video_note:
        attachment, suffix = update.message.video_note, '.mp4'
    else:
        raise RuntimeError('No voice nor video note attached')

    # A temporary directory (rather than a renamed NamedTemporaryFile) keeps
    # the file-id based name with its extension and guarantees cleanup.
    with tempfile.TemporaryDirectory() as tmp_dir:
        media_path = str(Path(tmp_dir) / f'{attachment.file_id}{suffix}')
        new_file = context.bot.get_file(attachment.file_id)
        new_file.download(media_path)

        # NOTE(review): `transcribe` is not defined in the cells visible here;
        # its import from crabnlp.whisperapi is commented out in the imports cell.
        tr = transcribe(media_path)
        answer = tr['text']
        # is_gpt, prompt = is_gpt_called(answer)
        # if is_gpt:
        #     answer = gpt_complete(prompt)

        send_message(context,
                     text=answer,
                     temporary_fn=media_path + '.txt',
                     chat_id=update.effective_chat.id,
                     reply_to_message_id=update.message.message_id,
                     disable_notification=True)


# NOTE(review): only the lowercase `filters` module is imported in this
# notebook, so `Filters` is unresolved here; the right spelling depends on the
# python-telegram-bot version being targeted -- confirm before changing.
dispatcher.add_handler(MessageHandler(Filters.voice | Filters.video_note, transcribe_media))

In [11]:
def download_youtube_audio(url):
    """Download the highest-bitrate audio-only stream of a YouTube video.

    The file is stored in ``SAVE_PATH`` as ``<video_id>.<ext>``, where the
    extension is taken from the stream's default file name. An existing file
    with that name is not downloaded again.

    Args:
        url: a YouTube URL, either with a ``v=`` query parameter or with the
            video id as the last path segment.

    Returns:
        A ``(path, video_id)`` tuple: the ``Path`` of the audio file and the
        video id extracted from ``url``.

    Raises:
        AssertionError: if ``is_youtube(url)`` is false.
        ValueError: if no video id can be extracted or the video offers no
            audio-only stream.
    """
    assert is_youtube(url), f"Not a YouTube URL: {url!r}"

    parsed_url = urlparse(url)
    query = parse_qs(parsed_url.query)
    if 'v' in query:
        video_id = query['v'][0]
    else:
        # Use the URL *path* only: taking the last segment of the raw URL would
        # keep a trailing "?t=..." query in the id (and so in the file name),
        # and a trailing slash would produce an empty id.
        video_id = parsed_url.path.rstrip('/').rsplit('/', 1)[-1]
    if not video_id:
        raise ValueError(f"Cannot extract a video id from {url!r}")

    yt = YouTube(url)
    audio_streams = list(yt.streams.filter(only_audio=True))
    if not audio_streams:
        raise ValueError(f"No audio-only stream available for {url!r}")
    # A stream without a known bitrate sorts last instead of breaking max().
    stream = max(audio_streams, key=lambda s: s.bitrate or 0)
    filename = video_id + '.' + stream.default_filename.rsplit('.', 1)[-1]
    stream.download(output_path=SAVE_PATH, filename=filename, skip_existing=True)

    return (SAVE_PATH / filename), video_id

In [12]:
def transcribe_and_download_audio_youtube(update: Update, context: CallbackContext):
    """Handle ``/yt <url>``: download the audio, send it back and transcribe it.

    Steps: download the audio with ``download_youtube_audio``, report its size,
    upload the file itself when it is smaller than 50 MB, transcribe it, pickle
    the full transcription to ``SAVE_PATH/<video_id>.pkl`` and reply with the
    transcription text. When the command is issued without an argument a usage
    hint is sent instead.
    """
    log = dict(update.to_dict())
    log['function'] = sys._getframe(0).f_code.co_name
    updates_logger.info(json.dumps(log))

    # `/yt` without a URL used to fail with IndexError on context.args[0].
    if not context.args:
        context.bot.send_message(
                chat_id=update.effective_chat.id,
                text="Usage: /yt <YouTube URL>",
                reply_to_message_id=update.message.message_id)
        return

    url = context.args[0]
    filename, video_id = download_youtube_audio(url)

    size_mb = filename.lstat().st_size / 1024 / 1024
    context.bot.send_message(
            chat_id=update.effective_chat.id,
            text=f"Downloaded {size_mb:.1f}MB",
            reply_to_message_id=update.message.message_id)
    if size_mb < 50:
        # Open the file in a `with` block so the handle is closed after upload.
        with filename.open('rb') as audio_file:
            context.bot.send_audio(
                    chat_id=update.effective_chat.id,
                    audio=audio_file,
                    reply_to_message_id=update.message.message_id)

    # NOTE(review): `transcribe` is not defined in the cells visible here; its
    # import from crabnlp.whisperapi is commented out in the imports cell.
    transcription = transcribe(str(filename))
    with (SAVE_PATH / (video_id + '.' + 'pkl')).open('wb') as f:
        pickle.dump(transcription, f)

    send_message(context, transcription['text'],
                 temporary_fn=video_id + '.txt',
                 chat_id=update.effective_chat.id,
                 reply_to_message_id=update.message.message_id)


dispatcher.add_handler(CommandHandler("yt", transcribe_and_download_audio_youtube))

In [None]:
# The payment provider token is a secret, so it is read from the environment
# instead of being written into the notebook.
# NOTE(review): a live provider token was previously hard-coded in this cell;
# it should be treated as leaked and re-issued.
PAYMENTS_TOKEN = os.environ.get('TRANSCRABER_PAYMENTS_TOKEN')
# TODO(review): placeholder amount (100.00 RUB expressed in kopecks) -- the
# original draft of this cell never specified a price; confirm before enabling.
PRICES = [LabeledPrice(label="Поддержка", amount=100 * 100)]


def donate(update: Update, context: CallbackContext):
    """Send a donation invoice to the chat that issued ``/donate``.

    Raises:
        RuntimeError: if no payment provider token is configured.
    """
    if not PAYMENTS_TOKEN:
        raise RuntimeError('TRANSCRABER_PAYMENTS_TOKEN is not configured')
    # TODO(review): `description` and `payload` are placeholders added so the
    # call is complete; confirm the wording and payload format.
    context.bot.send_invoice(chat_id=update.effective_chat.id,
                             title="Поддержка",
                             description="Поддержка",
                             payload="donate",
                             provider_token=PAYMENTS_TOKEN,
                             currency="rub",
                             prices=PRICES)


# Register the donate callback itself (the draft registered the YouTube
# transcription handler under the "donate" command).
dispatcher.add_handler(CommandHandler("donate", donate))

In [13]:
def summarize_youtube(update: Update, context: CallbackContext):
    """Reply to a message containing a YouTube URL with a summary of its captions.

    Updates without a message, and messages whose text is not a YouTube URL,
    are ignored. Summaries are cached in ``SAVE_PATH/<video_id>-summary.txt``;
    the downloaded caption metadata is stored next to it as
    ``<video_id>-meta.json``.

    Failure handling:
      * ``RuntimeError`` from ``download_captions_and_meta`` is answered with
        "The video doesn't have captions".
      * ``RuntimeError`` raised afterwards (for example by
        ``summarize_by_chunk``) is sent to the user as the reply text but is
        NOT written to the cache, so a later request retries instead of
        serving the error message forever.
    """
    if not update.message:
        return
    url = update.message.text
    if not is_youtube(url):
        return

    log = dict(update.to_dict())
    log['function'] = sys._getframe(0).f_code.co_name
    updates_logger.info(json.dumps(log))

    video_id = get_yt_id_from_url(url)
    reply_args = dict(chat_id=update.effective_chat.id,
                      reply_to_message_id=update.message.message_id)

    summary_file = SAVE_PATH / f"{video_id}-summary.txt"
    if summary_file.exists():
        summary = summary_file.read_text(encoding='utf-8')
    else:
        try:
            meta = download_captions_and_meta(url)
        except RuntimeError:
            send_message(context,
                         text="The video doesn't have captions",
                         **reply_args)
            return
        try:
            with (SAVE_PATH / f"{video_id}-meta.json").open('w') as fm:
                json.dump(meta, fm)
            text = ' '.join(w['text'] for w in meta['captions'])
            summary = '\n'.join(summarize_by_chunk(text))
        except RuntimeError as ex:
            summary = str(ex)
        else:
            # Cache successful summaries only. The encoding is explicit so that
            # non-ASCII summaries do not depend on the process locale.
            summary_file.write_text(summary + '\n', encoding='utf-8')

    summary = summary.strip()
    send_message(context,
                 text=summary if summary else "<Empty>",
                 **reply_args)


# NOTE(review): only the lowercase `filters` module is imported in this
# notebook, so `Filters` is unresolved here; the right spelling depends on the
# python-telegram-bot version being targeted -- confirm before changing.
dispatcher.add_handler(MessageHandler(Filters.text, summarize_youtube))

In [14]:
def transcribe_audio(update: Update, context: CallbackContext):
    """Transcribe an attached audio or video file and reply with the text.

    The attachment (audio is checked first, then video) is downloaded into a
    temporary directory under its original file name, transcribed, and the
    ``'text'`` field of the result is sent as a reply. The temporary directory
    is removed when the handler finishes, including when it fails.

    Raises:
        RuntimeError: if the message carries neither an audio nor a video file.
    """
    log = dict(update.to_dict())
    log['function'] = sys._getframe(0).f_code.co_name
    updates_logger.info(json.dumps(log))

    attachment = update.message.audio or update.message.video
    if not attachment:
        raise RuntimeError('No audio nor video attached')

    # The file name comes from the sender: keep only its last path component so
    # it cannot point outside the temporary directory, and fall back to the
    # file id when the name is missing or unusable.
    safe_name = Path(attachment.file_name or '').name
    if safe_name in ('', '..'):
        safe_name = attachment.file_id

    with tempfile.TemporaryDirectory() as tmp_dir:
        media_path = str(Path(tmp_dir) / safe_name)
        new_file = context.bot.get_file(attachment.file_id)
        new_file.download(media_path)

        # NOTE(review): `transcribe` is not defined in the cells visible here;
        # its import from crabnlp.whisperapi is commented out in the imports cell.
        transcription = transcribe(media_path)

        send_message(context,
                     transcription['text'],
                     temporary_fn=media_path + '.txt',
                     chat_id=update.effective_chat.id,
                     reply_to_message_id=update.message.message_id)


# NOTE(review): only the lowercase `filters` module is imported in this
# notebook, so `Filters` is unresolved here; the right spelling depends on the
# python-telegram-bot version being targeted -- confirm before changing.
dispatcher.add_handler(MessageHandler(Filters.audio | Filters.video, transcribe_audio))

In [None]:
# Start fetching updates for the handlers registered on `dispatcher` above.
updater.start_polling()
# Presumably blocks this cell until the process is interrupted -- confirm
# against the installed python-telegram-bot version. This cell has no recorded
# execution count, and the cell that creates `updater` is recorded as failing.
updater.idle()

In [None]:
# updater.stop()
# for h in dispatcher.handlers.values():
#     dispatcher.remove_handler(h)