# EAudio
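Generate a text-to-speech audio file from the article at a given URL (LessWrong, EA Forum, Alignment Forum, Gwern.net and Substack posts are handled; Arbital is not implemented yet).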

## Imports

In [141]:
import os
import re
import time
import urllib
import urllib.parse
import winreg
from datetime import datetime
from typing import List, Tuple, Dict, Union, Optional

import pyttsx3
import requests
from bs4 import BeautifulSoup, NavigableString

## Constants

In [142]:
"""
urls_to_test = [
        "https://www.lesswrong.com/posts/vzfz4AS6wbooaTeQk/staring-into-the-abyss-as-a-core-life-skill", # Lesswrong
        "https://forum.effectivealtruism.org/posts/oGdCtvuQv4BTuNFoC/good-things-that-happened-in-ea-this-year", # EA Forum
        "https://www.alignmentforum.org/posts/JSkqkgYcyYt8oHsFi/large-language-models-can-provide-normative-assumptions-for", # Alignment Forum
        "https://www.gwern.net/Melatonin", # Gwern
        "https://astralcodexten.substack.com/p/sorry-i-still-think-i-am-right-about", # Substack
        "https://arbital.com/p/bayes_rule/", # Arbital
    ]
"""

# Article used for manual experiments in this notebook.
# An earlier target, ".../p/sorry-i-still-think-i-am-right-about", used to be
# assigned first and was immediately overwritten; only the effective value is kept.
url = "https://astralcodexten.substack.com/p/how-do-ais-political-opinions-change"

## Get Text

In [151]:
class Text:
    """Download a web article and flatten it into plain text for text-to-speech.

    The constructor picks a site-specific parser from the URL's host, fetches
    the page, runs the parser (which fills ``title``, ``author``, ``date`` and
    ``text``) and finally appends a spoken outro to ``text``.

    ``title`` is sanitized because ``Audio`` uses it as a file name.
    """

    # Hosts parsed by ``get_text_lw_eaf_af``; they are matched with the same CSS class names.
    _FORUM_HOSTS = frozenset({
        'www.lesswrong.com',
        'www.alignmentforum.org',
        'forum.effectivealtruism.org',
    })
    # The bare host is accepted in addition to the original 'www.' form.
    _GWERN_HOSTS = frozenset({'www.gwern.net', 'gwern.net'})
    # Seconds to wait for the server before giving up instead of hanging forever.
    _REQUEST_TIMEOUT = 30
    # Characters that are not allowed in Windows file names and their replacements.
    _TITLE_TRANSLATION = str.maketrans({
        ":": "-",
        "/": "_",
        "\\": "_",
        "<": "(",
        ">": ")",
        "*": "",
        "?": "",
        "|": "#",
        '"': "'",
    })

    def __init__(self, url: str):
        """Fetch ``url`` and extract its readable text.

        :param url: address of the article to convert
        :raises ValueError: if the host is not one of the supported sites
            (raised before any network request is made)
        :raises NotImplementedError: for arbital.com, which has no parser yet
        :raises requests.HTTPError: if the server answers with an error status
        """
        self.url = url
        self.parsed_link = urllib.parse.urlparse(url)

        self.title = None
        self.author = None
        self.date = None
        self.text = None

        # Reject unsupported hosts first so no download is wasted on them.
        parser = self._select_parser(self.parsed_link.netloc)
        if parser is None:
            raise ValueError(f'URL {url} is not yet supported.')

        headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36"}
        response = requests.get(url, headers=headers, timeout=self._REQUEST_TIMEOUT)
        # Fail with a clear HTTP error instead of an AttributeError while parsing an error page.
        response.raise_for_status()
        self.page = response.text
        self.soup = BeautifulSoup(self.page, 'html.parser')

        parser()

        outro = f"""This was '{self.title}' by {self.author}. You can find the original post at {self.url}."""
        self.text = f"{self.text}\n\n{outro}"

    def _select_parser(self, host: str):
        """Return the bound parser method for ``host``, or None if the host is unsupported."""
        host = host.lower()
        if host in self._FORUM_HOSTS:
            return self.get_text_lw_eaf_af
        if host in self._GWERN_HOSTS:
            return self.get_info_gwern
        # Exactly one sub-domain label in front of substack.com, e.g. "name.substack.com".
        if '.'.join(host.split('.')[1:]) == 'substack.com':
            return self.get_info_substack
        if host == 'arbital.com':
            return self.get_info_arbital
        return None

    @staticmethod
    def _sanitize_title(title) -> str:
        """Return ``title`` as a string with file-name-hostile characters replaced or removed."""
        return f"{title}".translate(Text._TITLE_TRANSLATION)

    @staticmethod
    def _flatten_inline_markup(body) -> None:
        """Rewrite bold/italic tags in ``body`` as ``**``/``*`` markers and drop ``<sup>`` tags.

        All three element lists are collected before anything is modified.
        """
        bold_elements = body.find_all(['b', 'strong'])
        italic_elements = body.find_all(['i', 'em'])
        sup_tags = body.find_all('sup')

        for element in bold_elements:
            element.replace_with('**' + element.text + '**')
        for element in italic_elements:
            element.replace_with('*' + element.text + '*')
        for element in sup_tags:
            element.decompose()

    def get_text_lw_eaf_af(self):
        """Parse a LessWrong / EA Forum / Alignment Forum post from ``self.soup``."""
        self.title = self._sanitize_title(self.soup.find("a", {"class": "PostsPageTitle-link"}).text)

        author_names = [t.text for t in self.soup.find_all("span", {"class": "PostsAuthors-authorName"})]
        # "A, B, C" -> "A, B and C": only the last comma is replaced.
        self.author = re.sub(r',([^,]*)$', r' and\1', ", ".join(author_names))
        self.date = self.soup.find("span", {"class": "PostsPageDate-date"}).text

        # A post without a tag list yields an empty tag line instead of crashing.
        tag_root = self.soup.find("span", {"class": "FooterTagList-root"})
        tag_names = [t.text for t in tag_root.find_all("span", {"class": "FooterTag-name"})] if tag_root is not None else []
        tags = ", ".join(tag_names)

        content = self.soup.find("div", {"class": "PostsPage-postContent instapaper_body ContentStyles-base content ContentStyles-postBody"})
        # NOTE(review): find_all is recursive and .text includes descendants, so a <p> or <li>
        # nested inside an <li> contributes its text twice - confirm whether that matters here.
        raw_text = content.find_all(["p", "li", "h1", "h2", "h3", "h4", "h5", "h6"])
        text = "\n".join([t.text for t in raw_text])

        # The "Tags: " label is written once here (it used to be added twice).
        self.text = f"{self.title}, by {self.author}\nPosted on the {self.date}\nTags: {tags}\n\n{text}"

    def get_info_gwern(self):
        """Parse a gwern.net page from ``self.soup``."""
        # get_text() also works when the heading contains nested tags, where .string would be None.
        self.title = self._sanitize_title(self.soup.h1.get_text())
        self.author = "Gwern Branwen"

        # The range is split on an en dash; with a single date, start and end coincide.
        date_parts = [part.strip() for part in self.soup.find(id="page-date-range").text.split('–')]
        start_date = datetime.strptime(date_parts[0], '%Y-%m-%d')
        end_date = datetime.strptime(date_parts[-1], '%Y-%m-%d')
        start_month = start_date.strftime('%B')
        end_month = end_date.strftime('%B')
        self.date = f"Written from {start_month} {start_date.day}, {start_date.year}, to {end_month} {end_date.day}, {end_date.year}."

        body = self.soup.find("div", {"id": "markdownBody", "class": "markdownBody"})

        # Collect everything before the tree is modified.
        blockquotes = body.find_all('blockquote')
        to_delete = body.find_all(id=['see-also', 'external-links', 'appendix', 'appendices', 'footnotes', 'backlinks-section', 'link-bibliography-section', 'similars-section']) + body.find_all('noscript')

        self._flatten_inline_markup(body)
        for i, element in enumerate(blockquotes):
            if i == 0:
                # The first blockquote is announced as the abstract.
                element.replace_with('Abstract' + element.text)
                continue
            element.replace_with('Quote.' + element.text + 'Unquote.')
        for element in to_delete:
            element.decompose()

        self.text = f"{self.title}, by {self.author}\nDate range: {self.date}\n\n{body.get_text()}"

    def get_info_substack(self):
        """Parse a Substack post from ``self.soup``."""
        self.title = self._sanitize_title(self.soup.title.string)
        # The subtitle is optional; an absent one is simply left out of the header.
        subtitle_tag = self.soup.find("h3", {"class": "subtitle"})
        subtitle = subtitle_tag.get_text() if subtitle_tag is not None else ""
        # NOTE(review): the author is hard-coded, so every *.substack.com post is attributed
        # to Scott Alexander - confirm this is intended for blogs other than Astral Codex Ten.
        self.author = "Scott Alexander"
        self.date = self.soup.time.string
        body = self.soup.find("div", {"class": "body markup"})

        # Collect everything before the tree is modified.
        imgs = body.find_all('picture')
        code = body.find_all('code')
        titles = body.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
        blockquotes = body.find_all('blockquote')

        self._flatten_inline_markup(body)
        for element in imgs:
            element.replace_with("\n[IMAGE ATTACHED]\n")
        for element in code:
            element.replace_with("\n`" + element.text + "`\n")
        for element in titles:
            element.replace_with("\n" + element.text + "\n")
        for element in blockquotes:
            element.replace_with('Quote.' + element.text + 'Unquote.')

        header_lines = [self.title]
        if subtitle:
            header_lines.append(subtitle)
        header_lines.append(f"by {self.author}")
        header_lines.append(f"Posted on {self.date}")
        self.text = "\n".join(header_lines) + f"\n\n{body.get_text()}"

    def get_info_arbital(self):
        """Placeholder for Arbital pages; always raises NotImplementedError."""
        raise NotImplementedError("Arbital is not supported yet.")

    def __str__(self):
        return self.text

In [152]:
class Audio:
    """Fetch the article at a URL and render it to an audio file with pyttsx3.

    Constructing an instance does all the work: it downloads and parses the
    page via ``Text``, configures the speech engine and writes the file.
    """

    def __init__(self, url: str, save_path: Optional[str]=None, voice: int=0, speed: int=100) -> None:
        """
        :param url: the URL of the text to be read
        :param save_path: the directory to save the audio file in; if None, the current
            user's Downloads folder is looked up in the Windows registry
        :param voice: index into the list of voices reported by the pyttsx3 engine
        :param speed: value assigned to the engine's 'rate' property (pyttsx3 documents it
            as words per minute with a default of 200)
        :raises IndexError: if ``voice`` does not refer to an installed voice
        """
        if save_path is None:
            save_path = self._default_download_dir()

        self.url = url
        self.save_path = save_path
        self.voice = voice
        self.speed = speed

        self.text = Text(url)
        self.engine = self.set_engine()
        self.save_audio()

    @staticmethod
    def _default_download_dir() -> str:
        """Return the current user's Downloads folder as stored in the Windows registry."""
        # The handle is used as a context manager so it is closed after the lookup.
        with winreg.OpenKey(winreg.HKEY_CURRENT_USER, r"Software\Microsoft\Windows\CurrentVersion\Explorer\Shell Folders") as key:
            return winreg.QueryValueEx(key, "{374DE290-123F-4565-9164-39C4925E467B}")[0]

    @staticmethod
    def _output_path(save_path: str, title: str) -> str:
        """Return the path of the audio file for ``title`` inside ``save_path``."""
        return os.path.join(save_path, f"{title}.mp3")

    def set_engine(self):
        """Create a pyttsx3 engine configured with the requested voice and rate."""
        engine = pyttsx3.init()
        voices = engine.getProperty('voices')
        try:
            voice_id = voices[self.voice].id
        except IndexError:
            # Same exception type as before, but with a message that names the problem.
            raise IndexError(f"voice index {self.voice} is out of range; {len(voices)} voice(s) are available") from None
        engine.setProperty('voice', voice_id)
        engine.setProperty('rate', self.speed)
        return engine

    def save_audio(self):
        """Queue the article text for synthesis, write it to disk and stop the engine."""
        # Create the target directory first so a fresh path such as 'data' does not make the write fail.
        if self.save_path:
            os.makedirs(self.save_path, exist_ok=True)
        # NOTE(review): the file is named .mp3, but the actual audio format is decided by the
        # pyttsx3 driver - confirm the output plays as expected on the target platform.
        self.engine.save_to_file(self.text.text, self._output_path(self.save_path, self.text.title))
        self.engine.runAndWait()
        self.engine.stop()

In [154]:
# Convert one LessWrong post to speech in the 'data' directory, using the second
# installed voice and a rate of 300, then report the (sanitized) title.
audio = Audio(
    url="https://www.lesswrong.com/posts/nmMorGE4MS4txzr8q/simulators-seminar-sequence-1-background-and-shared",
    save_path='data',
    voice=1,
    speed=300,
)
print(audio.text.title, "Audio saved successfully", sep="\n")


[Simulators seminar sequence] #1 Background & shared assumptions
Audio saved successfully
