# 모듈설치
1. 유튜브 영상 다운로드 (음성만 다운로드, mp3) - PyTube
2. Speech to Text (Transcribe) - OpenAI Whisper (Local)
3. Map-reduce summariation - LangChain, OpenAI ChatGPT API

In [12]:
# !pip install ffmpeg => conda 로 설치하셈
# !pip install -q openai-whisper pytube
# !pip install -q openai tiktoken langchain
# !pip install gradio
# !pip install openai

In [13]:
# from pytube import YouTube

# yt = YouTube('https://www.youtube.com/watch?v=JGdlvUffk5Y')

# yt.streams.filter(only_audio=True).first().download(
#     output_path='.', filename='input.mp3')

In [14]:
# 위스퍼 로드 (음성 -> 텍스트)
import whisper

model = whisper.load_model("base")



In [15]:


## 스플릿 로드
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema.document import Document

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=50,
    length_function=len,
)


In [16]:
from langchain.chains.mapreduce import MapReduceChain
from langchain.chains import ReduceDocumentsChain, MapReduceDocumentsChain
from langchain.chat_models import ChatOpenAI
from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents.stuff import StuffDocumentsChain

openai_api_key = input() # api 키는 유출이 안되게 하라

llm = ChatOpenAI(temperature=0, openai_api_key=openai_api_key)

# Map prompt
map_template = """The following is a set of documents
{docs}
Based on this list of docs, please identify the main themes
Helpful Answer:"""

map_prompt = PromptTemplate.from_template(map_template)

# Reduce prompt
reduce_template = """The following is set of summaries:
{doc_summaries}
Take these and distill it into a final, consolidated summary of the main themes.
The final answer is a single paragraph of about 100 words and must be in Korean.
Helpful Answer:"""

reduce_prompt = PromptTemplate.from_template(reduce_template)

# 1. Reduce chain
reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)

# Takes a list of documents, combines them into a single string, and passes this to an LLMChain
combine_documents_chain = StuffDocumentsChain(
    llm_chain=reduce_chain, document_variable_name="doc_summaries"
)

# Combines and iteravely reduces the mapped documents
reduce_documents_chain = ReduceDocumentsChain(
    # This is final chain that is called.
    combine_documents_chain=combine_documents_chain,
    # If documents exceed context for `StuffDocumentsChain`
    collapse_documents_chain=combine_documents_chain,
    # The maximum number of tokens to group documents into.
    token_max=4000,
)

# 2. Map chain
map_chain = LLMChain(llm=llm, prompt=map_prompt)

# Combining documents by mapping a chain over them, then combining results
map_reduce_chain = MapReduceDocumentsChain(
    # Map chain
    llm_chain=map_chain,
    # Reduce chain
    reduce_documents_chain=reduce_documents_chain,
    # The variable name in the llm_chain to put the documents in
    document_variable_name="docs",
    # Return the results of the map steps in the output
    return_intermediate_steps=False,
)




ValidationError: 1 validation error for ChatOpenAI
__root__
  Did not find openai_api_key, please add an environment variable `OPENAI_API_KEY` which contains it, or pass  `openai_api_key` as a named parameter. (type=value_error)

In [11]:
import gradio as gr
import re

def extract_video_id(url):
    youtube_regex = (r'(https?://)?(www\.)?'
        '(youtube|youtu|youtube-nocookie)\.(com|be)/'
        '(watch\?v=|embed/|v/|.+\?v=)?([^&=%\?]{11})')
    youtube_pattern = re.compile(youtube_regex)
    match = youtube_pattern.match(url)
    if not match:
        return None
    return match.group(6)

def summarize(url):
    yt = YouTube(url)

    yt.streams.filter(only_audio=True).first().download(
        output_path='.', filename='input.mp3')

    result = model.transcribe("input.mp3")

    docs = [Document(page_content=x) for x in text_splitter.split_text(result["text"])]
    split_docs = text_splitter.split_documents(docs)

    sum_result = map_reduce_chain.run(split_docs)

    video_id = extract_video_id(url)
    embed = f"""<iframe width='560' height='315' src='https://www.youtube.com/embed/{video_id}' frameborder='0' allowfullscreen></iframe>"""

    return sum_result, embed

demo = gr.Interface(
    fn=summarize,
    inputs=gr.Textbox(label="URL"),
    outputs=[gr.TextArea(label="Summary"), gr.HTML()],
    # outputs=gr.TextArea(label="Summary"),
)

demo.launch(debug=True, share=True)

Running on local URL:  http://127.0.0.1:7860

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.




Keyboard interruption in main thread... closing server.


