In [145]:
from youtube_transcript_api import YouTubeTranscriptApi
from pytube import YouTube
import math

from langchain import OpenAI,LLMChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.mapreduce import MapReduceChain
from langchain.docstore.document import Document
from langchain.chains.summarize import load_summarize_chain


import os
from dotenv import load_dotenv
load_dotenv()

import json
import io

import pickle

from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone
from tqdm.autonotebook import tqdm

import hgutils
stopwatch = hgutils.timer("YT Play")

In [87]:
def get_youtube_transcript(video_id, batch_size=1_000,overlap=100):
    if overlap <=0:
        overlap = 1
    extracted_transcript = YouTubeTranscriptApi.get_transcript(video_id)
    transcript_in_batches = []
    current_batch = ""
    start = 0
    end = 0
    full_transcript = ""
    for idx,slice in enumerate(extracted_transcript):
        if len(current_batch) + len(slice['text']) >= batch_size:
            transcript_in_batches.append({'text':current_batch,'start':start,'end':end})
            current_batch = current_batch[-overlap:]
            start = math.ceil(slice['start'])
            end = math.floor(slice['start'] + slice['duration'])
        current_batch += slice['text']
        end = math.floor(slice['start'] + slice['duration'])
        full_transcript = " ".join([full_transcript,slice['text']])
    transcript_in_batches.append({'text':current_batch,'start':start,'end':end})
    return transcript_in_batches, full_transcript

def get_YTvideo_details(video_id,transcript_batch_size=1_000,transcript_overlap=100):
    video_link = "https://www.youtu.be/" + video_id
    video = YouTube(video_link)
    full_extraction = True
    try:
        title = video.title
    except:
        title = "NA"
        full_extraction = False
    try:
        author = video.author
    except:
        author = "NA"
        full_extraction = False
    try:
        length = video.length
    except:
        length = 0
        full_extraction = False
    try:
        description = video.description
    except:
        description = "NA"
        full_extraction = False
    try:
        transcript_in_batches,full_transcript = get_youtube_transcript(
                        video_id=video_id,
                        batch_size=transcript_batch_size,
                        overlap=transcript_overlap)
    except:
        transcript_in_batches,full_transcript = "NA","NA"
        full_transcript = False
    details = {'title':title,
               'author':author,
               'length':length,
               'description':description,
               'transcript_in_batches':transcript_in_batches,
               'full_transcript':full_transcript
               }
    return details, full_extraction

In [88]:
YT_videos = [{'product':'iPhone 14 Pro','video_id':'SdLShOCvVeM'},
             {'product':'iPhone 14 Pro','video_id':'h6m9medgP2A'},
             {'product':'iPhone 14 Pro','video_id':'rMpe_hqSQUs'},
             {'product':'iPhone 14 Pro','video_id':'7qk2wsl8IFA'},
             {'product':'Samsung Galaxy S23 Ultra','video_id':'zhoTX0RRXPQ'},
             {'product':'Samsung Galaxy S23 Ultra','video_id':'zz70o2Ia4X0'},
             {'product':'Samsung Galaxy S23 Ultra','video_id':'Qz3as0YRUuY'},
             {'product':'Samsung Galaxy S23 Ultra','video_id':'e95YT-lDehQ'},
             {'product':'Google Pixel 7 Pro','video_id':"KGXYcumRkYk"},
             {'product':'Google Pixel 7 Pro','video_id':"wZK3fs39kmI"},
             {'product':'Google Pixel 7 Pro','video_id':"NE5H5intsck"},
             {'product':'Google Pixel 7 Pro','video_id':"_xMWo14KXfM"},
             ]

max_reruns = 5
rerun_count = 0
all_video_extraction = True
while (rerun_count < max_reruns):
    stopwatch.start(rerun_count+1)
    for video in YT_videos:
        if "full_extraction" in video.keys():
            if video['full_extraction']:
                # print(video["video_id"])
                continue
        details,full_extraction=get_YTvideo_details(video_id=video['video_id'])
        video['full_extraction'] = full_extraction
        for k,v in details.items():
            video[k] = v
        
    rerun_count += 1
    stopwatch.stop(verbose=0,print=True)
  
for video in YT_videos:
    print(video)

	 1 - 18.36 seconds
	 2 - 5.45 seconds
	 3 - 2.43 seconds
	 4 - 0.0 milliseconds
	 5 - 0.0 milliseconds
{'product': 'iPhone 14 Pro', 'video_id': 'SdLShOCvVeM', 'full_extraction': True, 'title': 'iPhone 14 Pro Review: This Will Be Copied!', 'author': 'Marques Brownlee', 'length': 1341, 'description': "iPhone 14 Pro and iPhone 14 Pro Max have 3 main new things. My honest thoughts!\n\nNew Chevron Hoodie! http://shop.MKBHD.com\n\nBring new life to your smartphone with iFixit at https://ifixit.com/MKBHD and check out the iPhone 14 Pro teardown at https://ifix.gd/teardownvideos\n\nSkin your iPhone at https://dbrand.com/shop/iphone-14\nGet an iPhone 14 Pro at https://geni.us/VO4z4\nGet an iPhone 14 Pro Max at https://geni.us/CigazCO\nGet an iPhone 13 Pro at https://geni.us/VO4z4\nTech I'm using right now: https://www.amazon.com/shop/MKBHD\n\nIntro Track: http://youtube.com/20syl\nPlaylist of MKBHD Intro music: https://goo.gl/B3AWV5\n\nPhone provided by Apple for review.\n\n~\nhttp://twitter.c

In [92]:
def get_video_of_min_length():
    min_length = 1_000_000
    for video in YT_videos:
        if video['length'] < min_length:
            min_length_video = video
            min_length = min_length_video['length']
    return min_length_video

print(get_video_of_min_length()['length'])

295


In [114]:
llm = OpenAI(model_name="text-davinci-003",temperature=0,
             openai_api_key=os.environ['OPENAI_API_KEY'])

def get_summary_from_llm (text,llm):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=3_000,chunk_overlap=300)
    texts = text_splitter.split_text(text)
    docs = [Document(page_content=t) for t in texts]
    chain = load_summarize_chain(llm, chain_type="map_reduce")
    summary = chain.run(docs)
    return summary


In [115]:
for idx,video in enumerate(YT_videos):
    stopwatch.start(f"{idx}. {video['length']}")
    # video["summary"] = "summary"
    video["summary"] = get_summary_from_llm(video['full_transcript'],llm)
    stopwatch.stop(print=True)

Retrying langchain.llms.openai.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised APIConnectionError: Error communicating with OpenAI: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')).


	 0. 1341 - 49.39 seconds
	 1. 619 - 4.43 seconds
	 2. 669 - 3.85 seconds
	 3. 987 - 4.55 seconds
	 4. 800 - 6.47 seconds
	 5. 822 - 4.41 seconds
	 6. 870 - 5.3 seconds
	 7. 295 - 3.45 seconds
	 8. 633 - 5.18 seconds
	 9. 549 - 3.86 seconds
	 10. 822 - 4.5 seconds
	 11. 701 - 4.92 seconds


In [117]:
YT_videos[0]['summary']

' The iPhone 14 Pro and 14 Pro Max have upgraded internals, a new display, improved cameras, and a new MKBHD Edition Chevron hoodie. It has a new A16 Bionic chip, six gigs of RAM, and improved gyroscope and accelerometer sensors. It also has a physical SIM card tray, satellite connectivity, and 6.1 inch and 6.7 inch OLED displays. The camera bumps are larger and thicker than the 13 Pros, and the Dynamic Island feature is a widget with media controls and system alerts. The camera has a 48 megapixel sensor, 12 megapixel selfie cameras, improved autofocus, and Prores video. iFixit offers free repair guides and tear downs, and there will be an upcoming iPhone 14 review and Apple Watch Ultra review.'

In [118]:
with open("data/Yt_videos.pickle","wb") as handle:
    pickle.dump(YT_videos,handle, protocol=pickle.HIGHEST_PROTOCOL)

In [146]:
with open("data/YT_videos.pickle","rb") and handle:
    YT_videos = pickle.load(handle)

In [122]:
for video in YT_videos:
    print(video['author'],video['summary'])

Marques Brownlee  The iPhone 14 Pro and 14 Pro Max have upgraded internals, a new display, improved cameras, and a new MKBHD Edition Chevron hoodie. It has a new A16 Bionic chip, six gigs of RAM, and improved gyroscope and accelerometer sensors. It also has a physical SIM card tray, satellite connectivity, and 6.1 inch and 6.7 inch OLED displays. The camera bumps are larger and thicker than the 13 Pros, and the Dynamic Island feature is a widget with media controls and system alerts. The camera has a 48 megapixel sensor, 12 megapixel selfie cameras, improved autofocus, and Prores video. iFixit offers free repair guides and tear downs, and there will be an upcoming iPhone 14 review and Apple Watch Ultra review.
Dave2D  The iPhone 14 Pro Models have a brighter screen, a unique Dynamic Island cutout, improved autofocus, an always-on display, 48 megapixel Raw photos, improved low light photography, and an A16 Bionic chip. Pricing has gone up globally, with some countries needing to spend u

In [124]:
with io.open("data/YT_videos.json","w",encoding="utf8") as outfile:
    str_ = json.dumps(YT_videos,
            indent=4, sort_keys=True,
            separators=(',', ': '), ensure_ascii=False)
    outfile.write(str_)
    

In [127]:
with open('data/YT_videos.json') as data_file:
    data_loaded = json.load(data_file)


<class 'dict'>


In [140]:
docs = []
for video in YT_videos:
    docs.extend([Document(page_content=slice['text'],
                 metadata={'start':slice['start'],
                           'end':slice['end'],
                           'product':video['product'],
                           'author':video['author']
                           }
                           ) 
                 for slice in video['transcript_in_batches']])

docs

[Document(page_content="foreign[Music][Music]14 pro so this might be a little bitinside baseball but typically whenyou're testing a new product anunreleased product from Apple they tellyou to keep the testing a secret don'tlet anyone know you have it don'tdisplay it publicly don't show it toanyone that would be breaking the NDAand that's still true about this newiPhone 14 pro but I'll tell you what Iwas able to walk around take picturestext people pay for things with Applepay and nobody even blinked an eyebecause it looksexactly the same as the iPhone 13 Prothis is still of course a new phonethough for a new year and so reallythere are three major things that arenew with the 14 pro those being theinternalsthe displayand the cameras and so I'm going to sortof chapter those things off to go overeach of them but of course also worthmentioning is this new MKBHD EditionChevron hoodie finally finally availablein the MKBHD store mkbhd.com it's verycomfortable but also hoodie season[Music]yeah

In [147]:
pinecone.init(
    api_key=os.environ['PINECONE_API_KEY'],
    environment=os.environ['PINECONE_API_ENV']
)
index_name = os.environ['PINECONE_INDEX_NAME']

embeddings = OpenAIEmbeddings(openai_api_key=os.environ['OPENAI_API_KEY'])