<a href="https://colab.research.google.com/github/jonathantcallahan/guidance/blob/main/finetuning_data_from_lectures.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install yt-dlp
!pip install vttformatter
!pip install openai
!pip install getpass

Collecting yt-dlp
  Downloading yt_dlp-2024.4.9-py3-none-any.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting brotli (from yt-dlp)
  Downloading Brotli-1.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
Collecting mutagen (from yt-dlp)
  Downloading mutagen-1.47.0-py3-none-any.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.4/194.4 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pycryptodomex (from yt-dlp)
  Downloading pycryptodomex-3.20.0-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
Collecting websockets>=12.0 (from yt-dlp

In [None]:
import yt_dlp

URLS = ['https://www.youtube.com/watch?v=kIXLhRkqbKo']

# Fetch only the English subtitle track (uploaded or auto-generated) and skip
# the media itself.  With this output template the track is written to
# 'example.en.vtt', which the conversion cell below reads.
ydl_opts = {
    'format': 'best',
    'writesubtitles': True,
    'writeautomaticsub': True,
    'subtitleslangs': ['en'],
    'skip_download': True,
    'outtmpl': 'example.%(ext)s',
}

with yt_dlp.YoutubeDL(ydl_opts) as ydl:
  error_code = ydl.download(URLS)

# download() hands back a return code; surface a failure instead of dropping it.
if error_code:
  print(f'yt-dlp finished with non-zero return code: {error_code}')

[youtube] Extracting URL: https://www.youtube.com/watch?v=kIXLhRkqbKo
[youtube] kIXLhRkqbKo: Downloading webpage
[youtube] kIXLhRkqbKo: Downloading ios player API JSON
[youtube] kIXLhRkqbKo: Downloading android player API JSON




[youtube] kIXLhRkqbKo: Downloading player 7d1f7724
[youtube] kIXLhRkqbKo: Downloading m3u8 information
[info] kIXLhRkqbKo: Downloading subtitles: en
[info] kIXLhRkqbKo: Downloading 1 format(s): 22
[info] Writing video subtitles to: example.en.vtt
[download] Destination: example.en.vtt
[download] 100% of   60.55KiB in 00:00:00 at 1.26MiB/s


In [None]:
import sys
import re
from google.colab import files

# Subtitle file written by the yt-dlp cell above ('example.%(ext)s' template, 'en' track).
vtt_file_name = 'example.en.vtt'

def remove_tags(text):
    """
    Strip WebVTT inline markup and shorten cue timing lines.

    Removes <c>/<c.colorXXXX>/</c> tags and inline <HH:MM:SS.mmm> word
    timestamps, replaces every cue timing line
    ("HH:MM:SS.mmm --> HH:MM:SS.mmm [settings]") with just its leading
    "HH:MM", and empties lines that contain only whitespace.

    text: full contents of a .vtt file as one string.
    Returns the cleaned text as one string.
    """
    tags = [
        r'</c>',
        r'<c(\.color\w+)?>',
        r'<\d{2}:\d{2}:\d{2}\.\d{3}>',
    ]

    for pat in tags:
        text = re.sub(pat, '', text)

    # Reduce each cue timing line to the first two fields of its start time.
    # Everything after "--> " up to the end of the line is dropped, so cues
    # are handled whether or not they carry settings such as
    # "align:start position:0%".
    text = re.sub(
        r'(\d{2}:\d{2}):\d{2}\.\d{3} --> .*',
        r'\g<1>',
        text
    )

    # Whitespace-only lines become empty so later stages can test line == "".
    text = re.sub(r'^\s+$', '', text, flags=re.MULTILINE)
    return text

def remove_header(lines):
    """
    Remove the vtt file header.

    The header is taken to end at whichever of the marker lines '##' and
    'Language: en' occurs furthest down the file (each located at its first
    occurrence), so a '##' that follows 'Language: en' is removed together
    with everything between them.  If neither marker is present the input is
    returned unchanged.

    lines: list of lines of the subtitle file.
    Returns a new list holding the lines after the header.
    """
    marks = ('##', 'Language: en')
    pos = max((lines.index(mark) for mark in marks if mark in lines), default=-1)
    return lines[pos + 1:]


def merge_duplicates(lines):
    """
    Remove blank lines and repeated subtitles.

    Each line is compared with the previous line of the same kind: an
    "HH:MM" timestamp is dropped when it equals the last timestamp emitted,
    and a caption is dropped when it equals the last caption emitted.  This
    relies on duplicates always being adjacent.

    lines: iterable of cleaned subtitle lines.
    Yields the surviving lines in their original order.
    """
    last_timestamp = ''
    last_cap = ''
    for line in lines:
        if line == "":
            continue
        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        if re.match(r'^\d{2}:\d{2}$', line):
            if line != last_timestamp:
                yield line
                last_timestamp = line
        else:
            if line != last_cap:
                yield line
                last_cap = line


def merge_short_lines(lines, max_len=80):
    """
    Join consecutive short caption lines into longer ones.

    Captions are accumulated, separated by single spaces, while the combined
    length stays under max_len.  A blank line or an "HH:MM" timestamp is
    passed through prefixed with a newline; any captions collected so far
    are emitted first, so text never ends up after a marker it preceded.

    lines: iterable of caption, timestamp and blank lines.
    max_len: length limit used when deciding whether to keep joining.
    Yields merged caption lines and '\\n'-prefixed markers; empty buffers
    are never yielded.
    """
    buffer = ''
    for line in lines:
        if line == "" or re.match(r'^\d{2}:\d{2}$', line):
            # Flush pending captions so they stay ahead of this marker.
            if buffer:
                yield buffer
                buffer = ''
            yield '\n' + line
            continue

        if not buffer:
            buffer = line
        elif len(buffer) + len(line) < max_len:
            buffer += ' ' + line
        else:
            yield buffer
            buffer = line

    if buffer:
        yield buffer


def main(vtt_path=None):
    """
    Convert a WebVTT subtitle file into a plain-text transcript.

    The file is run through remove_tags, remove_header, merge_duplicates and
    merge_short_lines.  Each resulting line then has its HH:MM markers and
    [bracketed] annotations replaced by a space, gets a trailing space, and
    has runs of whitespace collapsed, so the output is one continuous
    paragraph.  The transcript is written next to the input with a .txt
    extension and offered as a Colab download.

    Note that the HH:MM pattern also matches clock-like text inside
    captions (e.g. "10:30"), which is blanked out as well.

    vtt_path: subtitle file to convert; defaults to the module-level
        vtt_file_name.
    """
    if vtt_path is None:
        vtt_path = vtt_file_name

    # Escape the dot so only a literal ".vtt" suffix is replaced.
    txt_name = re.sub(r'\.vtt$', '.txt', vtt_path)
    if txt_name == vtt_path:
        # No .vtt suffix: append one rather than overwrite the input file.
        txt_name = vtt_path + '.txt'

    with open(vtt_path, encoding='utf-8') as f:
        text = f.read()

    lines = remove_tags(text).splitlines()
    lines = remove_header(lines)
    lines = list(merge_duplicates(lines))
    lines = list(merge_short_lines(lines))

    time_pattern = r'\b\d{2}:\d{2}\b'
    bracket_pattern = r'\[.*?\]'
    whitespace_pattern = r'\s{2,}'

    with open(txt_name, 'w', encoding='utf-8') as f:
        for line in lines:
            line = re.sub(time_pattern, ' ', line)
            line = re.sub(bracket_pattern, ' ', line)
            line = line + ' '
            line = re.sub(whitespace_pattern, ' ', line)

            f.write(line)

    files.download(txt_name)


main()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import openai
from openai import OpenAI
import os

# Prompt for the OpenAI API key with getpass so the secret is typed at run time
# and never stored in the notebook source.
from getpass import getpass
api_key = getpass('Enter your API key: ')
# Export the key for this kernel process; OpenAI() below is built without
# arguments and presumably picks the key up from OPENAI_API_KEY.
os.environ['OPENAI_API_KEY'] = api_key
client = OpenAI()

Enter your API key: ··········


In [None]:
def chunk_text(text, model="gpt-4-turbo", max_words=400, api_client=None):
    """
    Ask a chat model to divide a transcript into cohesive chunks.

    Args:
        text: transcript to divide.
        model: name of the chat-completions model to call.
        max_words: per-chunk word limit quoted in the user prompt.
        api_client: object exposing ``chat.completions.create``; defaults to
            the notebook-level ``client``.

    Returns:
        The model's reply as a stripped string.  An empty string is returned
        when ``text`` is blank (no request is sent), when the reply carries
        no content, or when the request fails; failures are printed, not
        raised, so the return type is always ``str``.
    """
    if not text or not text.strip():
        return ""
    if api_client is None:
        api_client = client
    try:
        response = api_client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a document processor, skilled in breaking long transcriptions into groups of cohesive ideas. Do not summarize, or reflect. The response should only include exactly the same words that are in the prompt with the only changes being the correction of clear grammatical errors. Return each chunk with a line break at the end of the chunk"},
                {"role": "user", "content": f"Divide this text into coherent chunks each no more than {max_words} words:\n\n{text}"}
                ]
        )
        content = response.choices[0].message.content
        # The content field can be None; treat that as an empty reply.
        return content.strip() if content else ""
    except Exception as e:
        # Best-effort by design: report the problem and return an empty result.
        print(f"An error occurred: {e}")
        return ""

In [None]:
import json

# Container for the fine-tuning examples; a later cell adds entries to "prompts".
training_data = dict(prompts=[])

In [None]:
def parse_model_chunks(raw):
    """
    Turn the model's reply into a list of records.

    A reply that starts with '{' or '[' is read as one or more JSON values,
    optionally separated by whitespace or commas; plain json.loads rejects
    anything after the first value with "Extra data".  If a value is
    malformed, parsing stops there, the position is printed, and the values
    decoded so far are returned.  Any other reply is treated as plain text
    with one chunk per non-empty line.
    """
    raw = raw.strip()
    if raw[:1] not in ('{', '['):
        return [line.strip() for line in raw.splitlines() if line.strip()]

    decoder = json.JSONDecoder()
    records, pos = [], 0
    while pos < len(raw):
        try:
            record, pos = decoder.raw_decode(raw, pos)
        except json.JSONDecodeError as err:
            print(f"Stopped parsing model output at char {err.pos}: {err.msg}")
            break
        records.append(record)
        # Step over the whitespace/commas separating consecutive values.
        while pos < len(raw) and raw[pos] in ' \t\r\n,':
            pos += 1
    return records


with open('example.en.txt', 'r', encoding='utf-8') as file:
    text_data = file.read()

chunks = chunk_text(text_data)
print(chunks)

# chunk_text gives back an empty result on failure; only parse a real reply.
if chunks:
    training_data['prompts'].extend(parse_model_chunks(chunks))
else:
    print("chunk_text returned nothing; training_data left unchanged")

print(training_data)

{
  "Instruction": "You are author and intellectual Alan Watts, please respond accordingly",
  "Input": "Can you explain the role of an artist in society and how their work impacts our understanding of life?",
  "Output": "Allow ourselves the idea that life is not serious because somehow you feel if you aren't engaged in something serious, you're a loafer, you're not contributing to the social welfare. In this way, the artist has a peculiar role in society, very very interesting because the artist is a very deceptive fellow. He appears to be the supreme luxury, the irrelevant fellow. You can afford an artist, you can afford to buy paintings if you have surplus money; that's a luxury. So you can support an artist and we call it Fine Arts, the completely useless person who makes paintings which are sort of big labels or posters that you stick on your utilitarian walls to decorate them. But on the other hand, the artist is the man who shows you the future long before everybody else sees i

JSONDecodeError: Extra data: line 5 column 2 (char 1533)

In [None]:
# Show the text stored in `chunks` again for inspection.
print(chunks)

{
  "Instruction": "You are author and intellectual Alan Watts, please respond accordingly",
  "Input": "Can you explain the role of an artist in society and how their work impacts our understanding of life?",
  "Output": "Allow ourselves the idea that life is not serious because somehow you feel if you aren't engaged in something serious, you're a loafer, you're not contributing to the social welfare. In this way, the artist has a peculiar role in society, very very interesting because the artist is a very deceptive fellow. He appears to be the supreme luxury, the irrelevant fellow. You can afford an artist, you can afford to buy paintings if you have surplus money; that's a luxury. So you can support an artist and we call it Fine Arts, the completely useless person who makes paintings which are sort of big labels or posters that you stick on your utilitarian walls to decorate them. But on the other hand, the artist is the man who shows you the future long before everybody else sees i

In [None]:
# json.loads accepts exactly one JSON document.  Report a reply that is not one
# instead of leaving an uncaught traceback in the notebook.
try:
    print(json.loads(chunks))
except (json.JSONDecodeError, TypeError) as err:
    print(f"chunks is not a single JSON document: {err}")

JSONDecodeError: Extra data: line 5 column 2 (char 1533)