In [None]:
import os
import subprocess
import re
# Root folder that the cells below walk (recursively) when looking for media files.
source_dir = r"C:\projects\local\video_analysis"
# Location of ffmpeg.exe inside the WinGet package folder under the user's LOCALAPPDATA directory.
ffmpeg_path = "\\".join([
    os.environ['LOCALAPPDATA'],
    "Microsoft",
    "WinGet",
    "Packages",
    "Gyan.FFmpeg_Microsoft.Winget.Source_8wekyb3d8bbwe",
    "ffmpeg-6.0-full_build",
    "bin",
    "ffmpeg.exe",
])
import whisper
# Azure OpenAI settings, published through environment variables.
os.environ["OPENAI_API_TYPE"] = "azure"
os.environ["OPENAI_API_VERSION"] = "2023-05-15"

# get settings from files
# The endpoint and the key are read from two text files in the parent directory of source_dir.
# Surrounding whitespace is stripped: text files normally end with a newline, and that
# newline would otherwise become part of the endpoint URL and of the API key.
with open(source_dir + "\\..\\openai-base.txt", "r") as f:
    os.environ["OPENAI_API_BASE"] = f.read().strip()
with open(source_dir + "\\..\\openai-key.txt", "r") as f:
    os.environ["OPENAI_API_KEY"] = f.read().strip()

from langchain.llms import AzureOpenAI
from langchain.chat_models import AzureChatOpenAI
from langchain.prompts.chat import ChatPromptTemplate
# Two clients bound to the same "gpt-35-turbo" deployment name:
# one built from langchain.llms and one from langchain.chat_models.
llm = AzureOpenAI(deployment_name="gpt-35-turbo")
chat_model = AzureChatOpenAI(deployment_name="gpt-35-turbo")


In [None]:
def process_files(root_dir=None, ffmpeg=None):
    """Extract an MP3 audio track from every MP4 that does not have one yet.

    Walks ``root_dir`` recursively. For each file whose name ends in ``.mp4``
    (matched case-insensitively) and that has no sibling ``.mp3`` with the same
    base name, ffmpeg is run to write the audio (video dropped, 44100 Hz,
    2 channels, 192k bitrate) next to the source file.

    Args:
        root_dir: Directory to scan. Defaults to the notebook-level ``source_dir``.
        ffmpeg: Path of the ffmpeg executable. Defaults to the notebook-level
            ``ffmpeg_path``.

    Raises:
        OSError: If the ffmpeg executable cannot be started.
    """
    if root_dir is None:
        root_dir = source_dir
    if ffmpeg is None:
        ffmpeg = ffmpeg_path

    for root, _dirs, files in os.walk(root_dir):
        for file in files:
            if not file.lower().endswith(".mp4"):
                continue
            mp4_file = os.path.join(root, file)
            mp3_file = mp4_file[:-4] + '.mp3'
            if os.path.isfile(mp3_file):
                continue

            cmd = [ffmpeg, '-i', mp4_file, '-vn', '-ar', '44100', '-ac', '2', '-ab', '192k', '-f', 'mp3', mp3_file]
            print(cmd)
            # stdin is detached so the child process can never sit waiting for keyboard input.
            result = subprocess.run(cmd, stdin=subprocess.DEVNULL).returncode
            if result != 0:
                # The MP3 did not exist before this run, so anything present now is a
                # partial output. Remove it; otherwise the isfile() check above would
                # treat this MP4 as already converted on every later run.
                if os.path.isfile(mp3_file):
                    os.remove(mp3_file)
                print("{} - FAILED to process {}".format(result, mp4_file))
                continue
            print("{} - processed from {}".format(result, mp4_file))
# Run the MP4 -> MP3 pass, then print a completion line naming the scanned folder.
process_files()
print(''.join(('generated MP3s for all MP4s in ', source_dir)))



In [None]:
# Load the 'medium.en' Whisper model once, on the 'cuda' device; the transcription loop reuses it.
model = whisper.load_model(
    'medium.en',
    device='cuda',
)

def process_files(root_dir=None, transcriber=None):
    """Transcribe every MP3 that does not have a transcript yet.

    Walks ``root_dir`` recursively. For each file whose name ends in ``.mp3``
    and that has no sibling ``.txt`` with the same base name, the audio is
    transcribed and the transcript is written as UTF-8, one segment per line.

    Args:
        root_dir: Directory to scan. Defaults to the notebook-level ``source_dir``.
        transcriber: Object with a ``transcribe(path)`` method returning a mapping
            that has a ``'segments'`` list. Defaults to the notebook-level ``model``.
    """
    if root_dir is None:
        root_dir = source_dir
    if transcriber is None:
        transcriber = model

    for root, _dirs, files in os.walk(root_dir):
        for file in files:
            if not file.endswith(".mp3"):
                continue
            mp3_file = os.path.join(root, file)
            txt_file = mp3_file[:-4] + '.txt'
            if os.path.isfile(txt_file):
                continue

            print('processing {}'.format(mp3_file))
            result = transcriber.transcribe(mp3_file)
            # result has the following structure:
            #  text: string
            #  language: string
            #  segments: array of
            #    id: number
            #    seek: number
            #    start: number
            #    end: number
            #    text: string
            #    tokens: number[]
            #    temperature: number
            #    avg_logprob: number
            #    compression_ratio: number
            #    no_speech_prob: number

            # if we used text, we'll get one giant line.  Instead, we'll use segments
            segments = result['segments']
            print('got {} segments from {}'.format(len(segments), mp3_file))
            # Explicit UTF-8: with the platform default encoding, a character it cannot
            # represent would raise mid-write and leave a partial transcript that the
            # isfile() check above would then skip forever.
            with open(txt_file, 'w', encoding='utf-8') as f:
                for segment in segments:
                    f.write(segment['text'] + '\n')
            print('wrote to {}'.format(txt_file))
# Run the MP3 -> TXT pass, then print a completion line naming the scanned folder.
process_files()
print(''.join(('generated TXTs for all MP3s in ', source_dir)))


In [None]:
# Two-message prompt: a fixed system instruction that pins the answer format,
# followed by a human message filled from the "text" input.
chat_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """You are a helpful assistant that identifies the date a file was created based on information in the filename.
    All dates are between 2019 and 2030, and any dates in the filename are written with the month before the date (ie. american style).
    The resulting date should be formatted as YYYY-mm-dd - ie. 2021-02-15.
    Your response should _only_ contain the date, and nothing else.
    """,
        ),
        ("human", "{text}"),
    ]
)
# Pipe the prompt into the chat model so a single invoke() renders and sends it.
chain = chat_prompt | chat_model

def process_files(root_dir=None, date_chain=None):
    """Ask the LLM for a creation date for every TXT that has no date file yet.

    Walks ``root_dir`` recursively. For each file whose name ends in ``.txt``
    and that has no sibling ``.date`` with the same base name, the file's path
    is sent to the chain. If the reply, after trimming surrounding whitespace,
    looks like ``YYYY-mm-dd`` it is written to the ``.date`` file; otherwise a
    message is printed and no file is written, so a later run will ask again.

    Args:
        root_dir: Directory to scan. Defaults to the notebook-level ``source_dir``.
        date_chain: Object with an ``invoke({"text": ...})`` method whose result
            has a ``content`` string. Defaults to the notebook-level ``chain``.
    """
    if root_dir is None:
        root_dir = source_dir
    if date_chain is None:
        date_chain = chain

    for root, _dirs, files in os.walk(root_dir):
        for file in files:
            if not file.endswith(".txt"):
                continue
            txt_file = os.path.join(root, file)
            date_file = txt_file[:-4] + '.date'
            if os.path.isfile(date_file):
                continue

            print('getting date for {}'.format(txt_file))
            result = date_chain.invoke({"text": txt_file})
            # Trim first: replies may carry leading/trailing whitespace or a final
            # newline, which must neither fail validation nor end up in the file.
            date = result.content.strip()
            # fullmatch makes sure the whole reply is YYYY-mm-dd and nothing else
            if not re.fullmatch(r'\d{4}-\d{2}-\d{2}', date):
                print('invalid date: {} generated for {}'.format(date, txt_file))
                continue
            with open(date_file, 'w') as f:
                f.write(date)
# Run the TXT -> DATE pass, then print a completion line naming the scanned folder.
process_files()
print(''.join(('generated DATEs for all TXTs in ', source_dir)))
