In [70]:
from moviepy.editor import VideoFileClip
def extract_audio(video_path, audio_path):
    """Extract the audio track of a video file and write it to ``audio_path``.

    Args:
        video_path: Path of the video file to read.
        audio_path: Path of the audio file to write.

    Raises:
        ValueError: If the video has no audio track.
    """
    print("extract_audio processing...")
    video = VideoFileClip(video_path)
    try:
        audio = video.audio
        if audio is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError(f"No audio track found in {video_path!r}")
        audio.write_audiofile(audio_path)
    finally:
        # Always release the clip, even when writing the audio fails.
        video.close()

import whisper
def transcribe_audio(audio_path, model_name="base"):
    """Transcribe an audio file with a Whisper model.

    Args:
        audio_path: Path of the audio file to transcribe.
        model_name: Name passed to ``whisper.load_model``.

    Returns:
        Whatever ``model.transcribe`` returns; the rest of this file reads
        its ``"text"`` and ``"segments"`` entries.
    """
    print("transcribe_audio processing...")
    # The model is loaded on every call and used once.
    return whisper.load_model(model_name).transcribe(audio_path)

# No script is available yet, so matching relies entirely on the speech-recognition output.
# If the actual script becomes available, replace the text parsed by speech recognition with it.

import re
def search_text(transcript, query):
    """Find every case-insensitive occurrence of ``query`` in ``transcript``.

    ``query`` is passed to ``re`` unescaped, so it is interpreted as a
    regular expression (e.g. a trailing ``?`` acts as a quantifier).

    NOTE: a later ``search_text(result, query)`` definition in this file
    rebinds the same name.

    Args:
        transcript: Text to search.
        query: Regular-expression pattern to look for.

    Returns:
        A list of ``(start, end)`` character offsets, one per match.
    """
    print("search_text processing...")
    pattern = re.compile(query, re.IGNORECASE)
    spans = []
    for hit in pattern.finditer(transcript):
        spans.append(hit.span())
    return spans

def search_text(result, query):
    """Find the first segment containing ``query`` and return a padded time range.

    The range runs from the start of the segment before the hit to the end
    of the segment after it, so playback includes a little context.

    Args:
        result: Mapping with a ``"segments"`` list; each segment provides
            ``"text"``, ``"start"`` and ``"end"``.
        query: Substring to look for (case-sensitive, literal match).

    Returns:
        ``(start, end, query)`` for the first matching segment, or
        ``(None, None, query)`` when no segment contains ``query``.
    """
    print("search_text processing...")
    segments = result["segments"]
    last = len(segments) - 1
    for idx, item in enumerate(segments):
        if query in item["text"]:
            # Clamp the neighbours: idx - 1 must not wrap around to the last
            # segment, and idx + 1 must not run past the end of the list.
            start = segments[max(idx - 1, 0)]["start"]
            end = segments[min(idx + 1, last)]["end"]
            return start, end, query
    return None, None, query

def map_time(result, matches):
    """Convert character-offset matches into ``(start_time, end_time)`` pairs.

    ``matches`` are ``(start, end)`` character spans such as those produced
    by a regex search over the transcript. Each span is located in the
    segment list by accumulating the length of every segment's text, and the
    times of the segments holding its first and last character are returned.

    NOTE(review): this assumes the searched transcript is the concatenation
    of the segments' ``"text"`` values in order - confirm against the
    transcription output.

    Args:
        result: Mapping with a ``"segments"`` list; each segment provides
            ``"text"``, ``"start"`` and ``"end"``.
        matches: Iterable of ``(start, end)`` character offsets (end exclusive).

    Returns:
        A list of ``(start_time, end_time)`` tuples. Matches that fall outside
        the text covered by the segments are skipped.
    """
    print("mapping time processing...")
    segments = result["segments"]

    # Character range [lo, hi) that each segment occupies in the joined text.
    bounds = []
    cursor = 0
    for seg in segments:
        length = len(seg["text"])
        bounds.append((cursor, cursor + length, seg))
        cursor += length

    time_matches = []
    for match_start, match_end in matches:
        # Segment holding the first character of the match.
        start_time = next(
            (seg["start"] for lo, hi, seg in bounds if lo <= match_start < hi),
            None,
        )
        # Segment holding the last character (match_end is exclusive).
        end_time = next(
            (seg["end"] for lo, hi, seg in reversed(bounds) if lo < match_end <= hi),
            None,
        )
        if start_time is None or end_time is None:
            # No segment covers this span; skip it rather than abort.
            continue
        time_matches.append((start_time, end_time))
    return time_matches

def print_results(time_matches):
    """Print one line per ``(start, end)`` pair, with times shown to two decimals.

    Args:
        time_matches: Iterable of ``(start, end)`` numeric pairs.
    """
    for span in time_matches:
        begin, finish = span
        line = f"대사 발견: {begin:.2f}초 - {finish:.2f}초"
        print(line)

def save_to_txt(script, file_path = "./assets/Goblin_test.txt", encoding='utf-8'):
    """Write ``script`` to a text file and report the outcome on stdout.

    Any exception raised while writing is caught and printed instead of
    being propagated, so this function never raises for a failed save.

    Args:
        script: Text to write.
        file_path: Destination file; it is overwritten if it exists.
        encoding: Text encoding used for the file.
    """
    try:
        with open(file_path, 'w', encoding=encoding) as handle:
            handle.write(script)
        print(f"파일이 성공적으로 저장되었습니다: {file_path}")
    except Exception as err:
        print(f"파일 저장 중 오류가 발생했습니다: {err!s}")

# Pipeline: extract the audio track, transcribe it, and save the transcript text.
# video_path and result are reused by the playback cells further down.
video_path = "./assets/Goblin_test.mp4"
audio_path = "./assets/Goblin_test.wav"
extract_audio(video_path, audio_path)
# option: base, small, medium, large, turbo
# turbo is optimized for inference speed
result = transcribe_audio(audio_path, model_name="turbo")
# Full transcript text; written to save_to_txt's default output path.
transcript = result['text']
save_to_txt(transcript)

# Processing the 8-minute video took about 5 minutes -> room for improvement, e.g. by running on a GPU.

extract_audio processing...
MoviePy - Writing audio in ./assets/Goblin_test.wav


                                                                        

MoviePy - Done.
transcribe_audio processing...


100%|█████████████████████████████████████| 1.51G/1.51G [01:09<00:00, 23.3MiB/s]


파일이 성공적으로 저장되었습니다: ./assets/Goblin_test.txt


In [67]:
from IPython.display import Video, HTML
import base64
from pathlib import Path

def play_video_basic(video_path, width=800):
    """Return a basic IPython ``Video`` player for ``video_path``.

    Args:
        video_path: Path or URL of the video to show.
        width: Display width in pixels.

    Returns:
        An ``IPython.display.Video`` object.
    """
    # Forward width; it was previously accepted but never applied.
    return Video(video_path, width=width)

def play_video_with_controls(video_path, width=800):
    """Return an HTML5 ``<video>`` player with controls for ``video_path``.

    The MIME type is derived from the file extension (``.mp4`` -> ``video/mp4``).

    Args:
        video_path: Path of the video file, used as the ``src`` attribute.
        width: Value of the element's ``width`` attribute.

    Returns:
        An ``IPython.display.HTML`` object wrapping the markup.
    """
    source = Path(video_path)
    # Drop the leading dot of the suffix to build the type attribute.
    mime = f'video/{source.suffix[1:]}'
    return HTML(f'''
    <video width="{width}" controls>
        <source src="{source}" type="{mime}">
        Your browser does not support the video tag.
    </video>
    ''')

def play_video_segment(video_path, start_time=0, end_time=None, width=800):
    """Return an HTML5 player that plays only part of a video.

    The segment is selected with a temporal fragment appended to the source
    URL: ``#t=start`` or ``#t=start,end``.

    Args:
        video_path: Path of the video file, used as the ``src`` attribute.
        start_time: Start of the segment; ``None`` is treated as ``0`` so a
            "not found" search result cannot produce ``#t=None``.
        end_time: End of the segment; when falsy (``None`` or ``0``) no end
            is added to the fragment.
        width: Value of the element's ``width`` attribute.

    Returns:
        An ``IPython.display.HTML`` object wrapping the markup.
    """
    video_path = Path(video_path)
    video_type = f'video/{video_path.suffix[1:]}'

    # Fall back to the beginning when no start time is available.
    if start_time is None:
        start_time = 0

    # Fragment format: "#t=<start>" or "#t=<start>,<end>".
    time_params = f"#t={start_time}"
    if end_time:
        time_params += f",{end_time}"

    html = f'''
    <video width="{width}" controls>
        <source src="{video_path}{time_params}" type="{video_type}">
        Your browser does not support the video tag.
    </video>
    '''
    return HTML(html)

In [76]:
# sample target_text
# Ask which sample line to search for; int() raises ValueError on non-numeric input.
num = int(input("몇 번째 대사를 찾을까요?"))
# Sample query phrases, keyed by the number the user enters.
text_corpus = {
    1: "아파",
    2: "잘 먹었어",
    3: "걱정 마",
    4: "어디 가는데요?",
    5: '그래',
    6: '괜찮으세요',
    7: '더러워',
    8: '처음이야',
}
# Raises KeyError when num is not one of the keys above.
target_text = text_corpus[num]

In [77]:
# Play only the part of the video where the target line was found.
start_time, end_time, query = search_text(result, target_text)
if start_time is None:
    # search_text returns (None, None, query) when no segment contains the
    # query; report that instead of building a player with "#t=None".
    print(f"대사를 찾지 못했습니다: {query}")
else:
    segment_player = play_video_segment(video_path, start_time=start_time, end_time=end_time)
    display(segment_player)

search_text processing...
