In [None]:
import os
# Order CUDA devices by PCI bus ID and expose only device index 1 to this
# process. This cell runs before any cell that imports whisper/torch.
os.environ.update(
    {
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
        "CUDA_VISIBLE_DEVICES": "1",
    }
)

## 모델 상세 사용 코드

In [None]:
import whisper

# Load the large-v3 checkpoint.
model = whisper.load_model("large-v3")

# Load the audio and pad/trim it to fit 30 seconds.
audio = whisper.load_audio("testAudio/noNoise_codeapple.wav")
audio = whisper.pad_or_trim(audio)

# Build the log-Mel spectrogram and move it to the same device as the model.
# The number of Mel bins is read from the loaded checkpoint instead of being
# hard-coded to 128, so this cell keeps working if the checkpoint above changes.
mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(model.device)

# Detect the spoken language and report the most probable one.
_, probs = model.detect_language(mel)
print(f"Detected language: {max(probs, key=probs.get)}")

# Decode the audio with the default decoding options.
options = whisper.DecodingOptions()
result = whisper.decode(model, mel, options)

# Print the recognized text.
print(result.text)

In [None]:
import torch
import whisper

# Load the model.
model = whisper.load_model("large")

# Path of the audio file to decode.
audio_file_path = "testAudio/noNoise_codeapple.wav"

# Load the audio, then pad or trim it as needed.
audio = whisper.load_audio(audio_file_path)
audio = whisper.pad_or_trim(audio)

# Build the log-Mel spectrogram on the model's device. The Mel-bin count is
# taken from the loaded checkpoint rather than hard-coded to 128: which
# checkpoint the "large" alias resolves to depends on the installed whisper
# version, and a mismatched bin count makes the encoder reject the input.
mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(model.device)

# Check the spectrogram dimensions.
print(f"Mel shape: {mel.shape}")

# Detect the spoken language and report the most probable one.
_, probs = model.detect_language(mel)
print(f"Detected language: {max(probs, key=probs.get)}")

# Decoding options: force Korean transcription and keep timestamps enabled.
options = whisper.DecodingOptions(language="ko", task="transcribe", without_timestamps=False)

# Decode the audio and keep the result.
result = model.decode(mel, options)

## 모델 사용 코드

### 모델 임포트 및 로드

In [None]:
import whisper
import torch

model = whisper.load_model("medium")  # use "large" or "medium"

In [None]:
# Placeholder path; point this at the audio file to transcribe.
origin = "path/to/audio.wav"

# Load the audio and transcribe it with the notebook-level `model`.
audio = whisper.load_audio(origin)
result = model.transcribe(audio)

In [None]:
import whisper
import time

# Measure how long each checkpoint takes to load.
# time.perf_counter() is used for all measurements: it is a monotonic clock
# meant for elapsed intervals, whereas time.time() can jump if the system
# clock is adjusted while a model is loading or transcribing.
start_time = time.perf_counter()
large_model = whisper.load_model("large")
large_loading_time = time.perf_counter() - start_time

start_time = time.perf_counter()
base_model = whisper.load_model("base")
base_loading_time = time.perf_counter() - start_time

print(f"Large Model Loading Time: {large_loading_time:.2f} seconds")
print(f"Base Model Loading Time: {base_loading_time:.2f} seconds")

# Measure how long each model takes to transcribe the same audio.
audio = whisper.load_audio("testAudio/noNoise_codeapple.wav")

start_time = time.perf_counter()
large_model_output = large_model.transcribe(audio)
large_inference_time = time.perf_counter() - start_time

start_time = time.perf_counter()
base_model_output = base_model.transcribe(audio)
base_inference_time = time.perf_counter() - start_time

print(f"Large Model Inference Time: {large_inference_time:.2f} seconds")
print(f"Base Model Inference Time: {base_inference_time:.2f} seconds")

In [None]:
# Save the transcription results to a text file.
#
# `outputs` is only created by the batch-transcription cell further down the
# notebook, so on a fresh top-to-bottom run it does not exist yet when this
# cell executes. Use it when present; otherwise fall back to the results
# produced by the large/base timing cell above instead of raising NameError.
outputs_to_save = globals().get("outputs")
if outputs_to_save is None:
    outputs_to_save = {
        "Large Model": large_model_output,
        "Base Model": base_model_output,
    }

with open("transcription_outputs.txt", "w", encoding="utf-8") as text_file:
    for model_name, output in outputs_to_save.items():
        text_file.write(f"{model_name} Output:\n")
        text_file.write(f"{output}\n\n")  # blank line between the models' outputs

print("Transcription outputs saved to transcription_outputs.txt")

In [None]:
# Ask PyTorch to release cached GPU memory that is no longer in use.
# NOTE(review): the models loaded in earlier cells are still bound to notebook
# variables, so their weights presumably stay allocated; confirm whether they
# should be deleted before this call.
torch.cuda.empty_cache()

## 시간 측정 코드

In [None]:
# Measure how long the medium and small checkpoints take to load.
# `time` and `whisper` come from the imports of an earlier cell.
# time.perf_counter() is used for all measurements: it is a monotonic clock
# meant for elapsed intervals, whereas time.time() can jump if the system
# clock is adjusted mid-measurement.
start_time = time.perf_counter()
medium_model = whisper.load_model("medium")
medium_loading_time = time.perf_counter() - start_time

start_time = time.perf_counter()
small_model = whisper.load_model("small")
small_loading_time = time.perf_counter() - start_time

print(f"medium Model Loading Time: {medium_loading_time:.2f} seconds")
print(f"small Model Loading Time: {small_loading_time:.2f} seconds")

# Measure how long each model takes to transcribe the same audio.
audio = whisper.load_audio("testAudio/noNoise_codeapple.wav")

start_time = time.perf_counter()
medium_model_output = medium_model.transcribe(audio)
medium_inference_time = time.perf_counter() - start_time

start_time = time.perf_counter()
small_model_output = small_model.transcribe(audio)
small_inference_time = time.perf_counter() - start_time

print(f"medium Model Inference Time: {medium_inference_time:.2f} seconds")
print(f"small Model Inference Time: {small_inference_time:.2f} seconds")

In [None]:
# Transcribe a second clip with all four models loaded in the cells above and
# time each run. time.perf_counter() is used because it is a monotonic clock
# meant for elapsed intervals; time.time() can jump if the system clock is
# adjusted mid-measurement.
audio = whisper.load_audio("testAudio/noNpm.wav")

start_time = time.perf_counter()
large_model_output = large_model.transcribe(audio)
large_inference_time = time.perf_counter() - start_time

start_time = time.perf_counter()
base_model_output = base_model.transcribe(audio)
base_inference_time = time.perf_counter() - start_time

start_time = time.perf_counter()
medium_model_output = medium_model.transcribe(audio)
medium_inference_time = time.perf_counter() - start_time

start_time = time.perf_counter()
small_model_output = small_model.transcribe(audio)
small_inference_time = time.perf_counter() - start_time

# Report the timings in the order large, medium, small, base.
print(f"Large Model Inference Time: {large_inference_time:.2f} seconds")
print(f"medium Model Inference Time: {medium_inference_time:.2f} seconds")
print(f"small Model Inference Time: {small_inference_time:.2f} seconds")
print(f"Base Model Inference Time: {base_inference_time:.2f} seconds")

In [None]:
# Display the small model's transcription result (the cell's last expression
# is rendered as its output).
small_model_output

In [None]:
import time
from tqdm import tqdm  # tqdm 라이브러리 추가
import whisper

# Load the audio file.
audio = whisper.load_audio("testAudio/Python_just_1Hour.mp3")

# (display name, loaded model) pairs to benchmark.
models = [
    ("Large Model", large_model),
    ("Base Model", base_model),
    ("Medium Model", medium_model),
    ("Small Model", small_model)
]

# Transcription result per display name.
outputs = {}

# Transcribe with every model, showing a progress bar and timing each run.
# The loop variable is deliberately not named `model`: that name is the
# notebook-level model loaded in an earlier cell, and rebinding it here would
# silently leave it pointing at the last entry of `models`.
# time.perf_counter() is used because it is a monotonic clock meant for
# elapsed intervals; time.time() can jump if the system clock is adjusted.
for model_name, candidate_model in tqdm(models, desc="Transcribing", unit="model"):
    start_time = time.perf_counter()
    outputs[model_name] = candidate_model.transcribe(audio)
    elapsed_time = time.perf_counter() - start_time
    # tqdm.write prints without breaking the progress bar rendering.
    tqdm.write(f"{model_name} inference time: {elapsed_time:.2f} seconds")


In [None]:
def calculate_avg_logprob(model_outputs):
    """Return the mean segment-level ``avg_logprob`` for each model output.

    Args:
        model_outputs: Iterable of result dicts. Each dict must have a
            ``'segments'`` list whose items carry an ``'avg_logprob'`` value.

    Returns:
        A list with one float per entry of ``model_outputs``, in the same
        order. An entry whose ``'segments'`` list is empty yields
        ``float('nan')`` instead of raising ``ZeroDivisionError``.

    Raises:
        KeyError: If an output lacks ``'segments'`` or a segment lacks
            ``'avg_logprob'``.
    """
    avg_logprobs = []

    for model_output in model_outputs:
        segments = model_output['segments']
        if segments:
            avg_logprob = sum(segment['avg_logprob'] for segment in segments) / len(segments)
        else:
            # No segments means there is nothing to average; mark the entry
            # as undefined rather than dividing by zero.
            avg_logprob = float("nan")
        avg_logprobs.append(avg_logprob)

    return avg_logprobs

# Transcription results from the per-model inference cells, in the order
# base, small, medium, large.
model_outputs = list(
    (base_model_output, small_model_output, medium_model_output, large_model_output)
)

# Mean segment log-probability per model, in the same order as above.
avg_logprobs = calculate_avg_logprob(model_outputs)
print("각 모델의 avg_logprob 비율:", avg_logprobs)


In [None]:
# List the attribute and method names available on the `outputs` dict.
# NOTE(review): looks like leftover exploration; confirm whether this cell
# is still needed.
dir(outputs)