In [None]:
import sys
# Report which interpreter this kernel runs on, followed by its version string.
for interpreter_detail in (sys.executable, sys.version):
    print(interpreter_detail)

In [None]:
!uv add git+https://github.com/openai/whisper.git
!uv add whisperx
!uv add pyannote.audio
!uv add torch
!uv add onnxruntime
# !pip install onnxruntime-gpu  <-- use this instead when running on an NVIDIA GPU

In [None]:
import sys
import subprocess

def check_version(pkg):
    """Return the version string of an importable package.

    Args:
        pkg: Dotted import name, e.g. ``"torch"`` or ``"pyannote.audio"``.

    Returns:
        The module's ``__version__`` when it defines one; otherwise the version
        recorded in the installed distribution metadata under the same name;
        ``'버전 정보 없음'`` when neither is available; ``'미설치'`` when the
        import fails with ``ImportError``.
    """
    # Imported locally so the function does not rely on cell-level imports.
    import importlib
    import importlib.metadata

    try:
        # import_module returns the submodule itself for dotted names, whereas
        # __import__("a.b") returns the top-level package "a", whose
        # __version__ (if any) is not the one being asked for.
        mod = importlib.import_module(pkg)
    except ImportError:
        return '미설치'

    version = getattr(mod, '__version__', None)
    if version is not None:
        return version

    # Some packages expose no __version__ attribute; fall back to the
    # distribution metadata registered under the same name.
    try:
        return importlib.metadata.version(pkg)
    except importlib.metadata.PackageNotFoundError:
        return '버전 정보 없음'

def pip_version(pkg):
    """Look up a package's version through ``pip show``.

    Runs ``<current interpreter> -m pip show <pkg>`` and returns the text after
    the first ``Version:`` header. Returns ``"미설치"`` when no such header is
    present or when anything goes wrong (pip unavailable, unknown package, ...).
    """
    fallback = "미설치"
    try:
        report = subprocess.check_output(
            [sys.executable, "-m", "pip", "show", pkg],
            encoding="utf-8",
        )
        version_values = (
            row.split(":", 1)[1].strip()
            for row in report.splitlines()
            if row.startswith("Version:")
        )
        # First matching header wins; fall back when there is none.
        return next(version_values, fallback)
    except Exception:
        return fallback

# Interpreter and pip first, then every library of interest in a fixed order.
print(f"Python: {sys.version}")
print(f"pip: {pip_version('pip')}")

for module_name in (
    "whisperx",
    "pyannote.audio",
    "torch",
    "llvmlite",
    "numpy",
    "scipy",
    "librosa",
    "numba",
):
    print(f"{module_name}: {check_version(module_name)}")


In [None]:
# importlib.metadata is part of the standard library, so this check works even
# when setuptools (which provides the deprecated pkg_resources) is not installed.
from importlib import metadata as importlib_metadata

# Distribution names whose installed version should be reported.
packages = [
    "torch",
    "whisperx",
    "pyannote.audio",
    "speechbrain",
    "tqdm",
    "setuptools"
]

for package in packages:
    try:
        version = importlib_metadata.version(package)
        print(f"{package} version: {version}")
    except importlib_metadata.PackageNotFoundError:
        print(f"{package} is not installed")


In [None]:
import whisper

# Hard-coded path of the recording to transcribe.
audio_path = r"C:\Users\user\Desktop\woogawooga\woogawooga_project\dataset\(대출빙자형)KB저축은행인데요 카카오톡친구추천부탁드립니다_ (1).mp3"

# Load the "large" Whisper checkpoint and run transcription on the file.
model = whisper.load_model("large")  # or "small", "medium", "large"
result = model.transcribe(audio_path)
# Print the value stored under the "text" key of the transcription result.
print(result["text"])

In [None]:
!uv pip install --upgrade whisperx

In [None]:
from pyannote.audio import Pipeline
import torch

import os

# Read the Hugging Face access token from the HF_TOKEN environment variable so
# the secret is never stored in the notebook itself.
HF_TOKEN = os.environ.get("HF_TOKEN")
if not HF_TOKEN:
    raise RuntimeError("HF_TOKEN 환경 변수에 Hugging Face 액세스 토큰을 설정하세요.")

# Load the pretrained speaker-diarization pipeline using that token.
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=HF_TOKEN
)

# (Optional) when using a GPU
# pipeline.to(torch.device("cuda"))

audio_path = r"C:\Users\user\Desktop\woogawooga\woogawooga_project\dataset\(대출빙자형)KB저축은행인데요 카카오톡친구추천부탁드립니다_ (1).mp3"

diarization = pipeline(audio_path)

# Print every speaker turn with its start and end time in seconds.
for turn, _, speaker in diarization.itertracks(yield_label=True):
    print(f"화자: {speaker} | {turn.start:.2f}s ~ {turn.end:.2f}s")


In [None]:
import whisperx
from pyannote.audio import Pipeline
import csv
import subprocess
import os

# === Settings ===
audio_path = r"C:\Users\user\Desktop\woogawooga\woogawooga_project\dataset\1. 기존 대출금 일부 변제해야 저금리 대출 가능(햇살론 사칭)_.mp3"
device = "cpu"
# Take the Hugging Face token from the HF_TOKEN environment variable so it is
# not committed with the notebook; stays an empty string when the variable is unset.
HF_TOKEN = os.environ.get("HF_TOKEN", "")

# === MP3 → WAV conversion ===
def convert_to_wav(input_path, sample_rate=16000):
    """Convert an audio file to a mono WAV file by invoking ffmpeg.

    Args:
        input_path: Path of the source audio file.
        sample_rate: Output sample rate, passed to ffmpeg's ``-ar`` option.

    Returns:
        The output path: ``input_path`` with its final extension replaced by
        ``_converted.wav``.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status.
    """
    # Swap only the final extension. A plain str.replace(".mp3", ...) leaves
    # the path unchanged for inputs such as ".MP3" or ".m4a" (so ffmpeg would be
    # asked to overwrite its own input) and also rewrites ".mp3" occurring
    # inside directory names.
    stem, _extension = os.path.splitext(input_path)
    output_path = stem + "_converted.wav"
    cmd = [
        "ffmpeg", "-y",
        "-i", input_path,
        "-ac", "1",  # mono
        "-ar", str(sample_rate),
        output_path
    ]
    completed = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # Surface conversion failures here instead of returning a path to a file
    # that was never written.
    if completed.returncode != 0:
        raise RuntimeError(
            f"ffmpeg failed (exit code {completed.returncode}): "
            + completed.stderr.decode("utf-8", errors="replace")
        )
    return output_path

wav_path = convert_to_wav(audio_path)

# === 1. WhisperX speech-to-text and alignment ===
print("🔹 WhisperX 모델 로딩 중...")
model = whisperx.load_model("large-v2", device, compute_type="float32")
stt_result = model.transcribe(wav_path, language="ko")

# Align the transcribed segments against the audio with the Korean alignment model.
model_a, metadata = whisperx.load_align_model(language_code="ko", device=device)
aligned_result = whisperx.align(stt_result["segments"], model_a, metadata, wav_path, device)

# === 2. Pyannote speaker diarization ===
print("🔹 Pyannote 화자 분리 모델 로딩 중...")
try:
    pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", use_auth_token=HF_TOKEN)
except Exception as e:
    print("❌ Pyannote 모델 로딩 실패:", e)
    # Raise rather than call exit(): an exception stops this cell right here
    # and shows up as a normal traceback in the notebook.
    raise RuntimeError("💥 Pyannote 모델 로딩 실패. 실행 중단합니다.") from e

# from_pretrained can hand back None instead of raising (for example when the
# model could not be fetched); stop with a clear message instead of failing on
# the call below.
if pipeline is None:
    raise RuntimeError("💥 diarization pipeline is None. 실행 중단합니다.")

diarization_result = pipeline(wav_path)

# === 3. Keyword-based role classification ===
# Substrings whose presence in an utterance marks the speaker as the phisher ("피싱범").
phisher_keywords = [
    "사기", "검찰", "경찰", "금융감독원", "압박", "계좌", "송금", "범죄", "확인", "가짜",
    "구속", "영장", "악성", "앱", "탈취", "계좌이체", "연락", "금전", "처벌", "조작",
    "법원", "공갈", "위조", "신분증", "금융", "피해", "위험", "사기범", "허위", "고소",
    "사건", "범인", "제재", "통제", "법적", "사칭", "수사", "협박", "불법", "수금",
    "명령", "강제", "증거", "위협", "중앙지검", "불법행위", "피해자", "압류", "통보", "단속", "네네"
]

# Substrings whose presence in an utterance marks the speaker as the victim ("피해자").
# Several entries (e.g. "확인", "계좌", "신분증", "연락") also appear in phisher_keywords,
# and "확인" is listed more than once here.
victim_keywords = [
    "저", "제가", "우리", "어머니", "아버지", "회사", "돈", "도움", "확인", "모르겠어요",
    "죄송", "무서워요", "걱정", "전화", "상담", "사실", "설명", "전화번호", "진짜", "불안",
    "몰라요", "어떻게", "질문", "말씀", "신분증", "통장", "계좌", "확인", "부탁", "답답해요",
    "부탁드립니다", "알려주세요", "도와주세요", "맞나요", "어떻게 해야", "지금", "방금", "알겠습니다",
    "고맙습니다", "네", "아니요", "잘 모르겠습니다", "조심", "감사합니다", "연락", "계속", "다시", "알았어요", "그렇군요"
]

def is_overlap(start1, end1, start2, end2):
    """Tell whether two intervals share a span of non-zero length.

    The later of the two starts must lie strictly before the earlier of the two
    ends, so intervals that merely touch at an endpoint do not overlap.
    """
    overlap_begin = max(start1, start2)
    overlap_end = min(end1, end2)
    return overlap_begin < overlap_end

# Reduce each aligned segment to its timing and whitespace-stripped text.
transcripts = [
    dict(start=segment["start"], end=segment["end"], text=segment["text"].strip())
    for segment in aligned_result["segments"]
]

def _classify_role(text):
    """Classify an utterance by keyword lookup.

    Returns "피싱범" when any phisher keyword occurs in the text, otherwise
    "피해자" when any victim keyword occurs, otherwise "알 수 없음". A phisher
    keyword therefore takes precedence when both kinds are present.
    """
    lowered = text.lower()
    if any(kw in lowered for kw in phisher_keywords):
        return "피싱범"
    if any(kw in lowered for kw in victim_keywords):
        return "피해자"
    return "알 수 없음"


# Transcript segments already attributed to an earlier turn, keyed by
# (rounded start, rounded end, text) so each segment is printed only once.
seen_segments = set()
results = []

print("\n📣 전체 발화 내용:")
print("-" * 80)

all_turns = list(diarization_result.itertracks(yield_label=True))

for idx, (turn, _, speaker) in enumerate(all_turns):
    # Collect the not-yet-used transcript segments overlapping this turn.
    matched_texts = []
    for t in transcripts:
        if is_overlap(t["start"], t["end"], turn.start, turn.end):
            key = (round(t["start"], 2), round(t["end"], 2), t["text"])
            if key not in seen_segments:
                matched_texts.append(t["text"])
                seen_segments.add(key)

    combined_text = " ".join(matched_texts).strip()
    if not combined_text:
        continue

    role = _classify_role(combined_text)

    current_entry = {
        "역할": role,
        "시작시간": round(turn.start, 2),
        "종료시간": round(turn.end, 2),
        "발화내용": combined_text
    }

    if role == "알 수 없음":
        # Role of the previous emitted entry, with any " (추정)" suffix removed.
        prev_role = results[-1]["역할"].replace(" (추정)", "") if results else None

        # Role of the first later turn whose overlapping text can be
        # classified. The same helper as above is used so the look-ahead
        # applies the same keyword precedence as the main classification.
        next_role = None
        for future_turn, _, _ in all_turns[idx + 1:]:
            future_texts = [
                t["text"]
                for t in transcripts
                if is_overlap(t["start"], t["end"], future_turn.start, future_turn.end)
            ]
            if not future_texts:
                continue
            future_role = _classify_role(" ".join(future_texts))
            if future_role != "알 수 없음":
                next_role = future_role
                break

        # Only when both neighbours agree is the unknown turn given that role,
        # marked as an estimate.
        if prev_role and next_role and prev_role == next_role and prev_role != "알 수 없음":
            current_entry["역할"] = f"{prev_role} (추정)"

    print(f"[{current_entry['역할']}] 화자: {speaker} | {current_entry['시작시간']}s ~ {current_entry['종료시간']}s")
    print(f"👉 {current_entry['발화내용']}")
    print("-" * 80)

    results.append(current_entry)

# === 4. Save to CSV ===
csv_path = "dialogue_roles.csv"
column_names = ["역할", "시작시간", "종료시간", "발화내용"]

# The utf-8-sig codec writes a byte-order mark at the start of the file.
with open(csv_path, mode="w", encoding="utf-8-sig", newline="") as csv_file:
    row_writer = csv.DictWriter(csv_file, fieldnames=column_names)
    row_writer.writeheader()
    for entry in results:
        row_writer.writerow(entry)

print(f"\n✅ CSV 저장 완료: {csv_path}")


In [None]:
import importlib.metadata as metadata

# importlib.metadata looks packages up by distribution name, not import name:
# the `whisper` module is shipped by the "openai-whisper" distribution, so
# querying "whisper" reports it as missing even when it is installed.
packages = ["openai-whisper", "whisperx", "pyannote.audio", "torch"]

for pkg in packages:
    try:
        version = metadata.version(pkg)
        print(f"{pkg}: {version}")
    except metadata.PackageNotFoundError:
        print(f"{pkg}: ❌ 설치되어 있지 않음")

In [None]:
import pyannote.audio
# Print the version string exposed by the imported pyannote.audio package.
print(pyannote.audio.__version__)

In [None]:
from pyannote.audio import Pipeline

# Load a speaker-diarization pipeline. HF_TOKEN is not assigned in this cell,
# so a cell that defines it has to be run first.
# NOTE(review): this cell requests "speaker-diarization-3.0" while the other
# cells in this notebook request "speaker-diarization-3.1" — confirm which is intended.
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.0", use_auth_token=HF_TOKEN)
# NOTE(review): "your_audio_file.wav" looks like a placeholder — point it at a
# real audio file before running.
diarization = pipeline("your_audio_file.wav")