# In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from kiwipiepy import Kiwi

# 1. Load the phishing transcripts and the normal-call transcripts.
df_phish = pd.read_csv("C:/Users/user/Downloads/0708/woogawooga_project/dataset/피싱데이터에서llm테스트뺀것.csv")
df_normal = pd.read_csv("C:/Users/user/Downloads/0708/woogawooga_project/dataset/일반통화_남은셋.csv")

# 2. Coerce the text column to str.
# Missing values are blanked first: a bare astype(str) turns NaN into the
# literal string "nan", which would then be joined into the dialogue text.
df_phish["text"] = df_phish["text"].fillna("").astype(str)
df_normal["text"] = df_normal["text"].fillna("").astype(str)

# 3. Merge the phishing utterances into one text per dialogue, keyed by file_name.
phish_dialogue = df_phish.groupby("file_name")["text"].apply(" ".join).reset_index()
# Attach each dialogue's phishing_type. When a file_name occurs on several
# rows, the dict keeps the value from the last of them.
# NOTE(review): presumably every row of a file carries the same
# phishing_type — confirm against the dataset.
phish_dialogue["phishing_type"] = phish_dialogue["file_name"].map(
    df_phish.set_index("file_name")["phishing_type"].to_dict()
)

# 4. Create the Kiwi morphological analyzer used by the tokenizer below.
kiwi = Kiwi()

# Lemma-based tokenizer.
def tokenize_kiwi_lemmatized(text: str) -> str:
    """Return the lemmas of the N*- and V*-tagged tokens of *text*.

    The text is analysed with the module-level ``kiwi`` analyzer.

    Args:
        text: Raw text to tokenize.

    Returns:
        The ``lemma`` of every token whose tag starts with "N" or "V",
        in token order, joined by single spaces. An empty string when no
        token qualifies.
    """
    # Any "VA..." tag already starts with "V", so the separate "VA" check the
    # code used to carry could never change the outcome; one tuple-form
    # startswith covers every kept tag.
    # NOTE(review): the bare "V" prefix also admits every other V* tag the
    # analyzer emits, not only verbs/adjectives — confirm that is intended.
    kept_tag_prefixes = ("N", "V")
    return ' '.join(
        token.lemma
        for token in kiwi.tokenize(text)
        if token.tag.startswith(kept_tag_prefixes)
    )

# 5. Lemma-tokenize each phishing dialogue and the whole normal-call corpus.
phish_dialogue["tokenized_text"] = phish_dialogue["text"].apply(tokenize_kiwi_lemmatized)
normal_text = ' '.join(df_normal["text"])
normal_tokenized = tokenize_kiwi_lemmatized(normal_text)

# 6. Risk scoring (TF-IDF based).
# The corpus has exactly two documents: every phishing dialogue merged into
# one, and every normal call merged into one.
docs = [' '.join(phish_dialogue["tokenized_text"]), normal_tokenized]
vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = vectorizer.fit_transform(docs)

words = vectorizer.get_feature_names_out()
# Densify the sparse matrix once and take both rows from the same array.
dense_scores = tfidf_matrix.toarray()
phish_scores = dense_scores[0]
normal_scores = dense_scores[1]

# Risk = phishing weight / normal weight. epsilon keeps the division defined
# for words absent from the normal corpus; their denominator is epsilon
# alone, so those words receive very large ratios.
epsilon = 1e-6
risk_scores = phish_scores / (normal_scores + epsilon)
risk_dict = dict(zip(words, risk_scores))  # word -> risk score

# 7. Extract the top risk keywords for each phishing type.
grouped_type = df_phish.groupby("phishing_type")["text"].apply(' '.join).reset_index()
grouped_type["tokenized_text"] = grouped_type["text"].apply(tokenize_kiwi_lemmatized)

# risk_dict is keyed by the vectorizer's vocabulary, i.e. by tokens after
# its own preprocessing (lowercasing, token pattern). Splitting the text
# with the same analyzer keeps the lookups consistent with those keys; a
# plain str.split() would silently miss any token that differs only by
# that normalisation.
analyzer = vectorizer.build_analyzer()

top_keywords_by_type = {}
for ptype, tokenized in zip(grouped_type["phishing_type"], grouped_type["tokenized_text"]):
    tokens = analyzer(tokenized)

    # Repeated tokens collapse into a single entry, in first-seen order.
    keyword_scores = {word: risk_dict[word] for word in tokens if word in risk_dict}
    # Keep the 30 highest-scoring keywords; the stable sort leaves ties in
    # first-seen order.
    top_keywords = sorted(keyword_scores.items(), key=lambda x: x[1], reverse=True)[:30]
    top_keywords_by_type[ptype] = dict(top_keywords)

# 8. Save the result: one row per phishing type, one column per keyword.
# Cells are empty where a keyword is not among that type's top list.
df_result = pd.DataFrame(top_keywords_by_type).T
df_result.to_csv("유형별_위험키워드_Kiwi.csv", encoding="utf-8-sig")

print("'유형별_위험키워드_Kiwi.csv' 저장 완료 (표제어 기반)")

