In [None]:
import os
import json
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from kiwipiepy import Kiwi

# 1. Output directory setup (save_csv / save_json below write into save_dir)
save_dir = "phishing_keyword_pipeline"
os.makedirs(save_dir, exist_ok=True)  # exist_ok: re-running the cell does not fail if the directory exists

def save_csv(df, name):
    """Write ``df`` as a CSV file named ``name`` inside ``save_dir``.

    The row index is not written and the file is encoded as ``utf-8-sig``.
    The destination path is printed after the write.
    """
    path = os.path.join(save_dir, name)
    csv_options = {"index": False, "encoding": "utf-8-sig"}
    df.to_csv(path, **csv_options)
    print(f"CSV 저장: {path}")

def save_json(obj, name):
    """Serialize ``obj`` to a JSON file named ``name`` inside ``save_dir``.

    Non-ASCII characters are written as-is (``ensure_ascii=False``) with a
    2-space indent, in a UTF-8 encoded file. The destination path is printed
    after the write.
    """
    path = os.path.join(save_dir, name)
    with open(path, "w", encoding="utf-8") as out_file:
        out_file.write(json.dumps(obj, ensure_ascii=False, indent=2))
    print(f"JSON 저장: {path}")

# 2. Load the data
# NOTE: DATA_DIR is a machine-specific absolute path; point it at your own
# dataset directory when running this notebook elsewhere.
DATA_DIR = "C:/Users/user/Desktop/보이스피싱/woogawooga_project/dataset"
df_phish = pd.read_csv(f"{DATA_DIR}/phishing_total.csv")
df_normal = pd.read_csv(f"{DATA_DIR}/normal_비율맞춘_학습데이터셋.csv")


# 3. Merge the rows of each conversation (one "file_name") into a single document
def _join_utterances(utterances):
    """Concatenate the values of one conversation's ``text`` rows with spaces.

    Missing values are dropped and the remaining values are cast to ``str``,
    so a NaN or numeric cell cannot make ``str.join`` raise ``TypeError``.
    A conversation whose rows are all missing yields an empty string.
    """
    return " ".join(utterances.dropna().astype(str))


# Phishing conversations also keep the first "phishing_type" value of the group.
phish_dialogue = df_phish.groupby("file_name").agg({
    "text": _join_utterances,
    "phishing_type": "first"
}).reset_index()

normal_dialogue = df_normal.groupby("file_name")["text"].apply(_join_utterances).reset_index()

save_csv(phish_dialogue, "phish_dialogue.csv")
save_csv(normal_dialogue, "normal_dialogue.csv")

# 4. Kiwi morphological analyzer (shared instance used by tokenize)
kiwi = Kiwi()

def tokenize(text):
    """Return the space-joined lemmas of the tokens of ``text`` that pass the tag filter.

    ``text`` is converted with ``str()`` and analyzed by the module-level
    ``kiwi`` instance. A token is kept when its tag starts with "N" or "V"
    (the "V" prefix already covers tags starting with "VA"), unless the tag
    starts with "NNP" (proper nouns), in which case it is dropped.
    """
    kept_lemmas = []
    for tok in kiwi.tokenize(str(text)):
        tag = tok.tag
        if tag.startswith("NNP"):  # drop proper nouns
            continue
        if tag.startswith(("N", "V")):
            kept_lemmas.append(tok.lemma)
    return ' '.join(kept_lemmas)

# Tokenize every conversation (phishing first, then normal) and persist the result
for dialogue_df in (phish_dialogue, normal_dialogue):
    dialogue_df["tokenized_text"] = dialogue_df["text"].apply(tokenize)

save_csv(phish_dialogue, "phish_tokenized.csv")
save_csv(normal_dialogue, "normal_tokenized.csv")

# 5. TF-IDF based risk score (voice phishing vs. normal conversations)
# Phishing documents come first so they can be sliced off by row position below.
all_docs = [*phish_dialogue["tokenized_text"], *normal_dialogue["tokenized_text"]]
# NOTE(review): TfidfVectorizer is used with its default token pattern, which
# (per the scikit-learn docs) only keeps tokens of 2+ characters -- confirm
# that dropping one-character lemmas is intended.
vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = vectorizer.fit_transform(all_docs)
words = vectorizer.get_feature_names_out()

n_phish, n_normal = len(phish_dialogue), len(normal_dialogue)
# Per-word mean TF-IDF over rows [0, n_phish) and over the remaining rows.
phish_avg = tfidf_matrix[:n_phish].mean(axis=0).A1
normal_avg = tfidf_matrix[n_phish:].mean(axis=0).A1

# risk = phishing mean / (normal mean + epsilon); epsilon keeps the denominator non-zero
epsilon = 1e-6
risk_dict = dict(zip(words, phish_avg / (normal_avg + epsilon)))
save_json(risk_dict, "risk_dict.json")

# 6. Per phishing type: score = risk * log(1 + occurrence count), keep the top 50
top_keywords_log = {}

for ptype, group in phish_dialogue.groupby("phishing_type"):
    tokens = [tok for doc in group["tokenized_text"] for tok in doc.split()]
    counts = Counter(tokens)

    # Only words that have a risk score (i.e. are in the vectorizer vocabulary) are scored.
    keyword_scores = {}
    for word, freq in counts.items():
        if word in risk_dict:
            keyword_scores[word] = risk_dict[word] * np.log1p(freq)

    ranked = sorted(keyword_scores.items(), key=lambda pair: pair[1], reverse=True)
    top50 = ranked[:50]
    top_keywords_log[ptype] = dict(top50)

save_json(top_keywords_log, "top_keywords_log.json")


✅ CSV 저장: phishing_keyword_pipeline2\phish_dialogue.csv
✅ CSV 저장: phishing_keyword_pipeline2\normal_dialogue.csv
✅ CSV 저장: phishing_keyword_pipeline2\phish_tokenized.csv
✅ CSV 저장: phishing_keyword_pipeline2\normal_tokenized.csv
✅ JSON 저장: phishing_keyword_pipeline2\risk_dict.json
✅ JSON 저장: phishing_keyword_pipeline2\top_keywords_log.json


In [None]:
import pandas as pd
import json
from collections import Counter

# 1. Load the tokenized conversations and attach the class label
#    (1 for the phishing file, 0 for the normal file)
phish_df = pd.read_csv("phishing_keyword_pipeline/phish_tokenized.csv").assign(label=1)
normal_df = pd.read_csv("phishing_keyword_pipeline/normal_tokenized.csv").assign(label=0)

total_df = pd.concat([phish_df, normal_df], ignore_index=True)

# 2. Load the per-type keyword scores
with open("phishing_keyword_pipeline/top_keywords_log.json", "r", encoding="utf-8") as f:
    keyword_dict = json.loads(f.read())

# 3. Union of the keywords of every type, sorted so the feature order is fixed
all_keywords = sorted({kw for scores in keyword_dict.values() for kw in scores.keys()})

# 4. Build occurrence-count features
def keyword_count_vector(text, keyword_set):
    """Count how often each keyword occurs in a whitespace-tokenized text.

    Args:
        text: Space-separated tokens. A non-string value (for example the NaN
            that pandas produces when an empty ``tokenized_text`` cell is read
            back from CSV) is treated as a document without tokens instead of
            raising ``AttributeError`` on ``.split()``.
        keyword_set: Iterable of keywords; the output follows its iteration order.

    Returns:
        list[int]: One occurrence count per keyword.
    """
    if not isinstance(text, str):
        return [0 for _ in keyword_set]
    counts = Counter(text.split())
    # Counter returns 0 for missing keys without inserting them.
    return [counts[word] for word in keyword_set]

# 5. Build the feature table: one "kw_<keyword>" count column per keyword, plus the label
X = total_df["tokenized_text"].apply(keyword_count_vector, args=(all_keywords,))
feature_columns = [f"kw_{kw}" for kw in all_keywords]
X_df = pd.DataFrame(X.tolist(), columns=feature_columns)
X_df["label"] = total_df["label"]

# 6. Save
X_df.to_csv("phishing_keyword_pipeline/X_train.csv", encoding="utf-8-sig", index=False)
print("X_train.csv 저장 완료!")


✅ X_train.csv 저장 완료!


In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
import scipy

# 1. Load the keyword-count feature table
df = pd.read_csv("phishing_keyword_pipeline/X_train.csv")

# 2. Split off the label column; everything else is a count feature
X_counts = df.drop(columns=["label"])
y = df["label"]

# 3. Apply TF-IDF weighting to the counts
# NOTE(review): the transformer is fit on every row, i.e. before the
# train/validation splits done in the later cells -- confirm this is intended.
transformer = TfidfTransformer()
X_tfidf = transformer.fit(X_counts).transform(X_counts)

# 4. Save (features in compressed sparse .npz format, labels as CSV)
scipy.sparse.save_npz("phishing_keyword_pipeline/X_train_tfidf.npz", X_tfidf)
y.to_csv("phishing_keyword_pipeline/y_train.csv", index=False)

print("TF-IDF 변환 완료 및 저장됨!")
print(f"TF-IDF 행렬 shape: {X_tfidf.shape}")


TF-IDF 변환 완료 및 저장됨!
TF-IDF 행렬 shape: (8047, 173)


로지스틱 테스트 1 (증강 데이터 미포함, 보류 구간 없음)

In [None]:
import pandas as pd
import scipy
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

# 1. Load the TF-IDF features and the labels
X = scipy.sparse.load_npz("phishing_keyword_pipeline/X_train_tfidf.npz")
y = pd.read_csv("phishing_keyword_pipeline/y_train.csv").values.ravel()

# 2. Stratified 80/20 train/validation split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 3. Train logistic regression and take the class-1 probability
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_prob = model.predict_proba(X_val)[:, 1]

# 4. Sweep candidate hold bands (low, high)
thresholds = [(round(i, 2), round(j, 2)) for i in np.arange(0.1, 0.5, 0.05)
                                            for j in np.arange(0.6, 0.91, 0.05) if i < j]

results = []

for low, high in thresholds:
    # p < low -> 0, p > high -> 1, anything else (including the bounds) -> 2 (held).
    # Vectorized instead of redefining a per-sample classify() on every iteration.
    y_pred = np.select([y_prob < low, y_prob > high], [0, 1], default=2)
    mask = y_pred != 2

    # Nothing was decided for this band, so there is nothing to score.
    if not mask.any():
        continue

    # Metrics are computed on the decided (non-held) samples only.
    # zero_division=0 is passed to all three metrics so an undefined score is
    # reported as 0 consistently, without UndefinedMetricWarning noise.
    prec = precision_score(y_val[mask], y_pred[mask], zero_division=0)
    rec = recall_score(y_val[mask], y_pred[mask], zero_division=0)
    f1 = f1_score(y_val[mask], y_pred[mask], zero_division=0)
    hold_rate = 1 - np.mean(mask)

    results.append({
        "보류_하한": low,
        "보류_상한": high,
        "precision": round(prec, 4),
        "recall": round(rec, 4),
        "f1_score": round(f1, 4),
        "보류비율(%)": round(hold_rate * 100, 1)
    })

# Best F1 first
results_df = pd.DataFrame(results).sort_values(by="f1_score", ascending=False)
results_df.to_csv("phishing_keyword_pipeline/보류구간_성능비교.csv", index=False, encoding="utf-8-sig")
print("결과 저장 완료: 보류구간_성능비교.csv")


✅ 결과 저장 완료: 보류구간_성능비교.csv


In [14]:

# classification_report and joblib are used below but are not imported by any
# earlier cell visible in this notebook; without these imports the cell raises
# NameError on a fresh kernel (Restart & Run All).
import joblib
from sklearn.metrics import classification_report

# 2. Train/validation split (X and y are the ones loaded by the previous cell)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=77
)

# 3. Train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# 4. Probability-based prediction
y_prob = model.predict_proba(X_val)[:, 1]  # keep only the class-1 (phishing) probability

# 5. Apply the hold band
def classify(prob):
    """Map a phishing probability to 0 (normal), 1 (phishing) or 2 (held).

    Probabilities in the closed interval [0.1, 0.75] are held.
    """
    if prob < 0.1:
        return 0  # normal
    elif prob > 0.75:
        return 1  # phishing
    else:
        return 2  # held

y_pred_custom = [classify(p) for p in y_prob]

# 6. Collect the results
results_df = pd.DataFrame({
    "true": y_val,
    "prob": y_prob,
    "pred": y_pred_custom
})

# 7. Count of predictions per class
print(" 예측 결과 분포 (0: 일반, 1: 피싱, 2: 보류):")
print(results_df["pred"].value_counts())

# 8. Classification report on the decided samples only (held rows excluded)
print("\n 보류 제외하고 정확도 평가 (0과 1만):")
mask = results_df["pred"] != 2
print(classification_report(results_df["true"][mask], results_df["pred"][mask], digits=4))

# 9. Number of held samples
n_total = len(results_df)
n_hold = (results_df["pred"] == 2).sum()
print(f"\n 전체 {n_total}개 중 보류된 데이터: {n_hold}개 ({n_hold / n_total:.1%})")

# Save the model
joblib.dump(model, "phishing_keyword_pipeline/cy_logistic_보류구간v3.pkl")
print(" 모델 저장 완료: cy_logistic_보류구간v3.pkl")

 예측 결과 분포 (0: 일반, 1: 피싱, 2: 보류):
pred
0    942
2    402
1    266
Name: count, dtype: int64

 보류 제외하고 정확도 평가 (0과 1만):
              precision    recall  f1-score   support

           0     0.9894    0.9904    0.9899       941
           1     0.9662    0.9625    0.9644       267

    accuracy                         0.9843      1208
   macro avg     0.9778    0.9765    0.9771      1208
weighted avg     0.9843    0.9843    0.9843      1208


 전체 1610개 중 보류된 데이터: 402개 (25.0%)
 모델 저장 완료: cy_logistic_보류구간v3.pkl


결론: 택배사칭형을 놓쳤다

In [None]:
import pandas as pd
import scipy
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import joblib  # moved up from the end of the cell so all imports sit together

# 1. Load the TF-IDF features and the labels
X = scipy.sparse.load_npz("phishing_keyword_pipeline/X_train_tfidf.npz")
y = pd.read_csv("phishing_keyword_pipeline/y_train.csv").values.ravel()

# 2. Stratified 80/20 train/validation split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 3. Train the XGBoost model
# use_label_encoder was dropped: the installed XGBoost ignores it and only
# prints 'Parameters: { "use_label_encoder" } are not used.'
xgb_model = XGBClassifier(eval_metric="logloss", n_jobs=-1)
xgb_model.fit(X_train, y_train)

# 4. Class-1 (phishing) probability
y_prob = xgb_model.predict_proba(X_val)[:, 1]

# 5. Apply the hold band (default 0.10 ~ 0.75)
def classify(prob, low=0.10, high=0.75):
    """Map a probability to 0 (below ``low``), 1 (above ``high``) or 2 (held).

    Probabilities in the closed interval [low, high] are held.
    """
    if prob < low:
        return 0
    elif prob > high:
        return 1
    else:
        return 2

y_pred_custom = [classify(p) for p in y_prob]

# 6. Evaluate on the decided samples only (held rows excluded)
results_df = pd.DataFrame({
    "true": y_val,
    "prob": y_prob,
    "pred": y_pred_custom
})

mask = results_df["pred"] != 2
print("XGBoost 결과 (보류 제외)")
print(classification_report(results_df["true"][mask], results_df["pred"][mask], digits=4))

print(f" 보류 비율: {(results_df['pred'] == 2).mean() * 100:.1f}%")

# 7. Save the model (optional)
joblib.dump(xgb_model, "phishing_keyword_pipeline/model_xgboost_v1.pkl")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ XGBoost 결과 (보류 제외)
              precision    recall  f1-score   support

           0     0.9797    0.9825    0.9811      1031
           1     0.9489    0.9408    0.9448       355

    accuracy                         0.9719      1386
   macro avg     0.9643    0.9617    0.9630      1386
weighted avg     0.9718    0.9719    0.9718      1386


🔸 보류 비율: 13.9%


['phishing_keyword_pipeline/model_xgboost_v1.pkl']

In [18]:
import pandas as pd
import scipy
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

# 1. Load the TF-IDF features and the labels
X = scipy.sparse.load_npz("phishing_keyword_pipeline/X_train_tfidf.npz")
y = pd.read_csv("phishing_keyword_pipeline/y_train.csv").values.ravel()

# 2. Stratified 80/20 train/validation split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 3. Train the XGBoost model
# use_label_encoder was dropped: the installed XGBoost ignores it and only
# prints 'Parameters: { "use_label_encoder" } are not used.'
xgb_model = XGBClassifier(eval_metric="logloss", n_jobs=-1)
xgb_model.fit(X_train, y_train)

# 4. Class-1 (phishing) probability
y_prob = xgb_model.predict_proba(X_val)[:, 1]

# 5. Sweep candidate hold bands (low, high)
thresholds = [(round(i, 2), round(j, 2)) for i in np.arange(0.1, 0.5, 0.05)
                                            for j in np.arange(0.6, 0.91, 0.05) if i < j]

results = []

for low, high in thresholds:
    # p < low -> 0, p > high -> 1, anything else (including the bounds) -> 2 (held).
    # Vectorized instead of redefining a per-sample classify() on every iteration.
    y_pred = np.select([y_prob < low, y_prob > high], [0, 1], default=2)
    mask = y_pred != 2

    # Nothing was decided for this band, so there is nothing to score.
    if not mask.any():
        continue

    # Metrics are computed on the decided (non-held) samples only.
    # zero_division=0 is passed to all three metrics so an undefined score is
    # reported as 0 consistently, without UndefinedMetricWarning noise.
    prec = precision_score(y_val[mask], y_pred[mask], zero_division=0)
    rec = recall_score(y_val[mask], y_pred[mask], zero_division=0)
    f1 = f1_score(y_val[mask], y_pred[mask], zero_division=0)
    hold_rate = 1 - np.mean(mask)

    results.append({
        "보류_하한": low,
        "보류_상한": high,
        "precision": round(prec, 4),
        "recall": round(rec, 4),
        "f1_score": round(f1, 4),
        "보류비율(%)": round(hold_rate * 100, 1)
    })

# Best F1 first
results_df = pd.DataFrame(results).sort_values(by="f1_score", ascending=False)
results_df.to_csv("phishing_keyword_pipeline/XGB_보류구간_성능비교.csv", index=False, encoding="utf-8-sig")

print("XGBoost 보류구간 실험 완료!")
print(" 결과 파일: XGB_보류구간_성능비교.csv 저장됨")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost 보류구간 실험 완료!
 결과 파일: XGB_보류구간_성능비교.csv 저장됨
