<a href="https://colab.research.google.com/github/jiwonojo/Data-Capstone-Design/blob/main/%EB%8D%B0%EC%BA%A1%EB%94%94_1_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install textblob lightgbm nltk -q

import nltk
nltk.download('punkt')   # TextBlob에서 문장 단위 나눌 때 필요

import pandas as pd
import numpy as np
import re

from textblob import TextBlob
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score
from lightgbm import LGBMClassifier


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Adjust the file path to match your environment!
# (the default below is an absolute path under /content)
df = pd.read_csv("/content/fake_reviews_dataset.csv")

# Preview the first rows.
df.head()


Unnamed: 0,category,rating,text,label
0,Home_and_Kitchen,5.0,"Love this! Well made, sturdy, and very comfor...",1
1,Home_and_Kitchen,5.0,"love it, a great upgrade from the original. I...",1
2,Home_and_Kitchen,5.0,This pillow saved my back. I love the look and...,1
3,Home_and_Kitchen,1.0,"Missing information on how to use it, but it i...",1
4,Home_and_Kitchen,5.0,Very nice set. Good quality. We have had the s...,1


In [None]:
# 1) Sentiment score (polarity)
def get_polarity(text):
    """Return the TextBlob sentiment polarity of ``text`` as a float.

    Missing values (``None`` or a float NaN, which is how pandas represents an
    empty CSV cell) are scored as neutral (0.0) without calling TextBlob.

    If TextBlob itself fails, the function still falls back to 0.0 so one bad
    row cannot abort a whole ``Series.apply``. The failure is reported through
    a ``RuntimeWarning`` instead of being swallowed silently, and
    ``KeyboardInterrupt`` / ``SystemExit`` are no longer caught.
    """
    if text is None or (isinstance(text, float) and np.isnan(text)):
        return 0.0
    try:
        return TextBlob(str(text)).sentiment.polarity
    except Exception as exc:
        import warnings
        warnings.warn(f"get_polarity: TextBlob failed, using 0.0 ({exc!r})", RuntimeWarning)
        return 0.0

# Score every review text and store the result as the 'polarity' feature column.
df['polarity'] = df['text'].apply(get_polarity)

# 2) Number of first-person pronouns (personal_count)
pronoun_pattern = re.compile(r"\b(i|me|my|mine|we|us|our|ours)\b", re.I)

def count_pronouns(text):
    """Count whole-word, case-insensitive first-person pronouns in ``text``.

    ``text`` is converted with ``str()`` first, so non-string values are
    counted on their string representation.
    """
    hits = pronoun_pattern.finditer(str(text))
    return sum(1 for _ in hits)

# Per-review first-person pronoun count, stored as the 'personal_count' feature column.
df['personal_count'] = df['text'].apply(count_pronouns)

# 3) polarity_diff: mean absolute gap between each sentence's polarity and the
#    polarity of the whole document
def calc_polarity_diff(text):
    """Return the mean |sentence polarity - document polarity| for ``text``.

    Returns 0.0 for missing values (``None`` / float NaN), for blank text, and
    when TextBlob finds no sentences.

    Sentence splitting needs NLTK tokenizer data. When that data is missing,
    ``blob.sentences`` raises for every row. A blanket ``except`` would turn the
    whole feature into a constant 0.0 with no hint of the cause. This version
    keeps the 0.0 fallback so a long ``apply`` does not abort, but emits a
    ``RuntimeWarning`` carrying the underlying error. It also no longer catches
    ``KeyboardInterrupt`` / ``SystemExit``.
    """
    if text is None or (isinstance(text, float) and np.isnan(text)):
        return 0.0
    text = str(text)
    if not text.strip():
        return 0.0

    try:
        blob = TextBlob(text)
        doc_p = blob.sentiment.polarity
        sentence_polarities = [s.sentiment.polarity for s in blob.sentences]
    except Exception as exc:
        import warnings
        warnings.warn(
            f"calc_polarity_diff: TextBlob failed, using 0.0 ({exc!r})",
            RuntimeWarning,
        )
        return 0.0

    if not sentence_polarities:
        return 0.0
    return float(np.mean([abs(p - doc_p) for p in sentence_polarities]))

# Per-review sentence-vs-document polarity gap, stored as the 'polarity_diff' feature column.
# NOTE(review): the saved preview of this column shows 0.0 on every displayed row;
# verify the column is not constant before relying on it as a feature.
df['polarity_diff'] = df['text'].apply(calc_polarity_diff)

# 4) Exclamation rate (exclamation_rate) = number of '!' / number of characters
def exclamation_rate(text):
    """Return the share of characters in ``str(text)`` that are '!'.

    The denominator is clamped to at least 1, so empty text yields 0.0.
    """
    as_str = str(text)
    n_chars = len(as_str) or 1
    n_bangs = as_str.count('!')
    return n_bangs / n_chars

# Per-review '!' density, stored as the 'exclamation_rate' feature column.
df['exclamation_rate'] = df['text'].apply(exclamation_rate)

# Preview the four engineered feature columns.
df[['polarity', 'personal_count', 'polarity_diff', 'exclamation_rate']].head()


Unnamed: 0,polarity,personal_count,polarity_diff,exclamation_rate
0,0.47375,1,0.0,0.026667
1,0.558333,2,0.0,0.0
2,0.25,2,0.0,0.0
3,0.4,1,0.0,0.012346
4,0.74,1,0.0,0.0


In [None]:
# Engineered feature columns, the label column and the category column used below.
features = ['polarity', 'personal_count', 'polarity_diff', 'exclamation_rate']
target_col = 'label'
category_col = 'category'   # already present in the original data

# Keep the raw category column in X; it is one-hot encoded later and also used
# for the per-category evaluation.
X = df[features + [category_col]]
y = df[target_col]

# 70/30 split; stratify on the label to preserve the class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
    stratify=y
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)


Train shape: (28368, 5)
Test shape: (12158, 5)


In [None]:
# One-hot encode the category column (train and test are encoded separately)
X_train_enc = pd.get_dummies(X_train, columns=[category_col])
X_test_enc  = pd.get_dummies(X_test,  columns=[category_col])

# Reconcile category columns that differ between train and test:
# join='left' keeps exactly the train columns; columns missing from test become
# NaN and are then filled with 0.
X_train_enc, X_test_enc = X_train_enc.align(X_test_enc, join='left', axis=1)
X_test_enc = X_test_enc.fillna(0)

# Define the LightGBM model
lgbm = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=-1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

# Train
lgbm.fit(X_train_enc, y_train)


[LightGBM] [Info] Number of positive: 14206, number of negative: 14162
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001986 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 568
[LightGBM] [Info] Number of data points in the train set: 28368, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500776 -> initscore=0.003102
[LightGBM] [Info] Start training from score 0.003102


In [None]:
# Predicted probability of class 1 (second column of predict_proba)
test_proba = lgbm.predict_proba(X_test_enc)[:, 1]

# Binary prediction at a fixed threshold of 0.5
test_pred = (test_proba >= 0.5).astype(int)

# Compute F1 (on the thresholded predictions) and ROC-AUC (on the probabilities)
f1 = f1_score(y_test, test_pred)
auc = roc_auc_score(y_test, test_proba)

print("F1-score (threshold=0.5):", f1)
print("ROC-AUC:", auc)


F1-score (threshold=0.5): 0.6177036561898653
ROC-AUC: 0.664196520770598


In [None]:
def eval_by_category(df_split, y_true, proba, category_col='category', thresholds=None):
    """
    Score predictions separately for every category present in ``df_split``.

    Parameters
    ----------
    df_split : DataFrame that contains ``category_col`` (e.g. X_test), one row
        per sample.
    y_true : array-like of true labels, in the same row order as ``df_split``.
    proba : array-like of predicted class-1 probabilities, same row order.
    category_col : name of the category column in ``df_split``.
    thresholds : optional dict {category: threshold}. Categories missing from
        the dict (or ``thresholds=None``) use 0.5.

    Returns
    -------
    DataFrame with one row per category and the columns
    ``category``, ``threshold``, ``F1``, ``AUC``, ``n_samples``.
    ``AUC`` is NaN when ``roc_auc_score`` raises ``ValueError`` (e.g. only one
    class is present in that category).

    Raises
    ------
    ValueError
        If ``df_split``, ``y_true`` and ``proba`` do not have the same length.
    """
    # Work on plain numpy arrays so masking is always positional. This makes
    # lists and index-bearing Series behave the same as ndarrays.
    categories = df_split[category_col].to_numpy()
    y_true = np.asarray(y_true)
    proba = np.asarray(proba)
    if not (len(categories) == len(y_true) == len(proba)):
        raise ValueError(
            "df_split, y_true and proba must have the same length "
            f"(got {len(categories)}, {len(y_true)}, {len(proba)})"
        )

    thresholds = thresholds or {}
    result = []

    for c in df_split[category_col].unique():
        idx = (categories == c)
        y_c = y_true[idx]
        p_c = proba[idx]
        if len(y_c) == 0:
            continue

        th = thresholds.get(c, 0.5)
        y_pred_c = (p_c >= th).astype(int)

        f1_c = f1_score(y_c, y_pred_c)
        try:
            auc_c = roc_auc_score(y_c, p_c)
        except ValueError:
            auc_c = np.nan  # AUC cannot be computed when only one class is present

        result.append({
            'category': c,
            'threshold': th,
            'F1': f1_c,
            'AUC': auc_c,
            'n_samples': len(y_c)
        })

    # Explicit columns keep the schema stable even when no category was scored.
    return pd.DataFrame(result, columns=['category', 'threshold', 'F1', 'AUC', 'n_samples'])


In [None]:
# X_test still has the raw category column, so use it as-is
df_test = X_test.copy()
# Attach the true labels positionally (.values drops the index).
df_test[target_col] = y_test.values

# Per-category F1 / AUC with the default 0.5 threshold for every category.
cat_result_default = eval_by_category(
    df_split=df_test,
    y_true=y_test.values,
    proba=test_proba,
    category_col=category_col,
    thresholds=None  # use 0.5 everywhere
)

cat_result_default




Unnamed: 0,category,threshold,F1,AUC,n_samples
0,Toys_and_Games,0.5,0.588946,0.625337,1158
1,Pet_Supplies,0.5,0.610823,0.642315,1297
2,Movies_and_TV,0.5,0.671875,0.715144,1071
3,Books,0.5,0.649847,0.717096,1299
4,Electronics,0.5,0.571429,0.607387,1170
5,Sports_and_Outdoors,0.5,0.594915,0.626313,1166
6,Home_and_Kitchen,0.5,0.596549,0.650778,1214
7,Tools_and_Home_Improvement,0.5,0.597015,0.631106,1102
8,Clothing_Shoes_and_Jewelry,0.5,0.617363,0.652264,1195
9,Kindle_Store,0.5,0.661213,0.742365,1460


In [None]:
def find_best_thresholds_by_category(df_split, y_true, proba, category_col='category',
                                     candidate_thresholds=None):
    """
    Pick, for every category, the candidate threshold with the highest F1.

    Parameters
    ----------
    df_split : DataFrame that contains ``category_col``, one row per sample.
    y_true : array-like of true labels, in the same row order as ``df_split``.
    proba : array-like of predicted class-1 probabilities, same row order.
    category_col : name of the category column in ``df_split``.
    candidate_thresholds : optional iterable of thresholds to try. Defaults to
        0.10, 0.15, ..., 0.90 (17 values).

    Returns
    -------
    dict {category: threshold}, with thresholds as plain Python floats.
    A category keeps the fallback 0.5 when every candidate produces
    single-class predictions. On ties the first (lowest) candidate wins.

    Notes
    -----
    Thresholds chosen on a split are fitted to that split's labels. Scoring
    them on the same split gives an optimistic estimate, so tune on validation
    or out-of-fold predictions when the result is later evaluated.
    """
    if candidate_thresholds is None:
        # Round the grid so float noise such as 0.30000000000000004 does not
        # end up in the returned thresholds.
        candidate_thresholds = np.round(np.linspace(0.1, 0.9, 17), 2)
    candidate_thresholds = [float(th) for th in candidate_thresholds]

    # Plain numpy arrays make the masks below positional for any array-like input.
    categories = df_split[category_col].to_numpy()
    y_true = np.asarray(y_true)
    proba = np.asarray(proba)

    thresholds_dict = {}

    for c in df_split[category_col].unique():
        idx = (categories == c)
        y_c = y_true[idx]
        p_c = proba[idx]

        if len(y_c) == 0:
            continue

        best_f1 = -1
        best_th = 0.5

        for th in candidate_thresholds:
            y_pred_c = (p_c >= th).astype(int)
            # Skip thresholds whose predictions are all 0 or all 1: they do not
            # discriminate within the category.
            if len(np.unique(y_pred_c)) == 1:
                continue

            f1_c = f1_score(y_c, y_pred_c)
            if f1_c > best_f1:
                best_f1 = f1_c
                best_th = th

        thresholds_dict[c] = best_th

    return thresholds_dict


In [None]:
# Search the per-category threshold that maximises F1.
# NOTE(review): the search uses the test split (df_test / y_test / test_proba),
# so these thresholds are fitted to the test labels. Any score computed with them
# on the same split is optimistic. Consider tuning on a validation split.
best_ths = find_best_thresholds_by_category(
    df_split=df_test,
    y_true=y_test.values,
    proba=test_proba,
    category_col=category_col
)

print("Best thresholds by category:")
best_ths


Best thresholds by category:


{'Toys_and_Games': np.float64(0.25),
 'Pet_Supplies': np.float64(0.35),
 'Movies_and_TV': np.float64(0.4),
 'Books': np.float64(0.4),
 'Electronics': np.float64(0.30000000000000004),
 'Sports_and_Outdoors': np.float64(0.2),
 'Home_and_Kitchen': np.float64(0.30000000000000004),
 'Tools_and_Home_Improvement': np.float64(0.4),
 'Clothing_Shoes_and_Jewelry': np.float64(0.35),
 'Kindle_Store': np.float64(0.4),
 'Automotive': np.float64(0.55),
 'Home Appliances': np.float64(0.15000000000000002),
 'Gardening': 0.5,
 'Food': np.float64(0.55),
 'Sporting Goods': np.float64(0.45000000000000007),
 'Clothing': np.float64(0.5),
 'Beauty': np.float64(0.45000000000000007),
 'Toys': 0.5}

In [None]:
# Per-category F1 / AUC using the per-category thresholds in best_ths.
# NOTE(review): best_ths was selected on this same test split, so the F1 values
# here are not an unbiased estimate. AUC does not depend on the threshold.
cat_result_adaptive = eval_by_category(
    df_split=df_test,
    y_true=y_test.values,
    proba=test_proba,
    category_col=category_col,
    thresholds=best_ths
)

cat_result_adaptive




Unnamed: 0,category,threshold,F1,AUC,n_samples
0,Toys_and_Games,0.25,0.691988,0.625337,1158
1,Pet_Supplies,0.35,0.685714,0.642315,1297
2,Movies_and_TV,0.4,0.69832,0.715144,1071
3,Books,0.4,0.713313,0.717096,1299
4,Electronics,0.3,0.66305,0.607387,1170
5,Sports_and_Outdoors,0.2,0.669027,0.626313,1166
6,Home_and_Kitchen,0.3,0.677966,0.650778,1214
7,Tools_and_Home_Improvement,0.4,0.686446,0.631106,1102
8,Clothing_Shoes_and_Jewelry,0.35,0.663745,0.652264,1195
9,Kindle_Store,0.4,0.712772,0.742365,1460


In [None]:
# Apply a different threshold per category (used to compute the overall F1)
def apply_category_thresholds(df_split, proba, thresholds, category_col='category'):
    """Binarise ``proba`` row by row using each row's category threshold.

    Each row of ``df_split`` looks up its ``category_col`` value in
    ``thresholds``; categories that are not in the dict use 0.5. Returns an
    integer array with 1 where ``proba >= threshold`` and 0 otherwise.
    """
    cutoffs = []
    for cat in df_split[category_col]:
        cutoffs.append(thresholds.get(cat, 0.5))
    is_positive = proba >= np.array(cutoffs)
    return is_positive.astype(int)

# Binarise the test probabilities with the per-category thresholds in best_ths.
test_pred_adaptive = apply_category_thresholds(df_test, test_proba, best_ths, category_col)

# Overall F1 across all categories with those adaptive thresholds.
# NOTE(review): if best_ths was tuned on this same test split, this figure is
# optimistic relative to the fixed-0.5 F1. Confirm before comparing the two.
f1_adaptive = f1_score(y_test, test_pred_adaptive)
print("Global F1-score (adaptive thresholds):", f1_adaptive)


Global F1-score (adaptive thresholds): 0.6864690486335139


In [None]:
def train_lgbm_and_eval(feature_list, model_name="Model"):
    """
    Train a LightGBM classifier on ``feature_list`` plus the one-hot encoded
    category column, then evaluate it globally and per category.

    Reads the notebook-level globals ``df``, ``category_col`` and ``target_col``.

    Parameters
    ----------
    feature_list : list of feature column names to use (the category column is
        added automatically).
    model_name : label printed in the result header.

    Returns
    -------
    dict with the keys
        ``model``        - the fitted LGBMClassifier (trained on the full train split),
        ``global_f1``    - test F1 at threshold 0.5,
        ``global_auc``   - test ROC-AUC,
        ``cat_default``  - per-category test metrics at threshold 0.5,
        ``cat_adaptive`` - per-category test metrics at the tuned thresholds,
        ``best_ths``     - dict {category: tuned threshold}.

    Notes
    -----
    The per-category thresholds are tuned on out-of-fold predictions of the
    training split, never on the test split. Tuning them on the test labels and
    then scoring on that same test split would leak label information and
    inflate ``cat_adaptive``. Categories that occur only in the test split fall
    back to 0.5 inside ``eval_by_category``.
    """
    # sklearn is already a dependency of this notebook; imported here so the
    # function stays self-contained.
    from sklearn.model_selection import cross_val_predict

    X = df[feature_list + [category_col]]
    y = df[target_col]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.3,
        random_state=42,
        stratify=y
    )

    # Keep the un-encoded X_test for the per-category analysis
    df_test_local = X_test.copy()

    # Encoding: one-hot the category, then force test onto the train columns
    X_train_enc = pd.get_dummies(X_train, columns=[category_col])
    X_test_enc  = pd.get_dummies(X_test,  columns=[category_col])
    X_train_enc, X_test_enc = X_train_enc.align(X_test_enc, join='left', axis=1)
    X_test_enc = X_test_enc.fillna(0)

    model = LGBMClassifier(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=-1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1
    )

    # Out-of-fold class-1 probabilities for the training rows: every row is
    # scored by a clone of ``model`` that did not see it during fitting.
    # cross_val_predict leaves ``model`` itself unfitted.
    oof_proba = cross_val_predict(
        model, X_train_enc, y_train, cv=5, method='predict_proba'
    )[:, 1]

    model.fit(X_train_enc, y_train)

    proba = model.predict_proba(X_test_enc)[:, 1]
    pred_05 = (proba >= 0.5).astype(int)

    global_f1 = f1_score(y_test, pred_05)
    global_auc = roc_auc_score(y_test, proba)

    # Per-category performance (th=0.5)
    cat_default = eval_by_category(
        df_split=df_test_local,
        y_true=y_test.values,
        proba=proba,
        category_col=category_col,
        thresholds=None
    )

    # Best threshold per category, tuned on the training split's out-of-fold
    # predictions so that the test labels stay untouched
    best_ths_local = find_best_thresholds_by_category(
        df_split=X_train,
        y_true=y_train.values,
        proba=oof_proba,
        category_col=category_col
    )

    # Held-out evaluation of the tuned thresholds
    cat_adaptive = eval_by_category(
        df_split=df_test_local,
        y_true=y_test.values,
        proba=proba,
        category_col=category_col,
        thresholds=best_ths_local
    )

    print(f"\n=== {model_name} ===")
    print("Global F1 (th=0.5):", global_f1)
    print("Global AUC:", global_auc)

    return {
        "model": model,
        "global_f1": global_f1,
        "global_auc": global_auc,
        "cat_default": cat_default,
        "cat_adaptive": cat_adaptive,
        "best_ths": best_ths_local
    }


In [None]:
# Model A: includes exclamation_rate
features_A = ['polarity', 'personal_count', 'polarity_diff', 'exclamation_rate']
result_A = train_lgbm_and_eval(features_A, model_name="Model A (with exclamation_rate)")

# Model B: exclamation_rate removed (ablation of that single feature)
features_B = ['polarity', 'personal_count', 'polarity_diff']
result_B = train_lgbm_and_eval(features_B, model_name="Model B (without exclamation_rate)")


[LightGBM] [Info] Number of positive: 14206, number of negative: 14162
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001915 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 568
[LightGBM] [Info] Number of data points in the train set: 28368, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500776 -> initscore=0.003102
[LightGBM] [Info] Start training from score 0.003102





=== Model A (with exclamation_rate) ===
Global F1 (th=0.5): 0.6177036561898653
Global AUC: 0.664196520770598
[LightGBM] [Info] Number of positive: 14206, number of negative: 14162
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001017 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 313
[LightGBM] [Info] Number of data points in the train set: 28368, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500776 -> initscore=0.003102
[LightGBM] [Info] Start training from score 0.003102





=== Model B (without exclamation_rate) ===
Global F1 (th=0.5): 0.570276991681674
Global AUC: 0.6285452030299159


