In [None]:
# 필수 라이브러리 및 데이터 로딩
import pandas as pd
import numpy as np
import re
from collections import Counter
import pickle
import json
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings('ignore')

# Load the artifacts written by the previous pipeline step.
# Both paragraph-level frames are read in the same order as before
# (training frame first, then the test frame).
train_para_df, test_df = (
    pd.read_pickle(pkl_path)
    for pkl_path in ('train_para_df.pkl', 'test_df.pkl')
)

# data_info.pkl holds the title lists that define the train/validation split.
with open('data_info.pkl', 'rb') as f:
    data_info = pickle.load(f)

train_titles, val_titles = data_info['train_titles'], data_info['val_titles']

print("데이터 로딩 완료")
print(f"훈련 데이터: {len(train_para_df)}개 문단")
print(f"테스트 데이터: {len(test_df)}개 문단")


In [None]:
# 한국어 AI 특화 특성 추출 함수들
def extract_korean_ai_features(text):
    """Extract 40 surface-level features from a Korean text.

    The vector is laid out in six consecutive groups:

    1. basic statistics (5): word/sentence counts, words per sentence,
       vocabulary diversity, word-length standard deviation
    2. verb-ending patterns (10)
    3. conjunctions and transition expressions (8)
    4. particle patterns (7)
    5. sentence structure and length (5)
    6. repetition and consistency (5)

    Words are whitespace-separated tokens and sentences are the non-empty
    pieces obtained by splitting on '.'.  Apart from the conjunction
    diversity feature (which compares whole tokens), every pattern is
    counted with ``str.count``, i.e. as a substring anywhere in the text,
    not only at word or sentence boundaries.

    Args:
        text: The paragraph to analyse.

    Returns:
        A ``numpy.ndarray`` of shape ``(40,)`` and dtype ``float32``.  An
        all-zero vector is returned when ``text`` is not a string or
        contains only whitespace.
    """
    if not isinstance(text, str) or len(text.strip()) == 0:
        # Use the same dtype as the regular return path; a float64 fallback
        # would upcast the whole matrix when the vectors are stacked.
        return np.zeros(40, dtype=np.float32)

    features = []
    words = text.split()
    sentences = [s.strip() for s in text.split('.') if s.strip()]
    n_words = len(words)
    n_sentences = len(sentences)

    def per_word(value):
        # Normalise by the word count, falling back to 0 when there are none.
        return value / n_words if n_words else 0

    def per_sentence(value):
        # Normalise by the sentence count; text without any '.'-delimited
        # content (e.g. "...") has zero sentences.
        return value / n_sentences if n_sentences else 0

    # 1. Basic statistics (5)
    features.extend([
        n_words,
        n_sentences,
        per_sentence(n_words),
        per_word(len(set(words))),  # vocabulary diversity
        np.std([len(w) for w in words]) if words else 0,  # word-length variability
    ])

    # 2. Korean verb-ending patterns (10)
    formal_endings = ['습니다', '입니다', '했습니다', '있습니다', '됩니다']
    informal_endings = ['다', '야', '지', '네', '요']
    question_endings = ['까', '나', '니']

    formal_count = sum(text.count(ending) for ending in formal_endings)
    informal_count = sum(text.count(ending) for ending in informal_endings)
    question_count = sum(text.count(ending) for ending in question_endings)

    features.extend([
        per_sentence(formal_count),
        per_sentence(informal_count),
        per_sentence(question_count),
        formal_count / (formal_count + informal_count + 1),  # formal-style ratio (+1 avoids 0/0)
        per_word(len({w[-2:] for w in words if len(w) >= 2})),  # diversity of 2-char word endings
        per_word(text.count('다')),
        per_sentence(text.count('습니다')),
        per_sentence(text.count('입니다')),
        per_sentence(text.count('했습니다')),
        per_sentence(text.count('있습니다')),
    ])

    # 3. Conjunctions and transition expressions (8)
    conjunctions = ['그리고', '또한', '따라서', '그러나', '하지만', '그러므로', '그런데', '즉']
    transition_words = ['첫째', '둘째', '셋째', '마지막으로', '결국', '결론적으로']

    conjunction_count = sum(text.count(conj) for conj in conjunctions)
    transition_count = sum(text.count(trans) for trans in transition_words)

    features.extend([
        per_sentence(conjunction_count),
        per_sentence(transition_count),
        per_sentence(text.count('그리고')),
        per_sentence(text.count('또한')),
        per_sentence(text.count('따라서')),
        per_sentence(text.count('하지만')),
        # Conjunction diversity: only tokens that equal a conjunction exactly
        # are counted here (a token with attached punctuation does not match).
        len(set(conjunctions) & set(words)) / len(conjunctions),
        per_word(conjunction_count + transition_count),
    ])

    # 4. Particle patterns (7)
    particles = ['은', '는', '이', '가', '을', '를', '에', '의', '로', '와', '과']
    particle_counts = [text.count(p) for p in particles]

    features.extend([
        per_word(sum(particle_counts)),
        np.std(particle_counts) if particle_counts else 0,
        per_word(text.count('의')),
        per_word(text.count('에')),
        per_word(text.count('은')),
        per_word(text.count('는')),
        # Topic-particle vs. subject-particle ratio (+1 avoids division by zero).
        (text.count('은') + text.count('는')) / (text.count('이') + text.count('가') + 1),
    ])

    # 5. Sentence structure and length patterns (5)
    # A single 0 stands in when there are no sentences so mean/std are defined.
    sentence_lengths = [len(s.split()) for s in sentences] if sentences else [0]
    mean_sentence_length = np.mean(sentence_lengths)

    features.extend([
        mean_sentence_length,
        np.std(sentence_lengths),
        per_sentence(len([s for s in sentences if len(s.split()) > 20])),  # long-sentence ratio
        per_sentence(len([s for s in sentences if len(s.split()) < 5])),   # short-sentence ratio
        max(sentence_lengths) / mean_sentence_length if mean_sentence_length > 0 else 0,
    ])

    # 6. Repetition and consistency patterns (5)
    word_freq = Counter(words)
    most_common = word_freq.most_common(5)

    features.extend([
        per_word(len([w for w, c in word_freq.items() if c > 1])),  # share of repeated words
        most_common[0][1] / n_words if most_common else 0,  # share of the most frequent word
        per_word(len([w for w in words if len(w) > 5])),  # long-word ratio
        per_word(text.count('것')),  # reliance on '것'
        per_word(len(re.findall(r'[0-9]+', text))),  # digit-run usage
    ])

    return np.array(features, dtype=np.float32)

# Confirmation message: the Korean feature-extraction function is now defined.
print("한국어 특화 특성 추출 함수 정의 완료")


In [None]:
# Extract Korean features for the training and validation splits.
print("훈련 데이터 한국어 특성 추출 시작...")

# The split is decided by title: a paragraph belongs to the split whose
# title list contains its 'title' value.
train_mask = train_para_df['title'].isin(train_titles)
val_mask = train_para_df['title'].isin(val_titles)

train_data = train_para_df.loc[train_mask]
val_data = train_para_df.loc[val_mask]

# One feature vector per paragraph, in row order.
train_korean_features = [
    extract_korean_ai_features(paragraph)
    for paragraph in train_data['paragraph_text']
]
val_korean_features = [
    extract_korean_ai_features(paragraph)
    for paragraph in val_data['paragraph_text']
]

# Stack the per-paragraph vectors into matrices and pull out the labels.
X_train_korean = np.array(train_korean_features)
X_val_korean = np.array(val_korean_features)
y_train = train_data['generated'].values
y_val = val_data['generated'].values

print(f"훈련 특성 shape: {X_train_korean.shape}")
print(f"검증 특성 shape: {X_val_korean.shape}")
print(f"특성 개수: {X_train_korean.shape[1]}개")


In [None]:
# Standardise the features and train the model.
print("한국어 특성 기반 모델 훈련...")

# Fit the scaler on the training matrix only, then apply it to both splits.
scaler = StandardScaler()
scaler.fit(X_train_korean)
X_train_scaled = scaler.transform(X_train_korean)
X_val_scaled = scaler.transform(X_val_korean)

# Random Forest hyper-parameters.
rf_params = {
    'n_estimators': 200,
    'max_depth': 10,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'random_state': 42,
    'n_jobs': -1,
    'class_weight': 'balanced',
}
korean_model = RandomForestClassifier(**rf_params)
korean_model.fit(X_train_scaled, y_train)

# Validation performance: ROC AUC on the probability of column 1.
val_predictions = korean_model.predict_proba(X_val_scaled)[:, 1]
korean_auc = roc_auc_score(y_val, val_predictions)

print(f"한국어 특성 모델 검증 AUC: {korean_auc:.4f}")

# Feature names, listed in the order the extractor emits its values.
feature_names = [
    # basic statistics
    'word_count', 'sentence_count', 'words_per_sentence',
    'vocabulary_diversity', 'word_length_std',
    # verb-ending patterns
    'formal_endings_rate', 'informal_endings_rate', 'question_endings_rate',
    'formal_ratio', 'ending_diversity',
    'da_frequency', 'seumnida_rate', 'imnida_rate',
    'haetseumnida_rate', 'itseumnida_rate',
    # conjunctions and transitions
    'conjunction_rate', 'transition_rate', 'geurigo_rate', 'tohan_rate',
    'ttaraseo_rate', 'hajiman_rate', 'conjunction_diversity', 'logical_words_rate',
    # particles
    'particle_rate', 'particle_std', 'ui_frequency', 'e_frequency',
    'eun_frequency', 'neun_frequency', 'topic_subject_ratio',
    # sentence structure
    'avg_sentence_length', 'sentence_length_std', 'long_sentence_ratio',
    'short_sentence_ratio', 'length_variation',
    # repetition and consistency
    'repeated_word_ratio', 'most_common_ratio', 'long_word_ratio',
    'geot_dependency', 'number_ratio',
]

# Rank features by importance (descending) and keep the ten strongest.
importances = korean_model.feature_importances_
ranked_features = sorted(
    zip(feature_names, importances),
    key=lambda pair: pair[1],
    reverse=True,
)
top_features = ranked_features[:10]

print("상위 10개 중요 특성:")
for name, importance in top_features:
    print(f"  {name}: {importance:.4f}")


In [None]:
# Extract features for the test data and predict.
print("테스트 데이터 한국어 특성 추출...")

test_korean_features = [
    extract_korean_ai_features(paragraph)
    for paragraph in test_df['paragraph_text']
]

# Reuse the scaler fitted on the training features.
X_test_korean = np.array(test_korean_features)
X_test_scaled = scaler.transform(X_test_korean)

# Probability of column 1 for every test paragraph.
test_predictions = korean_model.predict_proba(X_test_scaled)[:, 1]

print(f"테스트 특성 shape: {X_test_korean.shape}")
print("예측 통계:")
print(f"  평균: {test_predictions.mean():.4f}")
print(f"  표준편차: {test_predictions.std():.4f}")
print(f"  최소값: {test_predictions.min():.4f}")
print(f"  최대값: {test_predictions.max():.4f}")

# Persist the raw (unscaled) feature matrices, fitted objects and predictions.
korean_results = dict(
    train_features=X_train_korean,
    val_features=X_val_korean,
    test_features=X_test_korean,
    scaler=scaler,
    model=korean_model,
    val_auc=korean_auc,
    test_predictions=test_predictions,
    feature_names=feature_names,
    feature_importances=importances,
)

with open('korean_features_results.pkl', 'wb') as f:
    pickle.dump(korean_results, f)

# Lightweight JSON summary of this step.
improvement_tags = [
    'korean_ending_patterns',
    'conjunction_analysis',
    'particle_patterns',
    'vocabulary_diversity',
    'sentence_structure',
]
korean_metadata = dict(
    model_type='korean_features',
    val_auc=korean_auc,
    feature_count=X_train_korean.shape[1],
    top_features=top_features[:5],
    improvements=improvement_tags,
)

with open('step3_metadata.json', 'w') as f:
    json.dump(korean_metadata, f, indent=2)

print("3단계 완료 - 한국어 특화 특성 공학")
print("다음 단계: 04_advanced_ensemble.ipynb 실행")
