<a href="https://colab.research.google.com/github/hyunbini/Project/blob/main/SW_Ensemble_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install scikit-learn xgboost catboost pandas

In [None]:
pip install konlpy

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from konlpy.tag import Mecab
from preprocess import preprocess_dataframe
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV files read and written below
# live under /content/drive/MyDrive.
drive.mount('/content/drive')

In [None]:
# 🔹 Morphological-analyzer-based tokenizer
import os

# The Homebrew dictionary location only exists on a macOS machine. This notebook
# also runs on Colab (Linux), so fall back to KoNLPy's default dictionary path
# when the Homebrew directory is missing instead of failing at construction.
_HOMEBREW_MECAB_DIC = '/opt/homebrew/Cellar/mecab-ko-dic/2.1.1-20180720/lib/mecab/dic/mecab-ko-dic'
if os.path.isdir(_HOMEBREW_MECAB_DIC):
    mecab = Mecab(dicpath=_HOMEBREW_MECAB_DIC)
else:
    mecab = Mecab()


def tokenize(text):
    """Return the list of morphemes MeCab produces for ``text``.

    Used as the ``tokenizer`` callable of the TF-IDF vectorizer, so it is
    called once per document with that document's text.
    """
    return mecab.morphs(text)

# 🔹 1. Load the training CSV and pass it through preprocess_dataframe
train_csv_path = '/content/drive/MyDrive/train.csv'
train_df = preprocess_dataframe(
    pd.read_csv(train_csv_path, encoding='utf-8-sig'),
    text_col='full_text',
)

# 🔹 2. Label encoding: map the 'generated' column to integer class ids
target_column = train_df['generated']
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(target_column)

# 🔹 3. Train/validation split, done on the raw text BEFORE vectorizing
# Fitting TF-IDF on every row and splitting afterwards lets validation documents
# shape the vocabulary and IDF weights, which inflates the validation scores.
# test_size and random_state are unchanged, so the same rows land in each part.
text_train, text_val, y_train, y_val = train_test_split(
    train_df['full_text'], y, test_size=0.2, random_state=42
)

# 🔹 4. TF-IDF vectorization (vocabulary/IDF learned from the training part only)
vectorizer = TfidfVectorizer(
    tokenizer=tokenize,
    # The default token_pattern is ignored when a tokenizer is supplied;
    # passing None avoids scikit-learn's "token_pattern will not be used" warning.
    token_pattern=None,
    ngram_range=(1, 2),
    max_features=10000
)
X_train = vectorizer.fit_transform(text_train)
X_val = vectorizer.transform(text_val)

# 🔹 5. Fit the three base classifiers on the same training matrix
# `use_label_encoder` is deprecated and no longer used by current XGBoost
# releases (passing it only triggers a warning), so it is dropped here. The
# targets are already integer-encoded by LabelEncoder, which is what
# XGBClassifier expects.
xgb_model = XGBClassifier(eval_metric='logloss')
xgb_model.fit(X_train, y_train)

# verbose=0 silences CatBoost's per-iteration training log.
cat_model = CatBoostClassifier(verbose=0)
cat_model.fit(X_train, y_train)

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# 🔹 6. Soft-voting prediction on the validation set
# Collect each model's class-probability matrix, then average them with equal
# weights; the predicted class is the column with the highest mean probability.
xgb_probs, cat_probs, rf_probs = (
    model.predict_proba(X_val) for model in (xgb_model, cat_model, rf_model)
)

avg_probs = (xgb_probs + cat_probs + rf_probs) / 3
ensemble_preds = avg_probs.argmax(axis=1)

# 🔹 7. Print evaluation results for the ensemble on the validation set
report_text = classification_report(y_val, ensemble_preds)
print(report_text)

val_accuracy = accuracy_score(y_val, ensemble_preds)
print("Accuracy:", val_accuracy)

# ROC-AUC is scored on the averaged probability of encoded class 1 (column 1).
class1_scores = avg_probs[:, 1]
val_auc = roc_auc_score(y_val, class1_scores)
print("ROC-AUC:", val_auc)

# 🔹 8. Predict on the test set
test_df = pd.read_csv('/content/drive/MyDrive/test.csv', encoding='utf-8-sig')
n_test_rows = len(test_df)

# The training text was passed through preprocess_dataframe before it was
# vectorized; run the test text through the same step so the vectorizer sees
# text in the form it was fitted on.
# NOTE(review): assumes preprocess_dataframe only needs the text column and
# keeps every row — confirm against preprocess.py.
test_df = preprocess_dataframe(test_df, text_col='paragraph_text')
if len(test_df) != n_test_rows:
    # Predictions are written row-by-row into the submission file, so a changed
    # row count would misalign them; fail here with a clear message instead.
    raise ValueError(
        f"preprocess_dataframe changed the test row count "
        f"({n_test_rows} -> {len(test_df)}); predictions would not align with the submission file"
    )
X_test_final = vectorizer.transform(test_df['paragraph_text'])

xgb_test_probs = xgb_model.predict_proba(X_test_final)
cat_test_probs = cat_model.predict_proba(X_test_final)
rf_test_probs = rf_model.predict_proba(X_test_final)

# Equal-weight soft voting, then map the winning class index back to the
# original label values.
avg_test_probs = (xgb_test_probs + cat_test_probs + rf_test_probs) / 3
test_preds = np.argmax(avg_test_probs, axis=1)
test_labels = label_encoder.inverse_transform(test_preds)

# 🔹 9. Save the submission file
# The existing submission CSV is read, its 'generated' column is replaced with
# the predicted labels, and the result is written back to the same path.
submission_path = '/content/drive/MyDrive/submission.csv'
submission = pd.read_csv(submission_path)
submission['generated'] = test_labels
submission.to_csv(submission_path, index=False)

print("Finish")