In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from scipy.sparse import hstack

# -----------------------------
# 데이터 로드
# -----------------------------
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')

# -----------------------------
# 결측 과다 컬럼 삭제 (contest_award, idea_contest 유지)
# -----------------------------
drop_cols = ['class2', 'class3', 'class4',
             'previous_class_3', 'previous_class_4', 'previous_class_5',
             'previous_class_6', 'previous_class_7']
train = train.drop(drop_cols, axis=1)
test = test.drop(drop_cols, axis=1)

# -----------------------------
# 결측치 처리
# -----------------------------
for col in train.columns:
    if train[col].dtype == 'object':
        train[col] = train[col].fillna('missing')
    else:
        train[col] = train[col].fillna(0)
for col in test.columns:
    if test[col].dtype == 'object':
        test[col] = test[col].fillna('missing')
    else:
        test[col] = test[col].fillna(0)

# -----------------------------
# binary 처리
# -----------------------------
for col in ['contest_award', 'idea_contest', 'contest_participitation']:
    train[col] = train[col].apply(lambda x: 0 if pd.isna(x) or x == 'missing' else 1)
    test[col]  = test[col].apply(lambda x: 0 if pd.isna(x) or x == 'missing' else 1)

# -----------------------------
# 자연어 컬럼 TF-IDF
# -----------------------------
def make_text(row):
    return (
        f"BDA 참여 이유: {row['whyBDA']} "
        f"BDA에서 얻고 싶은 것: {row['what_to_gain']} "
        f"원데이 클래스: {row['onedayclass_topic']} "
        f"희망 도메인: {row['expected_domain']}"
    )

train['text'] = train.apply(make_text, axis=1)
test['text']  = test.apply(make_text, axis=1)

tfidf = TfidfVectorizer(max_features=1000)
tfidf_train = tfidf.fit_transform(train['text'])
tfidf_test  = tfidf.transform(test['text'])

# -----------------------------
# 파생변수 생성
# -----------------------------
def add_features(df):
    df = df.copy()
    df['re_reg_time_mult'] = df['time_input'] * df['re_registration'].astype('category').cat.codes
    df['school1_len'] = df['school1'].apply(lambda x: len(str(x)))
    freq_school = df['school1'].value_counts()
    df['school1_freq'] = df['school1'].map(freq_school)
    df['onedayclass_topic_len'] = df['onedayclass_topic'].apply(lambda x: len(str(x)))
    for keyword in ['Python', '머신러닝', 'SQL']:
        df[f'onedayclass_has_{keyword}'] = df['onedayclass_topic'].apply(lambda x: 1 if keyword in str(x) else 0)
    df['interested_company_len'] = df['interested_company'].apply(lambda x: len(str(x)))
    df['interested_company_count'] = df['interested_company'].apply(lambda x: len(str(x).split(',')))
    df['major_comb'] = df['major_field'].astype(str) + "_" + df['major1_1'].astype(str)
    df['desired_job_same'] = (df['desired_job'] == df['desired_job_except_data']).astype(int)
    df['incumbents_reason_len'] = df['incumbents_lecture_scale_reason'].apply(lambda x: len(str(x)))
    df['whyBDA_len'] = df['whyBDA'].apply(lambda x: len(str(x)))
    df['hope_for_group_code'] = df['hope_for_group'].astype('category').cat.codes
    df['certificate_none'] = (df['certificate_acquisition'] == '없음').astype(int)
    return df

train = add_features(train)
test  = add_features(test)

# -----------------------------
# Label Encoding
# -----------------------------
text_cols = ['whyBDA', 'what_to_gain', 'onedayclass_topic', 'expected_domain', 'text']
cat_cols = train.columns.drop(['ID', 'withdrawal'] + text_cols)

for col in cat_cols:
    train[col] = train[col].astype(str)
    test[col]  = test[col].astype(str)
    le = LabelEncoder()
    le.fit(pd.concat([train[col], test[col]], axis=0))
    train[col] = le.transform(train[col])
    test[col]  = le.transform(test[col])

# -----------------------------
# 최종 feature 결합
# -----------------------------
X_tabular = train.drop(['ID', 'withdrawal'] + text_cols, axis=1)
X_test_tabular = test.drop(['ID'] + text_cols, axis=1)

X_full = hstack([X_tabular.values, tfidf_train])
X_test_full = hstack([X_test_tabular.values, tfidf_test])
y = train['withdrawal'].astype(int)

# 검증 분리
X_train, X_val, y_train, y_val = train_test_split(
    X_full, y, test_size=0.2, random_state=42, stratify=y
)

# -----------------------------
# (고정) 최적 하이퍼파라미터로 학습
#   로그에서 확보한 값:
#   n_estimators=771, max_depth=25, min_samples_split=12, min_samples_leaf=20,
#   max_features=None, bootstrap=True
# -----------------------------
best_params_fixed = {
    'n_estimators': 771,
    'max_depth': 25,
    'min_samples_split': 12,
    'min_samples_leaf': 20,
    'max_features': None,
    'bootstrap': True,
    'random_state': 42,
    'n_jobs': -1
}

final_model = RandomForestClassifier(**best_params_fixed)
final_model.fit(X_train, y_train)

# -----------------------------
# 검증 성능
# -----------------------------
val_pred = final_model.predict(X_val)
f1 = f1_score(y_val, val_pred, average='macro')
print(f"[VAL] RandomForest (fixed) F1-macro: {f1:.4f}")
print("\n[VAL] Classification report:\n", classification_report(y_val, val_pred))
print("[VAL] Confusion matrix:\n", confusion_matrix(y_val, val_pred))

# -----------------------------
# 테스트 예측 및 저장
# -----------------------------
test_pred = final_model.predict(X_test_full)
sample_submission['withdrawal'] = test_pred
out_path = 'submission_tfidf_rf_fixed.csv'
sample_submission.to_csv(out_path, index=False)
print(f"✅ Saved → {out_path}")

[VAL] RandomForest (fixed) F1-macro: 0.4525

[VAL] Classification report:
               precision    recall  f1-score   support

           0       0.60      0.05      0.09        65
           1       0.70      0.99      0.82       147

    accuracy                           0.70       212
   macro avg       0.65      0.52      0.45       212
weighted avg       0.67      0.70      0.59       212

[VAL] Confusion matrix:
 [[  3  62]
 [  2 145]]
✅ Saved → submission_tfidf_rf_fixed.csv
