In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import optuna

from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score

from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from lightgbm import early_stopping

import warnings
warnings.filterwarnings('ignore')

In [None]:
# 데이터 핸들링
import numpy as np 
import pandas as pd 

# 시각화
import matplotlib.pyplot as plt

# 전처리
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler ,RobustScaler
from sklearn.model_selection import train_test_split 

#하이퍼 파라미터 튜닝: 보통 랜덤서치로 대략 파악한 다음 그리드 서치로 미세조정
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

# 모델
from sklearn.linear_model import LogisticRegression 
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
import xgboost as xgb

# 평가도구 
from sklearn.metrics import accuracy_score

In [None]:
# Competition training set; .info() prints each column's dtype and non-null count.
TRAIN_CSV = '/kaggle/input/playground-series-s5e7/train.csv'
train = pd.read_csv(TRAIN_CSV)
train.info()

In [None]:
# Original (non-synthetic) personality dataset, used further down as a lookup
# table for the 'match_p' feature. The cell output is the per-column count of
# missing values.
ORIG_CSV = '/kaggle/input/extrovert-vs-introvert-behavior-data-backup/personality_datasert.csv'
orig_df = pd.read_csv(ORIG_CSV)
orig_df.isna().sum()

In [None]:
# Competition test set; .info() prints each column's dtype and non-null count.
TEST_CSV = '/kaggle/input/playground-series-s5e7/test.csv'
test_df = pd.read_csv(TEST_CSV)
test_df.info()

In [None]:
# Feature columns shared by the competition data and the original dataset;
# they act as the composite key when looking up a row's original label.
df_cols = [
    'Time_spent_Alone',
    'Stage_fear',
    'Social_event_attendance',
    'Going_outside',
    'Drained_after_socializing',
    'Friends_circle_size',
    'Post_frequency',
]

# Reference table: one row per distinct feature combination (first occurrence
# wins), with the original label exposed as 'match_p' so it cannot collide with
# the competition's own 'Personality' target after merging.
df = orig_df.drop_duplicates(subset=df_cols).rename(columns={'Personality': 'match_p'})


def merge_with_match_p(df, ref_df, merge_cols):
    """Left-join ``ref_df`` onto ``df`` and flag rows that found no match.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose rows are all kept (left side of the join).
    ref_df : pandas.DataFrame
        Lookup frame; must provide a ``match_p`` column plus ``merge_cols``.
    merge_cols : list of str
        Column names used as the join key on both sides.

    Returns
    -------
    pandas.DataFrame
        The joined frame with an extra integer column ``match_p_is_null``
        that is 1 where ``match_p`` is missing after the join and 0 otherwise.
    """
    joined = df.merge(ref_df, on=merge_cols, how='left')
    missing_flag = joined['match_p'].isna().astype(int)
    return joined.assign(match_p_is_null=missing_flag)

# Attach the original dataset's label ('match_p') and its missing-flag to both
# the training and the test frame, using the shared feature columns as the key.
train, test_df = (
    merge_with_match_p(frame, df, df_cols) for frame in (train, test_df)
)

In [None]:
# Missing values in the categorical columns become their own explicit
# 'unknown' category, applied identically to train and test.
for frame in (train, test_df):
    for col in ('match_p', 'Stage_fear', 'Drained_after_socializing'):
        frame[col] = frame[col].fillna('unknown')

In [None]:
def numeric_impute(df, col_lst):
    """Fill missing values in the given columns with each column's own mean.

    The frame is modified in place and also returned, so the call can be used
    either for its side effect or in an assignment.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute.
    col_lst : iterable of str
        Names of the columns to fill; each must support ``.mean()``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the listed columns filled.
    """
    for name in col_lst:
        series = df[name]
        df[name] = series.fillna(series.mean())
    return df

# Numeric feature columns of each frame ('id' is an identifier, not a feature).
col_lst = train.drop(columns=['id']).select_dtypes(include=['int64', 'float64']).columns
col_lst2 = test_df.drop(columns=['id']).select_dtypes(include=['int64', 'float64']).columns

# Capture the training means before imputing, so that the test set can be
# filled with the same values the model sees during training. Imputing the
# test set with its own means would make the same missing value translate to
# a different number at train time and at prediction time.
train_means = train[col_lst].mean()

train = numeric_impute(train, col_lst)

for col in col_lst2:
    # Fall back to the test column's own mean only when the column has no
    # numeric counterpart in the training frame.
    if col in train_means.index:
        fill_value = train_means[col]
    else:
        fill_value = test_df[col].mean()
    test_df[col] = test_df[col].fillna(fill_value)


In [None]:
from sklearn.preprocessing import OrdinalEncoder

cat_features = ['match_p', 'Stage_fear', 'Drained_after_socializing']

# Learn the category -> integer code mapping on the training frame only, then
# apply that same mapping to both frames. A category that was not seen while
# fitting is encoded as -1 instead of raising.
encoder = OrdinalEncoder(unknown_value=-1, handle_unknown='use_encoded_value')
encoder.fit(train[cat_features])
train[cat_features] = encoder.transform(train[cat_features])
test_df[cat_features] = encoder.transform(test_df[cat_features])

# Binary target: Introvert -> 0, Extrovert -> 1.
train['Personality'] = train['Personality'].map({"Introvert" : 0, "Extrovert" : 1})

In [None]:
# Model inputs: every training column except the target and the row id.
x = train.drop(columns=['Personality', 'id'])
# Target vector, aligned row-for-row with x.
y = train['Personality']
# Test feature matrix: the test frame without its row id.
test = test_df.drop(columns='id')

In [None]:
# Mean-impute the merge-key feature columns on the final test matrix.
# NOTE(review): earlier cells already fill these columns in test_df, so this
# looks like a no-op safety net — confirm before removing.
test = numeric_impute(df=test, col_lst=df_cols)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
import numpy as np

# Buffers for out-of-fold / test predictions. Only proba_oof and
# fold_accuracies are written by the loop below; the other arrays stay zeroed.
oof_preds = np.zeros(len(x))
test_preds = np.zeros(len(test))
proba_oof = np.zeros((len(x), 2))
proba_test = np.zeros((len(test), 2))
fold_accuracies = []

# Example RandomForest hyperparameters.
rf_params = dict(
    n_estimators=100,
    max_depth=None,
    random_state=42,
    n_jobs=-1,
    class_weight='balanced',  # optional: helps when the classes are imbalanced
)

# Stratified 5-fold split: shuffled, with a fixed seed for reproducibility.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Cross-validation loop: fit on four folds, score on the held-out fold.
for fold, (train_idx, val_idx) in enumerate(skf.split(x, y)):
    print(f"\n📊 Training Fold {fold + 1}...")

    X_train, y_train = x.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = x.iloc[val_idx], y.iloc[val_idx]

    # fit() returns the estimator itself, so the chained call is equivalent
    # to constructing and fitting in two statements.
    model = RandomForestClassifier(**rf_params).fit(X_train, y_train)

    # Store this fold's class probabilities at the validation row positions,
    # then take the most probable class as the hard prediction.
    proba_oof[val_idx] = model.predict_proba(X_val)
    val_preds = proba_oof[val_idx].argmax(axis=1)
    acc = accuracy_score(y_val, val_preds)

    fold_accuracies.append(acc)
    print(f"Fold {fold + 1} Accuracy: {acc:.4f}")

# Report the mean accuracy across folds.
print(f"\n📉 Average CV Accuracy: {np.mean(fold_accuracies):.4f}")


In [None]:
# Hold out 30% of the rows as a test split, then carve 20% of the remaining
# training part off as a validation split (both with a fixed seed).
# NOTE(review): none of the later visible cells read these splits — confirm
# whether they are still needed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=123
)
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.2, random_state=123
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# RandomForest hyperparameters (adjust as needed).
rf_params = dict(
    n_estimators=100,
    max_depth=None,
    random_state=42,
    n_jobs=-1,
    class_weight='balanced',  # compensates for class imbalance
)

# Train the final model on the complete training data (no hold-out).
final_model = RandomForestClassifier(**rf_params).fit(x, y)

# Test-set predictions: class probabilities, then the most probable class index.
final_proba_test = final_model.predict_proba(test)
final_test_preds = final_proba_test.argmax(axis=1)

# Convert the numeric predictions back into the original string labels.
label_map = {
    0: "Introvert",
    1: "Extrovert",
}
final_labels = [label_map[class_idx] for class_idx in final_test_preds]

# Build the submission table: the test ids paired with the predicted labels.
submission = test_df[['id']].assign(Personality=final_labels)

submission.to_csv('/kaggle/working/submission.csv', index=False)
