In [1]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation/future warnings raised by
# the libraries used below — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# 데이터 핸들링
import numpy as np 
import pandas as pd 

# 시각화
import matplotlib.pyplot as plt

# 전처리
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler ,RobustScaler
from sklearn.model_selection import train_test_split 

#하이퍼 파라미터 튜닝: 보통 랜덤서치로 대략 파악한 다음 그리드 서치로 미세조정
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

# 모델
from sklearn.linear_model import LogisticRegression 
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
import xgboost as xgb

# 평가도구 
from sklearn.metrics import accuracy_score

In [3]:
# Load the training CSV of the playground-series-s5e7 dataset and print its
# schema summary (column names, non-null counts, dtypes).
train = pd.read_csv('/kaggle/input/playground-series-s5e7/train.csv')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18524 entries, 0 to 18523
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         18524 non-null  int64  
 1   Time_spent_Alone           17334 non-null  float64
 2   Stage_fear                 16631 non-null  object 
 3   Social_event_attendance    17344 non-null  float64
 4   Going_outside              17058 non-null  float64
 5   Drained_after_socializing  17375 non-null  object 
 6   Friends_circle_size        17470 non-null  float64
 7   Post_frequency             17260 non-null  float64
 8   Personality                18524 non-null  object 
dtypes: float64(5), int64(1), object(3)
memory usage: 1.3+ MB


In [4]:
# Load the test CSV of the playground-series-s5e7 dataset and print its
# schema summary (column names, non-null counts, dtypes).
test = pd.read_csv('/kaggle/input/playground-series-s5e7/test.csv')
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6175 entries, 0 to 6174
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         6175 non-null   int64  
 1   Time_spent_Alone           5750 non-null   float64
 2   Stage_fear                 5577 non-null   object 
 3   Social_event_attendance    5778 non-null   float64
 4   Going_outside              5709 non-null   float64
 5   Drained_after_socializing  5743 non-null   object 
 6   Friends_circle_size        5825 non-null   float64
 7   Post_frequency             5767 non-null   float64
dtypes: float64(5), int64(1), object(2)
memory usage: 386.1+ KB


In [5]:
# Count the missing values in each column of the training frame.
train.isnull().sum()

id                              0
Time_spent_Alone             1190
Stage_fear                   1893
Social_event_attendance      1180
Going_outside                1466
Drained_after_socializing    1149
Friends_circle_size          1054
Post_frequency               1264
Personality                     0
dtype: int64

In [6]:
# Missing-value imputation.
#
# All fill statistics are learned from `train` only and then applied to both
# frames, so `test` is preprocessed with exactly the same values as `train`
# (the categorical threshold below was already derived from `train`).
#
# Columns are reassigned rather than calling `fillna(..., inplace=True)` on
# `df_[col]`: that chained in-place call is deprecated and does not modify the
# parent frame under pandas Copy-on-Write, which would silently leave NaNs.

# Time_spent_Alone -> median; every other numeric column -> mean.
time_alone_median = train['Time_spent_Alone'].median()
# The dtype selection also picks up `id`; it has no missing values in the
# loaded frames, so filling it is a no-op.
numeric_cols = train.select_dtypes(include=['int64', 'float64']).columns.drop(
    ['Time_spent_Alone'], errors='ignore'
)
numeric_means = train[numeric_cols].mean()

for df_ in [train, test]:
    df_['Time_spent_Alone'] = df_['Time_spent_Alone'].fillna(time_alone_median)
    # fillna with a Series aligns the fill values on column names.
    df_[numeric_cols] = df_[numeric_cols].fillna(numeric_means)

# Categorical imputation for Stage_fear / Drained_after_socializing.
# Reference value: rows at or below (mean of the imputed train Time_spent_Alone - 1)
# are filled with 'No', all others with 'Yes'.
time_mean = train['Time_spent_Alone'].mean()
threshold = time_mean - 1

for df_ in [train, test]:
    # 1. Exactly one of the two flags is missing: copy the value of the other flag.
    mask_stage_nan = df_['Stage_fear'].isna() & df_['Drained_after_socializing'].notna()
    df_.loc[mask_stage_nan, 'Stage_fear'] = df_.loc[mask_stage_nan, 'Drained_after_socializing']

    mask_drained_nan = df_['Drained_after_socializing'].isna() & df_['Stage_fear'].notna()
    df_.loc[mask_drained_nan, 'Drained_after_socializing'] = df_.loc[mask_drained_nan, 'Stage_fear']

    # 2. Both flags are missing: derive a single answer from Time_spent_Alone
    #    and write it to both columns.
    mask_both_nan = df_['Stage_fear'].isna() & df_['Drained_after_socializing'].isna()
    if mask_both_nan.any():
        both_fill = np.where(
            df_.loc[mask_both_nan, 'Time_spent_Alone'] <= threshold, 'No', 'Yes'
        )
        df_.loc[mask_both_nan, 'Stage_fear'] = both_fill
        df_.loc[mask_both_nan, 'Drained_after_socializing'] = both_fill

In [7]:
# Encode the two Yes/No flags as integers (No -> 0, Yes -> 1).
# The numeric-dtype guard keeps the cell safe to re-run: pushing an already
# encoded 0/1 column through the mapping would turn every value into NaN.
yes_no_codes = {'No': 0, 'Yes': 1}

for df_ in [train, test]:
    for col in ['Stage_fear', 'Drained_after_socializing']:
        if not pd.api.types.is_numeric_dtype(df_[col]):
            df_[col] = df_[col].map(yes_no_codes)

In [8]:
# Preview the first rows of the training frame after imputation and encoding.
train.head()

Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,0,0.0,0,6.0,4.0,0,15.0,5.0,Extrovert
1,1,1.0,0,7.0,3.0,0,10.0,8.0,Extrovert
2,2,6.0,1,1.0,0.0,1,3.0,0.0,Introvert
3,3,3.0,0,7.0,3.0,0,11.0,5.0,Extrovert
4,4,1.0,0,4.0,4.0,0,13.0,4.982097,Extrovert


In [9]:
# Separate features and target: drop the row identifier and the label from the
# feature matrix, and keep the Personality column as the target.
X = train.drop(columns=['id', 'Personality'])
y = train['Personality']

In [10]:
# Hold out 30% of the rows as the test split, then carve 20% of the remaining
# rows off as the validation split. Both splits use the same fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123
)
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.2, random_state=123
)

In [11]:
# Build a random forest with 100 trees and a fixed seed, then fit it on the
# training split.
rf = RandomForestClassifier(n_estimators=100, random_state=123)
rf.fit(x_train, y_train)

# Predictions for the held-out test split.
y_pred = rf.predict(x_test)

# Accuracy on the training and validation splits.
train_score = rf.score(x_train, y_train)
val_score = rf.score(x_val, y_val)

# Overfitting check: the train/validation accuracy gap, rounded to three
# decimals, is reported as "not overfitting" only when it is at most 0.05.
differ = round(train_score - val_score, 3)
verdict = '=> 과적합 아님' if differ <= 0.05 else '=> 과적합'
print(differ, verdict)

# Report the test accuracy.
test_accuracy = accuracy_score(y_test, y_pred)
print('accuracy =', round(test_accuracy, 3))


0.029 => 과적합 아님
accuracy = 0.964
