In [194]:
import warnings
warnings.filterwarnings('ignore')

In [195]:
# 데이터 핸들링
import numpy as np 
import pandas as pd 

# 시각화
import matplotlib.pyplot as plt

# 전처리
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split 

#하이퍼 파라미터 튜닝: 보통 랜덤서치로 대략 파악한 다음 그리드 서치로 미세조정
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

# 모델
from sklearn.linear_model import LogisticRegression 

# 평가도구 
from sklearn.metrics import accuracy_score

In [196]:
train = pd.read_csv('/kaggle/input/playground-series-s5e7/train.csv')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18524 entries, 0 to 18523
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         18524 non-null  int64  
 1   Time_spent_Alone           17334 non-null  float64
 2   Stage_fear                 16631 non-null  object 
 3   Social_event_attendance    17344 non-null  float64
 4   Going_outside              17058 non-null  float64
 5   Drained_after_socializing  17375 non-null  object 
 6   Friends_circle_size        17470 non-null  float64
 7   Post_frequency             17260 non-null  float64
 8   Personality                18524 non-null  object 
dtypes: float64(5), int64(1), object(3)
memory usage: 1.3+ MB


In [197]:
train2= train.copy()

In [198]:
train2=train2.drop('id',axis=1)
cols=train2.columns
categorical_cols=[col for col in cols if train2[col].dtype=='object']
numerical_cols=[col for col in cols if train2[col].dtype!='object']

categorical_cols,numerical_cols

(['Stage_fear', 'Drained_after_socializing', 'Personality'],
 ['Time_spent_Alone',
  'Social_event_attendance',
  'Going_outside',
  'Friends_circle_size',
  'Post_frequency'])

In [199]:
cols = ['Time_spent_Alone', 'Social_event_attendance', 'Going_outside', 'Friends_circle_size', 'Post_frequency']
for i in cols:
    mode_val = train2[i].mode()[0]
    train2[i] = train2[i].fillna(mode_val)

In [200]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

train3 = train2.copy()

label_encoders = {}

# 1. 모든 범주형 열(대상 포함)에 대해 astype(str) + LabelEncoding 적용
for col in train3.columns:
    if train3[col].dtype == 'object' or train3[col].isnull().any():
        train3[col] = train3[col].astype(str)  # NaN -> 'nan' 문자열 변환
        le = LabelEncoder()
        train3[col] = le.fit_transform(train3[col])
        label_encoders[col] = le

# 2. Stage_fear 결측치 예측
mask_sf = train2['Stage_fear'].notnull()
known_sf = train3[mask_sf]
unknown_sf = train3[~mask_sf]

if not unknown_sf.empty:
    X_sf_train = known_sf.drop('Stage_fear', axis=1)
    y_sf_train = known_sf['Stage_fear']
    X_sf_pred = unknown_sf.drop('Stage_fear', axis=1)

    clf_sf = RandomForestClassifier(random_state=42)
    clf_sf.fit(X_sf_train, y_sf_train)
    pred_sf = clf_sf.predict(X_sf_pred)

    # 역변환 후 train2에 채우기
    train2.loc[~mask_sf, 'Stage_fear'] = label_encoders['Stage_fear'].inverse_transform(pred_sf)

# 3. Drained_after_socializing 결측치 예측
mask_dr = train2['Drained_after_socializing'].notnull()
known_dr = train3[mask_dr]
unknown_dr = train3[~mask_dr]

if not unknown_dr.empty:
    X_dr_train = known_dr.drop('Drained_after_socializing', axis=1)
    y_dr_train = known_dr['Drained_after_socializing']
    X_dr_pred = unknown_dr.drop('Drained_after_socializing', axis=1)

    clf_dr = RandomForestClassifier(random_state=42)
    clf_dr.fit(X_dr_train, y_dr_train)
    pred_dr = clf_dr.predict(X_dr_pred)

    train2.loc[~mask_dr, 'Drained_after_socializing'] = label_encoders['Drained_after_socializing'].inverse_transform(pred_dr)


In [201]:
train2.isnull().sum() 

Time_spent_Alone             0
Stage_fear                   0
Social_event_attendance      0
Going_outside                0
Drained_after_socializing    0
Friends_circle_size          0
Post_frequency               0
Personality                  0
dtype: int64

In [202]:
# 전처리
train2['Stage_fear'] = LabelEncoder().fit_transform(train2['Stage_fear'])
train2['Drained_after_socializing'] = LabelEncoder().fit_transform(train2['Drained_after_socializing'])
train2['Personality'] = LabelEncoder().fit_transform(train2['Personality']) # 0:Extro, 1:Intro

cols = ['Time_spent_Alone', 'Social_event_attendance', 'Going_outside', 'Friends_circle_size', 'Post_frequency']
train2[cols] = StandardScaler().fit_transform(train2[cols])

X = train2.iloc[:,0:7]
y = train2['Personality']

X.columns

Index(['Time_spent_Alone', 'Stage_fear', 'Social_event_attendance',
       'Going_outside', 'Drained_after_socializing', 'Friends_circle_size',
       'Post_frequency'],
      dtype='object')

In [203]:
x_train, x_test, y_train, y_test=train_test_split(X,y,train_size=0.7,random_state=123)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, train_size=0.85, random_state=123)

In [204]:
# 기본모델 
lr = LogisticRegression(solver='lbfgs', max_iter=100, random_state=123).fit(X=x_train, y=y_train)

y_pred = lr.predict(x_test)

train_score = lr.score(x_train, y_train)
val_score = lr.score(x_val, y_val)

differ = round((train_score-val_score),3)

if not 0<=differ<=0.05:
    print(differ,'=> 과적합')
else:
    print(differ,'=> 과적합 아님')

print('accuracy =',round(accuracy_score(y_test,y_pred),3)) 

-0.006 => 과적합
accuracy = 0.968
