In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Korean font setup (Mac)
# plt.rc('font', family='AppleGothic')
# Korean font setup (Windows)
# plt.rc('font', family='Malgun Gothic')
plt.rcParams['axes.unicode_minus'] = False # Keep the minus sign from rendering as a broken glyph

# 1. Load the training data
try:
    train_df = pd.read_csv('../data/train.csv')
except FileNotFoundError as err:
    print("파일 경로를 다시 확인해주세요.")
    # `exit()` is an interactive-shell helper installed by the `site` module
    # and terminates with status 0; raise SystemExit with a non-zero status
    # so the missing file is reported as a failure.
    raise SystemExit(1) from err
else:
    print("데이터 불러오기 완료.")

# Section 1: schema summary and the percentage of missing values per column
print(f"\n{'=' * 50}")
print("1. 데이터 기본 정보 및 결측치 확인")
print('=' * 50)
train_df.info()
print("\n결측치 비율:\n", train_df.isnull().sum().div(len(train_df)).mul(100))

# Section 2: three missingno views of the same frame, each shown as its own figure
print(f"\n{'=' * 50}")
print("2. 결측치 시각화")
print('=' * 50)
for draw, caption in (
    (msno.matrix, 'Missingness Matrix'),
    (msno.bar, 'Missingness Bar Chart'),
    (msno.heatmap, 'Missingness Heatmap (Correlation between missing values)'),
):
    draw(train_df)
    plt.title(caption)
    plt.show()

# 3. Split the columns by dtype, then take the target out of the numeric list
#    and the row identifier out of the categorical list.
numerical_cols = list(train_df.select_dtypes(include=np.number).columns)
categorical_cols = list(train_df.select_dtypes(include='object').columns)
numerical_cols.remove('stress_score')
categorical_cols.remove('ID')

print(f"\n{'=' * 50}")
print("3. 수치형 변수 EDA")
print('=' * 50)
for col in numerical_cols:
    fig, (dist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: histogram with a KDE overlay
    sns.histplot(train_df[col], kde=True, ax=dist_ax)
    dist_ax.set_title(f'{col} Distribution')

    # Right panel: box plot to make outliers visible
    sns.boxplot(x=train_df[col], ax=box_ax)
    box_ax.set_title(f'{col} Outliers')

    fig.tight_layout()
    plt.show()

# Pairwise correlation of the numeric features together with the target
print("\n수치형 변수 간의 상관관계")
plt.figure(figsize=(12, 10))
sns.heatmap(
    train_df[[*numerical_cols, 'stress_score']].corr(),
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
)
plt.title('Correlation Matrix of Numerical Features')
plt.show()


print("\n" + "="*50)
print("4. 범주형 변수 EDA")
print("="*50)
for col in categorical_cols:
    print(f"\n** {col} 변수 분석 **")
    print("빈도:\n", train_df[col].value_counts(dropna=False))

    # Most frequent category first. The same order is passed to both panels
    # so that row k of the count plot and row k of the box plot describe the
    # same category (the box plot would otherwise use order of appearance).
    category_order = train_df[col].value_counts().index

    plt.figure(figsize=(10, 5))

    # Left panel: category frequencies
    plt.subplot(1, 2, 1)
    sns.countplot(y=train_df[col], order=category_order)
    plt.title(f'{col} Distribution')

    # Right panel: distribution of the target (stress_score) per category
    plt.subplot(1, 2, 2)
    sns.boxplot(x='stress_score', y=col, data=train_df, order=category_order)
    plt.title(f'{col} vs Stress Score')
    plt.tight_layout()
    plt.show()

print("\n" + "="*50)
print("5. 결측치가 있는 변수와 타겟 변수의 관계")
print("="*50)
missing_cols = ['medical_history', 'family_medical_history', 'edu_level', 'mean_working']
for col in missing_cols:
    temp_df = train_df.copy()

    # Encode the missingness indicator as integers (0 = present, 1 = missing)
    # instead of booleans.
    temp_df[f'{col}_is_missing'] = temp_df[col].isnull().astype(int)

    plt.figure(figsize=(8, 5))
    # Both variables are numeric here, so the orientation is ambiguous and
    # seaborn would default to a vertical plot with one box per distinct
    # stress_score value. Force a horizontal plot so the 0/1 indicator on the
    # y axis is the grouping variable.
    sns.boxplot(x='stress_score', y=f'{col}_is_missing', data=temp_df, orient='h')
    plt.title(f'Stress Score vs {col} Missingness')
    plt.show()


print(f"\n{'=' * 50}")
print("6. LightGBM 모델을 활용한 변수 중요도 (Feature Importance)")
print('=' * 50)
# Quick-and-simple imputation on a working copy without the row identifier:
# a literal 'unknown' for the text columns, the column median for mean_working.
temp_df = train_df.drop('ID', axis=1)
temp_df[['medical_history', 'family_medical_history', 'edu_level']] = (
    temp_df[['medical_history', 'family_medical_history', 'edu_level']].fillna('unknown')
)
temp_df['mean_working'] = temp_df['mean_working'].fillna(temp_df['mean_working'].median())

# Label-encode every object column, using a fresh encoder per column
for col in temp_df.select_dtypes(include='object').columns:
    le = LabelEncoder()
    temp_df[col] = le.fit_transform(temp_df[col])

X = temp_df.drop(columns='stress_score')
y = temp_df['stress_score']

lgb_model = lgb.LGBMRegressor(n_estimators=100, random_state=42)
lgb_model.fit(X, y)

# One row per feature, highest importance first
feature_importance = pd.DataFrame(
    {'feature': X.columns, 'importance': lgb_model.feature_importances_}
).sort_values(by='importance', ascending=False)

plt.figure(figsize=(12, 8))
sns.barplot(x='importance', y='feature', data=feature_importance)
plt.title('Feature Importance from LightGBM')
plt.show()
print(feature_importance)

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

Collecting catboost
  Downloading catboost-1.2.8-cp310-cp310-win_amd64.whl.metadata (1.5 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.21-py3-none-any.whl.metadata (12 kB)
Collecting plotly (from catboost)
  Downloading plotly-6.2.0-py3-none-any.whl.metadata (8.5 kB)
Collecting narwhals>=1.15.1 (from plotly->catboost)
  Downloading narwhals-2.0.1-py3-none-any.whl.metadata (11 kB)
Downloading catboost-1.2.8-cp310-cp310-win_amd64.whl (102.5 MB)
   ---------------------------------------- 0.0/102.5 MB ? eta -:--:--
   - -------------------------------------- 3.1/102.5 MB 16.8 MB/s eta 0:00:06
   --- ------------------------------------ 9.2/102.5 MB 23.8 MB/s eta 0:00:04
   --- ------------------------------------ 9.7/102.5 MB 19.5 MB/s eta 0:00:05
   ---- ----------------------------------- 11.5/102.5 MB 15.0 MB/s eta 0:00:07
   ---- ----------------------------------- 12.1/102.5 MB 12.2 MB/s eta 0:00:08
   ------ --------------------------------- 17.6/102.5 MB 14.6 MB/

Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Using cached alembic-1.16.4-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting sqlalchemy>=1.4.2 (from optuna)
  Downloading sqlalchemy-2.0.42-cp310-cp310-win_amd64.whl.metadata (9.8 kB)
Collecting PyYAML (from optuna)
  Downloading PyYAML-6.0.2-cp310-cp310-win_amd64.whl.metadata (2.1 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Using cached mako-1.3.10-py3-none-any.whl.metadata (2.9 kB)
Collecting tomli (from alembic>=1.5.0->optuna)
  Using cached tomli-2.2.1-py3-none-any.whl.metadata (10 kB)
Collecting greenlet>=1 (from sqlalchemy>=1.4.2->optuna)
  Downloading greenlet-3.2.3-cp310-cp310-win_amd64.whl.metadata (4.2 kB)
Collecting MarkupSafe>=0.9.2 (from Mako->alembic>=1.5.0->optuna)
  Downloading MarkupSafe-3.0.2-cp310-cp310-win_amd64.whl.metadata (4.1 kB)
Downloading 

In [24]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from sklearn.preprocessing import StandardScaler

# 1. Load the train / test / sample-submission files
try:
    train_df = pd.read_csv('../data/train.csv')
    test_df = pd.read_csv('../data/test.csv')
    submission_df = pd.read_csv('../data/sample_submission.csv')
except FileNotFoundError as err:
    print("파일 경로를 다시 확인해주세요.")
    # `exit()` is an interactive-shell helper installed by the `site` module
    # and terminates with status 0; raise SystemExit with a non-zero status
    # so the missing file is reported as a failure.
    raise SystemExit(1) from err

# 2. 전처리 및 파생변수 생성 함수 (개선된 버전)
def preprocess_and_feature_engineer_improved(df, scaler=None, fit_scaler=False):
    """Clean, impute, feature-engineer, one-hot encode and scale *df*.

    The input frame is copied and never modified.

    Args:
        df: Frame containing at least ``bone_density``, ``medical_history``,
            ``family_medical_history``, ``edu_level``, ``smoke_status``,
            ``mean_working``, ``weight``, ``height``,
            ``systolic_blood_pressure``, ``diastolic_blood_pressure``,
            ``cholesterol``, ``glucose``, ``gender``, ``activity`` and
            ``sleep_pattern``.
        scaler: An already-fitted scaler exposing ``transform``. Required when
            ``fit_scaler`` is False; ignored when ``fit_scaler`` is True.
        fit_scaler: When True, a new ``StandardScaler`` is fitted on this
            frame's numeric columns and returned.

    Returns:
        A ``(processed_frame, scaler)`` tuple, where ``scaler`` is the newly
        fitted scaler or the one that was passed in.

    Raises:
        ValueError: If ``fit_scaler`` is False and no ``scaler`` is supplied.
    """
    # Fail fast with a clear message instead of an AttributeError on None
    # after all the preprocessing work has already been done.
    if not fit_scaler and scaler is None:
        raise ValueError("scaler must be provided when fit_scaler is False")

    df_processed = df.copy()

    # Outlier handling: negative bone_density values are set to 0
    df_processed.loc[df_processed['bone_density'] < 0, 'bone_density'] = 0

    # Missing categorical values become their own 'unknown' category
    for col in ['medical_history', 'family_medical_history', 'edu_level']:
        df_processed[col] = df_processed[col].fillna('unknown')

    # mean_working: fill with the median of the (smoke_status, edu_level)
    # group first, then fall back to the overall median for whatever is
    # still missing (e.g. groups with no observed value).
    df_processed['mean_working'] = df_processed.groupby(['smoke_status', 'edu_level'])['mean_working'].transform(lambda x: x.fillna(x.median()))
    df_processed['mean_working'] = df_processed['mean_working'].fillna(df_processed['mean_working'].median())

    # Derived features
    df_processed['BMI'] = df_processed['weight'] / (df_processed['height'] / 100) ** 2
    df_processed['Hypertension_flag'] = ((df_processed['systolic_blood_pressure'] >= 140) | (df_processed['diastolic_blood_pressure'] >= 90)).astype(int)
    df_processed['Pulse_Pressure'] = df_processed['systolic_blood_pressure'] - df_processed['diastolic_blood_pressure']
    df_processed['bp_interaction'] = df_processed['systolic_blood_pressure'] * df_processed['diastolic_blood_pressure']
    # +1 in the denominator avoids division by zero when glucose is 0
    df_processed['cholesterol_glucose_ratio'] = df_processed['cholesterol'] / (df_processed['glucose'] + 1)

    # Left-closed bins: [0, 8), [8, 12), [12, 16), [16, 24)
    bins = [0, 8, 12, 16, 24]
    labels = ['~8h', '8~12h', '12~16h', '16h~']
    df_processed['Working_Hour_Group'] = pd.cut(df_processed['mean_working'], bins=bins, labels=labels, right=False)

    # One-hot encode the categorical columns, dropping the first level of each
    categorical_cols = ['gender', 'activity', 'smoke_status', 'sleep_pattern',
                        'medical_history', 'family_medical_history', 'edu_level',
                        'Working_Hour_Group']
    df_processed = pd.get_dummies(df_processed, columns=categorical_cols, drop_first=True)

    # Scale the numeric columns (only those actually present in the frame)
    numerical_cols = ['age', 'height', 'weight', 'systolic_blood_pressure',
                      'diastolic_blood_pressure', 'cholesterol', 'glucose',
                      'bone_density', 'mean_working', 'BMI',
                      'Pulse_Pressure', 'bp_interaction', 'cholesterol_glucose_ratio']
    numerical_cols_for_scaling = [col for col in numerical_cols if col in df_processed.columns]

    if fit_scaler:
        scaler = StandardScaler()
        df_processed[numerical_cols_for_scaling] = scaler.fit_transform(df_processed[numerical_cols_for_scaling])
    else:
        df_processed[numerical_cols_for_scaling] = scaler.transform(df_processed[numerical_cols_for_scaling])
    return df_processed, scaler

# 3. Apply the preprocessing: fit the scaler on train, reuse it on test
train_final, scaler_fit = preprocess_and_feature_engineer_improved(train_df.drop('ID', axis=1), fit_scaler=True)
test_final, _ = preprocess_and_feature_engineer_improved(test_df.drop('ID', axis=1), scaler=scaler_fit)

# 4. Align train/test columns and drop the low-importance features
train_cols = set(train_final.drop('stress_score', axis=1).columns)
test_cols = set(test_final.columns)
# Keep the train frame's column order instead of iterating a set: string
# hashing is randomized per interpreter process, so the iteration order of a
# set of column names (and therefore the feature order fed to the models)
# can differ from one run to the next.
common_cols = [col for col in train_final.columns if col != 'stress_score' and col in test_cols]

# Low-importance features (same list as the previous version of this code)
low_importance_features = ['sleep_pattern_sleep difficulty', 'activity_moderate', 'mean_working']
final_cols = [col for col in common_cols if col not in low_importance_features]

X_train = train_final[final_cols]
y_train = train_final['stress_score']
X_test = test_final[final_cols]

# 5. Train the models and predict (same as the previous version of this code)
# LightGBM model
lgb_model = lgb.LGBMRegressor(
    random_state=42,
    n_estimators=300,
    learning_rate=0.1,
    num_leaves=40
)
lgb_model.fit(X_train, y_train)
lgb_predictions = lgb_model.predict(X_test)

# XGBoost model
xgb_model = xgb.XGBRegressor(
    random_state=42,
    n_estimators=500,
    learning_rate=0.1,
    max_depth=7,
    n_jobs=-1
)
xgb_model.fit(X_train, y_train)
xgb_predictions = xgb_model.predict(X_test)

# 6. Blend the two models' predictions with fixed weights
weight_lgb = 0.6
weight_xgb = 0.4
weighted_predictions = (weight_lgb * lgb_predictions) + (weight_xgb * xgb_predictions)

# 7. Write the submission file
submission_df['stress_score'] = weighted_predictions
submission_df.to_csv('../data/submission_improved.csv', index=False)

print("---")
print("submission_improved.csv 파일이 성공적으로 생성되었습니다! 제출해서 점수를 확인해 보세요.")

---
submission_improved.csv 파일이 성공적으로 생성되었습니다! 제출해서 점수를 확인해 보세요.


In [34]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb

# 1. Load the train / test / sample-submission files
try:
    train_df = pd.read_csv('../data/train.csv')
    test_df = pd.read_csv('../data/test.csv')
    submission_df = pd.read_csv('../data/sample_submission.csv')
except FileNotFoundError as err:
    print("파일 경로를 다시 확인해주세요.")
    # `exit()` is an interactive-shell helper installed by the `site` module
    # and terminates with status 0; raise SystemExit with a non-zero status
    # so the missing file is reported as a failure.
    raise SystemExit(1) from err

# 2. 전처리 및 파생변수 생성 함수
def preprocess_and_feature_engineer(df):
    """Return a cleaned, imputed, one-hot encoded copy of *df* with derived features.

    The input frame is left untouched. ``gender`` is dropped when present,
    negative ``bone_density`` values are set to 0, missing categorical values
    become ``'unknown'``, ``mean_working`` is imputed with its
    (smoke_status, edu_level) group median and then the overall median, and
    ``BMI``, ``Pulse_Pressure``, ``bp_interaction`` and
    ``cholesterol_glucose_ratio`` are appended as the last columns.
    """
    def _fill_with_own_median(values):
        # Fill the gaps of one group with that group's median
        return values.fillna(values.median())

    # gender is not used by this feature set (drop() already returns a new frame)
    frame = df.drop(columns=['gender'], errors='ignore')

    # Negative bone_density values are replaced by 0
    bone = frame['bone_density']
    frame['bone_density'] = bone.mask(bone < 0, 0)

    for name in ('medical_history', 'family_medical_history', 'edu_level'):
        frame[name] = frame[name].fillna('unknown')

    # Group-level median first, overall median for anything still missing
    by_group = frame.groupby(['smoke_status', 'edu_level'])['mean_working'].transform(_fill_with_own_median)
    frame['mean_working'] = by_group.fillna(by_group.median())

    # One-hot encode the categorical columns (gender excluded), dropping the
    # first level of each
    frame = pd.get_dummies(
        frame,
        columns=['activity', 'smoke_status', 'sleep_pattern',
                 'medical_history', 'family_medical_history', 'edu_level'],
        drop_first=True,
    )

    # Derived features (no Hypertension_flag in this variant)
    systolic = frame['systolic_blood_pressure']
    diastolic = frame['diastolic_blood_pressure']
    return frame.assign(
        BMI=frame['weight'] / (frame['height'] / 100) ** 2,
        Pulse_Pressure=systolic - diastolic,
        bp_interaction=systolic * diastolic,
        cholesterol_glucose_ratio=frame['cholesterol'] / (frame['glucose'] + 1),
    )

# 3. Apply the preprocessing (the function works on its own copy of the input)
train_final = preprocess_and_feature_engineer(train_df)
test_final = preprocess_and_feature_engineer(test_df)

# 4. Align train/test columns
train_cols = set(train_final.columns)
test_cols = set(test_final.columns)
# Keep the train frame's column order instead of iterating a set: string
# hashing is randomized per interpreter process, so the iteration order of a
# set of column names (and therefore the feature order fed to the models)
# can differ from one run to the next.
common_cols = [
    col for col in train_final.columns
    if col in test_cols and col not in ('ID', 'stress_score')
]

# Low-importance features (removed)
low_importance_features = ['sleep_pattern_sleep difficulty', 'activity_moderate', 'mean_working']
final_cols = [col for col in common_cols if col not in low_importance_features]

X_train = train_final[final_cols]
y_train = train_final['stress_score']
X_test = test_final[final_cols]

# 5. Train the models and predict
# LightGBM model (with the selected parameters)
lgb_model = lgb.LGBMRegressor(
    random_state=42,
    n_estimators=300,
    learning_rate=0.1,
    num_leaves=40
)
lgb_model.fit(X_train, y_train)
lgb_predictions = lgb_model.predict(X_test)

# XGBoost model (with the selected parameters)
xgb_model = xgb.XGBRegressor(
    random_state=42,
    n_estimators=500,
    learning_rate=0.1,
    max_depth=7,
    n_jobs=-1
)
xgb_model.fit(X_train, y_train)
xgb_predictions = xgb_model.predict(X_test)

# 6. Blend the two models' predictions: weighted average of LightGBM and XGBoost
weight_lgb = 0.6
weight_xgb = 0.4
weighted_predictions = (weight_lgb * lgb_predictions) + (weight_xgb * xgb_predictions)

# 7. Write the submission file
submission_df['stress_score'] = weighted_predictions
submission_df.to_csv('../data/submission_weighted_final_corrected.csv', index=False)

print("---")
print("submission_weighted_final_corrected.csv 파일이 성공적으로 생성되었습니다! 이 파일을 제출해서 점수를 확인해 봐.")

---
submission_weighted_final_corrected.csv 파일이 성공적으로 생성되었습니다! 이 파일을 제출해서 점수를 확인해 봐.
