In [None]:
# Import Library
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, f1_score

# Load Data
# Read the training and test tables from CSV files in the working directory.
train = pd.read_csv(filepath_or_buffer="./train.csv")
test = pd.read_csv(filepath_or_buffer="./test.csv")

# Data Preprocessing
# SUBCLASS is categorical, so its labels are mapped to integer codes with a
# LabelEncoder. The fitted encoder is kept so that predictions can later be
# mapped back to the original labels.
le_subclass = LabelEncoder()
le_subclass.fit(train['SUBCLASS'])
train['SUBCLASS'] = le_subclass.transform(train['SUBCLASS'])

# Separate the target (encoded SUBCLASS) from the features
# (every column except SUBCLASS and ID).
y_subclass = train['SUBCLASS']
X = train.drop(columns=['SUBCLASS', 'ID'])

# One-hot encode every feature column of dtype object or category.
categorical_columns = X.select_dtypes(include=['object', 'category']).columns
X_encoded = pd.get_dummies(data=X, columns=categorical_columns)

### 1. Dimensionality reduction with RandomForest ###
# Fit a RandomForest on all one-hot encoded features to obtain per-feature
# importance scores.
# NOTE(review): the importances are computed on every training row, including
# the rows that are later held out for validation, so the validation scores
# below may be optimistic.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_encoded, y_subclass)

# Rank the features from most to least important and accumulate their
# importances in that order.
feature_importances = rf.feature_importances_
sorted_idx = feature_importances.argsort()[::-1]
cumulative_importance = feature_importances[sorted_idx].cumsum()

# Keep the top-ranked features whose cumulative importance stays within 0.7.
# The cumulative sum is non-decreasing, so those features form a prefix of the
# ranking. At least one feature is always kept: without the max(1, ...) guard a
# single feature with importance above 0.7 would produce an empty selection and
# an X_reduced with no columns.
n_selected = max(1, int((cumulative_importance <= 0.7).sum()))
important_features_idx = sorted_idx[:n_selected]
X_reduced = X_encoded.iloc[:, important_features_idx]

# Hold out 20% of the rows for validation (fixed seed for reproducibility).
X_train, X_val, y_train, y_val = train_test_split(X_reduced, y_subclass, test_size=0.2, random_state=42)

### 2. Model configuration ###
# Hyperparameters below are described by the original author as optimized;
# the tuning procedure itself is not part of this notebook.

# XGBoost
xgb_params = dict(
    n_estimators=233,
    max_depth=5,
    learning_rate=0.09066897671128973,
    subsample=0.7445604876879595,
    colsample_bytree=0.6030050347739594,
    random_state=42,
    eval_metric='mlogloss',
    use_label_encoder=False,
)
xgb_model = xgb.XGBClassifier(**xgb_params)

# RandomForest
rf_params = dict(
    n_estimators=349,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=3,
    random_state=42,
)
rf_model = RandomForestClassifier(**rf_params)

# LightGBM
lgb_params = dict(
    n_estimators=259,
    max_depth=3,
    learning_rate=0.040719770187575215,
    subsample=0.9981840021295828,
    colsample_bytree=0.9255086919552659,
    random_state=42,
)
lgb_model = lgb.LGBMClassifier(**lgb_params)

# Logistic Regression
lr_params = dict(max_iter=200, random_state=42)
lr_model = LogisticRegression(**lr_params)

### 3. Stacking ensemble ###
# Base learners, each paired with the name StackingClassifier uses to refer to it.
estimators = list(zip(
    ('xgb', 'rf', 'lr', 'lgb'),
    (xgb_model, rf_model, lr_model, lgb_model),
))

# A default LogisticRegression is the final meta-model; cv=5 is passed through
# to StackingClassifier.
meta_model = LogisticRegression()
stacking_model = StackingClassifier(estimators=estimators, final_estimator=meta_model, cv=5)

# Train the stacking model on the training split.
stacking_model.fit(X_train, y_train)

# Predict on the validation split and report log loss and macro F1.
y_val_pred = stacking_model.predict(X_val)
y_val_pred_proba = stacking_model.predict_proba(X_val)

# predict_proba has one column per class seen during fit. Passing those classes
# as `labels` keeps log_loss aligned with the columns even when the validation
# split happens to lack one of the training classes (without it log_loss infers
# the labels from y_val alone and fails on the column-count mismatch).
validation_log_loss = log_loss(y_val, y_val_pred_proba, labels=stacking_model.classes_)
validation_macro_f1 = f1_score(y_val, y_val_pred, average='macro')
print(f"Validation Log Loss: {validation_log_loss}")
print(f"Validation Macro F1 Score: {validation_macro_f1}")

# Inference
# Drop the ID column, one-hot encode the same categorical columns as in
# training, then align to the training columns: dummy columns absent from the
# test data are added and filled with 0, and columns not seen in training are
# dropped.
test_X = test.drop(columns=['ID'])
X_encoded_test = pd.get_dummies(test_X, columns=categorical_columns).reindex(
    columns=X_encoded.columns,
    fill_value=0,
)

# Keep only the features selected on the training data (same positional indices).
X_reduced_test = X_encoded_test.iloc[:, important_features_idx]

# Predict encoded classes and map them back to the original SUBCLASS labels.
predictions = stacking_model.predict(X=X_reduced_test)
original_labels = le_subclass.inverse_transform(y=predictions)

# Submission
# Fill the SUBCLASS column of the sample submission with the predicted labels
# and write the result to CSV.
submission = pd.read_csv("./sample_submission.csv").assign(SUBCLASS=original_labels)
submission.to_csv(
    path_or_buf='./submission_with_onehot_0.7.csv',
    index=False,
    encoding='UTF-8-sig',
)


In [1]:
# Import Library
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, f1_score

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Load Data
# Read the training and test tables from CSV files in the working directory.
train = pd.read_csv(filepath_or_buffer="./train.csv")
test = pd.read_csv(filepath_or_buffer="./test.csv")

In [None]:
# Data Preprocessing
# SUBCLASS is categorical, so its labels are mapped to integer codes with a
# LabelEncoder. The fitted encoder is kept for mapping predictions back.
# NOTE(review): this overwrites train['SUBCLASS'] in place, so re-running this
# cell without reloading `train` would fit the encoder on the already-encoded
# integers and lose the original label names.
le_subclass = LabelEncoder()
le_subclass.fit(train['SUBCLASS'])
train['SUBCLASS'] = le_subclass.transform(train['SUBCLASS'])

In [None]:
# Separate the target (encoded SUBCLASS) from the features
# (every column except SUBCLASS and ID).
y_subclass = train['SUBCLASS']
X = train.drop(columns=['SUBCLASS', 'ID'])

In [None]:
# One-hot encode every feature column of dtype object or category.
categorical_dtypes = ['object', 'category']
categorical_columns = X.select_dtypes(include=categorical_dtypes).columns
X_encoded = pd.get_dummies(data=X, columns=categorical_columns)

In [None]:
### 1. Dimensionality reduction with RandomForest ###
# Fit a RandomForest on all one-hot encoded features; its importance scores
# drive the feature selection in the following cells.
rf = RandomForestClassifier(random_state=42, n_estimators=100)
rf.fit(X=X_encoded, y=y_subclass)

In [None]:
# Rank the features from most to least important and accumulate their
# importances in that order.
feature_importances = rf.feature_importances_
ascending_idx = feature_importances.argsort()
sorted_idx = ascending_idx[::-1]
ranked_importances = feature_importances[sorted_idx]
cumulative_importance = ranked_importances.cumsum()

In [None]:
# Keep the top-ranked features whose cumulative importance stays within 0.7.
# The cumulative sum is non-decreasing, so those features form a prefix of the
# ranking. At least one feature is always kept: without the max(1, ...) guard a
# single feature with importance above 0.7 would produce an empty selection and
# an X_reduced with no columns.
n_selected = max(1, int((cumulative_importance <= 0.7).sum()))
important_features_idx = sorted_idx[:n_selected]
X_reduced = X_encoded.iloc[:, important_features_idx]

# Hold out 20% of the rows for validation (fixed seed for reproducibility).
X_train, X_val, y_train, y_val = train_test_split(X_reduced, y_subclass, test_size=0.2, random_state=42)

In [None]:
### 2. Model configuration ###
# Hyperparameters below are described by the original author as optimized;
# the tuning procedure itself is not part of this notebook.

# XGBoost
xgb_params = dict(
    n_estimators=233,
    max_depth=5,
    learning_rate=0.09066897671128973,
    subsample=0.7445604876879595,
    colsample_bytree=0.6030050347739594,
    random_state=42,
    eval_metric='mlogloss',
    use_label_encoder=False,
)
xgb_model = xgb.XGBClassifier(**xgb_params)

# RandomForest
rf_params = dict(
    n_estimators=349,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=3,
    random_state=42,
)
rf_model = RandomForestClassifier(**rf_params)

# LightGBM
lgb_params = dict(
    n_estimators=259,
    max_depth=3,
    learning_rate=0.040719770187575215,
    subsample=0.9981840021295828,
    colsample_bytree=0.9255086919552659,
    random_state=42,
)
lgb_model = lgb.LGBMClassifier(**lgb_params)

# Logistic Regression
lr_params = dict(max_iter=200, random_state=42)
lr_model = LogisticRegression(**lr_params)

In [None]:
### 3. Stacking ensemble ###
# Base learners, each paired with the name StackingClassifier uses to refer to it.
estimators = list(zip(
    ('xgb', 'rf', 'lr', 'lgb'),
    (xgb_model, rf_model, lr_model, lgb_model),
))

# A default LogisticRegression is the final meta-model; cv=5 is passed through
# to StackingClassifier.
meta_model = LogisticRegression()
stacking_model = StackingClassifier(estimators=estimators, final_estimator=meta_model, cv=5)

# Train the stacking model on the training split.
stacking_model.fit(X_train, y_train)

In [None]:
# Predict on the validation split and report log loss and macro F1.
y_val_pred = stacking_model.predict(X_val)
y_val_pred_proba = stacking_model.predict_proba(X_val)

# predict_proba has one column per class seen during fit. Passing those classes
# as `labels` keeps log_loss aligned with the columns even when the validation
# split happens to lack one of the training classes (without it log_loss infers
# the labels from y_val alone and fails on the column-count mismatch).
validation_log_loss = log_loss(y_val, y_val_pred_proba, labels=stacking_model.classes_)
validation_macro_f1 = f1_score(y_val, y_val_pred, average='macro')
print(f"Validation Log Loss: {validation_log_loss}")
print(f"Validation Macro F1 Score: {validation_macro_f1}")

In [None]:
# Inference
# Drop the ID column, one-hot encode the same categorical columns as in
# training, then align to the training columns: dummy columns absent from the
# test data are added and filled with 0, and columns not seen in training are
# dropped.
test_X = test.drop(columns=['ID'])
X_encoded_test = pd.get_dummies(test_X, columns=categorical_columns).reindex(
    columns=X_encoded.columns,
    fill_value=0,
)

# Keep only the features selected on the training data (same positional indices).
X_reduced_test = X_encoded_test.iloc[:, important_features_idx]

In [None]:
# Final prediction: predict encoded classes for the test rows, then map them
# back to the original SUBCLASS labels with the fitted encoder.
predictions = stacking_model.predict(X=X_reduced_test)
original_labels = le_subclass.inverse_transform(y=predictions)

In [None]:
# Submission
# Fill the SUBCLASS column of the sample submission with the predicted labels
# and write the result to CSV.
submission = pd.read_csv("./sample_submission.csv").assign(SUBCLASS=original_labels)
submission.to_csv(
    path_or_buf='./submission_with_onehot_0.7.csv',
    index=False,
    encoding='UTF-8-sig',
)