In [None]:
!pip install catboost



In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import LabelEncoder

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

In [None]:
# Base directory for this project's inputs and outputs; the data files are
# read from `path + 'data/...'` in the loading cell.
# NOTE(review): hard-coded absolute path — presumably a mounted Google Drive
# folder in Colab; confirm the drive is mounted before running.
path = '/content/drive/MyDrive/is665_004/Final/'

In [None]:
# Load the train and test CSV files located under the project's data folder.
train_df, test_df = (
    pd.read_csv(path + relative_csv)
    for relative_csv in ('data/train.csv', 'data/test.csv')
)

In [None]:
# Display the training frame (pandas abbreviates the middle rows and columns).
train_df

Unnamed: 0,ID,Age,Gender,Education_Status,Employment_Status,Working_Week (Yearly),Industry_Status,Occupation_Status,Race,Hispanic_Origin,...,Citizenship,Birth_Country,Birth_Country (Father),Birth_Country (Mother),Tax_Status,Gains,Losses,Dividends,Income_Status,Income
0,TRAIN_00000,63,M,Middle (7-8),Full-Time,4,Social Services,Services,White,All other,...,Native,US,US,US,Nonfiler,0,0,0,Unknown,425
1,TRAIN_00001,37,M,Associates degree (Vocational),Full-Time,52,Entertainment,Services,White,All other,...,Native,US,US,US,Single,0,0,0,Under Median,0
2,TRAIN_00002,58,F,High graduate,Full-Time,52,Manufacturing (Non-durable),Admin Support (include Clerical),Black,All other,...,Native,US,US,US,Married Filling Jointly both under 65 (MFJ),3411,0,0,Under Median,860
3,TRAIN_00003,44,M,High graduate,Full-Time,52,Retail,Technicians & Support,White,All other,...,Native,US,US,US,Single,0,0,0,Under Median,850
4,TRAIN_00004,37,F,High graduate,Full-Time,52,Retail,Sales,White,All other,...,Native,US,US,US,Head of Household (HOH),0,0,0,Unknown,570
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,TRAIN_19995,33,M,High graduate,Children or Armed Forces,52,Manufacturing (Durable),Handlers/Cleaners,White,All other,...,Native,US,US,US,Single,0,0,0,Under Median,1300
19996,TRAIN_19996,20,F,College,Full-Time,12,Education,Admin Support (include Clerical),White,Mexican-American,...,Native,US,Mexico,Mexico,Nonfiler,0,0,0,Under Median,850
19997,TRAIN_19997,22,M,College,Children or Armed Forces,52,Transportation,Technicians & Support,White,All other,...,Native,US,US,US,Single,0,0,0,Unknown,999
19998,TRAIN_19998,76,F,High graduate,Not Working,0,Not in universe or children,Unknown,White,All other,...,Native,US,Scotland,England,Single,0,0,0,Under Median,0


In [None]:
# Summarise the column names, non-null counts and dtypes of the training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 23 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   ID                      20000 non-null  object
 1   Age                     20000 non-null  int64 
 2   Gender                  20000 non-null  object
 3   Education_Status        20000 non-null  object
 4   Employment_Status       20000 non-null  object
 5   Working_Week (Yearly)   20000 non-null  int64 
 6   Industry_Status         20000 non-null  object
 7   Occupation_Status       20000 non-null  object
 8   Race                    20000 non-null  object
 9   Hispanic_Origin         20000 non-null  object
 10  Martial_Status          20000 non-null  object
 11  Household_Status        20000 non-null  object
 12  Household_Summary       20000 non-null  object
 13  Citizenship             20000 non-null  object
 14  Birth_Country           20000 non-null  object
 15  Bi

## train 데이터셋 전처리

In [None]:
# Remove the 'ID' column from the training frame before encoding and modelling.
train_df = train_df.drop('ID', axis=1)

In [None]:
# Label-encode the categorical (object-dtype) columns of the training frame.
#
# A separate LabelEncoder is fitted for every column and stored in
# `label_encoders` (column name -> fitted encoder). Re-fitting one shared
# encoder would overwrite its `classes_` on each iteration, leaving only the
# last column's mapping available afterwards.
label_encoders = {}

# Kept for backward compatibility: after the loop this name refers to the
# encoder fitted on the last categorical column, as it did originally.
label_encoder = LabelEncoder()

categorical_cols = train_df.select_dtypes(include=['object']).columns
for col in categorical_cols:
  label_encoder = LabelEncoder()
  # Values are cast to str so that missing values become the class 'nan'.
  train_df[col] = label_encoder.fit_transform(train_df[col].astype(str))
  label_encoders[col] = label_encoder

## target column 분리

In [None]:
# Separate the regression target ('Income') from the feature columns.
y_train = train_df['Income']
X_train = train_df.drop('Income', axis=1)

## 모델 훈련

In [None]:
# Candidate regressors keyed by display name, each seeded with random_state=42.
# The dict's insertion order is the order in which the models are evaluated.
models = {
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "XGBoost": XGBRegressor(random_state=42),
    # verbose=-1 silences LightGBM's per-fit [Info] log lines, matching the
    # already-silenced CatBoost below, so cell output stays readable.
    "LightGBM": LGBMRegressor(random_state=42, verbose=-1),
    "CatBoost": CatBoostRegressor(random_state=42, verbose=False)
}

In [None]:
# Evaluate every candidate model with shuffled 5-fold cross-validation.
# `rmse_scores` maps each model name to the list of its per-fold RMSE values.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmse_scores = {}

for model_name, model in models.items():
    rmse_scores[model_name] = []
    for train_index, val_index in kf.split(X_train):
        X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

        model.fit(X_train_fold, y_train_fold)

        # Predict on the validation fold and compute its RMSE.
        # RMSE is taken as sqrt(MSE) explicitly: the `squared=False` argument
        # of mean_squared_error was deprecated in scikit-learn 1.4 and removed
        # in 1.6, whereas this form works on every version.
        val_preds = model.predict(X_val_fold)
        rmse = np.sqrt(mean_squared_error(y_val_fold, val_preds))
        rmse_scores[model_name].append(rmse)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002912 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 770
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 21
[LightGBM] [Info] Start training from score 556.153500
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002720 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 764
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 21
[LightGBM] [Info] Start training from score 555.873750
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003109 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not eno

In [None]:
# Report each model's mean RMSE across the k-fold cross-validation runs.
for model_name in rmse_scores:
    scores = rmse_scores[model_name]
    print(f"{model_name}: Mean RMSE - {np.mean(scores)}")

Random Forest: Mean RMSE - 626.3519812800967
Gradient Boosting: Mean RMSE - 598.8592937297442
XGBoost: Mean RMSE - 618.5430357563104
LightGBM: Mean RMSE - 595.2660220437843
CatBoost: Mean RMSE - 602.089069795253


In [None]:
# Select the model with the lowest MEAN cross-validation RMSE and retrain it
# on the full training data.
# `rmse_scores` values are lists of per-fold scores, so the key must reduce
# each list to its mean; comparing the lists directly would order them
# lexicographically, i.e. by the first fold's score only.
best_model_name = min(rmse_scores, key=lambda name: np.mean(rmse_scores[name]))
best_model = models[best_model_name]
best_model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003095 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 777
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 21
[LightGBM] [Info] Start training from score 554.565250


## test 데이터셋 전처리

In [None]:
# Keep the test IDs for the submission file, then remove the 'ID' column so
# only feature columns remain in the test frame.
test_ids = test_df['ID']
test_df = test_df.drop('ID', axis='columns')

In [None]:
# Encode the test categoricals with the same per-column mapping as training.
#
# The training frame has already been overwritten with integer codes, so the
# original category strings are re-read from the raw training CSV. A fresh
# LabelEncoder is fitted per column on exactly the values the training cell
# used (`astype(str)`), which reproduces the same class -> code assignment.
# Using one shared encoder here would check and transform every column against
# the classes of whichever column it was fitted on last.
raw_train_df = pd.read_csv(path + 'data/train.csv')

for col in categorical_cols:
  col_encoder = LabelEncoder().fit(raw_train_df[col].astype(str))
  code_of = {cls: code for code, cls in enumerate(col_encoder.classes_)}
  # Categories unseen in training fall back to this column's 'Unknown' class
  # when it has one, otherwise to -1 (a code no training category uses).
  fallback = code_of.get('Unknown', -1)
  test_df[col] = test_df[col].astype(str).map(code_of).fillna(fallback).astype(int)

## test 데이터셋 예측

In [None]:
# Predict Income for the test set with the retrained best model.
# Columns are selected by the training feature names so the feature order
# matches X_train rather than relying on the CSV's implicit column order;
# a missing feature raises a KeyError instead of silently shifting columns.
test_preds = best_model.predict(test_df[X_train.columns])

## submission 파일 생성

In [None]:
# Assemble the submission table: the test IDs alongside their predicted Income.
submission_df = pd.DataFrame({'ID': test_ids})
submission_df['Income'] = test_preds
submission_df
#submission_df.to_csv(path + 'submission/submission_20240402(2).csv', index=False)

Unnamed: 0,ID,Income
0,TEST_0000,597.278354
1,TEST_0001,765.793977
2,TEST_0002,872.154198
3,TEST_0003,874.376747
4,TEST_0004,636.047869
...,...,...
9995,TEST_9995,970.132633
9996,TEST_9996,1092.971681
9997,TEST_9997,701.365395
9998,TEST_9998,633.044912
