# 1. 데이터 로딩 & 기본 전처리

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# 1-1 / 1-2) Read the training table and discard the two columns that are
#            not used for modelling.
df = pd.read_csv('train.csv').drop(columns=['id', 'Policy Start Date'])

# 1-3) Previous Claims
#      - keep a 0/1 indicator of which rows were missing,
#      - impute missing values with 0,
#      - cap the upper tail at the 95th percentile, then store as uint8.
prev_claims = df['Previous Claims']
df['PrevClaims_na'] = prev_claims.isna().astype(int)
prev_claims = prev_claims.fillna(0)
clip_val = prev_claims.quantile(0.95)
df['Previous Claims'] = prev_claims.clip(upper=clip_val).astype('uint8')

# 1-4) Health Score
#      - keep a 0/1 indicator of which rows were missing,
#      - impute missing values with the column median,
#      - clip to the 1.5 * IQR fences, then store as float32.
health = df['Health Score']
df['Health_na'] = health.isna().astype(int)
median_health = health.median()
health = health.fillna(median_health)
q1, q3 = health.quantile([0.25, 0.75])
iqr = q3 - q1
lower = q1 - 1.5 * iqr
upper = q3 + 1.5 * iqr
df['Health Score'] = health.clip(lower, upper).astype('float32')


# 2. 도메인 기반 구간화 (Binning)


In [3]:
# Domain-driven binning. pd.cut uses right-closed intervals, so a value equal
# to an edge falls into the bin on its left; values outside the outermost
# edges (and missing values) become NaN.

# Age: (17, 25], (25, 35], (35, 45], (45, 55], (55, 100]
bins_age = [17, 25, 35, 45, 55, 100]
labels_age = ['18-25', '26-35', '36-45', '46-55', '56+']
df['Age_bin'] = pd.cut(df['Age'], bins=bins_age, labels=labels_age)

# Credit Score: (0, 580], (580, 670], (670, 740], (740, 800], (800, 1000]
bins_cr = [0, 580, 670, 740, 800, 1000]
labels_cr = ['Poor', 'Fair', 'Good', 'VeryGood', 'Excellent']
df['Credit_bin'] = pd.cut(df['Credit Score'], bins=bins_cr, labels=labels_cr)

# Health Score: quantile-based split — bottom quarter / middle half / top quarter.
df['Health_bin'] = pd.qcut(df['Health Score'], q=[0, 0.25, 0.75, 1.0],
                           labels=['Low', 'Mid', 'High'])

# Previous Claims: 0 / 1-2 / 3+.
# The top edge is open-ended (np.inf) rather than the observed column maximum:
# if the maximum were <= 2 the edges [-1, 0, 2, max] would be duplicated or
# non-monotonic and pd.cut would raise ValueError. For any maximum above 2 the
# resulting labels are identical to using the maximum as the last edge.
df['PrevClaims_bin'] = pd.cut(df['Previous Claims'],
                              bins=[-1, 0, 2, np.inf],
                              labels=['0', '1-2', '3+'])


# 3. 합성 리스크 스코어 생성


In [6]:
# 3-1) Convert each binned column to a numeric level.
#      The bin labels are cast to str, looked up in the corresponding mapping,
#      and the result is cast to float.
#      NOTE(review): map_credit / map_health / map_prev and weights are not
#      defined in this cell — they must already exist in the session.
def _bin_to_level(bin_col, mapping):
    return df[bin_col].astype(str).map(mapping).astype(float)


credit_num = _bin_to_level('Credit_bin', map_credit)
health_num = _bin_to_level('Health_bin', map_health)
prev_num = _bin_to_level('PrevClaims_bin', map_prev)

# 3-2) Composite risk score: weighted sum of the three numeric levels.
weighted_credit = credit_num * weights['Credit']
weighted_health = health_num * weights['Health']
weighted_prev = prev_num * weights['PrevClaims']
df['Risk_Score'] = weighted_credit + weighted_health + weighted_prev

# 4. 인코딩 & 모델 입력 준비

In [13]:
from sklearn.preprocessing import LabelEncoder

# 4-1) Columns produced by the binning step.
bin_cols = ['Age_bin', 'Credit_bin', 'Health_bin', 'PrevClaims_bin']

# 4-2) Remaining categorical columns (adjust as needed).
other_cat = ['Gender', 'Marital Status', 'Occupation', 'Location',
             'Policy Type', 'Smoking Status', 'Exercise Frequency',
             'Education Level', 'Number of Dependents', 'Property Type']

# Label-encode every categorical column into a new '<name>_le' column.
# Values are cast to str first, and each fitted encoder is kept in le_dict
# under the original column name.
le_dict = {}
for col in bin_cols + other_cat:
    le = LabelEncoder()
    df[f'{col}_le'] = le.fit_transform(df[col].astype(str))
    le_dict[col] = le

# 4-3) Final model inputs: encoded categoricals followed by numeric columns.
cat_cols_le = [f'{name}_le' for name in bin_cols + other_cat]
num_cols = [
    'Age', 'Annual Income', 'Previous Claims', 'Health Score', 'Vehicle Age',
    'Credit Score', 'Insurance Duration', 'Risk_Score',
]  # adjust as needed

X = df[cat_cols_le + num_cols]
y = df['Premium Amount']



# 5. 학습 & 평가 (CatBoost 회귀)

In [14]:
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# 5-1) Hold out 20% of the rows for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42,
)

# 5-2) Wrap both splits in CatBoost Pools; the label-encoded columns are
#      declared as categorical features.
train_pool = Pool(X_train, y_train, cat_features=cat_cols_le)
val_pool = Pool(X_val, y_val, cat_features=cat_cols_le)

# 5-3) Fit the regressor, monitoring the validation pool for early stopping.
cb_params = {
    'iterations': 1000,
    'learning_rate': 0.05,
    'depth': 6,
    'random_seed': 42,
    'early_stopping_rounds': 50,
    'verbose': 100,
}
model = CatBoostRegressor(**cb_params)
model.fit(train_pool, eval_set=val_pool)

# 5-4) Predict on the validation rows.
preds = model.predict(X_val)

# 5-5) Score the predictions.
mse = mean_squared_error(y_val, preds)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_val, preds)
r2 = r2_score(y_val, preds)

print(f"Validation RMSE : {rmse:.4f}")
print(f"Validation MAE  : {mae:.4f}")
print(f"Validation R²   : {r2:.4f}")


0:	learn: 864.1600341	test: 863.4862553	best: 863.4862553 (0)	total: 976ms	remaining: 16m 15s
100:	learn: 850.8071590	test: 850.6957856	best: 850.6957856 (100)	total: 1m 8s	remaining: 10m 12s
200:	learn: 849.4974670	test: 849.6279905	best: 849.6279905 (200)	total: 2m 7s	remaining: 8m 27s
300:	learn: 848.4709070	test: 848.7616923	best: 848.7616923 (299)	total: 2m 59s	remaining: 6m 57s
400:	learn: 847.6067472	test: 848.1117011	best: 848.1117011 (400)	total: 4m 2s	remaining: 6m 1s
500:	learn: 847.1311863	test: 847.8346045	best: 847.8346045 (500)	total: 5m 2s	remaining: 5m
600:	learn: 846.7424752	test: 847.5839800	best: 847.5839800 (600)	total: 6m 4s	remaining: 4m 1s
700:	learn: 846.4097948	test: 847.4541511	best: 847.4536275 (697)	total: 7m 6s	remaining: 3m 1s
800:	learn: 846.0274140	test: 847.2518577	best: 847.2501992 (796)	total: 8m 9s	remaining: 2m 1s
900:	learn: 845.7958317	test: 847.1754991	best: 847.1754991 (900)	total: 9m 11s	remaining: 1m
999:	learn: 845.5009440	test: 847.0792028	

In [15]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from catboost import CatBoostRegressor, Pool

# 1) 5-fold split, shuffled with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# 2) Per-fold metric accumulators.
rmses, maes, r2s = [], [], []

# Hyper-parameters shared by every fold (training log suppressed).
cv_params = {
    'iterations': 500,
    'learning_rate': 0.05,
    'depth': 6,
    'random_seed': 42,
    'early_stopping_rounds': 50,
    'verbose': 0,
}

# 3) Train and score one model per fold.
# NOTE(review): each fold's validation rows are used both as the early-stopping
# eval_set and for the reported metrics, so the scores may be slightly
# optimistic — confirm whether that is acceptable.
for train_idx, val_idx in kf.split(X):
    X_tr, y_tr = X.iloc[train_idx], y.iloc[train_idx]
    X_va, y_va = X.iloc[val_idx], y.iloc[val_idx]

    train_pool = Pool(X_tr, y_tr, cat_features=cat_cols_le)
    val_pool = Pool(X_va, y_va, cat_features=cat_cols_le)

    model = CatBoostRegressor(**cv_params)
    model.fit(train_pool, eval_set=val_pool)

    preds = model.predict(X_va)

    rmses.append(np.sqrt(mean_squared_error(y_va, preds)))
    maes.append(mean_absolute_error(y_va, preds))
    r2s.append(r2_score(y_va, preds))

# 4) Report mean ± standard deviation across folds.
print(f"CV RMSE : {np.mean(rmses):.4f} ± {np.std(rmses):.4f}")
print(f"CV MAE  : {np.mean(maes):.4f} ± {np.std(maes):.4f}")
print(f"CV R²   : {np.mean(r2s):.4f} ± {np.std(r2s):.4f}")


CV RMSE : 847.8654 ± 0.8391
CV MAE  : 647.6859 ± 0.1465
CV R²   : 0.0392 ± 0.0008
