# 1. 데이터 로드 & 날짜 파생 후 불필요 컬럼 제거


In [1]:
import pandas as pd
import numpy as np
from textblob import TextBlob

# 1-1) Load the raw training table.
df = pd.read_csv('train.csv')

# 1-2) Parse the policy start date; values that cannot be parsed become NaT.
df['PolicyStartDate'] = pd.to_datetime(df['Policy Start Date'], errors='coerce')

# 1-3) Drop the row identifier and the raw date string (replaced by the parsed column).
df.drop(columns=['id', 'Policy Start Date'], inplace=True)

# 1-4) Previous Claims: flag missing values, impute 0, cap at the 95th percentile.
prev_claims = df['Previous Claims']
df['PrevClaims_na'] = prev_claims.isna().astype(int)
prev_claims = prev_claims.fillna(0)
# The percentile is taken AFTER imputation, so the filled zeros influence the cap.
clip95 = prev_claims.quantile(0.95)
df['Previous Claims'] = prev_claims.clip(upper=clip95).astype('uint8')

# 1-5) Health Score: flag missing values, impute the median, clip to the
#      Tukey fences (1.5 * IQR beyond the quartiles).
health = df['Health Score']
df['Health_na'] = health.isna().astype(int)
med_h = health.median()
health = health.fillna(med_h)
# Quartiles are computed on the imputed series.
q1, q3 = health.quantile([0.25, 0.75])
iqr = q3 - q1
lower_fence = q1 - 1.5*iqr
upper_fence = q3 + 1.5*iqr
df['Health Score'] = health.clip(lower_fence, upper_fence).astype('float32')



# 2. 풍부한 파생변수 생성


In [2]:
# 2-1) Date-derived features (reuses the PolicyStartDate column parsed earlier).
policy_start = df['PolicyStartDate']
df['PolicyStartMonth']   = policy_start.dt.month
df['PolicyStartQuarter'] = policy_start.dt.quarter
# NOTE(review): the reference date is hard-coded, so TenureDays is measured
# relative to 2025-08-05 no matter when this is run.
today                    = pd.to_datetime('2025-08-05')
df['TenureDays']         = (today - policy_start).dt.days

# 2-2) Income-derived features.
# The +1 in the denominator avoids dividing by zero for policyholders with no
# dependents; missing inputs propagate as NaN.
df['Income_per_Dependent'] = df['Annual Income'] / (1 + df['Number of Dependents'])
df['Income_log']           = np.log1p(df['Annual Income'])

# 2-3) Pairwise interaction features.
df['Age_Health']       = df['Age'] * df['Health Score']
df['Credit_Health']    = df['Credit Score'] * df['Health Score']
df['Credit_Income']    = df['Credit Score'] * df['Annual Income']

# 2-4) Text features from the customer feedback column.
# Missing feedback is treated as an empty string.
df['Customer Feedback']   = df['Customer Feedback'].fillna('')
feedback                  = df['Customer Feedback']
df['Feedback_Len']        = feedback.str.len()
df['Feedback_WordCnt']    = feedback.str.split().map(len)
# Score each distinct feedback text once and map the result back to the rows,
# instead of constructing a TextBlob per row. The values are identical, but the
# cost scales with the number of unique texts rather than the number of rows.
polarity_by_text = {
    text: TextBlob(text).sentiment.polarity
    for text in feedback.unique()
}
df['Feedback_Polarity']   = feedback.map(polarity_by_text)

# 2-5) Vehicle age buckets and duration feature.
# Right-closed bins: (-1, 3], (3, 7], (7, 100]; values outside become NaN.
df['VehicleAge_bin']   = pd.cut(df['Vehicle Age'],
                                bins=[-1, 3, 7, 100], labels=['0-3', '4-7', '8+'])
# NOTE(review): dividing by 365 presumes 'Insurance Duration' is expressed in
# days — confirm the unit of this column against the data dictionary.
df['TenureYears']      = df['Insurance Duration'] / 365.0


# 3. 도메인 기반 Binning & Risk Score


In [3]:
# 3-1) Binning
# pd.cut bins are right-closed, e.g. Age (17, 25] -> '18-25'. Values outside
# the outermost edges (and missing values) become NaN.
df['Age_bin']         = pd.cut(df['Age'],
                               bins=[17, 25, 35, 45, 55, 100],
                               labels=['18-25', '26-35', '36-45', '46-55', '56+'])
df['Credit_bin']      = pd.cut(df['Credit Score'],
                               bins=[0, 580, 670, 740, 800, 1000],
                               labels=['Poor', 'Fair', 'Good', 'VeryGood', 'Excellent'])
# Quantile-based: bottom 25% -> Low, middle 50% -> Mid, top 25% -> High.
df['Health_bin']      = pd.qcut(df['Health Score'],
                                q=[0, 0.25, 0.75, 1.0],
                                labels=['Low', 'Mid', 'High'])
# The upper edge is open-ended (np.inf) rather than the observed column max.
# 'Previous Claims' was clipped at its 95th percentile upstream, so its max can
# be <= 2, which would make the edges [-1, 0, 2, max] duplicated or
# non-increasing and cause pd.cut to raise ValueError. With max > 2 the
# resulting bins are identical.
df['PrevClaims_bin']  = pd.cut(df['Previous Claims'],
                               bins=[-1, 0, 2, np.inf],
                               labels=['0', '1-2', '3+'])

# 3-2) Composite Risk Score: weighted sum of the ordinal bin ranks.
map_credit = {'Poor': 1, 'Fair': 2, 'Good': 3, 'VeryGood': 4, 'Excellent': 5}
map_health = {'Low': 1, 'Mid': 2, 'High': 3}
map_prev   = {'0': 0, '1-2': 1, '3+': 2}
weights    = {'Credit': 1.0, 'Health': 0.8, 'PrevClaims': 1.2}

# Bin labels are cast to str before mapping; a missing bin becomes the string
# 'nan', which is absent from the maps, so the component (and therefore the
# whole score for that row) is NaN.
credit_rank = df['Credit_bin'].astype(str).map(map_credit).astype(float)
health_rank = df['Health_bin'].astype(str).map(map_health).astype(float)
prev_rank   = df['PrevClaims_bin'].astype(str).map(map_prev).astype(float)

df['Risk_Score'] = (
    credit_rank * weights['Credit'] +
    health_rank * weights['Health'] +
    prev_rank * weights['PrevClaims']
)


# 4. 인코딩 & 피처셋 구성

In [4]:
from sklearn.preprocessing import LabelEncoder

# 4-1) Encode categorical columns as integer codes.
cat_cols = [
    # engineered bins
    'Age_bin', 'Credit_bin', 'Health_bin', 'PrevClaims_bin',
    'PolicyStartMonth', 'PolicyStartQuarter', 'VehicleAge_bin',
    # raw categorical attributes
    'Gender', 'Marital Status', 'Occupation', 'Location',
    'Policy Type', 'Smoking Status', 'Exercise Frequency',
    'Education Level', 'Number of Dependents', 'Property Type',
]

# One fitted encoder per source column is kept in le_dict.
# Values are cast to str first, so missing values are encoded as their own
# 'nan' class rather than raising.
le_dict = {}
for col in cat_cols:
    le = le_dict[col] = LabelEncoder()
    df[col + '_le'] = le.fit_transform(df[col].astype(str))

# 4-2) Final design matrix X and target y.
# Names of the encoded columns; later passed to CatBoost as categorical features.
cat_features_le = [c + '_le' for c in cat_cols]

feature_cols = cat_features_le + [
    'Annual Income', 'Income_per_Dependent', 'Income_log',
    'Age_Health', 'Credit_Health', 'Credit_Income',
    'Feedback_Len', 'Feedback_WordCnt', 'Feedback_Polarity',
    'TenureDays', 'TenureYears', 'Risk_Score',
]

X = df[feature_cols]
y = df['Premium Amount']

# 5. 학습 & 평가 (k-Fold + Tweedie)


In [5]:
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# 5-fold shuffled cross-validation of a CatBoost regressor with Tweedie loss.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold validation metrics.
rmses, maes, r2s = [], [], []

for train_idx, val_idx in kf.split(X):
    # Positional split of features and target for this fold.
    X_tr, y_tr = X.iloc[train_idx], y.iloc[train_idx]
    X_va, y_va = X.iloc[val_idx], y.iloc[val_idx]

    pool_tr = Pool(X_tr, y_tr, cat_features=cat_features_le)
    pool_va = Pool(X_va, y_va, cat_features=cat_features_le)

    # A fresh model per fold, with identical hyper-parameters and seed.
    model = CatBoostRegressor(
        loss_function='Tweedie:variance_power=1.5',
        depth=6,
        learning_rate=0.05,
        iterations=500,
        early_stopping_rounds=30,
        random_seed=42,
        verbose=0,
    )
    # NOTE(review): the validation fold drives early stopping AND is scored
    # below, so the reported metrics may be slightly optimistic.
    model.fit(pool_tr, eval_set=pool_va)

    pred = model.predict(X_va)
    fold_mse = mean_squared_error(y_va, pred)
    rmses.append(np.sqrt(fold_mse))
    maes.append(mean_absolute_error(y_va, pred))
    r2s.append(r2_score(y_va, pred))

# Mean ± standard deviation across folds.
print(f"CV RMSE : {np.mean(rmses):.4f} ± {np.std(rmses):.4f}")
print(f"CV MAE  : {np.mean(maes):.4f} ± {np.std(maes):.4f}")
print(f"CV R²   : {np.mean(r2s):.4f} ± {np.std(r2s):.4f}")


CV RMSE : 851.9773 ± 0.9447
CV MAE  : 652.1679 ± 0.2607
CV R²   : 0.0299 ± 0.0006
