# 1. 데이터 로딩 & 기본 전처리

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# 1-1 / 1-2) Load the training data and discard the columns that are not used
# as model inputs.
df = pd.read_csv('train.csv').drop(columns=['id', 'Policy Start Date'])

# 1-3) Previous Claims: keep a missing-value flag, treat missing as 0 claims,
# cap the right tail at the 95th percentile, then store as a small unsigned int.
prev_claims = df['Previous Claims']
df['PrevClaims_na'] = prev_claims.isna().astype(int)
prev_claims = prev_claims.fillna(0)
clip_val = prev_claims.quantile(0.95)
df['Previous Claims'] = prev_claims.clip(upper=clip_val).astype('uint8')

# 1-4) Health Score: keep a missing-value flag, impute with the median, then
# clip to the 1.5*IQR fences computed on the imputed values.
health = df['Health Score']
df['Health_na'] = health.isna().astype(int)
median_health = health.median()
health = health.fillna(median_health)
q1 = health.quantile(0.25)
q3 = health.quantile(0.75)
iqr = q3 - q1
lower = q1 - 1.5 * iqr
upper = q3 + 1.5 * iqr
df['Health Score'] = health.clip(lower=lower, upper=upper).astype('float32')



# 2. 도메인 기반 구간화 (Binning)


In [8]:
# Domain-driven binning. pd.cut uses right-closed intervals by default, so
# e.g. the first age band is (17, 25]. Values outside the outermost edges and
# missing values end up as NaN in the binned column.

# Age: fixed bands.
bins_age = [17, 25, 35, 45, 55, 100]
labels_age = ['18-25', '26-35', '36-45', '46-55', '56+']
df['Age_bin'] = pd.cut(df['Age'], bins=bins_age, labels=labels_age)

# Credit Score: fixed bands.
bins_cr = [0, 580, 670, 740, 800, 1000]
labels_cr = ['Poor', 'Fair', 'Good', 'VeryGood', 'Excellent']
df['Credit_bin'] = pd.cut(df['Credit Score'], bins=bins_cr, labels=labels_cr)

# Health Score: data-driven bands (bottom quartile / middle half / top quartile).
df['Health_bin'] = pd.qcut(
    df['Health Score'],
    q=[0, 0.25, 0.75, 1.0],
    labels=['Low', 'Mid', 'High'],
)

# Previous Claims: 0 / 1-2 / 3+.
# The top edge is open-ended (inf) rather than the column maximum: the column
# has already been clipped at its 95th percentile, and if that maximum is <= 2
# the edges [-1, 0, 2, max] are no longer strictly increasing and pd.cut
# raises ValueError. With max > 2 the resulting bins are identical.
df['PrevClaims_bin'] = pd.cut(
    df['Previous Claims'],
    bins=[-1, 0, 2, float('inf')],
    labels=['0', '1-2', '3+'],
)


# 3. 합성 리스크 스코어 생성


In [9]:
# Step 3: build a composite Risk_Score from the binned columns.

# 3-1) Points assigned to each bin label.
map_credit = {'Poor':1, 'Fair':2, 'Good':3, 'VeryGood':4, 'Excellent':5}
map_health = {'Low':1, 'Mid':2, 'High':3}
map_prev   = {'0':0, '1-2':1, '3+':2}

# 3-2) Relative weight of each component (tune as needed).
weights = {'Credit':1.0, 'Health':0.8, 'PrevClaims':1.2}


def _bin_to_points(col, mapping):
    """Convert a binned column to float points: Categorical -> str -> map -> float."""
    # Labels absent from `mapping` (including the 'nan' string produced for
    # missing bins) map to NaN, which then propagates into Risk_Score.
    return df[col].astype(str).map(mapping).astype(float)


# 3-3) Weighted sum of the three components.
credit_part = _bin_to_points('Credit_bin', map_credit) * weights['Credit']
health_part = _bin_to_points('Health_bin', map_health) * weights['Health']
prev_part = _bin_to_points('PrevClaims_bin', map_prev) * weights['PrevClaims']
df['Risk_Score'] = credit_part + health_part + prev_part


# 4. 인코딩 & 모델 입력 준비

In [10]:
from sklearn.preprocessing import LabelEncoder

# 4-1 / 4-2) Label-encode every categorical column in a single pass: first the
# binned columns created earlier, then the raw categorical columns.
bin_cols = ['Age_bin','Credit_bin','Health_bin','PrevClaims_bin']
other_cat = ['Gender','Marital Status','Occupation','Location',
             'Policy Type','Smoking Status','Exercise Frequency',
             'Education Level','Number of Dependents','Property Type']

le_dict = {}
for col in bin_cols + other_cat:
    # Cast to str so missing values become their own 'nan' category; each
    # fitted encoder is kept so the same mapping can be reused later.
    le = LabelEncoder()
    df[f'{col}_le'] = le.fit_transform(df[col].astype(str))
    le_dict[col] = le

# 4-3) Assemble the model inputs: encoded categoricals followed by numerics.
cat_cols_le = [f'{name}_le' for name in (*bin_cols, *other_cat)]
num_cols = ['Age','Annual Income','Previous Claims','Health Score','Vehicle Age',
            'Credit Score','Insurance Duration','Risk_Score']  # adjust as needed

X = df[cat_cols_le + num_cols]
y = df['Premium Amount']

# 4-1. 파생변수 추가 (현재 비활성화): TenureDays (가입 기간 일수), Income_per_Dependent, Feedback_Len & Feedback_Polarity, Age_Health & Credit_Health, PolicyStartMonth

In [None]:
# Disabled feature-engineering experiment: every line in this cell is
# commented out, so none of it runs.
# import pandas as pd
# import numpy as np
# from textblob import TextBlob

# # 1) Date-derived features
# # NOTE(review): 'Policy Start Date' is dropped in step 1-2, so that drop has
# # to be removed before these lines can be re-enabled.
# # NOTE(review): `today` is a hard-coded reference date.
# df['PolicyStartDate'] = pd.to_datetime(df['Policy Start Date'])
# df['PolicyStartMonth']   = df['PolicyStartDate'].dt.month
# df['PolicyStartQuarter'] = df['PolicyStartDate'].dt.quarter
# today = pd.to_datetime('2025-08-05')
# df['TenureDays'] = (today - df['PolicyStartDate']).dt.days

# # 2) Income-derived features
# df['Income_per_Dependent'] = df['Annual Income'] / (1 + df['Number of Dependents'])
# df['Income_log']           = np.log1p(df['Annual Income'])

# # 3) Interaction features
# df['Age_Health']    = df['Age'] * df['Health Score']
# df['Credit_Health'] = df['Credit Score'] * df['Health Score']
# df['Credit_Income'] = df['Credit Score'] * df['Annual Income']

# # 4) Customer-feedback text features
# df['Customer Feedback']    = df['Customer Feedback'].fillna('')
# df['Feedback_Len']         = df['Customer Feedback'].str.len()
# df['Feedback_WordCnt']     = df['Customer Feedback'].str.split().map(len)
# df['Feedback_Polarity']    = df['Customer Feedback']\
#     .apply(lambda x: TextBlob(x).sentiment.polarity)

# # 5) Vehicle / tenure features
# # NOTE(review): dividing by 365.0 assumes 'Insurance Duration' is in days —
# # confirm the unit before re-enabling.
# df['VehicleAge_bin'] = pd.cut(df['Vehicle Age'],
#     bins=[-1,3,7,100], labels=['0-3','4-7','8+'])
# df['TenureYears']    = df['Insurance Duration'] / 365.0


# 5. 학습 & 평가 Tweedie 손실 함수 기반 모델 실험

In [12]:
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# --- 5-1) Hold out 20% of the rows for validation (fixed seed) ---
X_tr, X_va, y_tr, y_va = train_test_split(X, y, test_size=0.2, random_state=42)

# --- 5-2) Wrap both splits in CatBoost Pools, flagging the encoded columns
#          as categorical features ---
train_pool = Pool(X_tr, y_tr, cat_features=cat_cols_le)
val_pool = Pool(X_va, y_va, cat_features=cat_cols_le)

# --- 5-3) Define and fit the Tweedie-loss model ---
# NOTE(review): the validation split drives early stopping and is also the
# split the metrics below are computed on.
tweedie_params = {
    'loss_function': 'Tweedie:variance_power=1.5',
    'iterations': 500,
    'learning_rate': 0.05,
    'depth': 6,
    'random_seed': 42,
    'early_stopping_rounds': 30,
    'verbose': 100,
}
tweedie_model = CatBoostRegressor(**tweedie_params)
tweedie_model.fit(train_pool, eval_set=val_pool)

# --- 5-4) Predict on the validation features and report the metrics ---
preds = tweedie_model.predict(X_va)

mae = mean_absolute_error(y_va, preds)
r2 = r2_score(y_va, preds)
rmse = np.sqrt(mean_squared_error(y_va, preds))

print(f"Tweedie Model Validation RMSE : {rmse:.4f}")
print(f"Tweedie Model Validation MAE  : {mae:.4f}")
print(f"Tweedie Model Validation R²   : {r2:.4f}")


0:	learn: 2099.7638604	test: 2100.1376825	best: 2100.1376825 (0)	total: 917ms	remaining: 7m 37s
100:	learn: 133.3197544	test: 133.3378097	best: 133.3378097 (100)	total: 1m 13s	remaining: 4m 50s
200:	learn: 132.4495890	test: 132.4774211	best: 132.4774211 (200)	total: 2m 21s	remaining: 3m 30s
300:	learn: 132.3789427	test: 132.4155078	best: 132.4155078 (300)	total: 3m 25s	remaining: 2m 15s
400:	learn: 132.3511200	test: 132.3935197	best: 132.3935197 (400)	total: 4m 34s	remaining: 1m 7s
499:	learn: 132.3349281	test: 132.3829538	best: 132.3829288 (496)	total: 5m 45s	remaining: 0us

bestTest = 132.3829288
bestIteration = 496

Shrink model to first 497 iterations.
Tweedie Model Validation RMSE : 848.1744
Tweedie Model Validation MAE  : 647.4286
Tweedie Model Validation R²   : 0.0373


In [13]:
# Disabled feature-engineering experiment (same content as the earlier
# commented-out feature cell in this notebook); none of it runs.
# import pandas as pd
# import numpy as np
# from textblob import TextBlob

# # 1) Date-derived features
# # NOTE(review): 'Policy Start Date' is dropped in step 1-2, so that drop has
# # to be removed before these lines can be re-enabled.
# df['PolicyStartDate'] = pd.to_datetime(df['Policy Start Date'])
# df['PolicyStartMonth']   = df['PolicyStartDate'].dt.month
# df['PolicyStartQuarter'] = df['PolicyStartDate'].dt.quarter
# today = pd.to_datetime('2025-08-05')
# df['TenureDays'] = (today - df['PolicyStartDate']).dt.days

# # 2) Income-derived features
# df['Income_per_Dependent'] = df['Annual Income'] / (1 + df['Number of Dependents'])
# df['Income_log']           = np.log1p(df['Annual Income'])

# # 3) Interaction features
# df['Age_Health']    = df['Age'] * df['Health Score']
# df['Credit_Health'] = df['Credit Score'] * df['Health Score']
# df['Credit_Income'] = df['Credit Score'] * df['Annual Income']

# # 4) Customer-feedback text features
# df['Customer Feedback']    = df['Customer Feedback'].fillna('')
# df['Feedback_Len']         = df['Customer Feedback'].str.len()
# df['Feedback_WordCnt']     = df['Customer Feedback'].str.split().map(len)
# df['Feedback_Polarity']    = df['Customer Feedback']\
#     .apply(lambda x: TextBlob(x).sentiment.polarity)

# # 5) Vehicle / tenure features
# # NOTE(review): dividing by 365.0 assumes 'Insurance Duration' is in days —
# # confirm the unit before re-enabling.
# df['VehicleAge_bin'] = pd.cut(df['Vehicle Age'],
#     bins=[-1,3,7,100], labels=['0-3','4-7','8+'])
# df['TenureYears']    = df['Insurance Duration'] / 365.0
