In [1]:
import pandas as pd 

# Load the competition files: training data, test data, and the sample submission.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample = pd.read_csv('sample_submission.csv')

# 기본 전처리 

In [2]:
# Number of Dependents: convert to a categorical and mark missing values with an explicit 'Missing' category
import numpy as np

# Number of Dependents: cast to nullable integer, turn the result into a
# categorical, register an extra 'Missing' category, and use it for missing values.
dependents_cat = train['Number of Dependents'].astype('Int64').astype('category')
train['Number of Dependents'] = dependents_cat.cat.add_categories('Missing').fillna('Missing')

# Missing-value handling for Age, Annual Income, Health Score, Credit Score
# and Customer Feedback.

# Age, Annual Income, Health Score: fill missing values with the column median.
for median_col in ('Age', 'Annual Income', 'Health Score'):
    train[median_col] = train[median_col].fillna(train[median_col].median())

# Credit Score and Customer Feedback: fill missing values with a fixed label.
# NOTE(review): if Credit Score is numeric, the string placeholder turns the
# column into object dtype — confirm this is intended.
for label_col, label in (('Credit Score', 'Missing'), ('Customer Feedback', 'No Feedback')):
    train[label_col] = train[label_col].fillna(label)

# Drop rows where Vehicle Age or Insurance Duration is missing.
# Both conditions are applied in a single pass, and .copy() makes the result an
# independent DataFrame so the column assignment below does not trigger
# pandas' SettingWithCopyWarning.
has_vehicle_age = train['Vehicle Age'].notnull()
has_insurance_duration = train['Insurance Duration'].notnull()
train = train[has_vehicle_age & has_insurance_duration].copy()

# Replace missing Marital Status values with 'Unknown'.
train['Marital Status'] = train['Marital Status'].fillna('Unknown')

In [3]:
# Report how many rows are left after dropping incomplete rows,
# followed by the number of missing values in each column.
remaining_rows = len(train)
print("결측 제거 후 행 개수 :", remaining_rows)

missing_per_column = train.isna().sum()
print(missing_per_column)

결측 제거 후 행 개수 : 1199993
id                           0
Age                          0
Gender                       0
Annual Income                0
Marital Status               0
Number of Dependents         0
Education Level              0
Occupation              358074
Health Score                 0
Location                     0
Policy Type                  0
Previous Claims         364028
Vehicle Age                  0
Credit Score                 0
Insurance Duration           0
Policy Start Date            0
Customer Feedback            0
Smoking Status               0
Exercise Frequency           0
Property Type                0
Premium Amount               0
dtype: int64


# test2 1-1&2-2 train2

In [4]:
# Build train2 as an independent copy of train so the edits below leave train untouched.
train2 = train.copy()

# Occupation: label missing values as 'Unknown'.
train2['Occupation'] = train2['Occupation'].fillna('Unknown')

# Previous Claims: cast to nullable integer, turn the result into a categorical,
# register an extra 'Missing' category, and use it for missing values.
previous_claims_cat = train2['Previous Claims'].astype('Int64').astype('category')
train2['Previous Claims'] = previous_claims_cat.cat.add_categories('Missing').fillna('Missing')

# train2 Catboost 적용


In [5]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# 2. Separate the target column (y2) from the feature matrix (X2).
# NOTE(review): X2 keeps every other column of train2, including `id` —
# confirm that the identifier is meant to be used as a feature.
target2 = 'Premium Amount'
X2 = train2.drop(columns=target2)
y2 = train2[target2]

# 3. Collect the categorical feature columns (object / category dtype) and
# convert them to plain strings for CatBoost.
cat_features2 = X2.select_dtypes(include=['object', 'category']).columns.tolist()


# Replace missing values with the string 'Missing', THEN cast to str.
# The order matters: casting first turns NaN into the literal string 'nan'
# (pandas < 3), which leaves fillna() nothing to fill.
# Going through object dtype allows the placeholder to be written even when a
# categorical column has no 'Missing' category registered.
for col in cat_features2:
   as_object = X2[col].astype(object)
   X2[col] = as_object.where(as_object.notna(), "Missing").astype(str)


# Numeric columns: wherever a column still contains missing values,
# fill them with that column's median.
numeric_columns2 = X2.select_dtypes(include=[np.number]).columns
for numeric_col in numeric_columns2:
   if X2[numeric_col].isnull().sum() > 0:
       column_median = X2[numeric_col].median()
       X2[numeric_col] = X2[numeric_col].fillna(column_median)


# 4. Split into training and validation sets: 20% of the rows are held out,
# with a fixed random_state so the split is repeatable.
X_train2, X_val2, y_train2, y_val2 = train_test_split(
    X2,
    y2,
    test_size=0.2,
    random_state=42,
)


# 5. Model definition: the hyperparameters are gathered in one mapping and
# passed to CatBoostRegressor as keyword arguments.
catboost_params2 = dict(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    eval_metric='MAE',
    cat_features=cat_features2,
    random_seed=42,
    verbose=100,
)
model2 = CatBoostRegressor(**catboost_params2)

# 6. Model training: the validation split is supplied as eval_set and
# use_best_model=True is requested.
model2.fit(
    X_train2,
    y_train2,
    eval_set=(X_val2, y_val2),
    use_best_model=True,
)

# Predict on the validation split and compute MAE, MSE, RMSE and R2.
# NOTE(review): this is the same split that was passed as eval_set during
# fitting — confirm that is acceptable for the reported scores.
y_pred2 = model2.predict(X_val2)

mae = mean_absolute_error(y_val2, y_pred2)
mse = mean_squared_error(y_val2, y_pred2)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE computed above
r2 = r2_score(y_val2, y_pred2)

# Print one line per metric, each rounded to four decimals.
report_lines2 = (
    f"train2 MAE:  {mae:.4f}",
    f"train2 MSE:  {mse:.4f}",
    f"train2 RMSE: {rmse:.4f}",
    f"train2 R2:   {r2:.4f}",
)
for report_line in report_lines2:
    print(report_line)


0:	learn: 667.9957958	test: 668.7504018	best: 668.7504018 (0)	total: 1.17s	remaining: 19m 28s
100:	learn: 653.7569083	test: 654.0719997	best: 654.0719997 (100)	total: 1m 34s	remaining: 13m 56s
200:	learn: 650.4555579	test: 650.7101920	best: 650.7101920 (200)	total: 3m 13s	remaining: 12m 48s
300:	learn: 648.5507692	test: 648.7653519	best: 648.7653519 (300)	total: 4m 51s	remaining: 11m 16s
400:	learn: 646.8784779	test: 647.0883025	best: 647.0764638 (398)	total: 6m 28s	remaining: 9m 39s
500:	learn: 645.9964941	test: 646.2142727	best: 646.2127673 (496)	total: 8m 13s	remaining: 8m 11s
600:	learn: 645.5947060	test: 645.9210703	best: 645.9114401 (584)	total: 10m 6s	remaining: 6m 42s
700:	learn: 645.1294000	test: 645.5306877	best: 645.5276143 (693)	total: 11m 47s	remaining: 5m 1s
800:	learn: 644.8586762	test: 645.3606559	best: 645.3532022 (789)	total: 13m 39s	remaining: 3m 23s
900:	learn: 644.4804676	test: 645.0836087	best: 645.0824615 (897)	total: 15m 21s	remaining: 1m 41s
999:	learn: 644.227