In [1]:
import pandas as pd 

# Read the three CSV files into DataFrames, in the order test -> sample -> train.
test, sample, train = (
    pd.read_csv(csv_name)
    for csv_name in ('test.csv', 'sample_submission.csv', 'train.csv')
)

In [2]:
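# Missing-value handling for train: 'Number of Dependents' becomes a categorical with an explicit 'Missing' level; the other columns are imputed or have their incomplete rows dropped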
import numpy as np

# Number of Dependents: cast to nullable integer, then to a categorical that
# carries an explicit 'Missing' level in place of NaN.
train['Number of Dependents'] = (
    train['Number of Dependents']
    .astype('Int64')
    .astype('category')
    .cat.add_categories('Missing')
    .fillna('Missing')
)

# Age, Annual Income, Health Score: replace NaN with the column median.
# The medians are taken over all rows, i.e. before any rows are dropped below.
for median_col in ('Age', 'Annual Income', 'Health Score'):
    train[median_col] = train[median_col].fillna(train[median_col].median())

# Credit Score: NaN is replaced by the string 'Missing'.
# NOTE(review): if Credit Score is numeric, this mixes strings into the column
# and turns it into object dtype -- confirm that this is intended.
train['Credit Score'] = train['Credit Score'].fillna('Missing')

# Customer Feedback: NaN becomes the label 'No Feedback'.
train['Customer Feedback'] = train['Customer Feedback'].fillna('No Feedback')

# Drop rows that lack Vehicle Age or Insurance Duration.
# .copy() makes the filtered result an independent frame, so the column
# assignment that follows does not trigger pandas' SettingWithCopyWarning.
train = train.dropna(subset=['Vehicle Age', 'Insurance Duration']).copy()

# Marital Status: NaN is replaced by 'Unknown'.
train['Marital Status'] = train['Marital Status'].fillna('Unknown')

In [3]:
# Print the current row count of train, then the number of NaN values per column.
remaining_rows = len(train)
print("결측 제거 후 행 개수 :", remaining_rows)

nan_counts_by_column = train.isna().sum()
print(nan_counts_by_column)

결측 제거 후 행 개수 : 1199993
id                           0
Age                          0
Gender                       0
Annual Income                0
Marital Status               0
Number of Dependents         0
Education Level              0
Occupation              358074
Health Score                 0
Location                     0
Policy Type                  0
Previous Claims         364028
Vehicle Age                  0
Credit Score                 0
Insurance Duration           0
Policy Start Date            0
Customer Feedback            0
Smoking Status               0
Exercise Frequency           0
Property Type                0
Premium Amount               0
dtype: int64


# test4 1-2&2-2 train4


In [4]:
# train4 is an independent copy of train, so the edits made to train4 do not alter train.
train4 = train.copy()
# Mean 'Annual Income' per 'Occupation'; groupby leaves out rows whose Occupation is NaN.
occupation_income_mean = train['Annual Income'].groupby(train['Occupation']).mean()
def fill_occupation(row, income_means=None):
    """Return the row's occupation, imputing it from income when it is missing.

    A missing 'Occupation' is replaced by the label whose mean income is
    closest (smallest absolute difference) to the row's 'Annual Income'.
    Ties resolve to the first label in the order of the means' index.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys 'Occupation' and 'Annual Income'.
    income_means : pandas.Series, optional
        Mean income indexed by occupation label. When omitted, the
        module-level ``occupation_income_mean`` is used, so the function can
        still be passed directly to ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    The existing occupation when present; otherwise the nearest-income
    occupation label. The original missing value is returned unchanged when
    no comparison is possible (income is missing, or there are no usable means).
    """
    occupation = row['Occupation']
    if not pd.isnull(occupation):
        return occupation

    means = occupation_income_mean if income_means is None else income_means
    # Dropping NaN gaps covers a missing income (every gap is NaN) as well as
    # NaN entries in the means; idxmin on an all-NaN or empty series is unsafe.
    gaps = (means - row['Annual Income']).abs().dropna()
    if gaps.empty:
        return occupation
    return gaps.idxmin()


# Impute missing Occupation values: each such row receives the occupation whose
# mean 'Annual Income' (occupation_income_mean) is closest to the row's own income.
# This is done with a single NumPy broadcast rather than a row-wise
# DataFrame.apply, which makes one Python call per row of the frame.
# argmin picks the first minimum, so ties resolve to the first occupation in
# index order -- the same rule Series.idxmin applies.
occupation_means = occupation_income_mean.dropna()
rows_to_fill = train4['Occupation'].isna() & train4['Annual Income'].notna()
if rows_to_fill.any() and not occupation_means.empty:
    incomes = train4.loc[rows_to_fill, 'Annual Income'].to_numpy(dtype=float)
    mean_values = occupation_means.to_numpy(dtype=float)
    # Shape (rows to fill, occupations): absolute income gap to every occupation mean.
    income_gaps = np.abs(incomes[:, None] - mean_values[None, :])
    nearest_labels = occupation_means.index.to_numpy()[income_gaps.argmin(axis=1)]
    train4.loc[rows_to_fill, 'Occupation'] = nearest_labels


# Previous Claims: cast to nullable integer, then to a categorical that carries
# an explicit 'Missing' level in place of NaN.
train4['Previous Claims'] = (
    train4['Previous Claims']
    .astype('Int64')
    .astype('category')
    .cat.add_categories('Missing')
    .fillna('Missing')
)

# train4 Catboost 적용


In [5]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# 2. Separate the regression target from the feature columns.
target4 = 'Premium Amount'
X4 = train4.drop(columns=[target4])
y4 = train4[target4]

# 3. Missing-value handling and categorical column selection.
# Every object/category column is passed to CatBoost as a categorical feature.
cat_features4 = X4.select_dtypes(include=['object', 'category']).columns.tolist()


# Convert categorical columns to strings, labelling missing entries 'Missing'.
# The NaN positions are taken from the original column: astype(str) alone turns
# NaN into the literal string 'nan', after which fillna has nothing left to fill.
for col in cat_features4:
    was_missing = X4[col].isna()
    X4[col] = X4[col].astype(str).mask(was_missing, "Missing")


# Numeric columns: replace NaN with the column median.
for col in X4.select_dtypes(include=[np.number]).columns:
    if X4[col].isnull().any():
        X4[col] = X4[col].fillna(X4[col].median())


# 4. Hold out 20% of the rows for validation (fixed seed for a repeatable split).
X_train4, X_val4, y_train4, y_val4 = train_test_split(
    X4,
    y4,
    test_size=0.2,
    random_state=42,
)


# 5. Model definition: hyperparameters are collected in one dict, then unpacked.
catboost_params4 = {
    'iterations': 1000,
    'learning_rate': 0.05,
    'depth': 6,
    'eval_metric': 'MAE',
    'cat_features': cat_features4,
    'random_seed': 42,
    'verbose': 100,  # log every 100 iterations
}
model4 = CatBoostRegressor(**catboost_params4)

# 6. Fit with the validation split as eval_set; use_best_model=True is passed so
# the model is kept at its best eval-set iteration.
model4.fit(
    X_train4,
    y_train4,
    eval_set=(X_val4, y_val4),
    use_best_model=True,
)

# Predict on the validation split and compute the regression metrics.
y_pred4 = model4.predict(X_val4)

mae = mean_absolute_error(y_val4, y_pred4)
mse = mean_squared_error(y_val4, y_pred4)
rmse = np.sqrt(mse)
r2 = r2_score(y_val4, y_pred4)

# One line per metric, each formatted to four decimals.
for report_line in (
    f"train4 MAE:  {mae:.4f}",
    f"train4 MSE:  {mse:.4f}",
    f"train4 RMSE: {rmse:.4f}",
    f"train4 R2:   {r2:.4f}",
):
    print(report_line)

0:	learn: 667.9957958	test: 668.7504018	best: 668.7504018 (0)	total: 1.08s	remaining: 17m 55s
100:	learn: 654.1391893	test: 654.4613850	best: 654.4613850 (100)	total: 1m 31s	remaining: 13m 36s
200:	learn: 650.1962425	test: 650.3591072	best: 650.3591072 (200)	total: 3m 8s	remaining: 12m 29s
300:	learn: 648.1702904	test: 648.2604486	best: 648.2604486 (300)	total: 4m 48s	remaining: 11m 10s
400:	learn: 646.6817101	test: 646.8220760	best: 646.8220760 (400)	total: 6m 23s	remaining: 9m 32s
500:	learn: 646.1328152	test: 646.3165737	best: 646.3155761 (499)	total: 8m 4s	remaining: 8m 2s
600:	learn: 645.5366160	test: 645.7848910	best: 645.7848109 (599)	total: 9m 40s	remaining: 6m 25s
700:	learn: 645.1220903	test: 645.4612904	best: 645.4568991 (690)	total: 11m 21s	remaining: 4m 50s
800:	learn: 644.8164599	test: 645.2795833	best: 645.2699558 (794)	total: 12m 57s	remaining: 3m 13s
900:	learn: 644.5489111	test: 645.1090732	best: 645.1027936 (896)	total: 14m 42s	remaining: 1m 36s
999:	learn: 644.26242