In [1]:
import pandas as pd 

# Load the three CSV inputs from the working directory: the test features,
# the sample submission file, and the training data.
test, sample, train = (
    pd.read_csv(csv_name)
    for csv_name in ('test.csv', 'sample_submission.csv', 'train.csv')
)

In [2]:
# Missing-value handling for the training frame.
# NOTE: this cell overwrites `train` in place, so it is meant to be run once
# on a freshly loaded frame.
import numpy as np

# Number of Dependents: turn into a categorical feature with an explicit
# 'Missing' level. The column goes through the nullable 'Int64' dtype first so
# the cast tolerates missing values, then 'Missing' is registered as a category
# and assigned to the missing rows.
dependents = train['Number of Dependents'].astype('Int64').astype('category')
train['Number of Dependents'] = dependents.cat.add_categories('Missing').fillna('Missing')

# Age, Annual Income, Health Score: replace missing values with the column
# median. The medians are computed here, before any rows are dropped below.
for median_col in ['Age', 'Annual Income', 'Health Score']:
    train[median_col] = train[median_col].fillna(train[median_col].median())

# Credit Score: missing values become the string 'Missing'.
# NOTE(review): if Credit Score is read as a numeric column, this leaves numbers
# and strings mixed in one column - confirm that is intended.
train['Credit Score'] = train['Credit Score'].fillna('Missing')

# Customer Feedback: missing values become 'No Feedback'.
train['Customer Feedback'] = train['Customer Feedback'].fillna('No Feedback')

# Drop rows where Vehicle Age or Insurance Duration is missing.
# The explicit .copy() makes `train` an independent frame, so the column
# assignment below (and any later one) does not raise SettingWithCopyWarning.
train = train.dropna(subset=['Vehicle Age', 'Insurance Duration']).copy()

# Marital Status: missing values become 'Unknown'.
train['Marital Status'] = train['Marital Status'].fillna('Unknown')

In [3]:
# Report how many rows remain after the missing-value handling, followed by
# the per-column count of values that are still missing.
remaining_rows = len(train)
missing_per_column = train.isna().sum()

print("결측 제거 후 행 개수 :", remaining_rows)

print(missing_per_column)

결측 제거 후 행 개수 : 1199993
id                           0
Age                          0
Gender                       0
Annual Income                0
Marital Status               0
Number of Dependents         0
Education Level              0
Occupation              358074
Health Score                 0
Location                     0
Policy Type                  0
Previous Claims         364028
Vehicle Age                  0
Credit Score                 0
Insurance Duration           0
Policy Start Date            0
Customer Feedback            0
Smoking Status               0
Exercise Frequency           0
Property Type                0
Premium Amount               0
dtype: int64


# test3 1-2&2-1 train3


In [4]:
# Mean Annual Income per Occupation; rows whose Occupation is missing are left
# out of the groups. Used as the lookup table for imputing Occupation.
occupation_income_mean = (
    train
    .groupby('Occupation')['Annual Income']
    .mean()
)
# Independent copy of the preprocessed frame for this experiment, so `train`
# itself is not modified by the imputation that follows.
train3 = train.copy()
def fill_occupation(row, income_means=None):
    """Return the row's Occupation, imputing it from Annual Income when missing.

    A missing Occupation is replaced by the label in ``income_means`` whose
    value is closest to the row's ``'Annual Income'``. On a tie the label that
    comes first in ``income_means`` is returned.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys ``'Occupation'`` and ``'Annual Income'``.
    income_means : pandas.Series, optional
        Mean income indexed by occupation label. When omitted, the
        module-level ``occupation_income_mean`` is used, so the function still
        works as ``df.apply(fill_occupation, axis=1)``.

    Returns
    -------
    The existing Occupation when it is present; otherwise the nearest
    occupation label. If no label can be determined (the income is missing or
    the lookup table is empty) the original missing value is returned unchanged.
    """
    occupation = row['Occupation']
    if not pd.isnull(occupation):
        return occupation

    means = occupation_income_mean if income_means is None else income_means
    income = row['Annual Income']
    # idxmin() cannot pick a label from an empty or all-NaN series, so leave
    # the value missing rather than failing for the whole frame.
    if pd.isnull(income) or len(means) == 0:
        return occupation

    diffs = (means - income).abs()
    return diffs.idxmin()


# Impute Occupation with the occupation whose mean Annual Income is closest to
# the row's income. Only rows with a missing Occupation (and a known income)
# are touched, and the nearest mean is found with one vectorised computation
# instead of calling a Python function on every row of the frame.
occupation_lookup = occupation_income_mean.dropna()
needs_occupation = train3['Occupation'].isna() & train3['Annual Income'].notna()

if needs_occupation.any() and len(occupation_lookup) > 0:
    incomes_to_match = train3.loc[needs_occupation, 'Annual Income'].to_numpy(dtype=float)
    mean_incomes = occupation_lookup.to_numpy(dtype=float)
    # Distance matrix of shape (rows to fill, occupations); argmin picks the
    # first occupation on ties, matching Series.idxmin().
    nearest_idx = np.abs(incomes_to_match[:, None] - mean_incomes[None, :]).argmin(axis=1)
    train3.loc[needs_occupation, 'Occupation'] = occupation_lookup.index.to_numpy()[nearest_idx]


# Previous Claims: a missing value is filled with 0.
train3['Previous Claims'] = train3['Previous Claims'].fillna(0)

# train3 Catboost 적용


In [9]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# 2. Target and feature matrix
# NOTE(review): every remaining column of train3 is used as a feature,
# including 'id' if it is present - confirm that is intended.
target3 = 'Premium Amount'
y3 = train3[target3]
X3 = train3.drop(columns=[target3])

# 3. Missing-value handling and categorical feature selection
cat_features3 = X3.select_dtypes(include=['object', 'category']).columns.tolist()

# Convert categorical features to strings for CatBoost compatibility.
# Missing entries are replaced with 'Missing' BEFORE the cast to str:
# astype(str) turns NaN into the literal string 'nan', after which a
# fillna() call has nothing left to fill.
for col in cat_features3:
    as_object = X3[col].astype(object)
    X3[col] = as_object.where(as_object.notna(), "Missing").astype(str)

# Numeric columns: replace any remaining NaN with the column median.
for col in X3.select_dtypes(include=[np.number]).columns:
    if X3[col].isnull().sum() > 0:
        X3[col] = X3[col].fillna(X3[col].median())

# 4. Train / validation split (80 / 20, fixed seed)
X_train3, X_val3, y_train3, y_val3 = train_test_split(X3, y3, test_size=0.2, random_state=42)

# 5. Model definition
model3 = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    eval_metric='MAE',
    cat_features=cat_features3,
    random_seed=42,
    verbose=100
)

# 6. Model training
# NOTE(review): the validation split is used both to pick the best iteration
# (use_best_model=True) and to compute the metrics below, so the reported
# scores may be optimistic.
model3.fit(X_train3, y_train3, eval_set=(X_val3, y_val3), use_best_model=True)

# Predict on the validation split and evaluate
y_pred3 = model3.predict(X_val3)
mae = mean_absolute_error(y_val3, y_pred3)
mse = mean_squared_error(y_val3, y_pred3)
rmse = np.sqrt(mse)
r2 = r2_score(y_val3, y_pred3)

print(f"train3 MAE:  {mae:.4f}")
print(f"train3 MSE:  {mse:.4f}")
print(f"train3 RMSE: {rmse:.4f}")
print(f"train3 R2:   {r2:.4f}")

0:	learn: 667.9864127	test: 668.7337461	best: 668.7337461 (0)	total: 896ms	remaining: 14m 54s
100:	learn: 653.4596191	test: 654.0788837	best: 654.0788837 (100)	total: 1m 36s	remaining: 14m 18s
200:	learn: 650.3516463	test: 650.8452309	best: 650.8452309 (200)	total: 3m 8s	remaining: 12m 28s
300:	learn: 647.9209334	test: 648.2849533	best: 648.2849533 (300)	total: 4m 47s	remaining: 11m 7s
400:	learn: 646.5360110	test: 647.0060495	best: 647.0060495 (400)	total: 6m 29s	remaining: 9m 41s


KeyboardInterrupt: 