In [1]:
import pandas as pd

In [2]:
test = pd.read_csv('test.csv')

In [3]:
sample = pd.read_csv('sample.csv')

In [4]:
train = pd.read_csv('train.csv')

In [5]:
print("전체 행 개수 :", len(train))
print(train.isna().sum())     

전체 행 개수 : 1200000
id                           0
Age                      18705
Gender                       0
Annual Income            44949
Marital Status           18529
Number of Dependents    109672
Education Level              0
Occupation              358075
Health Score             74076
Location                     0
Policy Type                  0
Previous Claims         364029
Vehicle Age                  6
Credit Score            137882
Insurance Duration           1
Policy Start Date            0
Customer Feedback        77824
Smoking Status               0
Exercise Frequency           0
Property Type                0
Premium Amount               0
dtype: int64


## 데이터 전처리(나머지 변수, 공통)

In [6]:
import numpy as np


# Number of Dependents
train['Number of Dependents'] = train['Number of Dependents'].astype('Int64')
train['Number of Dependents'] = train['Number of Dependents'].astype('category')
train['Number of Dependents'] = train['Number of Dependents'].cat.add_categories('Missing')
train['Number of Dependents'] = train['Number of Dependents'].fillna('Missing')


# Age
train['Age'] = train['Age'].fillna(train['Age'].median())


# Annual Income
train['Annual Income'] = train['Annual Income'].fillna(train['Annual Income'].median())


# Health Score
train['Health Score'] = train['Health Score'].fillna(train['Health Score'].median())


# Credit Score
train['Credit Score'] = train['Credit Score'].fillna('Missing')


# Customer Feedback
train['Customer Feedback'] = train['Customer Feedback'].fillna('No Feedback')


# Marital Status
train['Marital Status'] = train['Marital Status'].fillna('Unknown')


# Vehicle Age 결측 행 삭제
train = train[train['Vehicle Age'].notnull()]


# Insurance Duration 결측 행 삭제
train = train[train['Insurance Duration'].notnull()]

In [7]:
train.isna().sum()

id                           0
Age                          0
Gender                       0
Annual Income                0
Marital Status               0
Number of Dependents         0
Education Level              0
Occupation              358074
Health Score                 0
Location                     0
Policy Type                  0
Previous Claims         364028
Vehicle Age                  0
Credit Score                 0
Insurance Duration           0
Policy Start Date            0
Customer Feedback            0
Smoking Status               0
Exercise Frequency           0
Property Type                0
Premium Amount               0
dtype: int64

# 실험2

In [15]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# 1. 데이터 복제
train5 = train.copy()
train5 = train.drop(columns=['Policy Start Date']).copy()


# 2. 범주형 컬럼 선정
categorical_cols = ['Occupation', 'Previous Claims', 'Number of Dependents','Policy Type', 'Smoking Status', 'Exercise Frequency', 'Property Type',
                   'Credit Score', 'Customer Feedback', 'Marital Status', 'Gender', 'Education Level', 'Location']


# 3. Label Encoding (KNN 적용 위해)
label_encoders = {}
for col in categorical_cols:
   le = LabelEncoder()
   train5[col] = train5[col].astype(str).replace('nan', '-1')
   train5[col] = le.fit_transform(train5[col])
   label_encoders[col] = le


# 4. KNN Imputer로 결측치 채우기 (K=5)
imputer = KNNImputer(n_neighbors=5, weights='uniform')
train5_imputed = imputer.fit_transform(train5)
train5 = pd.DataFrame(train5_imputed, columns=train5.columns)


# 5. Label Decoding (숫자→문자 복원)
for col in categorical_cols:
   le = label_encoders[col]
   train5[col] = train5[col].round(0).astype(int)
   train5[col] = le.inverse_transform(train5[col])


# 6. CatBoost용 범주형 컬럼 자동 추출
target = 'Premium Amount'
y = train5[target]
X = train5.drop(columns=[target])
cat_features = X.select_dtypes(include=['object', 'category']).columns.tolist()


# 7. 결측 처리 (적용데이터에만)
for col in cat_features:
   X[col] = X[col].astype(str).fillna('Missing')
for col in X.select_dtypes(include=[np.number]).columns:
   if X[col].isnull().sum() > 0:
       X[col] = X[col].fillna(X[col].median())


# 8. 학습/검증 분할
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


# 9. CatBoost 회귀모델 정의 및 학습
model = CatBoostRegressor(
   iterations=1000,
   learning_rate=0.05,
   depth=6,
   eval_metric='MAE',
   cat_features=cat_features,
   random_seed=42,
   verbose=100
)
model.fit(X_train, y_train, eval_set=(X_val, y_val), use_best_model=True)


# 10. 예측 및 평가
y_pred = model.predict(X_val)
mae = mean_absolute_error(y_val, y_pred)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_val, y_pred)


print(f"MAE:  {mae:.4f}")
print(f"MSE:  {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R2:   {r2:.4f}")




0:	learn: 668.1017534	test: 668.8478210	best: 668.8478210 (0)	total: 726ms	remaining: 12m 5s
100:	learn: 653.7201041	test: 654.0395569	best: 654.0395569 (100)	total: 1m 28s	remaining: 13m 3s
200:	learn: 650.9166620	test: 651.1224771	best: 651.1221724 (199)	total: 3m 37s	remaining: 14m 22s
300:	learn: 649.5639330	test: 649.7653442	best: 649.7653442 (300)	total: 5m 42s	remaining: 13m 16s
400:	learn: 647.4697659	test: 647.5510786	best: 647.5489566 (399)	total: 7m 58s	remaining: 11m 54s
500:	learn: 646.4675531	test: 646.5588100	best: 646.5588100 (500)	total: 10m 10s	remaining: 10m 7s
600:	learn: 645.6709926	test: 645.7521911	best: 645.7520319 (599)	total: 13m 3s	remaining: 8m 40s
700:	learn: 645.2857883	test: 645.4111485	best: 645.4102644 (693)	total: 14m 57s	remaining: 6m 23s
800:	learn: 644.9600244	test: 645.1541639	best: 645.1533538 (785)	total: 17m 38s	remaining: 4m 22s
900:	learn: 644.7205264	test: 644.9874623	best: 644.9849297 (893)	total: 20m 24s	remaining: 2m 14s
999:	learn: 644.40