In [7]:
import pandas as pd 

test = pd.read_csv('test.csv')

sample = pd.read_csv('sample_submission.csv')

train = pd.read_csv('train.csv')

In [8]:
# Number of Dependents: 구간화(Binning) & 결측 플래그 추가
import numpy as np

# Number of Dependents
train['Number of Dependents'] = train['Number of Dependents'].astype('Int64')
train['Number of Dependents'] = train['Number of Dependents'].astype('category')
train['Number of Dependents'] = train['Number of Dependents'].cat.add_categories('Missing')
train['Number of Dependents'] = train['Number of Dependents'].fillna('Missing')

#결측 플래그 처리 컬럼 예시(Age, Annual Income, Health Score, Credit Score, Customer Feedback 등)

# Age에 대한 중앙값 대체 예시
train['Age'] = train['Age'].fillna(train['Age'].median())

# Annual Income
train['Annual Income'] = train['Annual Income'].fillna(train['Annual Income'].median())

# Health Score
train['Health Score'] = train['Health Score'].fillna(train['Health Score'].median())

# Credit Score
train['Credit Score'] = train['Credit Score'].fillna('Missing')

# Customer Feedback
train['Customer Feedback'] = train['Customer Feedback'].fillna('No Feedback')

# Vehicle Age 결측 행 삭제
train = train[train['Vehicle Age'].notnull()]

# Insurance Duration 결측 행 삭제
train = train[train['Insurance Duration'].notnull()]

# Marital Status 결측치는 'Unknown'으로 대체
train['Marital Status'] = train['Marital Status'].fillna('Unknown')

In [9]:
print("결측 제거 후 행 개수 :", len(train))

print(train.isna().sum())

결측 제거 후 행 개수 : 1199993
id                           0
Age                          0
Gender                       0
Annual Income                0
Marital Status               0
Number of Dependents         0
Education Level              0
Occupation              358074
Health Score                 0
Location                     0
Policy Type                  0
Previous Claims         364028
Vehicle Age                  0
Credit Score                 0
Insurance Duration           0
Policy Start Date            0
Customer Feedback            0
Smoking Status               0
Exercise Frequency           0
Property Type                0
Premium Amount               0
dtype: int64


# kdd 실험

In [10]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import cross_val_score


def feature_engineering(df):
    """
    기본 피처셋 생성:
      - Credit Score 결측 플래그 생성 및 중앙값 대체
      - 수치형 스케일링
      - 범주형 원-핫 인코딩
    """
    df = df.copy()
    # Credit Score 처리
    df['CreditScore_missing_flag'] = df['Credit Score'].isna().astype(int)
    df['Credit Score'] = pd.to_numeric(df['Credit Score'], errors='coerce')
    df['Credit Score'].fillna(df['Credit Score'].median(), inplace=True)

    # 컬럼 정의
    num_cols = [
        'Age', 'Annual Income', 'Health Score',
        'Credit Score', 'Vehicle Age', 'Insurance Duration',
        'CreditScore_missing_flag'
    ]
    cat_cols = ['Customer Feedback', 'Marital Status', 'Number of Dependents']

    # 범주형을 문자열로 통일
    df[cat_cols] = df[cat_cols].astype(str)

    # 원-핫 인코딩
    ohe_df = pd.get_dummies(df[cat_cols], prefix=cat_cols, dummy_na=False)

    # 수치형 스케일링
    scaler = StandardScaler()
    scaled = scaler.fit_transform(df[num_cols])
    scaled_df = pd.DataFrame(scaled, columns=num_cols, index=df.index)

    # 최종 피처셋 반환
    X_base = pd.concat([scaled_df, ohe_df], axis=1)
    return df, X_base


def knn_impute(X_base, df, col, model_cls, **model_kwargs):
    """
    KNN을 이용한 결측치 채우기 함수
      - model_cls: KNeighborsClassifier or KNeighborsRegressor
      - col: 결측치가 있는 컬럼 이름
    결과는 df[f"{col}_imputed"]에 저장
    """
    mask = df[col].isna()
    X_train, y_train = X_base.loc[~mask], df.loc[~mask, col]
    X_pred = X_base.loc[mask]

    model = model_cls(**model_kwargs)
    model.fit(X_train, y_train)
    df.loc[mask, f"{col}_imputed"] = model.predict(X_pred)


def evaluate_imputations(train, df, X_base):
    """
    단순 대체(train5)와 KNN 대체(df_knn) 성능 비교
      - CatBoostRegressor 5-폴드 CV로 MAE, RMSE, R2 계산
    """
    # 단순 대체 (Occupation: 'Unknown', Previous Claims: 0)
    train5 = train.copy()
    train5['Occupation'] = train5['Occupation'].fillna('Unknown')
    train5['Previous Claims'] = train5['Previous Claims'].fillna(0)
    train5 = pd.concat([X_base, train5[['Occupation', 'Previous Claims']]], axis=1)
    train5['target'] = train['Premium Amount']

    # KNN 대체
    df_knn = pd.concat([
        X_base,
        df['Occupation_imputed'].rename('Occupation'),
        df['PreviousClaims_imputed'].rename('Previous Claims')
    ], axis=1)
    df_knn['target'] = train['Premium Amount']

    def evaluate(df_):
        X = df_.drop(columns=['Occupation', 'Previous Claims', 'target'])
        y = df_['target']
        model = CatBoostRegressor(
            iterations=200,
            learning_rate=0.05,
            depth=6,
            random_seed=42,
            verbose=False
        )
        mae = -cross_val_score(model, X, y, cv=5, scoring='neg_mean_absolute_error').mean()
        rmse = -cross_val_score(model, X, y, cv=5, scoring='neg_root_mean_squared_error').mean()
        r2 = cross_val_score(model, X, y, cv=5, scoring='r2').mean()
        return {'MAE': mae, 'RMSE': rmse, 'R2': r2}

    return evaluate(train5), evaluate(df_knn)

In [12]:
# 1) 피처셋 생성
df, X_base = feature_engineering(train)

# 2) KNN 보간 실행
knn_impute(
    X_base, df, 'Occupation',
    KNeighborsClassifier,
    n_neighbors=5, weights='distance', metric='euclidean'
)
knn_impute(
    X_base, df, 'Previous Claims',
    KNeighborsRegressor,
    n_neighbors=5, weights='distance', metric='euclidean'
)

# 3) 성능 비교
metrics_simple, metrics_knn = evaluate_imputations(train, df, X_base)

# 4) 결과 출력
print("=== 단순 대체(train5) 성능 ===")
print(metrics_simple)   # {'MAE': 123.45, 'RMSE': 167.89, 'R2': 0.6789}

print("\n=== KNN 대체(df_knn) 성능 ===")
print(metrics_knn)      # {'MAE': 117.33, 'RMSE': 159.02, 'R2': 0.7123}


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Credit Score'].fillna(df['Credit Score'].median(), inplace=True)


KeyError: 'PreviousClaims_imputed'

요약본 

In [1]:
# import pandas as pd
# from sklearn.preprocessing import StandardScaler

# # 1) 복사본 준비
# df = train.copy()

# # 2) Credit Score 처리: 
# #    - 플래그 컬럼 따로 만들고, 
# #    - 원본은 중앙값 대체
# df['CreditScore_missing_flag'] = df['Credit Score'].isna().astype(int)
# df['Credit Score'] = pd.to_numeric(df['Credit Score'], errors='coerce')
# df['Credit Score'] = df['Credit Score'].fillna(df['Credit Score'].median())

# # 3) 피처 컬럼 재정의
# num_cols = [
#     'Age', 'Annual Income', 'Health Score', 
#     'Credit Score', 'Vehicle Age', 'Insurance Duration'
# ]
# # 기존 Credit Score 플래그 추가
# num_cols += ['CreditScore_missing_flag']

# cat_cols = ['Customer Feedback', 'Marital Status', 'Number of Dependents']

# # 4) 범주형을 문자열로 통일
# df[cat_cols] = df[cat_cols].astype(str)

# # 5) 원-핫 인코딩
# ohe_df = pd.get_dummies(
#     df[cat_cols], 
#     prefix=cat_cols, 
#     dummy_na=False
# )

# # 6) 수치형 스케일링
# scaler = StandardScaler()
# scaled_num = scaler.fit_transform(df[num_cols])
# scaled_df = pd.DataFrame(
#     scaled_num, 
#     columns=num_cols, 
#     index=df.index
# )

# # 7) 최종 피처셋
# X_base = pd.concat([scaled_df, ohe_df], axis=1)

# print("X_base shape:", X_base.shape)
# print(X_base.head())


In [2]:
# print("결측 제거 후 행 개수 :", len(train))

# print(train.isna().sum())

In [3]:
# from sklearn.neighbors import KNeighborsClassifier

# # 1) Occupation 결측 마스크
# mask_occ = df['Occupation'].isna()

# # 2) 학습(결측 없는 행)용 데이터
# X_train_occ = X_base.loc[~mask_occ]
# y_train_occ = df.loc[~mask_occ, 'Occupation']

# # 3) 예측(결측 있는 행)용 데이터
# X_pred_occ = X_base.loc[mask_occ]

# # 4) KNN 분류기 초기화 & 학습
# knn_clf = KNeighborsClassifier(
#     n_neighbors=5,      # 이웃 개수
#     weights='distance', # 거리 가중치
#     metric='euclidean'  # 거리 척도
# )
# knn_clf.fit(X_train_occ, y_train_occ)

# # 5) 결측치 예측 및 채우기
# df.loc[mask_occ, 'Occupation_imputed'] = knn_clf.predict(X_pred_occ)

# # 6) 결과 확인
# print("Imputed Occupation 분포:")
# print(df['Occupation_imputed'].value_counts())


In [4]:
# from sklearn.neighbors import KNeighborsRegressor

# # 1) Previous Claims 결측 마스크
# mask_prev = df['Previous Claims'].isna()

# # 2) 학습용 데이터 (결측 없는 행)
# X_train_prev = X_base.loc[~mask_prev]
# y_train_prev = df.loc[~mask_prev, 'Previous Claims']

# # 3) 예측용 데이터 (결측 있는 행)
# X_pred_prev = X_base.loc[mask_prev]

# # 4) KNN 회귀기 초기화 및 학습
# knn_reg = KNeighborsRegressor(
#     n_neighbors=5,      # 이웃 개수
#     weights='distance', # 거리 가중치
#     metric='euclidean'  # 거리 척도
# )
# knn_reg.fit(X_train_prev, y_train_prev)

# # 5) 결측치 예측 및 채우기
# df.loc[mask_prev, 'PreviousClaims_imputed'] = knn_reg.predict(X_pred_prev)

# # 6) 결과 확인
# print("Imputed Previous Claims 통계:")
# print(df['PreviousClaims_imputed'].describe())


In [5]:
# import pandas as pd
# from catboost import CatBoostRegressor
# from sklearn.model_selection import cross_val_score

# # ——————————————————————————————
# # 1) train5 준비 (단순 결측 대체)
# train5 = train.copy()
# train5['Occupation'] = train5['Occupation'].fillna('Unknown')
# train5['Previous Claims'] = train5['Previous Claims'].fillna(0)

# # X_base(피처셋)와 합치기
# train5 = pd.concat([X_base, 
#                     train5[['Occupation', 'Previous Claims']]], axis=1)
# train5['target'] = train['Premium Amount']  # 예측 목표 칼럼

# # ——————————————————————————————
# # 2) df_knn 준비 (기존 이름 유지)
# df_knn = pd.concat([X_base, 
#                     df['Occupation_imputed'].rename('Occupation'),
#                     df['PreviousClaims_imputed'].rename('Previous Claims')], axis=1)
# df_knn['target'] = train['Premium Amount']

# # ——————————————————————————————
# # 3) 평가 함수
# def evaluate(dataframe):
#     X = dataframe.drop(columns=['Occupation','Previous Claims','target'])
#     y = dataframe['target']
#     model = CatBoostRegressor(
#         iterations=200,
#         learning_rate=0.05,
#         depth=6,
#         random_seed=42,
#         verbose=False
#     )
#     mae = -cross_val_score(
#         model, X, y, cv=5, scoring='neg_mean_absolute_error'
#     ).mean()
#     rmse = -cross_val_score(
#         model, X, y, cv=5, scoring='neg_root_mean_squared_error'
#     ).mean()
#     r2 = cross_val_score(
#         model, X, y, cv=5, scoring='r2'
#     ).mean()
#     return {'MAE': mae, 'RMSE': rmse, 'R2': r2}

# # ——————————————————————————————
# # 4) 실행 및 결과 출력
# metrics_train5 = evaluate(train5)
# metrics_knn   = evaluate(df_knn)

# print("=== train5 (단순 대체) 모델 성능 ===")
# print(metrics_train5)
# print("\n=== KNN 대체 모델 성능 ===")
# print(metrics_knn)


In [6]:
# import pandas as pd
# from sklearn.preprocessing import StandardScaler
# from sklearn.model_selection import cross_val_score
# from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
# from catboost import CatBoostRegressor

# # --- 0) train, df(원본), X_base 준비는 앞 단계 그대로 쓰면 돼 ---

# # --- 1) KNN 으로 Occupation 결측 채우기 ---
# mask_occ = df['Occupation'].isna()
# X_train_occ = X_base.loc[~mask_occ]
# y_train_occ = df.loc[~mask_occ, 'Occupation']
# X_pred_occ  = X_base.loc[ mask_occ]

# knn_clf = KNeighborsClassifier(n_neighbors=5, weights='distance', metric='euclidean')
# knn_clf.fit(X_train_occ, y_train_occ)
# df.loc[mask_occ, 'Occupation_imputed'] = knn_clf.predict(X_pred_occ)

# # --- 2) KNN 으로 Previous Claims 결측 채우기 ---
# mask_prev = df['Previous Claims'].isna()
# X_train_prev = X_base.loc[~mask_prev]
# y_train_prev = df.loc[~mask_prev, 'Previous Claims']
# X_pred_prev  = X_base.loc[ mask_prev]

# knn_reg = KNeighborsRegressor(n_neighbors=5, weights='distance', metric='euclidean')
# knn_reg.fit(X_train_prev, y_train_prev)
# df.loc[mask_prev, 'PreviousClaims_imputed'] = knn_reg.predict(X_pred_prev)

# # --- 3) train5(단순 대체) / df_knn(KNN 대체) 데이터셋 구성 ---
# train5 = train.copy()
# train5['Occupation']        = train5['Occupation'].fillna('Unknown')
# train5['Previous Claims']   = train5['Previous Claims'].fillna(0)
# train5 = pd.concat([X_base, train5[['Occupation','Previous Claims']]], axis=1)
# train5['target'] = train['Premium Amount']

# df_knn = pd.concat([
#     X_base,
#     df['Occupation_imputed'].rename('Occupation'),
#     df['PreviousClaims_imputed'].rename('Previous Claims')
# ], axis=1)
# df_knn['target'] = train['Premium Amount']

# # --- 4) 모델 평가 함수 정의 ---
# def evaluate(df_):
#     X = df_.drop(columns=['Occupation','Previous Claims','target'])
#     y = df_['target']
#     model = CatBoostRegressor(
#         iterations=200, learning_rate=0.05, depth=6,
#         random_seed=42, verbose=False
#     )
#     mae  = -cross_val_score(model, X, y, cv=5, scoring='neg_mean_absolute_error').mean()
#     rmse = -cross_val_score(model, X, y, cv=5, scoring='neg_root_mean_squared_error').mean()
#     r2   = cross_val_score(model, X, y, cv=5, scoring='r2').mean()
#     return mae, rmse, r2

# # --- 5) 실행 및 결과 확인 ---
# mae5, rmse5, r25 = evaluate(train5)
# maek, rmsek, r2k = evaluate(df_knn)

# print(f"--- train5 (단순 대체) ---\n MAE: {mae5:.2f}\n RMSE: {rmse5:.2f}\n R2: {r25:.4f}")
# print(f"\n--- df_knn (KNN 대체) ---\n MAE: {maek:.2f}\n RMSE: {rmsek:.2f}\n R2: {r2k:.4f}")


# TRAIN 5 KNN방법 -> catboost  _실험

In [17]:
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# 1) 데이터 준비
df     = train.copy()
target = 'Premium Amount'
X      = df.drop(columns=[target])
y      = df[target]

# 2) 범주형 전처리 + 숫자형 결측 처리
cat_features = X.select_dtypes(include=['object','category']).columns.tolist()
X[cat_features] = X[cat_features].astype(str).fillna('Missing')
num_features = X.select_dtypes(include=['number']).columns.tolist()
X[num_features] = X[num_features].fillna(X[num_features].median())

# 3) 학습/검증 분할
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 4) CatBoost 모델 정의 & 학습
model = CatBoostRegressor(
    iterations=500,
    learning_rate=0.05,
    depth=6,
    random_seed=42,
    verbose=100
)
model.fit(
    X_train, y_train,
    cat_features=cat_features,
    eval_set=(X_val, y_val),
    use_best_model=True
)

# 5) 예측 및 평가
y_pred = model.predict(X_val)
mae    = mean_absolute_error(y_val, y_pred)
mse    = mean_squared_error(y_val, y_pred)
rmse   = np.sqrt(mse)
r2     = r2_score(y_val, y_pred)

print(f"MAE:  {mae:.2f}")
print(f"MSE:  {mse:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R2:   {r2:.4f}")


0:	learn: 863.6532513	test: 866.0782078	best: 866.0782078 (0)	total: 1s	remaining: 8m 21s
100:	learn: 850.7399443	test: 852.9504233	best: 852.9504233 (100)	total: 1m 35s	remaining: 6m 17s
200:	learn: 848.4086361	test: 850.6780477	best: 850.6780477 (200)	total: 3m 4s	remaining: 4m 34s
300:	learn: 846.9344445	test: 849.2600870	best: 849.2600870 (300)	total: 4m 51s	remaining: 3m 12s
400:	learn: 845.5264231	test: 847.9348910	best: 847.9348910 (400)	total: 6m 30s	remaining: 1m 36s
499:	learn: 845.0583962	test: 847.5491308	best: 847.5491308 (499)	total: 8m 9s	remaining: 0us

bestTest = 847.5491308
bestIteration = 499

MAE:  646.41
MSE:  718339.53
RMSE: 847.55
R2:   0.0443


# TEST6 TRAIN6 추천 모델: DeepIFSAC (DeepIFSA, 2025)

In [11]:
# import pandas as pd
# import numpy as np
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import OneHotEncoder, StandardScaler
# from deepifsac import DeepIFSAC    # 실제 설치된 구현체를 사용하세요
# from catboost import CatBoostRegressor
# from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# # 1) 원본 복제 & 분리
# df = train.copy()
# y = df.pop('Premium Amount')  # target만 떼어놓기

# # 2) Train/Test 분리 (누수 방지)
# X_train, X_val, y_train, y_val = train_test_split(
#     df, y, test_size=0.2, random_state=42
# )

# # 3) 수치형 / 범주형 컬럼 정의
# num_cols = ['Age','Annual Income','Health Score','Vehicle Age','Insurance Duration']
# cat_cols = ['Occupation','Previous Claims','Number of Dependents',
#             'Credit Score','Customer Feedback','Marital Status']

# # 4) 범주형 → 문자열로, 수치형 그대로
# X_train[cat_cols] = X_train[cat_cols].fillna('Missing').astype(str)
# X_val  [cat_cols] = X_val [cat_cols].fillna('Missing').astype(str)

# # 5) DeepIFSAC Imputation (Train에만 fit, Val에 transform)
# imputer = DeepIFSAC(embed_dim=64, n_heads=4, n_layers=3, epochs=50, batch_size=512)
# X_train_imp = imputer.fit_transform(X_train)
# X_val_imp   = imputer.transform(X_val)

# # (imputer 출력이 numpy array라면 DataFrame으로 복원)
# X_train_imp = pd.DataFrame(X_train_imp, columns=X_train.columns, index=X_train.index)
# X_val_imp   = pd.DataFrame(X_val_imp,   columns=X_val.columns,   index=X_val.index)

# # 6) 수치형 스케일링 & 범주형 인코딩
# scaler = StandardScaler()
# X_train_num = pd.DataFrame(scaler.fit_transform(X_train_imp[num_cols]), columns=num_cols, index=X_train.index)
# X_val_num   = pd.DataFrame(scaler.transform(X_val_imp[num_cols]), columns=num_cols, index=X_val.index)

# ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
# X_train_cat = pd.DataFrame(ohe.fit_transform(X_train_imp[cat_cols]),
#                            columns=ohe.get_feature_names_out(cat_cols), index=X_train.index)
# X_val_cat   = pd.DataFrame(ohe.transform(X_val_imp[cat_cols]),
#                            columns=ohe.get_feature_names_out(cat_cols), index=X_val.index)

# X_train_final = pd.concat([X_train_num, X_train_cat], axis=1)
# X_val_final   = pd.concat([X_val_num,   X_val_cat],   axis=1)

# # 7) CatBoost 학습 및 평가
# model = CatBoostRegressor(
#     iterations=500, learning_rate=0.05, depth=6,
#     random_seed=42, verbose=False
# )
# model.fit(X_train_final, y_train,
#           eval_set=(X_val_final, y_val), use_best_model=True)

# y_pred = model.predict(X_val_final)
# print("MAE: ", mean_absolute_error(y_val, y_pred))
# print("RMSE:", np.sqrt(mean_squared_error(y_val, y_pred)))
# print("R2:  ", r2_score(y_val, y_pred))
