In [15]:
import pandas as pd

In [None]:
test = pd.read_csv('/Users/t2023-m0149/Documents/spartacodingclub/Projects/4. insurance/test.csv')

In [None]:
sample = pd.read_csv('/Users/t2023-m0149/Documents/spartacodingclub/Projects/4. insurance/sample.csv')

In [None]:
train = pd.read_csv('/Users/t2023-m0149/Documents/spartacodingclub/Projects/4. insurance/train.csv')

In [19]:
print("전체 행 개수 :", len(train))
print(train.isna().sum())     

전체 행 개수 : 1200000
id                           0
Age                      18705
Gender                       0
Annual Income            44949
Marital Status           18529
Number of Dependents    109672
Education Level              0
Occupation              358075
Health Score             74076
Location                     0
Policy Type                  0
Previous Claims         364029
Vehicle Age                  6
Credit Score            137882
Insurance Duration           1
Policy Start Date            0
Customer Feedback        77824
Smoking Status               0
Exercise Frequency           0
Property Type                0
Premium Amount               0
dtype: int64


## 보험회사 피처엔지니어링

In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.neural_network import MLPRegressor

# 파생변수 및 전처리 함수(미국 보험사 논문 참조)
def preprocess(df):
    # 소득 로그변환, 결측치 처리
    df = df.dropna(subset=['Annual Income'])
    df['Annual Income'] = np.log1p(df['Annual Income'])
    for col in ['Health Score', 'Credit Score', 'Age', 'Vehicle Age']:
        df[col] = df[col].fillna(df[col].mean())
    df['Previous Claims'] = df['Previous Claims'].fillna(df['Previous Claims'].median())
    # 직업/피드백 결측 빈도 인코딩
    for col in ['Occupation', 'Customer Feedback']:
        df[col] = df[col].fillna('Unknown')
        freq_map = df[col].value_counts().to_dict()
        df[col] = df[col].map(freq_map)
    # 'Age_Group': 20대, 30대 등 구간화
    df['Age_Group'] = pd.cut(df['Age'], bins=[0,20,30,40,50,60,70,100], labels=False)
    # 'Policy_Months_Passed': 가입 후 경과 월(위험률 업계에서 중요)
    df['Policy Start Date'] = pd.to_datetime(df['Policy Start Date'], errors='coerce')
    today = pd.to_datetime('today')
    df['Policy_Months_Passed'] = ((today.year - df['Policy Start Date'].dt.year) * 12 +
                                  (today.month - df['Policy Start Date'].dt.month)).fillna(0)
    # 과거 청구 건수(이상치 클리핑)
    df['Previous Claims'] = df['Previous Claims'].clip(upper=10)
    # 주요 범주형 변수 인코딩
    label_cols = [
        'Marital Status', 'Education Level', 'Policy Type', 'Gender',
        'Smoking Status', 'Number of Dependents', 'Property Type',
        'Exercise Frequency', 'Location'
    ]
    for col in label_cols:
        df[col] = df[col].astype(str)
        df[col] = LabelEncoder().fit_transform(df[col])
    # 불필요 컬럼 삭제
    for col in ['id', 'Insurance Duration', 'Policy Start Date']:
        if col in df.columns:
            df.drop(col, axis=1, inplace=True)
    return df

# 신경망모델(MLPRegressor) 학습 및 평가
def train_and_evaluate(df):
    df_processed = preprocess(df)
    X = df_processed.drop('Premium Amount', axis=1)
    y = df_processed['Premium Amount']
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    model = MLPRegressor(hidden_layer_sizes=(100, 100), activation='relu', solver='adam', max_iter=500, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    # 평가지표
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    mae = mean_absolute_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)
    print(f"MLPRegressor Model Performance:")
    print(f"RMSE: {rmse:.2f}")
    print(f"MAE: {mae:.2f}")
    print(f"R2 Score: {r2:.4f}")
    return model

# 실제 사용 예시:
df = pd.read_csv('train.csv')
model = train_and_evaluate(df)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Annual Income'] = np.log1p(df['Annual Income'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(df[col].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Previous Claims'] = df['Previous Claims'].fillna(df['Previous Claims'].median())
A value is trying to be set

MLPRegressor Model Performance:
RMSE: 861.36
MAE: 671.90
R2 Score: -0.0003
