# 빅데이터 분석기사 실기 준비 놀이터
- https://www.kaggle.com/datasets/agileteam/bigdatacertificationkr

## T2-5. Insurance Forecast (Regression) / 보험액 예측

### 5. 보험액 예측
- 데이터셋 : insurance.csv
- https://www.kaggle.com/code/agileteam/insurance-starter-tutorial/notebook

#### 0. 시험 환경 세팅

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def exam_data_load(df, target, id_name="", null_name=""):
    """Split *df* 80/20 into exam-style feature and target frames.

    Args:
        df: Source DataFrame that contains the *target* column. It is not
            modified by this function.
        target: Name of the column to predict.
        id_name: Name of an existing identifier column. When left empty, the
            index is reset into a column and a column named ``index`` is
            renamed to ``id``, which is then used as the identifier.
        null_name: Placeholder value marking missing data; every cell equal
            to it is replaced with NaN. An empty string disables this step.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``. The ``X_*`` frames
        hold every column except *target*; the ``y_*`` frames hold the
        identifier column plus *target*.
    """
    if id_name == "":
        df = df.reset_index().rename(columns={"index": "id"})
        id_name = "id"

    if null_name != "":
        # mask() returns a new frame, so the caller's DataFrame stays intact;
        # the earlier in-place assignment modified it when id_name was given.
        df = df.mask(df == null_name, np.nan)

    # Fixed seed keeps the split reproducible between runs.
    X_train, X_test = train_test_split(df, test_size=0.2, random_state=2021)

    y_train = X_train[[id_name, target]]
    X_train = X_train.drop(columns=[target])

    y_test = X_test[[id_name, target]]
    X_test = X_test.drop(columns=[target])
    return X_train, X_test, y_train, y_test
    
# Read the raw insurance table and build the exam-style train/test pieces
df = pd.read_csv("./data/insurance.csv")
X_train, X_test, y_train, y_test = exam_data_load(df, 'charges')

# Shapes of the four pieces, shown as the cell result
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((1070, 7), (268, 7), (1070, 2), (268, 2))

#### 1. 라이브러리 및 데이터 호출

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Remove the id column from the training features and set the test ids aside.
# NOTE: `id` shadows the builtin; the name is kept because the cell that
# builds `out_data` reads it.
X_train = X_train.drop(columns=['id'])
id = X_test.pop('id')
y = y_train['charges']

# Shapes after the id column has been removed, shown as the cell result
tuple(obj.shape for obj in (X_train, X_test, y))

((1070, 6), (268, 6), (1070,))

#### 2. EDA

In [4]:
# info() prints its summary itself and returns None, so display() also shows
# a literal None ahead of the head() preview and the per-column null counts.
train_views = (X_train.info(), X_train.head(), X_train.isnull().sum())
display(*train_views)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1070 entries, 209 to 1140
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1070 non-null   int64  
 1   sex       1070 non-null   object 
 2   bmi       1070 non-null   float64
 3   children  1070 non-null   int64  
 4   smoker    1070 non-null   object 
 5   region    1070 non-null   object 
dtypes: float64(1), int64(2), object(3)
memory usage: 58.5+ KB


None

Unnamed: 0,age,sex,bmi,children,smoker,region
209,40,male,41.23,1,no,northeast
540,34,female,38.0,3,no,southwest
747,19,male,21.755,0,no,northwest
39,60,male,39.9,0,yes,southwest
640,33,male,42.4,5,no,southwest


age         0
sex         0
bmi         0
children    0
smoker      0
region      0
dtype: int64

In [5]:
# Same overview for the test features: info() prints and returns None, which
# display() shows before the head() preview and the per-column null counts.
test_views = (X_test.info(), X_test.head(), X_test.isnull().sum())
display(*test_views)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 268 entries, 1088 to 116
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       268 non-null    int64  
 1   sex       268 non-null    object 
 2   bmi       268 non-null    float64
 3   children  268 non-null    int64  
 4   smoker    268 non-null    object 
 5   region    268 non-null    object 
dtypes: float64(1), int64(2), object(3)
memory usage: 14.7+ KB


None

Unnamed: 0,age,sex,bmi,children,smoker,region
1088,52,male,47.74,1,no,southeast
1157,23,female,23.18,2,no,northwest
1267,24,male,31.065,0,yes,northeast
506,22,male,31.35,1,no,northwest
659,57,female,28.785,4,no,northeast


age         0
sex         0
bmi         0
children    0
smoker      0
region      0
dtype: int64

#### 3. Preprocessing

In [6]:
# Columns holding string categories
obj_col = X_train.select_dtypes(include='object').columns

from sklearn.preprocessing import LabelEncoder

# Learn each column's category-to-integer mapping from the training data only
# and reuse it on the test data. Fitting a second time on X_test could assign
# different integers to the same category whenever the two sets do not
# contain exactly the same labels.
for col in obj_col:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])
    # transform() raises ValueError for a label that was not seen in training
    X_test[col] = le.transform(X_test[col])

In [7]:
from sklearn.preprocessing import MinMaxScaler

# Every column that is not object-typed gets scaled
int_col = X_train.select_dtypes(exclude='object').columns

# Learn the per-column min/max from the training features only, then apply
# the same scaling to both the training and the test features
scaler = MinMaxScaler()
scaler.fit(X_train[int_col])
X_train[int_col] = scaler.transform(X_train[int_col])
X_test[int_col] = scaler.transform(X_test[int_col])

#### 4. Modeling

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.ensemble import RandomForestRegressor , GradientBoostingRegressor
from xgboost import XGBRegressor, XGBRFRegressor

In [9]:
# Hold out 20% of the training rows as a validation set (shuffled, fixed seed)
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=156,
)

In [10]:
# Candidate regressors, all created with the same seed
rf, gb, xgb, xgbrf = (
    estimator(random_state=156)
    for estimator in (
        RandomForestRegressor,
        GradientBoostingRegressor,
        XGBRegressor,
        XGBRFRegressor,
    )
)

models = [rf, gb, xgb, xgbrf]

In [11]:
# Fit every candidate on the training split and report RMSE and R^2 on the
# validation split
for model in models:
    model.fit(X_tr, y_tr)
    pred = model.predict(X_val)

    # The estimator's printed form starts with its class name, up to the first '('
    name = str(model).partition('(')[0]
    rmse = np.sqrt(mean_squared_error(y_val, pred))

    print(name)
    print('RMSE :', rmse)
    print('r2 :', r2_score(y_val, pred))
    print('=' * 100)

RandomForestRegressor
RMSE : 5080.489862198674
r2 : 0.8295489520383348
GradientBoostingRegressor
RMSE : 4753.760354995449
r2 : 0.8507676196303099
XGBRegressor
RMSE : 5309.074549600171
r2 : 0.8138658137617791
XGBRFRegressor
RMSE : 4715.170098805694
r2 : 0.8531806738818499


In [12]:
# Refit a fresh XGBRFRegressor on all training rows, then predict the test rows
xgbrf = XGBRFRegressor(random_state=156).fit(X_train, y)

pred = xgbrf.predict(X_test)

In [13]:
# Submission frame: the test ids next to the predicted charges
out_data = pd.DataFrame({'id': id}).assign(charges=pred)
out_data

Unnamed: 0,id,charges
1088,1088,13286.305664
1157,1157,6757.561523
1267,1267,34499.667969
506,506,5005.382324
659,659,14788.097656
...,...,...
1275,1275,12218.154297
88,88,11229.602539
646,646,6657.268555
654,654,14595.181641


In [14]:
# out_data.to_csv('123456.csv', index=False)

In [15]:
# Provisional scoring of the predictions against the test labels
test_rmse = np.sqrt(mean_squared_error(y_test['charges'], pred))
test_r2 = r2_score(y_test['charges'], pred)
print(test_rmse)
print(test_r2)

4547.373971901542
0.8533363799646512


#### 5. 다른 모델 평가

In [16]:
# Refit every candidate on all training rows and report RMSE and R^2 against
# the test labels
for model in models:
    model.fit(X_train, y)
    pred = model.predict(X_test)

    # The estimator's printed form starts with its class name, up to the first '('
    name = str(model).partition('(')[0]
    rmse = np.sqrt(mean_squared_error(y_test['charges'], pred))

    print(name)
    print('RMSE :', rmse)
    print('r2 :', r2_score(y_test['charges'], pred))
    print('=' * 100)

RandomForestRegressor
RMSE : 4975.733659022783
r2 : 0.824403721365911
GradientBoostingRegressor
RMSE : 4591.698071281115
r2 : 0.850463330668066
XGBRegressor
RMSE : 5246.263999815888
r2 : 0.8047903246074996
XGBRFRegressor
RMSE : 4547.373971901542
r2 : 0.8533363799646512
