In [2]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('./data/premium.csv')
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [6]:
df[df.duplicated(keep=False)]

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
195,19,male,30.59,0,no,northwest,1639.5631
581,19,male,30.59,0,no,northwest,1639.5631


In [15]:
df.drop_duplicates()
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   str    
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   str    
 5   region    1338 non-null   str    
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), str(3)
memory usage: 73.3 KB


## bmi의 널처리

In [16]:
df['bmi'] = df['bmi'].fillna(df['bmi'].mean())
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [17]:
df['region'].unique()

<StringArray>
['southwest', 'southeast', 'northwest', 'northeast']
Length: 4, dtype: str

## 문자열 데이터의 수치화 > LabelEncoder

In [18]:
from sklearn.preprocessing import LabelEncoder

In [22]:
col_list = ['sex', 'smoker', 'region']
for col in col_list :
  enc = LabelEncoder()
  df[col] = enc.fit_transform(df[col])

df.head()
  

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


## 스케일링 하기

In [27]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [28]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [31]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [33]:
df['charges'].describe()

count     1338.000000
mean     13270.422265
std      12110.011237
min       1121.873900
25%       4740.287150
50%       9382.033000
75%      16639.912515
max      63770.428010
Name: charges, dtype: float64

In [36]:
from sklearn.preprocessing import StandardScaler

# charges 제외! bmi만 스케일링
scale_cols = ['bmi']

scaler = StandardScaler()

X_train[scale_cols] = scaler.fit_transform(X_train[scale_cols])
X_test[scale_cols] = scaler.transform(X_test[scale_cols])

# 선형회귀

In [38]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
y_pred = lr_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
mae, mse, rmse, r2

(4172.783193083617,
 33640928.3211508,
 np.float64(5800.0800271333155),
 0.7833094802263592)

# 다항회귀

In [42]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

In [43]:
degree = [2,3,4]

for deg in degree :
  model_poly = Pipeline( [('poly', PolynomialFeatures(degree=deg, include_bias=False)), ('linear', LinearRegression())])
  model_poly.fit(X_train, y_train)
  poly_pred = model_poly.predict(X_test)
  
  mae = mean_absolute_error(y_test, poly_pred)
  mse = mean_squared_error(y_test, poly_pred)
  rmse = np.sqrt(mse)
  r2 = r2_score(y_test, poly_pred)
  print (f'Degree:{deg}, Mae:{mae:.4f}, MSE:{mse:.4f}, RMSE:{rmse:.4f}, R^2: {r2:.4f}')

Degree:2, Mae:2740.6271, MSE:20737594.4176, RMSE:4553.8549, R^2: 0.8664
Degree:3, Mae:2818.6301, MSE:21577598.7166, RMSE:4645.1694, R^2: 0.8610
Degree:4, Mae:3120.6621, MSE:34379524.8383, RMSE:5863.4056, R^2: 0.7786


In [44]:
from sklearn.ensemble import RandomForestRegressor
model_rf = RandomForestRegressor(n_estimators=100, random_state=0)
model_rf.fit(X_train, y_train)
y_pred_rf = model_rf.predict(X_test)
mae = mean_absolute_error(y_test, y_pred_rf)
mse = mean_squared_error(y_test, y_pred_rf)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred_rf)
mae, mse, rmse, r2

(2499.0538510119104,
 21474275.29598436,
 np.float64(4634.034451316084),
 0.861678255985478)

In [45]:
model_rf.feature_importances_

array([0.13626757, 0.00692881, 0.2079445 , 0.02268396, 0.61074257,
       0.01543259])