In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv('insurance.csv')
data.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [3]:
# 결측치 제거
data.dropna(axis=0)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [4]:
# categorical -> numerical
new_data = pd.get_dummies(data[['sex', 'smoker', 'region']])
new_data

Unnamed: 0,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,1,0,0,1,0,0,0,1
1,0,1,1,0,0,0,1,0
2,0,1,1,0,0,0,1,0
3,0,1,1,0,0,1,0,0
4,0,1,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...
1333,0,1,1,0,0,1,0,0
1334,1,0,1,0,1,0,0,0
1335,1,0,1,0,0,0,1,0
1336,1,0,1,0,0,0,0,1


In [5]:
# dataframe 수정

data.drop(['sex', 'smoker', 'region'], axis='columns', inplace=True)

# 중복되는 category 제거
new_data.drop(['sex_female', 'smoker_no'], axis='columns', inplace=True)

# 기존 dataframe과 병합
data = pd.concat([data, new_data], axis=1)

data

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.900,0,16884.92400,0,1,0,0,0,1
1,18,33.770,1,1725.55230,1,0,0,0,1,0
2,28,33.000,3,4449.46200,1,0,0,0,1,0
3,33,22.705,0,21984.47061,1,0,0,1,0,0
4,32,28.880,0,3866.85520,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0,0,1,0,0
1334,18,31.920,0,2205.98080,0,0,1,0,0,0
1335,18,36.850,0,1629.83350,0,0,0,0,1,0
1336,21,25.800,0,2007.94500,0,0,0,0,0,1


In [6]:
x_data = data[['age', 'bmi', 'children', 'sex_male', 'smoker_yes', 'region_northeast', 'region_northwest', 'region_southeast', 'region_southwest']]
y_data = data[['charges']]

# Normalization

normalizer = MinMaxScaler()
x_data = normalizer.fit_transform(x_data)

In [7]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, train_size = 0.8, test_size = 0.2)

model = LinearRegression()
model.fit(x_train, y_train)

accuracy = model.score(x_test, y_test)
print("Linear Regression Score:", accuracy)

Linear Regression Score: 0.7629216546011564


In [10]:
# polynomial regression

for d in range(2, 5+1):
    x_poly_data = PolynomialFeatures(degree = d).fit_transform(x_data)

    x_train, x_test, y_train, y_test = train_test_split(x_poly_data, y_data, train_size = 0.8, test_size = 0.2)

    model = LinearRegression()
    model.fit(x_train, y_train)
    
    accuracy = model.score(x_train, y_train)
    print("Train score with degree {}:".format(d), accuracy)

    accuracy = model.score(x_test, y_test)
    print("Polynomial Regression with degree {} Score:".format(d), accuracy)
    print()

Train score with degree 2: 0.842738900868399
Polynomial Regression with degree 2 Score: 0.860472978070063

Train score with degree 3: 0.8597931605602591
Polynomial Regression with degree 3 Score: 0.8259415204385473

Train score with degree 4: 0.8566151611949415
Polynomial Regression with degree 4 Score: 0.7154381809679637

Train score with degree 5: 0.913525406166482
Polynomial Regression with degree 5 Score: -0.3264607709913352

