In [71]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest ,chi2
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv('insurance.csv')

In [3]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
df.shape

(1338, 7)

In [5]:
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [15]:
df['region'].value_counts()

southeast    364
southwest    325
northwest    325
northeast    324
Name: region, dtype: int64

In [7]:
x_train,x_test,y_train,y_test = train_test_split(df.drop('charges',axis=1),
                                                df['charges'],
                                                test_size=0.2,
                                                random_state=0)

In [9]:
x_train.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
621,37,male,34.1,4,yes,southwest
194,18,male,34.43,0,no,southeast
240,23,female,36.67,2,yes,northeast
1168,32,male,35.2,2,no,southwest
1192,58,female,32.395,1,no,northeast


In [12]:
y_train.head()

621     40182.24600
194      1137.46970
240     38511.62830
1168     4670.64000
1192    13019.16105
Name: charges, dtype: float64

In [39]:
# one hot encoding
trf1 = ColumnTransformer([
    ('ohe_sex_smoker_region',OneHotEncoder(sparse_output=False,handle_unknown='ignore',drop='first'),[1,4,5])
],remainder='passthrough')

In [40]:
# scale

trf2 = ColumnTransformer([
    ('scale',StandardScaler(),[6])
],remainder='passthrough')

In [45]:
trf4 = LinearRegression()

## Create a pipeline

In [46]:
pipe = Pipeline([
    ('trf1',trf1),
    ('trf2',trf2),
    ('trf4',trf4)
])

In [47]:
pipe.fit(x_train,y_train)

In [48]:
y_pred = pipe.predict(x_test)

In [49]:
y_pred

array([1.11699271e+04, 9.48670909e+03, 3.81811231e+04, 1.62663133e+04,
       6.91464801e+03, 3.96348488e+03, 1.57939691e+03, 1.43852566e+04,
       9.01257970e+03, 7.50846068e+03, 4.49176728e+03, 1.02795839e+04,
       8.80129751e+03, 3.79802013e+03, 2.79262010e+04, 1.07151158e+04,
       1.12889756e+04, 6.10501768e+03, 8.24104117e+03, 2.71445089e+04,
       3.36440910e+04, 1.43551043e+04, 1.17372043e+04, 3.21374335e+04,
       4.17005913e+03, 9.25496051e+03, 1.08433751e+03, 9.80417085e+03,
       3.77104596e+03, 1.04318587e+04, 9.00931722e+03, 4.00749509e+04,
       1.56889543e+04, 1.38794545e+04, 2.47597127e+04, 5.16638285e+03,
       1.26109277e+04, 3.07691018e+04, 3.35498325e+04, 3.67154946e+03,
       3.97568613e+03, 3.98729942e+03, 3.05285774e+04, 3.95053023e+04,
       2.78105036e+04, 5.09258923e+03, 1.06042481e+04, 7.82952256e+03,
       3.59255553e+03, 1.02128745e+04, 5.72038147e+03, 3.42627499e+03,
       3.30210242e+04, 3.84738218e+04, 1.60534782e+04, 7.16491905e+03,
      

In [93]:
y_test = y_test.to_numpy()

In [118]:
pipe.score(x_test,y_pred)

1.0

In [97]:
y_test.shape

(268,)

In [98]:
y_pred.shape

(268,)

In [101]:
x_test

Unnamed: 0,age,sex,bmi,children,smoker,region
578,52,male,30.200,1,no,southwest
610,47,female,29.370,1,no,southeast
569,48,male,40.565,2,yes,northwest
1034,61,male,38.380,0,no,northwest
198,51,female,18.050,0,no,northwest
...,...,...,...,...,...,...
1084,62,female,30.495,2,no,northwest
726,41,male,28.405,1,no,northwest
1132,57,male,40.280,0,no,northeast
725,30,female,39.050,3,yes,southeast


In [102]:
y_pred

array([1.11699271e+04, 9.48670909e+03, 3.81811231e+04, 1.62663133e+04,
       6.91464801e+03, 3.96348488e+03, 1.57939691e+03, 1.43852566e+04,
       9.01257970e+03, 7.50846068e+03, 4.49176728e+03, 1.02795839e+04,
       8.80129751e+03, 3.79802013e+03, 2.79262010e+04, 1.07151158e+04,
       1.12889756e+04, 6.10501768e+03, 8.24104117e+03, 2.71445089e+04,
       3.36440910e+04, 1.43551043e+04, 1.17372043e+04, 3.21374335e+04,
       4.17005913e+03, 9.25496051e+03, 1.08433751e+03, 9.80417085e+03,
       3.77104596e+03, 1.04318587e+04, 9.00931722e+03, 4.00749509e+04,
       1.56889543e+04, 1.38794545e+04, 2.47597127e+04, 5.16638285e+03,
       1.26109277e+04, 3.07691018e+04, 3.35498325e+04, 3.67154946e+03,
       3.97568613e+03, 3.98729942e+03, 3.05285774e+04, 3.95053023e+04,
       2.78105036e+04, 5.09258923e+03, 1.06042481e+04, 7.82952256e+03,
       3.59255553e+03, 1.02128745e+04, 5.72038147e+03, 3.42627499e+03,
       3.30210242e+04, 3.84738218e+04, 1.60534782e+04, 7.16491905e+03,
      

In [103]:
df['charges'][578]

9724.53

In [112]:
test_input2 = np.array([71, 'female', 27.454900, 0, 'no', 'southwest'],dtype=object).reshape(1,6)

In [113]:
pipe.predict(test_input2)



array([14646.53869699])