<a href="https://colab.research.google.com/github/jc890/python/blob/master/Assignment03.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [50]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Read the insurance CSV into a DataFrame. The path is absolute (rooted at
# '/'), so the file must exist at that exact location in the runtime.
DATA_PATH = '/insurance.csv'
data = pd.read_csv(DATA_PATH)

# Summary statistics for the numeric columns.
data.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [9]:
# Separate the feature matrix (every column except 'charges') from the
# regression target ('charges'). `columns=` already names the axis to drop
# from, so no `axis` argument is needed.
x = data.drop(columns='charges')
y = data['charges']

In [11]:
# Partition the feature columns by dtype: numeric columns first, then every
# remaining (non-numeric) column, each list kept in the frame's column order.
numerical_features = list(x.select_dtypes(include=[np.number]).columns)
categorical_features = [col for col in x.columns if col not in numerical_features]

In [27]:
# Numeric preprocessing: fill missing values with the column median, then
# standardize with StandardScaler.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numerical_preprocess = Pipeline(numeric_steps)

In [28]:
# Categorical preprocessing: fill missing values with the most frequent
# category, then one-hot encode.
# NOTE(review): every category gets its own indicator column, so the encoded
# columns of one feature always sum to 1; if the regressor also fits an
# intercept the design is collinear — consider OneHotEncoder(drop='first')
# when individual coefficients need to be interpreted.
categorical_preprocess = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # handle_unknown='ignore' encodes a category that was not present during
    # fit as an all-zero row instead of raising at transform/predict time.
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [37]:
# Route each column group through its own preprocessing pipeline:
# (name, transformer, columns) triples consumed by ColumnTransformer.
column_pipelines = [
    ('numerical', numerical_preprocess, numerical_features),
    ('categorical', categorical_preprocess, categorical_features),
]
preprocess = ColumnTransformer(transformers=column_pipelines)

In [39]:
# End-to-end model: column preprocessing followed by a linear regression.
model_steps = [
    ('preprocess', preprocess),
    ('regressor', LinearRegression()),
]
pipe = Pipeline(steps=model_steps)

In [40]:
# Hold out a test_size=0.2 share of the rows for evaluation; the fixed
# random_state makes the split repeatable across runs.
split_parts = train_test_split(x, y, test_size=0.2, random_state=42)
x_train, x_test, y_train, y_test = split_parts

# Fit the whole pipeline on the training rows, then predict the held-out rows.
y_pred = pipe.fit(x_train, y_train).predict(x_test)

In [51]:
# Score the held-out predictions, then report both metrics.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("MSE:", mse)
print("R^2:", r2)


MSE: 33596915.851361476
R^2: 0.7835929767120722


In [53]:
# Display the training feature rows produced by the split.
x_train


Unnamed: 0,age,sex,bmi,children,smoker,region
560,46,female,19.950,2,no,northwest
1285,47,female,24.320,0,no,northeast
1142,52,female,24.860,0,no,southeast
969,39,female,34.320,5,no,southeast
486,54,female,21.470,3,no,northwest
...,...,...,...,...,...,...
1095,18,female,31.350,4,no,northeast
1130,39,female,23.870,5,no,southeast
1294,58,male,25.175,0,no,northeast
860,37,female,47.600,2,yes,southwest


In [54]:
# Display the held-out feature rows produced by the split.
x_test


Unnamed: 0,age,sex,bmi,children,smoker,region
764,45,female,25.175,2,no,northeast
887,36,female,30.020,0,no,northwest
890,64,female,26.885,0,yes,northwest
1293,46,male,25.745,3,no,northwest
259,19,male,31.920,0,yes,northwest
...,...,...,...,...,...,...
109,63,male,35.090,0,yes,southeast
575,58,female,27.170,0,no,northwest
535,38,male,28.025,1,no,northeast
543,54,female,47.410,0,yes,southeast


In [56]:
# Show the training and held-out target Series together as one tuple.
(y_train, y_test)


(560      9193.83850
 1285     8534.67180
 1142    27117.99378
 969      8596.82780
 486     12475.35130
            ...     
 1095     4561.18850
 1130     8582.30230
 1294    11931.12525
 860     46113.51100
 1126    10214.63600
 Name: charges, Length: 1070, dtype: float64,
 764      9095.06825
 887      5272.17580
 890     29330.98315
 1293     9301.89355
 259     33750.29180
            ...     
 109     47055.53210
 575     12222.89830
 535      6067.12675
 543     63770.42801
 846      9872.70100
 Name: charges, Length: 268, dtype: float64)

In [57]:
# Pull the fitted steps out of the pipeline: the preprocessor supplies the
# names of the transformed columns, the regressor supplies one coefficient
# per transformed column.
fitted_steps = pipe.named_steps
feature_names = fitted_steps["preprocess"].get_feature_names_out()
coefs = fitted_steps["regressor"].coef_


In [58]:
# Report each transformed feature next to its fitted coefficient.
print("Coefficients:")
for name, weight in zip(feature_names, coefs):
    line = f"{name}: {weight}"
    print(line)

Coefficients:
numerical__age: 3614.975414827998
numerical__bmi: 2036.2281228967177
numerical__children: 516.8902471991507
categorical__sex_female: 9.29584582050855
categorical__sex_male: -9.295845820504471
categorical__smoker_no: -11825.56442788061
categorical__smoker_yes: 11825.564427880612
categorical__region_northeast: 459.58524424372945
categorical__region_northwest: 88.90791801592897
categorical__region_southeast: -198.27905232092397
categorical__region_southwest: -350.21410993873786
