<a href="https://colab.research.google.com/github/hermesgido/machine-learning-models/blob/main/insurance__charges_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error



In [6]:

# Load the dataset into a pandas DataFrame
from google.colab import files
uploaded = files.upload()
import io
df = pd.read_csv(io.BytesIO(uploaded['05-insurance_charges.csv']))


# Explore the data
print(df.describe())
print(df['sex'].value_counts())
print(df['smoker'].value_counts())
print(df['region'].value_counts())

# Preprocess the data



Saving 05-insurance_charges.csv to 05-insurance_charges (2).csv
               age          bmi     children       charges
count  1338.000000  1338.000000  1338.000000   1338.000000
mean     39.207025    30.663397     1.094918  13270.422265
std      14.049960     6.098187     1.205493  12110.011237
min      18.000000    15.960000     0.000000   1121.873900
25%      27.000000    26.296250     0.000000   4740.287150
50%      39.000000    30.400000     1.000000   9382.033000
75%      51.000000    34.693750     2.000000  16639.912515
max      64.000000    53.130000     5.000000  63770.428010
male      676
female    662
Name: sex, dtype: int64
no     1064
yes     274
Name: smoker, dtype: int64
southeast    364
southwest    325
northwest    325
northeast    324
Name: region, dtype: int64


In [10]:
# Handle missing values
df.fillna(df.mean(), inplace=True)

# Encode categorical features
#df = pd.get_dummies(df, columns=['sex', 'smoker', 'region'])


In [11]:
# Split the data into training and test sets
X = df.drop('charges', axis=1)
y = df['charges']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
preprocessor = ColumnTransformer(
    [('scaler', StandardScaler(), ['age', 'bmi', 'children'])],
    remainder='passthrough'
)

model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])


In [13]:
# Train the model on the training set
model.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('scaler', StandardScaler(),
                                                  ['age', 'bmi',
                                                   'children'])])),
                ('regressor', LinearRegression())])

In [14]:

# Evaluate the model on the test set
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae:.2f}')

Mean Absolute Error: 4181.19


In [15]:
# Save the model
import joblib
joblib.dump(model, 'insurance_cost_predictor.pkl')

['insurance_cost_predictor.pkl']

In [21]:
# Import the trained model
model = joblib.load('insurance_cost_predictor.pkl')

# Define the input data
input_data = {
    'age': 70,
    'bmi': 66,
    'children': 1,
    'sex_female': 1,
    'sex_male': 0,
    'smoker_no': 1,
    'smoker_yes': 0,
    'region_northeast': 1,
    'region_northwest': 0,
    'region_southeast': 0,
    'region_southwest': 1
}

# Convert the input data to a pandas DataFrame
input_data = pd.DataFrame([input_data])

# Make a prediction using the model
prediction = model.predict(input_data)[0]
print(f'Predicted insurance cost: {prediction:.2f}')


Predicted insurance cost: 28380.25
