In [1]:
import pandas as pd

In [63]:
df=pd.read_csv("insurance.csv")

## 1. Display Top 5 Rows of The Dataset

In [64]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


## 2. Display last 5 Rows of The Dataset

In [65]:
df.tail()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
1333,50,male,31.0,3,no,northwest,10600.55
1334,18,female,31.9,0,no,northeast,2205.98
1335,18,female,36.9,0,no,southeast,1629.83
1336,21,female,25.8,0,no,southwest,2007.95
1337,61,female,29.1,0,yes,northwest,29141.36


## 3. Find Shape of Our Dataset

In [66]:
df.shape

(1338, 7)

## 4. Get Information About Dataset Like Total Number Rows, Total Number of Columns, Datatypes of Each Column And Memory Requirement

In [67]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   expenses  1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


## 5.Check for Null Values

In [68]:
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
expenses    0
dtype: int64

## 6. Get Statistics

In [69]:
df.describe()

Unnamed: 0,age,bmi,children,expenses
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.665471,1.094918,13270.422414
std,14.04996,6.098382,1.205493,12110.01124
min,18.0,16.0,0.0,1121.87
25%,27.0,26.3,0.0,4740.2875
50%,39.0,30.4,1.0,9382.03
75%,51.0,34.7,2.0,16639.915
max,64.0,53.1,5.0,63770.43


## 7. Convert Columns From String ['sex' ,'smoker','region' ] To Numerical Values

In [70]:
from sklearn.preprocessing import LabelEncoder

In [71]:
le_sex = LabelEncoder()
le_smoker = LabelEncoder()
le_region = LabelEncoder()

In [72]:
df['sex'] = le_sex.fit_transform(df['sex'])
df['smoker'] = le_smoker.fit_transform(df['smoker'])
df['region'] = le_region.fit_transform(df['region'])

In [73]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,0,27.9,0,1,3,16884.92
1,18,1,33.8,1,0,2,1725.55
2,28,1,33.0,3,0,2,4449.46
3,33,1,22.7,0,0,1,21984.47
4,32,1,28.9,0,0,1,3866.86


## 8.Separate features (X) and target variable (y)

In [74]:
X = df.drop('expenses', axis=1)
y = df['expenses']

## 9. Split the data into training and testing sets

In [75]:
from sklearn.model_selection import train_test_split

In [76]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## 10. Initialize models

In [77]:
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor

In [78]:
linear_model = LinearRegression()
decision_tree_model = DecisionTreeRegressor()
random_forest_model = RandomForestRegressor()
svm_model = SVR()
gradient_boosting_model = GradientBoostingRegressor()
elasticnet_model = ElasticNet()
xgboost_model = XGBRegressor()

In [79]:
models = [linear_model, decision_tree_model, random_forest_model, svm_model, gradient_boosting_model, elasticnet_model, xgboost_model]

In [80]:
models

[LinearRegression(),
 DecisionTreeRegressor(),
 RandomForestRegressor(),
 SVR(),
 GradientBoostingRegressor(),
 ElasticNet(),
 XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)]

## 11.Train and evaluate each model

In [81]:
from sklearn.metrics import mean_absolute_error

In [82]:
for model in models:
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    mae = mean_absolute_error(y_test, predictions)
    print(f"{model.__class__.__name__} Mean Absolute Error: {mae}")



LinearRegression Mean Absolute Error: 4186.9401063170135
DecisionTreeRegressor Mean Absolute Error: 2861.3028731343284
RandomForestRegressor Mean Absolute Error: 2554.879317835821
SVR Mean Absolute Error: 8592.793396346126
GradientBoostingRegressor Mean Absolute Error: 2504.858952637542
ElasticNet Mean Absolute Error: 7426.115816977492
XGBRegressor Mean Absolute Error: 2801.2309728320674


## 12. Choose the model with the lowest Mean Absolute Error

In [83]:
best_model = min(zip(models, [mean_absolute_error(y_test, model.predict(X_test)) for model in models]), key=lambda x: x[1])[0]


In [84]:
best_model

In [85]:
# Get user input for prediction
user_age = int(input('Enter age: '))
user_sex = le_sex.transform([input('Enter sex (female/male): ')])[0]
user_bmi = float(input('Enter BMI: '))
user_children = int(input('Enter number of children: '))
user_smoker = le_smoker.transform([input('Smoker? (yes/no): ')])[0]
user_region = le_region.transform([input('Enter region (northwest/northeast/southwest/southeast): ')])[0]

# Make a prediction
user_data = pd.DataFrame({
    'age': [user_age],
    'sex': [user_sex],
    'bmi': [user_bmi],
    'children': [user_children],
    'smoker': [user_smoker],
    'region': [user_region]
})

prediction = model.predict(user_data)

print(f'Predicted Insurance Premium: ${prediction[0]:.2f}')


Enter age: 30
Enter sex (female/male): male
Enter BMI: 20
Enter number of children: 1
Smoker? (yes/no): yes
Enter region (northwest/northeast/southwest/southeast): northeast
Predicted Insurance Premium: $14505.72
