In [54]:
import pandas as pd 

train = pd.read_csv("./insurance/train.csv")
test = pd.read_csv("./insurance/test.csv")

In [55]:
train.shape, test.shape

((1070, 8), (268, 7))

In [56]:
train.head(3)

Unnamed: 0,id,age,sex,bmi,children,smoker,region,charges
0,330,61,female,36.385,1,yes,northeast,48517.56315
1,748,47,female,36.0,1,no,southwest,8556.907
2,684,33,female,18.5,1,no,southwest,4766.022


In [57]:
test.head(3)

Unnamed: 0,id,age,sex,bmi,children,smoker,region
0,508,24,female,25.27,0,no,northeast
1,1309,41,male,32.2,2,no,southwest
2,766,47,male,32.3,1,no,southwest


In [58]:
train.isnull().sum()

id          0
age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [59]:
test.isnull().sum()

id          0
age         0
sex         0
bmi         0
children    0
smoker      0
region      0
dtype: int64

In [60]:
train.describe()

Unnamed: 0,id,age,bmi,children,charges
count,1070.0,1070.0,1070.0,1070.0,1070.0
mean,671.090654,39.024299,30.730495,1.090654,13193.6348
std,380.66678,13.916945,6.05326,1.204997,12211.531632
min,0.0,18.0,16.815,0.0,1121.8739
25%,347.25,27.0,26.4,0.0,4695.398638
50%,673.0,39.0,30.4,1.0,9273.6388
75%,990.25,51.0,34.8,2.0,15826.112723
max,1337.0,64.0,52.58,5.0,63770.42801


In [61]:
train.describe(include="object")

Unnamed: 0,sex,smoker,region
count,1070,1070,1070
unique,2,2,4
top,male,no,southeast
freq,544,856,287


In [62]:
test.describe() 

Unnamed: 0,id,age,bmi,children
count,268.0,268.0,268.0,268.0
mean,658.156716,39.936567,30.395504,1.11194
std,409.029396,14.572809,6.278762,1.209578
min,5.0,18.0,15.96,0.0
25%,261.0,26.0,25.9725,0.0
50%,658.5,40.5,30.21,1.0
75%,1033.5,52.25,34.1,2.0
max,1328.0,64.0,53.13,5.0


In [63]:
test.describe(include="object")

Unnamed: 0,sex,smoker,region
count,268,268,268
unique,2,2,4
top,female,no,southeast
freq,136,208,77


In [64]:
cols = train.select_dtypes(include="object").columns
cols

Index(['sex', 'smoker', 'region'], dtype='object')

In [65]:
train = pd.get_dummies(train, columns=cols).astype(int)
test = pd.get_dummies(test, columns=cols).astype(int)
display(train.head())
display(test.head())

Unnamed: 0,id,age,bmi,children,charges,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,330,61,36,1,48517,1,0,0,1,1,0,0,0
1,748,47,36,1,8556,1,0,1,0,0,0,0,1
2,684,33,18,1,4766,1,0,1,0,0,0,0,1
3,35,19,20,0,1625,0,1,1,0,0,1,0,0
4,353,33,35,0,12404,0,1,1,0,1,0,0,0


Unnamed: 0,id,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,508,24,25,0,1,0,1,0,1,0,0,0
1,1309,41,32,2,0,1,1,0,0,0,0,1
2,766,47,32,1,0,1,1,0,0,0,0,1
3,667,40,32,2,1,0,0,1,0,1,0,0
4,1057,45,31,0,1,0,1,0,0,0,1,0


In [66]:
from sklearn.model_selection import train_test_split 
X_tr, X_val, y_tr, y_val = train_test_split(
  train.drop('charges', axis=1), 
  train["charges"], 
  test_size=0.15, 
  random_state=2024
)
X_tr.shape, X_val.shape, y_tr.shape, y_val.shape

((909, 12), (161, 12), (909,), (161,))

In [68]:
X_tr.head(1)

Unnamed: 0,id,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
657,832,28,23,2,1,0,1,0,0,1,0,0


In [69]:
y_tr.head(1)

657    4719
Name: charges, dtype: int64

In [71]:
from sklearn.metrics import mean_squared_error
import numpy as np 

np.sqrt(9)

def rmse(y_test, pred): 
  return np.sqrt(mean_squared_error(y_test, pred))



In [72]:
from sklearn.linear_model import LinearRegression 
model = LinearRegression()
model.fit(X_tr, y_tr) 
pred = model.predict(X_val)
rmse(y_val, pred)

7341.228343380285

In [74]:
from sklearn.preprocessing import StandardScaler 
scaler = StandardScaler() 

cols= ['age', 'bmi']
train[cols] = scaler.fit_transform(train[cols])
test[cols] = scaler.fit_transform(test[cols])
train.head()



Unnamed: 0,id,age,bmi,children,charges,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,330,1.579799,0.950744,1,48517,1,0,0,1,1,0,0,0
1,748,0.573361,0.950744,1,8556,1,0,1,0,0,0,0,1
2,684,-0.433078,-2.015134,1,4766,1,0,1,0,0,0,0,1
3,35,-1.439516,-1.685592,0,1625,0,1,1,0,0,1,0,0
4,353,-0.433078,0.785973,0,12404,0,1,1,0,1,0,0,0
