In [64]:
#age: age of primary beneficiary

#sex: insurance contractor gender, female, male

#bmi: Body mass index

#children: Number of children covered by health insurance / Number of dependents

#smoker: whether the beneficiary smokes (yes / no)

#region: the beneficiary's residential area in the US, northeast, southeast, southwest, northwest.

#charges: Individual medical costs billed by health insurance

In [65]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [66]:
# Load the insurance dataset from a CSV file in the current working directory.
df=pd.read_csv('insurance.csv')

In [67]:
# Preview the first rows to check that the columns loaded as expected.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [68]:
# Summarise column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [69]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [70]:
# Count / unique / top / freq summary for the object-dtype (text) columns.
df.describe(include='O')


Unnamed: 0,sex,smoker,region
count,1338,1338,1338
unique,2,2,4
top,male,no,southeast
freq,676,1064,364


In [71]:
# Count missing values per column.
df.isnull().sum()


Unnamed: 0,0
age,0
sex,0
bmi,0
children,0
smoker,0
region,0
charges,0


In [72]:
# TODO: add exploratory plots here (no plotting code has been written yet)


In [73]:
# Column groups of the raw dataset, split by data type.
# Note that 'charges' is listed with the numeric columns although the
# modelling cells use it as the prediction target.
numerical_features = ['age', 'bmi', 'children', 'charges']
categorical_features = ['sex', 'smoker', 'region']

# The original, misspelled names are kept as aliases so that any existing
# reference to them keeps working.
numbreical_feature = numerical_features
catagorical_fearture = categorical_features



In [74]:
# Look at the raw rows again before encoding the text columns.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [75]:
# Encode the smoker column as integers: 1 for 'yes', 0 for 'no'.
# The column is overwritten in place, so the mapping is guarded: running this
# cell a second time would otherwise push the already-encoded 0/1 values
# through the yes/no dictionary and turn the whole column into NaN.
smoker_codes = {'yes': 1, 'no': 0}
if not df['smoker'].isin(list(smoker_codes.values())).all():
    df['smoker'] = df['smoker'].map(smoker_codes)


In [76]:
# One-hot encode the sex and region columns; dtype=int produces 0/1 integer
# indicator columns, one per category value.
df_sex, df_region = (
    pd.get_dummies(df[column], dtype=int) for column in ('sex', 'region')
)


In [77]:
# Append the indicator columns to the original frame, side by side (column-wise).
df_new = pd.concat(
    objs=[df, df_sex, df_region],
    axis='columns',
)

In [78]:
# Inspect the combined frame; at this point it still carries the original
# 'sex' and 'region' text columns next to their one-hot indicator columns.
df_new

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,female,male,northeast,northwest,southeast,southwest
0,19,female,27.900,0,1,southwest,16884.92400,1,0,0,0,0,1
1,18,male,33.770,1,0,southeast,1725.55230,0,1,0,0,1,0
2,28,male,33.000,3,0,southeast,4449.46200,0,1,0,0,1,0
3,33,male,22.705,0,0,northwest,21984.47061,0,1,0,1,0,0
4,32,male,28.880,0,0,northwest,3866.85520,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1333,50,male,30.970,3,0,northwest,10600.54830,0,1,0,1,0,0
1334,18,female,31.920,0,0,northeast,2205.98080,1,0,1,0,0,0
1335,18,female,36.850,0,0,southeast,1629.83350,1,0,0,0,1,0
1336,21,female,25.800,0,0,southwest,2007.94500,1,0,0,0,0,1


In [79]:
# Remove the text columns now that their one-hot indicators are present.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps the
# cell re-runnable: a second run no longer raises KeyError for columns that
# were already dropped.
df_new = df_new.drop(columns=['sex', 'region'], errors='ignore')

In [80]:
# Confirm the text columns are gone and only numeric columns remain.
df_new.head(2)

Unnamed: 0,age,bmi,children,smoker,charges,female,male,northeast,northwest,southeast,southwest
0,19,27.9,0,1,16884.924,1,0,0,0,0,1
1,18,33.77,1,0,1725.5523,0,1,0,0,1,0


In [81]:
# Standardise the data: create the scaler here; it is fitted on the training
# split in a later cell.
from sklearn.preprocessing import StandardScaler
std = StandardScaler()


In [82]:
# Target: the billed charges. Features: every remaining column.
y = df_new['charges']
X = df_new.drop(columns=['charges'])

In [83]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [84]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits so the test data does not influence them.
std.fit(X_train)
X_train_scaled = std.transform(X_train)
X_test_scaled = std.transform(X_test)

In [85]:
# k-nearest-neighbours regressor using the 5 nearest training rows,
# trained on the scaled training features.
from sklearn.neighbors import KNeighborsRegressor

knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X=X_train_scaled, y=y_train)


In [86]:

# Predict with the fitted model on both splits: first the (scaled) training
# rows, then the (scaled) held-out test rows.
y_pred_train, y_pred_test = (
    knn.predict(features) for features in (X_train_scaled, X_test_scaled)
)

In [87]:
# evaluation metrics
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error,root_mean_squared_error


def _report_regression_metrics(y_true, y_pred, split):
    """Compute and print MSE, MAE, RMSE and R2 for one data split.

    Parameters
    ----------
    y_true : observed target values for the split.
    y_pred : model predictions for the same rows.
    split : label used in the printed lines (e.g. "train" or "test").

    Returns
    -------
    tuple
        (mse, mae, rmse, r2), in that order.
    """
    scores = {
        "Mean Squared Error": mean_squared_error(y_true, y_pred),
        "Mean Absolute Error": mean_absolute_error(y_true, y_pred),
        "Root Mean Squared Error": root_mean_squared_error(y_true, y_pred),
        "R2 Score": r2_score(y_true, y_pred),
    }
    for label, value in scores.items():
        print(f"The {label} for {split} data is {value}")
    return tuple(scores.values())


# One helper call per split replaces the two copy-pasted metric blocks; the
# printed text and the result variable names are unchanged.
mse_train, mae_train, rmse_train, r2_train = _report_regression_metrics(
    y_train, y_pred_train, "train"
)

print("-"*50)
mse_test, mae_test, rmse_test, r2_test = _report_regression_metrics(
    y_test, y_pred_test, "test"
)


The Mean Squared Error for train data is 20372549.481923252
The Mean Absolute Error for train data is 2733.7663003084112
The Root Mean Squared Error for train data is 4513.59606986749
The R2 Score for train data is 0.8588509786627583
--------------------------------------------------
The Mean Squared Error for test data is 31174145.013288442
The Mean Absolute Error for test data is 3540.8607413022387
The Root Mean Squared Error for test data is 5583.381145263902
The R2 Score for test data is 0.7991987134855247


In [88]:
# Show the first scaled test row, i.e. the values the model actually receives.
X_test_scaled[0]


array([ 0.40114007, -0.89153925,  0.73433626, -0.50874702,  1.0246016 ,
       -1.0246016 ,  1.73421182, -0.56079971, -0.59966106, -0.5723141 ])

In [89]:
# Show the training feature columns and their order.
X_train.head(3)


Unnamed: 0,age,bmi,children,smoker,female,male,northeast,northwest,southeast,southwest
560,46,19.95,2,0,1,0,0,1,0,0
1285,47,24.32,0,0,1,0,1,0,0,0
1142,52,24.86,0,0,1,0,0,0,1,0


In [96]:
# A single hand-written record to score with the model. The keys are the
# feature columns: numeric fields, the 0/1 smoker flag, and one-hot
# indicators for sex and region.
new_data = pd.DataFrame([
    {
        "age": 30,
        "bmi": 28,
        "children": 2,
        "smoker": 0,
        "female": 1,
        "male": 0,
        "northeast": 1,
        "northwest": 0,
        "southeast": 0,
        "southwest": 0,
    }
])

In [97]:
# Display the single-row input frame.
new_data

Unnamed: 0,age,bmi,children,smoker,female,male,northeast,northwest,southeast,southwest
0,30,28,2,0,1,0,1,0,0,0


In [98]:
# Scale the new row with the scaler fitted earlier (transform only, no refit),
# then display the scaled values.
std_new_data = std.transform(new_data)
std_new_data

array([[-0.66515641, -0.42386745,  0.73433626, -0.50874702,  1.0246016 ,
        -1.0246016 ,  1.73421182, -0.56079971, -0.59966106, -0.5723141 ]])

In [99]:
# Predict the charge for the scaled new row. predict returns an array with one
# entry per input row, so element 0 is the value for this single record.
prediction = knn.predict(std_new_data)
print(f"The predicted charge is {prediction[0]}")

The predicted charge is 11047.789096000002


In [100]:
# Show the raw prediction array.
prediction

array([11047.789096])

In [101]:
#pip install pandas-profiling

In [102]:
pip install numba==0.56.4

Collecting numba==0.56.4
  Downloading numba-0.56.4.tar.gz (2.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
[?25h  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.


In [105]:
# Import the profiling report class.
# NOTE(review): in the recorded run this import raises ModuleNotFoundError; the
# only install line for pandas-profiling in this notebook is commented out.
# The package has presumably been renamed to ydata-profiling (import name
# ydata_profiling) — confirm, then install it and update this import.
from pandas_profiling import ProfileReport

ModuleNotFoundError: No module named 'pandas_profiling'