In [None]:
import numpy as np
import pandas as pd

In [None]:
# Read the insurance CSV from its hard-coded absolute path, then preview the
# first five rows as the cell's displayed value.
df = pd.read_csv(filepath_or_buffer='/content/insurance.csv')
df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [None]:
# Count missing values per column (isna is the canonical spelling of isnull).
df.isna().sum()

Unnamed: 0,0
age,0
sex,0
bmi,0
children,0
smoker,0
region,0
charges,0


In [None]:
# Let's look at the distribution of people who smoke and who do not.

In [None]:
import plotly.express as px

In [None]:
# Histogram of row counts per sex, with bars coloured by the smoker column.
figure = px.histogram(
    data_frame=df,
    x='sex',
    color='smoker',
    title='Number of Smoker',
)
figure.show()

In [None]:
# Now we will replace the values of the sex and smoker columns with 0 and 1, as
# both of these columns contain string values.

In [None]:
# Encode the two binary categorical columns as 0/1 integers.
# Each mapping also sends the numeric codes to themselves, so re-running this
# cell on an already-encoded frame leaves it unchanged instead of turning every
# value into NaN (Series.map yields NaN for any value missing from the dict).
# Values outside the mapping still become NaN, exactly as before.
SEX_CODES = {'female': 0, 'male': 1, 0: 0, 1: 1}
SMOKER_CODES = {'no': 0, 'yes': 1, 0: 0, 1: 1}
df['sex'] = df['sex'].map(SEX_CODES)
df['smoker'] = df['smoker'].map(SMOKER_CODES)

In [None]:
# Preview the frame after encoding. A bare last expression lets the notebook
# render the DataFrame as a rich table instead of the plain-text print() dump.
df.head()

   age  sex     bmi  children  smoker     region      charges
0   19    0  27.900         0       1  southwest  16884.92400
1   18    1  33.770         1       0  southeast   1725.55230
2   28    1  33.000         3       0  southeast   4449.46200
3   33    1  22.705         0       0  northwest  21984.47061
4   32    1  28.880         0       0  northwest   3866.85520


In [None]:
# Now let's have a look at the distribution of the regions where people are living according to the dataset.

In [None]:
import plotly.express as px


In [None]:
# Count rows per region, then draw those counts as a pie chart
# (one slice per distinct region value).
pie = df['region'].value_counts()
regions, population = pie.index, pie.values
fig = px.pie(names=regions, values=population)
fig.show()

In [None]:
# Now let's have a look at the correlation between the features of this dataset.

In [None]:
# Preview the first five rows again before the region column is encoded.
df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,southwest,16884.924
1,18,1,33.77,1,0,southeast,1725.5523
2,28,1,33.0,3,0,southeast,4449.462
3,33,1,22.705,0,0,northwest,21984.47061
4,32,1,28.88,0,0,northwest,3866.8552


In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Encoder that assigns an integer code to each distinct label it is fitted on.
lb = LabelEncoder()

In [None]:
# Fit the encoder on the region names and overwrite the column with the
# resulting integer codes; the fitted encoder keeps the original labels in
# lb.classes_.
region_codes = lb.fit_transform(df['region'])
df['region'] = region_codes

In [None]:
# Pairwise correlation matrix of all columns (every column is numeric once sex,
# smoker and region have been encoded). Returned as the cell's last expression
# so the notebook renders it as a rich table rather than printed text.
df.corr()

               age       sex       bmi  children    smoker    region   charges
age       1.000000 -0.020856  0.109272  0.042469 -0.025019  0.002127  0.299008
sex      -0.020856  1.000000  0.046371  0.017163  0.076185  0.004588  0.057292
bmi       0.109272  0.046371  1.000000  0.012759  0.003750  0.157566  0.198341
children  0.042469  0.017163  0.012759  1.000000  0.007673  0.016569  0.067998
smoker   -0.025019  0.076185  0.003750  0.007673  1.000000 -0.002181  0.787251
region    0.002127  0.004588  0.157566  0.016569 -0.002181  1.000000 -0.006208
charges   0.299008  0.057292  0.198341  0.067998  0.787251 -0.006208  1.000000


In [None]:
# Now let's move on to training a machine learning model for the task of predicting health insurance premiums.

In [None]:
# Feature matrix: one row per person, one column per selected predictor.
x = np.array(df[['age', 'sex', 'bmi', 'smoker']])  # independent variables
# Target vector: select the column by a single label (not a one-element list)
# so y is 1-D with shape (n_samples,). A column vector of shape (n_samples, 1)
# makes scikit-learn emit a DataConversionWarning on every fit.
y = np.array(df['charges'])  # dependent variable

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=42, test_size=0.2
)

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Baseline random forest with default hyper-parameters. The seed fixes the
# bootstrap sampling and feature selection so the predictions and the R² score
# are reproducible on a fresh run (same seed as the train/test split).
rf = RandomForestRegressor(random_state=42)

In [None]:
# Train the baseline forest on the training split; fit returns the estimator,
# which the notebook displays as the cell's value.
rf.fit(X=x_train, y=y_train)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



In [None]:
# Predict charges for the held-out test rows with the fitted forest.
y_pred = rf.predict(X=x_test)

In [None]:
# Put the test-set predictions in a single-column frame and show the first ten.
# The frame is the cell's last expression so the notebook renders it as a rich
# table instead of the plain-text print() dump.
data = pd.DataFrame(data={'Predicted Premium Amount :': y_pred})
data.head(10)

   Predicted Premium Amount :
0                11055.331823
1                 5489.860266
2                28216.799082
3                 9571.578285
4                34760.943233
5                 7886.168489
6                 2568.320606
7                14618.070789
8                 6990.292092
9                 8968.138876


In [None]:
from sklearn.metrics import r2_score

In [None]:
# Coefficient of determination of the baseline predictions on the test split.
r2_score(y_true=y_test, y_pred=y_pred)

0.8464812023099528

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Hyper-parameter search space for the random forest:
# 2 * 3 * 2 * 2 * 2 = 48 candidate combinations.
param_grid = dict(
    n_estimators=[100, 300],
    max_depth=[None, 10, 30],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    max_features=['sqrt', 'log2'],
)

# Exhaustive 5-fold cross-validated search over param_grid, scored by negative
# mean squared error (higher is better) and run on all available cores.
# NOTE(review): this rebinds `rf` to a fresh, unfitted estimator, replacing the
# baseline model fitted earlier in the notebook.
rf = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(
    rf,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X=x_train, y=y_train)

# Report the best-scoring hyper-parameter combination.
print(grid_search.best_params_)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



{'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}
