In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's 'darkgrid' style to every plot drawn after this call.
sns.set_theme(style='darkgrid')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the medical-insurance premium dataset into a DataFrame.
csv_path='../input/medical-insurance-premium-prediction/Medicalpremium.csv'
df=pd.read_csv(csv_path)

# Exploring dataset

In [None]:
# Preview the first rows to check that the CSV was parsed as expected.
df.head()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

In [None]:
# Number of missing values in each column.
df.isna().sum()

# Performing EDA to see the distribution of data

In [None]:
# Histogram of Age with a kernel-density overlay.
sns.displot(data=df,x='Age',kde=True,aspect=10/6)

In [None]:
# Histogram of PremiumPrice with a kernel-density overlay.
sns.displot(data=df,x='PremiumPrice',kde=True,aspect=10/6)

## Creating price and age bins to visualize the distribution of Premium Price and Age

In [None]:
# Cut PremiumPrice into as many bins as there are labels (five) and name each bin.
pr_lab=['Low','Basic','Average','High','SuperHigh']
pr_bins=pd.cut(df['PremiumPrice'],bins=len(pr_lab),labels=pr_lab,precision=0)
# Keep the binned series under both names, as the rest of the notebook may use either.
df['PremiumLabel']=pr_bins

In [None]:
# Confirm that the PremiumLabel column was added.
df.columns

In [None]:
# Preview the rows again, now including PremiumLabel.
df.head()

In [None]:
# Bar chart of Age for each premium band, drawn on an explicit 12x6 axes.
fig,ax=plt.subplots(1,1,figsize=(12,6))
sns.barplot(data=df,x='PremiumLabel',y='Age',ax=ax)

In [None]:
# Joint KDE of Height vs Weight, coloured by premium band.
sns.jointplot(data=df,x='Height',y='Weight',hue='PremiumLabel',kind='kde',height=8)

In [None]:
# Cut Age into as many bins as there are labels (five) and name each bin.
age_lab=['Teen','Young','Middle','Old','SuperOld']
df['AgeLabel']=pd.cut(df['Age'],bins=len(age_lab),labels=age_lab,precision=0)

In [None]:
# Confirm that the AgeLabel column was added.
df.columns

In [None]:
# Age vs PremiumPrice scatter, one square panel per premium band.
sns.relplot(data=df,x='Age',y='PremiumPrice',col='PremiumLabel',aspect=1.0)

In [None]:
# Swarm plot showing the individual Age values within each premium band.
fig,ax=plt.subplots(1,1,figsize=(12,6))
sns.swarmplot(data=df,x='PremiumLabel',y='Age',ax=ax)

# Converting new categorical columns to numeric ones

In [None]:
# One-hot encode the two label columns; the originals are replaced by
# AgeLabel_* / PremiumLabel_* indicator columns.
label_columns=['AgeLabel','PremiumLabel']
df=pd.get_dummies(df,columns=label_columns)

In [None]:
# List the columns after one-hot encoding.
df.columns

In [None]:
# Summary statistics of the encoded frame.
df.describe()

# New Dataset

In [None]:
# Preview the encoded frame.
df.head()

In [None]:
# (rows, columns) of the encoded frame.
df.shape

# Getting Features and Target datasets

In [None]:
# Target: the premium price. Features: every other column of df.
# NOTE: x still contains the PremiumLabel_* dummies, which were binned from PremiumPrice.
y=df['PremiumPrice']
x=df.drop(columns='PremiumPrice')

In [None]:
# Columns available as model features.
x.columns

### Scaling the dataset

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise every feature column to zero mean and unit variance.
# The scaler is fitted on all rows of x, i.e. before the train/test split further down.
sc=StandardScaler().fit(x)
xsc=sc.transform(x)

In [None]:
# Wrap the scaled array back into a DataFrame so the original column names are kept.
xsc=pd.DataFrame(data=xsc,columns=x.columns)

In [None]:
from sklearn.feature_selection import mutual_info_regression
# Mutual information between each scaled feature and the target.
# The estimator uses random numbers internally, so random_state is fixed to make
# the ranking (and any feature list derived from it) reproducible across runs.
mi_scores = mutual_info_regression(xsc,y,random_state=33)
mi_scores = pd.Series(mi_scores, name="MI Scores",index=xsc.columns)
mi_scores = mi_scores.sort_values(ascending=False)
# Names of the 15 highest-scoring features.
(mi_scores*100).head(15).index

In [None]:
# Features kept for modelling.
# The PremiumLabel_* dummies are deliberately excluded: they are bins of
# PremiumPrice, the target itself, so using them as predictors leaks the answer
# into the model and makes the test score meaninglessly high.
fea=['Age',
       'NumberOfMajorSurgeries', 'AgeLabel_Teen',
       'Weight', 'AgeLabel_SuperOld', 'AgeLabel_Middle',
       'AgeLabel_Young', 'AgeLabel_Old',
       'AnyChronicDiseases', 'AnyTransplants']
xsc=xsc[fea]

In [None]:
# Number of target values (should match the number of rows in xsc).
y.shape

# Splitting into Test and Train datasets

In [None]:
from sklearn.model_selection import train_test_split

## SPLITTING dataset

In [None]:
# Hold out 30% of the rows for testing; the seed makes the split repeatable.
xtr,xte,ytr,yte=train_test_split(xsc,y,test_size=0.3,random_state=33)

# Building ML model

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [None]:
# Random-forest regressor tuned by an exhaustive grid search with 3-fold CV.
# random_state pins the forest's internal sampling so repeated runs give the same scores.
reg=RandomForestRegressor(n_jobs=-1,verbose=2,random_state=33)
param_grid={
    'n_estimators':[60,50,55],
    # Current scikit-learn names for the former 'mse' / 'mae' criteria, which were removed.
    'criterion':['squared_error','absolute_error'],
    'max_depth':[7],
    'min_samples_split':[3],
    # 1.0 = consider all features at each split, which is what the removed 'auto'
    # option meant for regressors.
    'max_features':[1.0],
}
gs=GridSearchCV(reg,param_grid=param_grid,cv=3,n_jobs=-1,verbose=1)

In [None]:
# Evaluate every combination in param_grid with 3-fold CV on the training split.
gs.fit(xtr,ytr)

In [None]:
# Parameter combination that achieved the best cross-validated score.
gs.best_params_

In [None]:
# Mean cross-validated score of that best combination.
gs.best_score_

In [None]:
# Predict premiums for the held-out test rows with the fitted grid search.
predicted_y=gs.predict(xte)

# Evaluating the model on the test set (R² score)

In [None]:
from sklearn.metrics import r2_score

In [None]:
# Coefficient of determination (R²) of the predictions on the test split.
r2_score(y_true=yte,y_pred=predicted_y)

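# R² ≈ 0.994 on the test set (an R² score, not classification accuracy; obtained with the target-derived PremiumLabel_* columns among the features)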