Problem statement-
Build a multiple linear regression model to predict the medical insurance cost (charges) based on patient information.
1. Perform exploratory data analysis on the dataset
2. Visualize data elements
3. Derive preliminary observations about relationship between the variables (independent, dependent)
4. Perform feature engineering
 - Explore possibility of deriving new features 
 - Transform existing features using techniques like min-max scaling and one-hot encoding (dummy variable creation)
5. Perform feature selection using RFE, VIF, iterative Linear regression model summary (p-value of features coefficients)
6. Perform Residual analysis using the training data
7. Build the final regression equation.


In [22]:
# Importing required libraries.
import numpy as np
import pandas as pd
# `plt` has to alias the pyplot submodule: the plotting API (plt.figure,
# plt.subplots, plt.show, ...) lives in matplotlib.pyplot, not in the
# top-level matplotlib package.
import matplotlib.pyplot as plt
import seaborn as sb

# Importing warnings to ignore any warnings given by the interpreter.
# NOTE: this is a blanket filter, so deprecation notices are hidden as well.
import warnings
warnings.filterwarnings('ignore')

# Show every column when a wide DataFrame is displayed.
pd.set_option('display.max_columns', None)

# 1. Reading and understanding data

In [3]:
# Load the insurance dataset from a CSV file in the working directory.
insurance = pd.read_csv("insurance.csv")
# Preview the first five rows of the table.
insurance.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
# Dimensions of the DataFrame as a (rows, columns) tuple.
insurance.shape  # in this run the data has 1338 rows and 7 columns

(1338, 7)

In [5]:
# Data type of each column; `object` marks the string-valued columns.
insurance.dtypes

age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object

In [6]:
# Re-type the children column from int64 to the generic object dtype,
# then show the resulting dtype to confirm the conversion.
insurance["children"] = insurance["children"].astype(object)
insurance["children"].dtype

dtype('O')

In [7]:
# Summary statistics for every column (numeric and non-numeric alike).
# The result is transposed so that each original column becomes one row.
insurance.describe(include="all").transpose()

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
age,1338.0,,,,39.207025,14.04996,18.0,27.0,39.0,51.0,64.0
sex,1338.0,2.0,male,676.0,,,,,,,
bmi,1338.0,,,,30.663397,6.098187,15.96,26.29625,30.4,34.69375,53.13
children,1338.0,6.0,0.0,574.0,,,,,,,
smoker,1338.0,2.0,no,1064.0,,,,,,,
region,1338.0,4.0,southeast,364.0,,,,,,,
charges,1338.0,,,,13270.422265,12110.011237,1121.8739,4740.28715,9382.033,16639.912515,63770.42801


In [8]:
# General summary of the DataFrame: index range, per-column non-null counts,
# dtypes, and memory usage.
insurance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   object 
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(1), object(4)
memory usage: 73.3+ KB


In [9]:
# Unique values in the region column.
# unique() already returns a single array, so it is printed directly
# rather than through a loop over a one-element list.
print(insurance.region.unique())

['southwest' 'southeast' 'northwest' 'northeast']


In [10]:
# Searching for any NA value in the DataFrame.
# Iterating over insurance.columns covers every column, including the
# target column `charges`, without listing the columns by hand.
for column_name in insurance.columns:
    # value_counts() on the boolean mask reports how many entries are
    # missing (True) and how many are present (False).
    print(insurance[column_name].isna().value_counts())

False    1338
Name: age, dtype: int64
False    1338
Name: sex, dtype: int64
False    1338
Name: bmi, dtype: int64
False    1338
Name: children, dtype: int64
False    1338
Name: smoker, dtype: int64
False    1338
Name: region, dtype: int64


In [11]:
# Frequency table of two categorical variables: region (rows) by smoker (columns).
pd.crosstab(index=insurance["region"], columns=insurance["smoker"])

smoker,no,yes
region,Unnamed: 1_level_1,Unnamed: 2_level_1
northeast,257,67
northwest,267,58
southeast,273,91
southwest,267,58


In [12]:
# Frequency table: age (rows) by smoker status (columns).
pd.crosstab(index=insurance["age"], columns=insurance["smoker"])

smoker,no,yes
age,Unnamed: 1_level_1,Unnamed: 2_level_1
18,57,12
19,50,18
20,20,9
21,26,2
22,22,6
23,21,7
24,22,6
25,23,5
26,25,3
27,19,9


In [13]:
# Re-display the first five rows as a reminder of the column layout.
insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [14]:
# Frequency table: sex (rows) by smoker status (columns).
pd.crosstab(index=insurance["sex"], columns=insurance["smoker"])

smoker,no,yes
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,547,115
male,517,159


In [15]:
# Frequency table: bmi (rows) by smoker status (columns).
# bmi is not binned here, so the table has one row per distinct bmi value.
pd.crosstab(index=insurance["bmi"], columns=insurance["smoker"])

smoker,no,yes
bmi,Unnamed: 1_level_1,Unnamed: 2_level_1
15.960,1,0
16.815,2,0
17.195,0,1
17.290,2,1
17.385,1,0
...,...,...
48.070,1,0
49.060,1,0
50.380,1,0
52.580,0,1


In [16]:
# Frequency table: age (rows) by number of children (columns).
pd.crosstab(index=insurance["age"], columns=insurance["children"])

children,0,1,2,3,4,5
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
18,51,9,6,2,1,0
19,50,11,5,1,0,1
20,17,6,3,1,0,2
21,16,5,5,1,1,0
22,18,4,2,4,0,0
23,13,6,5,4,0,0
24,23,1,1,2,1,0
25,12,6,4,3,2,1
26,10,9,7,1,1,0
27,14,6,3,5,0,0


In [17]:
# Distribution of the smoker column: absolute counts first (printed),
# then relative frequencies (displayed as the cell result).
print(insurance["smoker"].value_counts())
insurance["smoker"].value_counts(normalize=True)

no     1064
yes     274
Name: smoker, dtype: int64


no     0.795217
yes    0.204783
Name: smoker, dtype: float64

In [18]:
# Distribution of the region column: absolute counts first (printed),
# then relative frequencies (displayed as the cell result).
print(insurance["region"].value_counts())
insurance["region"].value_counts(normalize=True)

southeast    364
southwest    325
northwest    325
northeast    324
Name: region, dtype: int64


southeast    0.272048
southwest    0.242900
northwest    0.242900
northeast    0.242152
Name: region, dtype: float64

In [19]:
# Minimum and maximum of age, bmi and children, printed one line per column.
for range_label, range_column in (
    ("Age -> ", insurance["age"]),
    ("BMI -> ", insurance["bmi"]),
    ("Children -> ", insurance["children"]),
):
    print(range_label, range_column.min(), range_column.max())

Age ->  18 64
BMI ->  15.96 53.13
Children ->  0 5


In [20]:
# Arithmetic mean of the insurance charges column.
insurance["charges"].mean()

13270.422265141257

In [21]:
# Most frequent age value(s); mode() returns a Series because ties are possible.
insurance["age"].mode()

0    18
dtype: int64

In [24]:
# Re-display the first five rows before counting category values in the next cell.
insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [26]:
# Count how often each category occurs in the smoker, sex and region columns.
for category_column in ("smoker", "sex", "region"):
    print(insurance[category_column].value_counts())

no     1064
yes     274
Name: smoker, dtype: int64
male      676
female    662
Name: sex, dtype: int64
southeast    364
southwest    325
northwest    325
northeast    324
Name: region, dtype: int64
