## 1. Problem Statement

#### The goal of this project is to give people an estimate of how much health risk they have based on their individual health situation. 

## 2. Importing libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# !pip install pymongo
import pymongo

Collecting pymongo
  Downloading pymongo-4.3.3-cp39-cp39-win_amd64.whl (382 kB)
     -------------------------------------- 382.5/382.5 kB 1.8 MB/s eta 0:00:00
Collecting dnspython<3.0.0,>=1.16.0
  Using cached dnspython-2.3.0-py3-none-any.whl (283 kB)
Installing collected packages: dnspython, pymongo
Successfully installed dnspython-2.3.0 pymongo-4.3.3


## 3. Reading data from mongodb

In [10]:
client = pymongo.MongoClient("mongodb+srv://kuldeepgupta2603:mQjVmihIdt6SJ364@clusteralpha.sg9io0n.mongodb.net/test")

In [11]:
for db in client.list_databases():
    print(db)

{'name': 'healthpred', 'sizeOnDisk': 131072, 'empty': False}
{'name': 'admin', 'sizeOnDisk': 344064, 'empty': False}
{'name': 'local', 'sizeOnDisk': 28671197184, 'empty': False}


In [12]:
db = client.healthpred

In [13]:
collection = db.insurance

In [28]:
cursor = collection.find()
df = pd.DataFrame(list(cursor))

In [24]:
df.head()

Unnamed: 0,_id,age,sex,bmi,children,smoker,region,expenses
0,640f5b2fc23c7f24eb5794b6,19,female,27.9,0,yes,southwest,16884.92
1,640f5b2fc23c7f24eb5794b7,18,male,33.8,1,no,southeast,1725.55
2,640f5b2fc23c7f24eb5794b8,28,male,33.0,3,no,southeast,4449.46
3,640f5b2fc23c7f24eb5794b9,33,male,22.7,0,no,northwest,21984.47
4,640f5b2fc23c7f24eb5794ba,32,male,28.9,0,no,northwest,3866.86


## 4. Data Exploration and Cleaning

#### 4.1 Data definition and description

#### 4.2 Dropping _id column

In [30]:
df = df.drop('_id', axis = 1)

#### 4.3 Checking shape of dataframe

In [33]:
df.shape

(1338, 7)

#### 4.4 Checking missing values

In [34]:
df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
expenses    0
dtype: int64

#### 4.5 Checking duplicate data

In [35]:
df.duplicated().sum()

1

In [37]:
df[df.duplicated()]

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
581,19,male,30.6,0,no,northwest,1639.56


#### Dropping duplicated row

In [46]:
df = df[~df.duplicated()]

In [47]:
df.duplicated().sum()

0

#### 4.6 Checking datatypes of each column

In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   age       1338 non-null   object
 1   sex       1338 non-null   object
 2   bmi       1338 non-null   object
 3   children  1338 non-null   object
 4   smoker    1338 non-null   object
 5   region    1338 non-null   object
 6   expenses  1338 non-null   object
dtypes: object(7)
memory usage: 73.3+ KB


#### 4.7 Checking unique value counts in each column

In [39]:
df.nunique()

age           47
sex            2
bmi          275
children       6
smoker         2
region         4
expenses    1337
dtype: int64

#### Based on 4.6 and 4.7, "age", "bmi" and "expenses" must be converted into integer datatypes. 

In [52]:
df['age'] = pd.to_numeric(df['age'])
df['bmi'] = pd.to_numeric(df['bmi'])
df['expenses'] = pd.to_numeric(df['expenses'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['age'] = pd.to_numeric(df['age'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['bmi'] = pd.to_numeric(df['bmi'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['expenses'] = pd.to_numeric(df['expenses'])


In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1337 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1337 non-null   int64  
 1   sex       1337 non-null   object 
 2   bmi       1337 non-null   float64
 3   children  1337 non-null   object 
 4   smoker    1337 non-null   object 
 5   region    1337 non-null   object 
 6   expenses  1337 non-null   float64
dtypes: float64(2), int64(1), object(4)
memory usage: 83.6+ KB


#### 4.8 Dataframe statistics

In [58]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,1337.0,39.222139,14.044333,18.0,27.0,39.0,51.0,64.0
bmi,1337.0,30.66552,6.100664,16.0,26.3,30.4,34.7,53.1
expenses,1337.0,13279.121638,12110.359657,1121.87,4746.34,9386.16,16657.72,63770.43


#### 4.9 Numerical and categorical variables

In [55]:
cat_features = [feature for feature in df.columns if df[feature].dtype == 'O']
print(f'Categorical features: {cat_features}')

num_features = [feature for feature in df.columns if df[feature].dtype != 'O']
print(f'Numerical features: {num_features}')

Categorical features: ['sex', 'children', 'smoker', 'region']
Numerical features: ['age', 'bmi', 'expenses']


#### 4.10 Categories in categorical columns

In [56]:
for i, col in enumerate(cat_features):
    print(f'All categories in {col} column:')
    print(df[col].unique())
    print('=='*20)

All categories in sex column:
['female' 'male']
All categories in children column:
['0' '1' '3' '2' '5' '4']
All categories in smoker column:
['yes' 'no']
All categories in region column:
['southwest' 'southeast' 'northwest' 'northeast']


## Data Visualization

### Univariate Analysis