# Importing and Preparing Data


## Importing modules


In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## Importing Dataset


In [2]:
df = pd.read_csv('dataset/CVD_cleaned.csv')
df.head()

Unnamed: 0,General_Health,Checkup,Exercise,Heart_Disease,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Height_(cm),Weight_(kg),BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
0,Poor,Within the past 2 years,No,No,No,No,No,No,Yes,Female,70-74,150.0,32.66,14.54,Yes,0.0,30.0,16.0,12.0
1,Very Good,Within the past year,No,Yes,No,No,No,Yes,No,Female,70-74,165.0,77.11,28.29,No,0.0,30.0,0.0,4.0
2,Very Good,Within the past year,Yes,No,No,No,No,Yes,No,Female,60-64,163.0,88.45,33.47,No,4.0,12.0,3.0,16.0
3,Poor,Within the past year,Yes,Yes,No,No,No,Yes,No,Male,75-79,180.0,93.44,28.73,No,0.0,30.0,30.0,8.0
4,Good,Within the past year,No,No,No,No,No,No,No,Male,80+,191.0,88.45,24.37,Yes,0.0,8.0,4.0,0.0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308854 entries, 0 to 308853
Data columns (total 19 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   General_Health                308854 non-null  object 
 1   Checkup                       308854 non-null  object 
 2   Exercise                      308854 non-null  object 
 3   Heart_Disease                 308854 non-null  object 
 4   Skin_Cancer                   308854 non-null  object 
 5   Other_Cancer                  308854 non-null  object 
 6   Depression                    308854 non-null  object 
 7   Diabetes                      308854 non-null  object 
 8   Arthritis                     308854 non-null  object 
 9   Sex                           308854 non-null  object 
 10  Age_Category                  308854 non-null  object 
 11  Height_(cm)                   308854 non-null  float64
 12  Weight_(kg)                   308854 non-nul

In [4]:
df.isna().sum()

General_Health                  0
Checkup                         0
Exercise                        0
Heart_Disease                   0
Skin_Cancer                     0
Other_Cancer                    0
Depression                      0
Diabetes                        0
Arthritis                       0
Sex                             0
Age_Category                    0
Height_(cm)                     0
Weight_(kg)                     0
BMI                             0
Smoking_History                 0
Alcohol_Consumption             0
Fruit_Consumption               0
Green_Vegetables_Consumption    0
FriedPotato_Consumption         0
dtype: int64

In [5]:
df.describe()

Unnamed: 0,Height_(cm),Weight_(kg),BMI,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
count,308854.0,308854.0,308854.0,308854.0,308854.0,308854.0,308854.0
mean,170.615249,83.588655,28.626211,5.096366,29.8352,15.110441,6.296616
std,10.658026,21.34321,6.522323,8.199763,24.875735,14.926238,8.582954
min,91.0,24.95,12.02,0.0,0.0,0.0,0.0
25%,163.0,68.04,24.21,0.0,12.0,4.0,2.0
50%,170.0,81.65,27.44,1.0,30.0,12.0,4.0
75%,178.0,95.25,31.85,6.0,30.0,20.0,8.0
max,241.0,293.02,99.33,30.0,120.0,128.0,128.0


## Transforming Data


In [6]:
print('Printing the unique values in the categorical columns:')
print('-'*54)
for col in df.select_dtypes('object').columns:
    print(f'{col}: {df[col].unique()}')

Printing the unique values in the categorical columns:
------------------------------------------------------
General_Health: ['Poor' 'Very Good' 'Good' 'Fair' 'Excellent']
Checkup: ['Within the past 2 years' 'Within the past year' '5 or more years ago'
 'Within the past 5 years' 'Never']
Exercise: ['No' 'Yes']
Heart_Disease: ['No' 'Yes']
Skin_Cancer: ['No' 'Yes']
Other_Cancer: ['No' 'Yes']
Depression: ['No' 'Yes']
Diabetes: ['No' 'Yes' 'No, pre-diabetes or borderline diabetes'
 'Yes, but female told only during pregnancy']
Arthritis: ['Yes' 'No']
Sex: ['Female' 'Male']
Age_Category: ['70-74' '60-64' '75-79' '80+' '65-69' '50-54' '45-49' '18-24' '30-34'
 '55-59' '35-39' '40-44' '25-29']
Smoking_History: ['Yes' 'No']


In [7]:
binary_valued_columns = [
    col
    for col in df.select_dtypes('object').columns
    if len(df[col].unique()) == 2
]

print('Printing the categorical columns which are binary valued:')
print('-'*57)
for col in binary_valued_columns:
    print(f'{col}: {df[col].unique()}')

Printing the categorical columns which are binary valued:
---------------------------------------------------------
Exercise: ['No' 'Yes']
Heart_Disease: ['No' 'Yes']
Skin_Cancer: ['No' 'Yes']
Other_Cancer: ['No' 'Yes']
Depression: ['No' 'Yes']
Arthritis: ['Yes' 'No']
Sex: ['Female' 'Male']
Smoking_History: ['Yes' 'No']


In [8]:
print('Factorizing binary valued columns:')
print('-'*34)
for col in binary_valued_columns:
    df[col] = pd.factorize(df[col])[0]
    print(f'{col}: {df[col].unique()}')

Factorizing binary valued columns:
----------------------------------
Exercise: [0 1]
Heart_Disease: [0 1]
Skin_Cancer: [0 1]
Other_Cancer: [0 1]
Depression: [0 1]
Arthritis: [0 1]
Sex: [0 1]
Smoking_History: [0 1]


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308854 entries, 0 to 308853
Data columns (total 19 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   General_Health                308854 non-null  object 
 1   Checkup                       308854 non-null  object 
 2   Exercise                      308854 non-null  int64  
 3   Heart_Disease                 308854 non-null  int64  
 4   Skin_Cancer                   308854 non-null  int64  
 5   Other_Cancer                  308854 non-null  int64  
 6   Depression                    308854 non-null  int64  
 7   Diabetes                      308854 non-null  object 
 8   Arthritis                     308854 non-null  int64  
 9   Sex                           308854 non-null  int64  
 10  Age_Category                  308854 non-null  object 
 11  Height_(cm)                   308854 non-null  float64
 12  Weight_(kg)                   308854 non-nul

In [10]:
print('Printing the unique values in the remaning categorical columns:')
print('-'*63)
for col in df.select_dtypes('object').columns:
    print(f'{col}: {df[col].unique()}')

Printing the unique values in the remaning categorical columns:
---------------------------------------------------------------
General_Health: ['Poor' 'Very Good' 'Good' 'Fair' 'Excellent']
Checkup: ['Within the past 2 years' 'Within the past year' '5 or more years ago'
 'Within the past 5 years' 'Never']
Diabetes: ['No' 'Yes' 'No, pre-diabetes or borderline diabetes'
 'Yes, but female told only during pregnancy']
Age_Category: ['70-74' '60-64' '75-79' '80+' '65-69' '50-54' '45-49' '18-24' '30-34'
 '55-59' '35-39' '40-44' '25-29']


In [11]:
print('Custom factorized "General_Health" values:')
print('-'*42)

# Uncomment the below in case of any warning for usage of deprecated method.
pd.set_option('future.no_silent_downcasting', True)

df["General_Health"] = df["General_Health"].replace(
    ['Poor', 'Fair', 'Good', 'Very Good', 'Excellent'],
    [0, 0.75, 0.50, 0.25, 1]
)
df["General_Health"].unique()

Custom factorized "General_Health" values:
------------------------------------------


array([0, 0.25, 0.5, 0.75, 1], dtype=object)

In [12]:
# In case you the dtype is shown as 'object' for the "General_Health" column, you can convert it to float using the below code:
df["General_Health"] = pd.to_numeric(df["General_Health"])
df["General_Health"].unique()

array([0.  , 0.25, 0.5 , 0.75, 1.  ])

In [13]:
print('Custom factorized "Checkup" values:')
print('-'*34)

# Uncomment the below in case of any warning for usage of deprecated method.
pd.set_option('future.no_silent_downcasting', True)

df["Checkup"] = df["Checkup"].replace(
								['Never', 'Within the past year', 'Within the past 2 years','Within the past 5 years', '5 or more years ago'],
								[0, 0.75, 0.50, 0.25, 1]
							)
df["Checkup"].unique()

Custom factorized "Checkup" values:
----------------------------------


array([0.5, 0.75, 1, 0.25, 0], dtype=object)

In [14]:
# In case you the dtype is shown as 'object' for the "Checkup" column, you can convert it to float using the below code:
df["Checkup"] = pd.to_numeric(df["Checkup"])
df["Checkup"].unique()

array([0.5 , 0.75, 1.  , 0.25, 0.  ])

In [15]:
print('Custom factorized "Age_Category" values:')
print('-'*40)

# Uncomment the below in case of any warning for usage of deprecated method.
pd.set_option('future.no_silent_downcasting', True)

df["Age_Category"] = df["Age_Category"].replace([
    '18-24',
    '25-29',
    '30-34',
    '35-39',
    '40-44',
    '45-49',
    '50-54',
    '55-59',
    '60-64',
    '65-69',
    '70-74',
    '75-79',
    '80+'
], [
    0,
    0.08333333333333333,
    0.16666666666666666,
    0.25,
    0.3333333333333333,
    0.41666666666666663,
    0.5,
    0.5833333333333333,
    0.6666666666666666,
    0.75,
    0.8333333333333333,
    0.9166666666666666,
    1
])
df["Age_Category"].unique()

Custom factorized "Age_Category" values:
----------------------------------------


array([0.8333333333333333, 0.6666666666666666, 0.9166666666666666, 1,
       0.75, 0.5, 0.41666666666666663, 0, 0.16666666666666666,
       0.5833333333333333, 0.25, 0.3333333333333333, 0.08333333333333333],
      dtype=object)

In [16]:
# In case you the dtype is shown as 'object' for the "Age_Category" column, you can convert it to float using the below code:
df["Age_Category"] = pd.to_numeric(df["Age_Category"])
df["Age_Category"].unique()

array([0.83333333, 0.66666667, 0.91666667, 1.        , 0.75      ,
       0.5       , 0.41666667, 0.        , 0.16666667, 0.58333333,
       0.25      , 0.33333333, 0.08333333])

In [17]:
# Since there are is an additional argument attached to the 'Yes' or 'No', we can just factorize as 0/1 like binary valued columns.
# Nor can we cutomize factorize into such as "General_Health", "Checkup" and "Age_Category". So, we'll have to factorize them in sepreated values.
df["Diabetes"] = pd.factorize(df["Diabetes"])[0]
df["Diabetes"].unique()

array([0, 1, 2, 3])

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308854 entries, 0 to 308853
Data columns (total 19 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   General_Health                308854 non-null  float64
 1   Checkup                       308854 non-null  float64
 2   Exercise                      308854 non-null  int64  
 3   Heart_Disease                 308854 non-null  int64  
 4   Skin_Cancer                   308854 non-null  int64  
 5   Other_Cancer                  308854 non-null  int64  
 6   Depression                    308854 non-null  int64  
 7   Diabetes                      308854 non-null  int64  
 8   Arthritis                     308854 non-null  int64  
 9   Sex                           308854 non-null  int64  
 10  Age_Category                  308854 non-null  float64
 11  Height_(cm)                   308854 non-null  float64
 12  Weight_(kg)                   308854 non-nul