# End to End Machine Learning Lab

In [464]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

In [465]:
df = pd.read_csv('../../data/dataset/original_datasets/heart_2020_cleaned.csv')

##  Scikit-Learn Train Test Split

In [466]:
df["HeartDisease"] = df["HeartDisease"].apply(lambda x: 1 if x == 'Yes' else 0)
df["HeartDisease"] = df["HeartDisease"].astype(float) 

`train_test_split` <- Important to learn <a href="https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html">Documentation Link</a>


In [467]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

In [489]:
test_set.head(40)

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
271884,0.0,27.63,Yes,No,No,0.0,25.0,No,Female,25-29,Hispanic,No,Yes,Very good,7.0,No,No,No
270361,0.0,21.95,No,No,No,0.0,20.0,No,Female,30-34,White,No,Yes,Excellent,6.0,No,No,Yes
219060,0.0,31.32,Yes,No,No,0.0,0.0,No,Female,40-44,White,No,Yes,Very good,6.0,Yes,No,No
24010,0.0,40.35,No,No,No,30.0,0.0,No,Female,65-69,White,No,No,Good,8.0,No,No,No
181930,0.0,35.61,Yes,No,No,30.0,30.0,Yes,Female,60-64,White,No,No,Fair,4.0,Yes,No,Yes
24149,1.0,24.63,Yes,No,No,0.0,0.0,No,Female,80 or older,White,Yes,Yes,Very good,8.0,No,No,No
185683,0.0,41.73,No,No,No,0.0,0.0,No,Male,35-39,White,No,Yes,Very good,7.0,No,No,No
316656,0.0,16.3,No,No,No,2.0,0.0,No,Female,25-29,Hispanic,No,No,Excellent,8.0,No,No,No
305719,0.0,21.52,Yes,No,No,30.0,0.0,No,Male,65-69,White,No,No,Good,8.0,No,No,No
56786,0.0,29.53,Yes,No,No,28.0,30.0,Yes,Male,45-49,Black,No,Yes,Fair,5.0,Yes,No,No


stratified sampling based on the income category, to generate the test set with  income category proportions almost identical to those in the full dataset

`StratifiedShuffleSplit` <- Important to learn <a href="https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedShuffleSplit.html">Documentation Link</a>

In [469]:
#Provides train/test indices to split data in train/test sets.
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

In [470]:
for train_index, test_index in split.split(df, df["HeartDisease"]):
    strat_train_set = df.loc[train_index]
    strat_test_set = df.loc[test_index]
    

In [471]:
strat_test_set["HeartDisease"].value_counts() / len(strat_test_set) * 100

HeartDisease
0.0    91.439829
1.0     8.560171
Name: count, dtype: float64

In [472]:
df["HeartDisease"].value_counts() / len(df) * 100

HeartDisease
0.0    91.440454
1.0     8.559546
Name: count, dtype: float64

In [473]:
def income_cat_proportions(data):
    return data["HeartDisease"].value_counts() / len(data)

train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

compare_props = pd.DataFrame({
    "Overall": income_cat_proportions(df),
    "Stratified": income_cat_proportions(strat_test_set),
    "Random": income_cat_proportions(test_set),
}).sort_index()
compare_props["Rand. %error"] = 100 * compare_props["Random"] / compare_props["Overall"] - 100
compare_props["Strat. %error"] = 100 * compare_props["Stratified"] / compare_props["Overall"] - 100

In [474]:
compare_props

Unnamed: 0_level_0,Overall,Stratified,Random,Rand. %error,Strat. %error
HeartDisease,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0,0.914405,0.914398,0.912569,-0.200737,-0.000684
1.0,0.085595,0.085602,0.087431,2.144449,0.007306


In [475]:
# Prepare the Data for Machine Learning Algorithms
heart = strat_train_set.drop("HeartDisease", axis=1)
heart_labels = strat_train_set["HeartDisease"].copy()
heart

Unnamed: 0,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
143355,23.62,Yes,No,No,30.0,30.0,Yes,Female,45-49,White,No,No,Fair,5.0,Yes,No,No
290535,22.43,No,No,No,1.0,0.0,No,Male,25-29,White,No,Yes,Very good,8.0,No,No,No
31528,33.28,Yes,No,No,20.0,30.0,No,Female,30-34,Hispanic,No,No,Fair,7.0,No,No,No
234534,24.96,Yes,No,No,0.0,0.0,No,Female,65-69,White,No,Yes,Very good,8.0,No,No,No
264813,27.25,Yes,No,No,0.0,0.0,No,Female,55-59,White,No,Yes,Very good,5.0,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52259,30.81,Yes,No,Yes,0.0,0.0,No,Male,65-69,White,No,Yes,Excellent,5.0,Yes,No,No
237009,20.16,Yes,No,No,0.0,8.0,Yes,Male,30-34,White,No,Yes,Good,7.0,No,No,No
89081,18.89,No,No,No,0.0,0.0,No,Female,35-39,White,No,Yes,Excellent,7.0,No,No,No
117490,25.02,No,No,No,0.0,0.0,No,Male,80 or older,White,Yes,Yes,Good,8.0,No,No,No


## Transformation Pipelines

`Pipeline` <- Important to learn <a href="https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html">Documentation Link</a>

There are many data transformation steps that need to be executed in the right order. Fortunately, Scikit-Learn provides the Pipeline class to help with such sequences of transformations.

`StandardScaler` <- Important to learn <a href="https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html">Documentation Link</a>

Standardization used in numerical data preparation, first it subtracts the mean value (so standardized values always have a zero mean), and then it divides by the standard deviation so that the resulting distribution has unit variance.



In [476]:
cat_features=df.select_dtypes(include=['object']).columns
len(cat_features),cat_features

(13,
 Index(['Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'Sex',
        'AgeCategory', 'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth',
        'Asthma', 'KidneyDisease', 'SkinCancer'],
       dtype='object'))

In [477]:
num_features=df.select_dtypes("number").columns.to_list()
len(num_features),num_features

(5, ['HeartDisease', 'BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime'])

In [478]:
binary_features = [feat for feat in cat_features if df[feat].nunique() == 2]
multi_cat_features = [feat for feat in cat_features if df[feat].nunique() > 2]


In [479]:
binary_features,multi_cat_features,num_features

(['Smoking',
  'AlcoholDrinking',
  'Stroke',
  'DiffWalking',
  'Sex',
  'PhysicalActivity',
  'Asthma',
  'KidneyDisease',
  'SkinCancer'],
 ['AgeCategory', 'Race', 'Diabetic', 'GenHealth'],
 ['HeartDisease', 'BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime'])

In [480]:
multi_cat_features.remove("GenHealth")
ordinal_features = ["GenHealth"]
binary_features,multi_cat_features,num_features,ordinal_features

(['Smoking',
  'AlcoholDrinking',
  'Stroke',
  'DiffWalking',
  'Sex',
  'PhysicalActivity',
  'Asthma',
  'KidneyDisease',
  'SkinCancer'],
 ['AgeCategory', 'Race', 'Diabetic'],
 ['HeartDisease', 'BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime'],
 ['GenHealth'])

In [481]:
# Custom transformer for dropping duplicate rows
class DropDuplicatesTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.drop_duplicates()
        return X

In [482]:
# Define categorical features
categorical_features = heart.select_dtypes(include=['object']).columns.to_list()
categorical_features.remove('GenHealth')

# Define numerical features
numerical_features = heart.select_dtypes("number").columns.to_list()


# Define ordinal features
ordinal_features = ['GenHealth']

# Separate binary and multi-category features
binary_features = [feat for feat in categorical_features if heart[feat].nunique() == 2]
multi_cat_features = [feat for feat in categorical_features if heart[feat].nunique() > 2]

# Numerical pipeline
num_pipeline = Pipeline([
    ('std_scaler', StandardScaler()),
])

# Full pipeline
full_pipeline = Pipeline([
    ("drop_duplicates", DropDuplicatesTransformer()),
    ("preprocess", ColumnTransformer([
        ("num", num_pipeline, numerical_features),
        ("binary_cat", OneHotEncoder(handle_unknown='ignore', drop='first'), binary_features), 
        ("multi_cat", OneHotEncoder(handle_unknown='ignore'), multi_cat_features), 
        ("ord", OrdinalEncoder(categories=[['Poor', 'Fair', 'Good', 'Very good', 'Excellent']]), ordinal_features)
    ]))
])

# Apply the transformations
heart_prepared = full_pipeline.fit_transform(heart)

In [483]:
heart_prepared


<242483x37 sparse matrix of type '<class 'numpy.float64'>'
	with 2461216 stored elements in Compressed Sparse Row format>

In [484]:
heart_prepared.shape

(242483, 37)

In [485]:

heart_labels 

143355    0.0
290535    0.0
31528     0.0
234534    0.0
264813    0.0
         ... 
52259     0.0
237009    0.0
89081     0.0
117490    0.0
241203    0.0
Name: HeartDisease, Length: 255836, dtype: float64

# Select and Train a Model

You will use some regression models: 
- Logistic Regression
- Decision Tree Regression
- Random Forest Regressor
- Support Vector Regression

## Logistic Regression


`LogisticRegression` <- Important to learn

- fit() <- Train the Model 
- predict() <- To predict the value

## Decision Tree Classifier
`DecisionTreeClassifier`

