In [None]:
# !pip install pandas --quiet
# !pip install openpyxl --quiet
# !pip install scikit-learn --quiet
# !pip install matplotlib --quiet
# !pip install seaborn --quiet
# !pip install xgboost --quiet
# !pip install statsmodels --quiet


In [None]:
import pandas as pd
import numpy as np

from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.metrics import classification_report,confusion_matrix
import statsmodels.api as sm
import statistics as st

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Read the raw dataset from the Excel workbook (path is relative to the working directory).
data = pd.read_excel(io='health_dataset.xlsx')

## 1. Preliminary Analysis

In [None]:
# Report the dataset dimensions, then its column index, one per line.
print(data.shape, data.columns, sep='\n')

Missing Values

In [None]:
# Count the missing entries in every column.
data.isnull().sum()

Duplicates

In [None]:
# Report how many fully duplicated rows exist, remove them, then confirm none remain.
print('Before dropping duplicates')
print(f'Number of duplicates: {data.duplicated().sum()}')
# Rebind rather than mutate in place so the de-duplication is an explicit step.
data = data.drop_duplicates()
print('')
print('After dropping duplicates')
print(f'Number of duplicates: {data.duplicated().sum()}')

Split the dataset into train and test

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
train, test, trainLab, testLab = train_test_split(
    data.drop(columns='target'),
    data['target'],
    test_size=0.2,
    random_state=314,
)

## Dictionary sorting the dataset's columns into numerical and categorical

In [None]:
# Map each variable kind to the training columns of that kind.
# 'num' lists the columns handled as numerical; every other column is treated as categorical.
var_type = {'num': ['age', 'trestbps', 'chol', 'thalach']}
var_type['cat'] = [col for col in train.columns if col not in var_type['num']]

## 2. Prepare a Report About the Data

In [None]:
## Show the distribution of each variable

# Draw the histograms two per figure. When the number of columns is odd, the
# last figure hides its unused second panel instead of relying on a swallowed
# IndexError, so genuine plotting errors are no longer silenced.
cols = train.columns
for i in range(0, len(cols), 2):
    pair = cols[i:i + 2]
    fig, axes = plt.subplots(1, 2, figsize=(10, 5))
    for ax, col in zip(axes, pair):
        sns.histplot(train[col], ax=ax)
        ax.set_title(col)
    # Hide any panel that did not receive a column (only possible on the last figure).
    for ax in axes[len(pair):]:
        ax.set_visible(False)
    fig.tight_layout()
    plt.show()
    plt.close(fig)


## Measures of Central Tendency

In [None]:
def st_err(col, data=None):
    """Return the standard error of the mean of one column.

    The standard error is the sample standard deviation (ddof=1) divided by
    the square root of the number of non-missing observations. Dividing by
    sqrt(n - 1) instead would apply Bessel's correction a second time.

    Parameters
    ----------
    col : label
        Name of the column to summarise.
    data : pandas.DataFrame, optional
        Frame to read the column from. Defaults to the notebook-level
        ``train`` frame.

    Returns
    -------
    float
        Standard error of the mean; missing values are ignored.
    """
    frame = train if data is None else data
    return frame[col].sem()


In [None]:
# Numerical columns: name, mean, standard error, then a blank separator line.
for num_col in var_type['num']:
    print(num_col, train[num_col].mean(), st_err(num_col), '', sep='\n')

# Categorical columns: name, mode, then the mean of the column's values.
for cat_col in var_type['cat']:
    print(cat_col, st.mode(train[cat_col]), st.mean(train[cat_col]), sep='\n')

## 3. Modeling

Look at correlations with target variable using heatmap

In [None]:
# Attach the training labels to a copy of the features so that correlations
# with the target can be computed alongside the feature correlations.
train_dummy = train.copy()
train_dummy['label'] = trainLab
train_dummy = train_dummy[[col for col in train_dummy.columns if 'scaled' not in col]]

# Compute the correlation matrix once and reuse it for both panels and the table.
corr = train_dummy.corr()
# The label correlates perfectly with itself; drop its own row so that
# "Top 5" really lists the five features most positively correlated with it.
top_corr = corr.drop(index='label').nlargest(5, 'label')

fig, (ax_all, ax_top) = plt.subplots(1, 2, figsize=(15, 5))
ax_all.set_title('Correlation Heatmap - All Variables')
sns.heatmap(corr, cmap='viridis', ax=ax_all)
ax_top.set_title('Correlation Heatmap - Top 5')
sns.heatmap(top_corr, cmap='viridis', ax=ax_top)

# Show the same five rows as a table beneath the figure.
top_corr

In [None]:
# Fit an ordinary least squares model of the label on the training features.
# statsmodels does not include an intercept by default, so the constant column
# is added explicitly; without it the fit is forced through the origin and the
# coefficient p-values are distorted.
model = sm.OLS(trainLab, sm.add_constant(train))
olsres = model.fit()
# NOTE: `sum` shadows the builtin of the same name; the name is kept because
# the following cell displays it.
sum = olsres.summary()

Explore using statsmodels.api.OLS() for feature selection based on p-values

In [None]:
# Display the OLS summary; `sum` here is the summary object assigned in the
# previous cell, not the Python builtin.
sum

## Pipeline creation
<ul style="list-style:none">
    <li>
    1. Impute Missing Values // Scale/Encode Variables
    </li>
    <li>
    2. Apply Num/Cat Pipeline to Numerical/Categorical Variables
    </li>
    <li>
    3. Apply Classifier (Randomized Search)
    </li>
</ul>

In [None]:
## 1.
# Numerical branch: median imputation followed by standardisation.
num_pipeline = Pipeline(steps=[
    ('si', SimpleImputer(strategy='median')),
    ('std', StandardScaler()),
])
# Categorical branch: most-frequent imputation followed by one-hot encoding;
# categories not seen while fitting are ignored at transform time.
cat_pipeline = Pipeline(steps=[
    ('si', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])

In [None]:
## 2.
# Route each column group through its matching preprocessing branch.
preprocessing_pipeline = ColumnTransformer(transformers=[
    ('num', num_pipeline, var_type['num']),
    ('cat', cat_pipeline, var_type['cat']),
])

See params from logistic regression

In [None]:
# Print the LogisticRegression documentation to look up its tunable parameters.
help(LogisticRegression)

In [None]:
## 3.
# Untuned logistic regression; also the estimator the randomized search tunes.
lr_base = LogisticRegression()
l = len(train.columns)  # number of feature columns
# `dual` is deliberately not searched: dual=True is only implemented for the
# liblinear solver, so under the default solver every such candidate fails to
# fit and wastes a search iteration.
params_lr = {
    'tol': [1e-4, 1e-3, 1e-2, 1e-1, 1, 10],
    'C': [1e-2, 1e-1, 1, 10],
}
# Seed the search so the same candidates are sampled on every run.
cvlr = RandomizedSearchCV(lr_base, param_distributions=params_lr, random_state=314)
pipeline_lr_base = Pipeline([('prep', preprocessing_pipeline), ('lr_base', lr_base)])
pipeline_lr_opt = Pipeline([('prep', preprocessing_pipeline), ('cvlr', cvlr)])

# Fit the tuned pipeline on the training split and score it on the held-out split.
print('Logistic Regression Tuned')
pipeline_lr_opt.fit(train, trainLab)
print(classification_report(testLab, pipeline_lr_opt.predict(test)))

See Params from RandomForestClassifier

In [None]:
# Print the RandomForestClassifier documentation to look up its tunable parameters.
help(RandomForestClassifier)

In [None]:
## 3.
# Derive the depth candidates from the feature count inside this cell so it
# does not depend on a helper variable defined in a different cell.
n_features = len(train.columns)
# Seed both the forest and the search so results are repeatable across runs.
rf_base = RandomForestClassifier(random_state=314)
params_rf = {
    'n_estimators': [100, 500, 1000, 5000],
    'criterion': ['gini', 'entropy'],
    'max_depth': [n_features // 3, n_features // 2, (2 * n_features) // 3],
}
cvrf = RandomizedSearchCV(rf_base, param_distributions=params_rf, n_iter=5, random_state=314)

pipeline_rf_base = Pipeline([('prep', preprocessing_pipeline), ('rf_base', rf_base)])
pipeline_rf_opt = Pipeline([('prep', preprocessing_pipeline), ('cvrf', cvrf)])

# Fit the tuned pipeline on the training split and score it on the held-out split.
print('Random Forest Tuned Model')
pipeline_rf_opt.fit(train, trainLab)
print(classification_report(testLab, pipeline_rf_opt.predict(test)))


In [None]:
# def mean_enc(col):
#     mean_enc = {}
#     df = train.copy()
#     df['label'] = trainLab
#     x = df.groupby(by=col).mean()['label']
#     return {ind:x[ind] for ind in x.index}

# for col in var_type['cat']:
#     me = mean_enc(col)
#     train[f'{col}_me'] = train[col].map(me)
#     test[f'{col}_me'] = test[col].map(me)