# Sentiment Classification (5 Classes) with Tree Ensembles

In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')


## Loading the Data

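[UCI Heart-Disease Data](https://archive.ics.uci.edu/ml/datasets/Heart+Disease)

> **Note:** the feature list below describes the UCI Heart-Disease data. The code in this notebook actually loads a phrase-sentiment dataset (`sentiment_5_class*.csv`) with a text column `Phrase` and an integer target `Sentiment` taking the values 0–4.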

- age
- sex
- chest pain type (4 values)
- resting blood pressure
- serum cholesterol in mg/dl
- fasting blood sugar > 120 mg/dl
- resting electrocardiographic results (values 0,1,2)
- maximum heart rate achieved
- exercise induced angina
- oldpeak = ST depression induced by exercise relative to rest
- the slope of the peak exercise ST segment
- number of major vessels (0-3) colored by fluoroscopy
- thal: 3 = normal; 6 = fixed defect; 7 = reversible defect

Targets:
- 0: Has Disease
- 1: No Disease

In [29]:
import warnings
warnings.filterwarnings('ignore')

In [31]:
# Combined dataset, followed by the pre-split training and test files.
df = pd.read_csv('sentiment_5_class.csv')
train_df = pd.read_csv('sentiment_5_class_train.csv')
test_df = pd.read_csv('sentiment_5_class_test.csv')

# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,Phrase,Sentiment
0,the prisoner,2
1,The sheer joy and pride they took in their wor...,3
2,has never made a more sheerly beautiful film t...,3
3,the story has the sizzle of old news that has ...,3
4,far superior,4


## Train-test Split

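The data already comes as separate training and test CSV files, so no further splitting is needed here.

We only pull `X_train`, `X_test`, `y_train`, `y_test` out of the two data frames.

For reference, the files correspond to a `test_size=0.2` split (14711 training vs. 3678 test phrases); the commented-out `train_test_split` call below uses `test_size=0.2`, `random_state=1`.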

In [32]:
# Raw phrases are the model inputs.
X_train = train_df['Phrase'].tolist()
X_test = test_df['Phrase'].tolist()
print('No of instances',len(X_train), len(X_test))

# Integer sentiment labels are the targets.
y_train = train_df['Sentiment'].tolist()
y_test = test_df['Sentiment'].tolist()

# Distinct label values and per-class counts, to compare class balance across the two splits.
values_train, counts_train = np.unique(y_train, return_counts=True)
values_test, counts_test = np.unique(y_test, return_counts=True)
print(values_train, values_test)
print(counts_train, counts_test)

No of instances 14711 3678
[0 1 2 3 4] [0 1 2 3 4]
[ 988 1165 1876 7033 3649] [ 247  291  469 1759  912]


In [33]:
# from sklearn.model_selection import train_test_split

# X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=1)

In [34]:
# xdf =pd.DataFrame({'Country':['Nepal','India','China']})
# pd.get_dummies(xdf)

In [35]:
from pandas_profiling import ProfileReport

# Build one profiling report per split; both use the full-width HTML layout.
train_profile = ProfileReport(
    train_df,
    title='Pandas Profiling Train Dataset Report',
    html={'style': {'full_width': True}},
)
test_profile = ProfileReport(
    test_df,
    title='Pandas Profiling Test Dataset Report',
    html={'style': {'full_width': True}},
)


HBox(children=(FloatProgress(value=0.0, description='variables', max=2.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='correlations', max=6.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='interactions [continuous]', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='table', max=1.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='missing', max=2.0, style=ProgressStyle(description_width=…









HBox(children=(FloatProgress(value=0.0, description='package', max=1.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='build report structure', max=1.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='variables', max=2.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='correlations', max=6.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='interactions [continuous]', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='table', max=1.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='missing', max=2.0, style=ProgressStyle(description_width=…









HBox(children=(FloatProgress(value=0.0, description='package', max=1.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='build report structure', max=1.0, style=ProgressStyle(des…




## Feature Extraction

**Count Feature Vectorization<br>
TF-IDF Feature Vectorization**

In [36]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

## Count Vectorizer Vs TF-IDF Vectorizer

In [37]:
count_vectorizer = CountVectorizer()
tfidf_vectorizer = TfidfVectorizer()

# Learn the vocabulary (and, for TF-IDF, the IDF weights) from the training
# phrases only, so nothing about the test set leaks into the features.
count_vectorizer.fit(X_train)
tfidf_vectorizer.fit(X_train)

# Vocabulary size == number of feature columns. `vocabulary_` is read directly
# instead of calling `get_feature_names()`, which newer scikit-learn releases
# no longer provide; the printed numbers are the same.
print(len(count_vectorizer.vocabulary_))
print(len(tfidf_vectorizer.vocabulary_))

7115
7115


In [38]:
# COUNT VECTORIZER
X_train_count_v=count_vectorizer.transform(X_train)
X_test_count_v= count_vectorizer.transform(X_test)

#TF-IDF VECTORIZER
X_train_tfidf_v=tfidf_vectorizer.transform(X_train)
X_test_tfidf_v=tfidf_vectorizer.transform(X_test)

print(X_train_count_v.shape, X_test_count_v.shape)

(14711, 7115) (3678, 7115)


## Bagging Classifier

In [49]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn import model_selection, metrics
from sklearn.metrics import accuracy_score

In [50]:
from sklearn.model_selection import GridSearchCV

# Search space for the bagging ensemble: number of base estimators, and the
# fraction of features / samples drawn for each base estimator.
grid_params = dict(
    n_estimators=(5, 10),
    max_features=(0.6, 0.7, 0.8),
    max_samples=(0.6, 0.7, 0.8),
)

In particular, max_samples and max_features control the size of the subsets (in terms of samples and features), while bootstrap and bootstrap_features control whether samples and features are drawn with or without replacement.

In [62]:
# Cross-validated grid search over `grid_params` for a seeded bagging ensemble,
# trained on the count-vectorized training phrases.
clf = GridSearchCV(
    estimator=BaggingClassifier(random_state=1),
    param_grid=grid_params,
)
clf.fit(X_train_count_v, y_train)



In [63]:
# Cross-validated score of the winning configuration, and the configuration itself.
print('Best score: ', clf.best_score_,'\n',clf.best_params_)

# Evaluate the selected model on the held-out test phrases.
y_preds_final = clf.predict(X_test_count_v)
accuracy = accuracy_score(y_test, y_preds_final)  # kept for inspection; not printed
# print('Accuracy: ',accuracy)

bagging_report = metrics.classification_report(y_test, y_preds_final)
print('\n\tClassification Report of best feature\n\t with Bagging\n')
print(bagging_report)


Best score:  0.6442794699101648 
 {'max_features': 0.6, 'max_samples': 0.8, 'n_estimators': 10}

	Classification Report of best feature
	 with Bagging

              precision    recall  f1-score   support

           0       0.73      0.54      0.62       247
           1       0.63      0.42      0.50       291
           2       0.63      0.54      0.58       469
           3       0.67      0.82      0.74      1759
           4       0.69      0.57      0.62       912

    accuracy                           0.67      3678
   macro avg       0.67      0.58      0.61      3678
weighted avg       0.67      0.67      0.66      3678



## Gradient  Tree Boosting

The number of weak learners (i.e. regression trees) is controlled by the parameter `n_estimators`.

The size of each tree can be controlled either by setting the tree depth via max_depth or by setting the number of leaf nodes via max_leaf_nodes. 

The `learning_rate` is a hyper-parameter in the range (0.0, 1.0] that controls overfitting via shrinkage.

Note (multi-class classification):

Classification with more than 2 classes requires the induction of n_classes regression trees at each iteration, thus, the total number of induced trees equals n_classes * n_estimators. For datasets with a large number of classes we strongly recommend to use HistGradientBoostingClassifier as an alternative to GradientBoostingClassifier.

## AdaBoostClassifier

The number of weak learners is controlled by the parameter n_estimators. 
The learning_rate parameter controls the contribution of the weak learners in the final combination. 
By default, weak learners are decision stumps. Different weak learners can be specified through the base_estimator parameter. 

The main parameters to tune to obtain good results are n_estimators and the complexity of the base estimators (e.g., its depth max_depth or minimum required number of samples to consider a split min_samples_split).

## Automation

In [53]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

# Module-level integer counter, starting at zero.
i=0

In [64]:
def grid_search_classifier(classifier, train_x, train_y, test_x, test_y, grid_params, scoring='accuracy'):
    """Grid-search one classifier class and report the best model's test performance.

    Parameters
    ----------
    classifier : class
        Estimator class (not an instance). It is instantiated here with
        ``random_state=1``.
    train_x, train_y
        Training features and labels used for the cross-validated search.
    test_x, test_y
        Held-out features and labels used for the final report.
    grid_params : dict
        Parameter grid handed to ``GridSearchCV``.
    scoring : str, default 'accuracy'
        Metric used by ``GridSearchCV`` to rank parameter combinations.

    Returns
    -------
    float
        Accuracy of the best model on ``(test_x, test_y)``.

    Notes
    -----
    Prints the best cross-validation score, the best parameters and a
    classification report. The report heading is derived from the classifier
    class itself, so the function can be called any number of times and in any
    order.
    """
    clf = GridSearchCV(classifier(random_state=1), grid_params, scoring=scoring)
    clf.fit(train_x, train_y)
    print('Best score: ', clf.best_score_, clf.best_params_)

    # GridSearchCV (refit=True by default) has already refit the winning
    # configuration on the full training data, so no second fit is needed.
    best_model = clf.best_estimator_

    y_preds_final = best_model.predict(test_x)
    accuracy = accuracy_score(test_y, y_preds_final)

    # Human-readable headings for the known ensembles; any other class falls
    # back to its own class name.
    display_names = {
        'GradientBoostingClassifier': 'Gradient Boosting Classifier',
        'AdaBoostClassifier': 'Ada Boost Classifier',
        'RandomForestClassifier': 'Random Forest Classifier',
        'ExtraTreesClassifier': 'Extra Trees Classifier',
    }
    class_name = getattr(classifier, '__name__', str(classifier))
    heading = display_names.get(class_name, class_name)

    print('\n\tClassification Report of best feature with {}\n'.format(heading))
    # Report against the labels passed in, not a notebook-level global.
    print(metrics.classification_report(test_y, y_preds_final))

    return accuracy

In [65]:
# (estimator class, hyper-parameter grid) pairs consumed by the comparison loop.
# A DecisionTreeClassifier entry with
# {'min_samples_split': [2, 4], 'min_samples_leaf': [1, 2]} is currently disabled.
list_classifier = [
    (
        GradientBoostingClassifier,
        {
            'max_leaf_nodes': [2, 4, 6],
            'min_samples_split': [2, 4, 6],
        },
    ),
    (
        AdaBoostClassifier,
        {
            'n_estimators': [50, 100],
            'learning_rate': [0.5, 1],
        },
    ),
    (
        RandomForestClassifier,
        {
            'n_estimators': [50, 100],
            'min_samples_split': [2, 4],
            'min_samples_leaf': [1, 2],
        },
    ),
    (
        ExtraTreesClassifier,
        {
            'n_estimators': [50, 100],
            'min_samples_split': [2, 4],
            'min_samples_leaf': [1, 2],
        },
    ),
]

In [66]:
# Start every run of this cell from a clean state: an empty result list and the
# module-level counter `i` back at zero, so the cell can safely be re-run.
i = 0
accuracies = []

# Grid-search each ensemble on the count-vectorized phrases and record its test accuracy.
for classifier, grid in list_classifier:
    accuracy = grid_search_classifier(classifier, X_train_count_v, y_train, X_test_count_v, y_test, grid)
    accuracies.append({'Classifier: ': str(classifier), 'Accuracy': accuracy})
    print('\n')

print(accuracies, '\n')

Best score:  0.5523082921763218 {'max_leaf_nodes': 6, 'min_samples_split': 4}

	Classification Report of best feature with Gradient Boosting Classifier

              precision    recall  f1-score   support

           0       0.86      0.23      0.36       247
           1       0.77      0.10      0.18       291
           2       0.77      0.06      0.11       469
           3       0.52      0.95      0.67      1759
           4       0.68      0.28      0.39       912

    accuracy                           0.55      3678
   macro avg       0.72      0.32      0.34      3678
weighted avg       0.64      0.55      0.47      3678



Best score:  0.517776017618227 {'learning_rate': 1, 'n_estimators': 100}

	Classification Report of best feature with Ada Boost Classifier

              precision    recall  f1-score   support

           0       0.60      0.33      0.43       247
           1       0.54      0.07      0.12       291
           2       0.54      0.03      0.05       469