# CS4661 - Homework 3

Jaquan Jones

In [1]:
# library imports
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier
# StandardScaler and make_pipeline are imported so that logistic regression can be
# fit on standardized features, which avoids the convergence issues seen with this data set
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier


## Question1: Predicting Heart Disease

### A. Read the data file “Heart_s.csv” (from GitHub, using the following command) and assign it to a Pandas DataFrame.

In [2]:
HEART_DATA_URL = "https://github.com/mpourhoma/CS4661/raw/master/Heart_s.csv"

# Load the heart-disease data set straight from GitHub into a DataFrame
df = pd.read_csv(HEART_DATA_URL)

# Preview the first five rows
df.head()

Unnamed: 0,Age,Gender,ChestPain,RestBP,Chol,RestECG,MaxHR,Oldpeak,Thal,AHD
0,63,f,typical,145,233,2,150,2.3,fixed,No
1,67,f,asymptomatic,160,286,2,108,1.5,normal,Yes
2,67,f,asymptomatic,120,229,2,129,2.6,reversable,Yes
3,37,f,nonanginal,130,250,0,187,3.5,normal,No
4,41,m,nontypical,130,204,2,172,1.4,normal,No


### B. Check out the dataset. As you see, the dataset contains a number of features including both contextual and biological factors (e.g. age, gender, vital signs, …). The last column “AHD” is the label with “Yes” meaning that a human subject has Heart Disease, and “No” meaning that the subject does not have Heart Disease.



In [3]:
# Label vector: the "AHD" column of the data set
TARGET_COLUMN = 'AHD'
y = df[TARGET_COLUMN]

y.head()

0     No
1    Yes
2    Yes
3     No
4     No
Name: AHD, dtype: object

### C. As you see, there are at least 3 categorical features in the dataset (Gender, ChestPain, Thal). Let’s ignore these categorical features for now, only keep the numerical features and build your feature matrix and label vector.

In [4]:
# Numerical features only; the categorical columns (Gender, ChestPain, Thal)
# are deliberately left out at this stage
selected_features = ['Age', 'RestBP', 'Chol', 'RestECG', 'MaxHR', 'Oldpeak']

X = df.loc[:, selected_features]

X.head()

Unnamed: 0,Age,RestBP,Chol,RestECG,MaxHR,Oldpeak
0,63,145,233,2,150,2.3
1,67,160,286,2,108,1.5
2,67,120,229,2,129,2.6
3,37,130,250,0,187,3.5
4,41,130,204,2,172,1.4


### D. Split the dataset into testing and training sets with the following parameters: test_size=0.25, random_state=4.

In [5]:
# 75% / 25% train/test split; random_state is fixed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=4
)

# Shapes, printed in the order X_train, y_train, X_test, y_test
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

# First ten training rows followed by their labels
print('\t\t\t***X_train***\n')
print(X_train.head(10))
print('\n\t***y_train***\n')
print(y_train.head(10))

(225, 6)
(225,)
(76, 6)
(76,)
			***X_train***

     Age  RestBP  Chol  RestECG  MaxHR  Oldpeak
128   62     124   209        0    163      0.0
103   49     120   188        0    139      2.0
212   66     178   228        0    165      1.0
100   34     118   182        2    174      0.0
280   55     128   205        1    130      2.0
172   62     140   394        2    157      1.2
188   69     140   254        2    146      2.0
129   54     120   258        2    147      0.4
91    62     130   231        0    146      1.8
134   55     135   250        2    161      1.4

	***y_train***

128     No
103    Yes
212    Yes
100     No
280    Yes
172     No
188    Yes
129     No
91      No
134     No
Name: AHD, dtype: object


### E. Use KNN (with k=3), Decision Tree (with random_state=5 (this random state is for decision tree and you put it when you define the decision tree classifier. It is different from the random state that you used to split the data in part D)), and Logistic Regression Classifiers to predict Heart Disease based on the training/testing datasets that you built in part (d). **Then check, compare, and report the accuracy of these 3 classifiers. Which one is the best? Which one is the worst?**

---

**Answer E:**
<br>**Logistic Regression** classifier returned the **best accuracy**, with an accuracy score of 0.7368421052631579
<br><br>
**Decision Tree** classifier returned the **worst accuracy**, with an accuracy score of 0.6052631578947368
<br><br>
K-Neighbors Classifier returned an accuracy score of 0.6973684210526315

In [6]:
# Methods returning accuracy scores of KNN, Logistic Regression, & Decision Tree

def _fit_and_score(classifier, X_training, y_training, X_testing, y_testing):
    """Fit a classifier, print its test-set predictions, and return its accuracy.

    Shared by the accuracy_of_* functions below so the
    fit / predict / print / score sequence exists in one place only.

    Parameters:
        classifier: unfitted estimator exposing fit(X, y) and predict(X)
        X_training, y_training: features and labels used for fitting
        X_testing, y_testing: features and labels used for scoring

    Returns:
        The value of accuracy_score(y_testing, predictions).
    """
    # Train model with "fit" method along with training dataset and labels
    classifier.fit(X_training, y_training)

    # Generate label predictions vector
    y_predictions = classifier.predict(X_testing)

    # print predictions for testing set
    print('Predictions for testing set:\n')
    print(y_predictions)

    # return overall accuracy of predicted labels vs actual labels vectors
    return accuracy_score(y_testing, y_predictions)


def accuracy_of_knn(k, X_training, y_training, X_testing, y_testing):
    """Return the test-set accuracy of a KNN classifier with `k` neighbors.

    The predictions for X_testing are printed as a side effect.
    """
    knn = KNeighborsClassifier(n_neighbors=k)
    return _fit_and_score(knn, X_training, y_training, X_testing, y_testing)


def accuracy_of_decision_tree(rs, X_training, y_training, X_testing, y_testing):
    """Return the test-set accuracy of a Decision Tree classifier.

    `rs` is passed to the classifier as its random_state.
    The predictions for X_testing are printed as a side effect.
    """
    dt = DecisionTreeClassifier(random_state=rs)
    return _fit_and_score(dt, X_training, y_training, X_testing, y_testing)


def accuracy_of_logistic_regression(X_training, y_training, X_testing, y_testing):
    """Return the test-set accuracy of a Logistic Regression classifier.

    The predictions for X_testing are printed as a side effect.
    """
    # StandardScaler runs inside the pipeline: it is fitted on the training data
    # during fit() and re-applied to the testing data during predict().
    # It was added to avoid convergence errors when fitting on the raw features.
    lr = make_pipeline(StandardScaler(), LogisticRegression())
    return _fit_and_score(lr, X_training, y_training, X_testing, y_testing)

In [7]:
# KNN classifier (part E)

# Score a k=3 nearest-neighbours model with the helper defined earlier
k = 3
knn_accuracy = accuracy_of_knn(
    k,
    X_training=X_train,
    y_training=y_train,
    X_testing=X_test,
    y_testing=y_test,
)

knn_report = f'\nAccuracy of KNN classifier, k={k}: {knn_accuracy}'
print(knn_report)

Predictions for testing set:

['Yes' 'No' 'No' 'Yes' 'No' 'Yes' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes' 'No'
 'No' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'No' 'No' 'Yes' 'No' 'No' 'Yes' 'No'
 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'No' 'No' 'Yes' 'Yes' 'Yes' 'No' 'No' 'No'
 'No' 'Yes' 'No' 'Yes' 'Yes' 'Yes' 'No' 'No' 'Yes' 'No' 'Yes' 'No' 'Yes'
 'Yes' 'No' 'Yes' 'Yes' 'Yes' 'No' 'No' 'Yes' 'No' 'No' 'No' 'Yes' 'Yes'
 'No' 'No' 'Yes' 'Yes' 'Yes' 'No' 'No' 'No' 'No' 'No' 'No']

Accuracy of KNN classifier, k=3: 0.6973684210526315


In [8]:
# Decision Tree classifier (part E)

# Score a decision tree seeded with random_state=5 via the helper defined earlier
random_state = 5
dt_accuracy = accuracy_of_decision_tree(
    random_state,
    X_training=X_train,
    y_training=y_train,
    X_testing=X_test,
    y_testing=y_test,
)

dt_report = f'\nAccuracy of Decision Tree classifier, random_state={random_state}: {dt_accuracy}'
print(dt_report)

Predictions for testing set:

['Yes' 'No' 'No' 'No' 'No' 'Yes' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes' 'Yes'
 'No' 'No' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'No' 'Yes' 'Yes' 'Yes' 'Yes'
 'Yes' 'No' 'No' 'No' 'No' 'Yes' 'Yes' 'No' 'No' 'No' 'Yes' 'No' 'Yes'
 'Yes' 'Yes' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'Yes' 'Yes'
 'Yes' 'No' 'Yes' 'Yes' 'No' 'No' 'No' 'No' 'No' 'Yes' 'No' 'Yes' 'Yes'
 'Yes' 'No' 'No' 'No' 'Yes' 'No' 'Yes' 'No' 'Yes' 'No' 'No' 'Yes']

Accuracy of Decision Tree classifier, random_state=5: 0.6052631578947368


In [9]:
# Logistic Regression classifier (part E)

# Score the logistic-regression pipeline via the helper defined earlier
lr_accuracy = accuracy_of_logistic_regression(
    X_training=X_train,
    y_training=y_train,
    X_testing=X_test,
    y_testing=y_test,
)

lr_report = f'\nAccuracy of Logistic Regression classifier: {lr_accuracy}'
print(lr_report)

Predictions for testing set:

['Yes' 'No' 'No' 'No' 'No' 'Yes' 'Yes' 'Yes' 'No' 'No' 'No' 'No' 'Yes'
 'No' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No'
 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'No' 'Yes' 'No' 'Yes' 'No' 'No' 'Yes'
 'No' 'No' 'No' 'Yes' 'Yes' 'Yes' 'No' 'No' 'No' 'No' 'Yes' 'Yes' 'Yes'
 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes' 'Yes' 'Yes' 'No' 'No' 'Yes' 'No'
 'No' 'No' 'Yes' 'No' 'No' 'No' 'No' 'Yes' 'No' 'No' 'No']

Accuracy of Logistic Regression classifier: 0.7368421052631579


### F. Now, we want to use the categorical features as well! To this end, we have to perform a feature engineering process called OneHotEncoding for the categorical features. <br><br>To do this, each categorical feature should be replaced with dummy columns in the feature table (one column for each possible value of a categorical feature), and then encoded in a binary manner such that only one of the dummy columns can take “1” at a time (and zero for the rest). <br><br>For example, “Gender” can take two values, “m” and “f”. Thus, we need to replace this feature (in the feature table) by 2 columns titled “m” and “f”. Wherever we have a male subject, we put “1” and “0” in the columns “m” and “f”. Wherever we have a female subject, we put “0” and “1” in the columns “m” and “f”. (Hint: you will need 4 columns to encode “ChestPain” and 3 columns to encode “Thal”.)

In [10]:
# NOTE: abandoned draft, left commented out. It would have mapped each distinct
# ChestPain value to an integer code (one number per category) rather than to
# one-hot columns; nothing in the visible notebook references these names.

# unique_chest_pain_statuses = set(df['ChestPain'])
# # print(unique_chest_pain_values)

# chest_pain_encoding = {}
# encoding_count = 0

# for status in unique_chest_pain_statuses:
#         chest_pain_encoding[status] = encoding_count
#         encoding_count += 1

# print('Chest Pain Status Codes:')
# for status, code in chest_pain_encoding.items():
#         print(f'- {status}: {code}')

# get_chest_pain_encoding = lambda status: chest_pain_encoding[status]

In [11]:
# Start the one-hot-encoded feature table, new_X, from the numerical columns.
# .copy() makes new_X an independent DataFrame, so adding the encoded columns
# to it later does not raise pandas' SettingWithCopyWarning.
selected_features = ['Age', 'RestBP', 'Chol', 'RestECG', 'MaxHR', 'Oldpeak']

new_X = df[selected_features].copy()

new_X

Unnamed: 0,Age,RestBP,Chol,RestECG,MaxHR,Oldpeak
0,63,145,233,2,150,2.3
1,67,160,286,2,108,1.5
2,67,120,229,2,129,2.6
3,37,130,250,0,187,3.5
4,41,130,204,2,172,1.4
...,...,...,...,...,...,...
296,45,110,264,0,132,1.2
297,68,144,193,0,141,3.4
298,57,130,131,0,115,1.2
299,57,130,236,2,174,0.0


In [26]:
# One-hot encode Gender into the IsMale / IsFemale columns.
# Each column is 1 only where Gender equals its own value ('m' or 'f'), so an
# unexpected or missing value yields 0 in both columns instead of being
# counted as female.
is_male = (df['Gender'] == 'm').astype(int)
is_female = (df['Gender'] == 'f').astype(int)

# assign() returns a new frame (and overwrites same-named columns), so this
# cell gives the same result no matter how many times it is re-run.
new_X = new_X.assign(IsMale=is_male, IsFemale=is_female)

new_X

Unnamed: 0,Age,RestBP,Chol,RestECG,MaxHR,Oldpeak,IsMale,IsFemale,HasTypicalChestPain,HasAsymptomaticChestPain,HasNonanginalChestPain,HasNontypicalChestPain,HasNormalThal,HasFixedThal,HasReversibleThal
0,63,145,233,2,150,2.3,0,1,1,0,0,0,0,1,0
1,67,160,286,2,108,1.5,0,1,0,1,0,0,1,0,0
2,67,120,229,2,129,2.6,0,1,0,1,0,0,0,0,1
3,37,130,250,0,187,3.5,0,1,0,0,1,0,1,0,0
4,41,130,204,2,172,1.4,1,0,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296,45,110,264,0,132,1.2,0,1,1,0,0,0,0,0,1
297,68,144,193,0,141,3.4,0,1,0,1,0,0,0,0,1
298,57,130,131,0,115,1.2,0,1,0,1,0,0,0,0,1
299,57,130,236,2,174,0.0,1,0,0,0,0,1,1,0,0


In [22]:
# One-hot encode ChestPain: one 0/1 column per category.

chest_pain_statuses = df['ChestPain']

print(f'Unique values of ChestPain: {set(chest_pain_statuses)}\n')

# Output column name -> the ChestPain value that column flags
chest_pain_columns = {
    'HasTypicalChestPain': 'typical',
    'HasAsymptomaticChestPain': 'asymptomatic',
    'HasNonanginalChestPain': 'nonanginal',
    'HasNontypicalChestPain': 'nontypical',
}

# Vectorized comparison: no positional lookups, so it works for any index on df.
# assign() returns a new frame, so this cell can be re-run safely.
new_X = new_X.assign(**{
    column: (chest_pain_statuses == status).astype(int)
    for column, status in chest_pain_columns.items()
})

new_X

Unique values of ChestPain: {'nontypical', 'nonanginal', 'typical', 'asymptomatic'}



Unnamed: 0,Age,RestBP,Chol,RestECG,MaxHR,Oldpeak,IsMale,IsFemale,HasTypicalChestPain,HasAsymptomaticChestPain,HasNonanginalChestPain,HasNontypicalChestPain,HasNormalThal,HasFixedThal,HasReversibleThal
0,63,145,233,2,150,2.3,0,1,1,0,0,0,0,1,0
1,67,160,286,2,108,1.5,0,1,0,1,0,0,1,0,0
2,67,120,229,2,129,2.6,0,1,0,1,0,0,0,0,1
3,37,130,250,0,187,3.5,0,1,0,0,1,0,1,0,0
4,41,130,204,2,172,1.4,1,0,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296,45,110,264,0,132,1.2,0,1,1,0,0,0,0,0,1
297,68,144,193,0,141,3.4,0,1,0,1,0,0,0,0,1
298,57,130,131,0,115,1.2,0,1,0,1,0,0,0,0,1
299,57,130,236,2,174,0.0,1,0,0,0,0,1,1,0,0


In [23]:
# One-hot encode Thal: one 0/1 column per category.
thal_statuses = df['Thal']

print(f'Unique values of Thal: {set(thal_statuses)}\n')

# Output column name -> the Thal value that column flags.
# ('reversable' is the spelling used in the data; the column keeps the
# name HasReversibleThal.)
thal_columns = {
    'HasNormalThal': 'normal',
    'HasFixedThal': 'fixed',
    'HasReversibleThal': 'reversable',
}

# Driven entirely by thal_statuses, so this cell does not depend on variables
# from the ChestPain cell. assign() returns a new frame, so it can be re-run safely.
new_X = new_X.assign(**{
    column: (thal_statuses == status).astype(int)
    for column, status in thal_columns.items()
})

new_X

Unique values of Thal: {'normal', 'reversable', 'fixed'}



Unnamed: 0,Age,RestBP,Chol,RestECG,MaxHR,Oldpeak,IsMale,IsFemale,HasTypicalChestPain,HasAsymptomaticChestPain,HasNonanginalChestPain,HasNontypicalChestPain,HasNormalThal,HasFixedThal,HasReversibleThal
0,63,145,233,2,150,2.3,0,1,1,0,0,0,0,1,0
1,67,160,286,2,108,1.5,0,1,0,1,0,0,1,0,0
2,67,120,229,2,129,2.6,0,1,0,1,0,0,0,0,1
3,37,130,250,0,187,3.5,0,1,0,0,1,0,1,0,0
4,41,130,204,2,172,1.4,1,0,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296,45,110,264,0,132,1.2,0,1,1,0,0,0,0,0,1
297,68,144,193,0,141,3.4,0,1,0,1,0,0,0,0,1
298,57,130,131,0,115,1.2,0,1,0,1,0,0,0,0,1
299,57,130,236,2,174,0.0,1,0,0,0,0,1,1,0,0


In [15]:
# Display the source DataFrame (pandas truncates the middle rows in the output)
df

Unnamed: 0,Age,Gender,ChestPain,RestBP,Chol,RestECG,MaxHR,Oldpeak,Thal,AHD
0,63,f,typical,145,233,2,150,2.3,fixed,No
1,67,f,asymptomatic,160,286,2,108,1.5,normal,Yes
2,67,f,asymptomatic,120,229,2,129,2.6,reversable,Yes
3,37,f,nonanginal,130,250,0,187,3.5,normal,No
4,41,m,nontypical,130,204,2,172,1.4,normal,No
...,...,...,...,...,...,...,...,...,...,...
296,45,f,typical,110,264,0,132,1.2,reversable,Yes
297,68,f,asymptomatic,144,193,0,141,3.4,reversable,Yes
298,57,f,asymptomatic,130,131,0,115,1.2,reversable,Yes
299,57,m,nontypical,130,236,2,174,0.0,normal,Yes


### G. Repeat parts (d) and (e) with the new dataset that you built in part (f). How does the prediction accuracy change for each method?

---
**Answer G:**
<br><br>
* **Logistic Regression** classifier increased its accuracy, and again returned the **best accuracy**
    * **Old accuracy score:** 0.7368421052631579
    * **New accuracy score:** 0.8026315789473685
* **Decision Tree** classifier increased its accuracy, and again returned the **worst accuracy**
    * **Old accuracy score:** 0.6052631578947368
    * **New accuracy score:** 0.6842105263157895
* **K-Neighbors** classifier accuracy score did not change
    * **Old accuracy score:** 0.6973684210526315
    * **New accuracy score:** 0.6973684210526315


In [16]:
# Part D again, this time splitting the one-hot-encoded table new_X.
# NOTE: this rebinds X_train / X_test / y_train / y_test from the earlier split.
X_train, X_test, y_train, y_test = train_test_split(
    new_X, y, test_size=0.25, random_state=4
)

for split in (X_train, y_train):
    print(split.shape)

# Print each banner followed by the corresponding training data
for banner, split in (
    ('\t\t\t***X_train***\n', X_train),
    ('\n\t***y_train***\n', y_train),
):
    print(banner)
    print(split)

(225, 15)
(225,)
			***X_train***

     Age  RestBP  Chol  RestECG  MaxHR  Oldpeak  IsMale  IsFemale  \
128   62     124   209        0    163      0.0       1         0   
103   49     120   188        0    139      2.0       0         1   
212   66     178   228        0    165      1.0       1         0   
100   34     118   182        2    174      0.0       0         1   
280   55     128   205        1    130      2.0       1         0   
..   ...     ...   ...      ...    ...      ...     ...       ...   
58    54     125   273        2    152      0.5       0         1   
87    53     138   234        2    160      0.0       1         0   
197   50     120   244        0    162      1.1       1         0   
174   57     152   274        0     88      1.2       0         1   
122   55     140   217        0    111      5.6       0         1   

     HasTypicalChestPain  HasAsymptomaticChestPain  HasNonanginalChestPain  \
128                    0                         1        

In [17]:
# KNN classifier (part E repeated on the new_X split)

# Same k=3 evaluation as before, via the helper defined earlier
k = 3
knn_accuracy = accuracy_of_knn(
    k,
    X_training=X_train,
    y_training=y_train,
    X_testing=X_test,
    y_testing=y_test,
)

knn_report = f'\nAccuracy of KNN classifier, k={k}: {knn_accuracy}'
print(knn_report)

Predictions for testing set:

['Yes' 'No' 'No' 'Yes' 'No' 'Yes' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes' 'No'
 'No' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'No' 'No' 'Yes' 'No' 'No' 'Yes' 'No'
 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'No' 'No' 'Yes' 'Yes' 'Yes' 'No' 'No' 'No'
 'No' 'Yes' 'No' 'Yes' 'Yes' 'Yes' 'No' 'No' 'Yes' 'No' 'Yes' 'No' 'Yes'
 'Yes' 'No' 'Yes' 'Yes' 'Yes' 'No' 'No' 'Yes' 'No' 'No' 'No' 'Yes' 'Yes'
 'No' 'No' 'Yes' 'Yes' 'Yes' 'No' 'No' 'No' 'No' 'No' 'No']

Accuracy of KNN classifier, k=3: 0.6973684210526315


In [18]:
# Decision Tree classifier (part E repeated on the new_X split)

# Same random_state=5 evaluation as before, via the helper defined earlier
random_state = 5
dt_accuracy = accuracy_of_decision_tree(
    random_state,
    X_training=X_train,
    y_training=y_train,
    X_testing=X_test,
    y_testing=y_test,
)

dt_report = f'\nAccuracy of Decision Tree classifier, random_state={random_state}: {dt_accuracy}'
print(dt_report)

Predictions for testing set:

['Yes' 'No' 'No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'No' 'No' 'Yes' 'Yes'
 'No' 'No' 'No' 'Yes' 'Yes' 'Yes' 'No' 'No' 'Yes' 'No' 'No' 'Yes' 'Yes'
 'No' 'Yes' 'No' 'No' 'No' 'Yes' 'Yes' 'Yes' 'No' 'Yes' 'No' 'No' 'Yes'
 'Yes' 'Yes' 'No' 'Yes' 'Yes' 'Yes' 'No' 'No' 'Yes' 'No' 'Yes' 'Yes' 'Yes'
 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'No' 'Yes' 'Yes' 'No' 'No' 'Yes'
 'Yes' 'No' 'No' 'Yes' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No']

Accuracy of Decision Tree classifier, random_state=5: 0.6842105263157895


In [19]:
# Logistic Regression classifier (part E repeated on the new_X split)

# Same evaluation as before, via the helper defined earlier
lr_accuracy = accuracy_of_logistic_regression(
    X_training=X_train,
    y_training=y_train,
    X_testing=X_test,
    y_testing=y_test,
)

lr_report = f'\nAccuracy of Logistic Regression classifier: {lr_accuracy}'
print(lr_report)

Predictions for testing set:

['Yes' 'No' 'No' 'No' 'No' 'No' 'Yes' 'Yes' 'No' 'No' 'No' 'Yes' 'Yes'
 'Yes' 'No' 'Yes' 'Yes' 'Yes' 'Yes' 'No' 'No' 'Yes' 'No' 'No' 'Yes' 'No'
 'No' 'Yes' 'No' 'No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'Yes' 'No' 'No' 'No'
 'No' 'No' 'No' 'Yes' 'Yes' 'Yes' 'No' 'Yes' 'Yes' 'No' 'Yes' 'Yes' 'Yes'
 'No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'No' 'No' 'Yes' 'Yes'
 'No' 'No' 'Yes' 'No' 'No' 'No' 'No' 'Yes' 'No' 'No' 'No']

Accuracy of Logistic Regression classifier: 0.8026315789473685


### H. Now, repeat part (e) with the new dataset that you built in part (f), but this time using Cross-Validation. Thus, rather than splitting the dataset into testing and training, use 10-fold Cross-Validation (as we learned in Lab4) to evaluate the classification methods and report the final prediction accuracy. 

---
**Answer H:**

* KNN average accuracy: 0.6343010752688172
* Decision Tree average accuracy: 0.7240860215053764
* Logistic Regression average accuracy: 0.8074193548387096


In [20]:
# The three classifiers from part E, configured as before
k = 3
knn = KNeighborsClassifier(n_neighbors=k)

rs = 5
dt = DecisionTreeClassifier(random_state=rs)

lr = make_pipeline(StandardScaler(), LogisticRegression())

SEPARATOR = '-----------------------------------------------------------------------'
classifiers = [('KNN', knn), ('Decision Tree', dt), ('Logistic Regression', lr)]

# 10-fold cross-validation on the full one-hot-encoded table; a separator line
# is printed between consecutive reports (not after the last one)
for position, (label, classifier) in enumerate(classifiers):
    if position > 0:
        print(SEPARATOR)
    accuracy_list = cross_val_score(classifier, new_X, y, cv=10, scoring='accuracy')
    print(f'\n{label} accuracy score list:')
    print(accuracy_list)
    print(f'\n{label} average accuracy: {accuracy_list.mean()}\n')


KNN accuracy score list:
[0.70967742 0.63333333 0.56666667 0.66666667 0.6        0.5
 0.66666667 0.7        0.56666667 0.73333333]

KNN average accuracy: 0.6343010752688172

-----------------------------------------------------------------------

Decision Tree accuracy score list:
[0.77419355 0.73333333 0.76666667 0.76666667 0.76666667 0.73333333
 0.6        0.63333333 0.66666667 0.8       ]

Decision Tree average accuracy: 0.7240860215053764

-----------------------------------------------------------------------

Logistic Regression accuracy score list:
[0.77419355 0.8        0.8        0.86666667 0.9        0.73333333
 0.8        0.83333333 0.8        0.76666667]

Logistic Regression average accuracy: 0.8074193548387096

