# In this project we will build a Machine Learning model to predict whether an indiviudal will have a stroke.  The data used in this project can be found on kaggle at the following link: https://www.kaggle.com/asaumya/healthcare-data#train_2v.csv

# In this notebook, we build and implement our Machine Learning model.  To view our initial data analysis, please see the notebook titled "Data_Analysis."

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import os

# Define file path to our data
stoke_data_relevant_features_and_label_file_path = os.path.join("..", "Data", "stroke_data_relevant_features_and_label.csv")

# Create dataframe from local csv file 
stroke_data_relevant_features_and_label = pd.read_csv(stoke_data_relevant_features_and_label_file_path)

# Previe dataframe
stroke_data_relevant_features_and_label.head()

Unnamed: 0,hypertension,heart_disease,ever_married,work_type,smoking_status,age,average_glucose_level,bmi,stroke
0,0,0,No,children,smokes,3.0,95.12,18.0,0
1,1,0,Yes,other,never smoked,58.0,87.96,39.2,0
2,0,0,No,other,smokes,8.0,110.89,17.6,0
3,0,0,Yes,other,formerly smoked,70.0,69.04,35.9,0
4,0,0,No,other,smokes,14.0,161.28,19.1,0


#### We want to one hot encode our categorical columns, so we will convert each 0 to "No," and each 1 to "Yes."

In [3]:
# Before we replace 0 and 1 with "no" and "yes",
# we should check to see if either of these numbers are present in the age column
number_of_people_age_0 = len(stroke_data_relevant_features_and_label[stroke_data_relevant_features_and_label["age"] == 0])
number_of_people_age_1 = len(stroke_data_relevant_features_and_label[stroke_data_relevant_features_and_label["age"] == 1])

print(f"Number of people of age 0: {number_of_people_age_0}")
print(f"Number of people of age 1: {number_of_people_age_1}")

Number of people of age 0: 0
Number of people of age 1: 34


In [4]:
# When we replace all values of 0 and 1 with "No" and "Yes,"
# we are going to replace ages of 1 with a value of "Yes"
# We will also replace the binary data in the stroke column with strings.
# We will therefore make copies of these rows to put back in the dataframe after our initial replacement

copy_of_data = pd.DataFrame()

# copy_of_data["age"] = stroke_data_relevant_features_and_label["age"]
# copy_of_data["stroke"] = stroke_data_relevant_features_and_label["stroke"]

copy_of_data_age = [stroke_data_relevant_features_and_label["age"]]
copy_of_data_stroke = [stroke_data_relevant_features_and_label["stroke"]]

In [5]:
# Replace each 0 with "No," and each 1 with "Yes."
stroke_data_relevant_features_and_label[["hypertension", "heart_disease"]].replace(0, "No", inplace=True)
stroke_data_relevant_features_and_label[["hypertension", "heart_disease"]].replace(1, "Yes", inplace=True)

# Preview dataframe after converting binary data to strings
stroke_data_relevant_features_and_label.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  method=method)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  method=method)


Unnamed: 0,hypertension,heart_disease,ever_married,work_type,smoking_status,age,average_glucose_level,bmi,stroke
0,0,0,No,children,smokes,3.0,95.12,18.0,0
1,1,0,Yes,other,never smoked,58.0,87.96,39.2,0
2,0,0,No,other,smokes,8.0,110.89,17.6,0
3,0,0,Yes,other,formerly smoked,70.0,69.04,35.9,0
4,0,0,No,other,smokes,14.0,161.28,19.1,0


In [6]:
# Check to see if either if the values of 1 in the age column were changed
number_of_people_age_1 = len(stroke_data_relevant_features_and_label[stroke_data_relevant_features_and_label["age"] == 1])

print(f"Number of people of age 1: {number_of_people_age_1}")

Number of people of age 1: 34


In [7]:
# # Replace the values in the post-replacement age and stroke columns with the original values
# stroke_data_relevant_features_and_label["age"] = copy_of_data_age
# stroke_data_relevant_features_and_label["stroke"] = copy_of_data_stroke

# # Preview dataframe to confirm values in stroke column were fixed
# stroke_data_relevant_features_and_label.head()

In [8]:
# Confirm binary data proplerly converted
print(stroke_data_relevant_features_and_label["hypertension"].value_counts())
print(100*"-")
print(stroke_data_relevant_features_and_label["heart_disease"].value_counts())

0    39339
1     4061
Name: hypertension, dtype: int64
----------------------------------------------------------------------------------------------------
0    41338
1     2062
Name: heart_disease, dtype: int64


In [9]:
# Transform data to one hot encoded data
machine_ready_stroke_data = pd.get_dummies(stroke_data_relevant_features_and_label, columns=["hypertension", "heart_disease", "ever_married", "work_type", "smoking_status"])
machine_ready_stroke_data.head()

Unnamed: 0,age,average_glucose_level,bmi,stroke,hypertension_0,hypertension_1,heart_disease_0,heart_disease_1,ever_married_No,ever_married_Yes,work_type_Self-employed,work_type_children,work_type_other,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,3.0,95.12,18.0,0,1,0,1,0,1,0,0,1,0,0,0,1
1,58.0,87.96,39.2,0,0,1,1,0,0,1,0,0,1,0,1,0
2,8.0,110.89,17.6,0,1,0,1,0,1,0,0,0,1,0,0,1
3,70.0,69.04,35.9,0,1,0,1,0,0,1,0,0,1,1,0,0
4,14.0,161.28,19.1,0,1,0,1,0,1,0,0,0,1,0,0,1


In [10]:
# Import Maching Learning algorithms will we try out
from sklearn.linear_model import LogisticRegression
# from sklearn import tree
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.svm import SVC
# from sklearn.naive_bayes import GaussianNB

In [11]:
# from sklearn.datasets import make_blobs

# X, y = make_blobs(centers=2, random_state=42)

# print(f"Labels: {y[:10]}")
# print(f"Data: {X[:10]}")

In [12]:
# Create our features and label
X = np.array(machine_ready_stroke_data.drop(["stroke"], axis=1))
y = np.array(machine_ready_stroke_data["stroke"].values.reshape(-1,1))

We now have our features and labels, but the data is still imbalanced.  We will try employing SMOTE to handle this issue.

### In the following section, we will try running several loops to see what the effect of changing several parameters is.

In [13]:
# Import SMOTE to handle the imbalanced data issue
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Import tree to use the DecisionTreeClassifier() algorithm
# from sklearn import  tree
from sklearn.linear_model import LogisticRegression

#### SMOTE parameters

##### In the cell below we examine how accuracy changes when adjusting the SMOTE parameter sampling_strategy

In [14]:
multilist = ['ovr', 'multinomial', 'auto']
           
for multi in multilist:
    smote = SMOTE(sampling_strategy=0.85, k_neighbors=4)
    X_smote, y_smote = smote.fit_resample(X, y.ravel())
    
    # Split the data into training and testing sets
    X_smote_train, X_smote_test, y_smote_train, y_smote_test = train_test_split(X_smote, y_smote, test_size = 0.2, random_state=3)
    
    # Create, fit, and score the decision tree classifier
    classifier = LogisticRegression(random_state=0, solver= "newton-cg" , multi_class=multi)
    classifier = classifier.fit(X=X_smote_train, y= y_smote_train)
    score = classifier.score(X_smote_test, y_smote_test)
    
    # Print a list of accuracies based on the current argument
    print(f"Setting the multi parameter to {multi} yields an accuracy of {score}")



Setting the multi parameter to ovr yields an accuracy of 0.7774113767518549
Setting the multi parameter to multinomial yields an accuracy of 0.7768406366922442
Setting the multi parameter to auto yields an accuracy of 0.7795675058659395


In [15]:
 solverlist = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
                
        
for solver in solverlist:
    
    smote = SMOTE(sampling_strategy=0.85, k_neighbors=4)
    X_smote, y_smote = smote.fit_resample(X, y.ravel())
    
    # Split the data into training and testing sets
    X_smote_train, X_smote_test, y_smote_train, y_smote_test = train_test_split(X_smote, y_smote, test_size = 0.2, random_state=3)
    
    # Create, fit, and score the decision tree classifier
    classifier = LogisticRegression(random_state=0, solver=solver)
    classifier = classifier.fit(X=X_smote_train, y= y_smote_train)
    score = classifier.score(X_smote_test, y_smote_test)
    
    # Print a list of accuracies based on the current argument
    print(f"Setting the solver parameter to {solver} yields an accuracy of {score}")



Setting the solver parameter to newton-cg yields an accuracy of 0.7772845456274969




Setting the solver parameter to lbfgs yields an accuracy of 0.7730357029615067
Setting the solver parameter to liblinear yields an accuracy of 0.7815968038556662




Setting the solver parameter to sag yields an accuracy of 0.7748747542646965
Setting the solver parameter to saga yields an accuracy of 0.7766503900057073




In [16]:
random_statelist = np.arange(2, 20, 2)
                  
    
for random_state in random_statelist:
    smote = SMOTE(sampling_strategy=0.85, k_neighbors=4)
    X_smote, y_smote = smote.fit_resample(X, y.ravel())
    
    # Split the data into training and testing sets
    X_smote_train, X_smote_test, y_smote_train, y_smote_test = train_test_split(X_smote, y_smote, test_size = 0.2, random_state=random_state)
    
    # Create, fit, and score the decision tree classifier
    classifier = LogisticRegression(random_state=0, solver= "lbfgs")
    classifier = classifier.fit(X=X_smote_train, y= y_smote_train)
    score = classifier.score(X_smote_test, y_smote_test)
    
    # Print a list of accuracies based on the current argument
    print(f"Setting the random_state to {random_state} yields an accuracy of {score}")





Setting the random_state to 2 yields an accuracy of 0.7809626482338766




Setting the random_state to 4 yields an accuracy of 0.7764601433191706




Setting the random_state to 6 yields an accuracy of 0.780391908174266




Setting the random_state to 8 yields an accuracy of 0.7810894793582345




Setting the random_state to 10 yields an accuracy of 0.7808358171095187




Setting the random_state to 12 yields an accuracy of 0.7768406366922442




Setting the random_state to 14 yields an accuracy of 0.7792504280550447




Setting the random_state to 16 yields an accuracy of 0.7781723634980025
Setting the random_state to 18 yields an accuracy of 0.7805821548608028




In [17]:
sampling_strategy_arguments = np.arange(0.5,1 , .05)
                  
    
for sampling_strategy_argument in sampling_strategy_arguments:
    smote = SMOTE(sampling_strategy_argument, k_neighbors=4)
    X_smote, y_smote = smote.fit_resample(X, y.ravel())
    
    # Split the data into training and testing sets
    X_smote_train, X_smote_test, y_smote_train, y_smote_test = train_test_split(X_smote, y_smote, test_size = 0.2, random_state=3)
    
    # Create, fit, and score the decision tree classifier
    classifier = LogisticRegression(random_state=0, solver= "lbfgs")
    classifier = classifier.fit(X=X_smote_train, y= y_smote_train)
    score = classifier.score(X_smote_test, y_smote_test)
    
    # Print a list of accuracies based on the current argument
    print(f"Setting the sampling_strategy_argument to {sampling_strategy_argument} yields an accuracy of {score}")





Setting the sampling_strategy_argument to 0.5 yields an accuracy of 0.7931951505670708




Setting the sampling_strategy_argument to 0.55 yields an accuracy of 0.7911746896760521




Setting the sampling_strategy_argument to 0.6000000000000001 yields an accuracy of 0.7901451825780906
Setting the sampling_strategy_argument to 0.6500000000000001 yields an accuracy of 0.7819965870307167




Setting the sampling_strategy_argument to 0.7000000000000002 yields an accuracy of 0.7797791580400276




Setting the sampling_strategy_argument to 0.7500000000000002 yields an accuracy of 0.7783588093322606




Setting the sampling_strategy_argument to 0.8000000000000003 yields an accuracy of 0.7783209490288098




Setting the sampling_strategy_argument to 0.8500000000000003 yields an accuracy of 0.774177183080728




Setting the sampling_strategy_argument to 0.9000000000000004 yields an accuracy of 0.7785118863846866
Setting the sampling_strategy_argument to 0.9500000000000004 yields an accuracy of 0.7814812586486974




In [18]:
# Create an array of arguments to iteratively try out
sampling_strategy_arguments = np.arange(0.6, 1, 0.05)

for sampling_strategy_argument in sampling_strategy_arguments:
    smote = SMOTE(sampling_strategy=sampling_strategy_argument, k_neighbors=4)
    X_smote, y_smote = smote.fit_resample(X, y.ravel())
    
    # Split the data into training and testing sets
    X_smote_train, X_smote_test, y_smote_train, y_smote_test = train_test_split(X_smote, y_smote, test_size = 0.2, random_state=3)
    
    # Create, fit, and score the decision tree classifier
    classifier = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')
    classifier = classifier.fit(X=X_smote_train, y= y_smote_train)
    score = classifier.score(X_smote_test, y_smote_test)
    
    # Print a list of accuracies based on the current argument
    print(f"Setting the sampling_strategy parameter to {solver} yields an accuracy of {score}")

classifier.feature_importances_



Setting the sampling_strategy parameter to saga yields an accuracy of 0.7898518844405338




Setting the sampling_strategy parameter to saga yields an accuracy of 0.7818543799772468




Setting the sampling_strategy parameter to saga yields an accuracy of 0.7828847481021394




Setting the sampling_strategy parameter to saga yields an accuracy of 0.7811075355323143




Setting the sampling_strategy parameter to saga yields an accuracy of 0.7758440881241038




Setting the sampling_strategy parameter to saga yields an accuracy of 0.7777284545627497




Setting the sampling_strategy parameter to saga yields an accuracy of 0.777894411855511




Setting the sampling_strategy parameter to saga yields an accuracy of 0.7835268636062812


AttributeError: 'LogisticRegression' object has no attribute 'feature_importances_'

##### In the cell below we examine how accuracy changes when adjusting the SMOTE parameter k_neighbors

In [None]:
# Create an array of arguments to iteratively try out
k_neighbors_arguments = np.arange(1, 102, 10)

for k_neighbors_argument in k_neighbors_arguments:
    smote = SMOTE(sampling_strategy=0.85, k_neighbors=k_neighbors_argument)
    X_smote, y_smote = smote.fit_resample(X, y.ravel())
    
    # Split the data into training and testing sets
    X_smote_train, X_smote_test, y_smote_train, y_smote_test = train_test_split(X_smote, y_smote, test_size = 0.2, random_state=3)
    
    # Create, fit, and score the decision tree classifier
    classifier = LogisticRegressionClassifier()
    classifier = classifier.fit(X=X_smote_train, y= y_smote_train)
    score = classifier.score(X_smote_test, y_smote_test)
    
    # Print a list of accuracies based on the current argument
    print(f"Setting the k_neighbor parameter to {k_neighbors_argument} yields an accuracy of {score}")
    
classifier.feature_importances_

#### train_test_split parameters

##### In the cell below we examine how accuracy changes when adjusting the train_test_split parameter test_size

In [None]:
# Create an array of arguments to iteratively try out
test_size_arguments = np.arange(0.05, 0.5, 0.05)

for test_size_argument in test_size_arguments:
    smote = SMOTE(sampling_strategy=0.85, k_neighbors=4)
    X_smote, y_smote = smote.fit_resample(X, y.ravel())
    
    # Split the data into training and testing sets
    X_smote_train, X_smote_test, y_smote_train, y_smote_test = train_test_split(X_smote, y_smote, test_size = test_size_argument, random_state=3)
    
    # Create, fit, and score the decision tree classifier
    classifier = LogisticRegressionClassifier()
    classifier = classifier.fit(X=X_smote_train, y= y_smote_train)
    score = classifier.score(X_smote_test, y_smote_test)
    
    # Print a list of accuracies based on the current argument
    print(f"Setting the train_test_split parameter to {test_size_argument} yields an accuracy of {score}")
    
classifier.feature_importances_

##### In the cell below we examine how accuracy changes when adjusting the train_test_split parameter random_state

In [None]:
# Create an array of arguments to iteratively try out
random_state_arguments = np.arange(1, 10, 1)

for random_state_argument in random_state_arguments:
    smote = SMOTE(sampling_strategy=0.85, k_neighbors=4)
    X_smote, y_smote = smote.fit_resample(X, y.ravel())
    
    # Split the data into training and testing sets
    X_smote_train, X_smote_test, y_smote_train, y_smote_test = train_test_split(X_smote, y_smote, test_size = test_size_argument, random_state=random_state_argument)
    
    # Create, fit, and score the decision tree classifier
    classifier = LogisticRegressionClassifier()
    classifier = classifier.fit(X=X_smote_train, y= y_smote_train)
    score = classifier.score(X_smote_test, y_smote_test)
    
    # Print a list of accuracies based on the current argument
    print(f"Setting the random_state parameter to {random_state_argument} yields an accuracy of {score}")
    
classifier.feature_importances_

#### DecisionTreeClassifier() parameters

##### In the cell below we examine how accuracy changes when adjusting the DecisionTreeClassifier() parameter max_depth

In [None]:
# Create an array of arguments to iteratively try out
max_depth_arguments = np.arange(1, 102, 10)

for max_depth_argument in max_depth_arguments:
    smote = SMOTE(sampling_strategy=0.85, k_neighbors=4)
    X_smote, y_smote = smote.fit_resample(X, y.ravel())
    
    # Split the data into training and testing sets
    X_smote_train, X_smote_test, y_smote_train, y_smote_test = train_test_split(X_smote, y_smote, test_size = 0.2, random_state=3)
    
    # Create, fit, and score the decision tree classifier
    classifier = LogisticRegressionClassifier(max_depth=max_depth_argument, max_leaf_nodes=10)
    classifier = classifier.fit(X=X_smote_train, y= y_smote_train)
    score = classifier.score(X_smote_test, y_smote_test)
    
    # Print a list of accuracies based on the current argument
    print(f"Setting the max_depth parameter to {max_depth_argument} yields an accuracy of {score}")
    
classifier.feature_importances_

##### In the cell below we examine how accuracy changes when adjusting the DecisionTreeClassifier() parameter max_depth

In [None]:
# Create an array of arguments to iteratively try out
max_nodes_arguments = np.arange(2, 153, 10)

for max_nodes_argument in max_nodes_arguments:
    smote = SMOTE(sampling_strategy=0.85, k_neighbors=4)
    X_smote, y_smote = smote.fit_resample(X, y.ravel())
    
    # Split the data into training and testing sets
    X_smote_train, X_smote_test, y_smote_train, y_smote_test = train_test_split(X_smote, y_smote, test_size = 0.2, random_state=3)
    
    # Create, fit, and score the decision tree classifier
    classifier = LogisticRegression()
    classifier = classifier.fit(X=X_smote_train, y= y_smote_train)
    score = classifier.score(X_smote_test, y_smote_test)
    
    # Print a list of accuracies based on the current argument
    print(f"Setting the max_node parameter to {max_nodes_argument} yields an accuracy of {score}")

classifier.feature_importances_

#### Now that we have a better idea of what impact each parameter does, we will try one final test below

In [None]:
smote = SMOTE(sampling_strategy=0.85, k_neighbors=4)
X_smote, y_smote = smote.fit_resample(X, y.ravel())
    
# Split the data into training and testing sets
X_smote_train, X_smote_test, y_smote_train, y_smote_test = train_test_split(X_smote, y_smote, test_size = 0.2, random_state=3)

# Create, fit, and score the decision tree classifier
classifier = tree.DecisionTreeClassifier(max_depth=100, max_leaf_nodes=100)
classifier = classifier.fit(X=X_smote_train, y= y_smote_train)
score = classifier.score(X_smote_test, y_smote_test)

print(score, "\n")
print(classifier.feature_importances_)

In [None]:
import graphviz

feature_names = ["age",
                 "average_glucose_levels",
                 "bmi",
                 "hypertension_0",
                 "hypertension_1",
                 "heart_disease_0",
                 "heart_disease_1",
                 "ever_married_No",
                 "ever_married_Yes",
                 "work_type_Self-employed",
                 "work_type_children",
                 "work_type_other",
                 "smoking_status_formerly_smoked",
                 "smoking_status_never_smoked",
                 "smoking_status_smokes"
                ]
class_names=["did_not_have_a_stroke", "had_a_stroke"]

dot_data = tree.export_graphviz(classifier, out_file=None, 
                      feature_names=feature_names,
                      class_names=class_names,  
                      filled=True, rounded=True,  
                      special_characters=True)
graph = graphviz.Source(dot_data)  
graph 

In [21]:




  
    



print("2. split, scale, SMOTE")
scores = []
for sampling_strategy_argument in sampling_strategy_arguments:
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=3)
    
    # Create scaler for features and label
    X_train_scaler = StandardScaler().fit(X_train)
    X_test_scaler = StandardScaler().fit(X_test)
#     y_scaler = StandardScaler().fit(y_train)
    
    # Scale features and labels
    X_train_scaled = X_train_scaler.transform(X_train)
    X_test_scaled = X_test_scaler.transform(X_test)
#     y_train_scaled = y_scaler.transform(y_train)

    # Use SMOTE to handle class imbalance
    smote = SMOTE(sampling_strategy=sampling_strategy_argument, k_neighbors=8)
    X_train_scaled_SMOTE, y_train_SMOTE = smote.fit_sample(X_train_scaled, y_train.ravel())
    y_train_SMOTE = y_train_SMOTE.reshape(-1,1)

    # Create, fit, and score the decision tree classifier
    classifier = tree.DecisionTreeClassifier(max_depth=10, max_leaf_nodes=10)
    classifier = classifier.fit(X=X_train_scaled_SMOTE, y=y_train_SMOTE)
    score = classifier.score(X_test_scaled, y_test)
    scores.append(score)
    
    # Print a list of accuracies based on the current argument
    print(f"Setting the sampling_strategy parameter to {sampling_strategy_argument} yields an accuracy of {score}")
    
average_accuracy = sum(scores)/len(scores)
print(f"Average accuracy: {average_accuracy}")
    


     
    print("2. split, scale, SMOTE")
    scores = []
    for sampling_strategy_argument in sampling_strategy_arguments:
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=3)
    
    # Create scaler for features and label
    X_train_scaler = StandardScaler().fit(X_train)
    X_test_scaler = StandardScaler().fit(X_test)
#     y_scaler = StandardScaler().fit(y_train)
    
    # Scale features and labels
    X_train_scaled = X_train_scaler.transform(X_train)
    X_test_scaled = X_test_scaler.transform(X_test)
#     y_train_scaled = y_scaler.transform(y_train)

    # Use SMOTE to handle class imbalance
    smote = SMOTE(sampling_strategy=sampling_strategy_argument, k_neighbors=8)
    X_train_scaled_SMOTE, y_train_SMOTE = smote.fit_sample(X_train_scaled, y_train.ravel())
    y_train_SMOTE = y_train_SMOTE.reshape(-1,1)

    # Create, fit, and score the decision tree classifier
    classifier = LogisticRegressionClassifier()
    classifier = classifier.fit(X=X_train_scaled_SMOTE, y=y_train_SMOTE)
    score = classifier.score(X_test_scaled, y_test)
    scores.append(score)
    
    # Print a list of accuracies based on the current argument
    print(f"Setting the sampling_strategy parameter to {sampling_strategy_argument} yields an accuracy of {score}")
    
average_accuracy = sum(scores)/len(scores)
print(f"Average accuracy: {average_accuracy}")
    

IndentationError: unexpected indent (<ipython-input-21-c26f9f9aa2d9>, line 47)

In [14]:
multilist = ['ovr', 'multinomial', 'auto']
           
for multi in multilist:
    smote = SMOTE(sampling_strategy=0.85, k_neighbors=4)
    X_smote, y_smote = smote.fit_resample(X, y.ravel())
    
    # Split the data into training and testing sets
    X_smote_train, X_smote_test, y_smote_train, y_smote_test = train_test_split(X_smote, y_smote, test_size = 0.2, random_state=3)
    
    # Create, fit, and score the decision tree classifier
    classifier = LogisticRegression(random_state=0, solver= "newton-cg" , multi_class=multi)
    classifier = classifier.fit(X=X_smote_train, y= y_smote_train)
    score = classifier.score(X_smote_test, y_smote_test)
    
    # Print a list of accuracies based on the current argument
    print(f"Setting the multi parameter to {multi} yields an accuracy of {score}")



Setting the multi parameter to ovr yields an accuracy of 0.7774113767518549
Setting the multi parameter to multinomial yields an accuracy of 0.7768406366922442
Setting the multi parameter to auto yields an accuracy of 0.7795675058659395


In [None]:
from sklearn.preprocessing import StandardScaler

# Create scale for features and label
X_scaler = StandardScaler().fit(X_train)
y_scaler = StandardScaler().fit(y_train)

# Scale features and labels
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
y_train_scaled = y_scaler.transform(y_train)
y_test_scaled = y_scaler.transform(y_test)

In [None]:
# models = []
# models.append(("LR", LogisticRegression()))
# models.append(("CART", DecisionTreeClassifier()))
# models.append(("CART", RandomForestClassifier()))
# models.append(("SVM", SVC()))
# models.append(("NB", GaussianNB()))

# from sklearn import model_selection

# # Evaluate each model in turn
# results = []
# names = []

# for name, model in models:
#     kfold = model_selection.KFold(n_splits=10, random_state=42)
#     cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring="accuracy")
#     results.append(cv_results)
#     names.append(name)
#     print(f"{name}: {cv_results.mean()}, {cv_results.std()}")

In [None]:
# # look at this
# y_train.shape

In [None]:
classifier.feature_importances_

In [None]:
# import graphviz

# decision_tree_data = tree.export_graphviz(
#   classifier,
#   out_file=None,
#   feature_names=["age",
#                  "average_glucose_levels",
#                  "bmi",
#                  "hypertension_0",
#                  "hypertension_1",
#                  "heart_disease_0",
#                  "heart_disease_1",
#                  "ever_married_No",
#                  "ever_married_Yes",
#                  "work_type_Self-employed",
#                  "work_type_children",
#                  "work_type_other",
#                  "smoking_status_formerly_smoked",
#                  "smoking_status_never_smoked",
#                  "smoking_status_smokes"
#                 ],
#     class_names=["did_not_have_a_stroke", "had_a_stroke"],
#     filled=True,
#     rounded=False
# )

# graph = graphviz.Source(decision_tree_data)
# #graph

# #graph[size="7.75,10.25"]

In [None]:
import graphviz

decision_tree_data = tree.export_graphviz(
  classifier,
  out_file=None,
  feature_names=["age",
                 "average_glucose_levels",
                 "bmi",
                 "hypertension_0",
                 "hypertension_1",
                 "heart_disease_0",
                 "heart_disease_1",
                 "ever_married_No",
                 "ever_married_Yes",
                 "work_type_Self-employed",
                 "work_type_children",
                 "work_type_other",
                 "smoking_status_formerly_smoked",
                 "smoking_status_never_smoked",
                 "smoking_status_smokes"
                ],
    class_names=["did_not_have_a_stroke", "had_a_stroke"],
    filled=True,
    rounded=False
)



graph = graphviz.Source(decision_tree_data)
graph

#graph[size="7.75,10.25"]

In [None]:
dir(graph)

In [None]:
dir(tree.export_graphviz)

In [None]:
graph.render(format="png")

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [None]:
# Visualizing both classes
plt.scatter(X[:, 0], X[:, 1])

In [None]:
# Create a random forest classifier
rf = RandomForestClassifier(n_estimators=1)
rf = rf.fit(X_train, np.array(y_train))
rf.score(X_test, np.array(y_test))