In [1]:
# Feature selection by SelectKBest method...
import pandas as pd


In [2]:
def select_k_best(indep, dep, k):
    
    from sklearn.feature_selection import SelectKBest, chi2
    skb=SelectKBest(score_func=chi2, k=3)
    
    # fit the selectkbest object to the data
    skb_fit = skb.fit(indep, dep)
    
    # trasform the data to the Select top  K features
    x_new = skb_fit.transform(indep)
    
    # Get the names of the selected features...
    selected_features = indep.columns[skb.get_support()]
    
    return selected_features, x_new

def standard_scalar(xtrain, xtest):
    ### standard scalar

    from sklearn.preprocessing import StandardScaler
    scx = StandardScaler()
    x_train_scaled = scx.fit_transform(xtrain)
    x_test_scaled = scx.fit_transform (xtest)
    return x_train_scaled, x_test_scaled

def metrices(ytest, y_pred):
    # making the confusion matrix, classification_report, accuracy_score

    from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
    cm = confusion_matrix(ytest, y_pred)
    clf_report = classification_report(ytest, y_pred)
    acc_score = accuracy_score(ytest, y_pred)
    return cm , clf_report, acc_score
    
#### 1. logistic regression    
def logistic(x_train_scaled, x_test_scaled, ytrain, ytest):
    from sklearn.linear_model import LogisticRegression
    classifier =LogisticRegression(random_state=0)
    model =classifier.fit(x_train_scaled,ytrain)
    y_pred = model.predict(x_test_scaled)

    cm,clf_report, acc_score = metrices(ytest, y_pred)
    return cm , clf_report, acc_score  
    
### 2. SVM_linear

def svm_linear(x_train_scaled, x_test_scaled, ytrain, ytest): 
    from sklearn.svm import SVC
    classifier = SVC(kernel="linear", random_state=0)
    model =classifier.fit(x_train_scaled,ytrain)
    y_pred = model.predict(x_test_scaled)

    cm,clf_report, acc_score = metrices(ytest, y_pred)
    return cm , clf_report, acc_score 

### 3. SVM_linear
def svm_nonLinear(x_train_scaled, x_test_scaled, ytrain, ytest):
    from sklearn.svm import SVC
    classifier= SVC(kernel='rbf', random_state=0)
    model =classifier.fit(x_train_scaled,ytrain)
    y_pred = model.predict(x_test_scaled)

    cm,clf_report, acc_score = metrices(ytest, y_pred)
    return cm , clf_report, acc_score
    
### 4. Decision Tree
def decision(x_train_scaled, x_test_scaled, ytrain, ytest):
    from sklearn.tree import DecisionTreeClassifier
    classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
    model =classifier.fit(x_train_scaled,ytrain)
    y_pred = model.predict(x_test_scaled)

    cm,clf_report, acc_score = metrices(ytest, y_pred)
    return cm , clf_report, acc_score

### 5. RandomForest
def random(x_train_scaled, x_test_scaled, ytrain, ytest):
    from sklearn.ensemble import RandomForestClassifier
    classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
    model =classifier.fit(x_train_scaled,ytrain)
    y_pred = model.predict(x_test_scaled)
    
    cm,clf_report, acc_score = metrices(ytest, y_pred)
    return cm , clf_report, acc_score

## 6. KNN (KNearestNeighbours)

##Power parameter for the Minkowski metric. When p = 1, 
#####   this is equivalent to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2.
######    For arbitrary p, minkowski_distance (l_p) is used.
## Metric to use for distance computation. Default is "minkowski", which
##### results in the standard Euclidean distance when p = 2. 
def knn(x_train_scaled, x_test_scaled, ytrain, ytest):
    from sklearn.neighbors import KNeighborsClassifier
    classifier = KNeighborsClassifier(n_neighbors=5,metric='minkowski', p=2 )
    model =classifier.fit(x_train_scaled,ytrain)
    y_pred = model.predict(x_test_scaled)

    cm,clf_report, acc_score = metrices(ytest, y_pred)
    return cm , clf_report, acc_score

### 7. Naive Bayes
def naive(x_train_scaled, x_test_scaled, ytrain, ytest):
    from sklearn.naive_bayes import GaussianNB
    classifier = GaussianNB()
    model =classifier.fit(x_train_scaled,ytrain)
    y_pred = model.predict(x_test_scaled)
    
    cm,clf_report, acc_score = metrices(ytest, y_pred)
    return cm , clf_report, acc_score


#### display the accuracy in a table

def view_acc_score(acc_logistic,acc_svmlinear,acc_svm_nonlinear,acc_decision,acc_random,acc_knn,acc_naive):

    table= pd.DataFrame(index=["chi square"], columns=["Logistic","svmlinear","svm_nonlinear","decision","random","knn","naive"])
    # Loop through each index label and assign values from the lists to the corresponding row
    for i, label in enumerate(table.index):
        table.loc[label] = [
            acc_logistic[i], 
            acc_svmlinear[i], 
            acc_svm_nonlinear[i], 
            acc_decision[i], 
            acc_random[i], 
            acc_knn[i], 
            acc_naive[i]
    ]

    return table

In [3]:
dataset=pd.read_csv("prep_ds.csv")
#dataset

### separation of indep and dep vars

indep = dataset.drop("HeartDisease", axis=1)
dep = dataset["HeartDisease"]

# Applying the SelectKbest...
selected_features, x_new = select_k_best(indep, dep, 5)

### Separation of Training and Test dataset...
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x_new, dep, test_size=0.30, random_state=1)

# Standard scalar
x_train_scaled, x_test_scaled = standard_scalar(xtrain, xtest)



In [4]:
acc_logistic= []
acc_svmlinear = []
acc_svm_nonlinear = []
acc_decision = []
acc_random = []
acc_knn = []
acc_naive = []


cm ,clf_report,acc_score= logistic(x_train_scaled, x_test_scaled, ytrain, ytest)
acc_logistic.append(acc_score)

cm ,clf_report,acc_score= svm_linear(x_train_scaled, x_test_scaled, ytrain, ytest)
acc_svmlinear.append(acc_score)

cm ,clf_report,acc_score= svm_nonLinear(x_train_scaled, x_test_scaled, ytrain, ytest)
acc_svm_nonlinear.append(acc_score)

cm ,clf_report,acc_score= decision(x_train_scaled, x_test_scaled, ytrain, ytest)
acc_decision.append(acc_score)

cm ,clf_report,acc_score= random(x_train_scaled, x_test_scaled, ytrain, ytest)
acc_random.append(acc_score)

cm ,clf_report,acc_score= knn(x_train_scaled, x_test_scaled, ytrain, ytest)
acc_knn.append(acc_score)

cm ,clf_report,acc_score= naive(x_train_scaled, x_test_scaled, ytrain, ytest)
acc_naive.append(acc_score)


In [5]:
#### selected features:

print ("Selected Features: ", selected_features)

Selected Features:  Index(['MaxHR', 'ST_Slope_Flat', 'ST_Slope_Up'], dtype='object')


In [6]:
dataset[selected_features]

Unnamed: 0,MaxHR,ST_Slope_Flat,ST_Slope_Up
0,156,0,1
1,156,1,0
2,120,0,1
3,120,1,0
4,122,0,1
...,...,...,...
913,132,1,0
914,141,1,0
915,120,1,0
916,156,1,0


In [7]:
#### Disply the accuracy score in a table

table =view_acc_score(acc_logistic,acc_svmlinear,acc_svm_nonlinear,acc_decision,acc_random,acc_knn,acc_naive)
table

Unnamed: 0,Logistic,svmlinear,svm_nonlinear,decision,random,knn,naive
chi square,0.82971,0.82971,0.818841,0.717391,0.757246,0.797101,0.811594


In [8]:
### Conclusion: Accuracy score for LogisticRegression and SVMLinear is high than the other ones...
###################so we will save any one of these two models for deployment


