In [None]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import precision_recall_fscore_support
import random
import matplotlib.pyplot as plt

# k-nearest neighbors

This dataset was obtained from https://archive.ics.uci.edu/ml/datasets/Heart+Disease (this is a great resource for datasets to try machine learning on). It has data on patients that are and are not diagnosed with heart disease.

The attributes are:
* age: age in years 
* sex: sex (1 = male; 0 = female) 
* cp: chest pain type 
 * -- Value 1: typical angina 
 * -- Value 2: atypical angina 
 * -- Value 3: non-anginal pain 
 * -- Value 4: asymptomatic 
* trestbps: resting blood pressure (in mm Hg on admission to the hospital) 
* chol: serum cholesterol in mg/dl 
* fbs: (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false) 
* restecg: resting electrocardiographic results 
 * -- Value 0: normal 
 * -- Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV) 
 * -- Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria 
* thalach: maximum heart rate achieved 
* exang: exercise induced angina (1 = yes; 0 = no) 
* oldpeak = ST depression induced by exercise relative to rest 
* slope: the slope of the peak exercise ST segment 
 * -- Value 1: upsloping 
 * -- Value 2: flat 
 * -- Value 3: downsloping 
* ca: number of major vessels (0-3) colored by fluoroscopy 
* thal: 3 = normal; 6 = fixed defect; 7 = reversible defect 
* num: diagnosis of heart disease (angiographic disease status) 
 * -- Value 0: absence.
 * -- Value 1,2,3,4: presence of heart disease


# Explore the data

Read in the data, modify the dependent variable name and plot a histogram of the ages of patients, both healthy and those with heart disease.

In [None]:
# Load the Cleveland records and expose the diagnosis column under the name "disease"
df = (
    pd.read_csv('cleveland.csv')
    .rename(columns={'num': 'disease'})
)

# Severity grades 1-4 all mean "disease present": cap them at 1 so the column is a 0/1 flag
df['disease'] = df['disease'].clip(upper=1)
display(df.head())

# multiple dimensions


In [None]:
def renameMe(data, attributes, numOfNeighbors, target='disease'):
    """Predict one randomly chosen row of ``data`` from its k nearest neighbours.

    The selected columns are standardized, a Euclidean nearest-neighbour
    structure is built over every row, one row (the "patient") is drawn at
    random, and the patient's label is predicted by majority vote among its
    ``numOfNeighbors`` closest rows, excluding the patient itself.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the attribute columns and the ``target`` column.
        It is not modified.
    attributes : list of str
        Names of the numeric columns used to measure distance.
    numOfNeighbors : int
        Number of neighbours that vote (k). Must be at least 1 and smaller
        than the number of rows in ``data``.
    target : str, default 'disease'
        Name of the label column; 0 means absent, any other value present.

    Returns
    -------
    (int, int)
        ``(predicted, actual)`` where each value is 1 when the target is
        present and 0 when it is absent. A tied vote predicts 1.

    Raises
    ------
    ValueError
        If ``numOfNeighbors`` is below 1 or not smaller than ``len(data)``.
    """
    if numOfNeighbors < 1:
        raise ValueError('numOfNeighbors must be at least 1')
    if len(data) <= numOfNeighbors:
        raise ValueError('data needs more rows than numOfNeighbors so the patient can be excluded')

    # Standardize: (x - mean) / std, so every attribute contributes on the same scale.
    # The parentheses matter: x - mean/std only shifts a column and leaves distances unscaled.
    features = data[list(attributes)].astype(float)
    spread = features.std()
    # A constant column has zero spread; divide by 1 instead to avoid NaN/inf distances
    spread = spread.where(spread > 0, 1.0)
    X = ((features - features.mean()) / spread).values
    y = data[target].values

    # Build underlying structure with standardized data
    nn = NearestNeighbors(n_neighbors=numOfNeighbors, metric='euclidean', algorithm='auto')
    nn.fit(X)

    # Choose a random patient
    i = random.randint(0, len(X) - 1)
    patientX = X[i]
    patienty = y[i]

    # Ask for one extra neighbour because the patient is part of the fitted data,
    # then remove the patient by position (safe with duplicate index labels and
    # with ties that keep the patient out of the result).
    _, indices = nn.kneighbors([patientX], numOfNeighbors + 1)
    neighborPositions = [j for j in indices[0] if j != i][:numOfNeighbors]
    neighborLabels = y[neighborPositions]

    # Count the number of neighbors that have target
    have = int((neighborLabels != 0).sum())
    # Count the number of neighbors that DON'T have target
    dontHave = int((neighborLabels == 0).sum())

    # Predict that our random patient is like the majority of its neighbors
    predict = 0 if (dontHave > have) else 1
    # According to the records, did our patient have the target
    actual = 0 if (patienty == 0) else 1
    return predict, actual

In [None]:
def testMeSenpai(data, attributes, numOfNeighbors, numOfTests=1, target='disease'):
    y_pred = []
    y_true = []
    for test in range(numOfTests):
        yPred, yAct = renameMe(data, attributes, numOfNeighbors)
        y_pred.append(yPred)
        y_true.append(yAct)
    return y_pred ,y_true

In [None]:
def crossValidate(data, attributes, numOfNeighbors, numOfTests, target='disease', level = 10):
    """Score the k-NN setup once per fold, each time with that fold removed.

    The rows of ``data`` are shuffled and split into ``level`` disjoint folds
    that together cover every row exactly once. For each fold, the remaining
    rows are handed to ``testMeSenpai`` and the resulting predictions are
    scored with ``precision_recall_fscore_support``. The scores of every round
    are displayed and also returned.

    NOTE(review): the held-out fold is only excluded, it is not itself scored;
    each round evaluates randomly drawn patients inside the remaining rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Full data set. It is not modified.
    attributes, numOfNeighbors, numOfTests
        Forwarded to ``testMeSenpai``.
    target : str, default 'disease'
        Label column, forwarded to ``testMeSenpai``.
    level : int, default 10
        Number of folds; must be at least 2.

    Returns
    -------
    list of tuple
        One ``(precision, recall, fscore, support)`` tuple per fold.

    Raises
    ------
    ValueError
        If ``level`` is smaller than 2.
    """
    if level < 2:
        raise ValueError('level must be at least 2 so that some rows remain after removing a fold')

    # Shuffle row positions once, then cut them into disjoint folds.
    # Uses NumPy's global random state, so np.random.seed() makes it repeatable.
    shuffled = np.random.permutation(len(data))
    foldPositions = np.array_split(shuffled, level)

    results = []
    for heldOut in foldPositions:
        # Remove the fold by position: safe with duplicate index labels and
        # does not discard genuinely duplicated records.
        keep = np.ones(len(data), dtype=bool)
        keep[heldOut] = False
        remaining = data.iloc[keep]

        y_pred, y_true = testMeSenpai(remaining, attributes, numOfNeighbors, numOfTests, target=target)
        # scikit-learn expects the ground truth first, predictions second
        (p, r, f, s) = precision_recall_fscore_support(y_true, y_pred)
        display(p, r, f, s)
        results.append((p, r, f, s))
    return results
        
# Seed both random generators used by the experiment (Python's `random` for the
# patient draw, NumPy's for pandas/NumPy sampling) so that Restart & Run All
# reproduces the same scores.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Experiment settings: the "model" is the value of k plus the set of attributes
df1 = df.copy()
k = 5
numOfTests = 5
attributes = ['age','trestbps']

# Trailing semicolon keeps the cell from echoing a return value below the displayed scores
crossValidate(df1, attributes, k, numOfTests);

In [17]:
def testAgainstTest(testData,attributes,k):
    y_pred, y_true = testMeSenpai(testData,attributes,k)
    (p,r,f,s) = precision_recall_fscore_support(y_pred,y_true)
    return p,r,f,s

The model is the value of k (the number of nearest neighbors) together with the set of attributes.

## The challenge

In [16]:
# On challenge day only the file name below has to change before re-running
new_df = (
    pd.read_csv('cleveland-test-sample.csv')
    .rename(columns={'num': 'disease', 'Unnamed: 0': 'id'})
)

# Severity grades 1-4 all mean "disease present": cap them at 1 so the column is a 0/1 flag
new_df['disease'] = new_df['disease'].clip(upper=1)

# Stack the original records and the challenge sample into one bigger data set;
# reset_index() keeps each frame's old row labels in an "index" column
temp = pd.concat([df, new_df]).reset_index()


Index(['index', 'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
       'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'disease', 'id'],
      dtype='object')

Unnamed: 0,index,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,disease,id
0,0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0,
1,1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,1,
2,2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1,
3,3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0,
4,4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
338,35,45.0,1.0,4.0,142.0,309.0,0.0,2.0,147.0,1.0,0.0,2.0,3.0,7.0,1,205.0
339,36,44.0,0.0,3.0,108.0,141.0,0.0,0.0,175.0,0.0,0.6,2.0,0.0,3.0,0,93.0
340,37,47.0,1.0,4.0,110.0,275.0,0.0,2.0,118.0,1.0,1.0,2.0,1.0,3.0,1,247.0
341,38,62.0,0.0,4.0,150.0,244.0,0.0,0.0,154.0,1.0,1.4,2.0,0.0,3.0,1,209.0
