In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import precision_recall_fscore_support
import random
import matplotlib.pyplot as plt
import itertools

# k-nearest neighbors

This dataset was obtained from https://archive.ics.uci.edu/ml/datasets/Heart+Disease (this is a great resource for datasets to try machine learning on). It has data on patients that are and are not diagnosed with heart disease.

The attributes are:
* age: age in years 
* sex: sex (1 = male; 0 = female) 
* cp: chest pain type 
 * -- Value 1: typical angina 
 * -- Value 2: atypical angina 
 * -- Value 3: non-anginal pain 
 * -- Value 4: asymptomatic 
* trestbps: resting blood pressure (in mm Hg on admission to the hospital) 
* chol: serum cholestoral in mg/dl 
* fbs: (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false) 
* restecg: resting electrocardiographic results 
 * -- Value 0: normal 
 * -- Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV) 
 * -- Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria 
* thalach: maximum heart rate achieved 
* exang: exercise induced angina (1 = yes; 0 = no) 
* oldpeak = ST depression induced by exercise relative to rest 
* slope: the slope of the peak exercise ST segment 
 * -- Value 1: upsloping 
 * -- Value 2: flat 
 * -- Value 3: downsloping 
* ca: number of major vessels (0-3) colored by flourosopy 
* thal: 3 = normal; 6 = fixed defect; 7 = reversable defect 
* num: diagnosis of heart disease (angiographic disease status) 
 * -- Value 0: absence.
 * -- Value 1,2,3,4: presence of heart disease


# Explore the data

Read in the data, modify the dependent variable name and plot a histogram of the ages of patients, both healthy and those with heart disease.

In [2]:
df = pd.read_csv('cleveland.csv')

# Collapse all values 1-4 into a single value so that "num" is boolean
df = df.rename({'num':'disease'}, axis=1)
df['disease'] = df.disease.apply(lambda x: min(x, 1))
df.drop(df[df['thal'] == '?'].index, inplace=True)
display(df.head(5))

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,disease
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


# multiple dimensions


In [3]:
def knnPredict(data, attributes, numOfNeighbors, target='disease'):
    nn = NearestNeighbors(n_neighbors=numOfNeighbors, metric='euclidean', algorithm='auto')
    
    # Standardize
    newAttributes = []
    for attribute in attributes:
        standard = data[attribute]-data[attribute].mean()/data[attribute].std()
        standard.name = standard.name+'_s'
        newAttributes.append(standard.name)
        data = pd.concat([data,standard],axis=1)
        
    # Build underlying structure with standardized data   
    X = data[newAttributes].values
    y = data[target].values
    fit = nn.fit(X)
    
    # Choose a random patient
    i = random.randint(0,len(X)-1)
    patientX = X[i]
    patienty = y[i]
    # display('Our patient',data.iloc[i])
    
    # Find the k nearest neighbors, not including self
    distances, indices = fit.kneighbors([patientX],numOfNeighbors+1)
    nbrs = data.iloc[indices[0]]
    # Delete self from data
    nbrs = nbrs.drop(data.iloc[i].name)
    # display(nbrs)
    
    # Count the number of neighbors that have target
    have = nbrs[nbrs[target] == 0].count()[target]
    # Count the number of neighbors that DON'T have target
    dontHave = nbrs[nbrs[target] == 1].count()[target]
    # print('Have: {}\ndontHave: {}'.format(have, dontHave))
    
    # Predict that our random patient is like the majority of its neighbors
    predict = 0 if (dontHave > have) else 1
    # According to the records, did our patient have the target
    actual = 0 if (patienty == 0) else 1
    success = predict == actual
    # print('Sucess:',success)
    return success, patienty == 0

In [4]:
def testMeSenpai(data, attributes, numOfNeighbors, numOfTests=1, target='disease'):
    y_pred = []
    y_true = []
    for test in range(numOfTests):
        yPred, yAct = knnPredict(data, attributes, numOfNeighbors)
        y_pred.append(yPred)
        y_true.append(yAct)
    return y_pred ,y_true

In [5]:
def crossValidate(data, attributes, numOfNeighbors, numOfTests, target='disease', level = 10):
    folds = []
    for i in range(level):
        folds.append(data.sample(frac = 1/level))
        
    # Train missing one fold
    for i in range(level-1):
        temp = pd.concat([data,folds[i]])
        temp = temp.drop_duplicates(keep=False)
        
        y_pred,y_true = testMeSenpai(temp, attributes, numOfNeighbors, numOfTests, target='disease')
        (p,r,f,s) = precision_recall_fscore_support(y_pred,y_true,zero_division=0)
        print(p,r,f,s,'\n')
        
df1 = df.copy()
k = 5
numOfTests = 5
attributes = ['age','trestbps']

crossValidate(df1,attributes,k,numOfTests)
# y_pred, y_true = testMeSenpai(df1,attributes,k,numOfTests)
# (p,r,f,s) = precision_recall_fscore_support(y_pred,y_true)
# display(p,r,f,s)

[0.5 0. ] [0.25 0.  ] [0.33333333 0.        ] [4 1] 

[0.   0.75] [0.   0.75] [0.   0.75] [1 4] 

[0.25 0.  ] [0.5 0. ] [0.33333333 0.        ] [2 3] 

[0.4 0. ] [1. 0.] [0.57142857 0.        ] [2 3] 

[0.5 0. ] [0.25 0.  ] [0.33333333 0.        ] [4 1] 

[0.  0.5] [0.         0.66666667] [0.         0.57142857] [2 3] 

[0. 0.] [0. 0.] [0. 0.] [2. 3.] 

[0.  0.6] [0. 1.] [0.   0.75] [2 3] 

[0.5 0. ] [0.66666667 0.        ] [0.57142857 0.        ] [3 2] 



In [6]:
def testAgainstTest(testData,attributes,k):
    y_pred, y_true = testMeSenpai(testData,attributes,k)
    (p,r,f,s) = precision_recall_fscore_support(y_pred,y_true)
    return p,r,f,s

The model is the value of k (nearest neigbors) and set of attributes

## The challenge

In [7]:
# Day of the challenge set, all we need to do is change this file name and run it
new_df = pd.read_csv('cleveland-test-sample.csv')

# Collapse all values 1-4 into a single value so that "num" is boolean
new_df = new_df.rename({'num':'disease','Unnamed: 0':'id'}, axis=1)
new_df['disease'] = new_df.disease.apply(lambda x: min(x, 1))

# Build a bigger data set to 
temp = pd.concat([df,new_df]).reset_index()


## chosen features

In [8]:
chosen_features = ['cp', 'trestbps', 'chol', 'restecg', 'thalach', 'exang', 'thal', 'age']
numFeatures = 5
attributes = ['age','trestbps','chol', 'thalach']
# standardizing noncatagorical data
newAttributes = []
data1 = df.copy()
for attribute in attributes:
    standard = (data1[attribute]-data1[attribute].mean())/data1[attribute].std()
    chosen_features.remove(standard.name)
    chosen_features.append(standard.name+'_s')
    standard.name = standard.name+'_s'
    newAttributes.append(standard.name)
    data1 = pd.concat([data1,standard],axis=1)
    data1.drop(attribute,axis=1,inplace=True)

In [9]:
def makefeaturesList(chosen_features, n):
    featureslist = list(itertools.combinations(chosen_features,n)) #
    featureslist = [list(x) for x in featureslist]
    return featureslist
# data 1 is processed

## monte carlo data split with selected features

In [10]:
# make random split for one monte carlo cross validation run
def monteCarloCVSplit(data : pd.DataFrame , level = 10):
    CVtest = data.sample(frac = 1/level)
    CVtrain = data.drop(CVtest.index)
    return CVtrain, CVtest

### run knn on given attributes and CV split

In [11]:
def knnPredictions(CVtrain, CVTest, attributes, k, target='disease'):
    nn = NearestNeighbors(n_neighbors=k , metric='euclidean', algorithm='auto')
    
    # Build underlying structure with standardized data   
    X = CVtrain[attributes].values
    y = CVtrain[target].values
    
    testX = CVTest[attributes].values
    testy = CVTest[target].values
    fit = nn.fit(X) # fits training data to model
    
    # nbrs = neighbors
    #this fit finds the nearest neigbors from the training set to the test set 
    distances, nbrs = fit.kneighbors(testX, n_neighbors = k) 
    
    # Count the number of neighbors that have target

    diseasdedNbrCount = []
    for nbrList in nbrs:
        diseasdedNbrCount.append([1 if y[nbr] == 1 else 0 for nbr in nbrList])
    
    predictions = []
    for l in diseasdedNbrCount:
        numOfDiseased = sum(l)
        if numOfDiseased > k/2:
            predictions.append(1)
        else:
            predictions.append(0)
    # print(predictions,'\n',diseasdedNbrCount)
    return predictions , testy
    

In [12]:
# make combinations of up to 8 attriubutes
def getBestknn(data, chosen_features, numOfNeighbors = 7,target='disease'):
    bestModel= [0,[]]
    for i in range(8):
        featuresList = makefeaturesList(chosen_features,i+1)
        # for each combination of size i
        for j in range(len(featuresList)):
            # Do monte carlo validation 10 times
            pSum = 0
            rSum = 0
            f1Sum = 0
            for k in range(10):
                CVtrain1, CVtest1 = monteCarloCVSplit(data)
                #train model and report scores    
                preds, actuals = knnPredictions(CVtrain1, CVtest1, featuresList[j] , numOfNeighbors,target=target)
                (p,r,f,s) = precision_recall_fscore_support(preds,actuals,zero_division=0,average='binary')
                f1 = (p*r)/(p + r)
                pSum += p
                rSum += r
                f1Sum += f1
            avgP = pSum/10
            avgR = rSum/10
            avgF1 = f1Sum/10
            # Update our best model
            if avgF1 > bestModel[0]:
                bestModel = [avgF1,featuresList[j],avgP,avgR]
    return bestModel

In [13]:
print(getBestknn(data1,chosen_features))

  f1 = (p*r)/(p + r)
  f1 = (p*r)/(p + r)


[0.4030991311426094, ['cp', 'restecg', 'exang', 'thal', 'chol_s', 'thalach_s'], 0.8206318681318683, 0.8042586580086579]


make changes

In [14]:
power = pd.read_csv('city_power_consumption.csv')
power = power.rename(columns={'Zone 1 Power Consumption':'Zone_1_Power',
                            'Zone 2  Power Consumption':'Zone_2_Power',
                            'Zone 3  Power Consumption':'Zone_3_Power',
                            'general diffuse flows':'general_diffuse_flows',
                            'Wind Speed':'Wind_Speed' })
powerMean = power['Zone_1_Power'].mean()
power['highPowerUse'] = power.Zone_1_Power>powerMean
power
# power.plot(power['Temperature'])
# sns.lineplot(data = power, x = 'Temperature')

Unnamed: 0,DateTime,Temperature,Humidity,Wind_Speed,general_diffuse_flows,diffuse flows,Zone_1_Power,Zone_2_Power,Zone_3_Power,highPowerUse
0,1/1/2017 0:00,6.559,73.8,0.083,0.051,0.119,34055.69620,16128.87538,20240.96386,True
1,1/1/2017 0:10,6.414,74.5,0.083,0.070,0.085,29814.68354,19375.07599,20131.08434,False
2,1/1/2017 0:20,6.313,74.5,0.080,0.062,0.100,29128.10127,19006.68693,19668.43373,False
3,1/1/2017 0:30,6.121,75.0,0.083,0.091,0.096,28228.86076,18361.09422,18899.27711,False
4,1/1/2017 0:40,5.921,75.7,0.081,0.048,0.085,27335.69620,17872.34043,18442.40964,False
...,...,...,...,...,...,...,...,...,...,...
52411,12/30/2017 23:10,7.010,72.4,0.080,0.040,0.096,31160.45627,26857.31820,14780.31212,False
52412,12/30/2017 23:20,6.947,72.6,0.082,0.051,0.093,30430.41825,26124.57809,14428.81152,False
52413,12/30/2017 23:30,6.900,72.8,0.086,0.084,0.074,29590.87452,25277.69254,13806.48259,False
52414,12/30/2017 23:40,6.758,73.0,0.080,0.066,0.089,28958.17490,24692.23688,13512.60504,False


Standardize and prepare data

In [19]:
chosen_features = ['Temperature','Humidity','Wind_Speed', 'general_diffuse_flows', 'diffuse flows','Zone_2_Power','Zone_3_Power']
numFeatures = 4
# standardizing noncatagorical data
newAttributes = []
stdData = power.copy()
for attribute in chosen_features:
    standard = (stdData[attribute]-stdData[attribute].mean())/stdData[attribute].std()
    chosen_features.remove(standard.name)
    chosen_features.append(standard.name+'_s')
    standard.name = standard.name+'_s'
    newAttributes.append(standard.name)
    stdData = pd.concat([stdData,standard],axis=1)
    stdData.drop(attribute,axis=1,inplace=True)
print(getBestknn(stdData,chosen_features,numOfNeighbors = 100,target='highPowerUse'))

[0.43279497419150514, ['Humidity', 'general_diffuse_flows', 'Zone_2_Power', 'Zone_3_Power_s_s_s'], 0.8993935386231702, 0.8342825093492368]


In [None]:
sns.scatterplot(data = power.sample(frac=.01), x = 'Temperature', y = 'Zone_1_Power')

In [None]:
sns.scatterplot(data = power.sample(frac=.01), x = 'Humidity', y = 'Zone_1_Power')

# Challenge set

In [None]:
# Day of the challenge set, all we need to do is change this file name and run it
new_df = pd.read_csv('cleveland-test-sample.csv')

# Collapse all values 1-4 into a single value so that "num" is boolean
new_df = new_df.rename({'num':'disease','Unnamed: 0':'id'}, axis=1)
new_df['disease'] = new_df.disease.apply(lambda x: min(x, 1))

# Build a bigger data set to 
temp = pd.concat([df,new_df]).reset_index()