In [16]:
#importing the necessary sklearn, pandas, and numpy libraries. 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split #to split the dataset into testing and training sets
from sklearn.metrics import accuracy_score #To return the percent of correct predictions
import pandas as pd
import numpy as np


In [17]:
# Load the iris dataset from the course repository into a pandas DataFrame.
df = pd.read_csv('https://raw.githubusercontent.com/mpourhoma/CS4661/master/iris.csv')

# Names of the columns that are used as model inputs.
feature_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

# Feature matrix: only the columns listed in feature_cols.
X = df.loc[:, feature_cols]
# Label vector: the 'species' column.
y = df.loc[:, 'species']

In [18]:
# Split features and labels into training and testing sets.
# test_size=0.4 holds out 40% of the rows for testing; the fixed random_state
# makes the split repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=6,
)

In [19]:
# Build a KNN classifier with k = 3 and train it on the training set only.
k = 3
knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)

# Predict labels for the testing set.
y_predict = knn.predict(X_test)

# Fraction of testing-set predictions that match the actual labels.
accuracy = accuracy_score(y_test, y_predict)
print(accuracy)

0.9666666666666667


In [20]:
# Check how the testing accuracy changes when different k values are used for training.

kVals = [1, 5, 7, 11, 15, 27, 59]  # the k values given in step D of the assignment
accuracy_list = []  # one accuracy score per entry of kVals, in the same order

for index, knnVals in enumerate(kVals):
    # Fresh classifier for the current k, trained on the training set and labels.
    knn = KNeighborsClassifier(n_neighbors=knnVals).fit(X_train, y_train)
    y_predict = knn.predict(X_test)

    # Compare the predicted labels for the testing set with its actual labels.
    accuracy = accuracy_score(y_test, y_predict)
    accuracy_list.append(accuracy)

    # Per-k output, just for reference.
    print('Current value:', knnVals)
    print(accuracy)

print(accuracy_list)  # full list of accuracy values

# We find that the accuracy does not always get better by increasing the value of K

Current value: 1
0.95
Current value: 5
0.9833333333333333
Current value: 7
0.9666666666666667
Current value: 11
0.9666666666666667
Current value: 15
0.9333333333333333
Current value: 27
0.9166666666666666
Current value: 59
0.8166666666666667
[0.95, 0.9833333333333333, 0.9666666666666667, 0.9666666666666667, 0.9333333333333333, 0.9166666666666666, 0.8166666666666667]


In [21]:
# Find the best single feature: train and score a KNN classifier (k = 3) on each
# feature by itself and store the accuracy scores in singleFeatList, in the same
# order as feature_cols.
# Selecting a column with double brackets (df[[name]]) keeps it as a one-column
# DataFrame, i.e. 2-D input for the classifier, so no NumPy reshape is needed.
# Loop-specific names (X_single, X_single_train, ...) are used so that the
# full-feature X / X_train / X_test / y_train / y_test are not overwritten; the
# earlier cells can then be re-run and still give the same results.

singleFeatList = []
k = 3
y = df['species']

for feature in feature_cols:
    X_single = df[[feature]]
    # Same test_size and random_state as the other splits in this notebook.
    X_single_train, X_single_test, y_single_train, y_single_test = train_test_split(
        X_single, y, test_size=0.4, random_state=6)

    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_single_train, y_single_train)
    y_predict = knn.predict(X_single_test)

    accuracy = accuracy_score(y_single_test, y_predict)
    singleFeatList.append(accuracy)

    print(feature, '- Accuracy : ', accuracy)

print(singleFeatList)

# Petal length was the individual feature that provided the best accuracy at 96.7%,
# followed closely by petal width at 95%.
# The list singleFeatList holds the accuracy results for each feature.
    

sepal_length - Accuracy :  0.7166666666666667
sepal_width - Accuracy :  0.55
petal_length - Accuracy :  0.9666666666666667
petal_width - Accuracy :  0.95
[0.7166666666666667, 0.55, 0.9666666666666667, 0.95]


In [22]:
# Repeating part E, but this time using two features instead of one.
# Each feature pair is selected into its own two-column DataFrame and collected in twoFeatList.
# twoFeatNames holds a readable label for each pair; it is built from the pair's actual
# column names so a label can never disagree with the data it describes.
# twoFeatAccuracy collects the accuracy of each pair, in the same order as twoFeatList.
# The loop steps through the pairs, trains a KNN classifier (k = 3) on each and scores it.

# Columns are named explicitly rather than sliced out of feature_cols: the slice
# feature_cols[1:4:2] selects sepal_width + petal_width, so using it for the
# sepal_width + petal_length pair evaluated the wrong columns.
sepal_length_sepal_width = df[['sepal_length', 'sepal_width']]
sepal_length_petal_length = df[['sepal_length', 'petal_length']]
sepal_length_petal_width = df[['sepal_length', 'petal_width']]
sepal_width_petal_length = df[['sepal_width', 'petal_length']]
sepal_width_petal_width = df[['sepal_width', 'petal_width']]
petal_length_petal_width = df[['petal_length', 'petal_width']]

twoFeatList = [sepal_length_sepal_width, sepal_length_petal_length, sepal_length_petal_width,
               sepal_width_petal_length, sepal_width_petal_width, petal_length_petal_width]

# e.g. 'sepal_length+sepal_width'
twoFeatNames = ['+'.join(pair.columns) for pair in twoFeatList]

twoFeatAccuracy = []
k = 3
y = df['species']

for pair_name, X_pair in zip(twoFeatNames, twoFeatList):
    # Pair-specific names so this cell does not overwrite the shared
    # X / X_train / X_test / y_train / y_test variables.
    X_pair_train, X_pair_test, y_pair_train, y_pair_test = train_test_split(
        X_pair, y, test_size=0.4, random_state=6)

    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_pair_train, y_pair_train)
    y_predict = knn.predict(X_pair_test)

    accuracy = accuracy_score(y_pair_test, y_predict)
    twoFeatAccuracy.append(accuracy)
    print('Accuracy for features:', pair_name, 'is:', accuracy)

print(twoFeatAccuracy)

# In the recorded run, the combination of the 1st and 3rd features (sepal length and
# petal length) gave the highest accuracy, at about 98%.
# NOTE(review): that run scored sepal_width+petal_length on the wrong columns; re-run this
# cell and confirm the conclusion still holds.


Accuracy for features: sepal_length+sepal_width is: 0.7833333333333333
Accuracy for features: sepal_length+petal_length is: 0.9833333333333333
Accuracy for features: sepal_length+petal_width is: 0.95
Accuracy for features: sepal_width+petal_length is: 0.95
Accuracy for features: sepal_width+petal_width is: 0.95
Accuracy for features: petal_length+petal_width is: 0.9666666666666667
[0.7833333333333333, 0.9833333333333333, 0.95, 0.95, 0.95, 0.9666666666666667]


In [23]:
#While the most accurate feature pair does include the best individual feature from part E (petal length), it
#does not include the second-best individual feature, which is petal width.
#Interestingly, though, the second most accurate feature pair does contain the two best individual features, petal length and petal width.

In [24]:
#I believe that the two best individual features do not combine to make the best feature pair because the k value (3) may have been too low for testing feature pairs.