In [35]:
# Imports and pip installations (if needed)
import numpy as np
import pandas as pd
from sklearn import model_selection, metrics, svm, linear_model, datasets, neural_network, neighbors

# Part 1: Load the dataset

In [7]:
# Load the dataset (load remotely, not locally)
iris = datasets.load_iris()
# Output the first 15 rows of the data
listTarget_names = [f'{iris.target_names[0]}']*50 + [f'{iris.target_names[1]}']*50 + [f'{iris.target_names[2]}']*50

iris_data_frame = pd.DataFrame(data=np.c_[iris['data'],iris['target']],
                              columns=iris['feature_names']+['target'])
iris_data_frame.head(15)


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0
5,5.4,3.9,1.7,0.4,0.0
6,4.6,3.4,1.4,0.3,0.0
7,5.0,3.4,1.5,0.2,0.0
8,4.4,2.9,1.4,0.2,0.0
9,4.9,3.1,1.5,0.1,0.0


In [8]:
# Display a summary of the table information (number of datapoints, etc.)
iris_data_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
 4   target             150 non-null    float64
dtypes: float64(5)
memory usage: 6.0 KB


In [9]:
iris_data_frame.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333,1.0
std,0.828066,0.435866,1.765298,0.762238,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


## About the dataset
Explain what the data is in your own words. What are your features and labels? What is the mapping of your labels to the actual classes?

# Part 2: Split the dataset into train and test

In [10]:
# Take the dataset and split it into our features (X) and label (y)
X = iris_data_frame[iris.feature_names]
y = iris_data_frame['target']



In [11]:
# Use sklearn to split the features and labels into a training/test set. (90% train, 10% test)
X_train, X_test, y_train, y_test = model_selection.train_test_split( X, y, train_size = 0.9, test_size = 0.1,
                                                                     shuffle = True, random_state = 6174)

# Part 3: Logistic Regression

In [12]:
# i. Use sklearn to train a LogisticRegression model on the training set
logistic_regression = linear_model.LogisticRegression(solver='lbfgs', max_iter=300)
logistic_regression.fit(X_train.values, y_train.values)



LogisticRegression(max_iter=300)

In [13]:
# ii. For a sample datapoint, predict the probabilities for each possible class
logistic_regression.predict(np.array([[5, 3, 1, 0]]))
logistic_regression.predict_proba(np.array([[5, 3, 1, 0]]))

array([[9.87254451e-01, 1.27455451e-02, 3.82708938e-09]])

In [14]:
# iii. Report on the score for Logistic regression model, what does the score measure?
logistic_regression.score(X_train.values, y_train.values)


0.9703703703703703

- The score measures the accuracy of the model on the given test data and labels.

In [15]:
# iv. Extract the coefficents and intercepts for the boundary line(s)
print('The coefficents for the Logistic Regression is: ', logistic_regression.coef_)
print('The intercepts for the Logistic Regression are: ',logistic_regression.intercept_)

The coefficents for the Logistic Regression is:  [[-0.39676549  0.90417794 -2.43728678 -1.08205138]
 [ 0.49734947 -0.28240878 -0.19264721 -0.88275555]
 [-0.10058398 -0.62176916  2.62993399  1.96480692]]
The intercepts for the Logistic Regression are:  [  9.61460702   2.10940681 -11.72401383]


# Part 4: Support Vector Machine

In [22]:
# i. Use sklearn to train a Support Vector Classifier on the training set
SVM = svm.SVC(kernel = 'linear',probability=True) 
SVM.fit(X_train.values, y_train.values)


SVC(kernel='linear', probability=True)

In [23]:
# ii. For a sample datapoint, predict the probabilities for each possible class
SVM_probabilities = pd.DataFrame(SVM.predict_proba(X_test), columns=iris['target_names'])
SVM_probabilities




Unnamed: 0,setosa,versicolor,virginica
0,0.009132,0.075199,0.915669
1,0.003262,0.971654,0.025084
2,0.001095,0.000532,0.998372
3,0.00665,0.920265,0.073085
4,0.008074,0.061559,0.930367
5,0.021962,0.969114,0.008924
6,0.936189,0.046923,0.016888
7,0.002726,0.002362,0.994912
8,0.039471,0.948485,0.012044
9,0.916567,0.063837,0.019596


In [24]:
# iii. Report on the score for the SVM, what does the score measure?
SVM.score(X_train.values, y_train.values)

0.9925925925925926

- The score measure the accuracy of the model on the given test data and labels.

# Part 5: Neural Network

In [26]:
# i. Use sklearn to train a Neural Network (MLP Classifier) on the training set
MLP_Classifier = neural_network.MLPClassifier(solver='lbfgs', hidden_layer_sizes=(3, 3))
MLP_Classifier.fit(X_train, y_train.values.ravel())


MLPClassifier(hidden_layer_sizes=(3, 3), solver='lbfgs')

In [29]:
# ii. For a sample datapoint, predict the probabilities for each possible class
ML_Probalities = pd.DataFrame(MLP_Classifier.predict_proba(X_test), columns=iris['target_names'])
ML_Probalities


Unnamed: 0,setosa,versicolor,virginica
0,0.0,0.004186527,0.9958135
1,0.0,0.9999855,1.454187e-05
2,0.0,7.019758e-09,1.0
3,0.0,0.9997964,0.0002035863
4,0.0,0.0001779296,0.9998221
5,0.0,0.9999999,1.450242e-07
6,0.999754,0.000246362,2.8803139999999996e-20
7,0.0,1.319738e-08,1.0
8,0.0,0.9999999,7.485332e-08
9,0.999754,0.000246362,2.8803139999999996e-20


In [30]:
# iii. Report on the score for the Neural Network, what does the score measure?
MLP_Classifier.score(X_test, y_test)


1.0

In [None]:
# iv: Experiment with different options for the neural network, report on your best configuration (the highest score I was able to achieve was 0.8666)


- After experimenting with several different options for the neural network, the highest confidence score that I was able to get was 98%.

# Part 6: K-Nearest Neighbors

In [41]:
# i. Use sklearn to 'train' a k-Neighbors Classifier
# Note: KNN is a nonparametric model and technically doesn't require training
# fit will essentially load the data into the model see link below for more information
# https://stats.stackexchange.com/questions/349842/why-do-we-need-to-fit-a-k-nearest-neighbors-classifier
KNN_Model = neighbors.KNeighborsClassifier(n_neighbors=3)
KNN_Model.fit(X_train, y_train.values.ravel())

KNeighborsClassifier(n_neighbors=3)

In [42]:
# ii. For a sample datapoint, predict the probabilities for each possible class
KNN_Probabilities = pd.DataFrame(KNN_Model.predict_proba(X_train), columns=iris['target_names'])
KNN_Probabilities


Unnamed: 0,setosa,versicolor,virginica
0,0.0,1.000000,0.000000
1,1.0,0.000000,0.000000
2,0.0,1.000000,0.000000
3,0.0,1.000000,0.000000
4,1.0,0.000000,0.000000
...,...,...,...
130,0.0,0.000000,1.000000
131,0.0,0.333333,0.666667
132,1.0,0.000000,0.000000
133,1.0,0.000000,0.000000


In [43]:
# iii. Report on the score for kNN, what does the score measure?
KNN_Model.score(X_test, y_test)

1.0

- The score measure the accuracy of the model on the given test data and labels.

# Part 7: Conclusions and takeaways

In your own words describe the results of the notebook. Which model(s) performed the best on the dataset? Why do you think that is? Did anything surprise you about the exercise?

- The two models that performed the best on the dataset were the Neural Network Classifier and K-nearest Neighbors with a perfect score on 1.0 (for both training and test dataset). However, in predicting the probability the K-nearest Neighbors model was 100% certain about the correct answer on the test data set. 
- I believe this is because KNN is an example of a nonparametric model.