In [1]:
# Imports and pip installations (if needed)
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
# SK learn ML
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

# Part 1: Load the dataset

In [2]:
# Load the dataset (load remotely, not locally)

# Output the first 15 rows of the data
# Display a summary of the table information (number of datapoints, etc.)
iris = load_iris()
X, y = load_iris(return_X_y=True, as_frame=True)
# Create y df with target column name 
y = pd.DataFrame(y).rename(columns = {0: 'target'})
# Join X and y into one df to show entire dataset as a df 
df = pd.concat([X, y], axis = 1)
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


## About the dataset
Explain what the data is in your own words. What are your features and labels? What is the mapping of your labels to the actual classes?

In [3]:
# A peek into our dataset as a data_frame
df.head(15)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
5,5.4,3.9,1.7,0.4,0
6,4.6,3.4,1.4,0.3,0
7,5.0,3.4,1.5,0.2,0
8,4.4,2.9,1.4,0.2,0
9,4.9,3.1,1.5,0.1,0


In [4]:
# Mapping label value to corrsponding class
print_target_names = [print(x, iris['target_names'][x]) for x in range(0,3)]
# Show previously constructed df information using the describe method
df.describe()

0 setosa
1 versicolor
2 virginica


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333,1.0
std,0.828066,0.435866,1.765298,0.762238,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


# Part 2: Split the dataset into train and test

In [5]:
# Take the dataset and split it into our features (X) and label (y)

# Use sklearn to split the features and labels into a training/test set. (90% train, 10% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10)

# Part 3: Logistic Regression

In [6]:
# i. Use sklearn to train a LogisticRegression model on the training set
logistic_reg = LogisticRegression(max_iter=300)
logistic_reg.fit(X_train, y_train.values.ravel())

LogisticRegression(max_iter=300)

In [7]:
# ii. For a sample datapoint, predict the probabilities for each possible class
logistic_probabilities = pd.DataFrame(logistic_reg.predict_proba(X_test), columns=iris['target_names'])
logistic_probabilities

Unnamed: 0,setosa,versicolor,virginica
0,0.00126,0.419632,0.5791075
1,0.005961,0.924646,0.0693927
2,0.000142,0.157974,0.8418837
3,0.011538,0.754691,0.2337712
4,0.985348,0.014652,2.102463e-08
5,0.962074,0.037926,1.002e-07
6,0.954363,0.045637,1.220263e-07
7,0.000732,0.431375,0.5678932
8,6e-06,0.017853,0.9821407
9,0.003203,0.912115,0.08468276


In [8]:
# iii. Report on the score for Logistic regression model, what does the score measure?
logistic_reg.score(X_test, y_test) 

1.0

In [9]:
# iv. Extract the coefficents and intercepts for the boundary line(s)
print("Coeff:\t", logistic_reg.coef_, end="\n\n")
print("Intercept:\t",logistic_reg.intercept_)

Coeff:	 [[-0.41951467  0.93638026 -2.43359806 -1.05279545]
 [ 0.53214964 -0.33185085 -0.22767761 -0.79000683]
 [-0.11263497 -0.60452942  2.66127567  1.84280227]]

Intercept:	 [  9.59016141   2.11571737 -11.70587877]


# Part 4: Support Vector Machine

In [10]:
# i. Use sklearn to train a Support Vector Classifier on the training set
SVM = SVC(kernel = 'linear', probability=True)
SVM.fit(X_train, y_train.values.ravel())

SVC(kernel='linear', probability=True)

In [11]:
# ii. For a sample datapoint, predict the probabilities for each possible class
SVM_probabilities = pd.DataFrame(SVM.predict_proba(X_test), columns=iris['target_names'])
SVM_probabilities

Unnamed: 0,setosa,versicolor,virginica
0,0.016866,0.396382,0.586753
1,0.007576,0.983088,0.009336
2,0.013018,0.094407,0.892575
3,0.011579,0.859233,0.129187
4,0.976491,0.014734,0.008775
5,0.957054,0.029658,0.013288
6,0.95915,0.027192,0.013658
7,0.016422,0.313062,0.670517
8,0.005607,0.00297,0.991423
9,0.005549,0.945263,0.049188


In [12]:
# iii. Report on the score for the SVM, what does the score measure?
SVM.score(X_test, y_test)

1.0

# Part 5: Neural Network

In [13]:
# i. Use sklearn to train a Neural Network (MLP Classifier) on the training set
MLP_Classifier = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(3, 3))
MLP_Classifier.fit(X_train, y_train.values.ravel())

MLPClassifier(hidden_layer_sizes=(3, 3), solver='lbfgs')

In [14]:
# ii. For a sample datapoint, predict the probabilities for each possible class
ML_Probalities = pd.DataFrame(MLP_Classifier.predict_proba(X_test), columns=iris['target_names'])
ML_Probalities

Unnamed: 0,setosa,versicolor,virginica
0,2.1157840000000002e-55,0.2647675,0.7352325
1,7.404321e-27,0.9999914,8.634397e-06
2,1.506299e-73,0.0003536866,0.9996463
3,1.052592e-40,0.9954219,0.004578142
4,0.999983,1.7032e-05,3.706461e-14
5,0.999983,1.7032e-05,3.706461e-14
6,0.999983,1.7032e-05,3.706461e-14
7,1.00358e-58,0.08983464,0.9101654
8,1.000026e-93,1.694298e-07,0.9999998
9,3.474118e-36,0.9994071,0.0005928608


In [15]:
# iii. Report on the score for the Neural Network, what does the score measure?
MLP_Classifier.score(X_test, y_test)

1.0

In [16]:
# iv: Experiment with different options for the neural network, report on your best configuration (the highest score I was able to achieve was 0.8666)

# Part 6: K-Nearest Neighbors

In [17]:
# i. Use sklearn to 'train' a k-Neighbors Classifier
# Note: KNN is a nonparametric model and technically doesn't require training
# fit will essentially load the data into the model see link below for more information
# https://stats.stackexchange.com/questions/349842/why-do-we-need-to-fit-a-k-nearest-neighbors-classifier
KNN_Model = KNeighborsClassifier(n_neighbors=3)
KNN_Model.fit(X, y.values.ravel())

KNeighborsClassifier(n_neighbors=3)

In [18]:
# ii. For a sample datapoint, predict the probabilities for each possible class
KNN_Probabilities = pd.DataFrame(KNN_Model.predict_proba(X_train), columns=iris['target_names'])
KNN_Probabilities

Unnamed: 0,setosa,versicolor,virginica
0,0.0,0.0,1.0
1,0.0,0.0,1.0
2,1.0,0.0,0.0
3,0.0,0.0,1.0
4,1.0,0.0,0.0
...,...,...,...
130,0.0,0.0,1.0
131,0.0,1.0,0.0
132,0.0,1.0,0.0
133,1.0,0.0,0.0


In [19]:
# iii. Report on the score for kNN, what does the score measure?
KNN_Model.score(X_test, y_test)

1.0

# Part 7: Conclusions and takeaways

In your own words describe the results of the notebook. Which model(s) performed the best on the dataset? Why do you think that is? Did anything surprise you about the exercise?