In [3]:
# Imports section
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

# Part 1: Load the dataset

In [4]:
# Load the dataset (load remotely, not locally)
iris = datasets.load_iris()
df = pd.DataFrame(iris.data, columns = iris.feature_names)

# Output the first 15 rows of the data
df.head(15)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
5,5.4,3.9,1.7,0.4
6,4.6,3.4,1.4,0.3
7,5.0,3.4,1.5,0.2
8,4.4,2.9,1.4,0.2
9,4.9,3.1,1.5,0.1


In [5]:
# Display a summary of the table information (number of datapoints, etc.)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
dtypes: float64(4)
memory usage: 4.8 KB


In [6]:
df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [7]:
print(iris['DESCR'])

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [8]:
# names of the four features
iris.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [9]:
# integers representing the species of each observation
iris.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [10]:
# encoding scheme for species: 0 = setosa, 1 = versocolor, 2 = virginica
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

## About the dataset
Explain what the data is in your own words. What are your features and labels? What is the mapping of your labels to the actual classes?

- There are 150 samples/rows in the dataset. None of them has null values. There are four attributes/features. The attributes are sepal length, sepal width, petal length, and petal width, respectively. They are all in centimeters. There’s also a column called target. The label for this assignment is the target. The label is split into 3 subclasses with numeric values - 0 for Iris-Setosa, 1 for Iris-Versicolour, and 2 for Iris-Virginica.

# Part 2: Split the dataset into train and test

In [11]:
# Take the dataset and split it into our features (X) and label (y)
features_x = iris.data
label_y = iris.target

# Use sklearn to split the features and labels into a training/test set. (90% train, 10% test)
x_train, x_test, y_train, y_test = train_test_split(features_x, label_y, train_size = 0.9, test_size = 0.1, random_state=42)
y_train

array([1, 2, 1, 1, 2, 0, 2, 0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, 0, 0, 2, 1,
       0, 0, 0, 2, 1, 1, 0, 0, 1, 2, 2, 1, 2, 1, 2, 1, 0, 2, 1, 0, 0, 0,
       1, 2, 0, 0, 0, 1, 0, 1, 2, 0, 1, 2, 0, 2, 2, 1, 1, 2, 1, 0, 1, 2,
       0, 0, 1, 1, 0, 2, 0, 0, 1, 1, 2, 1, 2, 2, 1, 0, 0, 2, 2, 0, 0, 0,
       1, 2, 0, 2, 2, 0, 1, 1, 2, 1, 2, 0, 2, 1, 2, 1, 1, 1, 0, 1, 1, 0,
       1, 2, 2, 0, 1, 2, 2, 0, 2, 0, 1, 2, 2, 1, 2, 1, 1, 2, 2, 0, 1, 2,
       0, 1, 2])

# Part 3: Logistic Regression

In [12]:
# i. Use sklearn to train a LogisticRegression model on the training set
logreg = LogisticRegression()
logreg.fit(x_train, y_train)

# ii. For a sample datapoint, predict the probabilities for each possible class
predictions = logreg.predict_proba(x_test[:1])
print("Predictions: ", predictions)

Predictions:  [[0.00345824 0.83719406 0.1593477 ]]


In [13]:
# iii. Report on the score for Logistic regression model, what does the score measure?
print(logreg.score(x_test, y_test))

1.0


#### What does the score measure?
- The score of our Logistic Regression Model is 1.0 (100%). It means that the model will always predict the right class of iris.

In [14]:
# iv. Extract the coefficents for the boundary line(s)
logreg.coef_

array([[-0.42671215,  0.97262703, -2.44459812, -1.03197327],
       [ 0.51311126, -0.22370085, -0.21512542, -0.85154854],
       [-0.08639911, -0.74892618,  2.65972354,  1.88352181]])

In [15]:
# iv. Extract the intercepts for the boundary line(s)
logreg.intercept_

array([  9.50102574,   1.91208599, -11.41311173])

# Part 4: Support Vector Machine

In [16]:
# i. Use sklearn to train a Support Vector Classifier on the training set
vm = svm.SVC(probability=True)
vm.fit(x_train, y_train)
# ii. For a sample datapoint, predict the probabilities for each possible class
predictions = vm.predict_proba(x_test[:1])
print("Predictions: ", predictions)

Predictions:  [[0.00726481 0.89442773 0.09830746]]


In [17]:
# iii. Report on the score for the SVM, what does the score measure?
vm.score(x_test, y_test)

1.0

#### What does the score measure?
- The score of our Support Vector Machine Model is 1.0 (100%). It means that the model will always predict the right class of iris.

# Part 5: Neural Network

In [72]:
# i. Use sklearn to train a Neural Network (MLP Classifier) on the training set
nn = MLPClassifier(hidden_layer_sizes=(12, 6), max_iter=1000)
nn.fit(x_train, y_train)

# ii. For a sample datapoint, predict the probabilities for each possible class
predictions = nn.predict_proba(x_test[:1])
print("Predictions: ", predictions)

Predictions:  [[0.00106688 0.96002677 0.03890635]]


In [73]:
# iii. Report on the score for the Neural Network, what does the score measure?
nn.score(x_test, y_test)

1.0

#### What does the score measure?
- The score of our Neural Network model is 1.0 (100%). It means that the model will always predict the right class of iris.

### iv: Experiment with different options for the neural network, report on your best configuration (the highest score I was able to achieve was 0.8666)


In [74]:
# Quasi-Newton Method
nn = MLPClassifier(solver = 'lbfgs', hidden_layer_sizes=(12, 6), max_iter=1000)
nn.fit(x_train, y_train)

# ii. For a sample datapoint, predict the probabilities for each possible class
predictions = nn.predict_proba(x_test[:1])
print("Predictions: ", predictions)

Predictions:  [[0.00000000e+000 1.00000000e+000 3.30890683e-298]]


In [75]:
# iii. Report on the score for the Neural Network
nn.score(x_test, y_test)

0.9333333333333333

In [76]:
# Stochastic Gradient Descent
nn = MLPClassifier(solver = 'sgd', hidden_layer_sizes=(12, 6), max_iter=1000)
nn.fit(x_train, y_train)

# ii. For a sample datapoint, predict the probabilities for each possible class
predictions = nn.predict_proba(x_test[:1])
print("Predictions: ", predictions)

Predictions:  [[0.2813378  0.40240653 0.31625567]]


In [77]:
nn.score(x_test, y_test)

0.4

# Report on your best configuration.
- I kept all the configurations the same except for the solvers. For the first configuration, the solver is adam, which is the default, and the score I got is 1.0. "adam" refers to a stochastic gradient-based optimizer. This is the highest score I was able to achieve.
- Quasi-Newton Method (lbfgs) took second place with a score of 0.93.
- Stochastic Gradient Descent (sgd) took last place with a score of 0.40.

# Part 6: K-Nearest Neighbors

In [87]:
# i. Use sklearn to 'train' a k-Neighbors Classifier
# Note: KNN is a nonparametric model and technically doesn't require training
# fit will essentially load the data into the model see link below for more information
# https://stats.stackexchange.com/questions/349842/why-do-we-need-to-fit-a-k-nearest-neighbors-classifier
k_neighbors = KNeighborsClassifier(n_neighbors=5)
k_neighbors.fit(x_train, y_train)

KNeighborsClassifier()

In [88]:
# ii. For a sample datapoint, predict the probabilities for each possible class
k_neighbors.predict_proba(x_test[:1])

array([[0., 1., 0.]])

In [89]:
# iii. Report on the score for kNN, what does the score measure?
k_neighbors.score(x_test, y_test)

1.0

#### What does the score measure?
- The score of our K-Nearest Neighbors model (with n_neighbors = 15) is 1.0 (100%). It means that the model will always predict the right class of iris.

# Part 7: Conclusions and takeaways

In your own words describe the results of the notebook. Which model(s) performed the best on the dataset? Why do you think that is? Did anything surprise you about the exercise?

- In this assignment, we have to use 4 different Classifiers to analyze which type gives the best score. The 4 types of Classifiers are Logistic Regression, Support Vector Machine, Neural Network, and K-Nearest Neighbors. To my surprise, all 4 models give a score of 1. It could be due to the dataset. Another very surprising thing is that K-Nearest Neighbors is giving 100% probability for each class. It is 100% certain that my sample data point is in class 1 - Versicolor. K-Nearest Neighbors can predict with 100% probability is because it is analyzing if features are in range for a particular class.