In [1]:
# Imports and pip installations (if needed)
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import datasets
from sklearn import metrics 
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score

# Part 1: Load the dataset

In [2]:
# Load the dataset (load remotely, not locally)
dataset = pd.read_csv("https://gist.githubusercontent.com/curran/a08a1080b88344b0c8a7/raw/0e7a9b0a5d22642a06d3d5b9bcbad9890c8ee534/iris.csv")
# Output the first 15 rows of the data
dataset.head(15)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
6,4.6,3.4,1.4,0.3,setosa
7,5.0,3.4,1.5,0.2,setosa
8,4.4,2.9,1.4,0.2,setosa
9,4.9,3.1,1.5,0.1,setosa


In [3]:
# Display a summary of the table information (number of datapoints, etc.)
df = pd.DataFrame(dataset)
print("Summary of the basic information about this DataFrame and its data:")
display(df.describe())
print(df.info())

Summary of the basic information about this DataFrame and its data:


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB
None


In [4]:
print("List of species in the Iris dataset and their count:")
df['species'].value_counts()

List of species in the Iris dataset and their count:


setosa        50
versicolor    50
virginica     50
Name: species, dtype: int64

## About the dataset
Explain what the data is in your own words. What are your features and labels? What is the mapping of your labels to the actual classes?

The dataset is comprised of 150 different flowers of 3 species. The flowers being setosa, versicolor and virginica. The data takes in the sepal length, sepal width, petal length and petal width for each flower and classifies the data to its respective flower.

My features are the length and width while my label is the species. 

The classes or columns 0, 1, and 2 that can be seen in my probability report, represent the species setosa, versicolor and virginica flower respectivly. 

# Part 2: Split the dataset into train and test

In [5]:
train, test =train_test_split(df, test_size=.10, random_state=42)
print(train.shape)
print(test.shape)

(135, 5)
(15, 5)


In [6]:
# Take the dataset and split it into our features (X) and label (y)
train_X = train[['sepal_length','sepal_width','petal_length','petal_width']] 
train_y = train.species

test_X = test[['sepal_length','sepal_width','petal_length','petal_width']]
test_y = test.species 

In [7]:
train_X.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
56,6.3,3.3,4.7,1.6
104,6.5,3.0,5.8,2.2
69,5.6,2.5,3.9,1.1
55,5.7,2.8,4.5,1.3
132,6.4,2.8,5.6,2.2


In [8]:
 test_X.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
73,6.1,2.8,4.7,1.2
18,5.7,3.8,1.7,0.3
118,7.7,2.6,6.9,2.3
78,6.0,2.9,4.5,1.5
76,6.8,2.8,4.8,1.4


In [9]:
train_y.head()

56     versicolor
104     virginica
69     versicolor
55     versicolor
132     virginica
Name: species, dtype: object

In [10]:
test_y.head()

73     versicolor
18         setosa
118     virginica
78     versicolor
76     versicolor
Name: species, dtype: object

# Part 3: Logistic Regression

In [11]:
# i. Use sklearn to train a LogisticRegression model on the training set
model = LogisticRegression(solver='lbfgs', max_iter=300)
model.fit(train_X.values, train_y.values)

LogisticRegression(max_iter=300)

In [12]:
# ii. For a sample datapoint, predict the probabilities for each possible class
model.predict(np.array([[5, 3, 1, 0]]))

array(['setosa'], dtype=object)

In [13]:
model.predict_proba(np.array([[5, 3, 1, 0]]))

array([[9.86236096e-01, 1.37638998e-02, 4.12425948e-09]])

The model has a 98% chance of being in class 0 with my sample data points, which is a Setosa flower. My other prediction score labeled the flower as a Setosa. 

In [14]:
# iii. Report on the score for Logistic regression model, what does the score measure?
model.score(train_X.values, train_y.values)

0.9777777777777777

In [15]:
# iv. Extract the coefficents and intercepts for the boundary line(s)
print('The coefficents for the Logistic Regression is: ', model.coef_)
print('The intercepts for the Logistic Regression are: ',model.intercept_)

The coefficents for the Logistic Regression is:  [[-0.42636918  0.96639438 -2.44675782 -1.04043474]
 [ 0.51242433 -0.21926281 -0.2141773  -0.8456555 ]
 [-0.08605515 -0.74713156  2.66093512  1.88609023]]
The intercepts for the Logistic Regression are:  [  9.53420938   1.89278634 -11.42699572]


# Part 4: Support Vector Machine

In [16]:
# i. Use sklearn to train a Support Vector Classifier on the training set
model = svm.SVC(probability=True) 
model.fit(train_X.values, train_y.values)

SVC(probability=True)

In [17]:
# ii. For a sample datapoint, predict the probabilities for each possible class
model.predict([[6, 3, 5, 2]])

array(['virginica'], dtype=object)

In [18]:
model.predict_proba([[6, 3, 5, 2]])

array([[0.01023569, 0.13456991, 0.8551944 ]])

The model has a 85% chance of being in class 2 with my sample data points, which is a virginica flower. My other prediction score labeled the flower as a virginica.

In [19]:
# iii. Report on the score for the SVM, what does the score measure?
model.score(train_X.values, train_y.values)

0.9703703703703703

# Part 5: Neural Network

In [20]:
# i. Use sklearn to train a Neural Network (MLP Classifier) on the training set
model = MLPClassifier(max_iter=1000) 
model.fit(train_X.values, train_y.values)

MLPClassifier(max_iter=1000)

In [21]:
# ii. For a sample datapoint, predict the probabilities for each possible class
model.predict([[5, 3, 4, 1]])

array(['versicolor'], dtype='<U10')

In [22]:
model.predict_proba([[5, 3, 4, 1]])

array([[0.00664948, 0.99031164, 0.00303888]])

The model has a 98% chance of being in class 1 with my sample data points, which is a versicolor flower. My other prediction score labeled the flower as a versicolor.

In [23]:
# iii. Report on the score for the Neural Network, what does the score measure?
model.score(train_X.values, train_y.values)

0.9851851851851852

In [24]:
# iv: Experiment with different options for the neural network, report on your best configuration (the highest score I was able to achieve was 0.8666)

After experimenting with several different options for the neural network, the highest confidence score that I was able to get was 98%.

# Part 6: K-Nearest Neighbors

In [25]:
# i. Use sklearn to 'train' a k-Neighbors Classifier
# Note: KNN is a nonparametric model and technically doesn't require training
# fit will essentially load the data into the model see link below for more information
# https://stats.stackexchange.com/questions/349842/why-do-we-need-to-fit-a-k-nearest-neighbors-classifier
model = KNeighborsClassifier() 
model.fit(train_X.values, train_y)

KNeighborsClassifier()

In [26]:
# ii. For a sample datapoint, predict the probabilities for each possible class
model.predict([[0, 4, 5, 2]])

array(['versicolor'], dtype=object)

In [27]:
model.predict_proba([[0, 4, 5, 2]])

array([[0. , 0.8, 0.2]])

The model has a 80% chance of being in class 1 with my sample data points, which is a versicolor flower. My other prediction score labeled the flower as a versicolor.

In [28]:
# iii. Report on the score for kNN, what does the score measure?
model.score(train_X.values, train_y)

0.9703703703703703

# Part 7: Conclusions and takeaways

In your own words describe the results of the notebook. Which model(s) performed the best on the dataset? Why do you think that is? Did anything surprise you about the exercise?

Each of the models accurately predicted the correct flower. I attempted either numbers close to the values in the dataset or the actual numbers in the dataset, and got the correct species of flower returned. I used two different prediction models as well and got the same results for each prediction model. 

The Neural Network model peformed the best with a score of 99% accuracy. Both the Logistical Regression and the Neural Network performed the best out of all of the models. If we were to look at the number of times I ran the notebook, both scores competed with the highest accuracy scores, with Neural Network performing the best.

I think this is the case because Neural Networks have a large number of free parameters which gives them the flexibility to fit highly complex data.

What surprised me the most was how close the results of the Logistical Regression and the Neural Network performed.