In [4]:
#Load the iris dataset that ships with scikit-learn's datasets module
from sklearn.datasets import load_iris

iris_dataset = load_iris()

In [5]:
#load_iris returns a Bunch object, which behaves much like a dictionary:
#its contents are stored as key/value pairs, so start by listing the keys
dataset_keys = iris_dataset.keys()
print("Keys of iris_dataset: \n{}".format(dataset_keys))

Keys of iris_dataset: 
dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])


In [6]:
#The DESCR key holds a short, human-readable description of the dataset
dataset_description = iris_dataset['DESCR']
print(dataset_description)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [7]:
#target_names is an array of strings naming the flower species we want to predict
species_names = iris_dataset['target_names']
print("Target names: {}".format(species_names))

Target names: ['setosa' 'versicolor' 'virginica']


In [8]:
#feature_names is a list of strings, one describing each feature
print(
    "Feature names: \n{}".format(iris_dataset['feature_names'])
)

Feature names: 
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [9]:
#The dataset proper lives in the data and target fields.
#data holds the numeric measurements (sepal length, sepal width,
#petal length, petal width) as a NumPy array:
data_type = type(iris_dataset['data'])
print("Type of data: {}".format(data_type))

Type of data: <class 'numpy.ndarray'>


In [10]:
#Each row of the data array is one flower; each column is one of the
#four measurements taken for that flower
data_shape = iris_dataset['data'].shape
print("Shape of data: {}".format(data_shape))

Shape of data: (150, 4)


In [11]:
#Here are the feature values for the first five samples:
print("First five columns of data:\n{}".format(iris_dataset['data'][:5]))

First five columns of data:
[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]


In [12]:
#target is a 1-D array
target_shape = iris_dataset['target'].shape
print("Shape of target: {}".format(target_shape))

Shape of target: (150,)


In [13]:
#Each species is stored as an integer label between 0 and 2:
#0 -> setosa, 1 -> versicolor, 2 -> virginica
species_labels = iris_dataset['target']
print("Target:\n{}".format(species_labels))

Target:
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


In [14]:
#Split the samples and their labels into a training part and a test part,
#using the conventional X_/y_ naming for features and labels.
#random_state=0 fixes the shuffle so the split is reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    iris_dataset['data'],
    iris_dataset['target'],
    random_state=0,
)

In [15]:
#Report the sizes of the training features and training labels, one per line
print(
    "X_train shape: {}".format(X_train.shape),
    "y_train shape: {}".format(y_train.shape),
    sep="\n",
)

X_train shape: (112, 4)
y_train shape: (112,)


In [16]:
#Report the sizes of the test features and test labels, one per line
print(
    "X_test shape: {}".format(X_test.shape),
    "y_test shape: {}".format(y_test.shape),
    sep="\n",
)

X_test shape: (38, 4)
y_test shape: (38,)


In [22]:
#Model building:
#use a k-nearest-neighbours classifier that considers only the
#single closest training point (n_neighbors=1)
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(
    n_neighbors=1,
)

In [23]:
#Fit the classifier on the training features and labels.
#The call is left as the cell's last expression so its result is displayed.
knn.fit(X=X_train, y=y_train)

KNeighborsClassifier(n_neighbors=1)

In [24]:
#Evaluate the fitted model on the held-out test data with score
test_accuracy = knn.score(X_test, y_test)
print("Test set score: {:.2f}".format(test_accuracy))
#i.e. the model is about 97 percent accurate on the test set

Test set score: 0.97


In [26]:
import numpy as np

#Predict the species of one new flower.
#The measurements are wrapped in a 2-D array with a single row of four values
X_new = np.array([[5, 2.9, 1, 0.2]])
print("X_new.shape: {}".format(X_new.shape))

#predict returns an array of integer labels, one per input row
prediction = knn.predict(X_new)
print("Prediction: {}".format(prediction))

#Index target_names with the predicted labels to get the species name
predicted_names = iris_dataset['target_names'][prediction]
print("Predicted target name: {}".format(predicted_names))

X_new.shape: (1, 4)
Prediction: [0]
Predicted target name: ['setosa']
