In [2]:
# Import the packages required
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model, metrics
from sklearn.model_selection import train_test_split

In [3]:
# load the data from the datasets available in scikit learn
dataset = datasets.load_iris()
print(type(dataset))

<class 'sklearn.utils.Bunch'>


In [4]:
# print the data structure information
print(dataset.keys())
print(dataset.data.shape)
print(dataset.feature_names)
print(dataset.target_names)
print(dataset.DESCR)

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])
(150, 4)
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
['setosa' 'versicolor' 'virginica']
.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1

In [5]:
# select the x and y data
x = dataset.data
y = dataset.target
print(x.shape, y.shape)
print(x[0], y[0])

(150, 4) (150,)
[5.1 3.5 1.4 0.2] 0


In [6]:
# Split the dataset in training data and testing data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(120, 4) (120,)
(30, 4) (30,)


In [7]:
# Define the algorithm to be used, fit the model and generate a prediction
alg = linear_model.LinearRegression()
alg.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [8]:
# Generate a prediction over the test data using the model
y_pred = alg.predict(x_test)
print(y_pred.shape)
print(y_pred[:10])
print(y_test[:10])

(30,)
[1.1978288  1.92512183 0.07221077 0.1235969  0.88009456 0.10603513
 2.2283338  2.04972986 1.48711527 0.94565793]
[1 2 0 0 1 0 2 2 2 1]


In [9]:
# Map the predicted linear values to discrete values
y_pred_round = np.round(y_pred).astype(int)
print(y_pred_round[:10])
print(np.unique(y_pred_round))

# Fix the values outside te valid classes
min_y, max_y = np.min(y_train), np.max(y_train)
fixer = lambda i : min_y if i < min_y else max_y if i > max_y else i
y_pred_fix = np.array([i for i in map(fixer, y_pred_round)])
print(y_pred_fix[:10])
print(np.unique(y_pred_fix))

[1 2 0 0 1 0 2 2 1 1]
[0 1 2]
[1 2 0 0 1 0 2 2 1 1]
[0 1 2]


In [10]:
# Validate, using the confusion matrix
matrix = metrics.confusion_matrix(y_test, y_pred_fix)
print(matrix)

[[ 6  0  0]
 [ 0 11  0]
 [ 0  1 12]]


In [12]:
# Verify the model error based on R²
print('certainty:', alg.score(x_train, y_train) * 100, '%')

certainty: 93.16445903377857 %
