In [1]:
import pandas as pd
import numpy as np
import matplotlib as mplt
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# DATA LOADING & EXPLORATION.

In [2]:
from sklearn.datasets import load_iris # <-- Import dataset from SKLEARN.

dataset = load_iris() # <-- Load dataset.

In [3]:
dataset

 'data': array([[ 5.1,  3.5,  1.4,  0.2],
        [ 4.9,  3. ,  1.4,  0.2],
        [ 4.7,  3.2,  1.3,  0.2],
        [ 4.6,  3.1,  1.5,  0.2],
        [ 5. ,  3.6,  1.4,  0.2],
        [ 5.4,  3.9,  1.7,  0.4],
        [ 4.6,  3.4,  1.4,  0.3],
        [ 5. ,  3.4,  1.5,  0.2],
        [ 4.4,  2.9,  1.4,  0.2],
        [ 4.9,  3.1,  1.5,  0.1],
        [ 5.4,  3.7,  1.5,  0.2],
        [ 4.8,  3.4,  1.6,  0.2],
        [ 4.8,  3. ,  1.4,  0.1],
        [ 4.3,  3. ,  1.1,  0.1],
        [ 5.8,  4. ,  1.2,  0.2],
        [ 5.7,  4.4,  1.5,  0.4],
        [ 5.4,  3.9,  1.3,  0.4],
        [ 5.1,  3.5,  1.4,  0.3],
        [ 5.7,  3.8,  1.7,  0.3],
        [ 5.1,  3.8,  1.5,  0.3],
        [ 5.4,  3.4,  1.7,  0.2],
        [ 5.1,  3.7,  1.5,  0.4],
        [ 4.6,  3.6,  1. ,  0.2],
        [ 5.1,  3.3,  1.7,  0.5],
        [ 4.8,  3.4,  1.9,  0.2],
        [ 5. ,  3. ,  1.6,  0.2],
        [ 5. ,  3.4,  1.6,  0.4],
        [ 5.2,  3.5,  1.5,  0.2],
        [ 5.2,  3.4,  1.4,  0.2],
      

In [4]:
features = dataset.data # <-- Extract features matrix.
target = dataset.target # <-- Extract target vector.
target_names = dataset.target_names # <-- Extract names of the species.
feature_names = dataset.feature_names # <-- Extract names of the features.

In [5]:
target[:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [6]:
target.shape

(150,)

In [7]:
features[:10]

array([[ 5.1,  3.5,  1.4,  0.2],
       [ 4.9,  3. ,  1.4,  0.2],
       [ 4.7,  3.2,  1.3,  0.2],
       [ 4.6,  3.1,  1.5,  0.2],
       [ 5. ,  3.6,  1.4,  0.2],
       [ 5.4,  3.9,  1.7,  0.4],
       [ 4.6,  3.4,  1.4,  0.3],
       [ 5. ,  3.4,  1.5,  0.2],
       [ 4.4,  2.9,  1.4,  0.2],
       [ 4.9,  3.1,  1.5,  0.1]])

In [8]:
# - Dataset has 150 rows/samples & 4 columns/features.
features.shape

(150, 4)

In [9]:
feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

# DATA WRANGLING.

In [10]:
# - Create a DataFrame object of the dataset.
iris_dataset = pd.DataFrame(features, columns = feature_names) # <-- Create DataFrame using the 'features' matrix & column names using the 'feature_names' array variable.
iris_dataset['species'] = target # <-- Add another column called 'SPECIES' to be used as the target vector.

iris_dataset.head() # <-- View the first 5 rows of the dataset.

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [11]:
# - Split the dataset by separating the feautures from the targets by dropping the 'species' column.
X_iris = iris_dataset.drop('species', axis = 1)
X_iris.shape

(150, 4)

In [12]:
# - Save the 'species' column in a variable to use as the target vector.
y_iris = iris_dataset['species']
y_iris.shape

(150,)

# MODEL TRAINING & EVALUATION.

In [13]:
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import train_test_split, cross_val_score, KFold
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from sklearn import metrics

In [14]:
# Split the dataset into TRAINING (80%) and TESTING (20%) for both the features matrix and the target vector.
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state = 1)

## 1. Gaussian Naive Bayes (97%).

In [15]:
naive_bayes_model = GaussianNB()
naive_bayes_model.fit(X_train, y_train)
naive_bayes_model_predictions = naive_bayes_model.predict(X_test)
print("Accuracy Score of Gaussian Naive Bayes Model: ", accuracy_score(y_test, naive_bayes_model_predictions))

Accuracy Score of Gaussian Naive Bayes Model:  0.973684210526


## 2. K-Nearest Neighbors (100%).

In [16]:
k_nearest_neighbors_model = KNeighborsClassifier()
k_nearest_neighbors_model.fit(X_train, y_train)
k_nearest_neighbors_model_predictions = k_nearest_neighbors_model.predict(X_test)
print('Accuracy Score of K-Nearest Neighbors Model: ', accuracy_score(y_test, k_nearest_neighbors_model_predictions))

Accuracy Score of K-Nearest Neighbors Model:  1.0


# FEATURE REDUCTION & EXTRACTION.

In [17]:
# - Reduce features to 2 most appropriate features using the StandardScaler & Principal Component Analysis (PCA).
standardized_features = StandardScaler().fit_transform(X_train)

pca_model = PCA(n_components = 2) # <-- Reduce to 2 features.
pca_features = pca_model.fit_transform(standardized_features)

print('Original Number of Features: ', standardized_features.shape[1])
print('Reduced Number of Features: ', pca_features.shape[1])

Original Number of Features:  4
Reduced Number of Features:  2


In [18]:
standardizer = StandardScaler() # <-- For feature reduction.
standardizer.fit(X_train)

X_train_standardized_features = standardizer.transform(X_train) # <-- Reducing features on the TRAINING dataset.
X_test_standardized_features = standardizer.transform(X_test) # <-- Reducing features on the TESTING dataset.

# - Creating a Logistic Regression object.
logistic_regression = LogisticRegression() 


# - Creating a pipeline that both standardizes the dataset and trains the model using the Logistic Regressor.
pipeline = make_pipeline(standardizer, logistic_regression)

# - Create K-Fold cross validation.
kfold = KFold(n = 10, shuffle = True, random_state = 1)

# - Conduct K-Fold cross validation.
cross_validation_results = cross_val_score(pipeline, X_train, y_train, cv = kfold, scoring = 'accuracy', n_jobs = -1)

# - Calculate mean of the K-Fold cross validation results. In this case, mean of the 10 K-Folds cross validation results.
cross_validation_results.mean()

0.80555555555555547