In [2]:
import sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Iris Dataset

In [3]:
# Fetch scikit-learn's bundled Iris dataset and list the fields it exposes.
from sklearn import datasets

iris = datasets.load_iris()
# Iterating the returned dict-like object yields its keys.
list(iris)

['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename']

In [4]:
# Wrap the raw measurement matrix in a DataFrame, labelling each column with its feature name.
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)

In [5]:
# Preview the first rows of the feature table.
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [6]:
# Attach a categorical 'species' column by decoding each integer target
# into the corresponding entry of iris.target_names.
iris_df['species'] = pd.Categorical.from_codes(
    codes=iris.target,
    categories=iris.target_names,
)

iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [10]:
# Feature matrix: the first four columns, i.e. the measurements that precede 'species'.
X = iris_df.loc[:, iris_df.columns[:4]]

# Integer class labels; factorize() numbers each species in order of first appearance
# and returns (codes, uniques) -- only the codes are kept.
y = iris_df['species'].factorize()[0]

y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [13]:
from sklearn.neighbors import KNeighborsClassifier

# Build a 3-nearest-neighbour classifier and train it on the training split.
# fit() returns the estimator itself, so the result can be bound directly.
neigh = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

# Display the fitted estimator's configuration.
neigh

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [14]:
# Predict a class label for every sample in the held-out test split.
y_pred = neigh.predict(X_test)

In [16]:
from sklearn import metrics

# Confusion matrix of true labels (rows) against predicted labels (columns).
cnf_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
cnf_matrix

array([[11,  0,  0],
       [ 0, 12,  1],
       [ 0,  0,  6]])

In [17]:
# Score the predictions; precision and recall are macro-averaged,
# i.e. computed per class and then averaged with equal weight.
accuracy = metrics.accuracy_score(y_test, y_pred)
macro_precision = metrics.precision_score(y_test, y_pred, average='macro')
macro_recall = metrics.recall_score(y_test, y_pred, average='macro')

print("Accuracy:", accuracy)
print("Precision:", macro_precision)
print("Recall:", macro_recall)

Accuracy: 0.9666666666666667
Precision: 0.9523809523809524
Recall: 0.9743589743589745


## MTCARS Dataset

In [18]:
# Relative path of the mtcars CSV file that the next cell loads.
MTCARS_LOCATION = "datasets/mtcars.csv"

In [19]:
# Read the mtcars table into a DataFrame.
mtcars = pd.read_csv(MTCARS_LOCATION)

In [20]:
# Column labels of mtcars as an array, with the entries at positions 0 and 1
# ('Unnamed: 0' and 'Unnamed: 1') removed so that only feature columns remain.
features = np.delete(np.array(mtcars.columns), [0, 1])

In [22]:
# Build a frame from mtcars restricted to the columns listed in `features`.
cars = pd.DataFrame(mtcars, columns=features)

In [23]:
# Copy the 'Unnamed: 0' column of the raw table into `cars` under the name 'make';
# it is used as the classification target in the cells that follow.
cars['make'] = mtcars['Unnamed: 0']

In [24]:
# Encode each distinct car make as an integer label, numbered in order of
# first appearance; factorize() returns (codes, uniques) and only the codes are kept.
y = cars['make'].factorize()[0]

y

array([ 0,  1,  1,  2,  3,  4,  4,  2,  5,  6,  7,  8,  8,  9, 10,  1,  1,
       11,  9, 12,  1,  1,  1, 13, 14, 15, 16, 17, 18, 19, 20, 21])

In [25]:
# Feature matrix: only the columns named in `features`, which leaves out the 'make' label column.
X = cars[features]

In [26]:
# Hold out 20% of the rows for evaluation, with a fixed seed for a repeatable split.
# NOTE(review): the label array printed above has 22 distinct values across 32 rows,
# so many labels occur only once and may end up in the test split alone -- confirm
# that 'make' is the intended prediction target.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20 ,random_state=0)

In [27]:
# Train a fresh 3-nearest-neighbour classifier on the mtcars training split.
# fit() returns the estimator itself, so the result can be bound directly.
neigh = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

# Display the fitted estimator's configuration.
neigh

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [28]:
# Predict a make label for every row of the held-out test split.
y_pred = neigh.predict(X_test)

In [29]:
# Confusion matrix of true labels (rows) against predicted labels (columns).
cnf_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
cnf_matrix

array([[0, 0, 0, 0, 0, 0, 0, 0],
       [0, 2, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0]])

In [30]:
# Score the mtcars predictions with macro-averaged precision and recall.
# Some labels are never predicted and some predicted labels never occur in y_test,
# so their per-class precision/recall is a 0/0 division. zero_division=0 scores
# those classes as 0 -- the same value scikit-learn falls back to by default --
# but without emitting an UndefinedMetricWarning for each call.
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("Precision:", metrics.precision_score(y_test, y_pred, average='macro', zero_division=0))
print("Recall:", metrics.recall_score(y_test, y_pred, average='macro', zero_division=0))

Accuracy: 0.2857142857142857
Precision: 0.0625
Recall: 0.08333333333333333


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
