# KNN

En este ejercicio vamos a practicar con la implementación de KNN de scikit-learn en Python.

El set de datos es Iris, uno de los más populares: contiene 4 mediciones sobre flores recolectadas y la especie a la que pertenece cada una. El objetivo es poder predecir la especie de una flor en base a sus mediciones. El set de datos se baja de:

https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data 

El código BASE a usar es el siguiente:

In [1]:
# loading libraries
import numpy as np
import pandas as pd

try:
    # train_test_split lives in model_selection since scikit-learn 0.18
    from sklearn.model_selection import train_test_split
except ImportError:
    # legacy location, removed in scikit-learn 0.20
    from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier




In [2]:
# Directory that holds the downloaded iris.data file.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
iris_path = "/Users/ignacio.iglesias/Dev/datos/data/"
# The file has no header row, so the column names are supplied explicitly.
iris_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']
df = pd.read_csv(iris_path + 'iris.data', header=None, names=iris_columns)


In [4]:
# Preview the first rows to check that the columns were parsed as expected.
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [104]:
# Shape (rows, columns) of the subset of flowers labelled Iris-setosa.
is_setosa = df['class'] == "Iris-setosa"
df[is_setosa].shape

(50, 5)

In [108]:
# X holds the four numeric measurement columns, y holds the target class.
# Positional .iloc replaces DataFrame.ix, which was deprecated and later
# removed from pandas; 0:4 selects the first four columns.
X = np.array(df.iloc[:, 0:4])
y = np.array(df['class'])

# Split into train and test, use only 10% as test (15 flowers).
# random_state is fixed so the split is reproducible.
test_size_parameter = 0.10
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size_parameter, random_state=7506)

# Create KNN classifier
k_parameter = 7
knn = KNeighborsClassifier(n_neighbors=k_parameter)

# Fit: KNN has no real training phase; the estimator keeps the training
# samples so it can look up neighbours at prediction time.
knn.fit(X_train, y_train)

# Predict the class of each held-out flower
pred = knn.predict(X_test)

# Evaluate accuracy on the held-out flowers
print(accuracy_score(y_test, pred))

1.0


In [15]:
# Preview the first ten rows of the iris data.
df.head(10)


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
5,5.4,3.9,1.7,0.4,Iris-setosa
6,4.6,3.4,1.4,0.3,Iris-setosa
7,5.0,3.4,1.5,0.2,Iris-setosa
8,4.4,2.9,1.4,0.2,Iris-setosa
9,4.9,3.1,1.5,0.1,Iris-setosa


Sobre estos puntos vamos a calcular varias distancias diferentes y, en base a los cálculos realizados, identificar la opción correcta entre las siguientes. Nota: la distancia coseno se define como $1 - \cos(u, v)$.

In [5]:
# Given the following points in 6 dimensions:

v1 = [-0.03523597, 0.32230245, -0.12863445, -0.05499823, 0.44394174, -0.64023494]
v2 = [-0.42261242, -0.94746882, -0.42385565, -0.57185003, 0.30857046, -0.06489488]
v3 = [0.64302075, 0.42868685, 0.30156031, -0.69333788, -0.32130466, -0.12057523]
v4 = [0.83048764, -0.31629304, -0.29266495, -0.83020893, 0.81144827, 0.27869104]
v5 = [-0.291947, 0.47880295, -0.20378554, -0.45709396, -0.56440462, -0.40772986]

In [3]:
import math
from fractions import Fraction 

def manhattan(x, y):
    """Return the Manhattan (L1) distance between ``x`` and ``y``.

    Adds up the absolute difference of the coordinates at every position
    of ``x``; ``y`` is indexed at those same positions.
    """
    total = 0
    for idx, x_val in enumerate(x):
        total += math.fabs(x_val - y[idx])
    return total

def eucli(x, y):
    """Return the Euclidean (L2) distance between ``x`` and ``y``.

    Accumulates the squared coordinate differences over every position
    of ``x`` and returns the square root of that sum.
    """
    squared_sum = 0
    for idx, x_val in enumerate(x):
        squared_sum += math.pow(x_val - y[idx], 2)
    return math.sqrt(squared_sum)

def minkowsky(x, y, p):
    """Return the Minkowski distance of order ``p`` between ``x`` and ``y``.

    Evaluates ``(sum_i |x[i] - y[i]| ** p) ** (1 / p)`` over every position
    of ``x``. ``p = 1`` gives the Manhattan distance and ``p = 2`` the
    Euclidean one. Values ``0 < p < 1`` are accepted and evaluated with the
    same formula.

    Args:
        x: first point, an indexable sequence of numbers.
        y: second point, indexed at the same positions as ``x``.
        p: order of the distance; must be strictly positive.

    Returns:
        The distance as a float.

    Raises:
        ValueError: if ``p`` is not strictly positive.
    """
    if p <= 0:
        raise ValueError("p must be strictly positive, got {!r}".format(p))
    # Plain float exponents: raising a float to a Fraction falls back to
    # float arithmetic anyway, so no precision is lost by dropping Fraction.
    exponent = float(p)
    root = 1.0 / exponent
    total = 0.0
    for idx, x_val in enumerate(x):
        total += math.fabs(x_val - y[idx]) ** exponent
    return total ** root

def norma(x):
    """Return the Euclidean norm (length) of the vector ``x``.

    Sums the squares of the components and takes the square root.
    """
    squared_sum = 0
    for component in x:
        squared_sum += math.pow(component, 2)
    return math.sqrt(squared_sum)

def cos(x, y):
    """Return the cosine distance between ``x`` and ``y``.

    The distance is ``1 - (x . y) / (|x| * |y|)``: one minus the inner
    product divided by the product of the two norms. A vector whose norm
    is zero makes the division raise ZeroDivisionError.
    """
    dot_product = 0
    for idx, x_val in enumerate(x):
        dot_product += x_val * y[idx]
    similarity = dot_product / (norma(x) * norma(y))
    return 1 - similarity
    

In [6]:
# Claim under test: the Manhattan distance between vectors 4 and 5 is 3.1075.
# (The recorded output of this cell is 4.4417988.)
manhattan(v4, v5)

4.4417988

In [7]:
# Claim under test: for both the Euclidean and the Manhattan distance,
# the vector closest to v1 is v3.
# Print the distance from v1 to every vector under each metric so the
# nearest neighbour can be read off the output.
# (In the recorded output the smallest non-zero distance is to v5 for both.)
vectors = [v1, v2, v3, v4, v5]
for label, distance in (('Eculi', eucli), ('Manhattan', manhattan)):
    print(label)
    for number, vector in enumerate(vectors, start=1):
        print('d(v1, v{}) ='.format(number), distance(v1, vector))

Eculi
d(v1, v1) = 0.0
d(v1, v2) = 1.570354259495079
d(v1, v3) = 1.3854757760395318
d(v1, v4) = 1.6627162143300682
d(v1, v5) = 1.1526242300886216
Manhattan
d(v1, v1) = 0.0
d(v1, v2) = 3.1799320600000005
d(v1, v3) = 3.1380816400000002
d(v1, v4) = 3.7299928099999997
d(v1, v5) = 2.13130979


In [8]:
# Claim under test: the Minkowski distance with p=0.25 between vectors 1 and 2 is 582.602.
print('minkowsky')
d_v1_v2 = minkowsky(v1, v2, 0.25)
print('d(v1, v2) =', d_v1_v2)

minkowsky
d(v1, v2) = 582.6017798274008


In [106]:
# Claim under test: the minimum cosine distance is 0.0366.
# Print the cosine distance for every unordered pair of distinct vectors.
# (In the recorded output the minimum is d(v3, v5) = 0.5667631454908189.)
vectors = [v1, v2, v3, v4, v5]
print('cos')
for i in range(len(vectors)):
    for j in range(i + 1, len(vectors)):
        print('d(v{}, v{}) ='.format(i + 1, j + 1), cos(vectors[i], vectors[j]))


cos
d(v1, v2) = 1.0234088778354677
d(v1, v3) = 0.9490798021715844
d(v1, v4) = 0.8967913358024667
d(v1, v5) = 0.7416246981470784
d(v2, v3) = 1.3402243501902538
d(v2, v4) = 0.6036411430600046
d(v2, v5) = 1.0979762692221466
d(v3, v4) = 0.6562388853431695
d(v3, v5) = 0.5667631454908189
d(v4, v5) = 1.3387122316735178


In [107]:
# Claim under test: the minimum Manhattan distance is 2.5218.
# Print the Manhattan distance for every unordered pair of distinct vectors.
# (In the recorded output the minimum is d(v1, v5) = 2.13130979.)
vectors = [v1, v2, v3, v4, v5]
print('Manhattan')
for i in range(len(vectors)):
    for j in range(i + 1, len(vectors)):
        print('d(v{}, v{}) ='.format(i + 1, j + 1), manhattan(vectors[i], vectors[j]))


Manhattan
d(v1, v2) = 3.1799320600000005
d(v1, v3) = 3.1380816400000002
d(v1, v4) = 3.7299928099999997
d(v1, v5) = 2.13130979
d(v2, v3) = 3.9742481199999995
d(v2, v4) = 3.1202891699999995
d(v2, v5) = 3.1075734300000004
d(v3, v4) = 3.19556229
d(v3, v5) = 2.25692821
d(v4, v5) = 4.4417988


In [109]:
# Claim under test: the maximum Euclidean distance is 3.07.
# Print the Euclidean distance for every unordered pair of distinct vectors.
# (In the recorded output the maximum is d(v4, v5) = 2.0984032766902128.)
vectors = [v1, v2, v3, v4, v5]
print('Euclideana')
for i in range(len(vectors)):
    for j in range(i + 1, len(vectors)):
        print('d(v{}, v{}) ='.format(i + 1, j + 1), eucli(vectors[i], vectors[j]))

Euclideana
d(v1, v2) = 1.570354259495079
d(v1, v3) = 1.3854757760395318
d(v1, v4) = 1.6627162143300682
d(v1, v5) = 1.1526242300886216
d(v2, v3) = 1.9925383464236965
d(v2, v4) = 1.556772405594008
d(v2, v5) = 1.729897608707687
d(v3, v4) = 1.5506511439018618
d(v3, v5) = 1.1530036023147976
d(v4, v5) = 2.0984032766902128
