# Writing Our First Classifier - Machine Learning Recipes #5
https://youtu.be/AoeEHqVSNOw


這一課說的是一個自製的 scrappy classifier。
當它的 predict() 用 random 亂猜時，只得到 30％ 左右的準確率（Iris 有三個類別，隨機猜中的機率約為三分之一），真好玩，好好笑。


In [16]:
from scipy.spatial import distance
import random

def euc(a, b):
    """Return the Euclidean distance between the points ``a`` and ``b``.

    Thin wrapper around ``scipy.spatial.distance.euclidean``.

    :param a: first point (a 1-D sequence of numbers)
    :param b: second point, same length as ``a``
    :return: the distance as computed by SciPy
    """
    separation = distance.euclidean(a, b)
    return separation


class ScrappyKNN():
    """
    Minimal nearest-neighbour classifier (effectively k = 1).

    ``fit`` only memorises the training data; ``predict`` labels each test
    row with the label of the single closest training row.
    """

    def fit(self, X_train, y_train):
        """
        Memorise the training set; no computation happens here.

        :param X_train: training feature rows
        :param y_train: training labels, index-aligned with ``X_train``
        :return: None
        """
        self.X_train, self.y_train = X_train, y_train

    def predict(self, X_test):
        """
        Predict one label per row of the test data.

        :param X_test: iterable of feature rows to classify
        :return: list of predicted labels, in the same order as ``X_test``
        """
        # A random baseline would instead pick random.choice(self.y_train)
        # for every row; here each row gets its nearest neighbour's label.
        return [self.closest(sample) for sample in X_test]

    def closest(self, row):
        """
        Return the label of the training row nearest to ``row``.

        Ties go to the earliest training row, because a later row only
        replaces the current winner when it is strictly closer.

        :param row: feature row to look up
        :return: label stored in ``y_train`` for the nearest training row
        """
        training_rows = self.X_train
        # Seed the search with the first training row ...
        winner, winner_dist = 0, euc(row, training_rows[0])
        # ... then scan the remaining rows for anything strictly closer.
        for candidate in range(1, len(training_rows)):
            candidate_dist = euc(row, training_rows[candidate])
            if candidate_dist < winner_dist:
                winner, winner_dist = candidate, candidate_dist
        return self.y_train[winner]


# Demo: train ScrappyKNN on half of the Iris data set and score it on the
# other half.
from sklearn import datasets

iris = datasets.load_iris()

X = iris.data    # feature rows
y = iris.target  # class labels

# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split now lives in sklearn.model_selection. Fall back to
# the old location only for installations that predate the move.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# 50/50 split. No random_state is given, so the split (and therefore the
# printed accuracy) varies from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

# To compare against scikit-learn's own implementation, swap in
# sklearn.neighbors.KNeighborsClassifier() here instead of ScrappyKNN().
my_classifier = ScrappyKNN()

my_classifier.fit(X_train, y_train)

predictions = my_classifier.predict(X_test)

from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted label matches the true label.
print(accuracy_score(y_test, predictions))


0.9866666666666667


# 同一目錄下的另一個 KNN 實作範例

這個範例包括 kNNClassifier.py, check.py, Datasets/Fisher.csv, Datasets/titanic.csv 等 files 
我查了一下 [Wikipedia](https://en.wikipedia.org/wiki/Iris_flower_data_set)，得知一般習知的 Iris dataset 也稱為 Fisher's Iris dataset:

"The Iris flower data set or Fisher's Iris data set is a multivariate data set introduced by the British statistician and biologist Ronald Fisher in his 1936 paper"

Iris 之外，另一組 dataset，[titanic.csv](https://www.kaggle.com/c/titanic)，可就沉重了——根據 manifest 上兩千多名旅客的艙等、年齡、性別、票價等資料預估存活機率。這組 dataset 裡有很多 string type 的 feature，而 KNN 處理的是 vector，必須是數字 type，因此原文並沒有實際用上這組 dataset。可能留作練習題了？


In [7]:
%run kNNClassifier.py

96.0
