# K-Nearest Neighbors (K-NN)

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import glob
import codecs
import re

## Importing the dataset
* 從三個資料夾中讀取資料

In [3]:
all_data = []
paths = [r'spam_data/spam', r'spam_data/easy_ham', r'spam_data/hard_ham'] 
for path in paths:
    for fn in glob.glob(path+"/*"):
        if "ham" not in fn:
            is_spam = 1
        else:
            is_spam = 0
        #codecs.open可以避開錯誤，用errors='ignore'
        with codecs.open(fn, encoding='utf-8', errors='ignore') as file:
            for line in file:
                #這個line的開頭為Subject:
                if line.startswith("Subject:"):
                    subject = re.sub(r"^Subject:", "", line).strip()
                    all_data.append([subject, is_spam])
all_data = np.array(all_data)

In [6]:
all_data[:10]

array([['Friend, Copy ANY DVD or Playstation Game with this software......',
        '1'],
       ['5% Guaranteed for Eight Years', '1'],
       ['Congratulations! You Get a Free Handheld Organizer!', '1'],
       ['One of a kind Money maker! Try it for free!', '1'],
       ['Online Doctors will fill your Viagra Prescription Now!!!                QEEB',
        '1'],
       ['Take your Marketing to the Next Level', '1'],
       ['One Sale - Three Commission Streams', '1'],
       ['Find Peace, Harmony, Tranquility, And Happiness Right Now!',
        '1'],
       ['ADV: Extended Auto Warranties Here                                                    undoc',
        '1'],
       ['Definitely the answer many have been waiting for!!', '1']],
      dtype='<U109')

### 取出訓練內文與標註

In [7]:
X = all_data[:, 0]
Y = all_data[:, 1].astype(np.uint8)

In [8]:
print('Training Data Examples : \n{}'.format(X[:5]))

Training Data Examples : 
['Friend, Copy ANY DVD or Playstation Game with this software......'
 '5% Guaranteed for Eight Years'
 'Congratulations! You Get a Free Handheld Organizer!'
 'One of a kind Money maker! Try it for free!'
 'Online Doctors will fill your Viagra Prescription Now!!!                QEEB']


In [9]:
print('Labeling Data Examples : \n{}'.format(Y[:5]))

Labeling Data Examples : 
[1 1 1 1 1]


### 文字預處理
* 細節可以參考前面章節

In [10]:
from sklearn.metrics import confusion_matrix
from nltk.corpus import stopwords

import nltk

nltk.download('stopwords')

# Lemmatize with POS Tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer 

## 創建Lemmatizer
lemmatizer = WordNetLemmatizer() 
def get_wordnet_pos(word):
    """將pos_tag結果mapping到lemmatizer中pos的格式"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)


def clean_content(X):
    # remove non-alphabet characters
    X_clean = [re.sub('[^a-zA-Z]', ' ', x).lower() for x in X]
    
    # tokenize
    X_word_tokenize = [nltk.word_tokenize(x) for x in X_clean]
    
    # stopwords_lemmatizer
    X_stopwords_lemmatizer = []
    stop_words = set(stopwords.words('english'))
    for content in X_word_tokenize:
        content_clean = []
        for word in content:
            if word not in stop_words:
                word = lemmatizer.lemmatize(word, get_wordnet_pos(word))
                content_clean.append(word)
        X_stopwords_lemmatizer.append(content_clean)
    
    X_output = [' '.join(x) for x in X_stopwords_lemmatizer]
    
    return X_output

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jiaping/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
X = clean_content(X)

In [13]:
X[:10]

['friend copy dvd playstation game software',
 'guaranteed eight year',
 'congratulation get free handheld organizer',
 'one kind money maker try free',
 'online doctor fill viagra prescription qeeb',
 'take marketing next level',
 'one sale three commission stream',
 'find peace harmony tranquility happiness right',
 'adv extend auto warranty undoc',
 'definitely answer many wait']

### Bag of words

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
#max_features是要建造幾個column，會按造字出現的頻率高低去篩選，1500並沒有特別含義，大家可以自己嘗試不同數值或不加入限制
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(X).toarray()

In [15]:
# 有 3423 個樣本，每個樣本用 1500 維表示
X.shape 

(3423, 1500)

## 將資料拆成 train/test set

In [16]:
from sklearn.model_selection import train_test_split
# random_state 是為了讓各為學員得到相同的結果，平時可以移除
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

## Training the K-NN model on the Training set

In [17]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
classifier.fit(X_train, y_train)

KNeighborsClassifier()

## 測試 train/testset的 Accuracy

In [18]:
print('Trainset Accuracy: {}'.format(classifier.score(X_train, y_train)))

Trainset Accuracy: 0.91672753834916


In [19]:
print('Testset Accuracy: {}'.format(classifier.score(X_test, y_test)))

Testset Accuracy: 0.8875912408759125


## 獲得 testset 上的結果

In [20]:
y_pred = classifier.predict(X_test)

## Making the Confusion Matrix

In [21]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[581   6]
 [ 71  27]]


0.8875912408759125

### 運用K-fold尋找適合K值

In [22]:
# Applying k-Fold Cross Validation
#n-jobs=-1，是指cpu全開
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

n_neighbors = [5, 10, 25, 50, 100] ## 可自行嘗試不同K值

for k in n_neighbors:
    classifier = KNeighborsClassifier(n_neighbors=k, metric='minkowski', p=2)
    
    # cv = 10 代表切成10等分
    accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10, n_jobs=-1)
    
    print('設置K值:{}'.format(k))
    print('Average Accuracy: {}'.format(accuracies.mean()))
    print('Accuracy STD: {}'.format(accuracies.std()))

設置K值:5
Average Accuracy: 0.8816649287452207
Accuracy STD: 0.010229532974285832
設置K值:10
Average Accuracy: 0.8615758936926821
Accuracy STD: 0.005323209137229294
設置K值:25
Average Accuracy: 0.8528140958797893
Accuracy STD: 0.002212221313254954
設置K值:50
Average Accuracy: 0.8520828320098393
Accuracy STD: 0.0017278270972811813
設置K值:100
Average Accuracy: 0.8520828320098393
Accuracy STD: 0.0017278270972811813
