# K-Nearest Neighbors (K-NN)

### 參考課程實作並在datasets_483_982_spam.csv的資料集中獲得90% 以上的 accuracy (testset)

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import glob
import codecs
import re

## Importing the dataset

In [2]:
dataset = pd.read_csv(r'datasets_483_982_spam.csv', encoding = 'latin-1')

dataset

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


### 取出訓練內文與標註

In [3]:
X = dataset['v2']
Y = np.where(dataset['v1'] == 'ham', 0, 1) 

In [4]:
print('Training Data Examples : \n{}'.format(X[:5]))

Training Data Examples : 
0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: v2, dtype: object


In [5]:
print('Labeling Data Examples : \n{}'.format(Y[:5]))

Labeling Data Examples : 
[0 0 1 0 0]


### 文字預處理

In [6]:
from sklearn.metrics import confusion_matrix
from nltk.corpus import stopwords

import nltk

nltk.download('stopwords')

# Lemmatize with POS Tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer 

lemmatizer = WordNetLemmatizer() 

"""可以參考課程練習方式清理文字，或是使用自己的方式"""
def get_wordnet_pos(word):
    """將pos_tag結果mapping到lemmatizer中pos的格式"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

def clean_content(X):
    # remove non-alphabet characters
    X_clean = [re.sub ('[a-zA-Z]',' ',x).lower() for x in X]
    # tokenize
    X_word_tokenize = [nltk.word_tokenize(x) for x in X_clean]
    # stopwords_lemmatizer
    X_stopwords_lemmatizer = []
    stop_words = set (stopwords.words ('english'))
    for content in X_word_tokenize :
        content_clean = []
        for word in content:
            if word not in stop_words:
                word = lemmatizer.lemmatize (word, get_wordnet_pos(word))
                content_clean.append(word)
        X_stopwords_lemmatizer.append(content_clean)

    X_output = [' '.join(x) for x in X_stopwords_lemmatizer]

    return X_output


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/katnyeung/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
X = clean_content(X)

In [8]:

X

[', .. ... ...',
 '... ...',
 "2 21 2005 . 87121 ( ) & ' 08452810075 18 '",
 '... ...',
 "' ,",
 "' 3 ' ! ' ? ! , å£1.50",
 '. .',
 "' ( ) ' . * 9",
 '! ! å£900 ! 09061701461 . 341 . 12 .',
 '11 ? ! 08002986030',
 "' ' , ? ' .",
 '! 100 20,000 > 11 87575 . 150 / , 6 , 16+ 4',
 '! 1 å£100,000 ! : : 81010 & . . 4403 1 7 18',
 "' . . .",
 '! !',
 ': , > > : // . . ? =',
 "... ' : )",
 '2 ... . .',
 'åõ . åõ',
 '- / . 87077 87077 : , 4 /ì¼1.20 36504 45 16+',
 '?',
 '\x89û÷ 2',
 'ì_ ... ...',
 '. 3 . ?',
 '. ?',
 ". ' . . . ' .",
 '.',
 "? ? ? ' ? ?",
 "' & ; ' , ' '",
 '. . ! ?',
 "' , ' '",
 "2 . . 2 ! ' ! . ?",
 '.',
 '?',
 'å£5/ .',
 '... ì_ ... 2 8',
 ", ' '",
 '',
 '... ...',
 "! ' ? ' . ' !",
 '. . .',
 '? , , ... ... ... ...',
 '07732584351 - - = + . 08000930705',
 '?',
 '! . & ; # & ; ...',
 '.. ..',
 "' .",
 ', ?',
 ", '",
 "' . ' . ' . .",
 '. .',
 '& ; # & ; , & ; # & ;',
 "'",
 ". . , \\ \\ '' . ' ' ' . ' . ' . ''",
 '. : . ? ?',
 '? @ & ; & ;',
 '! 1 2 . 09061209465 ! , 3 , 3 

### Bag of words

In [18]:
from sklearn.feature_extraction.text import CountVectorizer
#max_features是要建造幾個column，會按造字出現的高低去篩選 
cv=CountVectorizer(max_features = 100)
X=cv.fit_transform(X).toarray()

In [19]:
X.shape

(5572, 100)

## Splitting the dataset into the Training set and Test set

In [20]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

## Training the K-NN model on the Training set

In [21]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)

KNeighborsClassifier()

## Predicting a new result

In [22]:
print('Trainset Accuracy: {}'.format(classifier.score(X_train, y_train)))

Trainset Accuracy: 0.9535562037244784


In [23]:
print('Testset Accuracy: {}'.format(classifier.score(X_test, y_test)))

Testset Accuracy: 0.9372197309417041


## Predicting the Test set results

In [24]:
y_pred = classifier.predict(X_test)

## Making the Confusion Matrix

In [25]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[945   4]
 [ 66 100]]


0.9372197309417041