## KNN Classifier Implemented by Joscandy Nunez

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split

# read in data
data = pd.read_csv('dataset_phishing.csv')
# observe a few samples of the data: only the columns used by the model plus the label.
# A list of column names inside [] selects columns; `data[:, [...]]` is not valid
# DataFrame indexing and raises instead of returning the preview.
data[['https_token', 'ratio_digits_url', 'nb_hyphens', 'nb_dots', 'nb_underscore', 'nb_slash', 'status']].head()


: 

In [None]:
# preview the data with every column included
# (head() returns only the first few rows, not the entire dataset)
data.head()

In [None]:
# print pandas' summary of the DataFrame to decide what data cleaning is needed
data.info() # further analysis for data cleaning purposes

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing

# encode data
# the target class is the url's status; swap its text labels for numbers
# ("phishing" becomes 1 and "legitimate" becomes 0)
le = preprocessing.LabelEncoder()
le.fit(["phishing", "legitimate"])

# encode every row's label, show the before/after arrays side by side,
# then overwrite the text column with the encoded values
transformedClass = le.transform(data['status'].values)
print("text values ", data['status'].values, "become ", transformedClass)
data['status'] = transformedClass

In [None]:
# model inputs: the six url-derived columns (a mixture of ints and floats)
X = data.loc[:, ['https_token', 'ratio_digits_url', 'nb_hyphens', 'nb_dots', 'nb_underscore', 'nb_slash']]
# model output: the encoded status column
y = data.status

# show both, each under its own banner line
print("---------------FEATURES---------------------", X, sep="\n")

print("---------------TARGET---------------------", y, sep="\n")

In [None]:
# split our data into train and test data groups, default is train size 75%, test size 25%
# random_state = 0 pins the shuffle so the same split is produced on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import datasets, linear_model
from sklearn.model_selection import cross_val_score
import seaborn as sns

# code to find our best k to optimize our classifier's number of neighbors k
# candidate values: k = 1 .. 60
k_values = list(range(1, 61))
scores = []

# Score every candidate k with 5-fold cross-validation on the TRAINING split only.
# Searching on the full X, y would let the held-out test rows influence the choice
# of k, so the accuracy/precision/recall measured on X_test afterwards would be
# optimistically biased. `.values` is used to match how the final model is fit.
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    score = cross_val_score(knn, X_train.values, y_train, cv=5)
    scores.append(np.mean(score))

# the best k is the one with the highest mean fold accuracy (first one wins a tie)
best_index = np.argmax(scores)
best_k = k_values[best_index]

# visualize the k findings
sns.lineplot(x = k_values, y = scores, marker = 'o')
plt.xlabel("K Values")
plt.ylabel("Accuracy Score")

# create instance of KNN classifier
#### ran algorithm above to find best k ###
knn = KNeighborsClassifier(n_neighbors = best_k)

In [None]:
# train our model using X and y train values
# .values hands the classifier a plain array without column names, matching the
# arrays/lists that are passed to knn.predict later in this notebook
knn.fit(X_train.values, y_train)

In [None]:
# test how "good" the model does at seeing unseen test data
# get accuracy, precision and recall score to measure the performance of classifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# predict a label for every held-out row, then compare against the true labels
y_pred = knn.predict(X_test.values)

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)  # same as knn.score()
precision = precision_score(y_true=y_test, y_pred=y_pred)
recall = recall_score(y_true=y_test, y_pred=y_pred)

# report the three metrics, one per line
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

In [None]:
## set of URLs to use for testing prediction
## (exactly one assignment is left uncommented; swap the comment marker to try another)
# url_to_predict = 'https://www.coursera.org/learn/python-machine-learning/lecture/MwsUM/k-nearest-neighbors-classification'
# url_to_predict = 'https://app.simplenote.com/publish/XhRdVc'
# url_to_predict = 'https://stackoverflow.com/questions/24878174/how-to-count-digits-letters-spaces-for-a-string-in-python'
# url_to_predict = 'https://www.lionsgateplus.com/es/es/movies'
url_to_predict = 'https://www.simplilearn.com/tutorials/python-tutorial/python-if-else-statement'
# url_to_predict = 'https://tubitv.com/category/peliculas_en_espanol'

# function to get how many digits there are in the url string, returns that number as an int
def countDigits(string):
    """Return the number of digit characters in *string*.

    The original loop tested ``isalpha()``, which counted letters instead of
    digits and therefore inflated the digit ratio derived from this count.

    Args:
        string: Text to scan (here, a URL).

    Returns:
        int: How many characters satisfy ``str.isdigit()``; 0 for an empty string.
    """
    # True counts as 1 when summed, so this tallies the digit characters.
    return sum(ch.isdigit() for ch in string)

# function to extract the features of the url used in the KNN classifier
def extract_features(url_to_predict):
    """Extract the six URL features that are fed to the KNN classifier.

    Every count is taken over the whole URL string. The totals are also
    printed so they can be inspected in the notebook output.

    Args:
        url_to_predict: The URL, as a string.

    Returns:
        tuple: ``(https_token, ratio_of_digits, hyphens, dots, underscore,
        slashes)``. ``https_token`` is the int 1 when the substring
        ``'https'`` occurs anywhere in the URL and 0 otherwise;
        ``ratio_of_digits`` is ``countDigits(url) / len(url)`` as a float
        (0.0 for an empty URL); the remaining four are int character counts.
    """
    hyphens = url_to_predict.count('-')  # grab how many hyphens
    dots = url_to_predict.count('.')  # grab how many dots
    slashes = url_to_predict.count('/')  # grab how many slashes
    underscore = url_to_predict.count('_')  # grab how many underscores

    # divide amount of digits by length of url; an empty url has nothing to take
    # a ratio over, so report 0.0 instead of raising ZeroDivisionError
    url_length = len(url_to_predict)
    ratio_of_digits = countDigits(url_to_predict) / url_length if url_length else 0.0

    # 1 when 'https' appears anywhere in the url, otherwise 0
    # NOTE(review): confirm the dataset's https_token column uses this same
    # polarity (1 = https present) and the same "anywhere in the url" rule;
    # if it does not, this feature will not line up with the training data.
    https_token = int('https' in url_to_predict)

    # print statements for visualizing totals
    print("There are:")
    print("Hyphens: ", hyphens)
    print("Dots: ", dots)
    print("Slashes: ", slashes)
    print("Underscore: ", underscore)
    print("Ratio of digits: ", ratio_of_digits)
    print("https token: ", https_token)

    return https_token, ratio_of_digits, hyphens, dots, underscore, slashes
    
# turn the chosen url into the six model inputs
(https_token, ratio_digits_url, nb_hyphens,
 nb_dots, nb_underscore, nb_slash) = extract_features(url_to_predict)

# make prediction using extracted features returned from our function call;
# the single sample is wrapped in an outer list so predict receives one row
sample = [https_token, ratio_digits_url, nb_hyphens, nb_dots, nb_underscore, nb_slash]
url_prediction = knn.predict([sample])
# print what that prediction is, raw (still the encoded number)
print(url_prediction)

In [None]:
# decode the KNN classifier's prediction: map the encoded value back to the
# text label it stands for, using the same LabelEncoder that encoded the status column
list(le.inverse_transform(url_prediction))