# K-Nearest Neighbors Algorithm
https://towardsdatascience.com/machine-learning-basics-with-the-k-nearest-neighbors-algorithm-6a6e71d01761

https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm

In [25]:
from collections import Counter
import math

def knn(data, query, k, distance_fn, choice_fn):
    """Select the k examples nearest to ``query`` and aggregate their labels.

    Args:
        data: Sequence of examples. The last element of each example is its
            label; everything before it is the feature vector.
        query: Feature values compared against each example's features.
        k: Number of nearest neighbors to keep. Must not be negative; if it
            exceeds ``len(data)`` every example is returned.
        distance_fn: Callable invoked as ``distance_fn(features, query)``.
        choice_fn: Callable applied to the list of the k nearest labels
            (``mean`` for regression, ``mode`` for classification).

    Returns:
        A tuple ``(neighbors, prediction)``: ``neighbors`` is a list of
        ``(distance, index)`` pairs in ascending order and ``prediction`` is
        whatever ``choice_fn`` returns for the neighbors' labels.

    Raises:
        ValueError: If ``k`` is negative.
    """
    # A negative k would make the slice below drop the *farthest* |k|
    # examples and return everything else, which is never what was asked.
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    # Pair every example's distance to the query with the example's index so
    # the original row can be recovered after sorting.
    distances_and_indices = [
        (distance_fn(example[:-1], query), index)
        for index, example in enumerate(data)
    ]

    # Tuples sort by distance first; equal distances fall back to the index,
    # so earlier examples win ties.
    k_nearest = sorted(distances_and_indices)[:k]

    # Look up the label (last column) of each selected neighbor.
    k_nearest_labels = [data[index][-1] for _, index in k_nearest]

    return k_nearest, choice_fn(k_nearest_labels)

def mean(labels):
    """Return the arithmetic average of ``labels`` (regression choice function)."""
    total = sum(labels)
    count = len(labels)
    return total / count

def mode(labels):
    """Return the most frequent value in ``labels`` (classification choice function)."""
    tally = Counter(labels)
    # most_common(1) yields a one-element list of (value, count) pairs.
    top_entries = tally.most_common(1)
    winner = top_entries[0]
    return winner[0]

def euclidean_distance(point1, point2):
    """Return the straight-line distance between ``point1`` and ``point2``.

    Only the first ``len(point1)`` coordinates of ``point2`` are read, so any
    extra trailing values in ``point2`` do not affect the result.
    """
    total = 0
    for axis, coordinate in enumerate(point1):
        delta = coordinate - point2[axis]
        total += math.pow(delta, 2)
    return math.sqrt(total)

def hamming_distance(s1, s2):
    """Count the positions at which ``s1`` and ``s2`` differ.

    Positions present in only one of the two sequences (when their lengths
    differ) each count as one mismatch.
    """
    mismatches = 0
    # zip stops at the shorter sequence, so only the overlapping prefix is
    # compared element by element.
    for left, right in zip(s1, s2):
        if left != right:
            mismatches += 1
    length_gap = abs(len(s1) - len(s2))
    return mismatches + length_gap

def main():
    """Run a regression demo and a classification demo of ``knn`` and print both results."""

    def report(query, neighbors, prediction):
        # Both demos print the same three-line summary.
        print(f"\nQuery: {query}")
        print(f"k_nearest_neighbors: {neighbors}")
        print(f"prediction: {prediction}")

    # Regression data
    #   column 0: height (inches)
    #   column 1: weight (pounds)
    heights_and_weights = [
        [65.75, 112.99],
        [71.52, 136.49],
        [69.40, 153.03],
        [68.22, 142.34],
        [67.79, 144.30],
        [68.70, 123.30],
        [69.80, 141.49],
        [70.01, 136.46],
        [67.90, 112.37],
        [66.49, 127.45],
    ]

    # Question: what is the best-guess weight of someone who is 60 inches tall?
    height_query = [60]
    height_neighbors, weight_estimate = knn(
        heights_and_weights,
        height_query,
        k=3,
        distance_fn=euclidean_distance,
        choice_fn=mean,
    )
    report(height_query, height_neighbors, weight_estimate)

    # Classification data
    #   column 0: age
    #   column 1: likes pineapple (1 = yes, 0 = no)
    ages_and_preferences = [
        [22, 1],
        [23, 1],
        [21, 1],
        [18, 1],
        [19, 1],
        [25, 0],
        [27, 0],
        [29, 0],
        [31, 0],
        [45, 0],
    ]

    # Question: does a 33 year old like pineapple on their pizza?
    age_query = [33]
    age_neighbors, likes_pineapple = knn(
        ages_and_preferences,
        age_query,
        k=3,
        distance_fn=euclidean_distance,
        choice_fn=mode,
    )
    report(age_query, age_neighbors, likes_pineapple)

# Run the regression and classification demos only when this code is executed
# directly, not when it is imported.
if __name__ == '__main__':
    main()


Query: [60]
k_nearest_neighbors: [(5.75, 0), (6.489999999999995, 9), (7.790000000000006, 4)]
prediction: 128.24666666666667

Query: [33]
k_nearest_neighbors: [(2.0, 8), (4.0, 7), (6.0, 6)]
prediction: 0


In [26]:
# Movie catalogue used by the recommender, one row per movie.
# Column layout:
#Movie ID,Movie Name,IMDB Rating,Biography,Drama,Thriller,Comedy,Crime,Mystery,History,Label
# Every title is stored as a string, including the movie "21".
raw_movies_data = [
    [58,"The Imitation Game",8,1,1,1,0,0,0,0,0],
    [8,"Ex Machina",7.7,0,1,0,0,0,1,0,0],
    [46,"A Beautiful Mind",8.2,1,1,0,0,0,0,0,0],
    [62,"Good Will Hunting",8.3,0,1,0,0,0,0,0,0],
    [97,"Forrest Gump",8.8,0,1,0,0,0,0,0,0],
    [98,"21",6.8,0,1,0,0,1,0,1,0],
    [31,"Gifted",7.6,0,1,0,0,0,0,0,0],
    [3,"Travelling Salesman",5.9,0,1,0,0,0,1,0,0],
    [51,"Avatar",7.9,0,0,0,0,0,0,0,0],
    [47,"The Karate Kid",7.2,0,1,0,0,0,0,0,0],
    [50,"A Brilliant Young Mind",7.2,0,1,0,0,0,0,0,0],
    [49,"A Time To Kill",7.4,0,1,1,0,1,0,0,0],
    [30,"Interstellar",8.6,0,1,0,0,0,0,0,0],
    [94,"The Wolf of Wall Street",8.2,1,0,0,1,1,0,0,0],
    [6,"Black Panther",7.4,0,0,0,0,0,0,0,0],
    [73,"Inception",8.8,0,0,0,0,0,0,0,0],
    [44,"The Wind Rises",7.8,1,1,0,0,0,0,0,0],
    [65,"Spirited Away",8.6,0,0,0,0,0,0,0,0],
    [48,"Finding Forrester",7.3,0,1,0,0,0,0,0,0],
    [27,"The Fountain",7.3,0,0,0,0,0,0,0,0],
    [57,"The DaVinci Code",6.6,0,0,1,0,0,1,0,0],
    [57,"Stand and Deliver",7.3,0,1,0,0,0,0,0,0],
    [14,"The Terminator",8,0,0,0,0,0,0,0,0],
    [69,"21 Jump Street",7.2,0,0,0,1,1,0,0,0],
    [98,"The Avengers",8.1,0,0,0,0,0,0,0,0],
    [17,"Thor: Ragnarok",7.9,0,0,0,1,0,0,0,0],
    [12,"Spirit: Stallion of the Cimarron",7.1,0,0,0,0,0,0,0,0],
    [1,"Hacksaw Ridge",8.2,1,1,0,0,0,0,1,0],
    [86,"12 Years a Slave",8.1,1,1,0,0,0,0,1,0],
    [46,"Queen of Katwe",7.4,1,1,0,0,0,0,0,0]
]

In [27]:
def recommend_movies(movie_query, k_recommendations):
    """Return the rows of ``raw_movies_data`` closest to ``movie_query``.

    Args:
        movie_query: Numeric vector compared against each movie's columns
            from the IMDB rating onward (the last column is treated by
            ``knn`` as the label and excluded from the comparison).
        k_recommendations: Number of movies to return.

    Returns:
        A list of full rows from ``raw_movies_data``, nearest first.
    """
    # Drop the ID and title columns and coerce the remaining columns to float.
    numeric_rows = [
        [float(value) for value in movie[2:]]
        for movie in raw_movies_data
    ]

    # Only the neighbor indices are needed, so the label aggregation is a no-op.
    nearest, _prediction = knn(
        numeric_rows,
        movie_query,
        k=k_recommendations,
        distance_fn=euclidean_distance,
        choice_fn=lambda x: None,
    )

    # Map each neighbor index back to its original row (ID and title included).
    return [raw_movies_data[position] for _distance, position in nearest]

if __name__ == '__main__':
    # The query follows the column order of raw_movies_data from the IMDB
    # rating onward: rating, then the seven genre flags.
    # NOTE(review): the trailing 0 sits in the label position. knn compares
    # the query against example[:-1] and euclidean_distance only indexes up to
    # len(point1), so this ninth value is never read.
    the_post = [7.2, 1, 1, 0, 0, 0, 0, 1, 0] # feature vector for The Post
    recommended_movies = recommend_movies(movie_query=the_post, k_recommendations=5)

    # Print recommended movie titles
    for recommendation in recommended_movies:
        # Column 1 of each row is the movie name.
        print(recommendation[1])

12 Years a Slave
Hacksaw Ridge
Queen of Katwe
The Wind Rises
A Beautiful Mind
