# Assignment 1

### KNN algorithm

Reference: https://towardsdatascience.com/machine-learning-basics-with-the-k-nearest-neighbors-algorithm-6a6e71d01761

In [None]:
%%writefile source/knn_from_scratch.py

from collections import Counter
import math

def knn(data, query, k, distance_fn, choice_fn):
    """Find the k examples in ``data`` nearest to ``query`` and reduce their labels.

    Every example is a sequence whose last element is its label; the
    elements before it are the features handed to ``distance_fn``.

    Args:
        data: Sequence of examples (features followed by a label).
        query: Feature values compared against each example's features.
        k: Number of nearest neighbors to keep.
        distance_fn: Callable ``(example_features, query) -> distance``.
        choice_fn: Callable that reduces the list of the k nearest labels
            to a single prediction (for example ``mean`` or ``mode``).

    Returns:
        A tuple ``(k_nearest_distances_and_indices, prediction)``. The first
        item is a list of ``(distance, index)`` pairs in ascending order
        (ties on distance are ordered by index); the second is the value
        returned by ``choice_fn``.
    """
    neighbor_distances_and_indices = [
        (distance_fn(example[:-1], query), index)
        for index, example in enumerate(data)
    ]
    print("neighbor_distances_and_indices: ", neighbor_distances_and_indices)

    sorted_neighbor_distances_and_indices = sorted(neighbor_distances_and_indices)
    print("sorted_neighbor_distances_and_indices: ", sorted_neighbor_distances_and_indices)

    k_nearest_distances_and_indices = sorted_neighbor_distances_and_indices[:k]
    k_nearest_labels = [data[index][-1] for _, index in k_nearest_distances_and_indices]

    # Evaluate choice_fn exactly once so the printed value and the returned
    # value are guaranteed to be the same object and no work is repeated.
    prediction = choice_fn(k_nearest_labels)

    print("mean/mode k_nearest_labels: ", prediction)
    print("k_nearest_labels: ", k_nearest_labels)

    return k_nearest_distances_and_indices, prediction

def mean(labels):
    """Return the arithmetic mean of ``labels`` (sum divided by count)."""
    total = sum(labels)
    count = len(labels)
    return total / count

def mode(labels):
    """Return the most frequently occurring value in ``labels``."""
    tally = Counter(labels)
    ranked = tally.most_common(1)
    # ranked holds a single (value, count) pair; only the value is needed.
    top_label, _ = ranked[0]
    return top_label

def euclidean_distance(point1, point2):
    """Return the Euclidean distance between ``point1`` and ``point2``.

    The coordinates of ``point1`` drive the iteration; ``point2`` is read at
    the matching positions.
    """
    squared_total = 0
    for position, coordinate in enumerate(point1):
        delta = coordinate - point2[position]
        squared_total += math.pow(delta, 2)
    return math.sqrt(squared_total)

def main():
    """Run one regression and one classification demo of ``knn`` and print the predictions."""
    # Regression data: each row is [feature, target]; the last column is the
    # value that ``knn`` averages over the nearest neighbors.
    reg_data = [
        [65.75, 112.99],
        [71.52, 136.49],
        [69.40, 153.03],
        [68.22, 142.34],
        [67.79, 144.30],
        [68.70, 123.30],
        [69.80, 141.49],
        [70.01, 136.46],
        [67.90, 112.37],
        [66.49, 127.45],
    ]

    reg_query = [60]
    _, reg_prediction = knn(reg_data, reg_query, k=5, distance_fn=euclidean_distance, choice_fn=mean)
    # Print the prediction itself; the neighbor list is already printed by knn.
    print("Regression prediction: ", reg_prediction)

    # Classification data: each row is [feature, class label]; the last
    # column is the label that ``knn`` takes the mode of.
    clf_data = [
        [22, 1],
        [23, 1],
        [21, 1],
        [18, 1],
        [19, 1],
        [25, 0],
        [27, 0],
        [29, 0],
        [31, 0],
        [45, 0],
    ]

    clf_query = [33]
    _, clf_prediction = knn(clf_data, clf_query, k=3, distance_fn=euclidean_distance, choice_fn=mode)
    print("Classification prediction: ", clf_prediction)
    
# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

In [None]:
import cfxmagic

In [None]:
%%qsub
cd $PBS_O_WORKDIR
python source/knn_from_scratch.py

In [None]:
%ls STDIN.*
%cat STDIN.o876009

### Use the algorithm

In [None]:
%%writefile source/knn_movies_recommender.py

from knn_from_scratch import knn, euclidean_distance

def recommend_movies(movie_query, k_recommendations):
    """Return the CSV rows of the movies nearest to ``movie_query``.

    Reads ``movies_recommendation_data.csv`` from the current working
    directory, skips its first (header) line, converts every column from the
    third onward to ``float`` and ranks the rows with ``knn`` using
    ``euclidean_distance``.

    Args:
        movie_query: Numeric feature values describing the movie to match.
        k_recommendations: Number of nearest rows to return.

    Returns:
        A list of raw CSV rows (lists of strings), nearest first.

    Raises:
        ValueError: If a column from the third onward is not a valid float.
    """
    # Function-scope import keeps this block self-contained.
    import csv

    raw_movies_data = []
    with open('movies_recommendation_data.csv', 'r', newline='') as md:
        # csv.reader handles quoted fields (e.g. titles containing commas),
        # which a plain split(',') would break apart.
        reader = csv.reader(md)
        next(reader, None)  # skip the header row; tolerate an empty file

        for data_row in reader:
            # Skip blank lines: an empty row would yield an empty feature
            # vector, score distance 0 and be returned as a bogus top match.
            if any(field.strip() for field in data_row):
                raw_movies_data.append(data_row)

    movies_recommendation_data = [
        [float(value) for value in row[2:]]
        for row in raw_movies_data
    ]

    # Only the neighbor indices are needed here, so the label reduction is a no-op.
    recommendation_indices, _ = knn(movies_recommendation_data, movie_query, k=k_recommendations, distance_fn=euclidean_distance, choice_fn=lambda x:None)

    return [raw_movies_data[index] for _, index in recommendation_indices]

if __name__ == "__main__":
    # Feature values of the movie to find recommendations for.
    # Alternative query: [7.2, 1, 1, 0, 0, 0, 0, 1, 0]
    the_post = [8, 1, 1, 1, 0, 0, 0, 1, 0]

    for movie_row in recommend_movies(movie_query=the_post, k_recommendations=5):
        # The second column of each returned CSV row is what gets printed.
        print("Recommended movies: ", movie_row[1])

### Run movie recommendation

In [None]:
import cfxmagic

In [None]:
%%qsub
cd $PBS_O_WORKDIR
python source/knn_movies_recommender.py

Show the output:

In [None]:
%ls STDIN.*
%cat STDIN.o876010

Next, convert the code to DPC++ (dpcpp).