Given the rating data of movies, let's implement 'get_k_neighbors', a function that saves k neighbors similar to a particular user.
I'm going to find user0's movie3. 

In [23]:
import pandas as pd
import numpy as np
from math import sqrt


In [24]:
RATING_DATA_PATH = 'ratings.csv'  # Define rate data root.
df = pd.read_csv('ratings.csv')
np.set_printoptions(precision=2)  # Output up to second decimal place.

df.head()

Unnamed: 0,user_id,0,1,2,3,4,5,6,7,8,...,10,11,12,13,14,15,16,17,18,19
0,0,2.0,3.0,4.0,,2.0,3.0,,,,...,4.0,,,1.0,,,2.0,5.0,2.0,
1,1,,,,4.0,,5.0,,,2.0,...,4.0,,1.0,,,,,5.0,,
2,2,2.0,,1.0,,,5.0,5.0,,,...,,5.0,3.0,3.0,,3.0,3.0,4.0,,
3,3,,3.0,5.0,,,,4.0,3.0,,...,3.0,,4.0,,,2.0,,,,2.0
4,4,,,,5.0,,,2.0,2.0,2.0,...,3.0,4.0,2.0,5.0,4.0,,,,4.0,2.0


In [25]:
# Function that calculates Euclidean distance.
def distance(user_1, user_2):
    return sqrt(np.sum((user_1 - user_2)**2))

In [26]:
# Function that excludes users who haven't rate 'movie_id'th movie.
def filter_users_without_movie(rating_data, movie_id):
    return rating_data[~np.isnan(rating_data[:,movie_id])]

In [27]:
# Function that fills empty values of the rating data with the average value of each user.
def fill_nan_with_user_mean(rating_data):
    filled_data = np.copy(rating_data)  # Copy rating data not to damage it.
    row_mean = np.nanmean(filled_data, axis=0)  # Calculate the user average ratings.
    
    inds = np.where(np.isnan(filled_data))  # Find the empty index.
    filled_data[inds] = np.take(row_mean, inds[1])  # Fill in the empty index with the user average rate. 
    
    return filled_data

In [28]:
# Function that finds the user's neighbors corresponding to user_id.

def get_k_neighbors(user_id, rating_data, k):
    distance_data = np.copy(rating_data)  # Copy rating data not to damage it.
    # Append a row to add 'distance_Data.
    distance_data = np.append(distance_data, np.zeros((distance_data.shape[0], 1)), axis=1)
    
    for i in range(len(distance_data)):
        row = distance_data[i]
        
        if i == user_id:  # Set as inf if they are the same users.
            row[-1] = np.inf
        
        else:  # Add distance data if not.
            row[-1] = distance(distance_data[user_id][:-1], row[:-1])

    
    # Sorts data by distance column
    distance_data = distance_data[np.argsort(distance_data[:, -1])]
    
    # Exclude the distance row + retrun k rows.
    return distance_data[:k, :-1]

In [29]:
# Execution Code    
rating_data = pd.read_csv(RATING_DATA_PATH, index_col='user_id').values  # Call rating_data.
rating_data

array([[ 2.,  3.,  4., nan,  2.,  3., nan, nan, nan,  4.,  4., nan, nan,
         1., nan, nan,  2.,  5.,  2., nan],
       [nan, nan, nan,  4., nan,  5., nan, nan,  2., nan,  4., nan,  1.,
        nan, nan, nan, nan,  5., nan, nan],
       [ 2., nan,  1., nan, nan,  5.,  5., nan, nan, nan, nan,  5.,  3.,
         3., nan,  3.,  3.,  4., nan, nan],
       [nan,  3.,  5., nan, nan, nan,  4.,  3., nan,  5.,  3., nan,  4.,
        nan, nan,  2., nan, nan, nan,  2.],
       [nan, nan, nan,  5., nan, nan,  2.,  2.,  2., nan,  3.,  4.,  2.,
         5.,  4., nan, nan, nan,  4.,  2.],
       [nan,  4.,  3., nan, nan,  5., nan, nan, nan,  3.,  2., nan, nan,
        nan,  1., nan,  4.,  3., nan,  5.],
       [ 3., nan, nan, nan,  4.,  5.,  2.,  2., nan,  3., nan,  4., nan,
        nan,  2.,  4., nan, nan, nan,  1.],
       [nan,  5.,  1.,  4., nan, nan, nan,  3.,  1., nan,  1., nan, nan,
         4., nan, nan,  2., nan, nan,  4.],
       [nan,  3.,  1.,  1., nan,  3., nan,  4., nan,  1.,  1., n

In [30]:
filtered_data = filter_users_without_movie(rating_data, 3)  
filtered_data

array([[nan, nan, nan,  4., nan,  5., nan, nan,  2., nan,  4., nan,  1.,
        nan, nan, nan, nan,  5., nan, nan],
       [nan, nan, nan,  5., nan, nan,  2.,  2.,  2., nan,  3.,  4.,  2.,
         5.,  4., nan, nan, nan,  4.,  2.],
       [nan,  5.,  1.,  4., nan, nan, nan,  3.,  1., nan,  1., nan, nan,
         4., nan, nan,  2., nan, nan,  4.],
       [nan,  3.,  1.,  1., nan,  3., nan,  4., nan,  1.,  1., nan,  4.,
         4.,  2., nan, nan,  5.,  3., nan],
       [ 1.,  1., nan,  1.,  1., nan, nan, nan, nan, nan, nan,  2., nan,
        nan,  1., nan, nan, nan,  4., nan],
       [ 3., nan, nan,  5.,  4., nan, nan,  4.,  5.,  3., nan,  1.,  2.,
         1.,  1., nan, nan, nan,  4., nan],
       [nan, nan,  2.,  5., nan, nan, nan, nan, nan, nan, nan, nan,  1.,
        nan, nan, nan,  3.,  1.,  5.,  2.],
       [nan,  5., nan,  5.,  3., nan, nan,  3.,  2.,  4.,  2., nan,  4.,
         4.,  5.,  4.,  2., nan,  1.,  3.],
       [ 2., nan, nan,  2.,  5., nan,  2., nan,  2.,  1., nan, n

In [31]:
filled_data = fill_nan_with_user_mean(filtered_data)  
filled_data

array([[2.  , 3.5 , 1.33, 4.  , 3.25, 5.  , 2.  , 3.2 , 2.  , 2.25, 4.  ,
        2.33, 1.  , 3.33, 2.5 , 3.5 , 2.33, 5.  , 3.14, 2.6 ],
       [2.  , 3.5 , 1.33, 5.  , 3.25, 4.  , 2.  , 2.  , 2.  , 2.25, 3.  ,
        4.  , 2.  , 5.  , 4.  , 3.5 , 2.33, 3.67, 4.  , 2.  ],
       [2.  , 5.  , 1.  , 4.  , 3.25, 4.  , 2.  , 3.  , 1.  , 2.25, 1.  ,
        2.33, 2.71, 4.  , 2.5 , 3.5 , 2.  , 3.67, 3.14, 4.  ],
       [2.  , 3.  , 1.  , 1.  , 3.25, 3.  , 2.  , 4.  , 2.33, 1.  , 1.  ,
        2.33, 4.  , 4.  , 2.  , 3.5 , 2.33, 5.  , 3.  , 2.6 ],
       [1.  , 1.  , 1.33, 1.  , 1.  , 4.  , 2.  , 3.2 , 2.33, 2.25, 2.2 ,
        2.  , 2.71, 3.33, 1.  , 3.5 , 2.33, 3.67, 4.  , 2.6 ],
       [3.  , 3.5 , 1.33, 5.  , 4.  , 4.  , 2.  , 4.  , 5.  , 3.  , 2.2 ,
        1.  , 2.  , 1.  , 1.  , 3.5 , 2.33, 3.67, 4.  , 2.6 ],
       [2.  , 3.5 , 2.  , 5.  , 3.25, 4.  , 2.  , 3.2 , 2.33, 2.25, 2.2 ,
        2.33, 1.  , 3.33, 2.5 , 3.5 , 3.  , 1.  , 5.  , 2.  ],
       [2.  , 5.  , 1.33, 5.  , 3.  , 4. 

In [32]:
user_0_neighbors = get_k_neighbors(0, filled_data, 5)  
user_0_neighbors

array([[2.  , 3.5 , 1.33, 5.  , 3.25, 4.  , 2.  , 2.  , 2.  , 2.25, 3.  ,
        4.  , 2.  , 5.  , 4.  , 3.5 , 2.33, 3.67, 4.  , 2.  ],
       [2.  , 5.  , 1.  , 4.  , 3.25, 4.  , 2.  , 3.  , 1.  , 2.25, 1.  ,
        2.33, 2.71, 4.  , 2.5 , 3.5 , 2.  , 3.67, 3.14, 4.  ],
       [2.  , 3.5 , 2.  , 5.  , 3.25, 4.  , 2.  , 3.2 , 2.33, 2.25, 2.2 ,
        2.33, 1.  , 3.33, 2.5 , 3.5 , 3.  , 1.  , 5.  , 2.  ],
       [3.  , 3.5 , 1.33, 5.  , 4.  , 4.  , 2.  , 4.  , 5.  , 3.  , 2.2 ,
        1.  , 2.  , 1.  , 1.  , 3.5 , 2.33, 3.67, 4.  , 2.6 ],
       [1.  , 1.  , 1.33, 1.  , 1.  , 4.  , 2.  , 3.2 , 2.33, 2.25, 2.2 ,
        2.  , 2.71, 3.33, 1.  , 3.5 , 2.33, 3.67, 4.  , 2.6 ]])

In [33]:
# Find the predicted user rating.
def predict_user_rating(rating_data, k, user_id, movie_id,):
    # Exclude users who have not watched the 'movie_id'th movie from the data in advance.
    filtered_data = filter_users_without_movie(rating_data, movie_id)

    filled_data = fill_nan_with_user_mean(filtered_data)

    neighbors = get_k_neighbors(user_id, filled_data, k)

    return np.mean(neighbors[:, movie_id])

In [34]:
# Execution code

rating_data = pd.read_csv(RATING_DATA_PATH, index_col='user_id').values
# Use five neighbors and find the predicted rating of user0's movie 3.
predict_user_rating(rating_data, 5, 0, 3)  

4.0