In [131]:
##Dataset url: https://grouplens.org/datasets/movielens/latest/

import pandas as pd
import numpy as np
from random import seed
from random import randrange
from csv import reader
from math import sqrt

In [132]:
# Locate the most similar neighbors
def get_neighbors(train, test_row, num_neighbors):
	distances = list()
	for train_row in train:
		dist = cosineSimilarity(test_row, train_row)
		distances.append((train_row, dist))
	distances.sort(key=lambda tup: tup[1])
	neighbors = list()
	for i in range(num_neighbors):
		neighbors.append(distances[i][0])
	return neighbors

In [133]:
#Calculate cosine similarity between two vectors
def cosineSimilarity(row1, row2):
    sumv1, sumv2, sumv1v2 = 0, 0, 0
    for i in range(len(row1)-1):
        x = row1[i]
        y = row2[i]
        sumv1 += x * x
        sumv2 += y * y
        sumv1v2 += x * y
    return -(sumv1v2 / (sqrt(sumv1) * sqrt(sumv2)))

In [134]:
# Calculate the Euclidean distance between two vectors
def euclidean_distance(row1, row2):
	distance = 0.0
	for i in range(len(row1)-1):
		distance += (row1[i] - row2[i])**2
	return sqrt(distance)

In [135]:
#Calculate the Hamming distance between two vectors
def hamming_distance(row1, row2):
    hdistance = 0.0
    for i in range(len(row1)-1):
        if row1[i] != row2[i]:
            hdistance += 1
    return hdistance

In [136]:
#Calculate the Manhattan distance between two vectors
def manhattan_distance(row1, row2):
    mdistance = 0.0
    for i in range(len(row1)-1):
        mdistance += abs(row1[i] - row2[i])
    return mdistance

In [137]:
movies_df = pd.read_csv('C:/Users/kkr0219/Documents/ML - KNN/ml-20m/movies.csv',usecols=['movieId','title'],dtype={'movieId': 'int32', 'title': 'str'})
rating_df=pd.read_csv('C:/Users/kkr0219/Documents/ML - KNN/ml-20m/ratings.csv',usecols=['userId', 'movieId', 'rating'],dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})

#movies_df.head()
movies_arr = movies_df.values
movies_df.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [138]:
rating_df.head()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


In [139]:
df = pd.merge(rating_df,movies_df,on='movieId')
df.head()

Unnamed: 0,userId,movieId,rating,title
0,1,2,3.5,Jumanji (1995)
1,5,2,3.0,Jumanji (1995)
2,13,2,3.0,Jumanji (1995)
3,29,2,3.0,Jumanji (1995)
4,34,2,3.0,Jumanji (1995)


In [140]:
combine_movie_rating = df.dropna(axis = 0, subset = ['title'])
movie_ratingCount = (combine_movie_rating.
     groupby(by = ['title'])['rating'].
     count().
     reset_index().
     rename(columns = {'rating': 'totalRatingCount'})
     [['title', 'totalRatingCount']]
    )
movie_ratingCount.head()

Unnamed: 0,title,totalRatingCount
0,"""Great Performances"" Cats (1998)",155
1,#chicagoGirl: The Social Network Takes on a Di...,3
2,$ (Dollars) (1971),24
3,$5 a Day (2008),39
4,$9.99 (2008),55


In [141]:
rating_with_totalRatingCount = combine_movie_rating.merge(movie_ratingCount, left_on = 'title', right_on = 'title', how = 'left')
rating_with_totalRatingCount.head()

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,1,2,3.5,Jumanji (1995),22243
1,5,2,3.0,Jumanji (1995),22243
2,13,2,3.0,Jumanji (1995),22243
3,29,2,3.0,Jumanji (1995),22243
4,34,2,3.0,Jumanji (1995),22243


In [142]:
popularity_threshold = 20000
rating_popular_movie= rating_with_totalRatingCount.query('totalRatingCount >= @popularity_threshold')
rating_popular_movie.head()

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,1,2,3.5,Jumanji (1995),22243
1,5,2,3.0,Jumanji (1995),22243
2,13,2,3.0,Jumanji (1995),22243
3,29,2,3.0,Jumanji (1995),22243
4,34,2,3.0,Jumanji (1995),22243


In [143]:
## First lets create a Pivot matrix

movie_features_df=rating_popular_movie.pivot_table(index='movieId',columns='userId',values='rating').fillna(0)
#movie_features_df.head()

#Convert pivot table to dataframe and then to array of vectors
flattened = pd.DataFrame(movie_features_df.to_records())
dataset = flattened.values #array of vectors
#dataset
flattened.shape

(160, 135726)

In [144]:
flattened.head()

Unnamed: 0,movieId,1,2,3,4,5,6,7,8,9,...,138484,138485,138486,138487,138488,138489,138490,138491,138492,138493
0,1,0.0,0.0,4.0,0.0,0.0,5.0,0.0,4.0,0.0,...,0.0,0.0,5.0,0.0,3.0,0.0,0.0,2.0,0.0,3.5
1,2,3.5,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,3.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,4.0
2,6,0.0,0.0,0.0,3.0,0.0,0.0,0.0,3.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10,0.0,0.0,0.0,4.0,0.0,0.0,0.0,4.0,0.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,17,0.0,0.0,0.0,0.0,3.0,5.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0


In [151]:
#Add random movie index selector from dataset code
rand_index = np.random.choice(len(dataset))

print("Movies similar to " + movies_arr[(int(dataset[rand_index][0]) - 1)][1] + " are :-")

similarity = get_neighbors(dataset, dataset[rand_index], 10)
recomovieindices = list()
for movie in similarity:
    movieid = int(movie[0] - 1)
    recomovieindices.append(movieid + 1)
    print(movies_arr[movieid][1])

Movies similar to Clueless (1995) are :-
Clueless (1995)
Babe (1995)
True Lies (1994)
Get Shorty (1995)
I Love Trouble (1994)
I Like It Like That (1994)
Rob Roy (1995)
Wyatt Earp (1994)
Pushing Hands (Tui shou) (1992)
Beauty of the Day (Belle de jour) (1967)


In [152]:
#Accuracy calculation

#Select random movie from dataset and choose first user with a non zero rating
for user_rating in range(1, len(dataset[rand_index]) - 1):
    if dataset[rand_index][user_rating] > 0:
        userid = user_rating
        break

#Select all movies for user from ratings dataframe
is_userid = rating_df['userId'] == userid
moviesbyuser = rating_df[is_userid]

movieindexes = moviesbyuser['movieId'].values

#Compare recommended movies with movies watched by user
count = 0
for movieid in recomovieindices:
    if np.count_nonzero(movieindexes == movieid) > 0:
        count = count + 1

print("Accuracy for recommender system is " + str(100*count/10) + "%")

Accuracy for recommender system is 90.0%
