<a href="https://colab.research.google.com/github/leonistor/ml-manning/blob/master/06-data-mining-machine-learning-techniques/KNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# KNN (K-Nearest Neighbors)

In [0]:
# ! wget 'https://github.com/miaomiao3/tensorflow_fm/raw/master/ml-100k/u.data'
# ! pwd
# ! ls -lah
# prepend user 0 data
# ! echo -e '0\t50\t5\t881250949\n0\t172\t5\t881250949\n0\t133\t1\t881250949' > user0.data
# ! mv u.data userall.data
# ! cat user0.data userall.data > u.data
# ! head u.data

In [12]:
import pandas as pd
# Load only the first three tab-separated columns of u.data; any further
# columns in the file are not read.
r_cols = ['user_id', 'movie_id', 'rating']
ratings = pd.read_csv(
    'u.data',
    sep='\t',
    names=r_cols,
    usecols=[0, 1, 2],
)
ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,0,50,5
1,0,172,5
2,0,133,1
3,196,242,3
4,186,302,3


In [27]:
import numpy as np
# Aggregate per movie: number of ratings ('size') and average rating ('mean').
# String aliases are used instead of the numpy callables np.size / np.mean,
# which recent pandas versions deprecate as aggregation functions; the
# resulting column labels ('size', 'mean') are unchanged.
movieProperties = ratings.groupby('movie_id').agg({'rating': ['size', 'mean']})
movieProperties.head()

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
movie_id,Unnamed: 1_level_2,Unnamed: 2_level_2
1,452,3.878319
2,131,3.206107
3,90,3.033333
4,209,3.550239
5,86,3.302326


In [16]:
# Min-max normalize the per-movie rating counts: the least-rated movie maps
# to 0 and the most-rated movie maps to 1.
movieNumRatings = movieProperties['rating']['size'].to_frame()
movieNormalizedNumRatings = (
    (movieNumRatings - movieNumRatings.min())
    / (movieNumRatings.max() - movieNumRatings.min())
)
movieNormalizedNumRatings.head()

Unnamed: 0_level_0,size
movie_id,Unnamed: 1_level_1
1,0.773585
2,0.222985
3,0.152659
4,0.356775
5,0.145798


In [40]:
# extract genre
# ! wget 'https://raw.githubusercontent.com/miaomiao3/tensorflow_fm/master/ml-100k/u.item'
# ! head u.item

# Build movieDict: movie id -> (name, genre flags, normalized number of
# ratings, mean rating). u.item is read as a '|'-separated text file.
movieDict = {}
with open('u.item', encoding='iso-8859-1') as f:
  for line in f:
    # Skip blank lines (e.g. a trailing empty line) instead of failing on int('').
    if not line.strip():
      continue
    fields = line.rstrip('\n').split('|')
    movieID = int(fields[0])
    name = fields[1]
    # The genre columns are stored as text; convert them to a list of ints.
    genres = list(map(int, fields[5:25]))
    movieDict[movieID] = (name, genres, movieNormalizedNumRatings.loc[movieID].get('size'),
                          movieProperties.loc[movieID].rating.get('mean'))

movieDict[1]

('Toy Story (1995)',
 [0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 0.7735849056603774,
 3.8783185840707963)

In [42]:
# compute similarity: "distance" based on how similar the genres are and how similar the popularity is
from scipy import spatial

def ComputeDistance(a, b):
  """Return a dissimilarity score between two movie records.

  Args:
    a, b: indexable movie records where index 1 is the genre vector and
      index 2 is the normalized popularity.

  Returns:
    The cosine distance between the two genre vectors plus the absolute
    difference of the two popularity values. Smaller means more similar.
  """
  genresA = a[1]
  genresB = b[1]
  # Cosine distance is undefined when either vector is all zeros (division by
  # a zero norm). Treat that case as "no genre in common" (distance 1.0) so a
  # NaN can never leak into the result and corrupt a later sort.
  if not any(genresA) or not any(genresB):
    genresDistance = 1.0
  else:
    genresDistance = spatial.distance.cosine(genresA, genresB)
  popDistance = abs(a[2] - b[2])
  return genresDistance + popDistance

# Example: distance between the records for movie 2 and movie 4
ComputeDistance(movieDict[2], movieDict[4])

0.8004574042309892

In [43]:
# Show the full records of movies 2 and 4
for exampleID in (2, 4):
  print(movieDict[exampleID])

('GoldenEye (1995)', [0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], 0.22298456260720412, 3.2061068702290076)
('Get Shorty (1995)', [0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 0.3567753001715266, 3.550239234449761)


## Neighbors

In [56]:
import operator

def getNeighbors(movieID, K):
  """Return the ids of the K movies closest to `movieID`.

  Distances are computed with ComputeDistance between the record of
  `movieID` and every other record in movieDict.

  Args:
    movieID: key of the reference movie in movieDict.
    K: maximum number of neighbors to return.

  Returns:
    A list of movie ids ordered from nearest to farthest. If fewer than K
    other movies exist, all of them are returned; a non-positive K yields
    an empty list.
  """
  target = movieDict[movieID]
  distances = [(movie, ComputeDistance(target, movieDict[movie]))
               for movie in movieDict
               if movie != movieID]
  # Stable sort: movies at equal distance keep their movieDict order.
  distances.sort(key=lambda pair: pair[1])
  # Slice instead of indexing so a K larger than the number of candidates
  # does not raise IndexError; clamp at 0 so a negative K stays empty.
  return [movie for movie, dist in distances[:max(K, 0)]]

# Predict the rating of movie 1 as the mean of its nearest neighbors' mean ratings.
K = 10
neighbors = getNeighbors(1, K)
avgRating = 0
for neighbor in neighbors:
  neighborInfo = movieDict[neighbor]
  avgRating += neighborInfo[3]
  print(neighborInfo[0] + " " + str(neighborInfo[3]))

# Average over the neighbors actually obtained rather than the requested K, so
# the mean stays correct if fewer are available; NaN when there are none.
avgRating = avgRating / len(neighbors) if neighbors else float('nan')

Liar Liar (1997) 3.156701030927835
Aladdin (1992) 3.8127853881278537
Willy Wonka and the Chocolate Factory (1971) 3.6319018404907975
Monty Python and the Holy Grail (1974) 4.0664556962025316
Full Monty, The (1997) 3.926984126984127
George of the Jungle (1997) 2.685185185185185
Beavis and Butt-head Do America (1996) 2.7884615384615383
Birdcage, The (1996) 3.4436860068259385
Home Alone (1990) 3.0875912408759123
Aladdin and the King of Thieves (1996) 2.8461538461538463


In [50]:
# Average of the K nearest neighbors' mean ratings, used as the predicted rating for movie 1
avgRating

3.3445905900235564

In [51]:
# Actual record for movie 1; its last element is the movie's mean rating, to compare with the prediction
movieDict[1]

('Toy Story (1995)',
 [0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 0.7735849056603774,
 3.8783185840707963)