# remove outliers from data and recommendation
Take User based CF for example

## load package

In [1]:
import heapq
from collections import defaultdict
from operator import itemgetter

from surprise import KNNBasic

from utils.MovieLens3 import MovieLens
from utils.RecommenderMetrics import RecommenderMetrics
from utils.EvaluationData import EvaluationData

In [2]:
def LoadMovieLensData():
    ml = MovieLens()
    print("Loading movie ratings...")
    data = ml.loadMovieLensLatestSmall()
    print("\nComputing movie popularity ranks so we can measure novelty later...")
    rankings = ml.getPopularityRanks()
    return (ml, data, rankings)

## read data

In [3]:
ml, data, rankings = LoadMovieLensData()

evalData = EvaluationData(data, rankings)

Loading movie ratings...
Raw ratings data:
   userId  movieId  rating   timestamp
0       1       31     2.5  1260759144
1       1     1029     3.0  1260759179
2       1     1061     3.0  1260759182
3       1     1129     2.0  1260759185
4       1     1172     4.0  1260759205
(100004, 4)
Ratings by user:
   userId  rating
0       1      20
1       2      76
2       3      51
3       4     204
4       5     100
Users with outliers computed:
   userId  outlier
0       1    False
1       2    False
2       3    False
3       4    False
4       5    False
Merged dataframes:
   userId  movieId  rating   timestamp  outlier
0       1       31     2.5  1260759144    False
1       1     1029     3.0  1260759179    False
2       1     1061     3.0  1260759182    False
3       1     1129     2.0  1260759185    False
4       1     1172     4.0  1260759205    False
Filtered ratings data:
   userId  movieId  rating
0       1       31     2.5
1       1     1029     3.0
2       1     1061     3.0
3   

## user based CF with KNN

In [4]:
# Train on leave-One-Out train set
trainSet = evalData.GetLOOCVTrainSet()
sim_options = {'name': 'cosine',
               'user_based': True
               }

model = KNNBasic(sim_options=sim_options)
model.fit(trainSet)
simsMatrix = model.compute_similarities()

leftOutTestSet = evalData.GetLOOCVTestSet()

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


In [5]:
# Build up dict to lists of (int(movieID), predictedrating) pairs
topN = defaultdict(list)
k = 10
for uiid in range(trainSet.n_users):
    # Get top N similar users to this one
    similarityRow = simsMatrix[uiid]
    
    similarUsers = []
    for innerID, score in enumerate(similarityRow):
        if (innerID != uiid):
            similarUsers.append( (innerID, score) )
    
    kNeighbors = heapq.nlargest(k, similarUsers, key=lambda t: t[1])
    
    # Get the stuff they rated, and add up ratings for each item, weighted by user similarity
    candidates = defaultdict(float)
    for similarUser in kNeighbors:
        innerID = similarUser[0]
        userSimilarityScore = similarUser[1]
        theirRatings = trainSet.ur[innerID]
        for rating in theirRatings:
            candidates[rating[0]] += (rating[1] / 5.0) * userSimilarityScore
        
    # Build a dictionary of stuff the user has already seen
    watched = {}
    for itemID, rating in trainSet.ur[uiid]:
        watched[itemID] = 1
        
    # Get top-rated items from similar users:
    pos = 0
    for itemID, ratingSum in sorted(candidates.items(), key=itemgetter(1), reverse=True):
        if not itemID in watched:
            movieID = trainSet.to_raw_iid(itemID)
            topN[int(trainSet.to_raw_uid(uiid))].append( (int(movieID), 0.0) )
            pos += 1
            if (pos > 40):
                break 

In [6]:
# Measure
print("HR", RecommenderMetrics.HitRate(topN, leftOutTestSet))

HR 0.04420731707317073
