<a href="https://colab.research.google.com/github/kjw9797/2020-2_BigData_TP/blob/main/movielens_knn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the CSV paths under /content/drive/MyDrive resolve.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Input CSV locations on the mounted Google Drive:
# file1 is loaded as the ratings table and file2 as the movies table.
file1 = '/content/drive/MyDrive/ratings.csv'
file2 = '/content/drive/MyDrive/movies.csv'

In [None]:
import pandas as pd
import numpy as np
import glob
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import time

In [None]:
# Load the ratings and movie-metadata tables from the configured CSV paths.
df_ratings = pd.read_csv(file1)
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0, where passing it
# raises TypeError.  `on_bad_lines='warn'` reproduces the old behaviour of
# `error_bad_lines=False` with the default `warn_bad_lines=True`: malformed rows are
# skipped and a warning is emitted for each of them.
df_movies = pd.read_csv(file2, on_bad_lines='warn')

In [None]:
# Join ratings with movies on movieId and keep only (movieId, userId, rating), ordered by
# movieId.  The outer join keeps rows that exist on only one side, so a movie without any
# rating still appears once; its missing userId/rating are then replaced by 0 via fillna(0).
# NOTE(review): such placeholder rows count as one rating of 0.0 in any later per-movie
# aggregation — confirm this is intended.
df = (
    df_ratings.drop(columns='timestamp')
    .merge(df_movies.drop(columns='genres'), how='outer', on='movieId')
    [['movieId', 'userId', 'rating']]
    .sort_values(by=['movieId'])
    .fillna(0)
)

In [None]:
# Fixed genre vocabulary.  The order matters: position i of a movie's 0/1 genre vector
# corresponds to genre_list[i].
genre_list = [
    'Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy',
    'Romance', 'Drama', 'Action', 'Crime', 'Thriller',
    'Horror', 'Mystery', 'Sci-Fi', 'IMAX', 'Documentary',
    'War', 'Musical', 'Western', 'Film-Noir', '(no genres listed)',
]

In [None]:
# Preview the first rows of the merged (movieId, userId, rating) frame.
df.head()

Unnamed: 0,movieId,userId,rating
643435,1,120023.0,4.0
605233,1,11386.0,3.0
605232,1,11385.0,3.0
605231,1,11382.0,5.0
605230,1,11380.0,4.5


In [None]:
from sklearn.neighbors import NearestNeighbors
# Unfitted brute-force nearest-neighbour model using the cosine metric, 20 neighbours,
# and n_jobs=-1 for parallel search.
# NOTE(review): model_knn is not referenced by any other cell visible here — confirm it
# is still needed.
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)

In [None]:
# Per-movie rating statistics: columns ('rating', 'size') and ('rating', 'mean').
# String aliases are used instead of np.size / np.mean because passing NumPy callables
# to groupby.agg is deprecated in recent pandas (FutureWarning); the resulting column
# labels ('size', 'mean') and values are unchanged.
movieProperties = df.groupby('movieId').agg({'rating': ['size', 'mean']})

# Popularity proxy: number of ratings per movie, kept as a one-column frame named 'size'.
movieNumRatings = pd.DataFrame(movieProperties['rating']['size'])
# Min-max scale the rating counts into [0, 1] so they are comparable to a cosine distance.
# (If every movie had the same count the denominator would be 0 and the result NaN.)
numRatingsMin = movieNumRatings.min()
numRatingsMax = movieNumRatings.max()
movieNormalizedNumRatings = (movieNumRatings - numRatingsMin) / (numRatingsMax - numRatingsMin)

In [None]:
# Preview the first rows of the movies table (movieId, title, genres).
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
# Build the lookup used by the distance and recommendation cells:
#   movieDict[movieId] = (title, genre 0/1 vector, normalized rating count, mean rating)
# Columns are read by name rather than by position (the original sliced row[2:]), so the
# result no longer depends on the column order of the movies table, and zip() over the
# columns avoids the per-row Series construction of iterrows().
movieDict = {}
for rawMovieId, name, genreField in zip(df_movies['movieId'], df_movies['title'], df_movies['genres']):
  movieID = int(rawMovieId)
  # Genres are stored as a '|'-separated string, e.g. 'Comedy|Drama|Romance'.
  movieGenres = set(genreField.split('|'))
  # One slot per entry of genre_list, in genre_list order.
  genreVector = np.array([1 if gen in movieGenres else 0 for gen in genre_list])
  movieDict[movieID] = (
      name,
      genreVector,
      movieNormalizedNumRatings.loc[movieID].get('size'),
      movieProperties.loc[movieID].rating.get('mean'),
  )

In [None]:
from scipy import spatial
 
# Distance between two movies based on genre (cosine) and popularity
def ComputeDistance(a, b):
    """Return the dissimilarity between two movie records.

    Each record is indexable, with the genre indicator vector at position 1 and the
    popularity score at position 2.  The result is the cosine distance between the
    two genre vectors plus the absolute difference of the two popularity scores.
    """
    genre_gap = spatial.distance.cosine(a[1], b[1])
    popularity_gap = abs(a[2] - b[2])
    return genre_gap + popularity_gap
 
# Spot check: distance between the movies with ids 1 and 4.
ComputeDistance(movieDict[1], movieDict[4])

1.4141044603571462

In [None]:
import operator
 
# Find the nearest neighbours of a movie
def getNeighbors(movieID, K):
    """Return the K movies closest to `movieID` and their distances.

    Args:
        movieID: key of the query movie in `movieDict`.
        K: maximum number of neighbours to return.

    Returns:
        A pair `(neighbors, neighbor_distance)` of parallel lists: neighbour movie ids
        ordered from closest to farthest, and the matching distances.  If fewer than K
        other movies exist, all of them are returned instead of raising IndexError;
        a non-positive K yields two empty lists.

    Raises:
        KeyError: if `movieID` is not present in `movieDict`.
    """
    target = movieDict[movieID]
    distances = [
        (movie, ComputeDistance(target, movieDict[movie]))
        for movie in movieDict
        # The query movie is never its own neighbour
        if movie != movieID
    ]
    # Sort by distance so the closest movies come first
    distances.sort(key=operator.itemgetter(1))
    # Clamp at 0: a negative slice bound would drop items from the end instead
    nearest = distances[:max(K, 0)]
    neighbors = [movie for movie, _ in nearest]
    neighbor_distance = [dist for _, dist in nearest]
    return neighbors, neighbor_distance
 
 
# Final recommendation
def recommend(movieID,K):
    """Print the nearest neighbours of a movie and the mean of their average ratings.

    Prints the `movieDict` record of `movieID`, then one line per neighbour (title,
    distance, average rating), then the mean of the neighbours' average ratings.

    Args:
        movieID: key of the query movie in `movieDict`.
        K: number of neighbours requested from `getNeighbors`.

    The mean is taken over the neighbours actually returned, so it stays correct if
    fewer than K are available.  When there are no neighbours (e.g. K == 0) the summary
    line is skipped instead of raising ZeroDivisionError.
    """
    print(movieDict[movieID], '\n')
    neighbors, neighbor_distance = getNeighbors(movieID, K)
    avgRating = 0
    for neighbor, dist in zip(neighbors, neighbor_distance):
        entry = movieDict[neighbor]
        # Accumulate the neighbour's average rating (position 3 of the record)
        avgRating += entry[3]
        print(entry[0] + ", with distance of " + str(dist) + "  avg ratings: " + str(entry[3]))
    if not neighbors:
        # Nothing to average
        return
    avgRating /= len(neighbors)
    print("평균 Rating: ",avgRating)
 
# Example: the 10 nearest neighbours of the movie with id 1.
recommend(1,10)

('Toy Story (1995)', array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 0.703251932752485, 3.893707794587238) 

Shrek (2001), with distance of 0.27127436472581523  avg ratings: 3.7548282627709617
Monsters, Inc. (2001), with distance of 0.27901583016321024  avg ratings: 3.8486202707393264
Aladdin (1992), with distance of 0.3710148484476623  avg ratings: 3.6987526802388584
Toy Story 2 (1999), with distance of 0.37762915695177324  avg ratings: 3.8114636719927644
Finding Nemo (2003), with distance of 0.3828706369544343  avg ratings: 3.8339767227471766
Lord of the Rings: The Fellowship of the Ring, The (2001), with distance of 0.3868474499273009  avg ratings: 4.091188818716808
Lord of the Rings: The Two Towers, The (2002), with distance of 0.4432715510440024  avg ratings: 4.0680511556963515
Monty Python and the Holy Grail (1975), with distance of 0.4657518397780283  avg ratings: 4.147655276621689
Incredibles, The (2004), with distance of 0.5282243220026996  avg ratings: 3.

In [None]:
# Interactive lookup: repeatedly ask for (part of) a title and print recommendations for
# the first movie whose title contains it (case-sensitive substring match).
# The original loop had no way to terminate — its `break` only left the inner `for` —
# so a blank entry now ends the session.
while True:
  movie_name = input('movie name: ')
  if not movie_name.strip():
    break
  for idx in movieDict:
    if movie_name in movieDict[idx][0]:
      recommend(idx, 10)
      break
  else:
    # for/else: reached only when no title contained the query
    print('no movie title contains: ' + movie_name)