In [27]:
import os
import pandas as pd
from sklearn.neighbors import NearestNeighbors


1 loading the data 데이터 불러오기

In [41]:
# Read both CSV files and discard the columns the recommender never uses:
# the genre list of each movie and the timestamp of each rating.
df_movies = pd.read_csv('movies.csv').drop(columns=['genres'])
df_ratings = pd.read_csv('ratings.csv').drop(columns=['timestamp'])

In [42]:
# Names of the two input CSV files.
# NOTE(review): none of the visible cells reference these variables; the
# loading cell spells the same file names out as string literals.
movies_filename, ratings_filename = 'movies.csv', 'ratings.csv'

2 reshaping the data 데이터 가공하기

In [62]:
from scipy.sparse import csr_matrix

# Pivot the long ratings table into a movie x user matrix: one row per
# movieId, one column per userId, the rating as the cell value. Pairs with
# no rating are filled with 0.
ratings_wide = df_ratings.pivot(index='movieId', columns='userId', values='rating')
movie_user_matrix = ratings_wide.fillna(0)

# Map every movie title to the row position of that movie in the matrix.
# Titles are looked up in the same movieId order as the matrix rows, so the
# enumeration index equals the row number.
titles_by_movie_id = df_movies.set_index('movieId')['title']
titles_in_row_order = titles_by_movie_id.loc[movie_user_matrix.index].tolist()
movie_to_idx = {
    title: row for row, title in enumerate(titles_in_row_order)
}

# Store the (mostly zero) matrix in compressed sparse row format.
movie_user_mat_sparse = csr_matrix(movie_user_matrix.values)

3 fitting the KNN model KNN(최근접 이웃) 모델 적용하기

In [63]:
%env JOBLIB_TEMP_FOLDER=/tmp
# define model
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
# fit
model_knn.fit(movie_user_mat_sparse)

env: JOBLIB_TEMP_FOLDER=/tmp


NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=-1, n_neighbors=20, p=2,
                 radius=1.0)

4 movie recommendations KNN 기준으로 영화 추천해주기

In [64]:
def fuzzy_matching(mapper, fav_movie, verbose=True, min_ratio=60):
    """
    return the index of the closest title match. If no match found, return None

    The similarity score is an integer from 0 to 100 computed with the
    standard library's ``difflib.SequenceMatcher`` on lower-cased strings.
    (The earlier version called ``fuzz.ratio``, but no visible cell of this
    notebook imports ``fuzz``, so it raised NameError on a fresh kernel.)

    Parameters
    ----------
    mapper: dict, map movie title name to index of the movie in data

    fav_movie: str, name of user input movie

    verbose: bool, print the list of candidate titles if True

    min_ratio: int, minimum score (0-100) for a title to count as a match

    Return
    ------
    index of the closest match, or None when no title reaches ``min_ratio``
    """
    # Imported locally so this function works without any extra notebook-level import.
    from difflib import SequenceMatcher

    # The query is the same for every comparison: lower-case it once and
    # keep it as the matcher's second sequence.
    matcher = SequenceMatcher(None, '', fav_movie.lower())

    match_tuple = []
    # get match
    for title, idx in mapper.items():
        matcher.set_seq1(title.lower())
        ratio = int(round(100 * matcher.ratio()))
        if ratio >= min_ratio:
            match_tuple.append((title, idx, ratio))
    # Stable ascending sort followed by a reversal: best score first, and
    # among equal scores the title that comes later in ``mapper`` wins.
    match_tuple = sorted(match_tuple, key=lambda x: x[2])[::-1]
    if not match_tuple:
        print('Oops! No match is found')
        return None
    if verbose:
        print('Found possible matches in our database: {0}\n'.format([x[0] for x in match_tuple]))
    return match_tuple[0][1]



def make_recommendation(model_knn, data, mapper, fav_movie, n_recommendations):
    """
    print the top n similar movie recommendations based on user's input movie

    The input title is resolved to a row of ``data`` with ``fuzzy_matching``;
    the model is then queried for the nearest rows and the matching titles
    are printed, farthest of the top n first and nearest last.

    Parameters
    ----------
    model_knn: sklearn model, knn model

    data: movie-user matrix

    mapper: dict, map movie title name to index of the movie in data

    fav_movie: str, name of user input movie

    n_recommendations: int, top n recommendations

    Return
    ------
    None. The recommendations are printed; when no title matches
    ``fav_movie`` the function returns right after the "no match" message
    printed by ``fuzzy_matching``.
    """
    # fit (done on every call, so an unfitted model can be passed in)
    model_knn.fit(data)
    # get input movie index
    print('You have input movie:', fav_movie)
    query_idx = fuzzy_matching(mapper, fav_movie, verbose=True)
    if query_idx is None:
        # No title matched; indexing ``data`` with None would fail below.
        return
    # inference
    print('Recommendation system start to make inference')
    print('......\n')
    # One extra neighbour is requested because the query movie is normally
    # returned as its own nearest neighbour.
    distances, indices = model_knn.kneighbors(data[query_idx], n_neighbors=n_recommendations+1)
    # A single query row yields arrays with one row; take that row instead of
    # squeeze(), which collapses to scalars when only one neighbour is requested.
    neighbours = sorted(
        zip(indices[0].tolist(), distances[0].tolist()),
        key=lambda pair: pair[1],
    )
    # Drop the query movie by index rather than by position: with tied
    # distances it is not guaranteed to be the first entry.
    closest_others = [
        (row, dist) for row, dist in neighbours if row != query_idx
    ][:n_recommendations]
    # Printed farthest-first, i.e. the closest movie is listed last.
    raw_recommends = closest_others[::-1]
    # get reverse mapper
    reverse_mapper = {v: k for k, v in mapper.items()}
    # print recommendations
    print('Recommendations for {}:'.format(fav_movie))
    for rank, (row, dist) in enumerate(raw_recommends, start=1):
        print('{0}: {1}, with distance of {2}'.format(rank, reverse_mapper[row], dist))


In [67]:
my_favorite = 'Toy Story'

make_recommendation(
    # algorithm to use: the fitted knn model
    model_knn=model_knn,
    # data to search: the sparse movie-user matrix prepared above
    data=movie_user_mat_sparse,
    # title -> row index mapping used to look the movie up
    mapper=movie_to_idx,
    # the favorite movie for which similar movies are wanted
    fav_movie=my_favorite,
    # how many recommendations to list
    n_recommendations=10)

You have input movie: Toy Story
Found possible matches in our database: ['Toy Story (1995)', 'Toy Story 3 (2010)', 'Toy Story 2 (1999)']

Recommendation system start to make inference
......

Recommendations for Toy Story:
1: Pulp Fiction (1994), with distance of 0.473138425840108
2: Star Wars: Episode VI - Return of the Jedi (1983), with distance of 0.4706659556910192
3: Shrek (2001), with distance of 0.46731490125936237
4: Jurassic Park (1993), with distance of 0.4648029221631048
5: Back to the Future (1985), with distance of 0.4632997104257994
6: Groundhog Day (1993), with distance of 0.45197697860072394
7: Independence Day (a.k.a. ID4) (1996), with distance of 0.4370543997396795
8: Forrest Gump (1994), with distance of 0.4354661385473587
9: Star Wars: Episode IV - A New Hope (1977), with distance of 0.4238121540234343
10: Toy Story 2 (1999), with distance of 0.4052901879681512
