In [1]:
import os
import numpy as np
import pandas as pd

### data

In [2]:
base_src = 'D:/기타자료/AI 스터디/기타/data/Python을 이용한 개인화 추천시스템'

In [3]:
# user data
u_user_src = os.path.join(base_src, 'u.user')
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']

users = pd.read_csv(u_user_src, sep='|', names=u_cols, encoding='latin-1')
# users = users.set_index('user_id')
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [4]:
# movie data
u_item_src = os.path.join(base_src, 'u.item')
i_cols = ['movie_id', 'title', 'release date', 'video release date', 'IMDB URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s',
          'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

movies = pd.read_csv(u_item_src, sep='|', names=i_cols, encoding='latin-1')
# movies = movies.set_index('movie_id')
movies.head()

Unnamed: 0,movie_id,title,release date,video release date,IMDB URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [5]:
# rating data
u_rating_src = os.path.join(base_src, 'u.data')
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']

ratings = pd.read_csv(u_rating_src, sep='\t', names=r_cols, encoding='latin-1')
# ratings = ratings.set_index('movie_id')
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [6]:
ratings = ratings.drop('timestamp', axis=1)
movies = movies[['movie_id', 'title']]

### 단순 CF 알고리즘

In [7]:
# 데이터 train, test set 분리
from sklearn.model_selection import train_test_split
import numpy as np

# 정확도(RMSE)를 계산하는 함수
def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred))**2))

# 모델별 RMSE를 계산하는 함수
def score(model):
    id_pairs = zip(x_test['user_id'], x_test['movie_id'])
    y_pred = np.array([model(user, movie) for (user, movie) in id_pairs])
    y_true = np.array(x_test['rating'])
    return RMSE(y_true, y_pred)

# dataset 만들기
x = ratings.copy()
y = ratings['user_id']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, stratify=y)

# pivot 
ratings_matrix = x_train.pivot(index='user_id', columns='movie_id', values='rating')

# cosine sim 계산
from sklearn.metrics.pairwise import cosine_similarity

matrix_dummy = ratings_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)  # cosine_similarity 대신 피어슨 상관계수 사용해도 됨

user_similarity = pd.DataFrame(user_similarity, index=ratings_matrix.index, columns=ratings_matrix.index)

# 주어진 영화의(movie_id) 가중평균 rating을 계산하는 함수
def CF_simple(user_id, movie_id):
    if movie_id in ratings_matrix.columns:
        sim_scores = user_similarity[user_id].copy()
        movie_ratings = ratings_matrix[movie_id].copy()
        none_rating_idx = movie_ratings[movie_ratings.isnull()].index
        
        movie_ratings = movie_ratings.dropna()
        sim_scores = sim_scores.drop(none_rating_idx)
        
        # 가중평균
        mean_rating = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
        
    else:
        mean_rating = 3.0
        
    return mean_rating 

In [8]:
# score 계산
score(CF_simple)

1.0187844051470987

### 이웃을 고려한 CF
- 모든 유사도 사용자들 간의 유사도를 가중평균 하는 것이 아니라,, 정말로 유사도가 높은 사용자들만 선정해서 그 사람들과의 평점만을 가지고 가중평균을 해서 예측치를 냄

In [116]:
# 데이터 train, test set 분리
from sklearn.model_selection import train_test_split
import numpy as np

# 정확도(RMSE)를 계산하는 함수
def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred))**2))

# 유사집단의 크기를 미리 정하기 위해서 기존 score 함수에 neighbor_size 인자값 추가
def score(model,neighbor_size=0):
    id_pairs = zip(x_test['user_id'],x_test['movie_id'])
    y_pred = np.array([model(user, movie, neighbor_size) for (user, movie) in id_pairs])
    y_true = np.array(x_test['rating'])
    
    return RMSE(y_true, y_pred)

# dataset 만들기
x = ratings.copy()
y = ratings['user_id']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, stratify=y)

# pivot 
ratings_matrix = x_train.pivot(index='user_id', columns='movie_id', values='rating')

# cosine sim 계산
from sklearn.metrics.pairwise import cosine_similarity

matrix_dummy = ratings_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)  # cosine_similarity 대신 피어슨 상관계수 사용해도 됨

user_similarity = pd.DataFrame(user_similarity, index=ratings_matrix.index, columns=ratings_matrix.index)

### Neighbor size를 정해서 예측치를 계산하는 함수 ###
def CF_knn(user_id, movie_id, neighbor_size):
    if movie_id in ratings_matrix.columns:
        sim_scores = user_similarity[user_id].copy()
        movie_ratings = ratings_matrix[movie_id].copy()
        none_rating_idx = movie_ratings[movie_ratings.isnull()].index
        
        movie_ratings = movie_ratings.dropna()
        sim_scores = sim_scores.drop(none_rating_idx)
        
        # 가중평균
        if neighbor_size == 0:
            mean_rating = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
        
        else:
            if len(sim_scores) > 1:  # 나랑 유사한 게 2,3개 이렇게 있는 경우에는 ?!
                neighbor_size = min(neighbor_size, len(sim_scores))
                sim_scores = np.array(sim_scores)
                movie_ratings = np.array(movie_ratings)
                
                user_idx = np.argsort(sim_scores)                   # sim score 가 작은 값을 순서대로 
                sim_scores = sim_scores[user_idx][-neighbor_size:]  # 최상위 neighbor_size 개 뽑기
                movie_ratings = movie_ratings[user_idx][-neighbor_size:]  # 최상위 neighbor_size 개 뽑기
                mean_rating = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
            
            else:
                mean_rating = 3.0
    else:
        mean_rating = 3.0
                
    return mean_rating 

In [117]:
score(CF_knn, neighbor_size=20)

1.0198012365123228

In [128]:
# 실제 주어진 사용자에 대해 추천을 받는 기능 구현
rating_matrix = ratings.pivot_table(values='rating', index='user_id', columns='movie_id')
martix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(user_similarity, index=rating_matrix.index, columns=rating_matrix.index)

def recom_movie(user_id, n_items, neighbor_size=30):
    user_movie = rating_matrix.loc[user_id].copy()
    
    for movie in rating_matrix.columns:
        if pd.notnull(user_movie.loc[movie]):
            user_movie.loc[movie] = 0
            
        else:
            user_movie.loc[movie] = CF_knn(user_id, movie, neighbor_size)
            
    movie_sort = user_movie.sort_values(ascending=False)[:n_items]
    recom_movies = movies.loc[movie_sort.index]
    recommendations = recom_movies['title']
    
    return recommendations

In [129]:
# 729 user한테 5개의 아이템을 추천해줘라 (단, 너랑 유사한 30명의 유저를 고려해서 추천해줘라)
recom_movie(user_id=729, n_items=5, neighbor_size=30) 

movie_id
1175                           Welcome To Sarajevo (1997)
1467                                     Cure, The (1995)
1500    Prisoner of the Mountains (Kavkazsky Plennik) ...
973                                 Eye for an Eye (1996)
868                                  Fools Rush In (1997)
Name: title, dtype: object