In [3]:
import os
import pandas as pd

In [4]:
# 사용자 u.user파일을 DataFrame으로 읽기
base_src = "./"
u_user_src = os.path.join(base_src, "u.user")
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(u_user_src, sep='|', 
                    names=u_cols, 
                    encoding='latin-1')
users = users.set_index('user_id')
users.head()

Unnamed: 0_level_0,age,sex,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213


In [5]:
# u.item 파일을 DataFrame으로 읽기
u_item_src = os.path.join(base_src,'u.item')
i_cols = ['movie_id','title','release date','video release date',
          'IMDB URL','unknown','Action','Adventure','Animation',
          'Children\'s','Comedy','Crime','Documentary','Drama','Fantasy',
          'Film-Noir','Horror','Musical','Mystery','Romance','Sci-Fi','Thriller','War','Western']
movies = pd.read_csv(u_item_src,
	    sep='|',
            names=i_cols,
            encoding='latin-1')
movies = movies.set_index('movie_id')
movies.head()


Unnamed: 0_level_0,title,release date,video release date,IMDB URL,unknown,Action,Adventure,Animation,Children's,Comedy,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [6]:
# u.data 파일을 DataFrame으로 읽기
u_data_src = os.path.join(base_src,'u.data')
r_cols = ['user_id','movie_id','rating','timestamp']
ratings = pd.read_csv(u_data_src,
        sep = '\t',
        names = r_cols,
        encoding='latin-1')
ratings = ratings.set_index('user_id')
ratings.head()

Unnamed: 0_level_0,movie_id,rating,timestamp
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
196,242,3,881250949
186,302,3,891717742
22,377,1,878887116
244,51,2,880606923
166,346,1,886397596


In [7]:
# 인기 제품 방식 추천
def recommend_movie(n_items):
    movie_mean = ratings.groupby(['movie_id'])['rating'].mean()
    movie_sort = movie_mean.sort_values(ascending=False)[:n_items]
    recommend_movies = movies.loc[movie_sort.index]
    recommendations = recommend_movies[['title']]
    return recommendations

recommend_movie(5)

Unnamed: 0_level_0,title
movie_id,Unnamed: 1_level_1
814,"Great Day in Harlem, A (1994)"
1599,Someone Else's America (1995)
1201,Marlene Dietrich: Shadow and Light (1996)
1122,They Made Me a Criminal (1939)
1653,Entertaining Angels: The Dorothy Day Story (1996)


## Best-Seller
- 각 영화의 평균 평점(=best-seller 방식)을 예측값으로 두고
- 실제 사용자 평점과 비교해 RMSE(Root Mean Square Error) 를 계산하는 코드

In [18]:
# 100k의 영화 평점에 대해 실제값과 best-seller 방식으로 구현 예측값의 RMSE를 계산하는 코드
import numpy as np

# 값이 작을수록 예측이 실제에 가깝다는 의미
def RMSE(y_true, y_pred): 
    return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred))**2))

# 정확도 계산
rsme = []

# 영화별 평균 평점 (best-seller 방식)
movie_mean = ratings.groupby(['movie_id'])['rating'].mean()

for user in set(ratings.index):
    y_true = ratings.loc[user]['rating']

    # best-seller 방식으로
    y_pred = movie_mean[ratings.loc[user]['movie_id']]
    accuracy = RMSE(y_true, y_pred)
    rsme.append(accuracy)

# RMSE 계산
print(np.mean(rsme))

0.996007224010567


In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# ✅ 1️⃣ 데이터 불러오기 (MovieLens 100k 예시)
# ratings = pd.read_csv('u.data', sep='\t', names=['user_id', 'movie_id', 'rating', 'timestamp'])
# ratings = ratings.set_index('user_id')  # user_id를 인덱스로 설정

# ✅ 2️⃣ 사용자-영화 행렬 만들기 (NaN → 0으로 채움)
rating_matrix = ratings.pivot_table(values='rating', index=ratings.index, columns='movie_id').fillna(0)

# ✅ 3️⃣ 사용자 간 유사도 계산 (코사인 유사도)
user_similarity = pd.DataFrame(
    cosine_similarity(rating_matrix),
    index=rating_matrix.index,
    columns=rating_matrix.index
)

# ✅ 4️⃣ RMSE 함수
def RMSE(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    mask = (y_true > 0) & (y_pred > 0)
    if np.sum(mask) == 0:
        return np.nan
    return np.sqrt(np.mean((y_true[mask] - y_pred[mask])**2))

# ✅ 5️⃣ 사용자 기반 CF 예측 함수
def predict_user_rating(user_id):
    # 1) 유사 사용자 점수 불러오기
    sim_scores = user_similarity[user_id].copy()
    sim_scores[user_id] = 0  # 자기 자신 제외

    # 2) 유사도 합
    sim_sum = np.sum(np.abs(sim_scores))
    if sim_sum == 0:
        # 유사 사용자 없으면 전체 영화 평균 반환
        return rating_matrix.mean(axis=0)

    # 3) 가중합 계산 (유사도 * 평점)
    weighted_sum = rating_matrix.T.dot(sim_scores)
    pred = weighted_sum / sim_sum
    return pred

# ✅ 6️⃣ RMSE 계산
rmse_list = []

for user_id in rating_matrix.index:
    y_true = rating_matrix.loc[user_id]
    y_pred = predict_user_rating(user_id)
    rmse = RMSE(y_true, y_pred)
    if not np.isnan(rmse):
        rmse_list.append(rmse)

# ✅ 7️⃣ 결과 출력
print("User-based CF (Cosine) RMSE:", np.mean(rmse_list))


User-based CF RMSE: nan


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
