In [1]:
%pip install scikit-learn



Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

url = 'http://files.grouplens.org/datasets/movielens/ml-latest-small.zip'

import zipfile
import requests 
from io import BytesIO

r = requests.get(url)
z = zipfile.ZipFile(BytesIO(r.content))
z.extractall()



In [3]:
ratings = pd.read_csv('ml-latest-small/ratings.csv')
movies = pd.read_csv('ml-latest-small/movies.csv')

In [4]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
ratings.shape, movies.shape

((100836, 4), (9742, 3))

## 데이터 전처리

In [8]:
# 사용자 - 아이템 평점 매트릭스 생성
user_item_matrix = ratings.pivot(index='userId', columns = 'movieId', values='rating')
  # 모든 아이템에대한 사용자의 평점

# 결측치 채우기
user_item_matrix = user_item_matrix.fillna(0)

# 데이터 나누기
train_data, test_data = train_test_split(user_item_matrix, test_size=0.2, random_state=42)

In [9]:
user_item_matrix.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 유저기반 협업 필터링 (UBCF)

In [11]:
user_similarity = cosine_similarity(train_data)
user_similarity_df = pd.DataFrame(user_similarity, index=train_data.index, columns=train_data.index)
user_similarity_df.head()



userId,24,583,288,323,133,175,339,178,159,138,...,331,215,467,122,21,72,107,271,436,103
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
24,1.0,0.088871,0.165263,0.123879,0.169039,0.0,0.239542,0.195493,0.135244,0.0,...,0.227686,0.215958,0.041101,0.201336,0.23342,0.201595,0.077365,0.022907,0.064935,0.163884
583,0.088871,1.0,0.086646,0.053543,0.099652,0.0,0.069184,0.05952,0.133764,0.0,...,0.077343,0.064925,0.042167,0.095583,0.112393,0.063452,0.062813,0.0,0.09317,0.119046
288,0.165263,0.086646,1.0,0.148134,0.149359,0.008126,0.231072,0.185749,0.137454,0.041236,...,0.177872,0.19717,0.061113,0.25318,0.222707,0.18186,0.120905,0.119063,0.147292,0.34479
323,0.123879,0.053543,0.148134,1.0,0.326685,0.0,0.103652,0.222585,0.111773,0.0,...,0.087569,0.128223,0.053567,0.121997,0.11933,0.247436,0.271299,0.029527,0.328214,0.15799
133,0.169039,0.099652,0.149359,0.326685,1.0,0.0,0.114057,0.344794,0.133991,0.0,...,0.073031,0.1346,0.092182,0.146312,0.079232,0.42338,0.255691,0.041026,0.343221,0.130685


In [15]:
# 유저 기반 추천 함수
def user_based_recommand(user_id, num_recommendations=5):
  user_ratings = train_data.loc[user_id]
  similar_users = user_similarity_df[user_id]
  weighted_sum = np.dot(similar_users, train_data)
  sum_of_weights = np.sum(similar_users)

  # 예측 평점 : 내가 보지 못한 영화에 대한 예측 평점을 만듬
  predicted_ratings = weighted_sum / sum_of_weights

  # 이미 본거 제외
  user_seen_movies = user_ratings[user_ratings > 0].index
  predicted_ratings = pd.Series(predicted_ratings, index=train_data.columns).drop(user_seen_movies)

  # 추천 영화 출력
  recommedations = predicted_ratings.sort_values(ascending=False).head(num_recommendations)
  recommended_movies_titles = movies[movies['movieId'].isin(recommedations.index)]
  return recommended_movies_titles

# 추천 영화 출력
user_based_recommand(1, 5)



[1.78287037e+00 8.27328496e-01 4.42050173e-01 ... 3.18479077e-04
 3.18479077e-04 3.78483163e-03]
movieId
318     2.583128
589     1.988215
858     1.742447
2762    1.602517
150     1.562748
dtype: float64


Unnamed: 0,movieId,title,genres
123,150,Apollo 13 (1995),Adventure|Drama|IMAX
277,318,"Shawshank Redemption, The (1994)",Crime|Drama
507,589,Terminator 2: Judgment Day (1991),Action|Sci-Fi
659,858,"Godfather, The (1972)",Crime|Drama
2078,2762,"Sixth Sense, The (1999)",Drama|Horror|Mystery


## 컨텐츠 기반 추천

In [17]:
item_similarity = cosine_similarity(train_data.T)
# index와 columns는 모두 영화 ID를 나타냄
# 예를 들어 item_similarity_df[1][2]는 영화 ID 1과 영화 ID 2 사이의 유사도 값을 의미
item_similarity_df = pd.DataFrame(item_similarity, index=train_data.columns, columns=train_data.columns) # 즉 인덱스와 컬럼 모두 기존 train_data의 컬럼으로 나타낸다는거지?
item_similarity_df.head()

# 아이템 기반 추천 함수
def item_based_recommand(movie_title, num_recommendations=5):
  movie_id = movies[movies['title'] == movie_title]['movieId'].values[0]
  similar_movies = item_similarity_df[movie_id].sort_values(ascending=False).index[1:num_recommendations+1]
  return movies[movies['movieId'].isin(similar_movies)]

# 추천 영화 출력
item_based_recommand('Toy Story (1995)', 5)




Unnamed: 0,movieId,title,genres
224,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
322,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX
418,480,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller
546,648,Mission: Impossible (1996),Action|Adventure|Mystery|Thriller
615,780,Independence Day (a.k.a. ID4) (1996),Action|Adventure|Sci-Fi|Thriller
