In [1]:
import os
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares


In [2]:
#데이터 준비를 해준다 + 전처리 포함!
rating_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'ratings', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python', encoding = "ISO-8859-1")
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [3]:
# 여기서 3점만 남깁니다! 
ratings = ratings[ratings['ratings']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')
# 3점이상인 데어터에서 전제데이터의 83.63%를 차지한다고 알수있다!

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [4]:
# ratings 컬럼의 이름을 counts로 바꿉니다! 
ratings.rename(columns={'ratings':'counts'}, inplace=True)
ratings.tail()

Unnamed: 0,user_id,movie_id,counts,timestamp
1000203,6040,1090,3,956715518
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648
1000208,6040,1097,4,956715569


In [5]:
ratings['counts']

0          5
1          3
2          3
3          4
4          5
          ..
1000203    3
1000205    5
1000206    5
1000207    4
1000208    4
Name: counts, Length: 836478, dtype: int64

In [6]:
# 영화 제목을 보기 위해 메타 데이터를 읽어옵니다. 근데 왜 NAN 나올까? 태훈이는 생각을 한다.
movie_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding='ISO-8859-1')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
temp_user_data = data['user_id'].map(user_to_idx.get).dropna()
print(temp_user_data)

if len(temp_user_data) == len(data):   # 모든 row가 정상적으로 인덱싱되었다면
    print('user_id column indexing OK!!')
    data['user_id'] = temp_user_data   # data['user_id']을 인덱싱된 Series로 교체해 줍니다. 
else:
    print('user_id column indexing Fail!!')

# artist_to_idx을 통해 artist 컬럼도 동일한 방식으로 인덱싱해 줍니다. 
temp_artist_data = data['artist'].map(artist_to_idx.get).dropna()
if len(temp_artist_data) == len(data):
    print('artist column indexing OK!!')
    data['artist'] = temp_artist_data
else:
    print('artist column indexing Fail!!')

NameError: name 'data' is not defined

In [8]:
ratings['movie_id'].nunique()

3628

In [9]:
ratings['user_id'].nunique()

6039

In [10]:
def check_movie(my_movies, movies=movies):
    return [True if (movies['title']==x).any() else False for x in my_movies]

my_movies_title = ['Dead Poets Society (1989)', 'Good Will Hunting (1997)', 'Toy Story (1995)', 'Notting Hill (1999)', 'Forrest Gump (1994)']
print(check_movie(my_movies_title, movies))


[True, True, True, True, True]


In [11]:
def check_movie(my_movies, movies=movies):
    return [True if (movies['title']==x).any() else False for x in my_movies]

my_movies_title = ['Dead Poets Society (1989)', 'Good Will Hunting (1997)', 'Toy Story (1995)', 'Notting Hill (1999)', 'Forrest Gump (1994)']
print(check_movie(my_movies_title, movies))

# 영화리스트를 index로 변환해주는 함수
def title2index(my_movies, movies):
    return [movies[movies['title'] == movie]['movie_id'].values[0] for movie in my_movies]

my_movies_index = title2index(my_movies_title, movies)
print(my_movies_index)

# 사용자 인덱스 생성
my_id = ratings['user_id'].max() + 1

# pandas dataframe으로 바꿔줌
my_movie_list = pd.DataFrame({'user_id': [my_id]*5, 'movie_id': my_movies_index, 'count': [5.0]*5, 'timestamp': [956715648]*5})

# ratings에 추가
if not ratings.isin({'user_id':[my_id]})['user_id'].any():
    ratings = ratings.append(my_movie_list)

ratings.tail(10)

[True, True, True, True, True]
[1246, 1704, 1, 2671, 356]


Unnamed: 0,user_id,movie_id,counts,timestamp,count
1000203,6040,1090,3.0,956715518,
1000205,6040,1094,5.0,956704887,
1000206,6040,562,5.0,956704746,
1000207,6040,1096,4.0,956715648,
1000208,6040,1097,4.0,956715569,
0,6041,1246,,956715648,5.0
1,6041,1704,,956715648,5.0
2,6041,1,,956715648,5.0
3,6041,2671,,956715648,5.0
4,6041,356,,956715648,5.0


In [12]:
check_none = ratings.isnull().sum()  # null check -> 없음

n_user = ratings['user_id'].nunique()    # 6040
n_movie = ratings['movie_id'].nunique()  # 3628
print(type(n_user), n_user)
print(type(n_movie), n_movie)
# count내 어떤 종류의 별점이 있는지 확인
print(ratings['count'].unique())  # [5. 3. 4.]



<class 'int'> 6040
<class 'int'> 3628
[nan  5.]


In [13]:
als_model = AlternatingLeastSquares(factors=300, regularization=0.01, use_gpu=False, iterations=20, dtype=np.float32)

csr_data_transpose = csr_data.T

# 모델 훈련
als_model.fit(csr_data_transpose)



NameError: name 'csr_data' is not defined

In [14]:
def get_movie_name(idx):
    if idx in movies.movie_id:
        return movies[movies['movie_id'] == idx]['title'].values[0]
    else:
        print('해당 인덱스의 영화가 존재하지 않습니다.')    

# 내 벡터와 영화 포레스트검프의 벡터가져오기
forrest_gump_id = my_movies_index[4]
my_vector, forrest_gump_vector = als_model.user_factors[my_id], als_model.item_factors[my_movies_index[4]]
# my_vector와 forrest_gump_vector를 내적하는 코드
a = np.dot(my_vector, forrest_gump_vector)

# my_vector와 father_of_the_bride_vector를 내적하는 코드
father_of_the_bride_id = 5
father_of_the_bride_vector = als_model.item_factors[father_of_the_bride_id]
b = np.dot(my_vector, father_of_the_bride_vector)

print(f'내가 선호하는 영화 {get_movie_name(forrest_gump_id)}와의 선호도 : {a}')
print(f'그 외의 영화 {get_movie_name(father_of_the_bride_id)}와의 선호도 : {b}')

TypeError: 'NoneType' object is not subscriptable

In [15]:

def get_similar_movie(movie_title, movies, n=10):
    exist_movie = check_movie([movie_title], movies)[0]

    if exist_movie:
        movie_id = title2index([movie_title], movies)[0]
        similar_movie = als_model.similar_items(movie_id, N=n)
        similar_movie = [get_movie_name(i[0]) for i in similar_movie]
        return similar_movie
    
    print('해당 영화가 데이터에 없습니다.')
    return None

print(get_similar_movie(my_movies_title[0], movies))


TypeError: 'NoneType' object is not subscriptable

In [16]:
movie_recommended = als_model.recommend(my_id, csr_data, N=30, filter_already_liked_items=True)
[get_movie_name(i[0]) for i in movie_recommended]

NameError: name 'csr_data' is not defined

In [17]:
# 추천 기여도 확인
recommend_movie_id = movie_recommended[0][0]
explain = als_model.explain(my_id, csr_data, itemid=recommend_movie_id)
[(get_movie_name(i[0]), i[1]) for i in explain[1]] 

NameError: name 'movie_recommended' is not defined