# 1. 데이터 가져오기

### (1) ratings.dat

In [1]:
import os
import pandas as pd

rating_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python')
orginal_data_size = len(ratings)
print(ratings.head())
print(len(ratings))

   user_id  movie_id  rating  timestamp
0        1      1193       5  978300760
1        1       661       3  978302109
2        1       914       3  978301968
3        1      3408       4  978300275
4        1      2355       5  978824291
1000209


In [2]:
# 3점 이상만 남깁니다.
ratings = ratings[ratings['rating']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [3]:
# rating 컬럼의 이름을 count로 바꿉니다.
ratings.rename(columns={'rating':'count'}, inplace=True)
ratings.head()

Unnamed: 0,user_id,movie_id,count,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [4]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 836478 entries, 0 to 1000208
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   user_id    836478 non-null  int64
 1   movie_id   836478 non-null  int64
 2   count      836478 non-null  int64
 3   timestamp  836478 non-null  int64
dtypes: int64(4)
memory usage: 31.9 MB


In [5]:
# count 값 별 비율 확인

group_by_count = ratings.groupby('count')['user_id'].count()
print(group_by_count.sort_values(ascending=False))
group_by_count = group_by_count.to_dict()
group_by_count

total_num = len(ratings)
print("total : ", total_num)

for key, value in group_by_count.items():
    print(f'{key} proportion : {value/total_num :.2%}')

count
4    348971
3    261197
5    226310
Name: user_id, dtype: int64
total :  836478
3 proportion : 31.23%
4 proportion : 41.72%
5 proportion : 27.06%


### (2) movies.dat

In [6]:
# 영화 제목을 보기 위해 메타 데이터를 읽어옵니다.
movie_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  3883 non-null   int64 
 1   title     3883 non-null   object
 2   genre     3883 non-null   object
dtypes: int64(1), object(2)
memory usage: 91.1+ KB


# 2. 데이터 살펴보기

- ratings에 있는 유니크한 영화 개수
- ratings에 있는 유니크한 사용자 수
- 가장 인기있는 영화 30개(인기순)


### (1) ratings에 존재하는 유니크한 영화 개수 확인

In [8]:
# ratings에 있는 유니크한 영화 개수

unique_movie_num = len(ratings['movie_id'].unique())
print("ratings file 내 유니크한 영화 개수 : ", unique_movie_num)

# ratings 파일의 unique한 영화 개수와 movies 파일의 영화 개수가 같은지 확인
print(unique_movie_num == len(movies))
print("movies file 내 유니크한 영화 개수 : ", len(movies))

ratings file 내 유니크한 영화 개수 :  3628
False
movies file 내 유니크한 영화 개수 :  3883


### (2) ratins에 존재하는 유니크한 사용자 수 확인

In [9]:
unique_user_num = len(ratings['user_id'].unique())
unique_user_num

6039

### (3) 가장 인기있는 영화 30개 추출

In [10]:
# 1) ratings / movies df 합치기
merged_list = pd.merge(ratings, movies, how='left')
merged_list.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 836478 entries, 0 to 836477
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   user_id    836478 non-null  int64 
 1   movie_id   836478 non-null  int64 
 2   count      836478 non-null  int64 
 3   timestamp  836478 non-null  int64 
 4   title      836478 non-null  object
 5   genre      836478 non-null  object
dtypes: int64(4), object(2)
memory usage: 44.7+ MB


In [11]:
# 2) 인기있는 상위 30개 영화 추출 (누적 평점 기준)
total_ratings = merged_list.groupby("title")["count"].sum()
total_ratings.sort_values(ascending=False).head(30)

title
American Beauty (1999)                                   14449
Star Wars: Episode IV - A New Hope (1977)                13178
Star Wars: Episode V - The Empire Strikes Back (1980)    12648
Saving Private Ryan (1998)                               11348
Star Wars: Episode VI - Return of the Jedi (1983)        11303
Raiders of the Lost Ark (1981)                           11179
Silence of the Lambs, The (1991)                         11096
Matrix, The (1999)                                       10903
Sixth Sense, The (1999)                                  10703
Terminator 2: Judgment Day (1991)                        10513
Fargo (1996)                                             10465
Schindler's List (1993)                                  10317
Braveheart (1995)                                        10125
Shawshank Redemption, The (1994)                         10085
Back to the Future (1985)                                10081
Godfather, The (1972)                            

# 3. 내가 선호하는 영화를 5가지 골라서 ratings에 추가해 줍시다.

### (1) 리스트 내 가장 선호하는 영화 5개 선정

In [12]:
# 평점 5점 받은 영화 제목 리스트 추출
len(set(merged_list[merged_list["count"].values==5]['title']))
set(merged_list[merged_list["count"].values==5]['title'])

{'Hunger, The (1983)',
 'Arlington Road (1999)',
 'Mr. Death: The Rise and Fall of Fred A. Leuchter Jr. (1999)',
 'Three Caballeros, The (1945)',
 'Happiness (1998)',
 'Mad Max 2 (a.k.a. The Road Warrior) (1981)',
 '52 Pick-Up (1986)',
 'Quest for Camelot (1998)',
 'Rambo: First Blood Part II (1985)',
 'Gods Must Be Crazy II, The (1989)',
 'Boogie Nights (1997)',
 'Herbie Goes to Monte Carlo (1977)',
 'Marlene Dietrich: Shadow and Light (1996)',
 'Perfect Blue (1997)',
 'Sandpiper, The (1965)',
 'Butterfly (La Lengua de las Mariposas) (2000)',
 'Ghosts of Mississippi (1996)',
 'Autumn Sonata (H�stsonaten ) (1978)',
 'Life of �mile Zola, The (1937)',
 'Wolf Man, The (1941)',
 'Dead Ringers (1988)',
 'Out-of-Towners, The (1999)',
 'Psycho Beach Party (2000)',
 'Texas Chainsaw Massacre, The (1974)',
 'Pagemaster, The (1994)',
 'Passion Fish (1992)',
 'Omega Code, The (1999)',
 'Anne Frank Remembered (1995)',
 'Rain Man (1988)',
 'Palm Beach Story, The (1942)',
 'Ipcress File, The (1965)',

In [13]:
# 2000년 이후 개봉한 영화 추출
titles = merged_list["title"].tolist()
opening_years  = set([x[-5:-1] for x in titles])
print(sorted(opening_years))
merged_list['opening_years'] = pd.to_numeric([x[-5:-1] for x in titles])
merged_list.head()
merged_list.info()
set(merged_list[merged_list["opening_years"].values==2000]['title'])

['1919', '1920', '1921', '1922', '1923', '1925', '1926', '1927', '1928', '1929', '1930', '1931', '1932', '1933', '1934', '1935', '1936', '1937', '1938', '1939', '1940', '1941', '1942', '1943', '1944', '1945', '1946', '1947', '1948', '1949', '1950', '1951', '1952', '1953', '1954', '1955', '1956', '1957', '1958', '1959', '1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968', '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000']
<class 'pandas.core.frame.DataFrame'>
Int64Index: 836478 entries, 0 to 836477
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   user_id        836478 non-null  int64 
 1   movie_id       836478 non-null  int64 
 2   count          836478 non-null  int64 
 3   timestamp      8

{'28 Days (2000)',
 '3 Strikes (2000)',
 'About Adam (2000)',
 'Adventures of Rocky and Bullwinkle, The (2000)',
 'Almost Famous (2000)',
 'American Psycho (2000)',
 'Anatomy (Anatomie) (2000)',
 'Art of War, The (2000)',
 'Autumn in New York (2000)',
 'Back Stage (2000)',
 'Bait (2000)',
 "Ballad of Ramblin' Jack, The (2000)",
 'Bamboozled (2000)',
 'Battlefield Earth (2000)',
 'Beach, The (2000)',
 'Beautiful (2000)',
 'Best in Show (2000)',
 'Beyond the Mat (2000)',
 'Big Kahuna, The (2000)',
 "Big Momma's House (2000)",
 'Bittersweet Motel (2000)',
 'Bless the Child (2000)',
 'Boiler Room (2000)',
 'Bootmen (2000)',
 'Boys and Girls (2000)',
 'Bring It On (2000)',
 'Broken Hearts Club, The (2000)',
 'Butterfly (La Lengua de las Mariposas) (2000)',
 'Catfish in Black Bean Sauce (2000)',
 'Cecil B. Demented (2000)',
 'Cell, The (2000)',
 'Center Stage (2000)',
 'Chain of Fools (2000)',
 'Chicken Run (2000)',
 'Chuck & Buck (2000)',
 'Circus (2000)',
 'Closer You Get, The (2000)',
 'C

### (2) 선정한 영화 movie_id 추출

In [14]:
# 내가 좋아하는 영화 5개 movie id 구하기 : man in black / Titanic / Bring it on / Sixth Sense / jumanji

movies['title'] = movies['title'].str.lower()
favorite_list = movies[movies['title'].str.contains("jumanji|bring it on|titanic|sixth sense|men in black")]
print(favorite_list)
favorite_movie_id = [2, 1721, 2762, 3404, 3882]

      movie_id                                   title  \
1            2                          jumanji (1995)   
1539      1580                     men in black (1997)   
1672      1721                          titanic (1997)   
2088      2157  chambermaid on the titanic, the (1998)   
2693      2762                 sixth sense, the (1999)   
3334      3403                raise the titanic (1980)   
3335      3404                          titanic (1953)   
3812      3882                      bring it on (2000)   

                               genre  
1       Adventure|Children's|Fantasy  
1539  Action|Adventure|Comedy|Sci-Fi  
1672                   Drama|Romance  
2088                         Romance  
2693                        Thriller  
3334                  Drama|Thriller  
3335                    Action|Drama  
3812                          Comedy  


In [15]:
# 선정한 영화가 ratings리스트에도 존재하는지 확인 => 해당과정 필요 X because merged_list 자체가 ratins df를 바탕으로 생성한 Df이기 때문

ratings_movie_id = ratings['movie_id'].tolist()
ratings_movie_id

for i in favorite_movie_id:
    if i in ratings_movie_id:
        print(i, "checked")
    else :
        print("no exist", i)

2 checked
1721 checked
2762 checked
3404 checked
3882 checked


### (3) ratings에 리스트 추가

In [16]:
# ratings 에 데이터 추가하기 전, 형태 및 데이터 확인

ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 836478 entries, 0 to 1000208
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   user_id    836478 non-null  int64
 1   movie_id   836478 non-null  int64
 2   count      836478 non-null  int64
 3   timestamp  836478 non-null  int64
dtypes: int64(4)
memory usage: 31.9 MB


In [17]:
ratings.tail()

Unnamed: 0,user_id,movie_id,count,timestamp
1000203,6040,1090,3,956715518
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648
1000208,6040,1097,4,956715569


In [18]:
unique_user_num

6039

In [19]:
# timestampt 칼럼은 무의미한 칼럼으로 drop

ratings = ratings.drop('timestamp', axis=1)
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 836478 entries, 0 to 1000208
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype
---  ------    --------------   -----
 0   user_id   836478 non-null  int64
 1   movie_id  836478 non-null  int64
 2   count     836478 non-null  int64
dtypes: int64(3)
memory usage: 25.5 MB


In [20]:
# ratings에 값 추가

my_favorites = pd.DataFrame({"user_id":[10000]*5, "movie_id":favorite_movie_id, "count":[5]*5})

if not ratings.isin({"user_id" : [10000]})["user_id"].any():
    ratings = ratings.append(my_favorites, ignore_index=True)
    
ratings.tail(10)

Unnamed: 0,user_id,movie_id,count
836473,6040,1090,3
836474,6040,1094,5
836475,6040,562,5
836476,6040,1096,4
836477,6040,1097,4
836478,10000,2,5
836479,10000,1721,5
836480,10000,2762,5
836481,10000,3404,5
836482,10000,3882,5


# 4. CSR matrix를 직접 만들어 봅시다.

In [30]:
# CSR Matrix 생성
from scipy.sparse import csr_matrix

# 유니크한 유저수와 영화수 재할당
unique_user_num = ratings['user_id'].nunique()
unique_movie_num = ratings['movie_id'].nunique()
print("num of users :", unique_user_num)
print("num of movies :",unique_movie_num)
print(type(unique_user_num))
print(type(unique_movie_num))


num of users : 6040
num of movies : 3628
<class 'int'>
<class 'int'>


In [38]:
ratings.count()

user_id     836483
movie_id    836483
count       836483
dtype: int64

In [43]:
csr_data = csr_matrix((ratings['count'], (ratings.user_id, ratings.movie_id)))
csr_data

<10001x3953 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Row format>

**[ Error ]**

1) <font color='red'> TypeError: len() of unsized object</font> 
> csr_data = csr_matrix((ratings.count, (ratings.user_id, ratings.movie_id)), shape = (unique_user_num, unique_movie_num))  

- cause : 컬럼명으로 사용된 count 거 예약어이기 때문에 원하는 값을 불러올 수가 없음
- solution : ratings['count']

2) <font color='red'>  ValueError: row index exceeds matrix dimensions </font> 
> csr_data = csr_matrix((ratings['count'], (ratings.user_id, ratings.movie_id)), shape = (unique_user_num, unique_movie_num))
csr_data  

- cause : shape 구문에서 규정한 사이즈와 실제 데이터 사이즈가 다름에 따라 발생하는 에러
- solution : shape 이하 구문을 생략해서 본 데이터 사이즈에 맞춰 자동 생성되게 한다.

In [44]:
print(ratings['movie_id'].nunique())
print(ratings['movie_id'].max())
print(movies['movie_id'].nunique())
print(movies['movie_id'].max())

3628
3952
3883
3952


# 5. als_model = AlternatingLeastSquares 모델을 직접 구성하여 훈련시켜 봅시다.

In [48]:
from implicit.als import AlternatingLeastSquares
import os
import numpy as np

# implicit 라이브러리에서 권장하고 있는 부분입니다. 학습 내용과는 무관합니다.
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [49]:
# als 모델의 input 값으로 csr_data matrix transformation
csr_data_transpose = csr_data.T
csr_data_transpose

<3953x10001 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [50]:
# 모델 훈련
als_model.fit(csr_data_transpose)

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




# 6. 내가 선호하는 5가지 영화 중 하나와 그 외의 영화 하나를 골라 훈련된 모델이 예측한 나의 선호도를 파악해 보세요.

# 7. 내가 좋아하는 영화와 비슷한 영화를 추천받아 봅시다.

# 8. 내가 가장 좋아할 만한 영화들을 추천받아 봅시다.