In [1]:
# 추천 시스템을 시작하기 전에
#           피자 치킨 김밥 탕수육 
# 고객1     좋다 좋다  X   좋다
# 고객2     X    X    좋다  X
# 고객3     좋다 좋다  X    ?

### 추천 시스템

In [2]:
import numpy as np

user1 = np.array([2,2,1,2])
user2 = np.array([1,1,2,1])
user3 = np.array([2,2,1,0])

rMatrix = np.vstack( (user1, user2, user3))
print(rMatrix.shape)
rMatrix

(3, 4)


array([[2, 2, 1, 2],
       [1, 1, 2, 1],
       [2, 2, 1, 0]])

### 사이킷 런의 코사인 유사도(cosine_similarity)

In [3]:
from sklearn.metrics.pairwise import cosine_similarity

cosineSim = cosine_similarity(rMatrix)
print(cosineSim.shape)
cosineSim

(3, 3)


array([[1.        , 0.83862787, 0.83205029],
       [0.83862787, 1.        , 0.75592895],
       [0.83205029, 0.75592895, 1.        ]])

In [4]:
# 유사도
#           u1      u2      u3
# u1      u1-u1 | u1-u2 | u1-u3
# u2      u2-u1 | u2-u2 | u2-u3
# u3      u3-u1 | u3-u2 | u3-u3

### 아이템 기반 협업 필터링의 경우

In [5]:
#           user1    user2      user3
# 피자
# 치킨
# 김밥
# 탕수육

In [6]:
rMatrix_t = np.transpose(rMatrix)
print(rMatrix_t)
rMatrix_t

[[2 1 2]
 [2 1 2]
 [1 2 1]
 [2 1 0]]


array([[2, 1, 2],
       [2, 1, 2],
       [1, 2, 1],
       [2, 1, 0]])

In [7]:
cosineSim_t = cosine_similarity(rMatrix_t)
cosineSim_t

array([[1.        , 1.        , 0.81649658, 0.74535599],
       [1.        , 1.        , 0.81649658, 0.74535599],
       [0.81649658, 0.81649658, 1.        , 0.73029674],
       [0.74535599, 0.74535599, 0.73029674, 1.        ]])

In [8]:
# 유사도
#              피자           치킨          김밥          탕수육
# 피자     피자와 피자  | 피자와 치킨 | 피자와 김밥 | 피자와 탕수육
# 치킨     치킨과 피자  | 치킨과 치킨 | 치킨과 김밥 | 치킨과 탕수육
# 김밥     김밥과 피자  | 김밥과 치킨 | 김밥과 김밥 | 김밥과 탕수육
# 탕수육   탕수육과 피자 | 탕수육과 치킨  | 탕수육과 김밥 | 탕수육과 탕수육

# 아이템 기반 협업 필터링 기법을 활용한 영화 추천 시스템 만들기

In [9]:
import pandas as pd

### 데이터 불러오기

In [10]:
columns = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('ml-100k/u.data', sep='\t', names=columns) # sep='\t' : tab을 기준으로 특성 분할
print(df.shape)
df

(100000, 4)


Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   user_id    100000 non-null  int64
 1   item_id    100000 non-null  int64
 2   rating     100000 non-null  int64
 3   timestamp  100000 non-null  int64
dtypes: int64(4)
memory usage: 3.1 MB


In [12]:
df.describe()

Unnamed: 0,user_id,item_id,rating,timestamp
count,100000.0,100000.0,100000.0,100000.0
mean,462.48475,425.53013,3.52986,883528900.0
std,266.61442,330.798356,1.125674,5343856.0
min,1.0,1.0,1.0,874724700.0
25%,254.0,175.0,3.0,879448700.0
50%,447.0,322.0,4.0,882826900.0
75%,682.0,631.0,4.0,888260000.0
max,943.0,1682.0,5.0,893286600.0


In [13]:
df['item_id'].value_counts()

50      583
258     509
100     508
181     507
294     485
       ... 
1543      1
1583      1
1647      1
711       1
1663      1
Name: item_id, Length: 1682, dtype: int64

In [14]:
len(df['item_id'].unique())

1682

In [15]:
df['rating'].value_counts()

4    34174
3    27145
5    21201
2    11370
1     6110
Name: rating, dtype: int64

In [16]:
df['user_id'].value_counts()

405    737
655    685
13     636
450    540
276    518
      ... 
571     20
364     20
812     20
572     20
895     20
Name: user_id, Length: 943, dtype: int64

In [17]:
len(df['user_id'].unique())

943

### 데이터 불러오기2

* u.item
    * 파일명 : u.u_item
    * item_id : 영화 정보
    * movie title : 영화 제목
    * release date : 출시일
    * video release date : 비디오 출시일
    * IMDb URL : IMDb URL 정보
    * unkonwn, ... : 기타 장르 정보

In [18]:
# 장르 분야
columns = ['item_id', 'movie title', 'release date', 'video release date', 'IMDb URL', 
           'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 
           'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 
           'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 
           'Sci-Fi', 'Thriller', 'War', 'Western']

movies = pd.read_csv('ml-100k/u.item', sep='|', names=columns, encoding='latin-1')
print(movies.shape)
movies.head()

(1682, 24)


Unnamed: 0,item_id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Childrens,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [19]:
# 영화 리뷰 정보의 영화의 개수는 1682편
# 영화 정보의 영화의 개수는 1682편
print(len(df['item_id'].unique()))
print(len(movies['item_id'].unique()))
print(len(movies['movie title'].unique()))

1682
1682
1664


In [20]:
movies['movie title'].isnull().sum()

0

In [21]:
# df : 10만개 리뷰 정보
# movies 영화 정보 및 장르 정보
# df - movies_name(item_id, movie title)

# 두개 컬럼 선택
movie_names = movies[['item_id', 'movie title']]

# 두개의 데이터 셋을 병합
c_movies_data = pd.merge(df, movie_names, on='item_id')
print(c_movies_data.shape)
c_movies_data.head()

(100000, 5)


Unnamed: 0,user_id,item_id,rating,timestamp,movie title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


In [22]:
print(len(movies['movie title'].unique()))

1664


### 미션 : 하나의 영화를 선택하고 관련 유사한 영화 10편을 추천해 주는 시스템을 만들어라.

In [23]:
# 사용자 기반 협업 필터링
rating_c = c_movies_data.pivot_table(values='rating', index='user_id', columns='movie title', fill_value=0)

rating_c

movie title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,2,5,0,0,3,4,0,0,...,0,0,0,5,3,0,0,0,4,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,2,0,0,0,0,4,0,0,...,0,0,0,4,0,0,0,0,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
940,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
941,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
942,0,0,0,0,0,0,0,3,0,3,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# 아이템 기반 협업 필터링
X= rating_c.T # 행렬 바꾸기 판다스 내부 함수
print(X.shape)

(1664, 943)


## 차원 축소

In [25]:
# 메모리, 시간을 줄이기 위해, 특성(차원) 축소를 한다.(중요한 특성만 추린다.)
# 차원 축소의 대표적인 방법 : SVD(scikit learn)

In [26]:
from sklearn.decomposition import TruncatedSVD

In [27]:
SVD = TruncatedSVD(n_components=12, random_state=5)
r_matrix = SVD.fit_transform(X) # 특성 개수 943 -> 12 축소
print(r_matrix.shape, type(r_matrix))

(1664, 12) <class 'numpy.ndarray'>


### Correlation Pearson

In [28]:
corr_mat = np.corrcoef(r_matrix) # 피어슨 상관계수를 이용해 상관 행렬 생성
print(corr_mat.shape)
corr_mat

(1664, 1664)


array([[ 1.        , -0.11573577,  0.51362284, ...,  0.38310045,
         0.20193733,  0.5065142 ],
       [-0.11573577,  1.        ,  0.05820808, ...,  0.15805829,
         0.51795357,  0.27104818],
       [ 0.51362284,  0.05820808,  1.        , ...,  0.76575655,
         0.43824619,  0.19507139],
       ...,
       [ 0.38310045,  0.15805829,  0.76575655, ...,  1.        ,
         0.18043708,  0.12115972],
       [ 0.20193733,  0.51795357,  0.43824619, ...,  0.18043708,
         1.        ,  0.20126072],
       [ 0.5065142 ,  0.27104818,  0.19507139, ...,  0.12115972,
         0.20126072,  1.        ]])

### 유사 영화를 찾기

* Star Wars(1977)을 좋아합니다.

In [29]:
rating_c.columns.get_loc('Star Wars (1977)')

1398

In [30]:
col_idx = rating_c.columns.get_loc('Star Wars (1977)')
corr_spec = corr_mat[col_idx]
result = pd.DataFrame( {'corr_spec':corr_spec, 'Movies':rating_c.columns})
result

Unnamed: 0,corr_spec,Movies
0,0.357238,'Til There Was You (1997)
1,0.421507,1-900 (1994)
2,0.593815,101 Dalmatians (1996)
3,0.722361,12 Angry Men (1957)
4,0.325221,187 (1997)
...,...,...
1659,0.669308,Young Guns II (1990)
1660,0.492406,"Young Poisoner's Handbook, The (1995)"
1661,0.331338,Zeus and Roxanne (1997)
1662,0.639006,unknown


In [31]:
result.sort_values('corr_spec', ascending=False).head(10)

Unnamed: 0,corr_spec,Movies
1398,1.0,Star Wars (1977)
1234,0.988052,Return of the Jedi (1983)
1460,0.942655,Terminator 2: Judgment Day (1991)
1523,0.933978,Toy Story (1995)
1461,0.931701,"Terminator, The (1984)"
1205,0.925185,Raiders of the Lost Ark (1981)
456,0.923562,"Empire Strikes Back, The (1980)"
570,0.915965,"Fugitive, The (1993)"
414,0.914299,Die Hard (1988)
44,0.892894,Aliens (1986)


### 5-2 (101 Dalmatians (1996) )를 좋아하는 사람들이 좋아할만한 영화 15편을 추천해 보기.

In [32]:
col_idx = rating_c.columns.get_loc("101 Dalmatians (1996)")
corr_spec = corr_mat[col_idx]
result = pd.DataFrame({'corr_spec':corr_spec, 'Movies':rating_c.columns})
result

Unnamed: 0,corr_spec,Movies
0,0.513623,'Til There Was You (1997)
1,0.058208,1-900 (1994)
2,1.000000,101 Dalmatians (1996)
3,0.373757,12 Angry Men (1957)
4,0.278365,187 (1997)
...,...,...
1659,0.563967,Young Guns II (1990)
1660,0.285193,"Young Poisoner's Handbook, The (1995)"
1661,0.765757,Zeus and Roxanne (1997)
1662,0.438246,unknown


In [33]:
result.sort_values('corr_spec', ascending=False).head(16)

Unnamed: 0,corr_spec,Movies
2,1.0,101 Dalmatians (1996)
693,0.944203,Homeward Bound II: Lost in San Francisco (1996)
713,0.93253,"Hunchback of Notre Dame, The (1996)"
659,0.92215,Harriet the Spy (1996)
46,0.910804,All Dogs Go to Heaven 2 (1996)
805,0.903955,Kazaam (1996)
23,0.899279,"Adventures of Pinocchio, The (1996)"
435,0.899266,Dragonheart (1996)
764,0.890192,Jack (1996)
505,0.881306,Father of the Bride Part II (1995)


In [34]:
# 코사인 유사도(아이템 기반)
X= rating_c.T
cosineSim = cosine_similarity(X)
print(cosineSim.shape)

(1664, 1664)


In [35]:
col_idx = rating_c.columns.get_loc("101 Dalmatians (1996)")
corr_spec = cosineSim[col_idx]
print(corr_spec.shape)

(1664,)


In [36]:
len(rating_c.columns)

1664

In [37]:
col_idx = rating_c.columns.get_loc("101 Dalmatians (1996)")
corr_spec = cosineSim[col_idx]
result = pd.DataFrame({'corr_spec':corr_spec, 'Movies':rating_c.columns})
result.sort_values('corr_spec', ascending=False).head(11)

Unnamed: 0,corr_spec,Movies
2,1.0,101 Dalmatians (1996)
764,0.447883,Jack (1996)
1547,0.441882,Twister (1996)
1633,0.423295,Willy Wonka and the Chocolate Factory (1971)
744,0.420642,Independence Day (ID4) (1996)
1523,0.409386,Toy Story (1995)
505,0.403524,Father of the Bride Part II (1995)
713,0.403382,"Hunchback of Notre Dame, The (1996)"
867,0.403309,"Lion King, The (1994)"
1013,0.399968,Mrs. Doubtfire (1993)


### 6-2 실습 - MovieLens 10M Dataset 데이터 셋을 이용해서 추천 시스템 프로그램을 만들어보자.

In [38]:
columns = ['user_id', 'item_id', 'rating', 'timestamp']

df = pd.read_csv('ml-10M100K/ratings.dat', sep='::', names=columns, encoding='latin-1')
df.head()

  df = pd.read_csv('ml-10M100K/ratings.dat', sep='::', names=columns, encoding='latin-1')


Unnamed: 0,user_id,item_id,rating,timestamp
0,1,122,5.0,838985046
1,1,185,5.0,838983525
2,1,231,5.0,838983392
3,1,292,5.0,838983421
4,1,316,5.0,838983392


In [39]:
# 영화 데이터 불러오기
columns = ['item_id', 'movie title', 'genres']
movies = pd.read_csv('ml-10M100K/movies.dat', sep='::', names=columns, encoding='latin-1')
movies.head()

  movies = pd.read_csv('ml-10M100K/movies.dat', sep='::', names=columns, encoding='latin-1')


Unnamed: 0,item_id,movie title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000054 entries, 0 to 10000053
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   user_id    int64  
 1   item_id    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 305.2 MB


In [41]:
len(df['item_id'].unique())

10677

In [42]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10681 entries, 0 to 10680
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   item_id      10681 non-null  int64 
 1   movie title  10681 non-null  object
 2   genres       10681 non-null  object
dtypes: int64(1), object(2)
memory usage: 250.5+ KB


In [43]:
len(movies['item_id'].unique())

10681

In [49]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    return df

In [50]:
df = reduce_mem_usage(df, verbose=True)

In [51]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000054 entries, 0 to 10000053
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   user_id    int32  
 1   item_id    int32  
 2   rating     float16
 3   timestamp  int32  
dtypes: float16(1), int32(3)
memory usage: 133.5 MB


In [52]:
# df와 movies 데이터 셋 합치기
movie_names = movies[['item_id', 'movie title']]
c_movies_data = pd.merge(df, movie_names, on='item_id')
c_movies_data.shape

(10000054, 5)

In [53]:
c_movies_data.head()

Unnamed: 0,user_id,item_id,rating,timestamp,movie title
0,1,122,5.0,838985046,Boomerang (1992)
1,139,122,3.0,974302621,Boomerang (1992)
2,149,122,2.5,1112342322,Boomerang (1992)
3,182,122,3.0,943458784,Boomerang (1992)
4,215,122,4.5,1102493547,Boomerang (1992)


In [54]:
c_movies_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000054 entries, 0 to 10000053
Data columns (total 5 columns):
 #   Column       Dtype  
---  ------       -----  
 0   user_id      int32  
 1   item_id      int32  
 2   rating       float16
 3   timestamp    int32  
 4   movie title  object 
dtypes: float16(1), int32(3), object(1)
memory usage: 286.1+ MB


In [None]:
### 에러 발생
* 메모리 부족 -> 해결
* No matching signature found 에러 발생 -> 검색 결과, dtype 변경시 에러 발생한다함. 

In [55]:
rating_c = c_movies_data.pivot_table(values='rating', index='user_id', columns='movie title', fill_value=0)
rating_c

TypeError: No matching signature found