# 유저 기반 영화추천 (라이브러리 사용)

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.sparse import csr_matrix
import helper



# 데이터 받아오기

In [3]:
# Load the movie catalogue from movies.csv in the working directory.
movies = pd.read_csv('movies.csv')
# Preview the first rows to check the columns that were read.
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Load the per-user ratings from ratings.csv in the working directory.
ratings = pd.read_csv('ratings.csv')
# Preview the first rows to check the columns that were read.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


# movie-level Clustering


In [71]:
# Attach the movie title to every rating (joined on movieId), then reshape
# into a users x titles matrix: one row per userId, one column per title,
# cells holding the rating.
ratings_title = ratings.merge(movies[['movieId', 'title']], on='movieId')
user_movie_ratings = ratings_title.pivot_table(
    index='userId',
    columns='title',
    values='rating',
)

# Preview the top-left corner: first 6 users, first 10 titles.
user_movie_ratings.iloc[:6, :10]

title,"""Great Performances"" Cats (1998)",$9.99 (2008),'Hellboy': The Seeds of Creation (2004),'Neath the Arizona Skies (1934),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,,,,,,,,,,
2,,,,,,,,,,
3,,,,,,,,,,
4,,,,,,,,,,
5,,,,,,,,,,
6,,,,,,,,4.0,,


In [72]:
# Display the full users x titles rating matrix.
user_movie_ratings

title,"""Great Performances"" Cats (1998)",$9.99 (2008),'Hellboy': The Seeds of Creation (2004),'Neath the Arizona Skies (1934),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),...,Zulu (1964),Zulu (2013),[REC] (2007),eXistenZ (1999),loudQUIETloud: A Film About the Pixies (2006),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931),İtirazım Var (2014)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,,,,,,,,,,,...,,,,,,,,,,
668,,,,,,,,,,,...,,,,,,,,,,
669,,,,,,,,,,,...,,,,,,,,,,
670,,,,,,,,,,,...,,,,,,,,,,


In [73]:
# Number of movies and users to keep in the dense preview subset.
n_movies, n_users = 30, 18

# helper is a project-local module; presumably this keeps the most-rated
# movies and the users who rated most of them -- confirm against helper.py.
most_rated_movies_users_selection = helper.sort_by_rating_density(
    user_movie_ratings, n_movies, n_users
)

# Display the selected subset.
most_rated_movies_users_selection

title,Forrest Gump (1994),Pulp Fiction (1994),"Shawshank Redemption, The (1994)","Silence of the Lambs, The (1991)",Star Wars: Episode IV - A New Hope (1977),Jurassic Park (1993),"Matrix, The (1999)",Toy Story (1995),Schindler's List (1993),Terminator 2: Judgment Day (1991),...,Dances with Wolves (1990),Fight Club (1999),"Usual Suspects, The (1995)",Seven (a.k.a. Se7en) (1995),"Lion King, The (1994)","Godfather, The (1972)","Lord of the Rings: The Fellowship of the Ring, The (2001)",Apollo 13 (1995),True Lies (1994),Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
29,5.0,5.0,5.0,4.0,4.0,4.0,3.0,4.0,5.0,4.0,...,5.0,4.0,5.0,4.0,3.0,5.0,3.0,5.0,4.0,2.0
508,4.0,5.0,4.0,4.0,5.0,3.0,4.5,3.0,5.0,2.0,...,5.0,4.0,5.0,4.0,3.5,5.0,4.5,3.0,2.0,4.0
14,1.0,5.0,2.0,5.0,5.0,3.0,5.0,2.0,4.0,4.0,...,3.0,5.0,5.0,5.0,4.0,5.0,5.0,3.0,4.0,4.0
72,5.0,5.0,5.0,4.5,4.5,4.0,4.5,5.0,5.0,3.0,...,4.5,5.0,5.0,5.0,5.0,5.0,5.0,3.5,3.0,5.0
653,4.0,5.0,5.0,4.5,5.0,4.5,5.0,5.0,5.0,5.0,...,4.5,5.0,5.0,4.5,5.0,4.5,5.0,5.0,4.0,5.0
22,4.5,4.5,5.0,4.5,4.5,3.5,4.0,3.0,3.5,3.5,...,2.5,3.5,4.0,4.5,,5.0,4.0,3.5,4.0,4.0
460,4.0,4.5,5.0,5.0,4.5,5.0,4.5,3.5,4.0,5.0,...,4.0,5.0,4.0,5.0,1.5,5.0,4.5,3.0,2.5,
267,5.0,5.0,3.5,5.0,5.0,4.5,4.5,5.0,5.0,5.0,...,,4.5,3.5,4.0,5.0,4.5,5.0,4.5,4.0,3.0
561,4.0,5.0,5.0,5.0,5.0,5.0,5.0,4.5,4.0,5.0,...,4.0,4.5,5.0,4.5,4.5,4.0,5.0,4.5,4.0,4.0
354,5.0,4.5,5.0,4.5,5.0,3.5,5.0,3.0,4.0,4.0,...,4.0,5.0,3.5,5.0,4.0,5.0,4.5,3.0,3.5,4.0


In [77]:
# Rebuild the users x titles rating matrix from the merged ratings table.
user_movie_ratings = ratings_title.pivot_table(
    index='userId',
    columns='title',
    values='rating',
)

# Restrict the matrix with the project-local helper; presumably this keeps the
# 1000 most-rated titles -- confirm against helper.py.
most_rated_movies_1k = helper.get_most_rated_movies(user_movie_ratings, 1000)

# Display the reduced matrix.
most_rated_movies_1k

title,Forrest Gump (1994),Pulp Fiction (1994),"Shawshank Redemption, The (1994)","Silence of the Lambs, The (1991)",Star Wars: Episode IV - A New Hope (1977),Jurassic Park (1993),"Matrix, The (1999)",Toy Story (1995),Schindler's List (1993),Terminator 2: Judgment Day (1991),...,Insomnia (2002),What Lies Beneath (2000),Roman Holiday (1953),"Motorcycle Diaries, The (Diarios de motocicleta) (2004)",Sophie's Choice (1982),Dawn of the Dead (2004),Ocean's Thirteen (2007),Seabiscuit (2003),Easy Rider (1969),Lucky Number Slevin (2006)
0,,,,,,,,,,,...,,,,,,,,,,
1,3.0,4.0,,3.0,,4.0,,,4.0,5.0,...,,,,,,,,,,
2,5.0,4.5,5.0,3.0,,,,,3.0,,...,,,,,,,,,,
3,5.0,5.0,,,5.0,5.0,,,,5.0,...,,,,,,,,,,
4,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
666,4.0,5.0,,,,4.0,,,,,...,,,,,,,,,,
667,,5.0,4.0,5.0,,,,,,,...,,,,,,,,,,
668,,,,,5.0,3.0,,,,,...,,,,,,,,,,
669,,,5.0,5.0,,,4.0,4.0,5.0,,...,,,,,,,,,,


In [92]:
# Transpose the reduced matrix: titles become rows, and the row labels of
# most_rated_movies_1k become the columns (so test.keys() yields those labels).
test = most_rated_movies_1k.T

In [94]:
# Inspect the column labels of the transposed frame (the row labels of
# most_rated_movies_1k). A pandas Index is immutable, so item assignment on
# it raises TypeError; this cell only needs to display the labels.
test.keys()

Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
            ...
            661, 662, 663, 664, 665, 666, 667, 668, 669, 670],
           dtype='int64', length=671)

# cluster

In [80]:
from sklearn.cluster import KMeans

# Convert the rating matrix to SciPy CSR format for clustering.
# pd.SparseDataFrame was deprecated in pandas 0.25 and removed in pandas 1.0,
# so the matrix is built directly from the dense values instead: unrated (NaN)
# cells are turned into zeros, which the sparse format does not store.
sparse_ratings = csr_matrix(most_rated_movies_1k.fillna(0).to_numpy())

Use a regular DataFrame whose columns are SparseArrays instead.

See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.

  
Use a Series with sparse values instead.

    >>> series = pd.Series(pd.SparseArray(...))

See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.

  return klass(values, index=self.index, name=items, fastpath=True)


In [86]:
# 10 clusters: fit KMeans on the sparse rating matrix and get one cluster
# label per row.
# NOTE(review): no random_state is set, so labels can differ between runs.
# NOTE(review): newer scikit-learn releases appear to have renamed the 'full'
# algorithm -- confirm against the installed version before upgrading.
predictions_l = KMeans(n_clusters=10, algorithm='full').fit_predict(sparse_ratings)

In [87]:
# Display the array of cluster labels.
predictions_l

array([0, 7, 0, 9, 5, 0, 7, 5, 0, 0, 0, 0, 0, 0, 1, 0, 8, 0, 9, 0, 9, 8,
       1, 0, 0, 5, 0, 0, 0, 1, 5, 7, 0, 9, 0, 7, 0, 5, 7, 0, 0, 5, 0, 0,
       0, 0, 7, 4, 0, 7, 0, 0, 0, 0, 0, 8, 9, 0, 5, 0, 5, 0, 5, 7, 0, 0,
       7, 5, 5, 0, 0, 5, 6, 0, 5, 0, 8, 8, 0, 0, 0, 7, 5, 5, 7, 7, 0, 8,
       5, 0, 5, 7, 8, 5, 8, 0, 5, 0, 9, 0, 5, 1, 0, 5, 8, 0, 0, 7, 0, 7,
       8, 0, 0, 7, 0, 0, 0, 9, 3, 5, 7, 0, 0, 5, 8, 7, 0, 5, 0, 8, 0, 0,
       0, 8, 0, 5, 0, 5, 0, 0, 0, 0, 0, 7, 7, 5, 0, 5, 5, 8, 7, 4, 0, 0,
       0, 0, 4, 0, 5, 0, 7, 0, 5, 5, 8, 0, 0, 7, 5, 0, 0, 0, 0, 0, 5, 5,
       7, 5, 0, 0, 0, 7, 0, 7, 8, 0, 8, 7, 0, 0, 7, 7, 0, 7, 9, 0, 0, 0,
       4, 5, 5, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 6, 8, 9, 0, 5, 5, 0, 5, 8,
       0, 0, 0, 7, 7, 0, 0, 5, 0, 5, 0, 3, 0, 5, 5, 9, 0, 0, 0, 8, 7, 3,
       8, 5, 5, 0, 8, 0, 0, 5, 5, 0, 8, 7, 5, 0, 0, 0, 0, 0, 0, 5, 0, 0,
       9, 7, 0, 8, 0, 5, 5, 0, 5, 0, 5, 0, 0, 0, 7, 0, 0, 5, 5, 0, 8, 0,
       5, 7, 0, 0, 0, 8, 0, 2, 8, 0, 5, 0, 0, 0, 0,

In [88]:
# Print how many cluster labels were produced.
print(len(predictions_l))

671


In [97]:
# Group the row labels of the rating matrix by assigned cluster:
# cluster_list maps cluster number -> list of row labels in that cluster.
# NOTE(review): the displayed index of most_rated_movies_1k starts at 0 while
# userId in ratings starts at 1 -- confirm these labels are real user ids.
cluster_list = {}
row_labels = test.keys()  # looked up once; it does not change inside the loop
for position, cluster_no in enumerate(predictions_l):
    # setdefault creates the member list the first time a cluster number is
    # seen, then the current row label is appended to it.
    cluster_list.setdefault(cluster_no, []).append(row_labels[position])

print(cluster_list)

{0: [0, 2, 5, 8, 9, 10, 11, 12, 13, 15, 17, 19, 23, 24, 26, 27, 28, 32, 34, 36, 39, 40, 42, 43, 44, 45, 48, 50, 51, 52, 53, 54, 57, 59, 61, 64, 65, 69, 70, 73, 75, 78, 79, 80, 86, 89, 95, 97, 99, 102, 105, 106, 108, 111, 112, 114, 115, 116, 121, 122, 126, 128, 130, 131, 132, 134, 136, 138, 139, 140, 141, 142, 146, 152, 153, 154, 155, 157, 159, 161, 165, 166, 169, 170, 171, 172, 173, 178, 179, 180, 182, 185, 188, 189, 192, 195, 196, 197, 201, 202, 203, 205, 206, 207, 208, 209, 210, 214, 217, 220, 221, 222, 225, 226, 228, 230, 232, 236, 237, 238, 245, 247, 248, 251, 255, 256, 257, 258, 259, 260, 262, 263, 266, 268, 271, 273, 275, 276, 277, 279, 280, 283, 285, 288, 289, 290, 292, 295, 297, 298, 299, 300, 303, 304, 307, 309, 313, 314, 317, 318, 319, 320, 321, 322, 324, 325, 326, 328, 329, 330, 331, 333, 334, 335, 336, 339, 340, 342, 346, 347, 348, 350, 355, 356, 358, 359, 360, 363, 364, 365, 367, 371, 373, 375, 376, 380, 382, 385, 390, 391, 392, 394, 396, 397, 398, 400, 402, 403, 405, 409,

In [90]:
# Display the reduced rating matrix again for reference.
most_rated_movies_1k

title,Forrest Gump (1994),Pulp Fiction (1994),"Shawshank Redemption, The (1994)","Silence of the Lambs, The (1991)",Star Wars: Episode IV - A New Hope (1977),Jurassic Park (1993),"Matrix, The (1999)",Toy Story (1995),Schindler's List (1993),Terminator 2: Judgment Day (1991),...,Insomnia (2002),What Lies Beneath (2000),Roman Holiday (1953),"Motorcycle Diaries, The (Diarios de motocicleta) (2004)",Sophie's Choice (1982),Dawn of the Dead (2004),Ocean's Thirteen (2007),Seabiscuit (2003),Easy Rider (1969),Lucky Number Slevin (2006)
0,,,,,,,,,,,...,,,,,,,,,,
1,3.0,4.0,,3.0,,4.0,,,4.0,5.0,...,,,,,,,,,,
2,5.0,4.5,5.0,3.0,,,,,3.0,,...,,,,,,,,,,
3,5.0,5.0,,,5.0,5.0,,,,5.0,...,,,,,,,,,,
4,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
666,4.0,5.0,,,,4.0,,,,,...,,,,,,,,,,
667,,5.0,4.0,5.0,,,,,,,...,,,,,,,,,,
668,,,,,5.0,3.0,,,,,...,,,,,,,,,,
669,,,5.0,5.0,,,4.0,4.0,5.0,,...,,,,,,,,,,


# 실습

## (1) user id 받아오기

In [10]:
# Id of the user to look up similar users for.
user_id = 99

## (2) 비슷한 user id 가져오기

In [11]:
# Look up the entry stored for the chosen user.
# NOTE(review): clustered_user is not defined in any visible cell -- presumably
# a mapping from a user id to the ids sharing its cluster (cluster_list, by
# contrast, maps cluster number -> members); confirm where it is built.
clustered_user[user_id]

[17,
 24,
 43,
 54,
 69,
 79,
 86,
 89,
 111,
 153,
 161,
 172,
 210,
 222,
 232,
 251,
 255,
 283,
 317,
 328,
 342,
 360,
 382,
 389,
 412,
 458,
 483,
 489,
 506,
 529,
 542,
 615,
 635,
 639,
 649]

## 끝!!