# 유저 기반 영화추천 (라이브러리 사용 X)

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.sparse import csr_matrix
import helper

In [2]:
# Import the Movies dataset
movies = pd.read_csv('movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
# Import the ratings dataset
ratings = pd.read_csv('ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [4]:
# Merge the two tables then pivot so we have Users X Movies dataframe
ratings_title = pd.merge(ratings, movies[['movieId', 'title']], on='movieId' )
user_movie_ratings = pd.pivot_table(ratings_title, index='userId', columns= 'title', values='rating')

# print('dataset dimensions: ', user_movie_ratings.shape, '\n\nSubset example:')
# user_movie_ratings.iloc[:6, :10]

In [None]:
n_movies = 30
n_users = 18
most_rated_movies_users_selection = helper.sort_by_rating_density(user_movie_ratings, n_movies, n_users)

most_rated_movies_users_selection

In [5]:
user_movie_ratings =  pd.pivot_table(ratings_title, index='userId', columns= 'title', values='rating')
most_rated_movies_1k = helper.get_most_rated_movies(user_movie_ratings, 1000)
most_rated_movies_1k

title,Forrest Gump (1994),Pulp Fiction (1994),"Shawshank Redemption, The (1994)","Silence of the Lambs, The (1991)",Star Wars: Episode IV - A New Hope (1977),Jurassic Park (1993),"Matrix, The (1999)",Toy Story (1995),Schindler's List (1993),Terminator 2: Judgment Day (1991),...,Insomnia (2002),What Lies Beneath (2000),Roman Holiday (1953),"Motorcycle Diaries, The (Diarios de motocicleta) (2004)",Sophie's Choice (1982),Dawn of the Dead (2004),Ocean's Thirteen (2007),Seabiscuit (2003),Easy Rider (1969),Lucky Number Slevin (2006)
0,,,,,,,,,,,...,,,,,,,,,,
1,3.0,4.0,,3.0,,4.0,,,4.0,5.0,...,,,,,,,,,,
2,5.0,4.5,5.0,3.0,,,,,3.0,,...,,,,,,,,,,
3,5.0,5.0,,,5.0,5.0,,,,5.0,...,,,,,,,,,,
4,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
666,4.0,5.0,,,,4.0,,,,,...,,,,,,,,,,
667,,5.0,4.0,5.0,,,,,,,...,,,,,,,,,,
668,,,,,5.0,3.0,,,,,...,,,,,,,,,,
669,,,5.0,5.0,,,4.0,4.0,5.0,,...,,,,,,,,,,


In [6]:
class kmeans:
    def __init__(self, k , input):
        self.k = k
        self.df = input
        self.C =None

    # K개의 중심값을 임의로 선택한다

    def centroids(self):
        import random
        C = {
        i+1:[data for data in self.df.values[i]]
        for i, j in zip(range(self.k), random.sample(range(len(self.df)),self.k))}
        return C

    # 각 중심에서 데이터까지의 거리를 계산 using up.linalg.norm
    # 각 데이터에 가장 가까운 중심점(군집)을 할당
        
    def classify(self,C):
        import copy
        cluster_df = copy.deepcopy(self.df)
        col_n = cluster_df.shape[1]
        for i in C.keys():
            cluster_df["Distance_from_{}".format(i)]\
            =np.linalg.norm(np.array(cluster_df)[:,:col_n]-C[i], axis=1)

        dist_cols=["Distance_from_{}".format(i)  for i in C.keys()]
        cluster_df["Closet_Cluster"] = cluster_df.loc[:,dist_cols].idxmin(axis=1).map(lambda x:int(x.lstrip("Distance_from")))
        return cluster_df

    #각 중심점에 선택된 데이터 포인터들의 평균위치로 중심점을 재이동
    def update(self, C):
        c_df = self.classify(C)
        self.C ={
        i:[c for c in np.mean(self.df[c_df["Closet_Cluster"]==i], axis=0)]
        for i in c_df["Closet_Cluster"].unique()}
        return self.C

    # 위 과정을 '갱신된 중심점이 거의 변화가 없어 할당된 군집이 바뀌지 않을만큼 반복

    def train_cluster(self):
        assignments = None
        C = self.centroids()
        while True:
            # 중심점에 해당하는 군집 찾기
            cluster_df = self.classify(C)
            new_assignments = list(self.classify(C)["Closet_Cluster"])
            # 새로운 중심점 찾기
            new_C = self.update(C)
            # 할당된 군집이 바뀌지 않을만큼 중심점이 수렴했다면 종료
            if assignments == new_assignments:
                break
            # 아니라면 다시 중심점과 군집 찾기
            assignments = new_assignments
            C = new_C

        return new_C, list(new_assignments), cluster_df

In [13]:
predictions = kmeans(10, most_rated_movies_1k.fillna(0))

In [14]:
cluster = predictions.train_cluster()

In [15]:
predictions_1 = np.array(cluster[2]['Closet_Cluster'])

In [16]:
clustered_user = {}
for idx in range(len(predictions_1)):
    c_user = []
    for i in range(len(predictions_1)):
        if idx == i:
            continue
        if predictions_1[idx] == predictions_1[i]:
            c_user.append(i)
    clustered_user[idx] = c_user

# print("clustered_user : ", clustered_user.keys())
print("clustered_movie : ", clustered_user)


clustered_movie :  {0: [2, 5, 8, 9, 10, 11, 12, 13, 15, 17, 19, 23, 24, 26, 27, 28, 30, 32, 34, 36, 39, 42, 43, 44, 45, 48, 50, 51, 52, 53, 54, 57, 58, 59, 61, 64, 65, 69, 70, 73, 75, 78, 79, 80, 86, 89, 95, 97, 99, 102, 103, 105, 106, 108, 111, 112, 114, 115, 116, 121, 122, 126, 128, 130, 131, 132, 134, 135, 137, 138, 139, 140, 141, 142, 146, 152, 153, 155, 157, 159, 161, 162, 163, 165, 166, 169, 170, 171, 172, 173, 178, 179, 180, 182, 185, 188, 189, 192, 195, 197, 201, 202, 203, 205, 206, 207, 208, 209, 210, 214, 217, 220, 221, 222, 225, 226, 228, 230, 232, 236, 237, 245, 248, 251, 254, 255, 256, 257, 258, 259, 260, 262, 263, 266, 268, 271, 273, 275, 276, 277, 279, 280, 283, 285, 288, 289, 292, 295, 296, 297, 298, 299, 300, 303, 304, 306, 307, 309, 313, 314, 317, 318, 319, 320, 321, 322, 324, 325, 326, 328, 329, 330, 331, 332, 333, 334, 335, 336, 338, 339, 340, 342, 346, 347, 348, 350, 351, 355, 356, 358, 359, 360, 363, 364, 365, 367, 371, 373, 375, 376, 377, 380, 382, 385, 390, 391,

# 실습


## (1) user id 받아오기

In [17]:
user_id = 99

## (2) 비슷한 user id 가져오기

In [18]:
clustered_user[user_id]

[0,
 2,
 5,
 8,
 9,
 10,
 11,
 12,
 13,
 15,
 17,
 19,
 23,
 24,
 26,
 27,
 28,
 30,
 32,
 34,
 36,
 39,
 42,
 43,
 44,
 45,
 48,
 50,
 51,
 52,
 53,
 54,
 57,
 58,
 59,
 61,
 64,
 65,
 69,
 70,
 73,
 75,
 78,
 79,
 80,
 86,
 89,
 95,
 97,
 102,
 103,
 105,
 106,
 108,
 111,
 112,
 114,
 115,
 116,
 121,
 122,
 126,
 128,
 130,
 131,
 132,
 134,
 135,
 137,
 138,
 139,
 140,
 141,
 142,
 146,
 152,
 153,
 155,
 157,
 159,
 161,
 162,
 163,
 165,
 166,
 169,
 170,
 171,
 172,
 173,
 178,
 179,
 180,
 182,
 185,
 188,
 189,
 192,
 195,
 197,
 201,
 202,
 203,
 205,
 206,
 207,
 208,
 209,
 210,
 214,
 217,
 220,
 221,
 222,
 225,
 226,
 228,
 230,
 232,
 236,
 237,
 245,
 248,
 251,
 254,
 255,
 256,
 257,
 258,
 259,
 260,
 262,
 263,
 266,
 268,
 271,
 273,
 275,
 276,
 277,
 279,
 280,
 283,
 285,
 288,
 289,
 292,
 295,
 296,
 297,
 298,
 299,
 300,
 303,
 304,
 306,
 307,
 309,
 313,
 314,
 317,
 318,
 319,
 320,
 321,
 322,
 324,
 325,
 326,
 328,
 329,
 330,
 331,
 332,
 333,
 334