In [1]:
import sys
import pickle
from pprint import pprint 
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from data.make_casting_graph import oneway_to_bidirected_graph
from scipy.sparse import csc_matrix
import time
from pagerank import pagerank
from sklearn.preprocessing import normalize
from pyvis.network import Network

In [2]:
# Build the lookup table: movie index -> number of comments.
with open('./data/ratings.csv', encoding='utf-8') as f:
    raw_lines = f.readlines()

# The first line is dropped (treated as a header); every remaining line is
# stripped and split on commas.
docs = []
for raw_line in raw_lines[1:]:
    docs.append(raw_line.strip().split(','))

# Each row is unpacked as (count, movie index); the count is stored as an int.
_idx2numcomments = dict((movie_idx, int(num)) for num, movie_idx in docs)

In [3]:
# Load the pre-computed casting-weight graph from disk.
# NOTE(review): pickle.load can execute arbitrary code while deserialising, so
# this file must only ever come from a trusted source.
# The structure of `graph` is not visible here; it is handed straight to
# oneway_to_bidirected_graph in a later cell.
with open('./data/casting_graph.pkl', 'rb') as f:
    graph = pickle.load(f)

In [4]:
# Build the lookup table: actor index -> actor name.
with open('./data/actors.csv', encoding='utf-8') as f:
    # Skip the header exactly once. The previous version called next(f) and
    # then also sliced readlines()[1:], which discarded the first data row.
    next(f, None)
    # Drop the line terminator before splitting so the last field never
    # carries a trailing newline.
    docs = [line.rstrip('\r\n').split(',') for line in f]
    # Column 0 is the index and column 1 the display name; rows with fewer
    # than two fields (e.g. blank lines) are ignored.
    # NOTE(review): the earlier comment said "English name if exist else
    # Korean name", but only column 1 is ever read - confirm which column
    # holds the preferred name.
    _idx2actor = {doc[0]: doc[1] for doc in docs if len(doc) > 1}

In [5]:
# Build the lookup table: movie index -> movie title.
with open('./data/movies.csv', encoding='utf-8') as f:
    # Skip the header exactly once. The previous version called next(f) and
    # then also sliced readlines()[1:], which discarded the first data row.
    next(f, None)
    # Drop the line terminator before splitting so the last field never
    # carries a trailing newline.
    docs = [line.rstrip('\r\n').split(',') for line in f]
    # Column 0 is the index and column 1 the title. The guard checks each row
    # (`doc`); the old `if len(docs)` tested the whole list and therefore
    # never filtered anything.
    _idx2movie = {doc[0]: doc[1] for doc in docs if len(doc) > 1}

In [6]:
def idx2movie(idx):
    """Return the movie title stored for ``idx``, or 'Unknown' if it is missing."""
    return _idx2movie.get(idx, 'Unknown')


def idx2actor(idx):
    """Return the actor name stored for ``idx``, or 'Unknown' if it is missing."""
    return _idx2actor.get(idx, 'Unknown')


def idx2numcomments(idx):
    """Return the comment count stored for ``idx``, or 0 if it is missing."""
    return _idx2numcomments.get(idx, 0)

In [7]:
# Convert the loaded casting graph with the project helper
# oneway_to_bidirected_graph. Later cells use `g` as a dict of dicts:
# g[node] is a mapping {neighbour: weight}, and movie nodes are keyed by
# strings of the form 'movie <idx>'.
g = oneway_to_bidirected_graph(graph)

# [실습1] 리뷰가 많은 영화 순위

In [8]:
# Order movies by comment count, largest first, and show the ten leaders
# as "<title> <count>".
by_comment_count = sorted(
    _idx2numcomments.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
for movie_idx, n_comments in by_comment_count[:10]:
    print(idx2movie(movie_idx), n_comments)

기생충 40
극한직업 15
마약왕 15
인터스텔라 14
어벤져스: 엔드게임 12
걸캅스 12
마녀 12
택시운전사 11
배심원들 11
신과함께-죄와 벌 11


# [실습2] Dict를 이용한 PageRank

In [9]:
# Raw bias per node: the comment count for nodes whose key starts with 'm',
# zero for every other node.
bias = dict(
    (node, idx2numcomments(node.split()[1]) if node[0] == 'm' else 0)
    for node in g
)
# Rescale so that the bias values add up to 1.
_sum = sum(bias.values())
bias = dict((node, value / _sum) for node, value in bias.items())
# Run the dictionary-based PageRank from the local `pagerank` module.
rank = pagerank(
    g,
    bias=bias,
    df=0.15,
    max_iter=30,
    converge_error=0.001,
    verbose=1,
)

Iteration = 1, diff = 0.6745935594038653, sum = 1.000000000000003
Iteration = 2, diff = 0.5133755765513065, sum = 1.0000000000000062
Iteration = 3, diff = 0.4070843471025293, sum = 1.0000000000000078
Iteration = 4, diff = 0.32881145690448776, sum = 1.0000000000000004
Iteration = 5, diff = 0.2690000626169725, sum = 1.0000000000000069
Iteration = 6, diff = 0.22172923044566567, sum = 0.999999999999992
Iteration = 7, diff = 0.18372765496993113, sum = 0.9999999999999958
Iteration = 8, diff = 0.15290648077655553, sum = 1.0000000000000047
Iteration = 9, diff = 0.12756391624362115, sum = 0.9999999999999909
Iteration = 10, diff = 0.1067656357170639, sum = 0.9999999999999944
Iteration = 11, diff = 0.08947335545631445, sum = 1.0000000000000075
Iteration = 12, diff = 0.07517014319662872, sum = 1.00000000000001
Iteration = 13, diff = 0.06318528811144808, sum = 0.9999999999999933
Iteration = 14, diff = 0.05320609097840679, sum = 0.9999999999999913
Iteration = 15, diff = 0.04483047792706717, sum = 1.

# [실습3] Numpy를 이용한 PageRank

In [10]:
# Fix a node ordering so every node can be addressed by an integer position.
nodes = set(g.keys())
idx2node = sorted(nodes)
node2idx = {node: idx for idx, node in enumerate(idx2node)}
n_nodes = len(idx2node)

# Re-express the bias dict as a dense vector aligned with idx2node.
# NOTE: this rebinds `bias` from a dict to an ndarray (the power-iteration
# cell relies on that), so this cell cannot be re-run without first
# re-creating the bias dict.
bias = np.asarray([bias[node] for node in idx2node])
print(bias.shape)

# Collect (row, col, value) triplets: one entry per edge stored in g.
rows = []
cols = []
data = []

for from_node, to_dict in g.items():
    from_idx = node2idx[from_node]
    for to_node, weight in to_dict.items():
        rows.append(from_idx)
        cols.append(node2idx[to_node])
        data.append(weight)

# Pass the shape explicitly. Without it the matrix shape is inferred from the
# largest index that occurs, which would be smaller than the number of nodes
# if the highest-numbered nodes had no edges, and would then no longer match
# the bias vector.
A = csc_matrix((data, (rows, cols)), shape=(n_nodes, n_nodes))
print(A.shape)

(6154,)
(6154, 6154)


In [11]:
# Power iteration on the sparse matrix A with a fixed number of rounds.
max_iter = 30
df = 0.85

# Start from a uniform vector: every node gets 1 / (number of rows of A).
ir = 1 / A.shape[0]
rank = np.full(A.shape[0], ir)

for n_iter in range(1, max_iter + 1):
    # Propagate the current scores through A, then L1-normalise the result.
    propagated = normalize(A.dot(rank).reshape(1, -1), norm='l1').reshape(-1)
    # Blend the propagated scores with the bias vector.
    rank_new = df * propagated + (1 - df) * bias
    # Total absolute change between two consecutive iterations.
    diff = np.abs(rank - rank_new).sum()
    rank = rank_new
    print(f'iter {n_iter} : diff = {diff}')

iter 1 : diff = 0.1685245368865779
iter 2 : diff = 0.123534416788289
iter 3 : diff = 0.11717242074154521
iter 4 : diff = 0.08676250638774644
iter 5 : diff = 0.08106650827175174
iter 6 : diff = 0.06044614044638538
iter 7 : diff = 0.05589952786903922
iter 8 : diff = 0.04188475454126574
iter 9 : diff = 0.038452782327255894
iter 10 : diff = 0.0289095171904886
iter 11 : diff = 0.026405522194198443
iter 12 : diff = 0.01994486388644759
iter 13 : diff = 0.01811137289916391
iter 14 : diff = 0.013753287448751986
iter 15 : diff = 0.012408911428306675
iter 16 : diff = 0.009469243738374537
iter 17 : diff = 0.008494000468005527
iter 18 : diff = 0.006511648928942716
iter 19 : diff = 0.005809774127703195
iter 20 : diff = 0.004473307017566352
iter 21 : diff = 0.0039712967053357525
iter 22 : diff = 0.0030704578506105173
iter 23 : diff = 0.0027152845982687866
iter 24 : diff = 0.002106149459828414
iter 25 : diff = 0.0018577039374234091
iter 26 : diff = 0.0014438021951808503
iter 27 : diff = 0.001270456142

# [과제 1] 성능 비교
## Dict를 이용한 경우 

In [12]:
# Rebuild the bias dict: comment count for nodes whose key starts with 'm',
# zero otherwise, rescaled so the values add up to 1.
bias = {node:(idx2numcomments(node.split()[1]) if node[0] == 'm' else 0) for node in g}
_sum = sum(bias.values())
bias = {node:b / _sum for node, b in bias.items()}

# Measure elapsed time with time.perf_counter(): it is the clock intended for
# timing short intervals, whereas time.time() follows the system wall clock,
# which can be adjusted while the cell runs.
# The timed region includes the progress output printed because verbose = 1.
Start = time.perf_counter()
rank_Dict = pagerank(g, bias = bias, df = 0.15, max_iter = 30, converge_error = 0.001, verbose = 1)
Finish = time.perf_counter()
print("Dict를 이용한 PageRank 걸린 시간 : ",round(Finish-Start,3))

Iteration = 1, diff = 0.6745935594038653, sum = 1.000000000000003
Iteration = 2, diff = 0.5133755765513065, sum = 1.0000000000000062
Iteration = 3, diff = 0.4070843471025293, sum = 1.0000000000000078
Iteration = 4, diff = 0.32881145690448776, sum = 1.0000000000000004
Iteration = 5, diff = 0.2690000626169725, sum = 1.0000000000000069
Iteration = 6, diff = 0.22172923044566567, sum = 0.999999999999992
Iteration = 7, diff = 0.18372765496993113, sum = 0.9999999999999958
Iteration = 8, diff = 0.15290648077655553, sum = 1.0000000000000047
Iteration = 9, diff = 0.12756391624362115, sum = 0.9999999999999909
Iteration = 10, diff = 0.1067656357170639, sum = 0.9999999999999944
Iteration = 11, diff = 0.08947335545631445, sum = 1.0000000000000075
Iteration = 12, diff = 0.07517014319662872, sum = 1.00000000000001
Iteration = 13, diff = 0.06318528811144808, sum = 0.9999999999999933
Iteration = 14, diff = 0.05320609097840679, sum = 0.9999999999999913
Iteration = 15, diff = 0.04483047792706717, sum = 1.

## Numpy를 이용한 경우 

In [13]:
# Fix a node ordering so every node can be addressed by an integer position.
nodes = set(g.keys())
idx2node = sorted(nodes)
node2idx = {node: idx for idx, node in enumerate(idx2node)}
n_nodes = len(idx2node)

# Re-express the bias dict as a dense vector aligned with idx2node.
# NOTE: this rebinds `bias` from a dict to an ndarray (the power-iteration
# cell relies on that), so this cell cannot be re-run without first
# re-creating the bias dict.
bias = np.asarray([bias[node] for node in idx2node])
print(bias.shape)

# Collect (row, col, value) triplets: one entry per edge stored in g.
rows = []
cols = []
data = []

for from_node, to_dict in g.items():
    from_idx = node2idx[from_node]
    for to_node, weight in to_dict.items():
        rows.append(from_idx)
        cols.append(node2idx[to_node])
        data.append(weight)

# Pass the shape explicitly. Without it the matrix shape is inferred from the
# largest index that occurs, which would be smaller than the number of nodes
# if the highest-numbered nodes had no edges, and would then no longer match
# the bias vector.
A = csc_matrix((data, (rows, cols)), shape=(n_nodes, n_nodes))
print(A.shape)

(6154,)
(6154, 6154)


In [14]:
# Power iteration on the sparse matrix A with a fixed number of rounds.
max_iter = 30
df = 0.85

# Measure elapsed time with time.perf_counter(): it is the clock intended for
# timing short intervals, whereas time.time() follows the system wall clock,
# which can be adjusted while the cell runs.
# The timed region includes the per-iteration print calls below.
Start = time.perf_counter()
# Start from a uniform vector: every node gets 1 / (number of rows of A).
ir = 1/A.shape[0]
rank_Numpy = np.asarray([ir] * A.shape[0])

for n_iter in range(1, max_iter + 1):
    # Propagate the current scores through A, then L1-normalise the result.
    rank_new = A.dot(rank_Numpy)
    rank_new = normalize(rank_new.reshape(1,-1), norm = 'l1').reshape(-1)
    # Blend the propagated scores with the bias vector.
    rank_new = df * rank_new + (1-df) * bias
    # Total absolute change between two consecutive iterations.
    diff = abs(rank_Numpy-rank_new).sum()
    rank_Numpy = rank_new
    print('iter {} : diff = {}'.format(n_iter, diff))
Finish = time.perf_counter()
print("Numpy를 이용한 PageRank 걸린 시간 : ",round(Finish-Start,3))

iter 1 : diff = 0.1685245368865779
iter 2 : diff = 0.123534416788289
iter 3 : diff = 0.11717242074154521
iter 4 : diff = 0.08676250638774644
iter 5 : diff = 0.08106650827175174
iter 6 : diff = 0.06044614044638538
iter 7 : diff = 0.05589952786903922
iter 8 : diff = 0.04188475454126574
iter 9 : diff = 0.038452782327255894
iter 10 : diff = 0.0289095171904886
iter 11 : diff = 0.026405522194198443
iter 12 : diff = 0.01994486388644759
iter 13 : diff = 0.01811137289916391
iter 14 : diff = 0.013753287448751986
iter 15 : diff = 0.012408911428306675
iter 16 : diff = 0.009469243738374537
iter 17 : diff = 0.008494000468005527
iter 18 : diff = 0.006511648928942716
iter 19 : diff = 0.005809774127703195
iter 20 : diff = 0.004473307017566352
iter 21 : diff = 0.0039712967053357525
iter 22 : diff = 0.0030704578506105173
iter 23 : diff = 0.0027152845982687866
iter 24 : diff = 0.002106149459828414
iter 25 : diff = 0.0018577039374234091
iter 26 : diff = 0.0014438021951808503
iter 27 : diff = 0.001270456142

# [과제 2] 영화 Top 10
## Dict를 이용한 방식 

In [15]:
# Walk the Dict-based ranking from the highest score downwards and print the
# first ten movie nodes as "<idx> <title> <score>".
ranked_nodes = sorted(rank_Dict.items(), key=lambda pair: pair[1], reverse=True)

num = 0
for node, score in ranked_nodes:
    if num == 10:
        break
    parts = node.split()
    # Only nodes whose key starts with the token 'movie' are reported.
    if parts[0] != 'movie':
        continue
    print(parts[1], idx2movie(parts[1]), score)
    num += 1

161967 기생충 0.0032033878121671224
167651 극한직업 0.0014303471787626468
175322 마녀 0.0011565783119412997
156464 보헤미안 랩소디 0.0011527961465662747
130966 부산행 0.001098819013448319
177483 배심원들 0.0009469824923736168
174065 걸캅스 0.0009354687095915042
37886 클레멘타인 0.000918249213245038
154449 리틀 포레스트 0.00091821747845663
163788 알라딘 0.0007997936563664337


## Numpy를 이용한 방식 

In [16]:
# Pair every node key with its score from the NumPy ranking (position i of
# rank_Numpy belongs to idx2node[i]).
num_RankData = dict(zip(idx2node, rank_Numpy))

# Walk the ranking from the highest score downwards and print the first ten
# movie nodes as "<idx> <title> <score>".
ranked_nodes = sorted(num_RankData.items(), key=lambda pair: -pair[1])

num = 0
for node, score in ranked_nodes:
    if num == 10:
        break
    parts = node.split()
    # Only nodes whose key starts with the token 'movie' are reported.
    if parts[0] != 'movie':
        continue
    print(parts[1], idx2movie(parts[1]), score)
    num += 1

161967 기생충 0.0015437432925532173
156464 보헤미안 랩소디 0.0010864984266341052
175322 마녀 0.0008946794759721638
174065 걸캅스 0.0008564445054703045
167651 극한직업 0.0007648489380972874
37886 클레멘타인 0.000728929546919159
157297 마약왕 0.0007133104346250872
71509 아저씨 0.0006938076365826392
136900 어벤져스: 엔드게임 0.0006567566198412949
163788 알라딘 0.000638759850450271


# [과제 3] 영화 Top 10 노드 시각화
## Dict를 이용한 방식 

In [17]:
# Draw the ten best-ranked movies (Dict-based PageRank), their neighbours in g
# (looked up as actors), and those neighbours' own neighbours (looked up as movies).
G_Dict = Network(1000, 1000, notebook = True)
num = 0

for movie_key, _score in sorted(rank_Dict.items(), key = lambda x: x[1], reverse=True):
    if num == 10:
        break
    # Only nodes whose key starts with the token 'movie' are drawn as roots.
    if movie_key.split()[0] != 'movie':
        continue
    # Use the graph key as the node id and the human-readable name only as the
    # label. With names as ids, every entry that resolves to 'Unknown' - and
    # any two movies or actors sharing a name - would collapse into one node.
    G_Dict.add_node(movie_key, label=idx2movie(movie_key.split()[1]))
    for actor_key in g[movie_key]:
        G_Dict.add_node(actor_key, label=idx2actor(actor_key.split()[1]))
        G_Dict.add_edge(movie_key, actor_key)
        # Also add every neighbour reachable from this actor node.
        for other_movie_key in g[actor_key]:
            G_Dict.add_node(other_movie_key, label=idx2movie(other_movie_key.split()[1]))
            G_Dict.add_edge(actor_key, other_movie_key)
    num += 1

G_Dict.show('Dict.html')

## Numpy를 이용한 방식 

In [18]:
# Draw the ten best-ranked movies (NumPy-based PageRank), their neighbours in g
# (looked up as actors), and those neighbours' own neighbours (looked up as movies).
G_num = Network(1000, 1000, notebook = True)
num = 0

for movie_key, _score in sorted(num_RankData.items(), key=lambda x:-x[1]):
    if num == 10:
        break
    # Only nodes whose key starts with the token 'movie' are drawn as roots.
    if movie_key.split()[0] != 'movie':
        continue
    # Use the graph key as the node id and the human-readable name only as the
    # label. With names as ids, every entry that resolves to 'Unknown' - and
    # any two movies or actors sharing a name - would collapse into one node.
    G_num.add_node(movie_key, label=idx2movie(movie_key.split()[1]))
    for actor_key in g[movie_key]:
        G_num.add_node(actor_key, label=idx2actor(actor_key.split()[1]))
        G_num.add_edge(movie_key, actor_key)
        # Also add every neighbour reachable from this actor node.
        for other_movie_key in g[actor_key]:
            G_num.add_node(other_movie_key, label=idx2movie(other_movie_key.split()[1]))
            G_num.add_edge(actor_key, other_movie_key)
    num += 1

G_num.show('numpy.html')