In [1]:
import sys
import pickle
from pprint import pprint 
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from data.make_casting_graph import oneway_to_bidirected_graph
from scipy.sparse import csc_matrix
import time
from pagerank import pagerank
from sklearn.preprocessing import normalize
from pyvis.network import Network

In [2]:
# Build the movie-idx -> number-of-comments lookup from the ratings file.
with open('./data/ratings.csv', encoding='utf-8') as f:
    next(f, None)  # the first line is skipped, exactly as the old [1:] slice did
    docs = [line.strip().split(',') for line in f]
    # Each row unpacks as (count, movie idx); the count is stored as an int.
    _idx2numcomments = {}
    for num, movie_idx in docs:
        _idx2numcomments[movie_idx] = int(num)

In [3]:
# pre defined casting weight graph
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so this
# file must come from a trusted source.
with open('./data/casting_graph.pkl', 'rb') as f:
    graph = pickle.load(f)

In [4]:
# Build the actor-idx -> actor-name lookup.
with open('./data/actors.csv', encoding='utf-8') as f:
    # Skip the header exactly once. The previous version called next(f) AND
    # sliced readlines()[1:], which silently dropped the first data row too.
    next(f, None)
    docs = [line.rstrip('\n').split(',') for line in f]
    # Column 0 is used as the idx and column 1 as the name. Rows that do not
    # have both columns (e.g. a trailing blank line) are ignored.
    _idx2actor = {doc[0]: doc[1] for doc in docs if len(doc) > 1}

In [5]:
# Build the movie-idx -> movie-title lookup.
with open('./data/movies.csv', encoding='utf-8') as f:
    # Skip the header exactly once. The previous version called next(f) AND
    # sliced readlines()[1:], which silently dropped the first data row too.
    next(f, None)
    docs = [line.rstrip('\n').split(',') for line in f]
    # The old filter tested len(docs) (the whole list, always truthy); the
    # check belongs on each row so rows without a title column are skipped.
    _idx2movie = {doc[0]: doc[1] for doc in docs if len(doc) > 1}

In [6]:
def idx2movie(idx):
    """Return the title stored for ``idx``, or 'Unknown' when it is missing."""
    return _idx2movie.get(idx, 'Unknown')


def idx2actor(idx):
    """Return the actor name stored for ``idx``, or 'Unknown' when it is missing."""
    return _idx2actor.get(idx, 'Unknown')


def idx2numcomments(idx):
    """Return the comment count stored for ``idx``, or 0 when it is missing."""
    return _idx2numcomments.get(idx, 0)

In [7]:
# Convert the pickled casting graph with the project helper. Later cells treat
# `g` as a mapping node -> {neighbor: weight} with labels like 'movie <id>' /
# 'actor <id>'. Presumably the helper adds the reverse of every edge -- confirm
# in data/make_casting_graph.py.
g = oneway_to_bidirected_graph(graph)

In [8]:
# Show the ten movies with the most comments, largest count first.
top_by_comments = sorted(_idx2numcomments.items(), key=lambda x: x[1], reverse=True)[:10]
for movie_idx, n_comments in top_by_comments:
    print(idx2movie(movie_idx), n_comments)

기생충 40
극한직업 15
마약왕 15
인터스텔라 14
어벤져스: 엔드게임 12
걸캅스 12
마녀 12
택시운전사 11
배심원들 11
신과함께-죄와 벌 11


## [과제 1] 성능 비교 - dict

In [9]:
# Personalised PageRank on the dict graph.
# Teleport mass goes only to movie nodes (label starts with 'm'), in proportion
# to their comment count; every other node gets 0.
bias = {node: (idx2numcomments(node.split()[1]) if node[0] == 'm' else 0) for node in g}
_sum = sum(bias.values())
if _sum == 0:
    # Without this guard the normalisation below fails with a bare ZeroDivisionError.
    raise ValueError('no movie node has any comments; the bias vector cannot be normalised')
bias = {node: b / _sum for node, b in bias.items()}

# perf_counter is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
starttime = time.perf_counter()
rank = pagerank(g,
                bias=bias,
                df=0.15,
                max_iter=30,
                converge_error=0.001,
                verbose=1)
print('computation time: {0}s'.format(time.perf_counter() - starttime))

Iteration = 1, diff = 0.6745935594038658, sum = 1.0000000000000029
Iteration = 2, diff = 0.5133755765513064, sum = 1.0000000000000078
Iteration = 3, diff = 0.40708434710252833, sum = 1.0000000000000075
Iteration = 4, diff = 0.32881145690448826, sum = 1.0000000000000002
Iteration = 5, diff = 0.2690000626169734, sum = 1.000000000000005
Iteration = 6, diff = 0.22172923044566514, sum = 0.9999999999999897
Iteration = 7, diff = 0.1837276549699306, sum = 0.9999999999999932
Iteration = 8, diff = 0.15290648077655514, sum = 1.0000000000000075
Iteration = 9, diff = 0.12756391624362132, sum = 0.9999999999999944
Iteration = 10, diff = 0.10676563571706411, sum = 0.9999999999999946
Iteration = 11, diff = 0.08947335545631493, sum = 1.0000000000000036
Iteration = 12, diff = 0.0751701431966286, sum = 1.000000000000012
Iteration = 13, diff = 0.06318528811144823, sum = 0.9999999999999936
Iteration = 14, diff = 0.053206090978406756, sum = 0.9999999999999923
Iteration = 15, diff = 0.04483047792706735, sum =

## [과제 2] 영화 Top 10 - dict

In [10]:
from pandas import Series, DataFrame

# Keep only the movie nodes of the dict PageRank result. Labels look like
# 'movie <id>', so dropping the first six characters leaves the bare id.
rank_movie = [(node[6:], rank[node]) for node in rank if node.startswith('movie')]

movie_rank = pd.DataFrame(rank_movie).rename(columns={0: 'movieId', 1: 'rank'})
movie_rank

Unnamed: 0,movieId,rank
0,94187,0.000291
1,163842,0.000198
2,25915,0.000290
3,34431,0.000074
4,36609,0.000124
5,60016,0.000091
6,66402,0.000102
7,152344,0.000334
8,82262,0.000109
9,105521,0.000350


In [11]:
# Ten highest-scoring movies of the dict-based run, best first.
sorting_rank = movie_rank.sort_values(by='rank', ascending=False).head(10)
sorting_rank

Unnamed: 0,movieId,rank
342,161967,0.003203
1698,167651,0.00143
770,175322,0.001157
1584,156464,0.001153
314,130966,0.001099
1587,177483,0.000947
2661,174065,0.000935
810,37886,0.000918
2681,154449,0.000918
541,163788,0.0008


In [12]:
# Attach the title of every top-ranked movie in one vectorised lookup.
# The previous version scanned all movies for each of the ten rows and wrote
# strings into an int-initialised column through iloc, which pandas flags as an
# incompatible-dtype assignment. It also left the placeholder 0 for ids with no
# title; those now read 'Unknown' via idx2movie.
# assign() returns a new frame, so re-running this cell is safe; the new column
# is still the third one, which the printing cell relies on.
sorting_rank = sorting_rank.assign(movieName=sorting_rank['movieId'].map(idx2movie))

sorting_rank

Unnamed: 0,movieId,rank,movieName
342,161967,0.003203,기생충
1698,167651,0.00143,극한직업
770,175322,0.001157,마녀
1584,156464,0.001153,보헤미안 랩소디
314,130966,0.001099,부산행
1587,177483,0.000947,배심원들
2661,174065,0.000935,걸캅스
810,37886,0.000918,클레멘타인
2681,154449,0.000918,리틀 포레스트
541,163788,0.0008,알라딘


In [13]:
# Print the first three columns of each of the ten rows as one comma-separated line.
for row in range(10):
    fields = [str(sorting_rank.iloc[row, col]) for col in range(3)]
    print(", ".join(fields))

161967, 0.0032033878121671224, 기생충
167651, 0.0014303471787626468, 극한직업
175322, 0.0011565783119412997, 마녀
156464, 0.0011527961465662747, 보헤미안 랩소디
130966, 0.001098819013448319, 부산행
177483, 0.0009469824923736168, 배심원들
174065, 0.0009354687095915042, 걸캅스
37886, 0.000918249213245038, 클레멘타인
154449, 0.00091821747845663, 리틀 포레스트
163788, 0.0007997936563664337, 알라딘


## [과제 1] 성능 비교 - numpy

In [14]:
# Fix one deterministic node order so vector and matrix positions line up.
# (np and csc_matrix are already imported in the first cell.)
nodes = set(g.keys())
idx2node = list(sorted(nodes))
node2idx = {node: idx for idx, node in enumerate(idx2node)}

# `bias` arrives from the dict cell as {node: weight}; turn it into a vector in
# node order. The isinstance guard keeps the cell re-runnable: without it a
# second run fails because `bias` is already an ndarray with no .items().
if isinstance(bias, dict):
    bias = np.asarray([bias[node] for node in idx2node])
print(bias.shape)

rows = []
cols = []
data = []

for from_node, to_dict in g.items():
    from_idx = node2idx[from_node]
    for to_node, weight in to_dict.items():
        rows.append(from_idx)
        cols.append(node2idx[to_node])
        data.append(weight)

# Pass the shape explicitly: when it is inferred from the largest index seen,
# the matrix comes out smaller than the bias vector if the last node(s) in the
# ordering have no edges.
n_nodes = len(idx2node)
A = csc_matrix((data, (rows, cols)), shape=(n_nodes, n_nodes))
print(A.shape)

(6154,)
(6154, 6154)


In [15]:
max_iter = 30
df = 0.85

# Start from the uniform distribution over all nodes.
ir = 1 / A.shape[0]
rank = np.full(A.shape[0], ir)

starttime2 = time.time()
for n_iter in range(1, max_iter + 1):
    # Propagate through the sparse matrix, then rescale to unit L1 norm.
    propagated = normalize(A.dot(rank).reshape(1, -1), norm='l1').reshape(-1)
    # Blend the propagated scores with the bias vector.
    rank_new = df * propagated + (1 - df) * bias
    diff = np.abs(rank - rank_new).sum()
    rank = rank_new
    print('iter {} : diff = {}'.format(n_iter, diff))

print('computation time: {0}s'.format(time.time()-starttime2))

iter 1 : diff = 0.1685245368865779
iter 2 : diff = 0.123534416788289
iter 3 : diff = 0.11717242074154521
iter 4 : diff = 0.08676250638774644
iter 5 : diff = 0.08106650827175174
iter 6 : diff = 0.06044614044638538
iter 7 : diff = 0.05589952786903922
iter 8 : diff = 0.04188475454126574
iter 9 : diff = 0.038452782327255894
iter 10 : diff = 0.0289095171904886
iter 11 : diff = 0.026405522194198443
iter 12 : diff = 0.01994486388644759
iter 13 : diff = 0.01811137289916391
iter 14 : diff = 0.013753287448751986
iter 15 : diff = 0.012408911428306675
iter 16 : diff = 0.009469243738374537
iter 17 : diff = 0.008494000468005527
iter 18 : diff = 0.006511648928942716
iter 19 : diff = 0.005809774127703195
iter 20 : diff = 0.004473307017566352
iter 21 : diff = 0.0039712967053357525
iter 22 : diff = 0.0030704578506105173
iter 23 : diff = 0.0027152845982687866
iter 24 : diff = 0.002106149459828414
iter 25 : diff = 0.0018577039374234091
iter 26 : diff = 0.0014438021951808503
iter 27 : diff = 0.001270456142

## [과제 2] 영화 Top 10 - numpy

In [16]:
# Map vector positions back to node labels.
rank_ = dict(zip(idx2node, rank))

# Split the scores by node type using the label text.
movierank = {}
actorrank = {}
for node, value in rank_.items():
    if 'movie' in node:
        movierank[node] = value
    if 'actor' in node:
        actorrank[node] = value

# Print the ten best movies and remember their node labels.
numpy_list = []
top_movies = sorted(movierank.items(), key=lambda x: -x[1])[:10]
for movie_node, score in top_movies:
    movie_idx = movie_node.split()[1]
    print(", ".join([str(movie_node), idx2movie(movie_idx), str(score)]))
    numpy_list.append(str(movie_node))

movie 161967, 기생충, 0.0015437432925532173
movie 156464, 보헤미안 랩소디, 0.0010864984266341052
movie 175322, 마녀, 0.0008946794759721638
movie 174065, 걸캅스, 0.0008564445054703045
movie 167651, 극한직업, 0.0007648489380972874
movie 37886, 클레멘타인, 0.000728929546919159
movie 157297, 마약왕, 0.0007133104346250872
movie 71509, 아저씨, 0.0006938076365826392
movie 136900, 어벤져스: 엔드게임, 0.0006567566198412949
movie 163788, 알라딘, 0.000638759850450271


## [과제 3] 영화 Top 10 노드 시각화 - dict

In [17]:
# Collect the graph nodes of the dict top-10 movies together with their
# neighbor mappings. Results follow the graph's key order, not the rank order.
movie_page = []
actor_page = []
top_movie_ids = list(sorting_rank['movieId'])
for node in g:
    if not node.startswith('movie'):
        continue
    node_id = node[6:]  # strip the 'movie ' prefix
    for wanted_id in top_movie_ids:
        if node_id == wanted_id:
            movie_page.append(node)
            actor_page.append(g[node])

In [18]:
# One-column table of the selected movie nodes, built in a single call.
# The previous row-by-row .loc loop hard-coded 10 rows and raised IndexError
# whenever fewer than ten nodes had been matched; slicing copes with that.
movie = pd.DataFrame({'id': movie_page[:10]})
movie

Unnamed: 0,id
0,movie 161967
1,movie 163788
2,movie 167651
3,movie 175322
4,movie 130966
5,movie 156464
6,movie 37886
7,movie 174065
8,movie 177483
9,movie 154449


In [19]:
# Edge table for the visualisation: every top movie -> neighbor edge, followed
# by all edges leaving that neighbor.
# Edges are gathered in a plain list and the frame is built once; growing a
# DataFrame cell by cell with .loc copies the data on every step and is
# quadratic in the number of edges.
edges = []
for movie_node in movie['id']:
    for neighbor in g[movie_node]:
        edges.append((movie_node, neighbor))
        edges.extend((neighbor, second_hop) for second_hop in g[neighbor])

dict_graph = pd.DataFrame(edges, columns=['src', 'destination'])
dict_graph

Unnamed: 0,src,destination
0,movie 161967,actor 793
1,actor 793,movie 161967
2,actor 793,movie 47414
3,actor 793,movie 62328
4,actor 793,movie 121922
5,actor 793,movie 157297
6,actor 793,movie 39841
7,actor 793,movie 65674
8,actor 793,movie 59845
9,actor 793,movie 48747


In [20]:
net = Network(height="750px", width="100%", bgcolor="#222222", font_color="white")
net.barnes_hut()

sources = dict_graph['src']
targets = dict_graph['destination']

# Every edge carries the same hover title.
w = 1
for src, dst in zip(sources, targets):
    net.add_node(src, src, title=src)
    net.add_node(dst, dst, title=dst)
    net.add_edge(src, dst, title=w)

neighbor_map = net.get_adj_list()

# Extend each node's hover text with its neighbors and record the neighbor
# count as the node's "value".
for node in net.nodes:
    neighbors = neighbor_map[node["id"]]
    node["title"] += " Neighbors:<br>" + "<br>".join(neighbors)
    node["value"] = len(neighbors)

net.show("dict_graph.html")

## [과제 3] 영화 Top 10 노드 시각화 - numpy

In [21]:
# Node labels of the numpy top-10 movies, in the order they were ranked and
# appended by the numpy ranking cell (highest score first).
numpy_list

['movie 161967',
 'movie 156464',
 'movie 175322',
 'movie 174065',
 'movie 167651',
 'movie 37886',
 'movie 157297',
 'movie 71509',
 'movie 136900',
 'movie 163788']

In [22]:
# Edge table for the numpy top-10: every top movie -> neighbor edge, followed
# by all edges leaving that neighbor.
# Edges are gathered in a plain list and the frame is built once; growing a
# DataFrame cell by cell with .loc copies the data on every step and is
# quadratic in the number of edges.
edges = []
for movie_node in numpy_list:
    for neighbor in g[movie_node]:
        edges.append((movie_node, neighbor))
        edges.extend((neighbor, second_hop) for second_hop in g[neighbor])

num_graph = pd.DataFrame(edges, columns=['src', 'destination'])
num_graph

Unnamed: 0,src,destination
0,movie 161967,actor 793
1,actor 793,movie 161967
2,actor 793,movie 47414
3,actor 793,movie 62328
4,actor 793,movie 121922
5,actor 793,movie 157297
6,actor 793,movie 39841
7,actor 793,movie 65674
8,actor 793,movie 59845
9,actor 793,movie 48747


In [23]:
net = Network(height="750px", width="100%", bgcolor="#222222", font_color="white")
net.barnes_hut()

sources = num_graph['src']
targets = num_graph['destination']

# Every edge carries the same hover title.
w = 1
for src, dst in zip(sources, targets):
    net.add_node(src, src, title=src)
    net.add_node(dst, dst, title=dst)
    net.add_edge(src, dst, title=w)

neighbor_map = net.get_adj_list()

# Extend each node's hover text with its neighbors and record the neighbor
# count as the node's "value".
for node in net.nodes:
    neighbors = neighbor_map[node["id"]]
    node["title"] += " Neighbors:<br>" + "<br>".join(neighbors)
    node["value"] = len(neighbors)

net.show("numpy_graph.html")