In [2]:
from pathlib import Path

import numpy as np
import pandas as pd

### Pré-processando os dados

Carregando os dados

In [3]:
# Labels given to the four columns of the CSV, which is read without a header row.
SOURCE = 'source'
TARGET = 'target'
RATING = 'rating'
TIME = 'time'

file_path = '../data/soc-sign-bitcoinotc.csv'
data = pd.read_csv(file_path, header=None, names=[SOURCE, TARGET, RATING, TIME])

Filtrando por aqueles com Rating superior ou igual a 8:

In [4]:
# Keep only the rows whose rating is at least 8.
data = data.loc[data[RATING].ge(8)]

Desconsiderando a coluna de Tempo:

In [5]:
# Drop the time column by keeping only source, target and rating.
data = data[[SOURCE, TARGET, RATING]]

Normalizando a coluna Rating para os valores ficarem entre 0 e 1:

In [6]:
# Scale the ratings by 1/10 with a vectorised division. `assign` returns a new
# frame, so nothing is written into a DataFrame that was derived by slicing
# (which is what triggers pandas' SettingWithCopyWarning).
data = data.assign(**{RATING: data[RATING] / 10.0})

Nossos dados têm a seguinte cara:

In [7]:
# Preview the first ten rows of the edge table.
data.head(10)

Unnamed: 0,source,target,rating
4,13,16,0.8
5,13,10,0.8
10,21,1,0.8
11,21,10,0.8
12,21,8,0.9
16,10,1,0.8
18,10,21,0.8
20,10,25,1.0
41,13,1,0.8
52,1,17,0.9


Guardando, para cada nó, a quantidade de nós para os quais ele aponta (essa informação será usada para construir a matriz de adjacência):

In [8]:
# Number of outgoing edges per source node, keyed by the node id itself.
# Grouping by the bare column name (not a one-element list) keeps the keys
# scalar: with `groupby([SOURCE])` pandas >= 2.0 yields 1-tuples such as (13,),
# which would make the later `n_pointing_to[source]` lookups fail.
n_pointing_to = data.groupby(SOURCE).size().to_dict()

Lista ordenada com todos os nós:

In [9]:
# Every node id that appears as a source or as a target, in ascending order.
# Built with a set union because `Series.append` no longer exists in pandas >= 2.0.
nodes = sorted(set(data[SOURCE]).union(data[TARGET]))

Construindo a matriz de adjacência:

In [10]:
# Position of each node id inside `nodes`; replaces a linear `nodes.index()`
# search per edge with a constant-time lookup.
node_position = {node: position for position, node in enumerate(nodes)}

# weights[i, j] holds the rating of the edge nodes[j] -> nodes[i] divided by the
# out-degree of nodes[j], i.e. columns are sources and rows are targets (the
# orientation the row-by-row construction reached after its final transpose).
# Pairs without an edge stay at 0.
weights = np.zeros((len(nodes), len(nodes)))
for source, target, rating in zip(data[SOURCE], data[TARGET], data[RATING]):
    weights[node_position[target], node_position[source]] = rating / n_pointing_to[source]

# Build the labelled frame once instead of enlarging it one row at a time.
adjacency_matrix = pd.DataFrame(weights, index=nodes, columns=nodes)

### PageRank

In [11]:
# Weight of the uniform term B; the adjacency matrix is scaled by the remaining
# 1 - teleportation_factor when M is assembled.
teleportation_factor = 0.15

In [12]:
# Uniform teleportation term: an n x n matrix whose entries all equal
# teleportation_factor / n. NumPy fills it directly, so no n x n Python list of
# ones is materialised first. Kept as np.matrix, the type the list-based
# construction produced, so that M stays an np.matrix as well.
B = np.asmatrix(np.full((len(nodes), len(nodes)), teleportation_factor / len(nodes)))

In [13]:
# Transition matrix: the adjacency weights scaled by 1 - teleportation_factor,
# plus the uniform teleportation term B.
M = B + (1 - teleportation_factor) * adjacency_matrix.values

In [14]:
# Starting estimate: an n x 1 column matrix with every entry equal to 1 / n.
page_rank_vector = np.asmatrix(np.full((len(nodes), 1), 1 / len(nodes)))

In [15]:
def pagerank(page_rank_vector, curr_iter, transition_matrix=None, tolerance=0.001):
    """Apply the transition matrix repeatedly until the rank vector settles.

    Each step multiplies the current vector by the transition matrix and
    measures the sum of absolute differences between the new and the current
    vector. The step number and that difference are printed on every step.

    The loop is iterative rather than recursive, so a slowly converging run
    cannot exhaust Python's recursion limit, and the matrix-vector product is
    computed once per step.

    Parameters
    ----------
    page_rank_vector : column vector (np.matrix or 2-D array)
        Starting estimate.
    curr_iter : int
        Number printed for the first step; incremented by one on each
        following step.
    transition_matrix : square matrix, optional
        Matrix applied at every step. When omitted, the notebook-level ``M``
        is used.
    tolerance : float, optional
        The iteration stops once the summed absolute difference is below this
        value. Defaults to 0.001.

    Returns
    -------
    The product of the transition matrix and the first vector whose step
    difference fell below ``tolerance``.
    """
    if transition_matrix is None:
        # Resolved at call time so the function keeps working with the global M.
        transition_matrix = M

    while True:
        # np.dot is the operation np.matrix performs for `*`; unlike `*`, it is
        # also a matrix product when plain ndarrays are passed in.
        next_vector = np.dot(transition_matrix, page_rank_vector)
        diff_between_iterations = sum(abs(next_vector - page_rank_vector))

        print('Number of currently iteration:', curr_iter)
        print('Diff of page rank vector between iterations:', diff_between_iterations, '\n')

        if diff_between_iterations < tolerance:
            print('Converged!')
            return next_vector

        page_rank_vector = next_vector
        curr_iter += 1

In [16]:
# Run the iteration from the uniform starting vector, numbering steps from 1.
result = pagerank(page_rank_vector, curr_iter=1)

Number of currently iteration: 1
Diff of page rank vector between iterations: [[0.6617727]] 

Number of currently iteration: 2
Diff of page rank vector between iterations: [[0.36270756]] 

Number of currently iteration: 3
Diff of page rank vector between iterations: [[0.16347109]] 

Number of currently iteration: 4
Diff of page rank vector between iterations: [[0.10193408]] 

Number of currently iteration: 5
Diff of page rank vector between iterations: [[0.07076079]] 

Number of currently iteration: 6
Diff of page rank vector between iterations: [[0.05212454]] 

Number of currently iteration: 7
Diff of page rank vector between iterations: [[0.03942961]] 

Number of currently iteration: 8
Diff of page rank vector between iterations: [[0.03089666]] 

Number of currently iteration: 9
Diff of page rank vector between iterations: [[0.02447621]] 

Number of currently iteration: 10
Diff of page rank vector between iterations: [[0.01980965]] 

Number of currently iteration: 11
Diff of page ran

### Exportando resultados para csv

In [20]:
# Node ids in the row order of the adjacency matrix (stray trailing characters
# that made this cell a syntax error were removed).
node_id_list = adjacency_matrix.index.tolist()

In [21]:
# `result.tolist()` yields one list per row of the result; PR_INDEX picks the
# score out of each of those row lists.
PR_INDEX = 0
page_rank_list = [row[PR_INDEX] for row in result.tolist()]

In [23]:
# Column layout of the exported node table: node ids under 'Id' and the PageRank
# scores under 'Modularity Class'.
# NOTE(review): presumably that header was chosen so Gephi treats the score as a
# node attribute it can colour by — confirm before renaming it.
nodes_dict = {'Id': node_id_list, 'Modularity Class': page_rank_list}

In [24]:
# One column per dictionary key, one row per node.
nodes_dataframe = pd.DataFrame(nodes_dict)

In [25]:
# Preview the first rows of the node table before it is exported.
nodes_dataframe.head()

Unnamed: 0,Id,Modularity Class
0,1,8.2e-05
1,2,1.3e-05
2,3,2e-06
3,4,2.5e-05
4,6,7e-06


In [26]:
# Write the node and edge tables to result/. The directory is created first
# because `to_csv` does not create missing parent directories, which would make
# this cell fail on a fresh checkout.
output_dir = Path('result')
output_dir.mkdir(parents=True, exist_ok=True)
nodes_dataframe.to_csv(output_dir / 'nodes.csv', index=False)
data.to_csv(output_dir / 'edges.csv', index=False)

### Criando grafos com a ferramenta Gephi

<img src="gephi_imgs/all.png">

Zoom em um nó específico (as cores das arestas na imagem abaixo representam o rating: quanto mais escuro o tom, maior é o rating):

<img src="gephi_imgs/zoom.png">

### Perguntas

#### Quantas iterações o PageRank precisou rodar até atingir convergência?
- R: 29 iterações!

#### Quais os 5 investidores mais importantes segundo o PageRank? Quais seus valores de PageRank?

In [58]:
# Pair every score with its node id and sort the pairs in descending order
# (score first, node id as tie-breaker), then print the five leading pairs.
print('PageRank', ' - ', 'Investidor')
ranking = sorted(zip(page_rank_list, node_id_list), reverse=True)
for pr, inv_id in ranking[:5]:
    print('{:.7f}   {:>10}'.format(pr, inv_id))

PageRank  -  Investidor
0.0001086         3996
0.0001023          361
0.0000824            1
0.0000602          623
0.0000539           25


#### Como você poderia usar o PageRank caso você fosse um investidor em bitcoins?  
- R: O modelo Bitcoin pode ser representado por grafos, visto que se baseia em transações Peer to Peer. O PageRank pode ser utilizado para elaborar uma estratégia de transações para um dado investidor, por exemplo, encontrar usuários de um mercado com quem possivelmente seja mais propício trocar moedas.

notes:
- https://www.evernote.com/Home.action?login=true#n=96a140dd-0c4f-49f0-a5f6-083037c9c75b&s=s540&ses=4&sh=2&sds=5&  
- http://www.ams.org/publicoutreach/feature-column/fcarc-pagerank   
- https://gephi.org/users/quick-start/   
- https://www.youtube.com/watch?v=zv4OVNWfVt4