In [1]:
import os

import numpy as np
import pandas as pd

### Pré-processando os dados

Carregando os dados

In [2]:
# Column labels applied to the CSV, in file order; the first line of the
# file is read as data, not as a header row.
SOURCE = 'source'
TARGET = 'target'
RATING = 'rating'
TIME = 'time'

file_path = '../data/soc-sign-bitcoinotc.csv'
data = pd.read_csv(file_path, header=None, names=[SOURCE, TARGET, RATING, TIME])

Filtrando por aqueles com Rating superior ou igual a 8:

In [3]:
# Keep only the rows whose rating is 8 or higher.
data = data[data[RATING].ge(8)]

Desconsiderando a coluna de Tempo:

In [4]:
# Keep every row, but only the source, target and rating columns.
data = data.loc[:, [SOURCE, TARGET, RATING]]

Nossos dados têm a seguinte cara:

In [5]:
# Display the first rows of the filtered frame as this cell's output.
data.head()

Unnamed: 0,source,target,rating
4,13,16,8
5,13,10,8
10,21,1,8
11,21,10,8
12,21,8,9


Guardando, para cada nó, a quantidade de nós para os quais ele aponta (essa informação será usada para construir a matriz de adjacência):

In [6]:
# Map each source node to the number of rows in `data` where it is the source.
# Grouping by the column label itself (not a one-element list) keeps the keys
# as scalar node ids; with a list, pandas >= 2.0 yields 1-tuples such as
# (13,), which breaks later lookups by node id.
n_pointing_to = data.groupby(SOURCE).size().to_dict()

Lista ordenada com todos os nós:

In [7]:
# Sorted list of every node id that appears as a source or as a target.
# Built with a set union because Series.append was removed in pandas 2.0.
nodes = sorted(set(data[SOURCE]).union(data[TARGET]))

Construindo a matriz de adjacência:

In [8]:
# Column-normalised adjacency matrix, indexed by node id on both axes:
# entry [target, source] is 1 / n_pointing_to[source] when `data` has a row
# source -> target, and 0 otherwise. Nodes that never appear as a source keep
# an all-zero column.
node_position = {node: position for position, node in enumerate(nodes)}

# Fill a plain NumPy array and wrap it in a DataFrame once at the end.
# Appending DataFrame rows one by one and calling nodes.index() for every
# edge is quadratic; the dict lookup makes this a single pass over the edges.
transition_weights = np.zeros((len(nodes), len(nodes)))
for source, target in zip(data[SOURCE], data[TARGET]):
    transition_weights[node_position[target], node_position[source]] = 1 / n_pointing_to[source]

adjacency_matrix = pd.DataFrame(transition_weights, index=nodes, columns=nodes)

### PageRank

In [9]:
# Weight of the uniform jump term in the PageRank matrix; the adjacency
# (link-following) term is weighted by 1 - teleportation_factor.
teleportation_factor = 0.15

In [10]:
# Uniform jump term: a square matrix with one row/column per node whose
# every entry is teleportation_factor / len(nodes).
B = np.matrix(np.full((len(nodes), len(nodes)), teleportation_factor / len(nodes)))

In [11]:
# Matrix applied at every PageRank step: the uniform jump term plus the
# adjacency matrix scaled by (1 - teleportation_factor).
M = B + (1 - teleportation_factor) * adjacency_matrix.to_numpy()

In [12]:
# Starting vector: a column with one entry per node, each equal to 1 / len(nodes).
page_rank_vector = np.matrix(np.full((len(nodes), 1), 1 / len(nodes)))

In [13]:
def pagerank(page_rank_vector, curr_iter, transition_matrix=None, tolerance=0.001, max_iter=1000):
    """Repeat ``v <- T * v`` until the change between two iterations is small.

    The iteration is a plain loop (not recursion), so the number of steps is
    not bounded by the interpreter's recursion limit, and the product
    ``T * v`` is computed once per step.

    Args:
        page_rank_vector: starting column vector as an ``np.matrix`` (``*`` is
            used as the matrix product).
        curr_iter: number printed for the first iteration; it is incremented
            by one for every further iteration.
        transition_matrix: square ``np.matrix`` applied at each step. When
            omitted, the notebook-level ``M`` is used.
        tolerance: the loop stops once the summed absolute change between two
            consecutive vectors is below this value.
        max_iter: maximum number of iterations performed by this call.

    Returns:
        ``transition_matrix * v`` for the first ``v`` whose update changes it
        by less than ``tolerance``.

    Raises:
        RuntimeError: if ``max_iter`` iterations run without reaching
            ``tolerance``.
    """
    if transition_matrix is None:
        transition_matrix = M

    for _ in range(max_iter):
        next_vector = transition_matrix * page_rank_vector
        # Built-in sum over the rows of a column matrix yields a 1x1 matrix,
        # which is what gets printed and compared below.
        diff_between_iterations = sum(abs(next_vector - page_rank_vector))

        print('Number of currently iteration:', curr_iter)
        print('Diff of page rank vector between iterations:', diff_between_iterations, '\n')

        if diff_between_iterations < tolerance:
            print('Converged!')
            return next_vector

        page_rank_vector = next_vector
        curr_iter += 1

    raise RuntimeError('pagerank did not converge within {} iterations'.format(max_iter))

In [14]:
# Run the iteration from the starting vector; 1 is the number printed for
# the first iteration.
result = pagerank(page_rank_vector, 1)

Number of currently iteration: 1
Diff of page rank vector between iterations: [[0.66774917]] 

Number of currently iteration: 2
Diff of page rank vector between iterations: [[0.38115283]] 

Number of currently iteration: 3
Diff of page rank vector between iterations: [[0.17744513]] 

Number of currently iteration: 4
Diff of page rank vector between iterations: [[0.11254746]] 

Number of currently iteration: 5
Diff of page rank vector between iterations: [[0.07910257]] 

Number of currently iteration: 6
Diff of page rank vector between iterations: [[0.05877311]] 

Number of currently iteration: 7
Diff of page rank vector between iterations: [[0.04590417]] 

Number of currently iteration: 8
Diff of page rank vector between iterations: [[0.03712659]] 

Number of currently iteration: 9
Diff of page rank vector between iterations: [[0.03054642]] 

Number of currently iteration: 10
Diff of page rank vector between iterations: [[0.02559546]] 

Number of currently iteration: 11
Diff of page ra

### Exportando resultados para csv

In [15]:
# Row labels of the adjacency matrix, as a plain list.
node_id_list = adjacency_matrix.index.tolist()

In [16]:
# `result.tolist()` gives one single-element list per row; PR_INDEX selects
# that element so the values end up in a flat list.
PR_INDEX = 0
page_rank_list = [row[PR_INDEX] for row in result.tolist()]

In [17]:
# Columns of the exported node table: the node id is used both as 'Id' and
# as 'Label'.
# NOTE(review): the PageRank values are stored under the 'Modularity Class'
# key, presumably so the Gephi step can use that column; confirm the label
# is intentional.
nodes_dict = {'Id': node_id_list, 'Label': node_id_list, 'Modularity Class': page_rank_list}

In [18]:
# One column per key of nodes_dict, one row per node.
nodes_dataframe = pd.DataFrame(nodes_dict)

In [19]:
# to_csv does not create missing directories, so make sure the output folder
# exists before writing (no-op when it is already there).
os.makedirs('result', exist_ok=True)

# Node table and the filtered edge list, both written without the index column.
nodes_dataframe.to_csv('result/nodes.csv', index=False)
data.to_csv('result/edges.csv', index=False)

### Criando grafos com a ferramenta Gephi

<img src="gephi_imgs/result.png">

### Perguntas

#### Quantas iterações o PageRank precisou rodar até atingir convergência?
- R: 35 iterações!

#### Quais os 5 investidores mais importantes segundo o PageRank? Quais seus valores de PageRank?

In [20]:
# Pair each PageRank value with its node id, order the pairs from highest to
# lowest and keep the first five.
top_investors = sorted(zip(page_rank_list, node_id_list), reverse=True)[:5]

print('PageRank', ' - ', 'Investidor')
for page_rank_value, investor_id in top_investors:
    print('{:.7f}   {:>10}'.format(page_rank_value, investor_id))

PageRank  -  Investidor
0.0001390            1
0.0001247          202
0.0001174          144
0.0000928         3996
0.0000915          361


#### Como você poderia usar o PageRank caso você fosse um investidor em bitcoins?  
- R: O modelo Bitcoin pode ser representado por grafos, visto que se baseia em transações peer-to-peer. O PageRank pode ser utilizado para elaborar uma estratégia de transações para um dado investidor: por exemplo, encontrar os usuários de um mercado com quem seja mais propício trocar moedas. Se um usuário tem um PageRank maior que o de outros, ele tende a ser uma opção melhor para se fazer uma transação.