In [350]:
import os
import re
import numpy as np
import pandas as pd

# for networkxdigraph
import sys
from pymongo import MongoClient
import networkx as nx


In [203]:
pd.set_option("display.max_rows", 100)

In [348]:
class Acordao:

    def __init__( self, idAcordao, tribunal, relator, virtual, similares=[], citacoes=[]):
        self.idAcordao = idAcordao
        self.tribunal = tribunal
        self.relator = relator
        self.citacoes = citacoes
        self.similares = similares
        self.virtual = virtual

    def getId( self):
        return self.idAcordao

    def getTribunal( self):
        return self.tribunal

    def getRelator( self):
        return self.relator

#     def getCitacoes( self):
#         return self.citacoes

#     def getSimilares( self):
#         return self.similares

    def getVirtual( self):
        return self.virtual


class GraphMaker:
    def __init__(
        self, mongo_uri, dbName, collections_in, collectionOutName,
    ):
        client = MongoClient(mongo_uri)
        self.db = client[dbName]
        self.collectionsIn = collections_in
        self.collectionOut = self.db[collectionOutName]
        self.collectionOut.drop()
        self.onePercent = sum([coll.count() for coll in self.collectionsIn]) / 100
        self.count = 0
        self.progress = 0

    def set_collections_out(self, collection_out_name):
        self.collectionOut = self.db[collection_out_name]
        self.collectionOut.drop()

    def save_removed_decisions(self, i, removed_decisions, collection_out_name):
        removed_coll = self.db[collection_out_name + "_removed_%d" % i]
        removed_coll.drop()

        removed_coll.insert_one(
            {"iteration": i, "removed_decisions": removed_decisions}
        )

    def __addElemSetToDict(self, aDict, elemKey, elemValue):
        if elemKey not in aDict:
            aDict[elemKey] = set()

        aDict[elemKey].add(elemValue)
        return aDict

    def removeInvalidAcordaosFromDicts(self, validAcordaos, quotes, quotedBy):
        """
            Remove do 'quotedBy' acórdãos que não estão presentes no BD ou nos similares apontados
            por decisões do BD. 'quotes' fica apenas com decisões citadas presentes no BD ou nos
            similares de uma determinada decisão.
        """
        for docId, quotesId in quotes.items():
            newQuotesId = set()
            for q in quotesId:
                if q in validAcordaos:
                    newQuotesId.add(q)
                else:
                    quotedBy.pop(q, 0)

            quotes[docId] = newQuotesId

        return [quotes, quotedBy]

    def buildDicts(self, query, removed_decisions, compute_similars):
        acordaos = {}
        quotes = {}
        quotedBy = {}
        similars = {}

        print("building map")

        self.count = self.progress = 0

        for coll in self.collectionsIn:
            decisions_set = list(coll.find({}, no_cursor_timeout=True))
            dec_relator_trib = {
                dec["acordaoId"]: [
                    re.sub(r"\s*\(.+", "", dec["relator"]),
                    dec["tribunal"],
                ]
                for dec in decisions_set
            }
            docsFound = coll.find(query, no_cursor_timeout=True)
            for doc in docsFound:
                if doc["acordaoId"] in removed_decisions:
                    continue

                docId = doc["acordaoId"]
                for quotedId in doc["citacoesObs"]:
                    if (quotedId in removed_decisions) or (docId == quotedId):
                        continue
                    if quotedId not in acordaos:
                        relator, tribunal = (
                            dec_relator_trib[quotedId]
                            if quotedId in dec_relator_trib
                            else ["", ""]
                        )
                        acordaos[quotedId] = Acordao(quotedId, tribunal, relator, False)

                    quotes = self.__addElemSetToDict(quotes, docId, quotedId)
                    quotedBy = self.__addElemSetToDict(quotedBy, quotedId, docId)

                # similares são decisões (nós) virtuais que apontam para citacoes de 'docId'
                if compute_similars == "S":
                    for similar in doc["similares"]:
                        similarId = similar["acordaoId"]
                        if similarId not in removed_decisions:
                            for quotedId in doc["citacoesObs"]:
                                if quotedId == similarId:
                                    continue
                                quotes = self.__addElemSetToDict(
                                    quotes, similarId, quotedId
                                )
                                quotedBy = self.__addElemSetToDict(
                                    quotedBy, quotedId, similarId
                                )

                                similars = self.__addElemSetToDict(
                                    similars, similarId, docId
                                )
                                similars = self.__addElemSetToDict(
                                    similars, docId, similarId
                                )

                            if similarId not in acordaos:
                                acordaos[similarId] = Acordao(
                                    similarId, doc["tribunal"], similar["relator"], True
                                )

                acordaos[docId] = Acordao(docId, doc["tribunal"], doc["relator"], False)
                self.__printProgress()

            print("")

        return [acordaos, quotes, quotedBy, similars]

    def insertNodes(self, acordaos, quotes, quotedBy, similars, pageRanks):
        nDocs = len(acordaos)
        self.onePercent = nDocs / 100
        self.count = self.progress = 0
        insertStep = nDocs
        if nDocs > 10000:
            insertStep = 10000

        print("n acordaos %s to be inserted" % nDocs)

        i = 0
        docs2Insert = []
        for docId, doc in acordaos.items():
            docQuotedBy = list(quotedBy.get(docId, set()))
            docQuotes = list(quotes.get(docId, set()))
            docSimilars = list(similars.get(docId, set()))
            docPageRank = float(pageRanks.get(docId, 0.0))
            docs2Insert.append(
                {
                    "acordaoId": docId,
                    "citacoes": docQuotes,
                    "citadoPor": docQuotedBy,
                    "similares": docSimilars,
                    "indegree": len(docQuotedBy),
                    "outdegree": len(docQuotes),
                    "relator": doc.getRelator(),
                    "tribunal": doc.getTribunal(),
                    "pageRank": docPageRank,
                    "virtual": doc.getVirtual(),
                }
            )
            i += 1
            self.__printProgress()
            if i >= insertStep:
                self.collectionOut.insert_one(docs2Insert)
                docs2Insert = []
                i = 0

        print("")
        if i > 0:
            self.collectionOut.insert_one(docs2Insert)

    def __printProgress(self):
        self.count += 1
        if self.count >= self.onePercent:
            self.count = 0
            self.progress += 1
            sys.stdout.write("\r%d%%" % self.progress)
            sys.stdout.flush()


In [131]:
def get_decisions_ids(collections, query):
    MONGO_URI = os.getenv("MONGO_URI")
    MONGO_DATABASE = os.getenv("MONGO_DATABASE")

    client = MongoClient(MONGO_URI)
    db = client[MONGO_DATABASE]

    decisions_ids = []
    colls = []
    if collections == "acordaos":
        colls.append(db["acordaos"])
    elif collections == "decisoes_monocraticas":
        colls.append(db["decisoes_monocraticas"])
    elif collections == "decisoes":
        colls.append(db["acordaos"])
        colls.append(db["decisoes_monocraticas"])

    for coll in colls:
        docs = coll.find(query, no_cursor_timeout=True)
        for doc in docs:
            decisions_ids.append(doc["acordaoId"])

    return decisions_ids, colls


def get_removed_decisions(decisions_ids, percentage):
    removed_decisons_len = ceil(len(decisions_ids) * (percentage / 100.0))
    decisions_ids_len = len(decisions_ids)
    removed_decisions = []
    i = 0
    while i < removed_decisons_len:
        x = randint(0, decisions_ids_len - 1)
        if decisions_ids[x] not in removed_decisions:
            removed_decisions.append(decisions_ids[x])
            i += 1

    return removed_decisions

Bibliography

https://math.stackexchange.com/questions/936757/why-is-pagerank-an-eigenvector-problem  
http://pi.math.cornell.edu/~mec/Winter2009/RalucaRemus/Lecture3/lecture3.html  
http://pi.math.cornell.edu/~mec/Winter2009/RalucaRemus/Lecture1/lecture1.html

## Analysis of PageRank models 1 and 2 to understand how they rank decisions

### Exemple of graph, G1, with cycles

In [261]:
v = np.array([1/2]*4)
print(v, "\n")

# PR1
A1 = np.array([
               [0,   0,   1, 1/2],
               [1/3, 0,   0, 0],
               [1/3, 1/2, 0, 1/2],
               [1/3, 1/2, 0, 0]
            ])
print(A1, "\n")

# PR2
A2 = np.array([
                [0, 0, 1, 1],
                [1, 0, 0, 0],
                [1, 1, 0, 1],
                [1, 1, 0, 0]
            ])
print(A2)

alpha = 0.85
N = len(A1)
p = np.repeat(1.0 / N, N)
# alpha * A1 + (1 - alpha) * np.outer(np.ones(N), p)

[0.5 0.5 0.5 0.5] 

[[0.         0.         1.         0.5       ]
 [0.33333333 0.         0.         0.        ]
 [0.33333333 0.5        0.         0.5       ]
 [0.33333333 0.5        0.         0.        ]] 

[[0 0 1 1]
 [1 0 0 0]
 [1 1 0 1]
 [1 1 0 0]]


In [224]:
print((alpha * A1 + (1 - alpha) * np.outer(np.ones(N), p)).dot(alpha * A1 + (1 - alpha) * np.outer(np.ones(N), p)).dot(alpha * A1 + (1 - alpha) * np.outer(np.ones(N), p)).dot(alpha * A1 + (1 - alpha) * np.outer(np.ones(N), p)))
print("\nsum:", np.sum((alpha * A1 + (1 - alpha) * np.outer(np.ones(N), p)).dot(alpha * A1 + (1 - alpha) * np.outer(np.ones(N), p)).dot(alpha * A1 + (1 - alpha) * np.outer(np.ones(N), p)).dot(alpha * A1 + (1 - alpha) * np.outer(np.ones(N), p)), axis=1), "\n")
print((alpha * A2 + (1 - alpha) * np.outer(np.ones(N), p)).dot(alpha * A2 + (1 - alpha) * np.outer(np.ones(N), p)).dot(alpha * A2 + (1 - alpha) * np.outer(np.ones(N), p)).dot(alpha * A2 + (1 - alpha) * np.outer(np.ones(N), p)))
print("\nsum:", np.sum((alpha * A2 + (1 - alpha) * np.outer(np.ones(N), p)).dot(alpha * A2 + (1 - alpha) * np.outer(np.ones(N), p)).dot(alpha * A2 + (1 - alpha) * np.outer(np.ones(N), p)).dot(alpha * A2 + (1 - alpha) * np.outer(np.ones(N), p)), axis=1))

[[0.33935404 0.3611043  0.38285456 0.40460482]
 [0.14760673 0.11860638 0.1621069  0.11860638]
 [0.29572144 0.31022161 0.26672109 0.28847135]
 [0.2173178  0.21006771 0.18831745 0.18831745]]

sum: [1.48791771 0.54692639 1.1611355  0.8040204 ] 

[[3.72779492 2.90433086 2.20414336 3.49949023]
 [2.25904805 1.03279023 1.44976836 1.60491992]
 [4.95405273 3.55845898 2.33220117 3.6546418 ]
 [3.55845898 2.25904805 1.60491992 2.33220117]]

sum: [12.33575937  6.34652656 14.49935469  9.75462812]


In [263]:
v1 = v.copy()
for i in range(8):
    v1 = (alpha * A1 + (1 - alpha) * np.outer(np.ones(N), p)).dot(v1)
    v1_sum = sum(v1)
    v1 = v1/v1_sum
    print(v1)
    
v1.argsort()[::-1] + 1

[0.35625    0.10833333 0.32083333 0.21458333]
[0.40140625 0.1384375  0.27567708 0.18447917]
[0.35022917 0.15123177 0.28847135 0.21006771]
[0.37197943 0.1367316  0.29028388 0.2010051 ]
[0.36966846 0.14289417 0.28643227 0.2010051 ]
[0.36639459 0.1422394  0.28839659 0.20296942]
[0.3688991  0.1413118  0.28802555 0.20176355]
[0.36807122 0.14202141 0.28782844 0.20207893]


array([1, 3, 4, 2])

In [264]:
v2 = v.copy()
for i in range(8):
    v2 = (alpha * A2 + (1 - alpha) * np.outer(np.ones(N), p)).dot(v2.T)
    v2_sum = sum(v2)
    v2 = v2/v2_sum
    print(v2)
    
v2.argsort()[::-1] + 1
# PR2 tende a privilegiar mais nós que recebe muitas citações e que cita poucos
# PR1 tende a privilegiar mais nós que recebem citações de nós muito citados e que citam poucos

[0.25       0.13513514 0.36486486 0.25      ]
[0.31964527 0.14266435 0.32947754 0.20821284]
[0.2685303  0.16789277 0.32983821 0.23373872]
[0.28730394 0.14781272 0.3376948  0.22718854]
[0.28644315 0.1558839  0.33226545 0.2254075 ]
[0.28244483 0.15514564 0.33410122 0.2283083 ]
[0.28544951 0.1536899  0.33415459 0.226706  ]
[0.28432562 0.15488885 0.33366602 0.22711951]


array([3, 1, 4, 2])

### Removing node 3->1 from G1 to understand what happens when a node only has incoming edges 

In [265]:
# PR1
A1 = np.array([
               [0,   0,   0, 1/2],
               [1/3, 0,   0, 0],
               [1/3, 1/2, 0, 1/2],
               [1/3, 1/2, 0, 0]
            ])
print(A1, "\n")

# PR2
A2 = np.array([
                [0, 0, 0, 1],
                [1, 0, 0, 0],
                [1, 1, 0, 1],
                [1, 1, 0, 0]
            ])
print(A2)

[[0.         0.         0.         0.5       ]
 [0.33333333 0.         0.         0.        ]
 [0.33333333 0.5        0.         0.5       ]
 [0.33333333 0.5        0.         0.        ]] 

[[0 0 0 1]
 [1 0 0 0]
 [1 1 0 1]
 [1 1 0 0]]


In [266]:
v1 = v.copy()
for i in range(8):
    v1 = (alpha * A1 + (1 - alpha) * np.outer(np.ones(N), p)).dot(v1)
    v1_sum = sum(v1)
    v1 = v1/v1_sum
    print(v1)
    
v1.argsort()[::-1] + 1

[0.18253968 0.13756614 0.40740741 0.27248677]
[0.23452044 0.13648321 0.40307568 0.22592068]
[0.20310191 0.15812249 0.39241669 0.24635891]
[0.21337449 0.14261556 0.4005579  0.24345205]
[0.21374012 0.14852506 0.39730795 0.24042687]
[0.21090729 0.14806197 0.39765808 0.24337266]
[0.21289332 0.14691608 0.3982183  0.24197231]
[0.21214689 0.14787247 0.39771971 0.24226093]


array([3, 4, 1, 2])

In [267]:
v2 = v.copy()
for i in range(8):
    v2 = (alpha * A2 + (1 - alpha) * np.outer(np.ones(N), p)).dot(v2)
    v2_sum = sum(v2)
    v2 = v2/v2_sum
    print(v2)
    
v2.argsort()[::-1] + 1

[0.15267176 0.15267176 0.41221374 0.28244275]
[0.21702477 0.13078186 0.41994927 0.23224411]
[0.17788512 0.16808892 0.40175699 0.25226897]
[0.19111339 0.14314985 0.4142013  0.25153546]
[0.19208464 0.15282867 0.40925414 0.24583255]
[0.18705898 0.15238396 0.40957695 0.25098011]
[0.191079   0.14968937 0.41087199 0.24835964]
[0.18920703 0.1521515  0.40965413 0.24898733]


array([3, 4, 1, 2])

### Removing node 1->2 from G1 to understand what happens when a node only has outcoming edges 

In [268]:
# PR1
A1 = np.array([
               [0,   0,   1, 1/2],
               [0,   0,   0, 0],
               [1/2, 1/2, 0, 1/2],
               [1/2, 1/2, 0, 0]
            ])
print(A1, "\n")

# PR2
A2 = np.array([
                [0, 0, 1, 1],
                [0, 0, 0, 0],
                [1, 1, 0, 1],
                [1, 1, 0, 0]
            ])
print(A2)

[[0.  0.  1.  0.5]
 [0.  0.  0.  0. ]
 [0.5 0.5 0.  0.5]
 [0.5 0.5 0.  0. ]] 

[[0 0 1 1]
 [0 0 0 0]
 [1 1 0 1]
 [1 1 0 0]]


In [269]:
v1 = v.copy()
for i in range(8):
    v1 = (alpha * A1 + (1 - alpha) * np.outer(np.ones(N), p)).dot(v1.T)
    v1_sum = sum(v1)
    v1 = v1/v1_sum
    print(v1)
    
v1.argsort()[::-1] + 1

[0.35625 0.0375  0.35625 0.25   ]
[0.4465625  0.0375     0.31109375 0.20484375]
[0.38898828 0.0375     0.33028516 0.24322656]
[0.42161367 0.0375     0.32212881 0.21875752]
[0.40428143 0.0375     0.32559526 0.23262331]
[0.41312087 0.0375     0.32412202 0.22525711]
[0.40873798 0.0375     0.32474814 0.22901387]
[0.41086682 0.0375     0.32448204 0.22715114]


array([1, 3, 4, 2])

In [270]:
v2 = v.copy()
for i in range(8):
    v2 = (alpha * A2 + (1 - alpha) * np.outer(np.ones(N), p)).dot(v2.T)
    v2_sum = sum(v2)
    v2 = v2/v2_sum
    print(v2)
    
v2.argsort()[::-1] + 1

[0.28244275 0.02290076 0.41221374 0.28244275]
[0.41874523 0.02500636 0.35817002 0.19807839]
[0.33017977 0.02426312 0.37724656 0.26831056]
[0.38331789 0.02452037 0.37064372 0.22151801]
[0.35234794 0.02443072 0.3729449  0.25027644]
[0.37001912 0.02446189 0.37214482 0.23337417]
[0.36004398 0.02445104 0.37242323 0.24308175]
[0.36563487 0.02445482 0.37232638 0.23758394]


array([3, 1, 4, 2])

### Exemple of graph, G2, without cycles

In [309]:
# PR1
A1 = np.array([
               [0,   0,   0,   0, 0, 0],
               [1/3, 0,   0,   0, 0, 0],
               [1/3, 1/2, 0,   0, 0, 0],
               [1/3, 1/2, 1/2, 0, 0, 0],
               [0,   0,   1/2, 0, 0, 0],
               [0,   0,   0,   0, 1, 0],
            ])
print(A1, "\n")

# PR2
A2 = np.array([
               [0,   0,   0, 0, 0, 0],
               [1,   0,   0, 0, 0, 0],
               [1,   1,   0, 0, 0, 0],
               [1,   1,   1, 0, 0, 0],
               [0,   0,   1, 0, 0, 0],
               [0,   0,   0, 0, 1, 0],
            ])
print(A2)

alpha = 0.85
N = len(A1)
p = np.repeat(1.0 / N, N)
# alpha * A1 + (1 - alpha) * np.outer(np.ones(N), p)

[[0.         0.         0.         0.         0.         0.        ]
 [0.33333333 0.         0.         0.         0.         0.        ]
 [0.33333333 0.5        0.         0.         0.         0.        ]
 [0.33333333 0.5        0.5        0.         0.         0.        ]
 [0.         0.         0.5        0.         0.         0.        ]
 [0.         0.         0.         0.         1.         0.        ]] 

[[0 0 0 0 0 0]
 [1 0 0 0 0 0]
 [1 1 0 0 0 0]
 [1 1 1 0 0 0]
 [0 0 1 0 0 0]
 [0 0 0 0 1 0]]


In [310]:
v1 = np.array([1/len(A1)] * len(A1))
for i in range(8):
    v1 = (alpha * A1 + (1 - alpha) * np.outer(np.ones(N), p)).dot(v1.T)
    v1_sum = sum(v1)
    v1 = v1/v1_sum
    print(v1)
    
v1.argsort()[::-1] + 1

[0.03488372 0.10077519 0.1996124  0.29844961 0.13372093 0.23255814]
[0.04556694 0.06358177 0.14164606 0.2962734  0.20019428 0.25273755]
[0.04687435 0.07108145 0.12174748 0.23462013 0.159747   0.36592958]
[0.05106911 0.0781992  0.13991032 0.24560842 0.1567672  0.32844575]
[0.04882298 0.0770809  0.1419855  0.25810976 0.16494724 0.30905361]
[0.04827083 0.07498039 0.13823329 0.25474718 0.16478472 0.3189836 ]
[0.04879678 0.07549201 0.13769164 0.25236243 0.16346757 0.32218957]
[0.04886336 0.07588627 0.13859576 0.25297306 0.16324066 0.32044089]


array([6, 4, 5, 3, 2, 1])

In [311]:
v2 = np.array([1/len(A1)] * len(A1))
for i in range(8):
    v2 = (alpha * A2 + (1 - alpha) * np.outer(np.ones(N), p)).dot(v2.T)
    v2_sum = sum(v2)
    v2 = v2/v2_sum
    print(v2)
    
v2.argsort()[::-1] + 1

[0.01948052 0.12987013 0.24025974 0.35064935 0.12987013 0.12987013]
[0.02661597 0.04424473 0.16176979 0.37919115 0.24403733 0.14414103]
[0.03223614 0.061408   0.1099015  0.28720583 0.20954047 0.29970806]
[0.035636   0.07469409 0.14909751 0.28225681 0.16879531 0.28952029]
[0.03268856 0.07229481 0.15531065 0.32101926 0.19839718 0.22028954]
[0.03168878 0.06690805 0.14479982 0.3121344  0.19902336 0.2454456 ]
[0.03289995 0.06834696 0.14319019 0.30516301 0.19487277 0.25552713]
[0.03293157 0.06976877 0.14629506 0.30662131 0.19325783 0.25112545]


array([4, 6, 5, 3, 2, 1])

### Exemple of graph, G2', with a cycle between node 3 and 4

In [319]:
# PR1
A1 = np.array([
               [0,   0,   0,   0, 0, 0],
               [1/3, 0,   0,   0, 0, 0],
               [1/3, 1/1, 0,   1, 0, 0],
               [1/3, 1/2, 1/2, 0, 0, 0],
               [0,   0,   1/2, 0, 0, 0],
               [0,   0,   0,   0, 1, 0],
            ])
print(A1, "\n")

# PR2
A2 = np.array([
               [0,   0,   0, 0, 0, 0],
               [1,   0,   0, 0, 0, 0],
               [1,   1,   0, 1, 0, 0],
               [1,   1,   1, 0, 0, 0],
               [0,   0,   1, 0, 0, 0],
               [0,   0,   0, 0, 1, 0],
            ])
print(A2)

alpha = 0.85
N = len(A1)
p = np.repeat(1.0 / N, N)
# alpha * A1 + (1 - alpha) * np.outer(np.ones(N), p)

[[0.         0.         0.         0.         0.         0.        ]
 [0.33333333 0.         0.         0.         0.         0.        ]
 [0.33333333 1.         0.         1.         0.         0.        ]
 [0.33333333 0.5        0.5        0.         0.         0.        ]
 [0.         0.         0.5        0.         0.         0.        ]
 [0.         0.         0.         0.         1.         0.        ]] 

[[0 0 0 0 0 0]
 [1 0 0 0 0 0]
 [1 1 0 1 0 0]
 [1 1 1 0 0 0]
 [0 0 1 0 0 0]
 [0 0 0 0 1 0]]


In [313]:
v1 = np.array([1/len(A1)] * len(A1))
for i in range(8):
    v1 = (alpha * A1 + (1 - alpha) * np.outer(np.ones(N), p)).dot(v1.T)
    v1_sum = sum(v1)
    v1 = v1/v1_sum
    print(v1)
    
v1.argsort()[::-1] + 1

[0.02690583 0.07772795 0.38266069 0.23019432 0.10313901 0.1793722 ]
[0.02839077 0.03704804 0.33428111 0.2592514  0.21307927 0.12794941]
[0.02756375 0.03643271 0.3141149  0.21043151 0.18420243 0.2272547 ]
[0.03040189 0.03989911 0.29507379 0.22107335 0.19274654 0.22080533]
[0.0301469  0.04053416 0.30802954 0.21220688 0.18137144 0.22771108]
[0.03035187 0.04072203 0.30154154 0.22057453 0.18928948 0.21752056]
[0.03003312 0.04036415 0.30718086 0.21511127 0.18398908 0.22332153]
[0.03021763 0.04050297 0.3029783  0.21903677 0.18801638 0.21924794]


array([3, 6, 4, 5, 2, 1])

In [327]:
v2 = np.array([1/len(A1)] * len(A1))
for i in range(20):
    v2 = (alpha * A2 + (1 - alpha) * np.outer(np.ones(N), p)).dot(v2.T)
    v2_sum = sum(v2)
    v2 = v2/v2_sum
    print(v2)
    
v2.argsort()[::-1] + 1

[0.01754386 0.11695906 0.31578947 0.31578947 0.11695906 0.11695906]
[0.01925676 0.03074324 0.31407658 0.31407658 0.22601351 0.09583333]
[0.02009041 0.03324419 0.26878159 0.26878159 0.2346279  0.17447432]
[0.02188045 0.03682641 0.26151406 0.26151406 0.22183657 0.19642843]
[0.02224552 0.03879474 0.26444383 0.26444383 0.22004103 0.19003106]
[0.022045   0.03871869 0.26600514 0.26600514 0.22025363 0.1869724 ]
[0.02197676 0.03844901 0.26614167 0.26614167 0.22073844 0.18655245]
[0.02197429 0.0383937  0.26596132 0.26596132 0.22081565 0.18689372]
[0.02198385 0.03840856 0.265899   0.265899   0.22077688 0.18703271]
[0.0219866  0.03842051 0.26590392 0.26590392 0.22075792 0.18702713]
[0.02198614 0.03842176 0.26591304 0.26591304 0.22075698 0.18700904]
[0.02198569 0.03842062 0.26591497 0.26591497 0.22075925 0.1870045 ]
[0.02198562 0.03842016 0.26591434 0.26591434 0.22075996 0.18700557]
[0.02198565 0.03842017 0.26591393 0.26591393 0.22075988 0.18700643]
[0.02198567 0.03842023 0.2659139  0.2659139  0.2

array([4, 3, 5, 6, 2, 1])

### Exemple of graph, G2'', with a cycle between node 5 and 6

In [315]:
# PR1
A1 = np.array([
               [0,   0,   0,   0, 0, 0],
               [1/3, 0,   0,   0, 0, 0],
               [1/3, 1/2, 0,   0, 0, 0],
               [1/3, 1/2, 1/2, 0, 0, 0],
               [0,   0,   1/2, 0, 0, 1],
               [0,   0,   0,   0, 1, 0],
            ])
print(A1, "\n")

# PR2
A2 = np.array([
               [0,   0,   0, 0, 0, 0],
               [1,   0,   0, 0, 0, 0],
               [1,   1,   0, 0, 0, 0],
               [1,   1,   1, 0, 0, 0],
               [0,   0,   1, 0, 0, 1],
               [0,   0,   0, 0, 1, 0],
            ])
print(A2)

alpha = 0.85
N = len(A1)
p = np.repeat(1.0 / N, N)
# alpha * A1 + (1 - alpha) * np.outer(np.ones(N), p)

[[0.         0.         0.         0.         0.         0.        ]
 [0.33333333 0.         0.         0.         0.         0.        ]
 [0.33333333 0.5        0.         0.         0.         0.        ]
 [0.33333333 0.5        0.5        0.         0.         0.        ]
 [0.         0.         0.5        0.         0.         1.        ]
 [0.         0.         0.         0.         1.         0.        ]] 

[[0 0 0 0 0 0]
 [1 0 0 0 0 0]
 [1 1 0 0 0 0]
 [1 1 1 0 0 0]
 [0 0 1 0 0 1]
 [0 0 0 0 1 0]]


In [316]:
v1 = np.array([1/len(A1)] * len(A1))
for i in range(8):
    v1 = (alpha * A1 + (1 - alpha) * np.outer(np.ones(N), p)).dot(v1.T)
    v1_sum = sum(v1)
    v1 = v1/v1_sum
    print(v1)
    
v1.argsort()[::-1] + 1

[0.02912621 0.08414239 0.16666667 0.24919094 0.27669903 0.19417476]
[0.03171833 0.04218846 0.08755902 0.17742763 0.33098953 0.33011702]
[0.02943994 0.04002285 0.06113728 0.10495874 0.40369465 0.36074655]
[0.02744885 0.03660723 0.0552831  0.08381162 0.39264797 0.40420123]
[0.02691761 0.03529132 0.05204277 0.07734028 0.42213954 0.38626849]
[0.02675912 0.03492243 0.05097663 0.07465115 0.40186467 0.41082599]
[0.02669382 0.03478925 0.05063687 0.0737698  0.4226882  0.39142206]
[0.02667248 0.0347417  0.05051627 0.07347665 0.40459957 0.40999333]


array([6, 5, 4, 3, 2, 1])

In [317]:
v2 = np.array([1/len(A1)] * len(A1))
for i in range(8):
    v2 = (alpha * A2 + (1 - alpha) * np.outer(np.ones(N), p)).dot(v2.T)
    v2_sum = sum(v2)
    v2 = v2/v2_sum
    print(v2)
    
v2.argsort()[::-1] + 1

[0.01754386 0.11695906 0.21637427 0.31578947 0.21637427 0.11695906]
[0.02392947 0.03820319 0.13336132 0.30940386 0.29513014 0.19997201]
[0.02707044 0.049095   0.08425702 0.2070021  0.33386873 0.29870671]
[0.02542156 0.04881944 0.09125387 0.16408    0.35642978 0.31399535]
[0.02444244 0.04556886 0.08613991 0.16197581 0.36122201 0.32065096]
[0.02461053 0.04506295 0.08319306 0.15527134 0.36499612 0.326866  ]
[0.02453687 0.0450683  0.08266222 0.15206633 0.36663011 0.32903618]
[0.02448515 0.04491196 0.08243109 0.15124697 0.36722198 0.32970285]


array([5, 6, 4, 3, 2, 1])

PR1: Quando surge um ciclo  
PR2: Quando surge um ciclo o nó com o arco da 'volta' do ciclo adquire mais ranking que alguns outros e "pula" posições trocando de lugar com o nó que estava no seu lugar

Resumo

PR1: 6 > 4 > 5 > 3 > 2 > 1  
PR2: 4 > 6 > 5 > 3 > 2 > 1
    
Ciclo entre 3 e 4  
PR1: 3 > 6 > 4 > 5 > 2 > 1  
PR2: 4 > 3 > 5 > 6 > 2 > 1

Ciclo entre 5 e 6  
PR1: 6 > 5 > 4 > 3 > 2 > 1  
PR2: 5 > 6 > 4 > 3 > 2 > 1


### Generating decisions graph

In [351]:
query = {}
collections_name = "acordaos"
collection_out_iter_name = "complete_graph_20200426"
compute_similars = "S"

decisions_ids, collections = get_decisions_ids(
    collections_name, query
)

MONGO_URI = os.getenv("MONGO_URI")
MONGO_DATABASE = os.getenv("MONGO_DATABASE")
graph = GraphMaker(
    MONGO_URI,
    MONGO_DATABASE,
    collections,
    collection_out_iter_name,
)
removed_decisions = []
# Construct graph
[acordaos, quotes, quotedBy, similars] = graph.buildDicts(query, removed_decisions, compute_similars)

[quotes, quotedBy] = graph.removeInvalidAcordaosFromDicts(
    acordaos, quotes, quotedBy
)

building map
99%


### Building adjacency matrix for PR models 1 and 2

In [352]:
pr1_dict = {}
pr2_dict = {}
for k, v in quotes.items():
    if len(v) == 0:
        continue
    pr1_val = 1/len(v)
    pr2_val = 1
    for dec in v:
        if dec not in pr1_dict:
            pr1_dict[dec] = pr1_val
            pr2_dict[dec] = pr2_val
        else:
            pr1_dict[dec] += pr1_val
            pr2_dict[dec] += pr2_val

pr1_rank_lst = [[k, v] for k, v in pr1_dict.items()]
pr1_rank_lst = sorted(pr1_rank_lst, key=lambda x: x[1], reverse=True)
pr2_rank_lst = [[k, v] for k, v in pr2_dict.items()]
pr2_rank_lst = sorted(pr2_rank_lst, key=lambda x: x[1], reverse=True)

In [353]:
df_pr1_adj_matrix_sum = pd.DataFrame(pr1_rank_lst, columns=["acordaoId", "adj_sum"])
df_pr2_adj_matrix_sum = pd.DataFrame(pr2_rank_lst, columns=["acordaoId", "adj_sum"])

In [354]:
len(df_pr1_adj_matrix_sum)

71122

In [169]:
def get_db_object():
    """
    """
    MONGO_URI = os.getenv("MONGO_URI")
    MONGO_DATABASE = "DJs_v7"

    client = MongoClient(MONGO_URI)
    db = client[MONGO_DATABASE]
    return db


def get_decisions_collection(db, collection_name):
    """
    """
    page_ranks_iters = []
    selected_keys = {"acordaoId": 1, "pageRank": 1, "relator": 1}
    df_collection = pd.DataFrame()
#     for i in range(1, 11):
    for i in range(1, 2):
        coll_name_iter_i = collection_name + "_%d" % i
        page_ranks_cursor = (
            db[coll_name_iter_i]
            .find({}, selected_keys)
            .sort([("pageRank", -1)])
            .limit(60000)
        )
        df = pd.DataFrame(page_ranks_cursor)
        df["collection_iter"] = i
        df.drop("_id", axis=1, inplace=True)
        df_collection = df_collection.append(df)

    return df_collection


def get_decisions_dataframes_tabulated(db, coll_name, perturbances, pr_models):
    """
    """
    coll_names = []
    for perturbance in perturbances:
        pr_models_coll_names = []
        for pr_model in pr_models:
            pr_models_coll_names.append(coll_name.format(pr_model, perturbance))
        coll_names.append(pr_models_coll_names)

    df_lst = []
    for i, perturbance in enumerate(perturbances):
        pr_models_dfs = []
        for j, pr_model in enumerate(pr_models):
            pr_models_dfs.append(get_decisions_collection(db, coll_names[i][j]))
        df_lst.append(pr_models_dfs)

    return df_lst

In [355]:
db = get_db_object()

df_all_decs_pr_lst = get_decisions_dataframes_tabulated(
    db,
    "stf_pr_{}_acordaos_{}_no_loop", [10], [1, 2]
)

In [356]:
df_pr1_rank = df_all_decs_pr_lst[0][0][df_all_decs_pr_lst[0][0]["collection_iter"] == 1]
df_pr2_rank = df_all_decs_pr_lst[0][1][df_all_decs_pr_lst[0][1]["collection_iter"] == 1]
df_pr1_rank

Unnamed: 0,acordaoId,relator,pageRank,collection_iter
0,QO AI 664567,SEPÚLVEDA PERTENCE,0.006443,1
1,AGR RE 569476,ELLEN GRACIE,0.004232,1
2,RE 416827,GILMAR MENDES,0.003819,1
3,RE 415454,GILMAR MENDES,0.003811,1
4,RG ARE 748371,,0.002604,1
...,...,...,...,...
59995,AGR RE 271752,,0.000002,1
59996,AGR AI 477132,EROS GRAU,0.000002,1
59997,AGR AI 686353,CELSO DE MELLO,0.000002,1
59998,HC 84590,CEZAR PELUSO,0.000002,1


In [357]:
df_pr1_rank = df_pr1_rank.reset_index().rename(columns={"index": "pos rank"})
df_pr1_adj_matrix_sum = df_pr1_adj_matrix_sum.reset_index().rename(columns={"index": "pos adj"})
df_pr2_rank = df_pr2_rank.reset_index().rename(columns={"index": "pos rank"})
df_pr2_adj_matrix_sum = df_pr2_adj_matrix_sum.reset_index().rename(columns={"index": "pos adj"})

### See 100 top ranked by adj matrix in page rank simulation ranking for PR1

In [358]:
df_pr1_rank[df_pr1_rank["acordaoId"].isin(df_pr1_adj_matrix_sum.loc[:100, "acordaoId"])].head(100)

Unnamed: 0,pos rank,acordaoId,relator,pageRank,collection_iter
0,0,QO AI 664567,SEPÚLVEDA PERTENCE,0.006443,1
1,1,AGR RE 569476,ELLEN GRACIE,0.004232,1
2,2,RE 416827,GILMAR MENDES,0.003819,1
3,3,RE 415454,GILMAR MENDES,0.003811,1
4,4,RG ARE 748371,,0.002604,1
5,5,QO AI 760358,GILMAR MENDES (PRESIDENTE),0.002581,1
6,6,RG QO AI 791292,,0.002135,1
7,7,RE 140370,,0.002118,1
8,8,ED AGR AI 252559,,0.001927,1
9,9,RE 453740,GILMAR MENDES,0.001851,1


### See 100 top ranked by page rank simulation in adj matrix in ranking for PR1

In [359]:
df_pr1_adj_matrix_sum[df_pr1_adj_matrix_sum["acordaoId"].isin(df_pr1_rank.loc[:100, "acordaoId"])].head(100)

Unnamed: 0,pos adj,acordaoId,adj_sum
0,0,RE 416827,2526.077357
1,1,RE 415454,2514.85621
2,2,AGR RE 569476,2222.932418
3,3,RG ARE 748371,1471.701459
4,4,RE 453740,1215.163888
5,5,QO AI 760358,1177.845733
6,6,ED AGR AI 252559,1147.021981
7,7,RG QO AI 791292,1095.271598
8,8,AGR AI 541696,865.381412
9,9,QO AI 664567,796.744781


In [360]:
# quotes["AGR RE 405097"], quotedBy["AGR RE 405097"]

KeyError: 'AGR RE 405097'

In [361]:
df_pr1 = pd.merge(df_pr1_rank, df_pr1_adj_matrix_sum, how='left', on="acordaoId")
df_pr1.head(100)

Unnamed: 0,pos rank,acordaoId,relator,pageRank,collection_iter,pos adj,adj_sum
0,0,QO AI 664567,SEPÚLVEDA PERTENCE,0.006443,1,9,796.744781
1,1,AGR RE 569476,ELLEN GRACIE,0.004232,1,2,2222.932418
2,2,RE 416827,GILMAR MENDES,0.003819,1,0,2526.077357
3,3,RE 415454,GILMAR MENDES,0.003811,1,1,2514.85621
4,4,RG ARE 748371,,0.002604,1,3,1471.701459
5,5,QO AI 760358,GILMAR MENDES (PRESIDENTE),0.002581,1,5,1177.845733
6,6,RG QO AI 791292,,0.002135,1,7,1095.271598
7,7,RE 140370,,0.002118,1,21,538.488336
8,8,ED AGR AI 252559,,0.001927,1,6,1147.021981
9,9,RE 453740,GILMAR MENDES,0.001851,1,4,1215.163888


In [381]:
df_pr1_rank.head(100)

Unnamed: 0,pos rank,acordaoId,relator,pageRank,collection_iter
0,0,QO AI 664567,SEPÚLVEDA PERTENCE,0.006443,1
1,1,AGR RE 569476,ELLEN GRACIE,0.004232,1
2,2,RE 416827,GILMAR MENDES,0.003819,1
3,3,RE 415454,GILMAR MENDES,0.003811,1
4,4,RG ARE 748371,,0.002604,1
5,5,QO AI 760358,GILMAR MENDES (PRESIDENTE),0.002581,1
6,6,RG QO AI 791292,,0.002135,1
7,7,RE 140370,,0.002118,1
8,8,ED AGR AI 252559,,0.001927,1
9,9,RE 453740,GILMAR MENDES,0.001851,1


In [380]:
df_pr2_rank.head(100)

Unnamed: 0,pos rank,acordaoId,relator,pageRank,collection_iter
0,0,AGR RE 369696,EROS GRAU,0.061454,1
1,1,AGR RE 148837,,0.056726,1
2,2,AGR RE 223891,CARLOS VELLOSO,0.056726,1
3,3,AGR AI 561181,EROS GRAU,0.056726,1
4,4,AGR RE 481073,,0.05236,1
5,5,AGR RE 481230,,0.05236,1
6,6,AGR RE 481234,,0.05236,1
7,7,AGR RE 481242,,0.05236,1
8,8,AGR RE 481256,,0.05236,1
9,9,AGR RE 481297,,0.05236,1


In [389]:
df_pr1_rank[df_pr1_rank["acordaoId"].isin(df_pr2_rank["acordaoId"].head(100))].head(100)

Unnamed: 0,pos rank,acordaoId,relator,pageRank,collection_iter
0,0,QO AI 664567,SEPÚLVEDA PERTENCE,0.006443,1
1,1,AGR RE 569476,ELLEN GRACIE,0.004232,1
2,2,RE 416827,GILMAR MENDES,0.003819,1
3,3,RE 415454,GILMAR MENDES,0.003811,1
4,4,RG ARE 748371,,0.002604,1
5,5,QO AI 760358,GILMAR MENDES (PRESIDENTE),0.002581,1
6,6,RG QO AI 791292,,0.002135,1
7,7,RE 140370,,0.002118,1
8,8,ED AGR AI 252559,,0.001927,1
9,9,RE 453740,GILMAR MENDES,0.001851,1


### See 100 top ranked by adj matrix in page rank simulation ranking for PR2

In [363]:
df_pr2_rank[df_pr2_rank["acordaoId"].isin(df_pr2_adj_matrix_sum.loc[:100, "acordaoId"])].head(100)

Unnamed: 0,pos rank,acordaoId,relator,pageRank,collection_iter
0,0,AGR RE 369696,EROS GRAU,0.061454,1
1,1,AGR RE 148837,,0.056726,1
2,2,AGR RE 223891,CARLOS VELLOSO,0.056726,1
3,3,AGR AI 561181,EROS GRAU,0.056726,1
38,38,RE 415454,GILMAR MENDES,2.3e-05,1
39,39,RE 416827,,2.3e-05,1
40,40,RG QO AI 791292,,1.8e-05,1
41,41,QO AI 760358,GILMAR MENDES (PRESIDENTE),1.8e-05,1
42,42,RG ARE 748371,,1.6e-05,1
43,43,QO AI 664567,SEPÚLVEDA PERTENCE,1.3e-05,1


### See 100 top ranked by page rank simulation in adj matrix in ranking for PR2

In [364]:
df_pr2_adj_matrix_sum[df_pr2_adj_matrix_sum["acordaoId"].isin(df_pr2_rank.loc[:100, "acordaoId"])].head(100)

Unnamed: 0,pos adj,acordaoId,adj_sum
0,0,RE 416827,5189
1,1,RE 415454,5155
2,2,RG QO AI 791292,3585
3,3,QO AI 760358,3405
4,4,RG ARE 748371,3277
5,5,AGR RE 569476,2591
6,6,AGR AI 330970,2418
7,7,AGR AI 481531,2300
8,8,RTJ 191/694,2294
9,9,RTJ 134/1296,2261


In [365]:
quotes["AGR AI 530087"], quotedBy["AGR AI 530087"]
# Há múltiplas referências circulares aqui

# Consultar acórdaos extraídos para decisão AI 529924 AgR
# Acórdãos citados: AI 488966 AgR, AI 506475 AgR.
# - Os AI 529924 AgR, AI 530087 AgR, AI 530186 AgR,....

KeyError: 'AGR AI 530087'

In [366]:
df_pr2 = pd.merge(df_pr2_rank, df_pr2_adj_matrix_sum, how='left', on="acordaoId")
df_pr2.head(100)

Unnamed: 0,pos rank,acordaoId,relator,pageRank,collection_iter,pos adj,adj_sum
0,0,AGR RE 369696,EROS GRAU,0.061454,1,65,747
1,1,AGR RE 148837,,0.056726,1,73,686
2,2,AGR RE 223891,CARLOS VELLOSO,0.056726,1,75,685
3,3,AGR AI 561181,EROS GRAU,0.056726,1,89,621
4,4,AGR RE 481073,,0.05236,1,4009,30
5,5,AGR RE 481230,,0.05236,1,4013,30
6,6,AGR RE 481234,,0.05236,1,4008,30
7,7,AGR RE 481242,,0.05236,1,4016,30
8,8,AGR RE 481256,,0.05236,1,4010,30
9,9,AGR RE 481297,,0.05236,1,4011,30


In [287]:
def get_decisions_collection_kleinberg(db, collection_name, sortby="authority"):
    """
    """
    page_ranks_iters = []
    selected_keys = {"acordaoId": 1, "authority": 1, "hub": 1, "relator": 1}
    df_collection = pd.DataFrame()
#     for i in range(1, 11):
    for i in range(1, 2):
        coll_name_iter_i = collection_name + "_%d" % i
        page_ranks_cursor = (
            db[coll_name_iter_i]
            .find({}, selected_keys)
            .sort([(sortby, -1)])
            .limit(60000)
        )
        df = pd.DataFrame(page_ranks_cursor)
        df["collection_iter"] = i
        df.drop("_id", axis=1, inplace=True)
        df_collection = df_collection.append(df)

    return df_collection


def get_decisions_dataframes_tabulated_kleinberg(db, coll_name, perturbances, sortby="authority"):
    """
    """
    coll_names = []
    for perturbance in perturbances:
        coll_names.append(coll_name.format(perturbance))

    df_lst = []
    for i, perturbance in enumerate(perturbances):
        df_lst.append(get_decisions_collection_kleinberg(db, coll_names[i], sortby))

    return df_lst


In [367]:
df_all_decs_kl_authority_lst = get_decisions_dataframes_tabulated_kleinberg(
    db, "stf_kleinberg_acordaos_{}", [10],
)
df_all_decs_kl_hub_lst = get_decisions_dataframes_tabulated_kleinberg(
    db, "stf_kleinberg_acordaos_{}", [10], "hub"
)
df_all_decs_kl_authority = df_all_decs_kl_authority_lst[0]
df_all_decs_kl_hub = df_all_decs_kl_hub_lst[0]

In [368]:
df_all_decs_kl_authority = df_all_decs_kl_authority.reset_index().rename(columns={"index": "pos authority"})
df_all_decs_kl_hub = df_all_decs_kl_hub.reset_index().rename(columns={"index": "pos hub"})

In [369]:
df_all_decs_kl_authority.head(60)

Unnamed: 0,pos authority,acordaoId,relator,authority,hub,collection_iter
0,0,RE 416827,GILMAR MENDES,0.459487,1.792882e-06,1
1,1,RE 415454,GILMAR MENDES,0.457955,1.792882e-06,1
2,2,RE 495042,SEPÚLVEDA PERTENCE,0.010582,0.0001929961,1
3,3,AGR RE 459727,RICARDO LEWANDOWSKI,0.006124,0.0001929961,1
4,4,AGR RE 537300,EROS GRAU,0.006124,0.0001962486,1
5,5,RTJ 143/57,,0.005471,0.0,1
6,6,RE 420532,CÁRMEN LÚCIA,0.005433,4.760781e-10,1
7,7,RTJ 119/895,,0.005431,0.0,1
8,8,RTJ 189/747,,0.00543,0.0,1
9,9,RE 485606,CARLOS BRITTO,0.00488,0.0001929961,1


In [370]:
df_all_decs_kl_hub.head(60)

Unnamed: 0,pos hub,acordaoId,relator,authority,hub,collection_iter
0,0,ED RE 567360,CELSO DE MELLO,0.0007623592,0.000201,1
1,1,ED RE 567361,CELSO DE MELLO,0.0,0.000201,1
2,2,AGR AI 625446,CELSO DE MELLO,8.372007e-08,0.0002,1
3,3,AGR AI 602251,CELSO DE MELLO,0.0,0.0002,1
4,4,AGR AI 613443,CELSO DE MELLO,0.0,0.0002,1
5,5,AGR AI 625691,CELSO DE MELLO,0.0,0.0002,1
6,6,ED AI 627067,CELSO DE MELLO,0.0,0.0002,1
7,7,AGR AI 709612,CELSO DE MELLO,0.0,0.0002,1
8,8,AGR RE 461904,CELSO DE MELLO,0.0,0.0002,1
9,9,AGR RE 458238,CELSO DE MELLO,0.0,0.0002,1


### See 100 top ranked by adj matrix in Kleinberg simulation ranking ranked by authority

In [374]:
df_all_decs_kl_authority[df_all_decs_kl_authority["acordaoId"].isin(df_pr2_adj_matrix_sum.loc[:100, "acordaoId"])].head(100)

Unnamed: 0,pos authority,acordaoId,relator,authority,hub,collection_iter
0,0,RE 416827,GILMAR MENDES,0.4594868,1.792882e-06,1
1,1,RE 415454,GILMAR MENDES,0.4579545,1.792882e-06,1
20,20,ED AI 243832,,0.0003258218,0.0,1
22,22,ED AI 243159,,0.0003240555,0.0,1
24,24,RTJ 153/834,,0.0003176423,0.0,1
25,25,RTJ 145/664,,0.0003158872,0.0,1
27,27,ED RCL 4395,CEZAR PELUSO,0.0002779325,0.0,1
46,46,ED AGR AI 177313,,0.0002324159,0.0,1
53,53,MC ADI 2010,,0.0001956086,0.0,1
80,80,AGR ED PET 1245,,0.0001145323,0.0,1


### See 100 top ranked Kleinberg simulation ranking ranked by authority in adj matrix ranking

In [372]:
df_pr2_adj_matrix_sum[df_pr2_adj_matrix_sum["acordaoId"].isin(df_all_decs_kl_authority.loc[:100, "acordaoId"])].head(100)

Unnamed: 0,pos adj,acordaoId,adj_sum
0,0,RE 416827,5189
1,1,RE 415454,5155
22,22,ED AGR AI 177313,1574
23,23,AGR ED PET 1245,1504
37,37,ED AI 243832,1069
38,38,ED AI 243159,1037
39,39,RTJ 153/834,979
41,41,RTJ 145/664,947
87,87,ED RCL 4395,627
97,97,ED RE 195578,584


### See 100 top ranked by adj matrix in Kleinberg simulation ranking ranked by hub

In [295]:
df_all_decs_kl_hub[df_all_decs_kl_hub["acordaoId"].isin(df_pr2_adj_matrix_sum.loc[:100, "acordaoId"])].head(100)

Unnamed: 0,pos hub,acordaoId,relator,authority,hub,collection_iter
5201,5201,RE 415454,GILMAR MENDES,0.4579545,1.792882e-06,1
5202,5202,RE 416827,GILMAR MENDES,0.4594868,1.792882e-06,1
5998,5998,ED ARE 719790,CELSO DE MELLO,2.833232e-08,2.770021e-07,1
6252,6252,ED AI 436371,CELSO DE MELLO,3.481367e-08,2.03543e-07,1
8021,8021,RE 453740,GILMAR MENDES,1.078747e-07,6.858702e-08,1
10487,10487,ED RCL 11022,CÁRMEN LÚCIA,1.871504e-07,4.620073e-08,1
15081,15081,QO AI 664567,SEPÚLVEDA PERTENCE,6.613859e-07,1.931812e-08,1
16704,16704,AGR AI 360265,CELSO DE MELLO,3.490221e-08,1.382117e-08,1
16845,16845,AGR ARE 808798,CELSO DE MELLO,2.687043e-09,1.174738e-08,1
20406,20406,RE 377457,GILMAR MENDES,6.944565e-06,5.824834e-09,1


### See 100 top ranked Kleinberg simulation ranking ranked by hub in adj matrix ranking

In [373]:
df_pr2_adj_matrix_sum[df_pr2_adj_matrix_sum["acordaoId"].isin(df_all_decs_kl_hub.loc[:100, "acordaoId"])].head(100)

Unnamed: 0,pos adj,acordaoId,adj_sum
1735,1735,AGR RE 537300,68
8486,8486,ED RE 567360,14
15713,15713,AGR RE 496627,7
21862,21862,AGR AI 625446,5
22285,22285,RE 661256,5
24468,24468,AGR RE 454569,4
30650,30650,RE 381367,3
64706,64706,RE 827833,1


### Conclusões
- Resultado de Kleinberg não tem muita relação com a matriz de adjacências
- pr1 separa mais os valores do ranking rankings, mas o pr2 converge mais rápido no cálculo
- authority values no topo são altos (quase 0.5), mas decaem rápido
- hub values são baixos 10^(-4), mas são quase uniformes no topo do ranking e decaem suavemente
- no pr1 e pr2 os ciclos estancam parte do valor no nó onde o ciclo começa e "escoa" menos probabilidade para nós que teriam um ranking mais alto  
- nós citados por nós muitos citados que citam pouco tendem a ter um ranking maior 


### Problemas
- seria interessante alguém validar o ranking do Kleinberg
- depois dos PRs serem executados novamente também seria interessante alguém validar os rankings dos PRs 
- a simulação do PR precisa considerar as decisões que são citadas, mas que não citam e não foram extraídas (foi feita uma correção pra considerar isso)
- identificar ciclos e verificar se eles estão corretos
- não são removidas exatamente as mesmas decisões. Como vamos poder comparar os rankings de forma justa?
- Análises feitas usando similares pode ser criticada por juristas pela falta de precisão na construção das arestas/arcos


### Lembretes
- lembrar de mostrar como um ciclo (construído erroneamente) no P2 pode levar nós com poucas citações para o topo quando está em um ciclo envolvendo um nó muito citado

### Busca de ciclos imediatos

In [None]:
citations = set()
decisions = set()
loops = set()
for k, decs in quotes.items():
    decisions = decisions.union([k])
    for v in decs:
        citations = citations.union(["{} -> {}".format(k, v)])
        if "{} -> {}".format(v, k) in citations:
            loops = loops.union([(k, v)])

In [345]:
len(citations)

533952

In [346]:
# len(loops)
len([c[0] for c in loops if c[0] == c[1]])

458

In [335]:
# sorted(ciclos, key=lambda x: x[0])

# auto citação:
[c[0] for c in loops if c[0] == c[1]]

# MS 25181: possui loop: http://redir.stf.jus.br/paginadorpub/paginador.jsp?docTP=AC&docID=86278, pág 180
# ADI 1359: cita-se a si mesma no cabeçalho. O processo foi distribuído a um magistrado e quando julgado
# em colegiado foi relatado por outro magistrado
# closed path of size two

['AGR RE 377147',
 'AGR RE 356595',
 'AGR RE 437211',
 'AGR RE 265335',
 'AGR RE 294560',
 'AGR RE 386691',
 'MS 24268',
 'AGR RE 438072',
 'AGR RE 335601',
 'AGR AI 530236',
 'ADI 2860',
 'AGR RE 386650',
 'AGR RE 405511',
 'AGR RE 405082',
 'AGR AI 546296',
 'AGR AI 539468',
 'AGR AI 534769',
 'AGR RE 483474',
 'AGR RE 275635',
 'AGR AI 456186',
 'AGR AI 441467',
 'AGR AI 441524',
 'AGR RE 420132',
 'AGR RE 489382',
 'ADI 1704',
 'AGR RE 293311',
 'AGR RE 369429',
 'AGR RE 438748',
 'ADI 508',
 'ADI 1976',
 'AGR RE 285704',
 'AGR AI 543653',
 'AGR AI 264552',
 'AGR RE 438069',
 'AGR AI 535149',
 'AGR AI 501025',
 'AGR AI 538470',
 'AGR AI 501039',
 'HC 83544',
 'ED RE 356378',
 'AGR RE 274892',
 'AGR AI 438772',
 'AGR RE 375142',
 'AGR RE 367542',
 'AGR RE 275548',
 'AGR RE 284265',
 'AGR RE 483582',
 'RE 181966',
 'AGR RE 388285',
 'AGR RE 265647',
 'AGR RE 320633',
 'AGR AI 538459',
 'AGR AI 538839',
 'AGR AI 538766',
 'AGR RE 377593',
 'AGR RE 436996',
 'RE 170484',
 'HC 90516',
 

In [None]:
# regex de exclusão: "Os ... foram objetos ... rejeitados"


# - Os RE 274892 AgR, RE 275635 AgR, RE 280366 AgR, RE 280902 AgR, RE 281886 AgR, RE 284137 AgR, RE 284259 AgR, RE 284366 AgR, RE 284848 AgR, RE 285834 AgR, RE 286849 AgR, RE 286953 AgR, RE 287755 AgR, RE 288324 AgR, RE 288404 AgR foram objeto dos
# Embargos de Declaração rejeitados

In [133]:
# PENSAR DEPOIS EM FAZER A ANÁLISE DO KLEINBERG REMOVENDO DECISÕES QUE SÃO CITADAS, MAS QUE NÃO CITAM

# Análise Kleinberg feita não remove decisões "anômalas" e decisões citadas, mas que não citam agora também aparecem no ranking do PR
# page rank executado até agora faz a remoção de decisões consideradas 'anômalas' e por isso elas não possuem um PR