In [24]:
import sys
from pymongo import MongoClient
import networkx as nx


class NetworkXDigraph:
    """Build a citation digraph from MongoDB documents and persist HITS scores.

    Input documents are read from the collections passed in ``collection_in``;
    each one is expected to carry the keys ``acordaoId``, ``tribunal``,
    ``relator``, ``citacoesObs`` and (when similars are computed) ``similares``.
    Results are written to an output collection of the configured database.
    """

    # Maximum number of documents sent to MongoDB in a single ``insert_many``.
    INSERT_BATCH_SIZE = 10000

    def __init__(
        self,
        mongo_uri,
        db_name,
        collection_in,
        collection_out_name,
    ):
        """Connect to MongoDB and reset the output collection.

        Args:
            mongo_uri: connection string handed to ``MongoClient``.
            db_name: name of the database that holds the output collections.
            collection_in: iterable of already-opened input collections.
            collection_out_name: name of the output collection. It is dropped
                immediately, so any previous content is lost.
        """
        client = MongoClient(mongo_uri)
        self.db = client[db_name]
        self.collection_in = collection_in
        self.collection_out = self.db[collection_out_name]
        self.collection_out.drop()
        # ``Collection.count()`` was deprecated in PyMongo 3.7 and removed in
        # 4.0; ``count_documents({})`` is the supported replacement.
        self.one_percent = (
            sum(coll.count_documents({}) for coll in self.collection_in) / 100
        )
        # Progress counters; build_graph/insert_nodes reset them before use.
        self.count = 0
        self.progress = 0

    def save_removed_decisions(self, i, removed_decisions, collection_out_name):
        """Store the ids removed in iteration ``i``.

        The ids are written as a single document to the collection
        ``<collection_out_name>_removed_<i>``, which is dropped first.
        """
        removed_coll = self.db[collection_out_name + "_removed_%d" % i]
        removed_coll.drop()

        removed_coll.insert_one(
            {"iteration": i, "removed_decisions": removed_decisions}
        )

    def set_collections_out(self, collection_out_name):
        """Point the instance at a new output collection and drop its content."""
        self.collection_out = self.db[collection_out_name]
        self.collection_out.drop()

    def build_graph(self, query, removed_decisions, compute_similars):
        """Read the input collections and build the citation digraph.

        Args:
            query: MongoDB filter applied to every input collection.
            removed_decisions: ids that must not appear in the graph, neither
                as citing nor as cited nodes.
            compute_similars: when equal to ``"S"``, every entry of a
                document's ``similares`` list becomes a virtual node that
                cites the same decisions as the document itself.

        Returns:
            ``[G, acordaos]`` where ``G`` is an ``nx.DiGraph`` with an edge
            ``citing -> cited`` and ``acordaos`` maps each node id to its
            ``Acordao`` record.
        """
        acordaos = {}
        G = nx.DiGraph()
        # A set makes the many membership tests below O(1) instead of O(n).
        removed = set(removed_decisions)

        print("building map")

        self.count = self.progress = 0

        for coll in self.collection_in:
            docsFound = coll.find(query, no_cursor_timeout=True)
            try:
                for doc in docsFound:
                    docId = doc["acordaoId"]
                    if docId in removed:
                        continue

                    # Citations that survive the removal filter. The same list
                    # feeds the similar-decision edges below so a removed
                    # decision cannot re-enter the graph through a similar.
                    citations = [
                        cited for cited in doc["citacoesObs"] if cited not in removed
                    ]

                    # Add the node. add_node is idempotent, and the record
                    # replaces any placeholder/virtual entry created earlier.
                    G.add_node(docId)
                    acordaos[docId] = Acordao(
                        docId, doc["tribunal"], doc["relator"], False
                    )

                    # Add the edges; cited decisions not seen yet get a
                    # placeholder record with empty tribunal/relator.
                    for ac_cit in citations:
                        if ac_cit not in acordaos:
                            G.add_node(ac_cit)
                            acordaos[ac_cit] = Acordao(ac_cit, "", "", False)

                        G.add_edge(docId, ac_cit)

                    if compute_similars == "S":
                        for similar in doc["similares"]:
                            similarId = similar["acordaoId"]
                            if similarId in removed:
                                continue
                            # Create the virtual record unless a record with a
                            # non-empty relator already exists for this id.
                            if (similarId not in acordaos) or (
                                acordaos[similarId].getRelator() == ""
                            ):
                                G.add_node(similarId)
                                acordaos[similarId] = Acordao(
                                    similarId, doc["tribunal"], similar["relator"], True
                                )
                            for quotedId in citations:
                                G.add_edge(similarId, quotedId)

                    self.__print_progress()
            finally:
                # no_cursor_timeout cursors are not reaped by the server when
                # iteration is interrupted, so close them explicitly.
                docsFound.close()

            print("")

        return [G, acordaos]

    # Insert nodes and authority and hub node values
    def insert_nodes(self, G, acordaos, authorities, hubs):
        """Write one document per decision to the output collection.

        Args:
            G: digraph whose edges point from the citing to the cited decision.
            acordaos: mapping of decision id to its ``Acordao`` record.
            authorities: mapping of decision id to authority score.
            hubs: mapping of decision id to hub score.

        Documents are sent in batches of at most ``INSERT_BATCH_SIZE``.
        """
        n_docs = len(acordaos)
        self.one_percent = n_docs / 100
        self.count = self.progress = 0
        insert_step = min(n_docs, self.INSERT_BATCH_SIZE)

        print("n acordaos %s to be inserted" % n_docs)

        docs_to_insert = []
        for docId, doc in acordaos.items():
            docs_to_insert.append(
                {
                    "acordaoId": docId,
                    # "citacoes": doc.getCitacoes(),
                    # "cited by" = the sources of the incoming edges.
                    "citadoPor": list(G.predecessors(docId)),
                    # "similares": doc.getSimilares(),
                    "relator": doc.getRelator(),
                    "tribunal": doc.getTribunal(),
                    "authority": authorities[docId],
                    "hub": hubs[docId],
                    "virtual": doc.getVirtual(),
                }
            )
            self.__print_progress()
            if len(docs_to_insert) >= insert_step:
                self.collection_out.insert_many(docs_to_insert)
                docs_to_insert = []

        print("")
        if docs_to_insert:
            # The leftover batch is a list, so it needs insert_many:
            # insert_one accepts a single document only.
            self.collection_out.insert_many(docs_to_insert)

    def __print_progress(self):
        """Advance the progress counter and print a percentage on each new step."""
        self.count += 1
        if self.count >= self.one_percent:
            self.count = 0
            self.progress += 1
            sys.stdout.write("\r%d%%" % self.progress)
            sys.stdout.flush()


In [3]:
class Acordao:
    """Plain record describing one decision (a node of the citation graph).

    Attributes:
        idAcordao: identifier of the decision.
        tribunal: value of the source document's ``tribunal`` field.
        relator: value of the source document's ``relator`` field.
        virtual: flag supplied by the caller; the graph builders in this
            notebook pass ``True`` for nodes created from a ``similares`` entry.
        similares, citacoes: lists owned by this instance.
    """

    def __init__(self, idAcordao, tribunal, relator, virtual, similares=None, citacoes=None):
        self.idAcordao = idAcordao
        self.tribunal = tribunal
        self.relator = relator
        # ``None`` sentinels instead of ``[]`` defaults: a mutable default is
        # created once and would be shared by every instance built without
        # explicit lists. Lists passed by the caller are stored as given.
        self.citacoes = [] if citacoes is None else citacoes
        self.similares = [] if similares is None else similares
        self.virtual = virtual

    def getId(self):
        """Return the decision identifier."""
        return self.idAcordao

    def getTribunal(self):
        """Return the ``tribunal`` value."""
        return self.tribunal

    def getRelator(self):
        """Return the ``relator`` value."""
        return self.relator

#     def getCitacoes( self):
#         return self.citacoes

#     def getSimilares( self):
#         return self.similares

    def getVirtual(self):
        """Return the ``virtual`` flag."""
        return self.virtual


In [4]:
import os
from math import ceil
from random import randint
import datetime
from multiprocessing import Pool
from pymongo import MongoClient
import traceback


In [5]:
def get_decisions_ids(collections, query):
    """Collect the ``acordaoId`` of every document matching ``query``.

    The connection parameters are read from the ``MONGO_URI`` and
    ``MONGO_DATABASE`` environment variables.

    Args:
        collections: selector of the input collections: ``"acordaos"``,
            ``"decisoes_monocraticas"`` or ``"decisoes"`` (both of them).
            Any other value selects no collection at all.
        query: MongoDB filter applied to each selected collection.

    Returns:
        ``(decisions_ids, colls)``: the list of ids found and the list of
        collection objects that were searched.
    """
    MONGO_URI = os.getenv("MONGO_URI")
    MONGO_DATABASE = os.getenv("MONGO_DATABASE")

    client = MongoClient(MONGO_URI)
    db = client[MONGO_DATABASE]

    names_by_selector = {
        "acordaos": ("acordaos",),
        "decisoes_monocraticas": ("decisoes_monocraticas",),
        "decisoes": ("acordaos", "decisoes_monocraticas"),
    }
    colls = [db[name] for name in names_by_selector.get(collections, ())]

    decisions_ids = []
    for coll in colls:
        # Only the id is needed, so project it instead of fetching whole documents.
        docs = coll.find(query, {"acordaoId": 1}, no_cursor_timeout=True)
        try:
            decisions_ids.extend(doc["acordaoId"] for doc in docs)
        finally:
            # no_cursor_timeout cursors must be closed explicitly, otherwise an
            # interrupted iteration leaves them open on the server.
            docs.close()

    return decisions_ids, colls


def get_removed_decisions(decisions_ids, percentage):
    """Pick a random subset of distinct decision ids to remove.

    Args:
        decisions_ids: list of candidate ids (may contain duplicates).
        percentage: share of ``len(decisions_ids)`` to remove, 0-100. The
            resulting count is rounded up.

    Returns:
        A list of distinct ids. Its length is
        ``ceil(len(decisions_ids) * percentage / 100)``, capped to the number
        of distinct ids available and never negative.
    """
    # Local import: keeps this function independent of the notebook's import cell.
    from random import sample

    wanted = ceil(len(decisions_ids) * (percentage / 100.0))
    # De-duplicate first. The previous rejection-sampling loop never ended
    # when fewer distinct ids existed than the number requested (duplicates
    # in the input or percentage > 100), and its list membership test made
    # it quadratic.
    candidates = list(dict.fromkeys(decisions_ids))
    how_many = max(0, min(wanted, len(candidates)))

    return sample(candidates, how_many)


def get_top_10_relatores():
    """Return the ten most frequent ``relator`` values of the ``acordaos`` collection.

    Connection parameters come from the ``MONGO_URI`` and ``MONGO_DATABASE``
    environment variables. The values are ordered by descending document count.
    """
    database = MongoClient(os.getenv("MONGO_URI"))[os.getenv("MONGO_DATABASE")]

    # Group by relator, count the documents of each group, keep the ten largest.
    pipeline = [
        {"$group": {"_id": "$relator", "count": {"$sum": 1}}},
        {"$sort": {"count": -1}},
        {"$limit": 10},
    ]

    relatores = []
    for grouped in database["acordaos"].aggregate(pipeline):
        relatores.append(grouped["_id"])
    return relatores

In [6]:
def run_hits_execution(args):
    """Run one HITS (Kleinberg) experiment and store its result in MongoDB.

    Args:
        args: tuple ``(i, query, collections_name, percentage,
            compute_similars, collection_out_iter_name)``. It is a single
            tuple so the function can be handed to ``Pool.map``.

    Iteration ``i == 1`` removes nothing; every other iteration removes a
    random ``percentage`` of the matching decisions before building the graph.
    Errors raised while computing or storing the scores are printed, not
    propagated.
    """
    (
        i,
        query,
        collections_name,
        percentage,
        compute_similars,
        collection_out_iter_name,
    ) = args
    print("execution: ", i, collection_out_iter_name)

    decisions_ids, collections = get_decisions_ids(collections_name, query)

    graph = NetworkXDigraph(
        os.getenv("MONGO_URI"),
        os.getenv("MONGO_DATABASE"),
        collections,
        collection_out_iter_name,
    )

    removed_decisions = (
        [] if i == 1 else get_removed_decisions(decisions_ids, percentage)
    )
    graph.save_removed_decisions(i, removed_decisions, collection_out_iter_name)

    # Build the citation graph.
    G, acordaos = graph.build_graph(query, removed_decisions, compute_similars)

    # Author's note (translated): if the call below were executed, decisions
    # outside the time interval would be removed.
    # [quotes, quotedBy] = graph.removeInvalidAcordaosFromDicts(
    #     acordaos, quotes, quotedBy
    # )

    print("Início da execução do kleinberg:", len(acordaos))

    # Name of the per-iteration output collection.
    run_name = collection_out_iter_name + "_{}".format(i)

    # Kleinberg hubs and authorities; nx.hits returns them in that order.
    try:
        hubs, authorities = nx.hits(G, max_iter=1000)
        graph.set_collections_out(run_name)

        print("Execução:", run_name, "está pronta para ser inserida no banco")
        # Persist the scores.
        graph.insert_nodes(G, acordaos, authorities, hubs)
        print(run_name, "finalizada")
    except Exception:
        traceback.print_exc()

In [7]:
def run_acordaos_kleinberg_experiments():
    """Queue every Kleinberg (HITS) experiment and run them in a process pool.

    Three families of experiments are queued, ten iterations each:

    * all decisions, removing 10, 20 and 30 percent, with similars;
    * all decisions, removing 10 percent, without similars;
    * one experiment per relator returned by ``get_top_10_relatores``,
      removing 10 percent, with similars.

    ``compute_similars`` is ``"S"`` to include similar decisions, ``"N"`` otherwise.
    """
    query = {}
    collections_name = "acordaos"
    iterations = range(1, 11)

    def runs_for(run_query, percentage, compute_similars, collection_out_iter_name):
        # One argument tuple per iteration, in the order run_hits_execution unpacks.
        return [
            (i, run_query, collections_name, percentage, compute_similars, collection_out_iter_name)
            for i in iterations
        ]

    kleinberg_iters = []

    # Kleinberg experiment over all decisions.
    for percentage in (10, 20, 30):
        kleinberg_iters.extend(
            runs_for(query, percentage, "S", "stf_kleinberg_acordaos_{}".format(percentage))
        )

    # Same data, without similar decisions.
    percentage = 10
    kleinberg_iters.extend(
        runs_for(
            query,
            percentage,
            "N",
            "stf_kleinberg_acordaos_{}_no_similars".format(percentage),
        )
    )

    # One experiment per relator.
    for j, rel in enumerate(get_top_10_relatores()):
        new_query = query.copy()
        new_query["relator"] = rel
        kleinberg_iters.extend(
            runs_for(
                new_query,
                percentage,
                "S",
                "stf_kleinberg_acordaos_{}_rel_{}".format(percentage, (j + 1)),
            )
        )

    processes = 3
    # The context manager terminates the workers if map() raises; previously
    # a failure left the pool open.
    with Pool(processes=processes) as pool:
        pool.map(run_hits_execution, kleinberg_iters)
        pool.close()
        pool.join()

In [None]:
if __name__ == '__main__':
    # Local import so this cell does not depend on the notebook's import cell.
    import shlex

    # Taken before the try block so the error handler can always compute the
    # elapsed time.
    tini = datetime.datetime.now()
    try:
        run_acordaos_kleinberg_experiments()

        # Notify by e-mail that the run finished.
        os.system(
            'echo "Kleinberg acabou com sucesso =)!" | mail -s "Kleinberg acabou!" -r "Jackson<jackson@ime.usp.br>" jackson@ime.usp.br'
        )

    except Exception as e:
        # The exception text goes through a shell: quote it so quotes, pipes
        # or `$(...)` in the message cannot break or alter the command.
        os.system(
            'echo %s | mail -s "Kleinberg falhou!" -r "Jackson<jackson@ime.usp.br>" jackson@ime.usp.br'
            % shlex.quote(str(e))
        )
        with open("kleinberg_error.log", "a") as f:
            # total_seconds() covers the whole interval; ``.seconds`` holds
            # only the sub-day component and wraps after 24 hours.
            f.write(
                "%d: %s\n\n" % ((datetime.datetime.now() - tini).total_seconds(), e)
            )


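### Debugging: build the same graph with `GraphMaker` and `NetworkXDigraph` for a single relator and compare the node counts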

In [20]:
from pymongo import MongoClient
import sys


class GraphMaker:
    """Dictionary-based builder of the citation graph.

    It reads the same kind of documents as the networkx-based builder but
    keeps the graph as plain dicts of sets: ``quotes`` (who a decision
    cites), ``quotedBy`` (who cites a decision) and ``similars``.
    """

    # Maximum number of documents sent to MongoDB in a single ``insert_many``.
    INSERT_BATCH_SIZE = 10000

    def __init__(
        self,
        mongo_uri,
        dbName,
        collections_in,
        collectionOutName,
    ):
        """Connect to MongoDB and reset the output collection.

        Args:
            mongo_uri: connection string handed to ``MongoClient``.
            dbName: name of the database that holds the output collections.
            collections_in: iterable of already-opened input collections.
            collectionOutName: name of the output collection. It is dropped
                immediately.
        """
        client = MongoClient(mongo_uri)
        self.db = client[dbName]
        self.collectionsIn = collections_in
        self.collectionOut = self.db[collectionOutName]
        self.collectionOut.drop()
        # ``Collection.count()`` was deprecated in PyMongo 3.7 and removed in
        # 4.0; ``count_documents({})`` is the supported replacement.
        self.onePercent = (
            sum(coll.count_documents({}) for coll in self.collectionsIn) / 100
        )
        self.count = 0
        self.progress = 0

    def set_collections_out(self, collection_out_name):
        """Point the instance at a new output collection and drop its content."""
        self.collectionOut = self.db[collection_out_name]
        self.collectionOut.drop()

    def save_removed_decisions(self, i, removed_decisions, collection_out_name):
        """Store the ids removed in iteration ``i``.

        The ids are written as a single document to the collection
        ``<collection_out_name>_removed_<i>``, which is dropped first.
        """
        removed_coll = self.db[collection_out_name + "_removed_%d" % i]
        removed_coll.drop()

        removed_coll.insert_one(
            {"iteration": i, "removed_decisions": removed_decisions}
        )

    def __addElemSetToDict(self, aDict, elemKey, elemValue):
        """Add ``elemValue`` to the set stored under ``elemKey`` and return ``aDict``."""
        aDict.setdefault(elemKey, set()).add(elemValue)
        return aDict

    def removeInvalidAcordaosFromDicts(self, validAcordaos, quotes, quotedBy):
        """
            Drop from 'quotedBy' the decisions that are absent from
            'validAcordaos' (per the original note: not in the database nor
            among the similars pointed to by database decisions). 'quotes' is
            left with only the cited decisions present in 'validAcordaos'.

            Both dicts are modified in place and returned as [quotes, quotedBy].
        """
        for docId, quotesId in quotes.items():
            newQuotesId = set()
            for q in quotesId:
                if q in validAcordaos:
                    newQuotesId.add(q)
                else:
                    quotedBy.pop(q, 0)

            # Re-assigning an existing key keeps the dict size unchanged, so
            # it is safe while iterating over items().
            quotes[docId] = newQuotesId

        return [quotes, quotedBy]

    def buildDicts(self, query, removed_decisions, compute_similars):
        """Read the input collections and build the graph dictionaries.

        Args:
            query: MongoDB filter applied to every input collection.
            removed_decisions: ids that must be left out, both as citing and
                as cited decisions.
            compute_similars: when equal to ``"S"``, every entry of a
                document's ``similares`` list becomes a virtual decision that
                cites the same decisions as the document itself.

        Returns:
            ``[acordaos, quotes, quotedBy, similars]``.
        """
        acordaos = {}
        quotes = {}
        quotedBy = {}
        similars = {}
        # A set makes the many membership tests below O(1) instead of O(n).
        removed = set(removed_decisions)

        print("building map")

        self.count = self.progress = 0

        for coll in self.collectionsIn:
            docsFound = coll.find(query, no_cursor_timeout=True)
            try:
                for doc in docsFound:
                    docId = doc["acordaoId"]
                    if docId in removed:
                        continue

                    for quotedId in doc["citacoesObs"]:
                        if quotedId not in removed:
                            self.__addElemSetToDict(quotes, docId, quotedId)
                            self.__addElemSetToDict(quotedBy, quotedId, docId)

                    # Similars are virtual decisions (nodes) that point to the
                    # citations of 'docId'.
                    if compute_similars == "S":
                        for similar in doc["similares"]:
                            similarId = similar["acordaoId"]
                            if similarId in removed:
                                continue

                            for quotedId in doc["citacoesObs"]:
                                # A removed decision must not come back as a
                                # citation inherited by a similar.
                                if quotedId not in removed:
                                    self.__addElemSetToDict(quotes, similarId, quotedId)
                                    self.__addElemSetToDict(quotedBy, quotedId, similarId)

                                # NOTE(review): the similarity link is only
                                # recorded when the document has at least one
                                # citation, as in the original code - confirm
                                # that this is intended.
                                self.__addElemSetToDict(similars, similarId, docId)
                                self.__addElemSetToDict(similars, docId, similarId)

                            if similarId not in acordaos:
                                acordaos[similarId] = Acordao(
                                    similarId, doc["tribunal"], similar["relator"], True
                                )

                    acordaos[docId] = Acordao(
                        docId, doc["tribunal"], doc["relator"], False
                    )
                    self.__printProgress()
            finally:
                # no_cursor_timeout cursors are not reaped by the server when
                # iteration is interrupted, so close them explicitly.
                docsFound.close()

            print("")

        return [acordaos, quotes, quotedBy, similars]

    def insertNodes(self, acordaos, quotes, quotedBy, similars, pageRanks):
        """Write one document per decision to the output collection.

        Args:
            acordaos: mapping of decision id to its ``Acordao`` record.
            quotes, quotedBy, similars: mappings of decision id to a set of
                ids; a missing id is stored as an empty list.
            pageRanks: mapping of decision id to score; a missing id is
                stored as ``0.0``.

        Documents are sent in batches of at most ``INSERT_BATCH_SIZE``.
        """
        nDocs = len(acordaos)
        self.onePercent = nDocs / 100
        self.count = self.progress = 0
        insertStep = min(nDocs, self.INSERT_BATCH_SIZE)

        print("n acordaos %s to be inserted" % nDocs)

        docs2Insert = []
        for docId, doc in acordaos.items():
            docs2Insert.append(
                {
                    "acordaoId": docId,
                    "citacoes": list(quotes.get(docId, set())),
                    "citadoPor": list(quotedBy.get(docId, set())),
                    "similares": list(similars.get(docId, set())),
                    "relator": doc.getRelator(),
                    "tribunal": doc.getTribunal(),
                    "pageRank": float(pageRanks.get(docId, 0.0)),
                    "virtual": doc.getVirtual(),
                }
            )
            self.__printProgress()
            if len(docs2Insert) >= insertStep:
                # A batch is a list of documents: insert_one accepts a single
                # document only, so insert_many is required here.
                self.collectionOut.insert_many(docs2Insert)
                docs2Insert = []

        print("")
        if docs2Insert:
            self.collectionOut.insert_many(docs2Insert)

    def __printProgress(self):
        """Advance the progress counter and print a percentage on each new step."""
        self.count += 1
        if self.count >= self.onePercent:
            self.count = 0
            self.progress += 1
            sys.stdout.write("\r%d%%" % self.progress)
            sys.stdout.flush()


In [12]:
# Parameters of the single debug run: first iteration (nothing is removed),
# restricted to the decisions of one relator.
i = 1
percentage = 10
page_rank_mode = 1
compute_similars = "S"
collections_name = "acordaos"
query = {"relator": "MARCO AURÉLIO"}
collection_out_iter_name = "".join(
    ["stf_kleinberg_acordaos_10_rel", "_1", "_%d" % i]
)

In [21]:

# Build the dictionary-based graph for the debug parameters and show how many
# decisions it contains.
decisions_ids, collections = get_decisions_ids(collections_name, query)

MONGO_URI, MONGO_DATABASE = os.getenv("MONGO_URI"), os.getenv("MONGO_DATABASE")
graphMaker = GraphMaker(MONGO_URI, MONGO_DATABASE, collections, collection_out_iter_name)

# Iteration 1 is the run with nothing removed.
removed_decisions = (
    [] if i == 1 else get_removed_decisions(decisions_ids, percentage)
)

graphMaker.save_removed_decisions(i, removed_decisions, collection_out_iter_name)
acordaos, quotes, quotedBy, similars = graphMaker.buildDicts(
    query, removed_decisions, compute_similars
)
len(acordaos)

building map
10%


21904

In [25]:
# Same debug parameters, this time building the networkx graph, to compare
# its number of decisions with the dictionary-based builder.
i = 1
percentage = 10
compute_similars = "S"
collections_name = "acordaos"
query = {"relator": "MARCO AURÉLIO"}
collection_out_iter_name = "".join(
    ["stf_kleinberg_acordaos_10_rel", "_1", "_%d" % i]
)
print("execution: ", i, collection_out_iter_name)

decisions_ids, collections = get_decisions_ids(collections_name, query)

MONGO_URI, MONGO_DATABASE = os.getenv("MONGO_URI"), os.getenv("MONGO_DATABASE")
graph = NetworkXDigraph(MONGO_URI, MONGO_DATABASE, collections, collection_out_iter_name)

# Iteration 1 is the run with nothing removed.
removed_decisions = (
    [] if i == 1 else get_removed_decisions(decisions_ids, percentage)
)

graph.save_removed_decisions(i, removed_decisions, collection_out_iter_name)

# Construct graph
G, acordaos = graph.build_graph(query, removed_decisions, compute_similars)
len(acordaos)

execution:  1 stf_kleinberg_acordaos_10_rel_1_1
building map
10%


27130