# Imports

In [2]:
%load_ext autoreload
%autoreload 2
import sys
from nlppen.extraccion.utils.Txt2Numbers import Txt2Numbers
from nlppen.analisis import Analisis
from nlppen.seleccion import Seleccion
from nlppen.spark_udfs import solo_portanto, solo_considerando, solo_resultando, solo_encabezado, spark_get_spacy
from nlppen.sentencias_estructurales import SentenciasEstructurales
from pyspark.sql import SparkSession
from pyspark.sql.functions import length
from nlppen.spacy_internationals import extractInternational
from nlppen.spacy_derechos import extractDerechos


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Initialize the Spark session and context

In [3]:
# Build (or reuse) the SparkSession used by the rest of the notebook.
# The executor count goes through "spark.executor.instances": the original key
# "spark.num.executors" is not a Spark property, so it was silently ignored.
# NOTE(review): 64g of off-heap memory is requested on top of the JVM heaps —
# confirm the host really has that much memory available.
spark = (
    SparkSession.builder
    .appName("Transforming sentences")
    .config("spark.executor.instances", "1")
    .config("spark.executor.memory", "6g")
    .config("spark.executor.cores", "1")
    .config("spark.driver.memory", "12g")
    .config("spark.memory.offHeap.enabled", "true")
    .config("spark.memory.offHeap.size", "64g")
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .getOrCreate()
)

sc = spark.sparkContext
# Last expression of the cell: shows the address of the Spark web UI.
sc.uiWebUrl

21/10/21 16:37:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


KeyboardInterrupt: 

In [3]:
#df = spark.read.parquet('./datasetRepartido')
#writer = df.write.partitionBy("anno").mode('Overwrite').parquet('./datasetRepartido/split')

# Buscar términos en la sección de por lo tanto de la sentencia y aplicar filtro del dataset

In [None]:
# Regular expressions to search for, grouped under the label of each term.
terminos = {
    'seguimiento': [r'\bseguimiento\b'],
    'se ordena': [
        r'\bse ordena\b',
        r'\bse le ordena\b',
        r'\bse les ordena\b',
    ],
    'plan': [r'\bplan\b'],
    'plazo': [r'\bplazo\b'],
}

seleccion = Seleccion(
    terminos,
    spark,
    parquet_path='./dataset',
    datasets_path='./datasets/estructuralesNormal',
)
#seleccion = Seleccion(terminos, spark, parquet_path='./datasetRepartido', datasets_path='./datasets/estructurales')

# Row count before and after filtering, so the effect of the filter is visible.
print(f"Cantidad elementos originales : {seleccion.sdf.count()}")
seleccion.filtrar_sentencias(preprocess=solo_portanto, keepRowEmpty=True)
print(f"Cantidad elementos despues de filtrados : {seleccion.sdf.count()}")

estructurales = SentenciasEstructurales(seleccion)

# Formar dataset de sentencias estructurales

In [None]:
# Import only the Spark types this cell uses instead of a wildcard import, so
# it is clear where every name comes from.
from pyspark.sql.types import ArrayType, IntegerType, StringType, TimestampType

# Each step declares its output columns (column name -> Spark type) and passes
# them to the matching SentenciasEstructurales method.
# NOTE(review): the positional True flags are interpreted inside
# SentenciasEstructurales — confirm their meaning there.

# Six array-of-strings columns for the "se ordena" step.
columnas = {
    'se ordena PER' : ArrayType(StringType()),
    'se ordena LOC' : ArrayType(StringType()),
    'se ordena ORG' : ArrayType(StringType()),
    'se ordena MISC' : ArrayType(StringType()),
    'se ordena GPE' : ArrayType(StringType()),
    'se ordena Ent Pub' : ArrayType(StringType())
}
estructurales.separarSeOrdena(columnas, True, True)

# Two integer columns for the extension step.
columnas = {
    'extension sentencia' : IntegerType(),
    'extension por lo tanto' : IntegerType()
}
estructurales.extraerExtension(columnas, True)

# One array-of-timestamps column for the deadlines step.
columnas = {
    'plazosDefinidos' : ArrayType(TimestampType())
}
estructurales.plazosDefinidos(columnas, True)

# One timestamp column for the received-date step.
columnas = {
    'FechaSolicitud' : TimestampType(),
}

estructurales.extrarFechaRecibido(columnas, True)

# One string column for the resolution-number step.
columnas = {
    'num resolucion' : StringType()
}

estructurales.extraerNumeroSentencia(columnas, True)

# One array-of-strings column for the international-instruments step.
columnas = {
    'inst internacionales' : ArrayType(StringType())
}

estructurales.extraerInstrumentosInternacionales(columnas, True)

# Two array-of-strings columns for the rights step.
columnas = {
    'derechos Norm' : ArrayType(StringType()),
    'derechos GenXPat' : ArrayType(StringType())
}
estructurales.extraerDerechos(columnas, True)

# Four array-of-strings columns for the non-normalized rights step.
columnas = {
    'derechos Acotados' : ArrayType(StringType()),
    'derechos General' : ArrayType(StringType()),
    'derechos Fundamental' : ArrayType(StringType()),
    'derechos Humano' : ArrayType(StringType())
}

estructurales.extraerDerechosSinNormalizar(columnas, True)
# Overwrite the filtered-sentences dataset with the new columns.
estructurales.seleccion.guardarDatos()

In [None]:
import pandas as pd

# Work on a single-partition copy of the selection's Spark DataFrame.
s = estructurales.seleccion.sdf.repartition(1)

# Compact pandas display limits for the preview below.
for opcion, valor in (
    ('display.max_rows', 10),
    ('display.max_columns', 25),
    ('display.width', 80),
    ('display.max_colwidth', 80),
):
    pd.set_option(opcion, valor)

# Last expression of the cell: first ten rows rendered as a pandas DataFrame.
s.limit(10).toPandas()

In [None]:
from pyspark.sql.functions import explode, desc
from pyspark.sql import functions as F
import pandas as pd

# Lift every pandas display limit so long text columns are shown in full.
# None is the supported "no limit" value for display.max_colwidth; the old
# -1 sentinel is deprecated and newer pandas versions reject it.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)


In [None]:

# One output row per element of the derechos_Norm array, for rows where the
# array is present.
derechos_por_fila = (
    s.where(s.derechos_Norm.isNotNull())
     .select(explode('derechos_Norm').alias('derechos_Norm'), 'anno')
)

# Occurrences of each value, most frequent first.
conteo_derechos = (
    derechos_por_fila
    .groupby('derechos_Norm')
    .count()
    .sort(F.desc('count'))
)

df2 = conteo_derechos.toPandas()

# Write the counts to an Excel workbook in the working directory.
file_name = 'derechos_Norm.xlsx'
df2.to_excel(file_name)

In [None]:
# Inspect one value: row 8, last column, of the first ten rows whose
# inst_internacionales is not null.
con_instrumentos = s.filter(s.inst_internacionales.isNotNull())
muestra_instrumentos = con_instrumentos.limit(10).toPandas()
muestra_instrumentos.iloc[8, -1]

In [None]:
# Preview the first ten rows as a pandas DataFrame.
s.limit(10).toPandas()

In [3]:

from nlppen.spark_udfs import solo_portanto, solo_considerando, solo_resultando, solo_encabezado, spark_get_spacy

In [4]:
# Get the spaCy pipeline for the "es_core_news_lg" model through the project's
# spark_get_spacy helper (presumably cached by the helper — TODO confirm).
nlp = spark_get_spacy("es_core_news_lg")

In [5]:
txt = """
Derecho a tener posesiones
"""
doc = nlp(txt)

# Per-token report: surface form, part-of-speech tag and stop-word flag.
for word in doc:
    print(word.text, word.pos_, word.is_stop)
words = [token.text for token in doc]

print("----")

# Distinct token texts; a set has no guaranteed order.
union = list(set(words))
for word in union:
    print(word)


 SPACE False
derecho NOUN False
fundamental ADJ False
de ADP True
todo DET True
imputado ADJ False
de ADP True
solicitar VERB False
o CCONJ False
pedir VERB False
al ADP True
Juez PROPN False
Penal PROPN False

 SPACE False
----
de
al
Juez
derecho
todo


Penal
imputado
pedir
o
fundamental
solicitar


In [1]:

from nlppen.spacy_derechos import extractDerechos
txt = """
Derecho a tener posesiones

"""
doc = extractDerechos(txt)

# For every entity found, collect its entity id and the literal text matched.
entidadesFiltradas, entidadesSacadas = [], []
for ent in doc.ents:
    entidadesFiltradas.append(ent.ent_id_)
    entidadesSacadas.append(ent.text)

print(entidadesFiltradas, entidadesSacadas, sep="\n")

['Derecho Fundamental a tener Posesiones']
['Derecho a tener posesiones']


In [38]:
import re 
# Three spellings of the same phrase that differ only in whitespace.
derechos = ["derecho   a la salud", "derecho a la salud  ", "derecho a la salud "]
# Blank out non-word characters, trim the ends, then squeeze every run of
# whitespace down to a single space.
derechos = [
    re.sub(r'(\s)+', ' ', re.sub(r'[^\w]', ' ', x).strip())
    for x in derechos
]
# A set keeps one copy of each normalized string.
union = list(set(derechos))
print(union)

['derecho a la salud']


Derecho   a laa   salud


In [36]:
# Rows with no international instruments whose anno equals "1990".
ss = s.filter(s.inst_internacionales.isNull() & (s.anno == "1990"))

In [14]:
from spacy.matcher import Matcher
from spacy.tokens import Span

import re

def limpiarDerechos(derechos):
    """Return the distinct items of ``derechos`` as a list.

    Duplicates are dropped and each item keeps the position of its first
    occurrence, so the result is identical on every run. The previous
    set-based version returned strings in an order that could change between
    interpreter sessions.

    Args:
        derechos: Iterable of hashable items (the notebook passes lists of
            strings).

    Returns:
        list: The unique items in first-seen order.
    """
    # dict keys are unique and keep insertion order.
    return list(dict.fromkeys(derechos))
    
def crearDerechos(txt, verbose=True):
    """Extract "derecho ..." phrases from ``txt`` with spaCy's rule-based Matcher.

    The text is run through the pipeline returned by
    ``spark_get_spacy("es_core_news_lg")`` and matched against three rules:
    "Derecho General", "Derecho Fundamental" and "Derecho Humano". The text of
    every match, whatever its rule, is collected and then de-duplicated with
    ``limpiarDerechos``.

    The three token patterns share the same tail, so they are assembled from
    one helper instead of being written out three times.

    Args:
        txt: Text to analyse. The notebook passes lower-cased text; the
            patterns compare on the LOWER attribute.
        verbose: When True (the default, matching the original behaviour)
            print a separator line per match, plus the label and text of each
            "Derecho Humano" match. Pass False to silence the output.

    Returns:
        list: The distinct matched phrases, as returned by ``limpiarDerechos``.
    """
    def _segmento(op):
        # Optional prepositions/determiners, an optional "y"/"o", then content
        # words (verb/adjective/noun) repeated according to ``op``.
        # A fresh list of fresh dicts is built on every call so that no token
        # spec object is shared between patterns.
        return [
            {"POS": {"IN": ["ADP", "DET"]}, "OP": "*"},
            {"LOWER": {"IN": ["y", "o"]}, "OP": "?"},
            {"POS": {"IN": ["VERB", "ADJ", "NOUN"]}, "OP": op},
        ]

    def _patron(*cabeza):
        # Rule-specific head tokens followed by the shared tail: one segment
        # that needs at least one content word, then two optional segments.
        return list(cabeza) + _segmento("+") + _segmento("*") + _segmento("*")

    general = _patron({"LOWER": "derecho"})
    fundamental = _patron({"LOWER": "derecho"}, {"LOWER": "fundamental"})
    humano = _patron(
        {"LOWER": "derecho"},
        {"LOWER": "humano"},
        {"POS": {"IN": ["PRON", "VERB", "DET"]}, "OP": "*"},
    )

    nlp = spark_get_spacy("es_core_news_lg")
    doc = nlp(txt)

    # greedy="FIRST" is spaCy's optional filter for overlapping matches of a
    # rule (see the Matcher.add documentation).
    matcher = Matcher(nlp.vocab)
    matcher.add("Derecho General", [general], greedy="FIRST")
    matcher.add("Derecho Fundamental", [fundamental], greedy="FIRST")
    matcher.add("Derecho Humano", [humano], greedy="FIRST")

    derechos = []
    for match_id, start, end in matcher(doc):
        # Wrapping the match in a Span gives access to the rule name (label_).
        span = Span(doc, start, end, label=match_id)
        derechos.append(span.text)
        if verbose:
            if span.label_ == "Derecho Humano":
                print(span.label_)
                print(span.text)
            print("*-*-*-*")

    return limpiarDerechos(derechos)

# Sample size; use `v = s.count()` instead to process every row.
v = 150
pand = s.limit(v).toPandas()

derechosExtraidos = []
# Only rows 50..v-1 of the sample are processed.
for i in range(50, v):
    print("************************")
    # Positional column 4 is printed as a header for the row
    # (looks like a case identifier in the output — TODO confirm).
    print(pand.iloc[i, 4])
    # Positional column 1 is reduced by solo_considerando, lower-cased, then matched.
    considerando = solo_considerando(pand.iloc[i, 1]).lower()
    derechosExtraidos.extend(crearDerechos(considerando))

# Replace non-word characters with spaces, then drop duplicates.
derechosExtraidos = limpiarDerechos(
    [re.sub(r'[^\w]', ' ', x) for x in derechosExtraidos]
)
#for derecho in derechosExtraidos:
#    print(derecho)

************************
180034830007CO
*-*-*-*
*-*-*-*
*-*-*-*
*-*-*-*
*-*-*-*
*-*-*-*
*-*-*-*
*-*-*-*
************************
9616A00
*-*-*-*
*-*-*-*
*-*-*-*
************************
180047710007CO
*-*-*-*
************************
180063250007CO
*-*-*-*
*-*-*-*
*-*-*-*
*-*-*-*
*-*-*-*
*-*-*-*
*-*-*-*
*-*-*-*
*-*-*-*
*-*-*-*
*-*-*-*
*-*-*-*
*-*-*-*
*-*-*-*
*-*-*-*
************************
180084130007CO
*-*-*-*
*-*-*-*
*-*-*-*
*-*-*-*
*-*-*-*
*-*-*-*
*-*-*-*
*-*-*-*
*-*-*-*
*-*-*-*
Derecho Humano
derecho humano que tiene el derecho a la salud de todos los ciudadanos
*-*-*-*
************************
180010110007CO
*-*-*-*
*-*-*-*
*-*-*-*
*-*-*-*
*-*-*-*
Derecho Humano
derecho humano para la población adulta mayor y el precepto
*-*-*-*
*-*-*-*
************************
180008910007CO
*-*-*-*
*-*-*-*
************************
180004010007CO
*-*-*-*
*-*-*-*
************************
180053820007CO
*-*-*-*
************************
180040570007CO
*-*-*-*
*-*-*-*
*-*-*-*
**********************

In [41]:
# Scratch check: building a set from a list removes its repeated items.
a = ["Abc", "Abc", "CCC"]
union = list(set(a))
print("\nThe union of three list is:", union)


The union of three list is: ['CCC', 'Abc']


In [15]:
# First of two small lists used for the nesting example.
a = [1, 2, 3]

In [16]:
# Second small list for the nesting example.
b = [4, 5, 6]

# Nest both lists in a new outer list; c holds references to a and b, not copies.
c = [a, b]

In [17]:
# Show the nested list built from a and b.
print(c)

[[1, 2, 3], [4, 5, 6]]
