# AWS Glue Studio Notebook
##### You are now running an AWS Glue Studio notebook. To start using your notebook, you need to start an AWS Glue Interactive Session.


#### Optional: Run this cell to see available notebook commands ("magics").


In [None]:
%help

####  Run this cell to set up and start your interactive session.


In [None]:
%idle_timeout 2880
%glue_version 5.0
%worker_type G.1X
%number_of_workers 5


In [1]:
import sys
import unicodedata
import uuid

from pyspark.context import SparkContext
from pyspark.sql.functions import col, lit, udf, regexp_replace
from pyspark.sql.types import StringType
from awsglue.context import GlueContext
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from awsglue.job import Job

# Bring up the Spark and Glue contexts (an existing SparkContext is reused if present)
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)

# Resolve the job's input parameters from the command line and initialise the job
args = getResolvedOptions(
    sys.argv,
    ['JOB_NAME', 's3_input_path', 's3_output_path'],
)
job.init(args['JOB_NAME'], args)

# Read the source CSV from S3: the first row is the header and column types are inferred
df = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv(args['s3_input_path'])
)

# Strip diacritical marks (accents, tildes) from a string
def quitar_tildes(texto):
    """Return ``texto`` with its combining diacritical marks removed.

    The text is decomposed with Unicode NFD normalisation so that every
    accented character becomes a base character followed by combining marks;
    characters whose Unicode category is 'Mn' (nonspacing mark) are then
    dropped. For example 'BOGOTÁ' becomes 'BOGOTA' and 'NARIÑO' becomes
    'NARINO'.

    Args:
        texto: The string to clean, or None.

    Returns:
        The cleaned string, or None when ``texto`` is None.
    """
    # This function is wrapped in a Spark UDF, which passes SQL NULLs as None.
    # unicodedata.normalize() raises TypeError on None, so pass nulls through.
    if texto is None:
        return None
    descompuesto = unicodedata.normalize('NFD', texto)
    return ''.join(c for c in descompuesto if unicodedata.category(c) != 'Mn')

# Register the accent-stripping function as a Spark UDF that returns a string
quitar_tildes_udf = udf(quitar_tildes, StringType())

# Discard rows with a null 'departamento' before the UDF runs: the UDF would
# otherwise receive None and fail. These rows never reach the output anyway,
# because the isin() filter and the dropna() further down both reject them.
df = df.dropna(subset=['departamento'])

# Remove accents from the 'departamento' column
df = df.withColumn('departamento', quitar_tildes_udf(col('departamento')))

# Valid Colombian departments, written in upper case and without accents so
# they match the output of the accent-stripping step above
departamentos_colombia = [
    'AMAZONAS', 'ANTIOQUIA', 'ARAUCA', 'ATLANTICO', 'BOLIVAR', 'BOYACA', 'CALDAS', 'CAQUETA', 'CASANARE', 'CAUCA',
    'CESAR', 'CHOCO', 'CORDOBA', 'CUNDINAMARCA', 'GUAINIA', 'GUAVIARE', 'HUILA', 'LA GUAJIRA', 'MAGDALENA', 'META',
    'NARINO', 'NORTE DE SANTANDER', 'PUTUMAYO', 'QUINDIO', 'RISARALDA', 'SAN ANDRES Y PROVIDENCIA', 'SANTANDER',
    'SUCRE', 'TOLIMA', 'VALLE DEL CAUCA', 'VAUPES', 'VICHADA'
]

# Keep only rows whose department is in the list (the comparison is case-sensitive)
df = df.filter(df['departamento'].isin(departamentos_colombia))

# Rename columns whose names contain spaces
df = (
    df.withColumnRenamed("armas medios", "armas_medios")
      .withColumnRenamed("fecha hecho", "fecha_hecho")
)

# Drop the unwanted column
df = df.drop("CODIGO DANE")

# Drop every row that still has a null in any remaining column
df = df.dropna()

# Add an 'id' column. The UUID is generated once on the driver and embedded as
# a literal, so every row gets the SAME value; for one UUID per row, generate
# it per row instead (e.g. with a UDF calling uuid4()).
df = df.withColumn('id', lit(str(uuid.uuid4())))

# Remove the "(CT)" marker from 'municipio', together with any whitespace
# right before it, so that a value such as "NAME (CT)" does not keep a
# trailing space. NOTE(review): assumes the marker is space-separated — confirm against the data.
df = df.withColumn('municipio', regexp_replace(col('municipio'), r'\s*\(CT\)', ''))

# Write the DataFrame to S3 as Parquet, replacing any existing output
df.write.mode('overwrite').parquet(args['s3_output_path'])

# Finish the job
job.commit()


Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.8 
Trying to create a Glue session for the kernel.
Session Type: glueetl
Session ID: 54dba1bc-e3df-48bc-a82d-2598ebe81a3f
Applying the following default arguments:
--glue_kernel_version 1.0.8
--enable-glue-datacatalog true
Waiting for session 54dba1bc-e3df-48bc-a82d-2598ebe81a3f to get into ready status...
Session 54dba1bc-e3df-48bc-a82d-2598ebe81a3f has been created.
GlueArgumentError: the following arguments are required: --JOB_NAME, --s3_input_path, --s3_output_path_medellin, --s3_output_path_otros
