In [1]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
from sodapy import Socrata
import datetime

from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
import pyspark.sql.functions as F
from pyspark.sql.functions import col, when, lit, udf

In [2]:
# Load the nb_black extension (Black auto-formatting of executed cells).
%load_ext nb_black
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [3]:
# Single-pass translation table for the acute-accented Spanish vowels
# (lower and upper case). "ñ" and "ü" are intentionally left untouched.
_ACCENT_TABLE = str.maketrans("áéíóúÁÉÍÓÚ", "aeiouAEIOU")


def remove_tildes(string: str):
    """Strip Spanish acute accents from vowels for string standardisation.

    Args:
        string: Text to normalise. ``None`` is accepted because Spark passes
            ``None`` to a Python UDF for null cells.

    Returns:
        The text with ``á é í ó ú`` (and their upper-case forms) replaced by
        the plain vowel, or ``None`` when the input is ``None``. All other
        characters are returned unchanged.
    """
    # Propagate missing values instead of raising AttributeError on None.
    if string is None:
        return None
    # One pass over the string instead of five chained .replace() calls.
    return string.translate(_ACCENT_TABLE)

In [4]:
# Load the "secop_int" dataset from the data catalog.
# NOTE(review): `catalog` is not defined or imported anywhere in the visible
# notebook — presumably injected into the kernel by the project tooling;
# confirm, otherwise this cell fails on a fresh kernel.
secop_int = catalog.load("secop_int")

                                                                                

In [5]:
# Mark the DataFrame for caching; it is re-aggregated several times in the
# cells that follow (groupBy/count on different columns).
secop_int.cache()

In [6]:
# Inspect the available column names.
secop_int.columns

In [7]:
# Lower-case the entity level so values that differ only by case are grouped
# together in the counts.
secop_int = secop_int.withColumn(
    "nivel_entidad",
    F.lower(F.col("nivel_entidad")),
)

In [8]:
# Number of records per entity level.
secop_int.groupBy("nivel_entidad").count().show()



+-------------+------+
|nivel_entidad| count|
+-------------+------+
|  no definido| 24448|
|     nacional| 31294|
|  territorial|711881|
+-------------+------+



                                                                                

In [9]:
# Normalise the process state: lower-case, then strip acute accents.
# Spark's built-in `translate` is used instead of a Python UDF: it maps each
# character of "áéóíú" to the character at the same position in "aeoiu",
# avoids the per-row Python round-trip, and yields null for null input
# rather than raising inside the UDF.
secop_int = secop_int.withColumn(
    "estado_del_proceso",
    F.translate(F.lower(col("estado_del_proceso")), "áéóíú", "aeoiu"),
)

In [10]:
# Records per process state, collected to pandas and shown most frequent first.
count_process_state = (
    secop_int.groupBy("estado_del_proceso")
    .count()
    .toPandas()
)
count_process_state.sort_values(by="count", ascending=False)

                                                                                

Unnamed: 0,estado_del_proceso,count
15,celebrado,387825
0,liquidado,209632
19,convocado,70435
6,en ejecucion,27603
16,terminado anormalmente despues de convocado,16690
18,terminado sin liquidar,13483
1,modificado,13266
20,adjudicado,9772
10,borrador,4373
5,terminado,3691


In [11]:
# Normalise the contracting modality: lower-case, then strip acute accents.
# Spark's built-in `translate` is used instead of a Python UDF: it maps each
# character of "áéóíú" to the character at the same position in "aeoiu",
# avoids the per-row Python round-trip, and yields null for null input
# rather than raising inside the UDF.
secop_int = secop_int.withColumn(
    "modalidad_de_contrataci_n",
    F.translate(F.lower(col("modalidad_de_contrataci_n")), "áéóíú", "aeoiu"),
)

In [12]:
# Records per (normalised) contracting modality, with each modality's share
# of the total in `prop`.
count_process_type = (
    secop_int.groupBy("modalidad_de_contrataci_n")
    .count()
    .toPandas()
    .assign(prop=lambda counts: counts["count"] / counts["count"].sum())
)

                                                                                

Reference for the contracting modalities (Función Pública): https://www.funcionpublica.gov.co/eva/gerentes/Modulo4/tema-2/1-modalidades.html

In [13]:
def clean_modalidad_contratacion(mod: str):
    """Group a raw contracting modality into one of a few broad categories.

    The keywords are lower-case and accent-free, so the input is expected to
    have been normalised the same way.

    Args:
        mod: Normalised modality text. ``None`` is accepted because Spark
            passes ``None`` to a Python UDF for null cells.

    Returns:
        The category label of the first rule whose keyword occurs in ``mod``,
        ``"Otro"`` when no rule matches, or ``None`` when ``mod`` is ``None``.
    """
    # Propagate missing values instead of raising TypeError on `"..." in None`.
    if mod is None:
        return None

    # Ordered rules: the first match wins. The "cuantia" rule precedes
    # "contratacion directa", so e.g. "contratacion directa menor cuantia"
    # is labelled "minima cuantia".
    rules = (
        ("concurso de meritos abiertos", ("concurso de meritos", "concurso_meritos")),
        ("regimen especial", ("regimen especial",)),
        ("minima cuantia", ("minima cuantia", "menor cuantia")),
        ("contratacion directa", ("contratacion directa",)),
        ("subasta", ("subasta",)),
        ("licitacion publica", ("licitacion publica", "licitacion obra publica")),
    )
    for label, keywords in rules:
        if any(keyword in mod for keyword in keywords):
            return label
    return "Otro"

In [14]:
# Preview the grouping on the aggregated pandas table: one row per raw
# modality alongside the category it is assigned to.
count_process_type["modalidad_clean"] = count_process_type[
    "modalidad_de_contrataci_n"
].map(clean_modalidad_contratacion)
count_process_type

Unnamed: 0,modalidad_de_contrataci_n,count,prop,modalidad_clean
0,invitacion ofertas cooperativas o asociaciones...,37,4.8e-05,Otro
1,licitacion publica,10250,0.013353,licitacion publica
2,seleccion abreviada del literal h del numeral ...,118,0.000154,Otro
3,contratacion minima cuantia,157560,0.205257,minima cuantia
4,contratacion directa,50228,0.065433,contratacion directa
5,seleccion abreviada servicios de salud,203,0.000264,Otro
6,contratacion directa (con ofertas),497,0.000647,contratacion directa
7,minima cuantia,1716,0.002235,minima cuantia
8,contratacion directa menor cuantia,3418,0.004453,minima cuantia
9,subasta,8402,0.010945,subasta


In [15]:
# Overwrite the modality column with its grouped category. After this cell the
# raw modality values are no longer available on `secop_int`.
secop_int = secop_int.withColumn(
    "modalidad_de_contrataci_n",
    F.udf(clean_modalidad_contratacion)(F.col("modalidad_de_contrataci_n")),
)

In [16]:
# Records per grouped modality with each group's share of the total,
# displayed from largest to smallest share.
count_process_type_clean = (
    secop_int.groupBy("modalidad_de_contrataci_n")
    .count()
    .toPandas()
    .assign(prop=lambda counts: counts["count"] / counts["count"].sum())
)
count_process_type_clean.sort_values(by="prop", ascending=False)

                                                                                

Unnamed: 0,modalidad_de_contrataci_n,count,prop
2,contratacion directa,373953,0.487157
4,minima cuantia,189242,0.24653
6,regimen especial,173875,0.226511
1,licitacion publica,12158,0.015839
5,subasta,8695,0.011327
3,concurso de meritos abiertos,7375,0.009608
0,Otro,2325,0.003029


In [17]:
# Normalise the contract type: lower-case, then strip acute accents.
# Spark's built-in `translate` is used instead of a Python UDF: it maps each
# character of "áéóíú" to the character at the same position in "aeoiu",
# avoids the per-row Python round-trip, and yields null for null input
# rather than raising inside the UDF.
secop_int = secop_int.withColumn(
    "tipo_de_contrato",
    F.translate(F.lower(col("tipo_de_contrato")), "áéóíú", "aeoiu"),
)

In [18]:
# Records per (normalised) contract type with each type's share of the total,
# displayed from largest to smallest share.
count_process_type_cont = (
    secop_int.groupBy("tipo_de_contrato")
    .count()
    .toPandas()
    .assign(prop=lambda counts: counts["count"] / counts["count"].sum())
)
count_process_type_cont.sort_values(by="prop", ascending=False)

                                                                                

Unnamed: 0,tipo_de_contrato,count,prop
3,prestacion de servicios,535714,0.697887
0,suministro,97743,0.127332
13,obra,47713,0.062157
2,compraventa,33059,0.043067
11,otro tipo de contrato,25946,0.0338
15,consultoria,9446,0.012306
18,interventoria,6574,0.008564
8,arrendamiento,5891,0.007674
9,otro,1359,0.00177
17,decreelaw092/2017,1357,0.001768


In [22]:
def clean_tipo_contrato(tip: str):
    """Group a raw contract type into one of a few broad categories.

    The keywords are lower-case and accent-free, so the input is expected to
    have been normalised the same way.

    Args:
        tip: Normalised contract-type text. ``None`` is accepted because Spark
            passes ``None`` to a Python UDF for null cells.

    Returns:
        ``"suministro"``, ``"arrendamiento"`` or ``"servicios financieros"``
        for the grouped types; the input itself for the types kept as-is
        (obra, consultoria, prestacion de servicios, interventoria,
        concesion); ``"Otro"`` for anything else; ``None`` when ``tip`` is
        ``None``.
    """
    # Propagate missing values instead of raising TypeError on `"..." in None`.
    if tip is None:
        return None

    sale_types = ("compraventa", "venta muebles")
    financial_types = (
        "servicios financieros",
        "credito",
        "fiducia",
        "seguros",
        "emprestito",
    )
    kept_types = (
        "obra",
        "consultoria",
        "prestacion de servicios",
        "interventoria",
        "concesion",
    )

    # Substring checks catch variants such as "arrendamiento de inmuebles";
    # the tuple checks require an exact match.
    if "suministro" in tip or tip in sale_types:
        return "suministro"
    if "arrendamiento" in tip or "comodato" in tip:
        return "arrendamiento"
    if tip in financial_types:
        return "servicios financieros"
    if tip in kept_types:
        return tip
    return "Otro"

In [23]:
# Preview the grouping on the aggregated pandas table: one row per raw
# contract type alongside the category it is assigned to.
count_process_type_cont["tipo_de_contrato_clean"] = count_process_type_cont[
    "tipo_de_contrato"
].map(clean_tipo_contrato)

In [24]:
# Display the raw contract types next to their grouped category.
# NOTE(review): the saved output lists "concesion" as "Otro", but
# clean_tipo_contrato as written returns "concesion" unchanged — the output
# looks stale; re-run the notebook top to bottom to refresh it.
count_process_type_cont

Unnamed: 0,tipo_de_contrato,count,prop,tipo_de_contrato_clean
0,suministro,97743,0.127332,suministro
1,servicios financieros,3,4e-06,servicios financieros
2,compraventa,33059,0.043067,suministro
3,prestacion de servicios,535714,0.697887,prestacion de servicios
4,concesion,311,0.000405,Otro
5,arrendamiento de inmuebles,140,0.000182,arrendamiento
6,comodato,752,0.00098,arrendamiento
7,no definido,307,0.0004,Otro
8,arrendamiento,5891,0.007674,arrendamiento
9,otro,1359,0.00177,Otro


In [18]:
# List the distinct (normalised) contract types.
# NOTE(review): this cell's execution count (18) repeats an earlier cell's and
# is lower than the counts of the cells above it, so the notebook was executed
# out of order; verify it with Restart Kernel -> Run All.
count_process_type_cont["tipo_de_contrato"].unique()

<a href="https://www.funcionpublica.gov.co/eva/gestornormativo/norma.php?i=304">Ley 80</a>