# Variable dependiente: Monto Corrupción Intensa (numérica)

## 1. Data

#### 1.1. Import libraries and data

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as  pd, numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [3]:
import funciones as fun
import variables_nombres as vn

In [4]:
path = r'..\..\output\data_preprocess\dfs_0_i_mci.csv'
data = pd.read_csv( path )

In [5]:
# Borrar otras variables dependientes. Solo nos quedamos con 'monto_corrup1'
data = data.drop( [ 'monto_examinado', 'monto_objeto_servicio', 'corrup_intensa', 
                    'corrup_amplia', 'per_corrup1', 'per_corrup2' ], 
                  axis = 1 )

In [6]:
# Borrar aquellas filas en las que la variable dependiente tenga missing
data = data.dropna( axis = 0, subset = [ 'monto_corrup1' ] )

In [7]:
data = data.dropna( axis = 1 )

#### 1.2. Split data into training and test set

In [8]:
np.random.seed( 22 )
training = np.random.choice( data.index, size=int( len( data )*( 3/4 )), replace = False )

data_train = data.loc[ training, : ]
data_test = data.drop( training, axis = 0 )

## 2. Definir el modelo

In [9]:
import statsmodels.formula.api as smf
import patsy

#### 2.1. Construir modelos

In [10]:
%%time

formula_basic = "monto_corrup1 ~ tejgfun_ct05pgercon + devppimfun_ct05pgercon + devppimfun_ct05trab + tejgfun_ct05come + tdvgfun_ct05come + piagfun_ct05agro + pimgfun_ct05agro + tejgfun_ct05agro + devppimfun_ct05agro + piagfun_ct05energia + pimgfun_ct05energia + tejgfun_ct05energia + devppimfun_ct05energia + piagfun_ct05ind + pimgfun_ct05ind + devppimfun_ct05ind + tejgfun_ct05trans + tdvgfun_ct05trans + tejgfun_ct05san + devppimfun_ct05san + devppimfun_ct05viv + devppimfun_ct05cydep + devppimfun_ct05protsoc + devppimfun_ct05dpub + tejgfun_ct06opseg + tdvgfun_ct06opseg + tejgfun_ct06come + devppimfun_ct06come + piagfun_ct06turi + tdvgfun_ct06turi + devppimfun_ct06turi + devppimfun_ct06ind + devppimfun_ct06trans + devppimfun_ct06amb + piagfun_ct06san + pimgfun_ct06san + devppimfun_ct06san + tejgfun_ct06viv + tdvgfun_ct06viv + devppimfun_ct06viv + piagfun_ct06salud + pimgfun_ct06salud + devppimfun_ct06salud + tejgfun_ct06cydep + piagfun_ct06edu + pimgfun_ct06protsoc + devppimft_rdet + tejgrb_reod + tdvgrb_reod + devppimrb_reod + devppimrb_redr + devppimrb_rooc + devppimrb_fonc + devppimrb_impm + devppimrb_canr + devppimct_r00gstcr + dfgdevpiagct_r19gstcr + tejgct_r19gstcp + tejgct_r13gstcr + tdvgct_r13gstcr + devppimct_r13gstcr + tejgct_r07gstcp + piagct_r07srdeu + pimgct_r07srdeu + tejgct_r07srdeu + devppimct_r07srdeu + piagct_r18gstcr + devppimct_r18gstcr + pimgge_r00ct05pobso + tejgge_r00ct05popso + devppimge_r00ct05popso + piagge_r00ct05biser + tejgge_r00ct05biser + devppimge_r00ct05biser + devppimgge_r00ct05otgst + dfgdevpiagge_r00ct06dotra + devppimgge_r00ct06otgst + dfgpimpiagge_r00ct06acanf + devppimgge_r09ct05pobso + piagge_r09ct05popso + devppimgge_r09ct05popso + pimgge_r09ct05dotra + tdvgge_r09ct05dotra + devppimgge_r09ct05dotra + piagge_r09ct05otgst + tejgge_r09ct05otgst + devppimgge_r09ct05otgst + pimgge_r09ct06dotra + devppimgge_r09ct06dotra + piagge_r09ct06otgst + tejgge_r09ct06otgst + tdvgge_r09ct06otgst + devppimgge_r09ct06otgst + tejgge_r09ct06acanf + tdvgge_r09ct06acanf + devppimgge_r09ct06acanf + dfgpimpiagge_r09ct07sedpu + devppimgge_r09ct07sedpu + dfgpimpiagge_r19ct06dotra + dfgdevpiagge_r19ct06dotra + devppimgge_r19ct06dotra + dfgpimpiagge_r13ct05pobso + tejgge_r13ct05popso + dfgpimpiagge_r13ct05popso + devppimgge_r13ct05popso + piagge_r13ct05dotra + pimgge_r13ct05dotra + tejgge_r13ct05dotra + tejgge_r13ct05otgst + dfgpimpiagge_r13ct05otgst + devppimgge_r13ct05otgst + dfgpimpiagge_r13ct06dotra + dfgdevpiagge_r13ct06dotra + piagge_r07ct05pobso + tejgge_r07ct05pobso + devppimgge_r07ct05pobso + tejgge_r07ct05popso + devppimgge_r07ct05popso + tejgge_r07ct05biser + tdvgge_r07ct05biser + devppimgge_r07ct05biser + pimgge_r07ct05dotra + tejgge_r07ct05dotra + devppimgge_r07ct05dotra + pimgge_r07ct05otgst + tejgge_r07ct05otgst + devppimgge_r07ct05otgst + devppimge_r08ct05pobso + devppimge_r08ct05popso + pimgge_r08ct05dotra + tejgge_r08ct05dotra + devppimge_r08ct05dotra + tejgge_r08ct05otgst + devppimgge_r08ct05otgst + devppimgge_r08ct06acanf + piagge_r08ct07sedpu + piagge_r18ct05pobso + pimgge_r18ct05pobso + devppimge_r18ct05pobso + devppimge_r18ct05popso + pimgge_r18ct05dotra + devppimge_r18ct05dotra + piagge_r18ct05otgst + pimgge_r18ct05otgst + devppimgge_r18ct05otgst + dfgpimpiagge_r18ct06acacf + piagge_r18ct07sedpu + piagkft_reod + devppimgkft_reod + piagkft_redr + piagkft_rooc + dfgpimpiagkft_rooc + piagkft_dotr + tejgkft_dotr + devppimgkft_dotr + devppimgkft_rdet + devppimgkftr07_rdet + tejgfun_f1ct05agro + tejgfun_f1ct05amb + tejgfun_f1ct05come + devppimfun_f1ct05come + tejgfun_f1ct05cydep + devppimfun_f1ct05cydep + devppimfun_f1ct05edu + tejgfun_f1ct05opseg + dfgpimpiafun_f1ct05opseg + devppimfun_f1ct05opseg + tdvgfun_f1ct05pgercon + devppimfun_f1ct05pgercon + tejgfun_f1ct05salud + tdvgfun_f1ct05salud + devppimfun_f1ct05salud + tejgfun_f1ct05san + dfgdevpiagfun_f1ct05san + devppimfun_f1ct05san + piagfun_f1ct05trans + pimgfun_f1ct05trans + tejgfun_f1ct05trans + dfgdevpiagfun_f1ct05trans + devppimfun_f1ct05trans + devppimfun_f1ct05turi + tejgfun_f1ct05viv + tdvgfun_f1ct05viv + devppimfun_f1ct05viv + tdvgfun_f1ct06agro + devppimfun_f1ct06agro + pimgfun_f1ct06amb + devppimfun_f1ct06amb + tejgfun_f1ct06come + devppimfun_f1ct06come + piagfun_f1ct06edu + tejgfun_f1ct06edu + dfgpimpiafun_f1ct06edu + devppimfun_f1ct06edu + devppimfun_f1ct06opseg + dfgpimpiafun_f1ct06pgercon + devppimfun_f1ct06pgercon + tejgfun_f1ct06protsoc + dfgpimpiafun_f1ct06protsoc + devppimfun_f1ct06protsoc + devppimfun_f1ct06salud + devppimfun_f1ct06trans + devppimfun_f1ct06viv + piagfun_f2ct05agro + pimgfun_f2ct05agro + devppimfun_f2ct05agro + devppimfun_f2ct05amb + piagfun_f2ct05come + devppimfun_f2ct05cydep + pimgfun_f2ct05edu + devppimfun_f2ct05edu + devppimfun_f2ct05energia + devppimfun_f2ct05opseg + devppimfun_f2ct05pgercon + piagfun_f2ct05protsoc + tejgfun_f2ct05protsoc + devppimfun_f2ct05salud + piagfun_f2ct05san + pimgfun_f2ct05san + tejgfun_f2ct05san + devppimfun_f2ct05san + tejgfun_f2ct05trans + tdvgfun_f2ct05trans + devppimfun_f2ct05trans + pimgfun_f2ct05turi + devppimfun_f2ct05turi + tejgfun_f2ct05viv + tdvgfun_f2ct05viv + devppimfun_f2ct05viv + piagfun_f2ct06agro + tejgfun_f2ct06agro + devppimfun_f2ct06agro + piagfun_f2ct06amb + pimgfun_f2ct06amb + tejgfun_f2ct06amb + devppimfun_f2ct06amb + piagfun_f2ct06come + pimgfun_f2ct06come + tejgfun_f2ct06come + devppimfun_f2ct06come + tejgfun_f2ct06comunica + dfgpimpiafun_f2ct06comunica + piagfun_f2ct06cydep + tejgfun_f2ct06cydep + tdvgfun_f2ct06cydep + devppimfun_f2ct06cydep + devppimfun_f2ct06edu + piagfun_f2ct06energia + pimgfun_f2ct06energia + tdvgfun_f2ct06energia + pimgfun_f2ct06ind + devppimfun_f2ct06ind + devppimfun_f2ct06opseg + piagfun_f2ct06pgercon + pimgfun_f2ct06pgercon + tejgfun_f2ct06pgercon + devppimfun_f2ct06pgercon + piagfun_f2ct06protsoc + pimgfun_f2ct06protsoc + devppimfun_f2ct06protsoc + piagfun_f2ct06salud + pimgfun_f2ct06salud + tdvgfun_f2ct06salud + devppimfun_f2ct06salud + piagfun_f2ct06san + tdvgfun_f2ct06san + devppimfun_f2ct06san + pimgfun_f2ct06trans + devppimfun_f2ct06trans + piagfun_f2ct06turi + pimgfun_f2ct06turi + tejgfun_f2ct06turi + devppimfun_f2ct06turi + tejgfun_f2ct06viv + devppimfun_f2ct06viv + pimgfun_f3ct05ind + devppimfun_f3ct05opseg + tejgfun_f3ct05pgercon + dfgpimpiafun_f3ct05pgercon + dfgdevpiagfun_f3ct06opseg + tejgfun_f3ct06pgercon + devppimfun_f3ct06pgercon + tejgfun_f3ct06san + dfgpimpiafun_f4ct05agro + devppimfun_f4ct05agro + tejgfun_f4ct05amb + tdvgfun_f4ct05amb + devppimfun_f4ct05amb + tdvgfun_f4ct05come + dfgpimpiafun_f4ct05come + tdvgfun_f4ct05cydep + devppimfun_f4ct05cydep + devppimfun_f4ct05edu + dfgpimpiafun_f4ct05opseg + devppimfun_f4ct05opseg + tdvgfun_f4ct05pgercon + tejgfun_f4ct05protsoc + dfgdevpiagfun_f4ct05protsoc + devppimfun_f4ct05protsoc + pimgfun_f4ct05salud + tejgfun_f4ct05salud + devppimfun_f4ct05salud + dfgpimpiafun_f4ct05san + dfgpimpiafun_f4ct05trans + devppimfun_f4ct05trans + tejgfun_f4ct05turi + dfgpimpiafun_f4ct05turi + devppimfun_f4ct05turi + tdvgfun_f4ct05viv + dfgpimpiafun_f4ct05viv + devppimfun_f4ct05viv + devppimfun_f4ct06agro + tejgfun_f4ct06amb + dfgpimpiafun_f4ct06amb + dfgdevpiagfun_f4ct06come + tejgfun_f4ct06cydep + devppimfun_f4ct06cydep + tejgfun_f4ct06edu + tejgfun_f4ct06opseg + devppimfun_f4ct06opseg + piagfun_f4ct06pgercon + tdvgfun_f4ct06pgercon + devppimfun_f4ct06pgercon + tejgfun_f4ct06protsoc + devppimfun_f4ct06protsoc + devppimfun_f4ct06salud + devppimfun_f4ct06trab + devppimfun_f4ct06trans + dfgpimpiafun_f4ct06turi + dfgdevpiagfun_f4ct06turi + devppimfun_f4ct06turi + dfgpimpiafun_f4ct06viv + devppimfun_f4ct06viv + tejgfun_f5ct05agro + piagfun_f5ct05amb + tejgfun_f5ct05amb + devppimfun_f5ct05amb + piagfun_f5ct05come + tejgfun_f5ct05come + devppimfun_f5ct05comunica + piagfun_f5ct05cydep + tejgfun_f5ct05cydep + tejgfun_f5ct05edu + tdvgfun_f5ct05edu + devppimfun_f5ct05edu + tdvgfun_f5ct05energia + tejgfun_f5ct05opseg + devppimfun_f5ct05opseg + tejgfun_f5ct05prevsoc + tdvgfun_f5ct05prevsoc + tejgfun_f5ct05protsoc + tdvgfun_f5ct05protsoc + devppimfun_f5ct05protsoc + piagfun_f5ct05salud + devppimfun_f5ct05salud + piagfun_f5ct05san + devppimfun_f5ct05trab + devppimfun_f5ct05trans + tdvgfun_f5ct05turi + devppimfun_f5ct05turi + tejgfun_f5ct05viv + devppimfun_f5ct06agro + piagfun_f5ct06come + pimgfun_f5ct06come + tdvgfun_f5ct06comunica + devppimfun_f5ct06cydep + devppimfun_f5ct06opseg + tejgfun_f5ct06pgercon + devppimfun_f5ct06pgercon + devppimfun_f5ct06protsoc + tejgfun_f5ct06trab + devppimfun_f5ct06trab + piagfun_f5ct06trans + pimgfun_f5ct06turi + devppimfun_f5energia + pimgfun_f5pesca + devppimfun_f5pesca + piagfun_f5r07ct05agro + tejgfun_f5r07ct05agro + tdvgfun_f5r07ct05agro + devppimfun_f5r07ct05agro + piagfun_f5r07ct05amb + tejgfun_f5r07ct05amb + devppimfun_f5r07ct05amb + tejgfun_f5r07ct05come + tdvgfun_f5r07ct05come + devppimfun_f5r07ct05come + piagfun_f5r07ct05comunica + tejgfun_f5r07ct05comunica + devppimfun_f5r07ct05comunica + piagfun_f5r07ct05cydep + tejgfun_f5r07ct05cydep + devppimfun_f5r07ct05cydep + devppimfun_f5r07ct05dpub + pimgfun_f5r07ct05edu + tejgfun_f5r07ct05edu + devppimfun_f5r07ct05edu + tejgfun_f5r07ct05energia + tdvgfun_f5r07ct05energia + devppimfun_f5r07ct05energia + pimgfun_f5r07ct05ind + devppimfun_f5r07ct05opseg + tejgfun_f5r07ct05pgercon + piagfun_f5r07ct05protsoc + pimgfun_f5r07ct05protsoc + devppimfun_f5r07ct05protsoc + tejgfun_f5r07ct05salud + tdvgfun_f5r07ct05salud + devppimfun_f5r07ct05salud + pimgfun_f5r07ct05san + tejgfun_f5r07ct05san + devppimfun_f5r07ct05san + piagfun_f5r07ct05trans + tejgfun_f5r07ct05trans + tdvgfun_f5r07ct05trans + devppimfun_f5r07ct05trans + piagfun_f5r07ct05turi + tdvgfun_f5r07ct05turi + tejgfun_f5r07ct05viv + tdvgfun_f5r07ct05viv + devppimfun_f5r07ct05viv + tejgfun_f5r07ct06agro + devppimfun_f5r07ct06agro + piagfun_f5r07ct06amb + tejgfun_f5r07ct06amb + devppimfun_f5r07ct06amb + pimgfun_f5r07ct06come + tejgfun_f5r07ct06come + tdvgfun_f5r07ct06come + devppimfun_f5r07ct06come + devppimfun_f5r07ct06comunica + tejgfun_f5r07ct06cydep + devppimfun_f5r07ct06cydep + piagfun_f5r07ct06edu + piagfun_f5r07ct06energia + pimgfun_f5r07ct06energia + tejgfun_f5r07ct06energia + tdvgfun_f5r07ct06ind + devppimfun_f5r07ct06ind + piagfun_f5r07ct06opseg + tejgfun_f5r07ct06opseg + devppimfun_f5r07ct06opseg + tejgfun_f5r07ct06pgercon + devppimfun_f5r07ct06pgercon + dfgdevpiagfun_f5r07ct06prevsoc + piagfun_f5r07ct06protsoc + pimgfun_f5r07ct06protsoc + tejgfun_f5r07ct06protsoc + devppimfun_f5r07ct06protsoc + piagfun_f5r07ct06salud + pimgfun_f5r07ct06salud + devppimfun_f5r07ct06salud + pimgfun_f5r07ct06trab + devppimfun_f5r07ct06trab + piagfun_f5r07ct06trans + devppimfun_f5r07ct06trans + tejgfun_f5r07ct06turi + devppimfun_f5r07ct06turi + piagfun_f5r07ct06viv + devppimfun_f5r07ct06viv + pimgfun_f5r08ct05agro + devppimfun_f5r08ct05agro + tdvgfun_f5r08ct05dpub + tejgfun_f5r08ct05edu + devppimfun_f5r08ct05edu + piagfun_f5r08ct05energia + devppimfun_f5r08ct05energia + piagfun_f5r08ct05opseg + piagfun_f5r08ct05pgercon + piagfun_f5r08ct05protsoc + tejgfun_f5r08ct05salud + devppimfun_f5r08ct05salud + piagfun_f5r08ct05san + pimgfun_f5r08ct05san + tejgfun_f5r08ct05san + devppimfun_f5r08ct05san + tejgfun_f5r08ct05trab + devppimfun_f5r08ct05trans + piagfun_f5r08ct05turi + piagfun_f5r08ct05viv + piagfun_f5r08ct06agro + devppimfun_f5r08ct06agro + piagfun_f5r08ct06amb + pimgfun_f5r08ct06amb + tejgfun_f5r08ct06amb + devppimfun_f5r08ct06amb + tejgfun_f5r08ct06come + tdvgfun_f5r08ct06come + devppimfun_f5r08ct06come + piagfun_f5r08ct06comunica + pimgfun_f5r08ct06comunica + devppimfun_f5r08ct06comunica + piagfun_f5r08ct06cydep + pimgfun_f5r08ct06cydep + tejgfun_f5r08ct06cydep + devppimfun_f5r08ct06cydep + piagfun_f5r08ct06edu + devppimfun_f5r08ct06edu + piagfun_f5r08ct06energia + devppimfun_f5r08ct06energia + piagfun_f5r08ct06opseg + pimgfun_f5r08ct06opseg + devppimfun_f5r08ct06opseg + piagfun_f5r08ct06pgercon + devppimfun_f5r08ct06pgercon + piagfun_f5r08ct06protsoc + pimgfun_f5r08ct06protsoc + tejgfun_f5r08ct06protsoc + devppimfun_f5r08ct06protsoc + pimgfun_f5r08ct06salud + tejgfun_f5r08ct06salud + devppimfun_f5r08ct06salud + devppimfun_f5r08ct06san + dfgpimpiafun_f5r08ct06trab + tejgfun_f5r08ct06trans + devppimfun_f5r08ct06trans + tejgfun_f5r08ct06turi + dfgpimpiafun_f5r08ct06turi + devppimfun_f5r08ct06turi + piagfun_f5r08ct06viv + tejgfun_f5r08ct06viv + devppimfun_f5r08ct06viv + piagfun_f5r18ct05agro + devppimfun_f5r18ct05agro + tejgfun_f5r18ct05amb + tdvgfun_f5r18ct05amb + devppimfun_f5r18ct05amb + piagfun_f5r18ct05come + tejgfun_f5r18ct05come + devppimfun_f5r18ct05come + tdvgfun_f5r18ct05comunica + pimgfun_f5r18ct05cydep + devppimfun_f5r18ct05cydep + piagfun_f5r18ct05edu + tdvgfun_f5r18ct05edu + devppimfun_f5r18ct05edu + piagfun_f5r18ct05energia + tdvgfun_f5r18ct05energia + devppimfun_f5r18ct05energia + dfgpimpiafun_f5r18ct05ind + devppimfun_f5r18ct05ind + piagfun_f5r18ct05opseg + tejgfun_f5r18ct05opseg + devppimfun_f5r18ct05opseg + dfgpimpiafun_f5r18ct05pesca + devppimfun_f5r18ct05pgercon + pimgfun_f5r18ct05prevsoc + devppimfun_f5r18ct05prevsoc + piagfun_f5r18ct05protsoc + pimgfun_f5r18ct05protsoc + tejgfun_f5r18ct05protsoc + devppimfun_f5r18ct05protsoc + tejgfun_f5r18ct05salud + devppimfun_f5r18ct05salud + piagfun_f5r18ct05san + devppimfun_f5r18ct05san + devppimfun_f5r18ct05trab + piagfun_f5r18ct05trans + tejgfun_f5r18ct05trans + devppimfun_f5r18ct05trans + piagfun_f5r18ct05turi + tejgfun_f5r18ct05turi + tdvgfun_f5r18ct05turi + devppimfun_f5r18ct05turi + piagfun_f5r18ct05viv + tejgfun_f5r18ct05viv + tdvgfun_f5r18ct05viv + devppimfun_f5r18ct05viv + tejgfun_f5r18ct06agro + devppimfun_f5r18ct06amb + devppimfun_f5r18ct06come + devppimfun_f5r18ct06cydep + piagfun_f5r18ct06opseg + devppimfun_f5r18ct06opseg + piagfun_f5r18ct06pgercon + devppimfun_f5r18ct06pgercon + piagfun_f5r18ct06prevsoc + devppimfun_f5r18ct06prevsoc + tejgfun_f5r18ct06protsoc + devppimfun_f5r18ct06protsoc + devppimfun_f5r18ct06salud + dfgpimpiafun_f5r18ct06trab + devppimfun_f5r18ct06trab + devppimfun_f5r18ct06trans + tdvgfun_f5r18ct06viv + devppimfun_f5r18ct06viv + piagtotfun_f1agro + tdvgtotfun_f1agro + devppimtotfun_f1agro + devppimtotfun_f1amb + tejgtotfun_f1cydep + dfgpimpiatotfun_f1cydep + devppimtotfun_f1cydep + tejgtotfun_f1energia + devppimtotfun_f1energia + piagtotfun_f1opseg + pimgtotfun_f1opseg + tdvgtotfun_f1opseg + piagtotfun_f1prevsoc + devppimtotfun_f1prevsoc + piagtotfun_f1protsoc + piagtotfun_f1salud + pimgtotfun_f1salud + piagtotfun_f1san + tejgtotfun_f1san + tdvgtotfun_f1san + devppimtotfun_f1san + dfgdevpiagtotfun_f1trab + pimgtotfun_f1trans + tejgtotfun_f1trans + tdvgtotfun_f1turi + devppimtotfun_f1turi + pimgtotfun_f1viv + tejgtotfun_f1viv + piagtotfun_f2agro + devppimtotfun_f2agro + tejgtotfun_f2amb + devppimtotfun_f2come + pimgtotfun_f2comunica + piagtotfun_f2cydep + pimgtotfun_f2cydep + tejgtotfun_f2cydep + dfgpimpiatotfun_f2dpub + piagtotfun_f2edu + pimgtotfun_f2edu + devppimtotfun_f2edu + piagtotfun_f2energia + tejgtotfun_f2energia + devppimtotfun_f2energia + devppimtotfun_f2ind + piagtotfun_f2pgercon + tejgtotfun_f2pgercon + devppimtotfun_f2prevsoc + devppimtotfun_f2protsoc + tejgtotfun_f2salud + piagtotfun_f2san + tdvgtotfun_f2san + devppimtotfun_f2san + tejgtotfun_f2trab + devppimtotfun_f2trab + devppimtotfun_f2trans + tejgtotfun_f2turi + piagtotfun_f2viv + piagtotfun_f3agro + tejgtotfun_f3agro + dfgpimpiatotfun_f3agro + devppimtotfun_f3agro + tdvgtotfun_f3amb + devppimtotfun_f3amb + dfgpimpiatotfun_f3comunica + tejgtotfun_f3cydep + dfgpimpiatotfun_f3cydep + devppimtotfun_f3cydep + tejgtotfun_f3edu + devppimtotfun_f3edu + tejgtotfun_f3energia + dfgpimpiatotfun_f3energia + devppimtotfun_f3energia + tejgtotfun_f3opseg + dfgpimpiatotfun_f3opseg + dfgdevpiagtotfun_f3opseg + devppimtotfun_f3opseg + piagtotfun_f3pgercon + dfgpimpiatotfun_f3pgercon + devppimtotfun_f3pgercon + tejgtotfun_f3protsoc + dfgpimpiatotfun_f3protsoc + devppimtotfun_f3protsoc + tejgtotfun_f3salud + devppimtotfun_f3salud + piagtotfun_f3san + pimgtotfun_f3san + devppimtotfun_f3san + devppimtotfun_f3trab + tdvgtotfun_f3trans + devppimtotfun_f3trans + piagtotfun_f3viv + pimgtotfun_f3viv + tejgtotfun_f3viv + devppimtotfun_f3viv + tejgtotfun_f4agro + dfgpimpiatotfun_f4agro + piagtotfun_f4amb + dfgpimpiatotfun_f4amb + devppimtotfun_f4amb + devppimtotfun_f4come + dfgdevpiagtotfun_f4cydep + devppimtotfun_f4cydep + dfgdevpiagtotfun_f4edu + devppimtotfun_f4edu + tejgtotfun_f4energia + dfgpimpiatotfun_f4energia + dfgdevpiagtotfun_f4energia + devppimtotfun_f4energia + piagtotfun_f4pgercon + pimgtotfun_f4pgercon + tdvgtotfun_f4pgercon + devppimtotfun_f4pgercon + dfgpimpiatotfun_f4prevsoc + tejgtotfun_f4protsoc + piagtotfun_f4salud + pimgtotfun_f4salud + devppimtotfun_f4salud + tejgtotfun_f4san + devppimtotfun_f4san + tdvgtotfun_f4trab + devppimtotfun_f4trab + devppimtotfun_f4turi + devppimtotfun_f5agro + devppimtotfun_f5amb + tdvgtotfun_f5come + devppimtotfun_f5come + piagtotfun_f5comunica + pimgtotfun_f5comunica + devppimtotfun_f5comunica + piagtotfun_f5cydep + devppimtotfun_f5cydep + piagtotfun_f5dpub + tejgtotfun_f5dpub + tdvgtotfun_f5dpub + devppimtotfun_f5dpub + pimgtotfun_f5edu + devppimtotfun_f5edu + piagtotfun_f5ind + pimgtotfun_f5ind + tdvgtotfun_f5ind + devppimtotfun_f5ind + piagtotfun_f5opseg + tdvgtotfun_f5opseg + devppimtotfun_f5opseg + piagtotfun_f5pgercon + devppimtotfun_f5pgercon + devppimtotfun_f5prevsoc + piagtotfun_f5protsoc + pimgtotfun_f5protsoc + tejgtotfun_f5protsoc + devppimtotfun_f5protsoc + piagtotfun_f5r07agro + pimgtotfun_f5r07agro + devppimtotfun_f5r07agro + piagtotfun_f5r07comunica + pimgtotfun_f5r07comunica + piagtotfun_f5r07cydep + devppimtotfun_f5r07cydep + pimgtotfun_f5r07edu + devppimtotfun_f5r07edu + devppimtotfun_f5r07energia + pimgtotfun_f5r07ind + tejgtotfun_f5r07opseg + tdvgtotfun_f5r07opseg + piagtotfun_f5r07pesca + tejgtotfun_f5r07pesca + tdvgtotfun_f5r07pgercon + devppimtotfun_f5r07pgercon + tdvgtotfun_f5r07prevsoc + devppimtotfun_f5r07prevsoc + piagtotfun_f5r07protsoc + tejgtotfun_f5r07protsoc + tdvgtotfun_f5r07protsoc + piagtotfun_f5r07san + pimgtotfun_f5r07san + tejgtotfun_f5r07san + devppimtotfun_f5r07san + devppimtotfun_f5r07trab + piagtotfun_f5r07turi + pimgtotfun_f5r07turi + tejgtotfun_f5r07turi + tdvgtotfun_f5r07turi + devppimtotfun_f5r07turi + tejgtotfun_f5r07viv + tdvgtotfun_f5r07viv + devppimtotfun_f5r07viv + piagtotfun_f5r08agro + pimgtotfun_f5r08agro + tejgtotfun_f5r08agro + devppimtotfun_f5r08amb + devppimtotfun_f5r08cydep + tdvgtotfun_f5r08dpub + devppimtotfun_f5r08dpub + piagtotfun_f5r08edu + tdvgtotfun_f5r08edu + devppimtotfun_f5r08edu + piagtotfun_f5r08energia + pimgtotfun_f5r08energia + devppimtotfun_f5r08energia + tdvgtotfun_f5r08opseg + devppimtotfun_f5r08opseg + devppimtotfun_f5r08prevsoc + devppimtotfun_f5r08protsoc + piagtotfun_f5r08salud + pimgtotfun_f5r08san + tejgtotfun_f5r08san + tdvgtotfun_f5r08san + devppimtotfun_f5r08trab + devppimtotfun_f5r08turi + devppimtotfun_f5r08viv + devppimtotfun_f5r18agro + piagtotfun_f5r18amb + tejgtotfun_f5r18amb + devppimtotfun_f5r18amb + devppimtotfun_f5r18come + piagtotfun_f5r18cydep + tdvgtotfun_f5r18cydep + tejgtotfun_f5r18dpub + devppimtotfun_f5r18dpub + devppimtotfun_f5r18ind + tejgtotfun_f5r18opseg + devppimtotfun_f5r18opseg + devppimtotfun_f5r18pgercon + pimgtotfun_f5r18prevsoc + pimgtotfun_f5r18protsoc + devppimtotfun_f5r18protsoc + devppimtotfun_f5r18salud + devppimtotfun_f5r18trab + devppimtotfun_f5r18turi + piagtotfun_f5r18viv + devppimtotfun_f5r18viv + devppimtotfun_f5salud + devppimtotfun_f5san + piagtotfun_f5trab + pimgtotfun_f5trab + tejgtotfun_f5trab + devppimtotfun_f5trab + tejgtotfun_f5trans + devppimtotfun_f5trans + tdvgtotfun_f5turi + devppimtotfun_f5turi + piagtotfun_f5viv + tejgtotfun_f5viv + tdvgtotfun_f5viv + devppimtotfun_f5viv"


y_basic_train, x_basic_train = patsy.dmatrices( formula_basic, data_train, return_type = 'dataframe' )
y_basic_test, x_basic_test = patsy.dmatrices( formula_basic, data_test, return_type = 'dataframe' )
p_basic = x_basic_train.shape[ 1 ]

Wall time: 2.29 s


#### 2.2. Generar variables dependientes

In [11]:
Y_train = data_train[ 'monto_corrup1' ]
Y_test = data_test[ 'monto_corrup1' ]

In [12]:
p_basic

832

## 3. OLS

In [13]:
x_basic_train.head()

Unnamed: 0,Intercept,tejgfun_ct05pgercon,devppimfun_ct05pgercon,devppimfun_ct05trab,tejgfun_ct05come,tdvgfun_ct05come,piagfun_ct05agro,pimgfun_ct05agro,tejgfun_ct05agro,devppimfun_ct05agro,...,tejgtotfun_f5trab,devppimtotfun_f5trab,tejgtotfun_f5trans,devppimtotfun_f5trans,tdvgtotfun_f5turi,devppimtotfun_f5turi,piagtotfun_f5viv,tejgtotfun_f5viv,tdvgtotfun_f5viv,devppimtotfun_f5viv
32,1.0,0.0,4.480664,0.0,0.0,0.0,4.615121,4.615121,0.0,0.0,...,0.0,0.0,0.0,4.001274,0.0,0.0,0.0,0.0,0.0,0.0
280,1.0,12.459883,4.486964,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.974745,8.974745,4.615121
644,1.0,0.0,4.585845,0.0,0.0,0.0,10.916596,12.341774,0.0,4.603001,...,0.0,0.0,0.0,4.522061,9.480368,4.615121,0.0,0.0,11.419352,4.583194
394,1.0,0.0,4.461252,4.261539,0.0,13.6117,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.090518,0.0,0.0,14.189002,0.0,13.50188,4.293248
391,1.0,15.931886,4.327794,0.0,0.0,0.0,0.0,11.626263,11.560772,4.550299,...,0.0,0.0,13.551032,3.600408,0.0,0.0,13.481895,13.186339,13.159042,4.33377


#### 3.1. Ajustar el modelo

In [14]:
fit_lm_basic = smf.ols( formula_basic, data = data_train ).fit()

#### 3.2. Calcular out-of-sample MSE y Standard Error

In [15]:
# Only MSE

yhat_lm_basic = fit_lm_basic.predict( data_test )
print("The mean squared error (MSE) using the basic model is equal to:", ( ( Y_test - yhat_lm_basic )**2 ).mean() )

The mean squared error (MSE) using the basic model is equal to: 9024337174724718.0


In [16]:
# MSE and SE

resid_basic  = ( Y_test - yhat_lm_basic )**2

MSE_lm_basic = sm.OLS( resid_basic , np.ones( resid_basic.shape[ 0 ] ) ).fit().summary2().tables[ 1 ].iloc[ 0, 0:2 ]
MSE_lm_basic

Coef.       9.024337e+15
Std.Err.    1.299489e+15
Name: const, dtype: float64

#### 3.3. Calcular R Cuadrado out-of-sample

In [17]:
R2_lm_basic = 1 - ( MSE_lm_basic[ 0 ]/Y_test.var() )
print( f"The R^2 using the basic model is equal to: { R2_lm_basic }" )

The R^2 using the basic model is equal to: -963.2140915242526


## 4. Lasso, Ridge and Elastic Net

#### 4.1. Theoretical Lasso from hdm package

In [48]:
import hdmpy as hdm

In [49]:
fit_rlasso = hdm.rlasso( x_basic_train.to_numpy() , Y_train.to_numpy().reshape( Y_train.size , 1 ) , post = False )
fit_rlasso_post = hdm.rlasso( x_basic_train.to_numpy() , Y_train.to_numpy().reshape( Y_train.size , 1 ) , post = True )

In [50]:
# Getting mean of each variable
meanx = x_basic_test.mean( axis = 0 ).values.\
                        reshape( x_basic_test.shape[ 1 ] , 1 )

# Reducing the mean
new_x1 = x_basic_test.to_numpy() - \
                    ( np.ones( ( x_basic_test.shape[ 0 ] , 1 ) ) @ meanx.T )

# Getting the significant variables
x1_est_rlasso = new_x1[ :, fit_rlasso.est[ 'index' ].iloc[ :, 0 ].to_list() ]

# Convert fit_rlasso.est[ 'beta' ] to dataframe
fit_rlasso_est = pd.DataFrame( fit_rlasso.est[ 'beta' ] )

# Getting the coef. from significant variables
beta_rlasso = fit_rlasso_est.loc[ fit_rlasso.est[ 'index' ].\
                                  loc[ :, 0 ].to_list(), ].to_numpy()

# yhat
yhat_rlasso = ( x1_est_rlasso @ beta_rlasso ) + np.mean( Y_test.to_numpy() )
residuals_rlasso = Y_test.to_numpy().reshape( Y_test.to_numpy().size, 1)  - yhat_rlasso

In [50]:
# Getting mean of each variable
meanx = x_basic_test.mean( axis = 0 ).values.\
                        reshape( x_basic_test.shape[ 1 ] , 1 )

# Reducing the mean
new_x1 = x_basic_test.to_numpy() - \
                    ( np.ones( ( x_basic_test.shape[ 0 ] , 1 ) ) @ meanx.T )

# Getting the significant variables
x1_est_rlasso = new_x1[ :, fit_rlasso.est[ 'index' ].iloc[ :, 0 ].to_list() ]

# Convert fit_rlasso.est[ 'beta' ] to dataframe
fit_rlasso_est = pd.DataFrame( fit_rlasso.est[ 'beta' ] )

# Getting the coef. from significant variables
beta_rlasso = fit_rlasso_est.loc[ fit_rlasso.est[ 'index' ].\
                                  loc[ :, 0 ].to_list(), ].to_numpy()

# yhat
yhat_rlasso = ( x1_est_rlasso @ beta_rlasso ) + np.mean( Y_test.to_numpy() )
residuals_rlasso = Y_test.to_numpy().reshape( Y_test.to_numpy().size, 1)  - yhat_rlasso

In [50]:
# Getting mean of each variable
meanx = x_basic_test.mean( axis = 0 ).values.\
                        reshape( x_basic_test.shape[ 1 ] , 1 )

# Reducing the mean
new_x1 = x_basic_test.to_numpy() - \
                    ( np.ones( ( x_basic_test.shape[ 0 ] , 1 ) ) @ meanx.T )

# Getting the significant variables
x1_est_rlasso = new_x1[ :, fit_rlasso.est[ 'index' ].iloc[ :, 0 ].to_list() ]

# Convert fit_rlasso.est[ 'beta' ] to dataframe
fit_rlasso_est = pd.DataFrame( fit_rlasso.est[ 'beta' ] )

# Getting the coef. from significant variables
beta_rlasso = fit_rlasso_est.loc[ fit_rlasso.est[ 'index' ].\
                                  loc[ :, 0 ].to_list(), ].to_numpy()

# yhat
yhat_rlasso = ( x1_est_rlasso @ beta_rlasso ) + np.mean( Y_test.to_numpy() )
residuals_rlasso = Y_test.to_numpy().reshape( Y_test.to_numpy().size, 1)  - yhat_rlasso

In [52]:
# Getting mean of each variable
meanx = x_basic_test.mean( axis = 0 ).values.\
                        reshape(x_basic_test.shape[ 1 ] , 1 )

# Reducing the mean
new_x1 = x_basic_test.to_numpy() - \
                    (np.ones( (x_basic_test.shape[ 0 ] , 1 ) ) @ meanx.T)

# Getting the significant variables
x1_est_rlasso_post = new_x1[ :, fit_rlasso_post.est['index'].iloc[:, 0].to_list()]

# Convert fit_rlasso.est[ 'beta' ] to dataframe
fit_rlasso_post_est = pd.DataFrame( fit_rlasso_post.est[ 'beta' ] )

# Getting the coef. from significant variables
beta_rlasso_post = fit_rlasso_post_est.loc[ fit_rlasso_post.est['index'].\
                                            iloc[:, 0].to_list(), ].to_numpy()

# yhat
yhat_rlasso_post = (x1_est_rlasso_post @ beta_rlasso_post) + np.mean( Y_test.to_numpy() )
residuals_rlasso_post = Y_test.to_numpy().reshape( Y_test.to_numpy().size, 1)  - yhat_rlasso_post

In [54]:
MSE_lasso = sm.OLS( ( residuals_rlasso )**2 , np.ones( yhat_rlasso.size )  ).fit().summary2().tables[1].round(3)
MSE_lasso_post = sm.OLS( ( residuals_rlasso_post )**2  , np.ones( yhat_rlasso_post.size )  ).fit().summary2().tables[1].round(3)

R2_lasso = 1 - MSE_lasso.iloc[0, 0]/ np.var( Y_test )
R2_lasso_post = 1 - MSE_lasso_post.iloc[0, 0]/ np.var( Y_test )

print( f"The R^2 using the basic model is equal to {R2_lasso},for lasso and {R2_lasso_post} for post-lasso")

The R^2 using the basic model is equal to -4.440892098500626e-16,for lasso and -4.440892098500626e-16 for post-lasso


#### 4.2. Cross Validated Lasso, Ridge and Elastic Net

In [37]:
from sklearn.linear_model import LassoCV, RidgeCV, ElasticNetCV

In [38]:
fit_lasso_cv = LassoCV( cv = 10, fit_intercept = True, normalize = False, random_state = 0 ).fit( x_basic_train, Y_train )
fit_ridge = RidgeCV( cv = 10, fit_intercept = True, normalize = False, scoring = None ).fit( x_basic_train, Y_train )
fit_elnet = ElasticNetCV( cv = 10, fit_intercept = True, normalize = False, random_state = 0 ).fit( x_basic_train, Y_train )

yhat_lasso_cv = fit_lasso_cv.predict( x_basic_test )
yhat_ridge = fit_ridge.predict( x_basic_test )
yhat_elnet = fit_elnet.predict( x_basic_test )

residual_lasso = ( yhat_lasso_cv - Y_test )**2
residual_ridge = ( yhat_ridge - Y_test )**2
residual_elnet = ( yhat_elnet - Y_test )**2

MSE_lasso_cv = sm.OLS( residual_lasso, np.ones( Y_test.size )).fit().summary2().tables[ 1 ].round( 3 )
MSE_ridge = sm.OLS( residual_ridge, np.ones( Y_test.size )).fit().summary2().tables[ 1 ].round( 3 )
MSE_elnet = sm.OLS( residual_elnet, np.ones( Y_test.size )).fit().summary2().tables[ 1 ].round( 3 )

R2_lasso_cv = 1 - MSE_lasso_cv.iloc[ 0, 0 ] / np.var( Y_test )
R2_ridge = 1 - MSE_ridge.iloc[ 0, 0 ]  / np.var( Y_test )
R2_elnet = 1 - MSE_elnet.iloc[ 0, 0 ]  / np.var( Y_test )

print( "R^2 using cross-validation for lasso, ridge, and elastic net in the basic model: {:.5f}, {:.5f}, {:.5f}".format( R2_lasso_cv, R2_ridge, R2_elnet ) )

R^2 using cross-validation for lasso, ridge, and elastic net in the basic model: -1.48132, -729.22509, -1.48361


## 5. Non Linear Models

#### 5.1. Regression Trees

In [39]:
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree

In [40]:
# Fit the model

fit_trees = DecisionTreeRegressor( random_state = 0, min_impurity_decrease = 0.001 )
fit_trees.fit( x_basic_train, y_basic_train )

DecisionTreeRegressor(min_impurity_decrease=0.001, random_state=0)

In [None]:
# Plot the tree

# from sklearn.tree import plot_tree

# plt.figure( figsize=( 30, 20 ) )
# plot_tree( fit_trees, filled = True )
# plt.show()

In [41]:
# Determine the optimar complexity of the regression tree

s = pd.DataFrame( fit_trees.cost_complexity_pruning_path( y_basic_train, x_basic_train ) )
s.head()

Unnamed: 0,ccp_alphas,impurities
0,0.0,3.659034
1,0.006523,3.665557
2,0.006967,3.672524
3,0.007917,3.680441
4,0.00811,3.688551


In [None]:
### PREGUNTA: DE DÓNDE SALE ESE VALOR DE ALPHA

In [42]:
# Prune the tree

fit_prunnedtree = DecisionTreeRegressor( ccp_alpha = 0.00188444410871555 )
fit_prunnedtree.fit( x_basic_train, y_basic_train )

DecisionTreeRegressor(ccp_alpha=0.00188444410871555)

In [None]:
# Plot the prunned tree

# plot_tree(fit_prunnedtree, filled=True)
# plt.show()

In [43]:
# Calculate MSE and R2 for prunned tree

y_hat_pt = fit_prunnedtree.predict( x_basic_test )
residual_pt = ( y_hat_pt - Y_test )**2
MSE_pt = sm.OLS( residual_pt, np.ones( y_hat_pt.size )).fit().summary2().tables[ 1 ].round( 3 )
R2_pt = 1 - MSE_pt.iloc[ 0, 0 ]/np.var( Y_test )
print( f"R^2 of the pruned tree: { R2_pt }" )

R^2 of the pruned tree: -8.14709318760311


#### 5.2. Random Forest and Booested Trees

In [44]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor 

In [45]:
# random forest
fit_rf = RandomForestRegressor( n_estimators = 2000, min_samples_leaf = 5).fit( x_basic_train, Y_train )

# boosting
fit_boost = GradientBoostingRegressor( loss = 'ls', learning_rate = 0.01, n_estimators = 1000, max_depth = 2, subsample = 0.5).fit( x_basic_train, Y_train )

# Evaluating the methods
yhat_rf = fit_rf.predict( x_basic_test )
residual_rf = ( yhat_rf - Y_test )**2
yhat_boost = fit_boost.predict( x_basic_test )
residual_bst = ( yhat_boost - Y_test )**2

# Calculate MSE
MSE_rf = sm.OLS( residual_pt, np.ones( yhat_rf.size )).fit().summary2().tables[ 1 ].round( 3 )
MSE_bst = sm.OLS( residual_bst, np.ones( yhat_boost.size )).fit().summary2().tables[ 1 ].round( 3 )

# Calculate R2
R2_rf = 1 - MSE_rf.iloc[ 0, 0 ] / Y_test.var()
R2_boost = 1 - MSE_bst.iloc[ 0, 0 ] / Y_test.var()

In [46]:
print( "R^2 of the random forest and boosted trees:{:.5f}, {:.5f}".format( R2_rf, R2_boost ) )

R^2 of the random forest and boosted trees:-8.08357, -77.62733


## 6. Resultados

In [47]:
table = pd.DataFrame(columns=["MSE", "S.E for MSE", "R-squared"]) 
table.loc[0]  = [MSE_lm_basic[0], MSE_lm_basic[1], R2_lm_basic]
table.loc[1]  = [MSE_lasso.iloc[0, 0], MSE_lasso.iloc[0, 1], R2_lasso]
table.loc[2]  = [MSE_lasso.iloc[0, 0], MSE_lasso_post.iloc[0, 1], R2_lasso_post]
table.loc[3]  = [MSE_lasso_cv.iloc[0, 0], MSE_lasso_cv.iloc[0, 1], R2_lasso_cv]
table.loc[4]  = [MSE_ridge.iloc[0, 0], MSE_ridge.iloc[0, 1], R2_ridge]
table.loc[5]  = [MSE_elnet.iloc[0, 0], MSE_elnet.iloc[0, 1], R2_elnet]
table.loc[6] = [MSE_rf.iloc[0, 0], MSE_rf.iloc[0, 1], R2_rf]
table.loc[7] = [MSE_bst.iloc[0, 0], MSE_bst.iloc[0, 1], R2_boost]
table.loc[8] = [MSE_pt.iloc[0, 0], MSE_pt.iloc[0, 1], R2_pt]
models_row = [ "Least Squares (basic)", "Lasso", "Post_Lasso", "Cross-Validated lasso", 
               "Cross-Validated ridge", "Cross-Validated elnet", "Random Forest", 
               "Boosted Trees", "Pruned Tree" ]
table.insert( 0, "Models", models_row )
table

Unnamed: 0,Models,MSE,S.E for MSE,R-squared
0,Least Squares (basic),9024337000000000.0,1299489000000000.0,-963.2141
1,Lasso,9294272000000.0,2536563000000.0,-4.440892e-16
2,Post_Lasso,9294272000000.0,2536563000000.0,-4.440892e-16
3,Cross-Validated lasso,23062080000000.0,1382896000000.0,-1.481321
4,Cross-Validated ridge,6786911000000000.0,989505800000000.0,-729.2251
5,Cross-Validated elnet,23083300000000.0,1393331000000.0,-1.483605
6,Random Forest,85015570000000.0,23630410000000.0,-8.083572
7,Boosted Trees,735894100000000.0,332725500000000.0,-77.62733
8,Pruned Tree,85015570000000.0,23630410000000.0,-8.147093
