# Bajar-Microdatos.ipynb
El objetivo de esta notebook es presentar código que permite sistematizar la descarga de datos de todos los trimestres de 2016 en adelante de la Encuesta Permanente de Hogares, dando o cómo resultado un archivo en formato .csv para los datos correspondientes a los hogares y otro igual para los datos correspondientes a individuos. Por último, se presenta un breve análisis como un pequeño ejemplo de un análisis posible con estos datos.
La fuente de los datos se puede encontrar en [este](https://www.indec.gob.ar/indec/web/Institucional-Indec-BasesDeDatos) sitio web.

In [1]:
import os
import time
import zipfile
import shutil
import re
from tqdm import tqdm
import polars as pl
import numpy as np
# import matplotlib.pyplot as plt

from create_urls import *

In [2]:
with open('last_known_quarter.txt') as f:
    last_known_quarter = f.read()

last_known_quarter

'24-1T'

In [3]:
known_key_list = create_key_list(last_quarter=last_known_quarter)
known_key_list

['16-2T',
 '16-3T',
 '16-4T',
 '17-1T',
 '17-2T',
 '17-3T',
 '17-4T',
 '18-1T',
 '18-2T',
 '18-3T',
 '18-4T',
 '19-1T',
 '19-2T',
 '19-3T',
 '19-4T',
 '20-1T',
 '20-2T',
 '20-3T',
 '20-4T',
 '21-1T',
 '21-2T',
 '21-3T',
 '21-4T',
 '22-1T',
 '22-2T',
 '22-3T',
 '22-4T',
 '23-1T',
 '23-2T',
 '23-3T',
 '23-4T',
 '24-1T']

In [4]:
known_url_list = create_url_list(known_key_list)
known_url_list

['https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_2doTrim_2016_txt.zip',
 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_3erTrim_2016_txt.zip',
 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_4toTrim_2016_txt.zip',
 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_1er_Trim_2017_txt.zip',
 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_2_Trim_2017_txt.zip',
 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_3_Trim_2017_txt.zip',
 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_4_Trim_2017_txt.zip',
 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_1_Trim_2018_txt.zip',
 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_2_Trim_2018_txt.zip',
 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_3_Trim_2018_txt.zip',
 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_4_Trim_2018_txt.zip',
 'https://www.indec.gob.ar/ftp/cuadros/menusuperi

In [5]:
known_url_dict = create_known_urls(last_known_quarter)
known_url_dict

{'16-2T': 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_2doTrim_2016_txt.zip',
 '16-3T': 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_3erTrim_2016_txt.zip',
 '16-4T': 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_4toTrim_2016_txt.zip',
 '17-1T': 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_1er_Trim_2017_txt.zip',
 '17-2T': 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_2_Trim_2017_txt.zip',
 '17-3T': 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_3_Trim_2017_txt.zip',
 '17-4T': 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_4_Trim_2017_txt.zip',
 '18-1T': 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_1_Trim_2018_txt.zip',
 '18-2T': 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_2_Trim_2018_txt.zip',
 '18-3T': 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_3_Trim_2018_txt.zip',
 '18-4T': 'https://www.indec.gob.ar/ftp/cuadr

In [6]:
unknown_url_dict = create_unknown_urls(5, last_known_quarter)
unknown_url_dict

{'24-2T': 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_2_Trim_2024_txt.zip',
 '24-3T': 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_3_Trim_2024_txt.zip',
 '24-4T': 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_4_Trim_2024_txt.zip',
 '25-1T': 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_1_Trim_2025_txt.zip',
 '25-2T': 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_2_Trim_2025_txt.zip'}

In [7]:
unknown_valid_url_dict = test_urls(unknown_url_dict)
unknown_valid_url_dict

{'24-2T': 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_2_Trim_2024_txt.zip'}

In [8]:
# Create the "datos" folder if it doesn't exist
if 'datos' not in os.listdir():
    os.makedirs('datos')
    os.makedirs('datos/zip')

# Download the files
for key in tqdm(known_url_dict, desc='Downloading files'):
    url = known_url_dict[key]
    response = requests.get(url, stream=True)
    
    # If the website responds 
    if response.status_code == 200:
        with open(f'datos/zip/{key}.zip', 'wb') as f:
            f.write(response.content)

        # Try to avoid DDoS detection
        time.sleep(1)

    else:
        print(f'Failed to download {key}')

Downloading files: 100%|██████████| 32/32 [00:48<00:00,  1.53s/it]


In [9]:
# Checking the files
error_files = 0
zip_dir = 'datos/zip'

for file in os.listdir(zip_dir):
    zip_path = os.path.join(zip_dir, file)
    try:
        with zipfile.ZipFile(zip_path, 'r') as z:
            z.testzip()
    except zipfile.BadZipFile:
        error_files += 1
        print(f'{file} is not a valid zip file')

if error_files == 0:
    print('It looks like all the zip files are valid')
else:
    print(f'Found {error_files} invalid zip file(s)')

It looks like all the zip files are valid


In [10]:
# Create the folders inside the "datos" folder
if 'individual' not in os.listdir('datos'):
    os.mkdir('datos/individual')
if 'hogar' not in os.listdir('datos'):
    os.mkdir('datos/hogar')

In [11]:
def extract_rename_and_move(key):
    """This function extracts the .txt files from the .zip file, renames them 
    and moves them to the correct folders (datos/individual and datos/hogar). To
    simplify the code a bit it creates a folder called extracted, which it uses 
    to store the extracted files, and it deletes the folder at the end 
    Args:
        key (str): the key corresponding to the quarter. Its format should be
        yy-QT, so that, for example, 23-2T is the second quarter of 2023.
    """    
    # Creating the "extracted" directory
    extr_dir = 'datos/extracted'
    if not os.path.exists(extr_dir):
        os.makedirs(extr_dir)
    
    zip_dir = f'datos/zip/{key}.zip'
    
    # Extracting the files without the folders
    with zipfile.ZipFile(zip_dir, 'r') as z:
        for i in z.namelist():
            filename = os.path.basename(i)
            # Skip directories
            if not filename:
                continue
            
            source = z.open(i)
            target_path = os.path.join(extr_dir, filename)
            with open(target_path, "wb") as target:
                shutil.copyfileobj(source, target)
            source.close()
    
    # Regex patterns to try to avoid problems with filenames
    hogar_re = r'[hH][oO][gG][aA][rR]'
    individual_re = r'[iI][nN][dD][iI][vV][iI][dD][uU][aA][lL]'
    
    # Looping over the recently extracted files
    for filename in os.listdir(extr_dir):
        old_path = os.path.join(extr_dir, filename)
        
        # Finding the "hogar" data
        if re.search(hogar_re, filename) and filename.endswith('.txt'):
            new_filename = f'{key}-hogar.txt'
            new_path = os.path.join('datos', 'hogar', new_filename)
            os.rename(old_path, new_path)
        
        # Finding the "individual" data
        elif ((re.search(individual_re, filename) or 'personas' in filename) and 
              filename.endswith('.txt')):
            new_filename = f'{key}-individual.txt'
            new_path = os.path.join('datos', 'individual', new_filename)
            os.rename(old_path, new_path)
    
    # Deleting the "extracted" folder with all of its contents
    shutil.rmtree(extr_dir)

In [12]:
# Extracting, renaming and moving the data. Also deletes the zip folder
for key, value in known_url_dict.items():
    extract_rename_and_move(key)
shutil.rmtree('datos/zip')

In [13]:
# Checking if all the files for the individual datasets are there
equal_counter = 0
for file in os.listdir('datos/individual'):
    for key, value in known_url_dict.items():
        if file[:5] == key:
            equal_counter += 1
            continue

if equal_counter == len(known_url_dict):
    print('All the files for the individual dataset are available.')
else:
    print(f'There are {len(known_url_dict) - equal_counter} missing files')

# Checking if all the files for the hogar dataset are there
equal_counter = 0
for file in os.listdir('datos/hogar'):
    for key, value in known_url_dict.items():
        if file[:5] == key:
            equal_counter += 1
            continue

if equal_counter == len(known_url_dict):
    print('All the files for the hogar dataset are available.')
else:
    print(f'There are {len(known_url_dict) - equal_counter} missing files')

All the files for the individual dataset are available.
All the files for the hogar dataset are available.


In [14]:
def read_and_format(path):
    df = pl.read_csv(path, separator=';')
    # Cast all i64 columns to i32
    df = df.with_columns([
        df[col].cast(pl.Int32) for col in df.columns
        if df[col].dtype == pl.Int64
])
    # Cast all f64 columns to f32
    df = df.with_columns([
    df[col].cast(pl.Float32) for col in df.columns
    if df[col].dtype == pl.Float64
])
    return df

In [17]:
read_and_format('datos/hogar/16-3T-hogar.txt')

CODUSU,ANO4,TRIMESTRE,NRO_HOGAR,REALIZADA,REGION,MAS_500,AGLOMERADO,PONDERA,IV1,IV1_ESP,IV2,IV3,IV3_ESP,IV4,IV5,IV6,IV7,IV7_ESP,IV8,IV9,IV10,IV11,IV12_1,IV12_2,IV12_3,II1,II2,II3,II3_1,II4_1,II4_2,II4_3,II5,II5_1,II6,II6_1,…,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19_A,V19_B,IX_TOT,IX_MEN10,IX_MAYEQ10,ITF,DECIFR,IDECIFR,RDECIFR,GDECIFR,PDECIFR,ADECIFR,IPCF,DECCFR,IDECCFR,RDECCFR,GDECCFR,PDECCFR,ADECCFR,PONDIH,VII1_1,VII1_2,VII2_1,VII2_2,VII2_3,VII2_4
str,i32,i32,i32,i32,i32,str,i32,i32,i32,str,i32,i32,str,i32,i32,i32,i32,str,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,…,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,str,i32,str,i32,i32,i32,i32,str,i32,i32,i32,i32,i32,i32,i32,i32
"""TQRMNOQPQHJMLSCDEFIAH00487651""",2016,3,1,1,43,"""S""",2,521,1,,3,1,,4,1,1,1,,1,1,1,1,2,2,2,3,1,2,0,1,2,1,2,0,2,0,…,2,2,2,2,2,1,2,1,2,2,2,2,2,2,0,2,13500,5,5,5,5,""" """,5,"""6750""",6,6,6,6,""" """,5,822,2,0,1,0,0,0
"""TQRMNOQPSHJMLSCDEFIAH00469127""",2016,3,1,1,43,"""S""",2,521,1,,4,1,,4,1,1,1,,1,1,1,1,2,2,2,4,3,2,0,1,2,1,2,0,2,0,…,2,2,2,2,2,2,2,2,2,2,2,2,2,5,0,5,0,12,12,12,12,""" """,12,"""0""",12,12,12,12,""" """,12,0,2,0,98,0,0,0
"""TQRMNOQPUHJMLSCDEFIAH00469128""",2016,3,1,1,43,"""S""",2,521,1,,4,1,,9,1,1,1,,1,1,1,1,2,2,2,4,1,2,0,1,2,2,2,0,2,0,…,2,2,2,2,2,2,2,2,2,1,2,2,2,2,0,2,13750,5,5,5,5,""" """,5,"""6875""",6,6,6,6,""" """,5,684,1,0,2,0,0,0
"""TQRMNORUTHJMLSCDEFIAH00469130""",2016,3,1,1,43,"""S""",2,521,1,,3,1,,4,1,1,1,,1,1,1,1,2,2,2,3,1,2,0,1,2,2,2,0,2,0,…,2,2,2,2,2,2,2,2,1,2,2,2,2,2,0,2,22350,7,7,7,7,""" """,7,"""11175""",8,9,8,8,""" """,8,897,1,0,98,0,0,0
"""TQRMNOPPVHJOLTCDEFIAH00469154""",2016,3,1,1,43,"""S""",2,622,1,,3,1,,4,2,1,1,,1,1,1,1,2,2,2,3,2,2,0,1,2,2,2,0,2,0,…,2,2,2,2,2,2,1,1,2,2,2,2,2,4,1,3,7500,2,2,2,2,""" """,2,"""1875""",1,1,1,1,""" """,1,661,1,0,98,0,0,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""TQRMNOTRRHKLKUCDEIMBF00497054""",2016,3,1,1,43,"""N""",36,141,1,,2,1,,1,1,1,1,,1,1,1,1,2,2,2,2,2,2,0,1,2,1,2,0,2,0,…,2,2,2,2,1,2,1,1,1,2,2,2,2,4,2,2,18500,6,7,7,,"""07""",7,"""4625""",4,4,4,,"""04""",4,210,2,0,1,0,0,0
"""TQRMNOTRSHKLKUCDEIMBF00497055""",2016,3,1,1,43,"""N""",36,141,1,,6,1,,1,1,1,1,,1,1,1,1,2,2,2,6,1,2,0,1,1,1,2,0,2,0,…,2,2,2,2,2,2,2,2,2,2,2,2,2,1,0,1,3000,1,1,1,,"""01""",1,"""3000""",2,3,2,,"""03""",2,170,1,0,96,0,0,0
"""TQRMNOQSSHKMKRCDEIMBF00497056""",2016,3,1,1,43,"""N""",36,127,1,,2,1,,2,1,1,1,,1,1,1,1,2,2,2,2,2,2,0,1,1,1,2,0,2,0,…,2,2,2,2,2,2,2,2,1,2,2,2,2,5,2,3,0,12,12,12,,"""12""",12,"""0""",12,12,12,,"""12""",12,0,1,2,3,0,0,0
"""TQRMNOSPYHKLKTCDEIMBF00497057""",2016,3,1,1,43,"""N""",36,165,1,,3,1,,1,1,1,1,,1,1,1,1,2,2,2,3,1,2,0,1,2,2,2,0,2,0,…,2,2,2,2,2,1,2,2,2,2,2,2,2,1,0,1,50000,10,10,10,,"""10""",10,"""50000""",10,10,10,,"""10""",10,228,1,0,96,0,0,0


In [20]:
hogar_file_list = sorted(os.listdir('datos/hogar'))

for i in enumerate(hogar_file_list):
    index = i[0]
    file = i[1]
    path_str = 'datos/hogar/' + file
    
    current_df = read_and_format(path_str)
    
    if index == 0:
        hogar = current_df
    else:
        # Ensure column schemas match
        for col_name, col_type in hogar.schema.items():
            if col_name in current_df.schema:
                current_df = current_df.with_columns(pl.col(col_name).cast(col_type))
        hogar = pl.concat([hogar, current_df])
    
    print(file)

hogar


16-2T-hogar.txt
16-3T-hogar.txt
16-4T-hogar.txt
17-1T-hogar.txt
17-2T-hogar.txt
17-3T-hogar.txt
17-4T-hogar.txt
18-1T-hogar.txt
18-2T-hogar.txt
18-3T-hogar.txt
18-4T-hogar.txt


InvalidOperationError: conversion from `str` to `i32` failed in column 'IDECIFR' for 26 out of 76 values: ["  ", "  ", … "  "]

## Leyendo los datos a una tabla
En esta sección se leen los archivos para crear una tabla para los datos de hogares y otra para los datos de individuos.
Dado que la cantidad de datos puede causar que las computadoras se queden sin memoria (sobre todo si la PC tiene 8gb de RAM o menos) se recomienda usar las variables hogar_cols_to_keep y ind_cols_to_keep para listar las variables que se necesitan en los datasets de hogares e individuos, respectivamente, y eliminar las restantes.   
Los nombres de las variables se pueden conseguir en el diseño de registro de la EPH, aunque en algunos casos los nombres de las variables pueden tener pequeños errores (por ejemplo, en el registro existe una variable llamada "IX_Tot", pero su nombre en los archivos es "IX_TOT").
Al final de esta sección se guardan los archivos hogar.csv e individuos.csv en la carpeta "datos", ambos conteniendo los datos de sus respectivos datasets.   
En este caso, a modo de ejemplo, se eligen las variables "CODUSU", "NRO_HOGAR", "IX_TOT", "DECIFR" y"DECCFR" en el dataset de hogares y "CODUSU", "NRO_HOGAR" y "DECINDR" en dataset de individuos. El movtivo por el cual se eligen estas variables de detalla en la próxima sección.

In [17]:
def read_data(key, hogar_or_individual):
    """This function reads a txt file as formatted by INDEC and returns a pandas
    DataFrame with the data and a few small optimizations to save memory. It is
    recommended to use cols_to_keep to reduce memory footprint.
    Args:
        key (str): the key corresponding to the quarter. It's format should be
        yy-QT, so that, for example, 23-2T is the second quarter of 2023.
        hogar_or_individual (str): should be either "hogar" or "individual".
        cols_to_keep (list, optional): a list of columns to keep, and the rest
        are discarded. If None, all the columns are kept. Defaults to None.

    Returns:
        pandas.DataFrame: a DataFrame containing the data from the .txt file.
    """   
    # Reading the data
    df = pd.read_csv(f'datos/{hogar_or_individual}/{key}-' +
                     f'{hogar_or_individual}.txt', sep=';',
                     low_memory=False)
    
    # Converting int64 to int16 to save memory
    d = dict.fromkeys(df.select_dtypes('int').columns, 'int16')
    df = df.astype(d).copy()

    # Converting fp64 to fp16 to save memory
    d = dict.fromkeys(df.select_dtypes('float').columns, 'float16')
    df = df.astype(d).copy()


    # Converting the date data to quarters
    date_df = df[['ANO4', 'TRIMESTRE']].astype(str).copy()
    quarter = pd.to_datetime(date_df['ANO4'] + 'Q' + date_df['TRIMESTRE'])
    quarter = quarter.dt.to_period('Q')
    df['quarter'] = quarter.copy()
    df = df.drop(columns=['ANO4', 'TRIMESTRE'])

    return df