# Bajar-Microdatos.ipynb
El objetivo de esta notebook es presentar código que permite sistematizar la descarga de datos de todos los trimestres de 2016 en adelante de la Encuesta Permanente de Hogares, dando o cómo resultado un archivo en formato .csv para los datos correspondientes a los hogares y otro igual para los datos correspondientes a individuos. Por último, se presenta un breve análisis como un pequeño ejemplo de un análisis posible con estos datos.
La fuente de los datos se puede encontrar en [este](https://www.indec.gob.ar/indec/web/Institucional-Indec-BasesDeDatos) sitio web.

In [1]:
import os
import time
import zipfile
import shutil
import re
import io
from tqdm import tqdm
import polars as pl
import numpy as np
# import matplotlib.pyplot as plt

from create_urls import *

In [2]:
with open('last_known_quarter.txt') as f:
    last_known_quarter = f.read()

last_known_quarter

'24-1T'

In [3]:
known_key_list = create_key_list(last_quarter=last_known_quarter)
known_key_list

['16-2T',
 '16-3T',
 '16-4T',
 '17-1T',
 '17-2T',
 '17-3T',
 '17-4T',
 '18-1T',
 '18-2T',
 '18-3T',
 '18-4T',
 '19-1T',
 '19-2T',
 '19-3T',
 '19-4T',
 '20-1T',
 '20-2T',
 '20-3T',
 '20-4T',
 '21-1T',
 '21-2T',
 '21-3T',
 '21-4T',
 '22-1T',
 '22-2T',
 '22-3T',
 '22-4T',
 '23-1T',
 '23-2T',
 '23-3T',
 '23-4T',
 '24-1T']

In [4]:
known_url_list = create_url_list(known_key_list)
known_url_list

['https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_2doTrim_2016_txt.zip',
 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_3erTrim_2016_txt.zip',
 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_4toTrim_2016_txt.zip',
 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_1er_Trim_2017_txt.zip',
 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_2_Trim_2017_txt.zip',
 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_3_Trim_2017_txt.zip',
 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_4_Trim_2017_txt.zip',
 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_1_Trim_2018_txt.zip',
 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_2_Trim_2018_txt.zip',
 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_3_Trim_2018_txt.zip',
 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_4_Trim_2018_txt.zip',
 'https://www.indec.gob.ar/ftp/cuadros/menusuperi

In [5]:
known_url_dict = create_known_urls(last_known_quarter)
known_url_dict

{'16-2T': 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_2doTrim_2016_txt.zip',
 '16-3T': 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_3erTrim_2016_txt.zip',
 '16-4T': 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_4toTrim_2016_txt.zip',
 '17-1T': 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_1er_Trim_2017_txt.zip',
 '17-2T': 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_2_Trim_2017_txt.zip',
 '17-3T': 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_3_Trim_2017_txt.zip',
 '17-4T': 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_4_Trim_2017_txt.zip',
 '18-1T': 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_1_Trim_2018_txt.zip',
 '18-2T': 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_2_Trim_2018_txt.zip',
 '18-3T': 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_3_Trim_2018_txt.zip',
 '18-4T': 'https://www.indec.gob.ar/ftp/cuadr

In [6]:
unknown_url_dict = create_unknown_urls(5, last_known_quarter)
unknown_url_dict

{'24-2T': 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_2_Trim_2024_txt.zip',
 '24-3T': 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_3_Trim_2024_txt.zip',
 '24-4T': 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_4_Trim_2024_txt.zip',
 '25-1T': 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_1_Trim_2025_txt.zip',
 '25-2T': 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_2_Trim_2025_txt.zip'}

In [7]:
unknown_valid_url_dict = test_urls(unknown_url_dict)
unknown_valid_url_dict

{'24-2T': 'https://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/EPH_usu_2_Trim_2024_txt.zip'}

In [8]:
# Create the "datos" folder if it doesn't exist
if 'datos' not in os.listdir():
    os.makedirs('datos')
    os.makedirs('datos/zip')

# Download the files
for key in tqdm(known_url_dict, desc='Downloading files'):
    url = known_url_dict[key]
    response = requests.get(url, stream=True)
    
    # If the website responds 
    if response.status_code == 200:
        with open(f'datos/zip/{key}.zip', 'wb') as f:
            f.write(response.content)

        # Try to avoid DDoS detection
        time.sleep(1)

    else:
        print(f'Failed to download {key}')

Downloading files: 100%|██████████| 32/32 [01:13<00:00,  2.31s/it]


In [9]:
# Checking the files
error_files = 0
zip_dir = 'datos/zip'

for file in os.listdir(zip_dir):
    zip_path = os.path.join(zip_dir, file)
    try:
        with zipfile.ZipFile(zip_path, 'r') as z:
            z.testzip()
    except zipfile.BadZipFile:
        error_files += 1
        print(f'{file} is not a valid zip file')

if error_files == 0:
    print('It looks like all the zip files are valid')
else:
    print(f'Found {error_files} invalid zip file(s)')

It looks like all the zip files are valid


In [10]:
# Create the folders inside the "datos" folder
if 'individual' not in os.listdir('datos'):
    os.mkdir('datos/individual')
if 'hogar' not in os.listdir('datos'):
    os.mkdir('datos/hogar')

In [11]:
def extract_rename_and_move(key):
    """This function extracts the .txt files from the .zip file, renames them 
    and moves them to the correct folders (datos/individual and datos/hogar). To
    simplify the code a bit it creates a folder called extracted, which it uses 
    to store the extracted files, and it deletes the folder at the end 
    Args:
        key (str): the key corresponding to the quarter. Its format should be
        yy-QT, so that, for example, 23-2T is the second quarter of 2023.
    """    
    # Creating the "extracted" directory
    extr_dir = 'datos/extracted'
    if not os.path.exists(extr_dir):
        os.makedirs(extr_dir)
    
    zip_dir = f'datos/zip/{key}.zip'
    
    # Extracting the files without the folders
    with zipfile.ZipFile(zip_dir, 'r') as z:
        for i in z.namelist():
            filename = os.path.basename(i)
            # Skip directories
            if not filename:
                continue
            
            source = z.open(i)
            target_path = os.path.join(extr_dir, filename)
            with open(target_path, "wb") as target:
                shutil.copyfileobj(source, target)
            source.close()
    
    # Regex patterns to try to avoid problems with filenames
    hogar_re = r'[hH][oO][gG][aA][rR]'
    individual_re = r'[iI][nN][dD][iI][vV][iI][dD][uU][aA][lL]'
    
    # Looping over the recently extracted files
    for filename in os.listdir(extr_dir):
        old_path = os.path.join(extr_dir, filename)
        
        # Finding the "hogar" data
        if re.search(hogar_re, filename) and filename.endswith('.txt'):
            new_filename = f'{key}-hogar.txt'
            new_path = os.path.join('datos', 'hogar', new_filename)
            os.rename(old_path, new_path)
        
        # Finding the "individual" data
        elif ((re.search(individual_re, filename) or 'personas' in filename) and 
              filename.endswith('.txt')):
            new_filename = f'{key}-individual.txt'
            new_path = os.path.join('datos', 'individual', new_filename)
            os.rename(old_path, new_path)
    
    # Deleting the "extracted" folder with all of its contents
    shutil.rmtree(extr_dir)

In [12]:
# Extracting, renaming and moving the data. Also deletes the zip folder
for key, value in known_url_dict.items():
    extract_rename_and_move(key)
shutil.rmtree('datos/zip')

In [13]:
# Checking if all the files for the individual datasets are there
equal_counter = 0
for file in os.listdir('datos/individual'):
    for key, value in known_url_dict.items():
        if file[:5] == key:
            equal_counter += 1
            continue

if equal_counter == len(known_url_dict):
    print('All the files for the individual dataset are available.')
else:
    print(f'There are {len(known_url_dict) - equal_counter} missing files')

# Checking if all the files for the hogar dataset are there
equal_counter = 0
for file in os.listdir('datos/hogar'):
    for key, value in known_url_dict.items():
        if file[:5] == key:
            equal_counter += 1
            continue

if equal_counter == len(known_url_dict):
    print('All the files for the hogar dataset are available.')
else:
    print(f'There are {len(known_url_dict) - equal_counter} missing files')

All the files for the individual dataset are available.
All the files for the hogar dataset are available.


In [14]:
def read_and_format(path, schema_overrides=None):
    if schema_overrides is not None:
        df = pl.read_csv(path,
                         separator=';',
                         null_values=["NA"],
                         decimal_comma=True,
                         infer_schema_length=10000)
        
    else:
        df = pl.read_csv(path,
                         separator=';',
                         null_values=["NA"],
                         decimal_comma=True,
                         infer_schema_length=10000,
                         schema_overrides=schema_overrides)
        
    # Cast all i64 columns to i32
    df = df.with_columns([
        df[col].cast(pl.Int32) for col in df.columns
        if df[col].dtype == pl.Int64
])
    # Cast all f64 columns to f32
    df = df.with_columns([
    df[col].cast(pl.Float32) for col in df.columns
    if df[col].dtype == pl.Float64
])
    
    # Remove unnamed columns
    unnamed_columns = [col for col in df.columns if col == ""]
    if unnamed_columns:
        df = df.drop(unnamed_columns)
        
    return df

In [15]:
def process_txt_files(file_paths, schema_overrides=None):
    final_df = None

    for index, path_str in enumerate(file_paths):
        # Read and format the current file
        current_df = read_and_format(path_str, schema_overrides)

        if index == 0:
            # Initialize with the first file
            final_df = current_df
        else:
            # Clean invalid values and align column schemas
            for col_name, col_type in final_df.schema.items():
                if col_name in current_df.schema:
                    # Clean and align columns expected to be Int32
                    if col_type == pl.Int32:
                        current_df = current_df.with_columns(
                            pl.when(pl.col(col_name).cast(pl.Utf8).str.strip_chars() == "")
                            .then(None)
                            .otherwise(pl.col(col_name))
                            .alias(col_name)
                        )
                    # Cast the column to the expected type
                    current_df = current_df.with_columns(pl.col(col_name).cast(col_type))
            
            # Concatenate the aligned DataFrame
            final_df = pl.concat([final_df, current_df])

            # Align schemas
            for col in final_df.columns:
                if col not in current_df.columns:
                    current_df = current_df.with_columns(pl.lit(None).alias(col))
            
            for col in current_df.columns:
                if col not in final_df.columns:
                    current_df = current_df.drop(col)

        print(f"Processing file: {path_str}")

    return final_df

In [16]:
def save_dataframe_as_zip(df, zip_filepath, csv_filename):
    """
    Saves a Polars DataFrame as a CSV file compressed inside a ZIP archive
    directly to disk.

    Args:
        df (pl.DataFrame): The DataFrame to save.
        zip_filepath (str): The path for the ZIP file to save.
        csv_filename (str): The name of the CSV file inside the ZIP archive.
    """
    # Create a BytesIO object to hold the CSV data in memory
    csv_buffer = io.BytesIO()

    # Write the DataFrame to the CSV buffer in memory
    df.write_csv(csv_buffer)

    # Open the ZIP file and write the CSV content directly into it
    with zipfile.ZipFile(zip_filepath, mode='w', 
                         compression=zipfile.ZIP_DEFLATED) as zf:
        # Ensure the cursor is at the start of the buffer
        csv_buffer.seek(0)
        # Write the CSV buffer as a file inside the ZIP archive
        zf.writestr(csv_filename, csv_buffer.getvalue())

    print(f"DataFrame saved and compressed as {zip_filepath}")


In [17]:
base_path = 'datos/hogar'
hogar_file_list = sorted(os.listdir(base_path))
hogar_file_paths = [os.path.join(base_path, file) for file in hogar_file_list]

hogar = process_txt_files(hogar_file_paths)
hogar

Processing file: datos/hogar/16-2T-hogar.txt
Processing file: datos/hogar/16-3T-hogar.txt
Processing file: datos/hogar/16-4T-hogar.txt
Processing file: datos/hogar/17-1T-hogar.txt
Processing file: datos/hogar/17-2T-hogar.txt
Processing file: datos/hogar/17-3T-hogar.txt
Processing file: datos/hogar/17-4T-hogar.txt
Processing file: datos/hogar/18-1T-hogar.txt
Processing file: datos/hogar/18-2T-hogar.txt
Processing file: datos/hogar/18-3T-hogar.txt
Processing file: datos/hogar/18-4T-hogar.txt
Processing file: datos/hogar/19-1T-hogar.txt
Processing file: datos/hogar/19-2T-hogar.txt
Processing file: datos/hogar/19-3T-hogar.txt
Processing file: datos/hogar/19-4T-hogar.txt
Processing file: datos/hogar/20-1T-hogar.txt
Processing file: datos/hogar/20-2T-hogar.txt
Processing file: datos/hogar/20-3T-hogar.txt
Processing file: datos/hogar/20-4T-hogar.txt
Processing file: datos/hogar/21-1T-hogar.txt
Processing file: datos/hogar/21-2T-hogar.txt
Processing file: datos/hogar/21-3T-hogar.txt
Processing

CODUSU,ANO4,TRIMESTRE,NRO_HOGAR,REALIZADA,REGION,MAS_500,AGLOMERADO,PONDERA,IV1,IV1_ESP,IV2,IV3,IV3_ESP,IV4,IV5,IV6,IV7,IV7_ESP,IV8,IV9,IV10,IV11,IV12_1,IV12_2,IV12_3,II1,II2,II3,II3_1,II4_1,II4_2,II4_3,II5,II5_1,II6,II6_1,…,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19_A,V19_B,IX_TOT,IX_MEN10,IX_MAYEQ10,ITF,DECIFR,IDECIFR,RDECIFR,GDECIFR,PDECIFR,ADECIFR,IPCF,DECCFR,IDECCFR,RDECCFR,GDECCFR,PDECCFR,ADECCFR,PONDIH,VII1_1,VII1_2,VII2_1,VII2_2,VII2_3,VII2_4
str,i32,i32,i32,i32,i32,str,i32,i32,i32,str,i32,i32,str,i32,i32,i32,i32,str,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,…,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,str,i32,str,str,i32,f32,i32,str,i32,str,str,i32,i32,i32,i32,i32,i32,i32,i32
"""TQRMNOQUXHKOKMCDEGKDB00475140""",2016,2,1,1,43,"""N""",14,77,1,,3,1,,4,1,1,1,,1,1,1,1,2,2,2,3,2,1,1,1,1,1,2,0,2,0,…,2,2,2,2,2,2,2,2,1,2,2,2,2,4,0,4,0,12,"""12""",12,""" ""","""12""",12,0.0,12,"""12""",12,""" ""","""12""",12,0,2,0,98,0,0,0
"""TQRMNOQQYHMMKTCDEGKDB00475009""",2016,2,1,1,43,"""N""",14,66,1,,3,1,,4,1,1,1,,1,1,1,1,2,2,2,3,2,2,0,1,2,1,2,0,2,0,…,2,2,2,2,2,1,2,2,1,2,2,2,2,3,0,3,19000,7,"""07""",7,""" ""","""07""",8,6333.330078,6,"""07""",6,""" ""","""07""",7,79,2,0,98,0,0,0
"""TQRMNORVUHJMKSCDEGKDB00475326""",2016,2,1,1,43,"""N""",14,59,1,,2,1,,4,1,1,1,,1,1,1,1,2,2,2,2,1,2,0,1,1,2,2,0,2,0,…,2,2,2,2,1,2,2,2,2,2,2,2,2,2,0,2,13800,5,"""06""",6,""" ""","""06""",6,6900.0,7,"""07""",7,""" ""","""07""",8,66,1,0,2,0,0,0
"""TQRMNOPQUHMMKTCDEGKDB00475005""",2016,2,1,1,43,"""N""",14,66,1,,4,1,,4,1,1,1,,1,1,1,1,2,2,2,4,1,2,0,1,1,1,2,0,2,0,…,2,2,2,2,2,2,2,2,1,2,2,2,2,2,0,2,25000,8,"""09""",9,""" ""","""09""",9,12500.0,9,"""09""",9,""" ""","""09""",10,81,2,0,98,0,0,0
"""TQRMNORVRHJMKSCDEGKDB00475324""",2016,2,1,1,43,"""N""",14,59,1,,3,1,,4,1,1,1,,1,1,1,1,2,2,2,3,1,2,0,1,1,1,2,0,1,1,…,2,2,2,2,2,2,2,2,2,2,2,2,2,2,0,2,6000,2,"""02""",2,""" ""","""02""",2,3000.0,3,"""03""",3,""" ""","""03""",4,65,2,0,1,0,0,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""TQRMNOPXXHLKKUCDEFNFF00852324""",2024,1,1,1,41,"""N""",7,336,2,"""""",3,1,"""""",9,1,1,1,"""""",1,1,1,1,2,2,2,3,2,2,0,1,1,2,2,0,2,0,…,2,2,2,2,2,2,2,2,1,2,2,2,2,3,0,3,0,12,"""12""",12,"""""","""12""",12,0.0,12,"""12""",12,"""""","""12""",12,0,96,0,2,3,0,0
"""TQRMNOPQTHKMLMCDEHMHF00861771""",2024,1,1,1,42,"""N""",26,141,1,"""""",3,1,"""""",2,1,1,1,"""""",1,1,1,1,2,2,2,3,2,2,0,1,1,1,2,0,2,0,…,2,2,2,2,2,2,2,2,2,2,2,2,2,2,0,2,213569,2,"""2""",2,"""""","""2""",2,106784.5,3,"""3""",3,"""""","""3""",2,156,97,0,1,2,0,0
"""TQRMNOSYXHMMLNCDEFNFF00798999""",2024,1,1,1,41,"""N""",7,305,1,"""""",4,2,"""""",4,1,1,1,"""""",1,1,1,3,2,2,2,4,3,2,0,1,2,2,2,0,2,0,…,2,2,2,1,2,2,2,2,2,2,2,2,2,6,1,5,253700,2,"""3""",4,"""""","""3""",3,42283.328125,1,"""1""",1,"""""","""1""",1,353,2,0,1,3,4,0
"""TQRMNOPTSHLOLQCDEFNFF00852331""",2024,1,1,1,41,"""N""",7,331,1,"""""",4,1,"""""",4,1,1,1,"""""",1,1,1,2,2,2,2,4,1,2,0,1,2,2,2,0,2,0,…,2,2,2,2,2,1,2,2,1,2,2,2,2,1,0,1,213500,2,"""2""",3,"""""","""2""",3,213500.0,6,"""6""",8,"""""","""6""",7,420,97,0,98,0,0,0


In [18]:
zip_file_path = "datos/hogar_data.zip"
csv_file_name = "hogar_data.csv"

save_dataframe_as_zip(hogar, zip_file_path, csv_file_name)

DataFrame saved and compressed as datos/hogar_data.zip


In [20]:
base_path = 'datos/individual'
individual_file_list = sorted(os.listdir(base_path))
individual_file_paths = [os.path.join(base_path, file)
                         for file in individual_file_list]

schema_overrides = {'PP3E_TOT': pl.Float32, 'PP3F_TOT':pl.Float32}

individual = process_txt_files(individual_file_paths, schema_overrides)
individual

Processing file: datos/individual/16-2T-individual.txt
Processing file: datos/individual/16-3T-individual.txt
Processing file: datos/individual/16-4T-individual.txt
Processing file: datos/individual/17-1T-individual.txt
Processing file: datos/individual/17-2T-individual.txt
Processing file: datos/individual/17-3T-individual.txt
Processing file: datos/individual/17-4T-individual.txt
Processing file: datos/individual/18-1T-individual.txt
Processing file: datos/individual/18-2T-individual.txt
Processing file: datos/individual/18-3T-individual.txt
Processing file: datos/individual/18-4T-individual.txt
Processing file: datos/individual/19-1T-individual.txt
Processing file: datos/individual/19-2T-individual.txt
Processing file: datos/individual/19-3T-individual.txt
Processing file: datos/individual/19-4T-individual.txt
Processing file: datos/individual/20-1T-individual.txt
Processing file: datos/individual/20-2T-individual.txt


ComputeError: could not parse `7,5` as dtype `i64` at column 'PP3F_TOT' (column number 46)

The current offset in the file is 46440 bytes.

You might want to try:
- increasing `infer_schema_length` (e.g. `infer_schema_length=10000`),
- specifying correct dtype with the `schema_overrides` argument
- setting `ignore_errors` to `True`,
- adding `7,5` to the `null_values` list.

Original error: ```remaining bytes non-empty```

In [None]:


for i in enumerate(hogar_file_paths):
    index = i[0]
    path_str = i[1]

    current_df = read_and_format(path_str)

    if index == 0:
        hogar = current_df
    else:
        # Clean invalid values and align column schemas
        for col_name, col_type in hogar.schema.items():
            if col_name in current_df.schema:
                # If the column is expected to be Int32, clean it
                if col_type == pl.Int32:
                    # Ensure the column is string
                    current_df = current_df.with_columns(
                        pl.when(pl.col(col_name).cast(pl.Utf8).str
                                .strip_chars() == "")
                        .then(None)
                        .otherwise(pl.col(col_name))
                        .alias(col_name)
                    )
                # Cast the column to the expected type
                current_df = current_df.with_columns(pl.col(col_name)
                                                     .cast(col_type))
        hogar = pl.concat([hogar, current_df])

        # Align schemas
        for col in hogar.columns:
            if col not in current_df.columns:
                current_df = current_df.with_columns(pl.lit(None).alias(col))
        
        for col in current_df.columns:
            if col not in hogar.columns:
                current_df = current_df.drop(col)

    print(f"Processing file: {path_str}")

hogar

Processing file: datos/hogar/16-2T-hogar.txt
Processing file: datos/hogar/16-3T-hogar.txt
Processing file: datos/hogar/16-4T-hogar.txt
Processing file: datos/hogar/17-1T-hogar.txt
Processing file: datos/hogar/17-2T-hogar.txt
Processing file: datos/hogar/17-3T-hogar.txt
Processing file: datos/hogar/17-4T-hogar.txt
Processing file: datos/hogar/18-1T-hogar.txt
Processing file: datos/hogar/18-2T-hogar.txt
Processing file: datos/hogar/18-3T-hogar.txt
Processing file: datos/hogar/18-4T-hogar.txt
Processing file: datos/hogar/19-1T-hogar.txt
Processing file: datos/hogar/19-2T-hogar.txt
Processing file: datos/hogar/19-3T-hogar.txt
Processing file: datos/hogar/19-4T-hogar.txt
Processing file: datos/hogar/20-1T-hogar.txt
Processing file: datos/hogar/20-2T-hogar.txt
Processing file: datos/hogar/20-3T-hogar.txt
Processing file: datos/hogar/20-4T-hogar.txt
Processing file: datos/hogar/21-1T-hogar.txt
Processing file: datos/hogar/21-2T-hogar.txt
Processing file: datos/hogar/21-3T-hogar.txt
Processing

CODUSU,ANO4,TRIMESTRE,NRO_HOGAR,REALIZADA,REGION,MAS_500,AGLOMERADO,PONDERA,IV1,IV1_ESP,IV2,IV3,IV3_ESP,IV4,IV5,IV6,IV7,IV7_ESP,IV8,IV9,IV10,IV11,IV12_1,IV12_2,IV12_3,II1,II2,II3,II3_1,II4_1,II4_2,II4_3,II5,II5_1,II6,II6_1,…,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19_A,V19_B,IX_TOT,IX_MEN10,IX_MAYEQ10,ITF,DECIFR,IDECIFR,RDECIFR,GDECIFR,PDECIFR,ADECIFR,IPCF,DECCFR,IDECCFR,RDECCFR,GDECCFR,PDECCFR,ADECCFR,PONDIH,VII1_1,VII1_2,VII2_1,VII2_2,VII2_3,VII2_4
str,i32,i32,i32,i32,i32,str,i32,i32,i32,str,i32,i32,str,i32,i32,i32,i32,str,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,…,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,str,str,i32,str,i32,i32,i32,str,str,i32,i32,i32,i32,i32,i32,i32,i32
"""TQRMNOQUXHKOKMCDEGKDB00475140""",2016,2,1,1,43,"""N""",14,77,1,,3,1,,4,1,1,1,,1,1,1,1,2,2,2,3,2,1,1,1,1,1,2,0,2,0,…,2,2,2,2,2,2,2,2,1,2,2,2,2,4,0,4,0,12,12,12,""" ""","""12""",12,"""0""",12,12,12,""" ""","""12""",12,0,2,0,98,0,0,0
"""TQRMNOQQYHMMKTCDEGKDB00475009""",2016,2,1,1,43,"""N""",14,66,1,,3,1,,4,1,1,1,,1,1,1,1,2,2,2,3,2,2,0,1,2,1,2,0,2,0,…,2,2,2,2,2,1,2,2,1,2,2,2,2,3,0,3,19000,7,7,7,""" ""","""07""",8,"""6333,33""",6,7,6,""" ""","""07""",7,79,2,0,98,0,0,0
"""TQRMNORVUHJMKSCDEGKDB00475326""",2016,2,1,1,43,"""N""",14,59,1,,2,1,,4,1,1,1,,1,1,1,1,2,2,2,2,1,2,0,1,1,2,2,0,2,0,…,2,2,2,2,1,2,2,2,2,2,2,2,2,2,0,2,13800,5,6,6,""" ""","""06""",6,"""6900""",7,7,7,""" ""","""07""",8,66,1,0,2,0,0,0
"""TQRMNOPQUHMMKTCDEGKDB00475005""",2016,2,1,1,43,"""N""",14,66,1,,4,1,,4,1,1,1,,1,1,1,1,2,2,2,4,1,2,0,1,1,1,2,0,2,0,…,2,2,2,2,2,2,2,2,1,2,2,2,2,2,0,2,25000,8,9,9,""" ""","""09""",9,"""12500""",9,9,9,""" ""","""09""",10,81,2,0,98,0,0,0
"""TQRMNORVRHJMKSCDEGKDB00475324""",2016,2,1,1,43,"""N""",14,59,1,,3,1,,4,1,1,1,,1,1,1,1,2,2,2,3,1,2,0,1,1,1,2,0,1,1,…,2,2,2,2,2,2,2,2,2,2,2,2,2,2,0,2,6000,2,2,2,""" ""","""02""",2,"""3000""",3,3,3,""" ""","""03""",4,65,2,0,1,0,0,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""TQRMNOPXXHLKKUCDEFNFF00852324""",2024,1,1,1,41,"""N""",7,336,2,"""""",3,1,"""""",9,1,1,1,"""""",1,1,1,1,2,2,2,3,2,2,0,1,1,2,2,0,2,0,…,2,2,2,2,2,2,2,2,1,2,2,2,2,3,0,3,0,12,12,12,"""""","""12""",12,"""0""",12,12,12,"""""","""12""",12,0,96,0,2,3,0,0
"""TQRMNOPQTHKMLMCDEHMHF00861771""",2024,1,1,1,42,"""N""",26,141,1,"""""",3,1,"""""",2,1,1,1,"""""",1,1,1,1,2,2,2,3,2,2,0,1,1,1,2,0,2,0,…,2,2,2,2,2,2,2,2,2,2,2,2,2,2,0,2,213569,2,2,2,"""""","""2""",2,"""106784,5""",3,3,3,"""""","""3""",2,156,97,0,1,2,0,0
"""TQRMNOSYXHMMLNCDEFNFF00798999""",2024,1,1,1,41,"""N""",7,305,1,"""""",4,2,"""""",4,1,1,1,"""""",1,1,1,3,2,2,2,4,3,2,0,1,2,2,2,0,2,0,…,2,2,2,1,2,2,2,2,2,2,2,2,2,6,1,5,253700,2,3,4,"""""","""3""",3,"""42283,33""",1,1,1,"""""","""1""",1,353,2,0,1,3,4,0
"""TQRMNOPTSHLOLQCDEFNFF00852331""",2024,1,1,1,41,"""N""",7,331,1,"""""",4,1,"""""",4,1,1,1,"""""",1,1,1,2,2,2,2,4,1,2,0,1,2,2,2,0,2,0,…,2,2,2,2,2,1,2,2,1,2,2,2,2,1,0,1,213500,2,2,3,"""""","""2""",3,"""213500""",6,6,8,"""""","""6""",7,420,97,0,98,0,0,0


In [16]:
base_path = 'datos/individual'
individual_file_list = sorted(os.listdir(base_path))
individual_file_paths = [os.path.join(base_path, file) for file
                         in individual_file_list]

for i in enumerate(individual_file_paths):
    index = i[0]
    path_str = i[1]

    current_df = read_and_format(path_str)

    if index == 0:
        individual = current_df
    else:
        # Clean invalid values and align column schemas
        for col_name, col_type in individual.schema.items():
            if col_name in current_df.schema:
                # If the column is expected to be Int32, clean it
                if col_type == pl.Int32:
                    # Ensure the column is string
                    current_df = current_df.with_columns(
                        pl.when(pl.col(col_name).cast(pl.Utf8).str
                                .strip_chars() == "")
                        .then(None)
                        .otherwise(pl.col(col_name))
                        .alias(col_name)
                    )
                # Cast the column to the expected type
                current_df = current_df.with_columns(pl.col(col_name)
                                                     .cast(col_type))
        individual = pl.concat([individual, current_df])

        # Align schemas
        for col in individual.columns:
            if col not in current_df.columns:
                current_df = current_df.with_columns(pl.lit(None).alias(col))
        
        for col in current_df.columns:
            if col not in individual.columns:
                current_df = current_df.drop(col)

    print(f"Processing file: {path_str}")

individual

Processing file: datos/individual/16-2T-individual.txt
Processing file: datos/individual/16-3T-individual.txt
Processing file: datos/individual/16-4T-individual.txt


ComputeError: could not parse `12666,67` as dtype `i64` at column 'IPCF' (column number 170)

The current offset in the file is 53451 bytes.

You might want to try:
- increasing `infer_schema_length` (e.g. `infer_schema_length=10000`),
- specifying correct dtype with the `schema_overrides` argument
- setting `ignore_errors` to `True`,
- adding `12666,67` to the `null_values` list.

Original error: ```remaining bytes non-empty```

## Leyendo los datos a una tabla
En esta sección se leen los archivos para crear una tabla para los datos de hogares y otra para los datos de individuos.
Dado que la cantidad de datos puede causar que las computadoras se queden sin memoria (sobre todo si la PC tiene 8gb de RAM o menos) se recomienda usar las variables hogar_cols_to_keep y ind_cols_to_keep para listar las variables que se necesitan en los datasets de hogares e individuos, respectivamente, y eliminar las restantes.   
Los nombres de las variables se pueden conseguir en el diseño de registro de la EPH, aunque en algunos casos los nombres de las variables pueden tener pequeños errores (por ejemplo, en el registro existe una variable llamada "IX_Tot", pero su nombre en los archivos es "IX_TOT").
Al final de esta sección se guardan los archivos hogar.csv e individuos.csv en la carpeta "datos", ambos conteniendo los datos de sus respectivos datasets.   
En este caso, a modo de ejemplo, se eligen las variables "CODUSU", "NRO_HOGAR", "IX_TOT", "DECIFR" y"DECCFR" en el dataset de hogares y "CODUSU", "NRO_HOGAR" y "DECINDR" en dataset de individuos. El movtivo por el cual se eligen estas variables de detalla en la próxima sección.

In [17]:
def read_data(key, hogar_or_individual):
    """This function reads a txt file as formatted by INDEC and returns a pandas
    DataFrame with the data and a few small optimizations to save memory. It is
    recommended to use cols_to_keep to reduce memory footprint.
    Args:
        key (str): the key corresponding to the quarter. It's format should be
        yy-QT, so that, for example, 23-2T is the second quarter of 2023.
        hogar_or_individual (str): should be either "hogar" or "individual".
        cols_to_keep (list, optional): a list of columns to keep, and the rest
        are discarded. If None, all the columns are kept. Defaults to None.

    Returns:
        pandas.DataFrame: a DataFrame containing the data from the .txt file.
    """   
    # Reading the data
    df = pd.read_csv(f'datos/{hogar_or_individual}/{key}-' +
                     f'{hogar_or_individual}.txt', sep=';',
                     low_memory=False)
    
    # Converting int64 to int16 to save memory
    d = dict.fromkeys(df.select_dtypes('int').columns, 'int16')
    df = df.astype(d).copy()

    # Converting fp64 to fp16 to save memory
    d = dict.fromkeys(df.select_dtypes('float').columns, 'float16')
    df = df.astype(d).copy()


    # Converting the date data to quarters
    date_df = df[['ANO4', 'TRIMESTRE']].astype(str).copy()
    quarter = pd.to_datetime(date_df['ANO4'] + 'Q' + date_df['TRIMESTRE'])
    quarter = quarter.dt.to_period('Q')
    df['quarter'] = quarter.copy()
    df = df.drop(columns=['ANO4', 'TRIMESTRE'])

    return df