In [1]:
import os
import requests
import zipfile
import io
import pandas as pd

# Download, extract, and load .dta file into a DataFrame

In [2]:
# Function to download, extract, and load .dta file into a DataFrame
def load_dta_from_zip(url, year):
    print(f"Processing dataset for year {year}...")
    response = requests.get(url)
    zip_file = zipfile.ZipFile(io.BytesIO(response.content))
    zip_file.extractall(f"extracted_files_{year}")

    # List all extracted files
    extracted_files = zip_file.namelist()
    print(f"Extracted files for {year}: {extracted_files}")

    # Identify .dta files
    dta_files = [file for file in extracted_files if file.endswith('.dta')]
    print(f".dta files found for {year}: {dta_files}")

    if dta_files:
        dta_file_path = os.path.join(f"extracted_files_{year}", dta_files[0])
        try:
            df = pd.read_stata(dta_file_path)
            print(f"DataFrame for {year} loaded successfully.")
            return df
        except ValueError as e:
            print(f"Error loading .dta file for {year}: {e}")
            return None
    else:
        print(f"No .dta files found for {year}.")
        return None

# URLs for datasets

In [3]:
urls = {
    2017: "https://proyectos.inei.gob.pe/iinei/srienaho/descarga/STATA/615-Modulo1330.zip",
    2018: "https://proyectos.inei.gob.pe/iinei/srienaho/descarga/STATA/650-Modulo1466.zip",
    2019: "https://proyectos.inei.gob.pe/iinei/srienaho/descarga/STATA/701-Modulo1545.zip"
}

# Loading the datasets

In [4]:
df_2017 = load_dta_from_zip(urls[2017], 2017)
df_2018 = load_dta_from_zip(urls[2018], 2018)
df_2019 = load_dta_from_zip(urls[2019], 2019)

Processing dataset for year 2017...
Extracted files for 2017: ['615-Modulo1330/', '615-Modulo1330/01_CUESTIONARIO_PEQUEÑOS_MEDIANOS_2017.pdf', '615-Modulo1330/02_CUESTIONARIO_ESTRATO_ESPECIAL_2017.pdf', '615-Modulo1330/16_Cap900.dta', '615-Modulo1330/Diccionario_Datos_16_CAP900.pdf']
.dta files found for 2017: ['615-Modulo1330/16_Cap900.dta']
DataFrame for 2017 loaded successfully.
Processing dataset for year 2018...
Extracted files for 2018: ['650-Modulo-1466/', '650-Modulo-1466/01_CUESTIONARIO_PEQUEÑOS_MEDIANOS_2018.pdf', '650-Modulo-1466/02_CUESTIONARIO_ESTRATO_ESPECIAL_2018.pdf', '650-Modulo-1466/17_Cap900.dta', '650-Modulo-1466/Diccionario_Datos_17_Cap900.pdf']
.dta files found for 2018: ['650-Modulo-1466/17_Cap900.dta']
DataFrame for 2018 loaded successfully.
Processing dataset for year 2019...
Extracted files for 2019: ['701-Modulo1545/', '701-Modulo1545/01_CUESTIONARIO_PEQUEÑOS_MEDIANOS_2019.pdf', '701-Modulo1545/02_CUESTIONARIO_ESTRATO_ESPECIAL_2019.pdf', '701-Modulo1545/17_Ca

# Column name mapping based on the PDF dictionary

In [5]:
column_mapping_2017 = {
    "ANIO": "Year",
    "CCDD": "Department Code",
    "NOMBREDD": "Department Name",
    "CCPP": "Province Code",
    "NOMBREPV": "Province Name",
    "CCDI": "District Code",
    "NOMBREDI": "District Name",
    "CONGLOMERADO": "Conglomerate",
    "NSELUA": "Agricultural Unit Selection Number",
    "UA": "Agricultural Unit Number",
    "ESTRATO": "Stratum Type",
    "RESFIN": "Survey Final Result",
    "REGION": "Natural Region",
    "DOMINIO": "Geographic Domain",
    "FACTOR": "Expansion Factor",
    "CODIGO": "Identification Code",
    "P901": "Requested Credit",
    "P902": "Obtained Credit",
    "P903_1": "Credit Provider - AGROBANCO",
    "P903_2": "Credit Provider - Municipal Bank",
    "P903_3": "Credit Provider - Rural Bank",
    "P903_4": "Credit Provider - Private Bank",
    "P903_5": "Credit Provider - Financial Institution/EDPYME",
    "P903_6": "Credit Provider - NGO",
    "P903_7": "Credit Provider - Cooperative",
    "P903_8": "Credit Provider - Commercial Establishment",
    "P903_9": "Credit Provider - Money Lender",
    "P903_10": "Credit Provider - State Programs",
    "P903_11": "Credit Provider - Other",
    "P905": "Received Agricultural Insurance",
    "P906": "Insurance Provider",
    "P907": "Has Savings Account",
    "P908_1": "Savings Account - AGROBANCO",
    "P908_2": "Savings Account - Bank of the Nation",
    "P908_3": "Savings Account - Municipal Bank",
    "P908_4": "Savings Account - Rural Bank",
    "P908_5": "Savings Account - Private Bank",
    "P908_6": "Savings Account - Financial Institution/EDPYME",
    "P908_7": "Savings Account - Cooperative",
    "P908_8": "Savings Account - Other",
}

In [6]:
def standardize_columns(df, mapping):
    """
    Standardizes column names in the dataset using a unified mapping and retains only columns present in the DataFrame.

    Parameters:
    df (DataFrame): The dataset to be standardized.
    mapping (dict): The column mapping dictionary.

    Returns:
    DataFrame: The dataset with standardized column names.
    """
    existing_columns = {col: mapping[col] for col in df.columns if col in mapping}
    return df.rename(columns=existing_columns, inplace=False)

In [7]:
# Standardize all datasets
df_2017_standardized = standardize_columns(df_2017, column_mapping_2017)
df_2018_standardized = standardize_columns(df_2018, column_mapping_2017)
df_2019_standardized = standardize_columns(df_2019, column_mapping_2017)

In [8]:
# Align columns across datasets
all_columns_ordered = list(column_mapping_2017.values())

for col in df_2018_standardized.columns:
    if col not in all_columns_ordered:
        all_columns_ordered.append(col)

for col in df_2019_standardized.columns:
    if col not in all_columns_ordered:
        all_columns_ordered.append(col)

df_2017_aligned = df_2017_standardized.reindex(columns=all_columns_ordered)
df_2018_aligned = df_2018_standardized.reindex(columns=all_columns_ordered)
df_2019_aligned = df_2019_standardized.reindex(columns=all_columns_ordered)

In [9]:
# Add 'Year' column to identify datasets
df_2017_aligned['Year'] = 2017
df_2018_aligned['Year'] = 2018
df_2019_aligned['Year'] = 2019

In [None]:
# Combine datasets and add a unique_id column starting at 1
combined_df = pd.concat([df_2017_aligned, df_2018_aligned, df_2019_aligned], ignore_index=True)

# Add unique_id as the first column
combined_df.insert(0, "unique_id", range(1, len(combined_df) + 1))


  combined_df = pd.concat([df_2017_aligned, df_2018_aligned, df_2019_aligned], ignore_index=True)


In [11]:
# Save the combined dataset to a CSV file
### output_path = r"C:\Users\jcbur\OneDrive - peruvianbusinesscouncil.com\csv files\#14_Producer_Geo_Spatial_Analysis\ENA Data Dictionaries\4_Servicios Financieros\ena_financial_services_2017_2019.csv"
### combined_df.to_csv(output_path, index=False)


In [12]:
print(combined_df)

       unique_id  Year Department Code Department Name Province Code  \
0              1  2017              24          TUMBES            02   
1              2  2017              24          TUMBES            02   
2              3  2017              24          TUMBES            02   
3              4  2017              24          TUMBES            02   
4              5  2017              24          TUMBES            02   
...          ...   ...             ...             ...           ...   
85667      85668  2019              15            LIMA            08   
85668      85669  2019              20           PIURA            01   
85669      85670  2019              20           PIURA            01   
85670      85671  2019              20           PIURA            01   
85671      85672  2019              20           PIURA            01   

               Province Name District Code        District Name Conglomerate  \
0      CONTRALMIRANTE VILLAR            03  CANOAS DE P

In [13]:
observations = {}
for col in combined_df.columns:
    observations[col] = {
        'total_count': combined_df[col].size,
        'nan_count': combined_df[col].isna().sum(),
        'dtype': combined_df[col].dtype,
        'mean': combined_df[col].mean() if combined_df[col].dtype in ['int64', 'float64'] else None,
        'unique_count': combined_df[col].nunique(),
        'categories': combined_df[col].value_counts().head(10).to_dict() if combined_df[col].dtype == 'object' or combined_df[col].nunique() <= 10 else None
    }

for feature, details in observations.items():
    print(f"Feature: {feature}")
    for key, value in details.items():
        print(f"  {key}: {value}")
    print()

Feature: unique_id
  total_count: 85672
  nan_count: 0
  dtype: int64
  mean: 42836.5
  unique_count: 85672
  categories: None

Feature: Year
  total_count: 85672
  nan_count: 0
  dtype: int64
  mean: 2017.9949341675226
  unique_count: 3
  categories: {2017: 28804, 2018: 28498, 2019: 28370}

Feature: Department Code
  total_count: 85672
  nan_count: 0
  dtype: object
  mean: None
  unique_count: 25
  categories: {'13': 4331, '21': 4253, '02': 4159, '20': 4097, '06': 3921, '08': 3897, '10': 3888, '15': 3745, '01': 3736, '04': 3679}

Feature: Department Name
  total_count: 85672
  nan_count: 0
  dtype: object
  mean: None
  unique_count: 29
  categories: {'LA LIBERTAD': 4331, 'PUNO': 4253, 'ANCASH': 4159, 'PIURA': 4097, 'CAJAMARCA': 3921, 'CUSCO': 3897, 'LIMA': 3745, 'AMAZONAS': 3736, 'AREQUIPA': 3679, 'AYACUCHO': 3662}

Feature: Province Code
  total_count: 85672
  nan_count: 0
  dtype: object
  mean: None
  unique_count: 20
  categories: {'01': 22144, '02': 12948, '03': 10529, '05': 74