In [1]:
import os
import requests
import zipfile
import io
import pandas as pd

# Download, extract, and load .dta file into a DataFrame

In [2]:
# Function to download, extract, and load .dta file into a DataFrame
def load_dta_from_zip(url, year, timeout=60):
    """
    Download a ZIP archive, extract it to disk, and load its first Stata file.

    The archive is unpacked into the directory ``extracted_files_{year}``
    (relative to the current working directory). If the archive contains
    several ``.dta`` files, only the first one in archive order is loaded.

    Parameters:
    url (str): Address of the ZIP archive to download.
    year (int): Label used in the progress messages and in the name of the
        extraction directory.
    timeout (float): Seconds passed to ``requests.get`` so that a stalled
        connection cannot block the notebook indefinitely.

    Returns:
    DataFrame or None: The loaded data, or None when the archive holds no
    ``.dta`` file or pandas rejects the file with a ValueError.

    Raises:
    requests.HTTPError: If the server answers with an error status.
    zipfile.BadZipFile: If the downloaded payload is not a ZIP archive.
    """
    print(f"Processing dataset for year {year}...")
    response = requests.get(url, timeout=timeout)
    # Stop on HTTP errors here; otherwise the error page would be handed to
    # zipfile and surface as a confusing "not a zip file" failure.
    response.raise_for_status()

    extract_dir = f"extracted_files_{year}"
    # The context manager closes the archive handle once extraction is done.
    with zipfile.ZipFile(io.BytesIO(response.content)) as zip_file:
        zip_file.extractall(extract_dir)
        # List all extracted files
        extracted_files = zip_file.namelist()
    print(f"Extracted files for {year}: {extracted_files}")

    # Identify .dta files (case-insensitive, so ".DTA" is recognised too)
    dta_files = [file for file in extracted_files if file.lower().endswith('.dta')]
    print(f".dta files found for {year}: {dta_files}")

    if not dta_files:
        print(f"No .dta files found for {year}.")
        return None

    dta_file_path = os.path.join(extract_dir, dta_files[0])
    try:
        df = pd.read_stata(dta_file_path)
    except ValueError as e:
        # Best effort: report the problem and let the caller decide what to do.
        print(f"Error loading .dta file for {year}: {e}")
        return None

    print(f"DataFrame for {year} loaded successfully.")
    return df

# URLs for datasets

In [3]:
# Download address of each yearly archive, keyed by survey year.
urls = {}
urls[2017] = "https://proyectos.inei.gob.pe/iinei/srienaho/descarga/STATA/615-Modulo1334.zip"
urls[2018] = "https://proyectos.inei.gob.pe/iinei/srienaho/descarga/STATA/650-Modulo1470.zip"
urls[2019] = "https://proyectos.inei.gob.pe/iinei/srienaho/descarga/STATA/701-Modulo1549.zip"

# Loading the datasets

In [4]:
# Fetch one frame per survey year; a frame is None when the loader could not
# find or read a .dta file in that year's archive.
df_2017 = load_dta_from_zip(url=urls[2017], year=2017)
df_2018 = load_dta_from_zip(url=urls[2018], year=2018)
df_2019 = load_dta_from_zip(url=urls[2019], year=2019)

Processing dataset for year 2017...
Extracted files for 2017: ['615-Modulo1334/', '615-Modulo1334/01_CUESTIONARIO_PEQUEÑOS_MEDIANOS_2017.pdf', '615-Modulo1334/02_CUESTIONARIO_ESTRATO_ESPECIAL_2017.pdf', '615-Modulo1334/20_Cap1200a.dta', '615-Modulo1334/Diccionario_Datos_20_CAP1200A.pdf']
.dta files found for 2017: ['615-Modulo1334/20_Cap1200a.dta']
DataFrame for 2017 loaded successfully.
Processing dataset for year 2018...
Extracted files for 2018: ['650-Modulo-1470/', '650-Modulo-1470/01_CUESTIONARIO_PEQUEÑOS_MEDIANOS_2018.pdf', '650-Modulo-1470/02_CUESTIONARIO_ESTRATO_ESPECIAL_2018.pdf', '650-Modulo-1470/21_Cap1200a.dta', '650-Modulo-1470/Diccionario_Datos_21_Cap1200a.pdf']
.dta files found for 2018: ['650-Modulo-1470/21_Cap1200a.dta']
DataFrame for 2018 loaded successfully.
Processing dataset for year 2019...
Extracted files for 2019: ['701-Modulo1549/', '701-Modulo1549/01_CUESTIONARIO_PEQUEÑOS_MEDIANOS_2019.pdf', '701-Modulo1549/02_CUESTIONARIO_ESTRATO_ESPECIAL_2019.pdf', '701-Modu

# Column name mapping based on the PDF dictionary

In [5]:
# Raw column name -> readable column name for the 2017 file.
column_mapping_2017 = {
    # Survey year and geographic identifiers
    "ANIO": "Year",
    "CCDD": "Department Code",
    "NOMBREDD": "Department Name",
    "CCPP": "Province Code",
    "NOMBREPV": "Province Name",
    "CCDI": "District Code",
    "NOMBREDI": "District Name",
    # Sampling identifiers
    "CONGLOMERADO": "Conglomerate",
    "NSELUA": "Agricultural Unit Selection Number",
    "UA": "Agricultural Unit Number",
    "ESTRATO": "Stratum Type",
    # Crop identification
    "P1205": "Crop Number",
    "P1205_NOM": "Crop Name",
    "P1205_COD": "Crop Code",
    "P1205_TIPO": "Crop Type",
    # Sown area, irrigation, management, and sowing date
    "P1206_SUP_1": "Sown Area Integer",
    "P1206_SUP_2": "Sown Area Decimal",
    "P1206A": "Irrigation Type",
    "P1207": "Crop Management",
    "P1207A_MES": "Sowing Month",
    "P1207A_ANIO": "Sowing Year",
}

# The 2018 mapping is the 2017 mapping plus one extra column.
column_mapping_2018 = {**column_mapping_2017, "P1207B": "Installed Plants"}

# The 2019 mapping is the 2018 mapping plus one extra column.
column_mapping_2019 = {**column_mapping_2018, "P1206B": "Seed Certification"}

In [6]:
# Function to standardize column names
def standardize_columns(df, mapping):
    """
    Rename the columns of ``df`` that have an entry in ``mapping``.

    Columns without an entry in ``mapping`` keep their current name, and
    mapping keys that are not columns of ``df`` are ignored. The input frame
    is left unchanged; a renamed copy is returned.

    Parameters:
    df (DataFrame): The dataset whose columns are renamed.
    mapping (dict): Current column name -> standardized column name.

    Returns:
    DataFrame: A new frame with the standardized column names.
    """
    renames = {}
    for original_name in df.columns:
        if original_name in mapping:
            renames[original_name] = mapping[original_name]
    return df.rename(columns=renames)

In [7]:
# Standardize all datasets, each with the mapping that belongs to its year
df_2017_standardized = standardize_columns(df=df_2017, mapping=column_mapping_2017)
df_2018_standardized = standardize_columns(df=df_2018, mapping=column_mapping_2018)
df_2019_standardized = standardize_columns(df=df_2019, mapping=column_mapping_2019)

In [8]:
# Align columns across datasets
# Column order: the 2017 standardized names first, followed by any name that
# first appears in the 2018 frame and then in the 2019 frame.
all_columns_ordered = list(column_mapping_2017.values())

for later_frame in (df_2018_standardized, df_2019_standardized):
    for col in later_frame.columns:
        if col not in all_columns_ordered:
            all_columns_ordered.append(col)

# Reindexing gives every frame the same columns in the same order; columns a
# frame does not have are created and filled with missing values.
df_2017_aligned = df_2017_standardized.reindex(all_columns_ordered, axis="columns")
df_2018_aligned = df_2018_standardized.reindex(all_columns_ordered, axis="columns")
df_2019_aligned = df_2019_standardized.reindex(all_columns_ordered, axis="columns")

In [9]:
# Add 'Year' column to identify datasets (assigned in place on each aligned frame)
for survey_year, aligned_frame in (
    (2017, df_2017_aligned),
    (2018, df_2018_aligned),
    (2019, df_2019_aligned),
):
    aligned_frame['Year'] = survey_year

In [10]:
# Combine datasets and add a unique_id column starting at 1
aligned_frames = [df_2017_aligned, df_2018_aligned, df_2019_aligned]
# ignore_index=True discards the per-year row labels and numbers the rows afresh.
combined_df = pd.concat(aligned_frames, ignore_index=True)

# Add unique_id as the first column: 1, 2, ..., number of rows
row_count = len(combined_df)
combined_df.insert(loc=0, column="unique_id", value=range(1, row_count + 1))


  combined_df = pd.concat([df_2017_aligned, df_2018_aligned, df_2019_aligned], ignore_index=True)
  combined_df = pd.concat([df_2017_aligned, df_2018_aligned, df_2019_aligned], ignore_index=True)
  combined_df = pd.concat([df_2017_aligned, df_2018_aligned, df_2019_aligned], ignore_index=True)


In [11]:
# Save the combined dataset to a CSV file
### output_path = r"C:\Users\jcbur\OneDrive - peruvianbusinesscouncil.com\csv files\#14_Producer_Geo_Spatial_Analysis\ENA Data Dictionaries\6_Cultivos\ena_production_harvested_2017_2019.csv"
### combined_df.to_csv(output_path, index=False)


In [12]:
# Print the combined frame as plain text; in the output below, the middle rows
# are abbreviated with "..." and the columns wrap across several blocks.
print(combined_df)

        unique_id  Year Department Code Department Name Province Code  \
0               1  2017              24          TUMBES            02   
1               2  2017              24          TUMBES            02   
2               3  2017              24          TUMBES            02   
3               4  2017              24          TUMBES            02   
4               5  2017              24          TUMBES            02   
...           ...   ...             ...             ...           ...   
363004     363005  2019              20           PIURA            01   
363005     363006  2019              20           PIURA            01   
363006     363007  2019              20           PIURA            01   
363007     363008  2019              20           PIURA            01   
363008     363009  2019              20           PIURA            01   

                Province Name District Code        District Name Conglomerate  \
0       CONTRALMIRANTE VILLAR            0

In [13]:
def _summarize_column(series):
    """Return the summary dict for one column: counts, dtype, mean, and top values."""
    dtype = series.dtype
    unique_count = series.nunique()  # computed once, reused for the categories rule

    # Accept every numeric dtype rather than only int64/float64: Stata files
    # are read with narrower types (int8/int16/int32/float32), and comparing
    # against two dtype names would silently skip their mean. Booleans stay
    # excluded, as they were before.
    has_mean = (
        pd.api.types.is_numeric_dtype(dtype)
        and not pd.api.types.is_bool_dtype(dtype)
    )
    # Frequent values are listed for text columns and for any column with at
    # most 10 distinct values.
    has_categories = dtype == 'object' or unique_count <= 10

    return {
        'total_count': series.size,
        'nan_count': series.isna().sum(),
        'dtype': dtype,
        'mean': series.mean() if has_mean else None,
        'unique_count': unique_count,
        'categories': series.value_counts().head(10).to_dict() if has_categories else None
    }


# Per-column profile of the combined frame, keyed by column name.
observations = {}
for col in combined_df.columns:
    observations[col] = _summarize_column(combined_df[col])

# Print the profile, one indented "key: value" line per statistic.
for feature, details in observations.items():
    print(f"Feature: {feature}")
    for key, value in details.items():
        print(f"  {key}: {value}")
    print()

Feature: unique_id
  total_count: 363009
  nan_count: 0
  dtype: int64
  mean: 181505.0
  unique_count: 363009
  categories: None

Feature: Year
  total_count: 363009
  nan_count: 0
  dtype: int64
  mean: 2017.9533923401348
  unique_count: 3
  categories: {2017: 127976, 2018: 123976, 2019: 111057}

Feature: Department Code
  total_count: 363009
  nan_count: 0
  dtype: object
  mean: None
  unique_count: 25
  categories: {'06': 23337, '01': 22141, '13': 22025, '18': 19268, '10': 18967, '09': 18017, '22': 17445, '05': 17375, '02': 16142, '03': 15896}

Feature: Department Name
  total_count: 363009
  nan_count: 0
  dtype: object
  mean: None
  unique_count: 29
  categories: {'CAJAMARCA': 23337, 'AMAZONAS': 22141, 'LA LIBERTAD': 22025, 'MOQUEGUA': 19268, 'HUANCAVELICA': 18017, 'AYACUCHO': 17375, 'ANCASH': 16142, 'MADRE DE DIOS': 15573, 'ICA': 15372, 'LIMA': 14774}

Feature: Province Code
  total_count: 363009
  nan_count: 0
  dtype: object
  mean: None
  unique_count: 20
  categories: {'01