In [139]:
import os
import tabula
import pandas as pd
import numpy as np
from openpyxl import Workbook

In [140]:
# Working directory: switch to the first candidate checkout location that can
# be entered on this machine. Only OSError (missing folder, not a directory,
# permission denied) moves on to the next candidate; anything else, such as
# KeyboardInterrupt, propagates instead of being swallowed.
candidate_dirs = (
    "C:/Users/claudio.pacheco/Documents/GitHub/economia/fichas/",
    "C:/Users/claud/Documents/GitHub/economia/fichas/",
)
for candidate_dir in candidate_dirs:
    try:
        os.chdir(candidate_dir)
    except OSError:
        continue
    break
else:
    # None of the candidates worked: fail loudly and say which paths were tried.
    raise FileNotFoundError(
        "No se encontró ningún directorio de trabajo: " + ", ".join(candidate_dirs)
    )

In [142]:
def process_pdf(file_path, layout="nuevo", page=5):
    """Extract the strengths/weaknesses table from one report PDF.

    Parameters
    ----------
    file_path : str or path-like
        PDF handed to ``tabula.read_pdf``. Its base name also selects the
        file-specific corrections applied at the end of this function.
    layout : {"nuevo", "anterior"}
        ``"nuevo"`` reads four fixed page areas with fixed column separators
        and splits label text from rank digits. ``"anterior"`` takes the first
        table tabula detects and converts the rank columns to numbers.
    page : int or str
        Page selector forwarded to tabula as ``pages`` for both layouts.

    Returns
    -------
    pandas.DataFrame
        Columns ``strengths``, ``rank_strengths``, ``weakness``, ``rank_weak``.
        Ranks are digit strings for the "nuevo" layout and numeric for the
        "anterior" layout.

    Raises
    ------
    ValueError
        If ``layout`` is neither ``"nuevo"`` nor ``"anterior"``.
    """
    column_names = ["strengths", "rank_strengths", "weakness", "rank_weak"]

    def process_dataframe(raw_df):
        """Move label text that spilled into the rank columns back, keep rank digits, drop headers."""
        df = raw_df.fillna("")
        for rank_col in ("rank_strengths", "rank_weak"):
            # tabula may infer numeric dtypes; the .str accessor needs text.
            df[rank_col] = df[rank_col].astype(str)

        # Letters/whitespace found in a rank cell are appended to the label.
        df["strengths"] = df["strengths"] + df["rank_strengths"].str.extract(r'([a-zA-Z\s]+)', expand=False).fillna('')
        df["weakness"] = df["weakness"] + df["rank_weak"].str.extract(r'([a-zA-Z\s]+)', expand=False).fillna('')
        # Only the first run of digits is kept as the rank.
        df["rank_strengths"] = df["rank_strengths"].str.extract(r'(\d+)', expand=False)
        df["rank_weak"] = df["rank_weak"].str.extract(r'(\d+)', expand=False)

        # Drop the STRENGTHS/WEAKNESSES header rows and any all-uppercase row.
        labels = df["strengths"]
        keep = (~labels.str.contains("STRENGTHS", case=False) &
                ~labels.str.contains("WEAKNESSES", case=False) &
                ~labels.str.isupper())
        return df[keep].reset_index(drop=True)

    def formato_nuevo(file_path, page):
        """Read the four fixed page areas and return them as one cleaned frame."""
        # Coordinates passed to tabula's ``area`` option, one entry per section.
        areas = {
            "Economic Performance": [50, 50, 285, 550],
            "Government Efficiency": [285, 50, 530, 550],
            "Business Efficiency": [530, 50, 680, 550],
            "Infrastructure": [680, 50, 841, 550]
        }
        column_positions = [150, 300, 450, 600]

        section_frames = []
        for area in areas.values():
            extracted = tabula.read_pdf(file_path, pages=page, area=area, columns=column_positions, multiple_tables=True)
            section_df = pd.concat(extracted, ignore_index=True) if isinstance(extracted, list) else extracted
            section_df.columns = column_names
            section_frames.append(section_df)

        # One concat at the end instead of growing a frame inside the loop.
        return process_dataframe(pd.concat(section_frames, ignore_index=True))

    def formato_antiguo(file_path, page):
        """Read the first detected table on the requested page(s)."""
        tables = tabula.read_pdf(file_path, pages=page)
        # Only the first table is used, even when ``page`` spans several pages.
        df = pd.DataFrame(tables[0])
        df.columns = column_names
        for rank_col in ("rank_strengths", "rank_weak"):
            df[rank_col] = pd.to_numeric(df[rank_col], errors="coerce")
        # Rows whose label is just a section title are not data.
        section_titles = ["Government Efficiency", "Business Efficiency", "Infrastructure"]
        return df[~df["strengths"].isin(section_titles)]

    if layout == "nuevo":
        result_df = formato_nuevo(file_path, page)
    elif layout == "anterior":
        result_df = formato_antiguo(file_path, page)
    else:
        raise ValueError("Layout inválido. Tiene que ser 'nuevo' o 'anterior'.")

    # Match on the base name so the corrections also apply when a directory
    # is included in ``file_path``.
    pdf_name = os.path.basename(file_path) if isinstance(file_path, (str, os.PathLike)) else ""

    if pdf_name == "Ficha_2021.pdf":
        # Hard-coded rows appended for the 2021 report.
        # NOTE(review): presumably rows the extraction misses - confirm against the PDF.
        additional_records = pd.DataFrame({
            "strengths": ["4.4.24 Environmental agreements", ""],
            "rank_strengths": [35, None],
            "weakness": ["4.5.04 Pupil-teacher ratio (primary education)", "4.4.26 Pollution problems"],
            "rank_weak": [59, 59]
        })
        result_df = pd.concat([result_df, additional_records], ignore_index=True)
    elif pdf_name == "Ficha_2022.pdf":
        # The last three extracted rows of the 2022 report are discarded.
        result_df = result_df.iloc[:-3]

    return result_df


In [156]:
def process_all_tables(file_path, pages):
    """Extract the indicator tables from a page range and return them as one frame.

    Every table tabula detects on ``pages`` is stacked. When the stacked frame
    has exactly five columns they are named ``concept``, ``value``,
    ``average``, ``rank`` and ``year``; the last four are converted to
    numbers, with anything non-numeric (including repeated header cells such
    as "Value" or "Rank") becoming NaN.

    If that attempt fails for any reason, the pages are re-read with
    ``stream=True`` and the stacked result is returned as extracted, with no
    renaming or conversion. A warning is emitted so the different output
    shape does not go unnoticed.

    Parameters
    ----------
    file_path : str or path-like
        PDF handed to ``tabula.read_pdf``.
    pages : int or str
        Page selector forwarded to tabula as ``pages``.

    Returns
    -------
    pandas.DataFrame
        The cleaned five-column frame, or the raw stream extraction on fallback.
    """
    import warnings  # local import: only needed to report the fallback

    column_names = ["concept", "value", "average", "rank", "year"]
    read_options = {"pages": pages, "pandas_options": {"header": None}, "multiple_tables": True}

    def stack_tables(tables):
        # One concat instead of growing a frame in a loop; no tables -> empty frame.
        return pd.concat(tables, ignore_index=True) if len(tables) else pd.DataFrame()

    try:
        df = stack_tables(tabula.read_pdf(file_path, **read_options))
        df.columns = column_names  # raises ValueError unless there are exactly 5 columns

        # Strip thousands separators. Cast to text first: stacked tables can mix
        # strings with already-numeric cells, and the .str accessor would turn
        # the numeric ones into NaN (or raise on a fully numeric column).
        for col in ("value", "average"):
            df[col] = df[col].astype(str).str.replace(",", "", regex=False)

        # errors="coerce" also blanks the header tokens, so no separate replace step.
        for col in ("value", "average", "rank", "year"):
            df[col] = pd.to_numeric(df[col], errors="coerce")
        return df
    except Exception as exc:
        warnings.warn(
            f"No se pudo limpiar la extracción por defecto de {file_path} ({exc!r}); "
            "se devuelve la extracción sin procesar con stream=True."
        )
        return stack_tables(tabula.read_pdf(file_path, stream=True, **read_options))

In [157]:
# Extraction settings per report, one row per PDF:
# (file name, layout, page(s) of the strengths/weaknesses table, indicator page range).
_ficha_settings = [
    ("Ficha_2019.pdf", "anterior", 5, "6-21"),
    ("Ficha_2020.pdf", "anterior", 5, "6-21"),
    ("Ficha_2021.pdf", "anterior", "5-6", "7-22"),
    ("Ficha_2022.pdf", "nuevo", 5, "6-22"),
    ("Ficha_2023.pdf", "nuevo", 5, "6-25"),
]

# "primera_func" holds the keyword arguments for process_pdf and
# "segunda_func" the ones for process_all_tables.
combined_params = {
    pdf_name: {
        "primera_func": {"layout": pdf_layout, "page": summary_page},
        "segunda_func": {"pages": indicator_pages},
    }
    for pdf_name, pdf_layout, summary_page, indicator_pages in _ficha_settings
}


In [159]:
# Process every configured PDF and write one Excel workbook per report.
for file_name, params in combined_params.items():
    print(f"Procesando el archivo {file_name}...")

    # The workbook shares the PDF's name; splitext removes only the extension.
    base_name = os.path.splitext(file_name)[0]
    excel_file = f"{base_name}.xlsx"

    # Run both extractors with the keyword arguments configured for this file.
    df1 = process_pdf(file_name, **params["primera_func"])
    df2 = process_all_tables(file_name, **params["segunda_func"])

    # ExcelWriter in its default write mode creates (or overwrites) the file,
    # so no empty workbook has to be saved beforehand.
    with pd.ExcelWriter(excel_file, engine="openpyxl") as writer:
        df1.to_excel(writer, sheet_name="Competitiveness", index=False)
        df2.to_excel(writer, sheet_name="Indicators", index=False)



Procesando el archivo Ficha_2019.pdf...
Procesando el archivo Ficha_2020.pdf...
Procesando el archivo Ficha_2021.pdf...
Procesando el archivo Ficha_2022.pdf...
Procesando el archivo Ficha_2023.pdf...
