In [67]:
import os
import tabula
import pandas as pd

In [68]:
# Working directory.
# Later cells refer to the PDFs by bare file name (e.g. "Ficha_2022.pdf"),
# so those names resolve against this folder.
# NOTE(review): absolute, user-specific path — adjust before running on another machine.
os.chdir("C:/Users/claudio.pacheco/Documents/GitHub/economia/fichas/")

In [69]:
def process_pdf(file_path, layout="nuevo", page=5):
    """Extract the strengths/weaknesses ranking table from a report PDF.

    Parameters
    ----------
    file_path : str or os.PathLike
        Path to the PDF. Only its base name is used to decide whether one of
        the per-file corrections ("Ficha_2021.pdf", "Ficha_2022.pdf") applies.
    layout : {"nuevo", "anterior"}, default "nuevo"
        "nuevo" reads four fixed page regions with explicit column boundaries
        and cleans the text; "anterior" takes the first table tabula returns.
    page : int, default 5
        Page handed to ``tabula.read_pdf`` (used by both layouts).

    Returns
    -------
    pandas.DataFrame
        Columns ``strengths``, ``rank_strengths``, ``weakness``, ``rank_weak``.
        With the "nuevo" layout the rank columns hold digit strings; with the
        "anterior" layout they are numeric.

    Raises
    ------
    ValueError
        If ``layout`` is neither "nuevo" nor "anterior".
    """
    column_names = ["strengths", "rank_strengths", "weakness", "rank_weak"]

    def process_dataframe(df):
        """Repair split labels, keep only the rank digits, drop header rows."""
        # Work on a copy instead of mutating the caller's frame in place.
        df = df.fillna("")
        for label_col, rank_col in (("strengths", "rank_strengths"),
                                    ("weakness", "rank_weak")):
            # tabula may infer a rank column as numeric; `.str` needs text.
            rank_text = df[rank_col].astype(str)
            # Letters that landed in the rank column are the tail of the label
            # that crossed the column boundary: glue them back onto the label.
            spilled = rank_text.str.extract(r'([a-zA-Z\s]+)', expand=False).fillna('')
            df[label_col] = df[label_col] + spilled
            df[rank_col] = rank_text.str.extract(r'(\d+)', expand=False)

        labels = df["strengths"]
        # Rows carrying the STRENGTHS/WEAKNESSES captions or an all-uppercase
        # label are treated as headers and removed.
        is_header = (labels.str.contains("STRENGTHS", case=False)
                     | labels.str.contains("WEAKNESSES", case=False)
                     | labels.str.isupper())
        return df[~is_header].reset_index(drop=True)

    def formato_nuevo(file_path, page):
        """Read the four section regions of the page and stack them."""
        # Regions passed to tabula's `area` argument, one per report section
        # (tabula-py documents the order as top, left, bottom, right).
        areas = {
            "Economic Performance": [50, 50, 285, 550],
            "Government Efficiency": [285, 50, 530, 550],
            "Business Efficiency": [530, 50, 680, 550],
            "Infrastructure": [680, 50, 841, 550]
        }
        # Column boundaries passed to tabula's `columns` argument.
        column_positions = [150, 300, 450, 600]

        # Collect the per-section frames and concatenate once, rather than
        # repeatedly concatenating onto an initially empty DataFrame.
        frames = []
        for area in areas.values():
            extracted = tabula.read_pdf(file_path, pages=page, area=area,
                                        columns=column_positions, multiple_tables=True)
            if isinstance(extracted, list):
                section_df = pd.concat(extracted, ignore_index=True)
            else:
                section_df = extracted
            section_df.columns = column_names
            frames.append(section_df)

        return process_dataframe(pd.concat(frames, ignore_index=True))

    def formato_antiguo(file_path, page):
        """Take the first table tabula detects on the page."""
        tables = tabula.read_pdf(file_path, pages=page)
        df = tables[0].copy()
        df.columns = column_names
        df["rank_strengths"] = pd.to_numeric(df["rank_strengths"], errors="coerce")
        df["rank_weak"] = pd.to_numeric(df["rank_weak"], errors="coerce")
        # Section titles show up as rows of their own in this layout.
        section_titles = ["Government Efficiency", "Business Efficiency", "Infrastructure"]
        return df[~df["strengths"].isin(section_titles)]

    if layout == "nuevo":
        result_df = formato_nuevo(file_path, page)
    elif layout == "anterior":
        result_df = formato_antiguo(file_path, page)
    else:
        raise ValueError("Layout inválido. Tiene que ser 'nuevo' o 'anterior'.")

    # Per-file corrections are keyed on the file name only, so they also apply
    # when the PDF is addressed through a directory path.
    file_name = os.path.basename(os.fspath(file_path))
    if file_name == "Ficha_2021.pdf":
        # Hand-entered rows appended to the extraction
        # (presumably rows the automatic extraction misses — TODO confirm).
        additional_records = pd.DataFrame({
            "strengths": ["4.4.24 Environmental agreements", ""],
            "rank_strengths": [35, None],
            "weakness": ["4.5.04 Pupil-teacher ratio (primary education)", "4.4.26 Pollution problems"],
            "rank_weak": [59, 59]
        })
        result_df = pd.concat([result_df, additional_records], ignore_index=True)
    elif file_name == "Ficha_2022.pdf":
        # Drop the last three extracted rows
        # (presumably spurious trailing rows — TODO confirm).
        result_df = result_df.iloc[:-3]

    return result_df


In [70]:
# Extract the table from the 2022 report using the "nuevo" layout on page 5;
# the returned DataFrame is the cell's displayed output.
file_path="Ficha_2022.pdf"
process_pdf(file_path, layout="nuevo", page=5)

Unnamed: 0,strengths,rank_strengths,weakness,rank_weak
0,1.4.08 Long-term unemployment,1.0,1.2.17 Export concentration by partner,62
1,1.4.07 Unemployment rate,9.0,1.2.13 Exports of commercial services (%,56
2,1.4.09 Youth unemployment,10.0,1.5.01 Consumer price inflation,56
3,1.2.08 Exports of goods ($bn,12.0,1.1.19 Resilience of the economy,51
4,1.5.02 Cost-of-living index,14.0,1.5.05 Food costs,50
5,1.3.05 Direct investmentflows inward,14.0,1.1.21 GDP (PPP) per capita,50
6,1.3.07 Direct investmentstocks inward,17.0,1.1.20 GDP per capita,49
7,1.1.18 Gross fixed capitalformation,18.0,1.4.10 Youth exclusion,47
8,1.5.06 Gasoline prices,19.0,1.1.09 Gross fixed capital formation (%),46
9,1.1.13 Economic complexity index,21.0,1.3.04 Direct investment stocks abroad (%of GDP,43
